address issues handling large dataset #27

Closed
wants to merge 2 commits into from
Binary file added .DS_Store
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
examples/*
data/*
Binary file added easymore/__pycache__/__init__.cpython-38.pyc
Binary file added easymore/__pycache__/easymore.cpython-38.pyc
122 changes: 77 additions & 45 deletions easymore/easymore.py
@@ -45,6 +45,7 @@ def __init__(self):
self.license = '' # data license
self.tolerance = 10**-5 # tolerance
self.save_csv = False # save csv
self.save_temp = False # option to save temporary shapefiles
self.sort_ID = False # to sort the remapped output based on the target shapefile ID; self.target_shp_ID should be given
self.complevel = 4 # netcdf compression level from 1 to 9. Any other value or object will mean no compression.
self.version = '0.0.2' # version of the easymore
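For context, a minimal usage sketch of the new flag (hedged: the import path and attribute names are taken from this diff and the package layout; everything else is configured as in a normal EASYMORE run):

from easymore.easymore import easymore

esmr = easymore()
esmr.case_name = 'demo'      # prefix for temporary file names
esmr.temp_dir  = './temp/'   # directory where temporary shapefiles would go
esmr.save_temp = False       # new option: skip writing intermediate shapefiles
# ... set source_nc, var_names, target_shp, etc., then:
# esmr.nc_remapper()

With save_temp left at its False default, nc_remapper below keeps every intermediate GeoDataFrame in memory instead of writing it to temp_dir.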
@@ -83,48 +84,54 @@ def nc_remapper(self):
if (self.case == 1 or self.case == 2) and (self.source_shp == ''):
if self.case == 1:
if hasattr(self, 'lat_expanded') and hasattr(self, 'lon_expanded'):
self.lat_lon_SHP(self.lat_expanded, self.lon_expanded,\
source_shp_gpd= self.lat_lon_SHP(self.lat_expanded, self.lon_expanded,\
self.temp_dir+self.case_name+'_source_shapefile.shp')
else:
self.lat_lon_SHP(self.lat, self.lon,\
source_shp_gpd= self.lat_lon_SHP(self.lat, self.lon,\
self.temp_dir+self.case_name+'_source_shapefile.shp')
else:
self.lat_lon_SHP(self.lat, self.lon,\
source_shp_gpd= self.lat_lon_SHP(self.lat, self.lon,\
self.temp_dir+self.case_name+'_source_shapefile.shp')
print('EASYMORE is creating the shapefile from the netCDF file and saving it here:')
print(self.temp_dir+self.case_name+'_source_shapefile.shp')
if self.save_temp:
print('EASYMORE is creating the shapefile from the netCDF file and saving it here:')
print(self.temp_dir+self.case_name+'_source_shapefile.shp')
if (self.case == 1 or self.case == 2) and (self.source_shp != ''):
source_shp_gpd = gpd.read_file(self.source_shp)
source_shp_gpd = self.add_lat_lon_source_SHP(source_shp_gpd, self.source_shp_lat,\
self.source_shp_lon, self.source_shp_ID)
source_shp_gpd.to_file(self.temp_dir+self.case_name+'_source_shapefile.shp')
print('EASYMORE detects the shapefile is provided and will resave it here:')
print(self.temp_dir+self.case_name+'_source_shapefile.shp')
if self.save_temp:
source_shp_gpd.to_file(self.temp_dir+self.case_name+'_source_shapefile.shp')
print('EASYMORE detects the shapefile is provided and will resave it here:')
print(self.temp_dir+self.case_name+'_source_shapefile.shp')
# if case 3 or source shapefile is provided
if (self.case == 3) and (self.source_shp != ''):
self.check_source_nc_shp() # check the lat/lon in the source shapefile and nc file
source_shp_gpd = gpd.read_file(self.source_shp)
source_shp_gpd = self.add_lat_lon_source_SHP(source_shp_gpd, self.source_shp_lat,\
self.source_shp_lon, self.source_shp_ID)
source_shp_gpd.to_file(self.temp_dir+self.case_name+'_source_shapefile.shp')
print('EASYMORE is creating the shapefile from the netCDF file and saving it here:')
print(self.temp_dir+self.case_name+'_source_shapefile.shp')
if self.save_temp:
source_shp_gpd.to_file(self.temp_dir+self.case_name+'_source_shapefile.shp')
print('EASYMORE is creating the shapefile from the netCDF file and saving it here:')
print(self.temp_dir+self.case_name+'_source_shapefile.shp')
# intersection of the source and sink/target shapefile
shp_1 = gpd.read_file(self.temp_dir+self.case_name+'_target_shapefile.shp')
shp_2 = gpd.read_file(self.temp_dir+self.case_name+'_source_shapefile.shp')
shp_1 = target_shp_gpd
shp_2 = source_shp_gpd
# correction of the source and target shapefiles to the -180 to 180 longitude frame
shp_1 = self.shp_lon_correction(shp_1)
shp_2 = self.shp_lon_correction(shp_2)
shp_1.to_file(self.temp_dir+self.case_name+'_target_shapefile_corrected_frame.shp')
shp_2.to_file(self.temp_dir+self.case_name+'_source_shapefile_corrected_frame.shp')
if self.save_temp:
shp_1.to_file(self.temp_dir+self.case_name+'_target_shapefile_corrected_frame.shp')
shp_2.to_file(self.temp_dir+self.case_name+'_source_shapefile_corrected_frame.shp')
# reprojections to equal area
if (str(shp_1.crs).lower() == str(shp_2.crs).lower()) and ('epsg:4326' in str(shp_1.crs).lower()):
shp_1 = shp_1.to_crs ("EPSG:6933") # project to equal area
shp_1.to_file(self.temp_dir+self.case_name+'test.shp')
shp_1 = gpd.read_file(self.temp_dir+self.case_name+'test.shp')
if self.save_temp:
shp_1.to_file(self.temp_dir+self.case_name+'test.shp')
shp_1 = gpd.read_file(self.temp_dir+self.case_name+'test.shp')
shp_2 = shp_2.to_crs ("EPSG:6933") # project to equal area
shp_2.to_file(self.temp_dir+self.case_name+'test.shp')
shp_2 = gpd.read_file(self.temp_dir+self.case_name+'test.shp')
if self.save_temp:
shp_2.to_file(self.temp_dir+self.case_name+'test.shp')
shp_2 = gpd.read_file(self.temp_dir+self.case_name+'test.shp')
# remove test files
removeThese = glob.glob(self.temp_dir+self.case_name+'test.*')
for file in removeThese:
@@ -134,7 +141,8 @@ def nc_remapper(self):
shp_int = self.intersection_shp(shp_1, shp_2)
shp_int = shp_int.sort_values(by=['S_1_ID_t']) # sort based on ID_t
shp_int = shp_int.to_crs ("EPSG:4326") # project back to WGS84
shp_int.to_file(self.temp_dir+self.case_name+'_intersected_shapefile.shp') # save the intersected files
if self.save_temp:
shp_int.to_file(self.temp_dir+self.case_name+'_intersected_shapefile.shp') # save the intersected files
shp_int = shp_int.drop(columns=['geometry']) # remove the geometry
# rename dictionary
dict_rename = {'S_1_ID_t' : 'ID_t',
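The pattern this hunk applies throughout nc_remapper, shown side by side (an illustrative sketch, not a verbatim excerpt; tmp stands for the self.temp_dir + self.case_name paths used above):

# before: every intermediate GeoDataFrame took a disk round-trip
shp.to_file(tmp)           # write temporary shapefile
shp = gpd.read_file(tmp)   # immediately read it back

# after: the GeoDataFrame stays in memory and is written only on request
if self.save_temp:
    shp.to_file(tmp)

For large source grids the shapefile write/read can dominate the runtime, which is what this PR targets.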
@@ -630,24 +638,25 @@ def lat_lon_SHP(self,
# check if the lat/lon inputs have the same dimensions
import geopandas as gpd
from shapely.geometry import Polygon
import shapefile # pyshed library
import shapely
from numba import jit
# import shapefile # pyshed library
# import shapely
#
lat_lon_shape = lat.shape
# write the shapefile
with shapefile.Writer(file_name) as w:
w.autoBalance = 1 # turn on function that keeps file stable if number of shapes and records don't line up
w.field("ID_s",'N') # create (N)umerical attribute fields, integer
w.field("lat_s",'F',decimal=4) # float with 4 decimals
w.field("lon_s",'F',decimal=4) # float with 4 decimals
# preparing m, which is a counter for the shapefile's arbitrary ID
m = 0.00
# iterating to create the shapes of the result shapefile, ignoring the first and last rows and columns
@jit(nopython=True)
def bottleneck(lat: np.ndarray, lon: np.ndarray) -> tuple:
'''
replace the two Python for-loops with a numba JIT-compiled function
'''
lat_lon_shape= lon.shape
geometry = []
records= []
m=0
for i in range(1, lat_lon_shape[0] - 1):
for j in range(1, lat_lon_shape[1] - 1):
# check if lat and lon are located inside the provided box
# empty the polygon variable
parts = []
# update records
m += 1 # ID
center_lat = lat[i,j] # lat value of data point in source .nc file
@@ -670,8 +679,7 @@ def lat_lon_SHP(self,
Lon_LowLeft = (lon[i, j - 1] + lon[i + 1, j - 1] + lon[i + 1, j] + lon[i, j]) / 4
Lon_Left = (lon[i, j - 1] + lon[i, j]) / 2
Lon_UpLeft = (lon[i - 1, j - 1] + lon[i - 1, j] + lon[i, j - 1] + lon[i, j]) / 4
# creating the polygon given the lat and lon
parts.append([ (Lon_Up, Lat_Up),\
geometry.append([ (Lon_Up, Lat_Up),\
(Lon_UpRright, Lat_UpRright), \
(Lon_Right, Lat_Right), \
(Lon_LowRight, Lat_LowRight), \
@@ -680,10 +688,20 @@ def lat_lon_SHP(self,
(Lon_Left, Lat_Left), \
(Lon_UpLeft, Lat_UpLeft), \
(Lon_Up, Lat_Up)])
# store polygon
w.poly(parts)
# update records/fields for the polygon
w.record(m, center_lat, center_lon)
records.append([m, center_lat, center_lon])

return (geometry, records)

geometry, records= bottleneck(lat, lon)
records= np.stack(records)
gdf= gpd.GeoDataFrame(geometry=[Polygon(_geom) for _geom in geometry])
gdf['ID_s']= records[:,0].astype(np.int32)
gdf['lat_s']= records[:,1].astype(np.float32)
gdf['lon_s']= records[:,2].astype(np.float32)
if self.save_temp:
gdf.to_file(file_name)

return gdf
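
A small driver for the reworked lat_lon_SHP (a sketch under assumptions: esmr is the configured instance from the earlier example, and lat/lon are 2-D grids of cell centers as nc_remapper passes them; with save_temp False, file_name is never written):

import numpy as np

# hypothetical 4x4 grid of cell centers; only interior cells get polygons,
# so a 4x4 grid yields a GeoDataFrame with 2x2 = 4 rows
lat = np.repeat(np.linspace(40.0, 43.0, 4), 4).reshape(4, 4)
lon = np.tile(np.linspace(-100.0, -97.0, 4), 4).reshape(4, 4)
gdf = esmr.lat_lon_SHP(lat, lon, esmr.temp_dir + 'demo_source_shapefile.shp')
print(gdf[['ID_s', 'lat_s', 'lon_s']])   # IDs and centers of the 4 cell polygons

The first call pays numba's one-time compilation cost for bottleneck; subsequent calls run the compiled loops.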

def add_lat_lon_source_SHP( self,
shp,
@@ -1365,8 +1383,9 @@ def intersection_shp( self,
shp_2):
import geopandas as gpd
from shapely.geometry import Polygon
import shapefile # pyshed library
import shapely
from numba import jit
# import shapefile # pyshed library
# import shapely
"""
@ author: Shervan Gharari
@ Github: https://github.com/ShervanGharari/EASYMORE
@@ -1389,6 +1408,15 @@ def intersection_shp( self,
result: a geodataframe that includes the intersected shapefile and the area, percent, and normalized percent
of each shape element in the other shapefile
"""
@jit(nopython=True)
def normalize(ids: np.ndarray, unique_ids: np.ndarray,
ap: np.ndarray) -> np.ndarray:
ap_new= ap.copy()
for i in unique_ids:
idx= np.where(ids==i)
ap_new[idx]= ap[idx]/ap[idx].sum()
return ap_new
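
A toy check of what normalize computes (a standalone numpy mirror of the jitted helper above; with two fragments of ID 7 and one of ID 9, the fractions within each ID sum to 1):

import numpy as np

ids = np.array([7, 7, 9])          # IDs of the intersected fragments
ap  = np.array([2.0, 3.0, 5.0])    # their intersected areas/percents
apn = ap.copy()
for i in np.unique(ids):
    idx = np.where(ids == i)
    apn[idx] = ap[idx] / ap[idx].sum()
print(apn)                         # [0.4 0.6 1. ]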

# get the column name of shp_1
column_names = shp_1.columns
column_names = list(column_names)
@@ -1425,18 +1453,22 @@ def intersection_shp( self,
AP1 = np.array(result['AP1'])
AP1N = AP1 # creating the normalized percent area
ID_S1_unique = np.unique(ID_S1) # unique IDs
for i in ID_S1_unique:
INDX = np.where(ID_S1==i) # getting the indeces
AP1N[INDX] = AP1[INDX] / AP1[INDX].sum() # normalizing for that sum
AP1N=normalize(ID_S1, ID_S1_unique, AP1)
# replace the computationally expensive loop with the numba JIT-compiled normalize above
# for i in ID_S1_unique:
# INDX = np.where(ID_S1==i) # getting the indeces
# AP1N[INDX] = AP1[INDX] / AP1[INDX].sum() # normalizing for that sum
# taking part of the data frame as numpy arrays to increase the speed
# finding the IDs from shapefile two
ID_S2 = np.array (result['IDS2'])
AP2 = np.array(result['AP2'])
AP2N = AP2 # creating the normalized percent area
ID_S2_unique = np.unique(ID_S2) # unique IDs
for i in ID_S2_unique:
INDX = np.where(ID_S2==i) # getting the indeces
AP2N[INDX] = AP2[INDX] / AP2[INDX].sum() # normalizing for that sum
AP2N= normalize(ID_S2, ID_S2_unique, AP2)
# replace the computationally expensive loop with the numba JIT-compiled normalize above
# for i in ID_S2_unique:
# INDX = np.where(ID_S2==i) # getting the indeces
# AP2N[INDX] = AP2[INDX] / AP2[INDX].sum() # normalizing for that sum
result ['AP1N'] = AP1N
result ['AP2N'] = AP2N
return result
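
For comparison, the same per-ID normalization can be written as a pandas groupby transform (a sketch assuming the AP1/AP2 and IDS1/IDS2 column names visible above; the PR uses the jitted loop instead so the hot path stays in compiled code):

result['AP1N'] = result['AP1'] / result.groupby('IDS1')['AP1'].transform('sum')
result['AP2N'] = result['AP2'] / result.groupby('IDS2')['AP2'].transform('sum')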
47 changes: 46 additions & 1 deletion requirements.txt
@@ -1 +1,46 @@
.
affine==2.3.0
attrs==21.2.0
certifi==2021.10.8
cftime==1.5.1
click==8.0.3
click-plugins==1.1.1
cligj==0.7.2
cycler==0.10.0
DateTime==4.3
Fiona==1.8.20
GDAL==3.3.2
geojson==2.5.0
geopandas==0.10.2
geovoronoi==0.3.0
imageio==2.9.0
json5==0.9.6
kiwisolver==1.3.2
llvmlite==0.37.0
matplotlib==3.4.3
munch==2.5.0
netCDF4==1.5.7
networkx==2.6.3
numba==0.54.1
numpy==1.20.3
pandas==1.3.4
Pillow==8.4.0
pip==21.2.4
pyparsing==2.4.7
pyproj==3.2.1
pysheds==0.2.7
pyshp==2.1.3
python-dateutil==2.8.2
pytz==2021.3
PyWavelets==1.1.1
rasterio==1.2.10
Rtree==0.9.7
scikit-image==0.18.3
scipy==1.6.3
setuptools==58.0.4
Shapely==1.7.1
six==1.16.0
snuggs==1.4.7
tifffile==2021.10.12
wheel==0.37.0
xarray==0.19.0
zope.interface==5.4.0
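
The numba==0.54.1 and llvmlite==0.37.0 pins are the new JIT dependencies behind the @jit changes above; a quick import check (a sketch) verifies the toolchain in a fresh environment:

import numba, llvmlite
print(numba.__version__, llvmlite.__version__)   # expect 0.54.1 and 0.37.0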