From c3906c70011e489aa3ecbd09cee98e2e039c0e99 Mon Sep 17 00:00:00 2001 From: knaaptime Date: Wed, 6 Nov 2024 14:39:31 -0800 Subject: [PATCH 1/6] data defs; fix gadm --- geosnap/_data.py | 50 ++++++++++++++++++++++++++++----- geosnap/io/constructors.py | 11 +++----- geosnap/io/gadm.py | 22 ++++----------- geosnap/io/networkio.py | 4 +-- geosnap/io/nlcd_definitions.csv | 17 +++++++++++ 5 files changed, 71 insertions(+), 33 deletions(-) create mode 100644 geosnap/io/nlcd_definitions.csv diff --git a/geosnap/_data.py b/geosnap/_data.py index 20921b01..73022eb7 100644 --- a/geosnap/_data.py +++ b/geosnap/_data.py @@ -83,8 +83,10 @@ def __dir__(self): "ltdb", "msa_definitions", "msas", + "naics_definitions", "ncdb", "nces", + "nlcd_definitions", "seda", "states", "show_data_dir", @@ -109,7 +111,7 @@ def show_data_dir(self, verbose=True): return self.data_dir def lodes_codebook(self): - """_summary_ + """Return a table of descriptive variable names for the LODES data Returns ------- @@ -121,7 +123,7 @@ def lodes_codebook(self): ) def bea_regions(self): - """Table that maps states to their respective BEA regions + """Return a table that maps states to their respective BEA regions Returns ------- @@ -217,10 +219,13 @@ def seda( "long", "poolsub", ], "`pool` argument must be either 'pool', 'long', or 'poolsub'" - assert standardize in [ - "gcs", - "cs", - ], "`standardize` argument must be either 'cs' for cohort-standardized or 'gcs' for grade-cohort-standardized" + assert ( + standardize + in [ + "gcs", + "cs", + ] + ), "`standardize` argument must be either 'cs' for cohort-standardized or 'gcs' for grade-cohort-standardized" if pooling == "poolsub": fn = f"seda_{level}_{pooling}_{standardize}_4.1_corrected" else: @@ -596,7 +601,8 @@ def msa_definitions(self): return pd.read_csv( os.path.join( os.path.dirname(os.path.abspath(__file__)), "io/msa_definitions.csv" - ) + ), + converters={"stcofips": str}, ) def ltdb(self): @@ -645,3 +651,33 @@ def codebook(self): return pd.read_csv( os.path.join(os.path.dirname(os.path.abspath(__file__)), "io/variables.csv") ) + + def naics_definitions(self): + """Table of NAICS 2-digit industry classification system definitions. + + Returns + ------- + pandas.DataFrame + table that stores variable names, definitions, and formulas. + + """ + return pd.read_csv( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "io/naics2_definitions.csv" + ) + ) + + def nlcd_definitions(self): + """Table of NLCD land classification system definitions. + + Returns + ------- + pandas.DataFrame + table that stores variable names, definitions, and formulas. + + """ + return pd.read_csv( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "io/nlcd_definitions.csv" + ) + ) diff --git a/geosnap/io/constructors.py b/geosnap/io/constructors.py index c8e843e9..bba5cdab 100644 --- a/geosnap/io/constructors.py +++ b/geosnap/io/constructors.py @@ -563,7 +563,7 @@ def get_lodes( state_fips=state_fips, county_fips=county_fips, msa_fips=msa_fips, - fips=fips, + fips=allfips, data=gdf, ) if isinstance(boundary, gpd.GeoDataFrame): @@ -608,19 +608,16 @@ def get_lodes( def _msa_to_county(datastore, msa_fips): + msa_defs = datastore.msa_definitions() if msa_fips: pr_metros = set( - datastore.msa_definitions()[ - datastore.msa_definitions()["CBSA Title"].str.contains("PR") - ]["CBSA Code"].tolist() + msa_defs[msa_defs["CBSA Title"].str.contains("PR")]["CBSA Code"].tolist() ) if msa_fips in pr_metros: raise Exception( "geosnap does not yet include built-in data for Puerto Rico" ) - msa_counties = datastore.msa_definitions()[ - datastore.msa_definitions()["CBSA Code"] == msa_fips - ]["stcofips"].tolist() + msa_counties = msa_defs[msa_defs["CBSA Code"] == msa_fips]["stcofips"].tolist() else: msa_counties = None diff --git a/geosnap/io/gadm.py b/geosnap/io/gadm.py index 9abda266..8309e79f 100644 --- a/geosnap/io/gadm.py +++ b/geosnap/io/gadm.py @@ -15,14 +15,6 @@ def get_gadm(code, level=0, use_fsspec=True, gpkg=True, n_retries=3): three character ISO code for a country level : int, optional which geometry level to collect, by default 0 - use_fsspec : bool - whether to use the `fsspec` library - gpkg : bool - whether to read from a geopackage or shapefile. If True, - geopackage will be read; shapefile if False. Ignored if using fsspec - n_retries : int optional - number of retries in case read fails from direct stream from GADM. - Ignored if using fsspec. Returns ------- @@ -46,13 +38,9 @@ def get_gadm(code, level=0, use_fsspec=True, gpkg=True, n_retries=3): code = code.upper() import fsspec - with tempfile.TemporaryDirectory() as temp_path: - with fsspec.open( - f"simplecache::zip://*.gpkg::https://biogeo.ucdavis.edu/data/gadm3.6/gpkg/gadm36_{code}_gpkg.zip", - simplecache={"cache_storage": temp_path}, - ): - gdf = gpd.read_file( - os.path.join(temp_path, os.listdir(temp_path)[0]), - layer=f"gadm36_{code}_{level}", + + gdf = gpd.read_file( + f"https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_{code}.gpkg", + layer=f"ADM_ADM_{level}", ) - return gdf + return gdf diff --git a/geosnap/io/networkio.py b/geosnap/io/networkio.py index 537f62f7..d9fb9ed6 100644 --- a/geosnap/io/networkio.py +++ b/geosnap/io/networkio.py @@ -83,13 +83,13 @@ def get_network_from_gdf( } impedance = "length" - graph = ox.graph_from_polygon(gdf.unary_union, network_type=network_type) + graph = ox.graph_from_polygon(gdf.union_all(), network_type=network_type) if add_travel_times: graph = ox.add_edge_speeds(graph, default_speeds) graph = ox.add_edge_travel_times(graph) impedance = "travel_time" - n, e = ox.utils_graph.graph_to_gdfs(graph) + n, e = ox.graph_to_gdfs(graph) if output_crs is not None: n = _reproject_osm_nodes(n, input_crs=4326, output_crs=output_crs) e = e.to_crs(output_crs) diff --git a/geosnap/io/nlcd_definitions.csv b/geosnap/io/nlcd_definitions.csv new file mode 100644 index 00000000..8093f3d2 --- /dev/null +++ b/geosnap/io/nlcd_definitions.csv @@ -0,0 +1,17 @@ +code,color,classification,description +11,"70,107,159",Open Water,"areas of open water, generally with less than 25% cover of vegetation or soil." +12,"209,222,248",Perennial Snow/Ice,"areas characterized by a perennial cover of ice and/or snow, generally greater than 25% of total cover." +21,"222,197,197","Developed, Open Space","areas with a mixture of some constructed materials, but mostly vegetation in the form of lawn grasses. Impervious surfaces account for less than 20% of total cover. These areas most commonly include large-lot single-family housing units, parks, golf courses, and vegetation planted in developed settings for recreation, erosion control, or aesthetic purposes." +22,"217,146,130","Developed, Low Intensity",areas with a mixture of constructed materials and vegetation. Impervious surfaces account for 20% to 49% percent of total cover. These areas most commonly include single-family housing units +23,"235,0,0","Developed, Medium Intensity",areas with a mixture of constructed materials and vegetation. Impervious surfaces account for 50% to 79% of the total cover. These areas most commonly include single-family housing units. +24,"171,0,0",Developed High Intensity,"highly developed areas where people reside or work in high numbers. Examples include apartment complexes, row houses and commercial/industrial. Impervious surfaces account for 80% to 100% of the total cover" +31,"179,172,159",Barren Land (Rock/Sand/Clay),"areas of bedrock, desert pavement, scarps, talus, slides, volcanic material, glacial debris, sand dunes, strip mines, gravel pits and other accumulations of earthen material. Generally, vegetation accounts for less than 15% of total cover." +41,"104,171,95",Deciduous Forest,"areas dominated by trees generally greater than 5 meters tall, and greater than 20% of total vegetation cover. More than 75% of the tree species shed foliage simultaneously in response to seasonal change." +42,"28,95,44",Evergreen Forest,"areas dominated by trees generally greater than 5 meters tall, and greater than 20% of total vegetation cover. More than 75% of the tree species maintain their leaves all year. Canopy is never without green foliage." +43,"181,197,143",Mixed Forest,"areas dominated by trees generally greater than 5 meters tall, and greater than 20% of total vegetation cover. Neither deciduous nor evergreen species are greater than 75% of total tree cover." +52,"204,184,121",Shrub/Scrub,"areas dominated by shrubs; less than 5 meters tall with shrub canopy typically greater than 20% of total vegetation. This class includes true shrubs, young trees in an early successional stage or trees stunted from environmental conditions" +71,"223,223,194",Grassland/Herbaceous,"areas dominated by gramanoid or herbaceous vegetation, generally greater than 80% of total vegetation. These areas are not subject to intensive management such as tilling, but can be utilized for grazing." +81,"220,217,57",Pasture/Hay,"areas of grasses, legumes, or grass-legume mixtures planted for livestock grazing or the production of seed or hay crops, typically on a perennial cycle. Pasture/hay vegetation accounts for greater than 20% of total vegetation." +82,"171,108,40",Cultivated Crops,"areas used for the production of annual crops, such as corn, soybeans, vegetables, tobacco, and cotton, and also perennial woody crops such as orchards and vineyards. Crop vegetation accounts for greater than 20% of total vegetation. This class also includes all land being actively tilled." +90,"184,217,235",Woody Wetlands,areas where forest or shrubland vegetation accounts for greater than 20% of vegetative cover and the soil or substrate is periodically saturated with or covered with water. +95,"108,159,184",Emergent Herbaceous Wetlands,areas where perennial herbaceous vegetation accounts for greater than 80% of vegetative cover and the soil or substrate is periodically saturated with or covered with water. \ No newline at end of file From 0401a4ea9759cd94d975c9f2fa17b5f3d1c97797 Mon Sep 17 00:00:00 2001 From: knaaptime Date: Wed, 6 Nov 2024 16:15:53 -0800 Subject: [PATCH 2/6] pure gpd for gadm --- geosnap/io/gadm.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/geosnap/io/gadm.py b/geosnap/io/gadm.py index 8309e79f..c819be1b 100644 --- a/geosnap/io/gadm.py +++ b/geosnap/io/gadm.py @@ -1,12 +1,9 @@ """Utilities for fetching data from GADM.""" -import os -import tempfile - import geopandas as gpd -def get_gadm(code, level=0, use_fsspec=True, gpkg=True, n_retries=3): +def get_gadm(code, level=0): """Collect data from GADM as a geodataframe. Parameters @@ -36,11 +33,9 @@ def get_gadm(code, level=0, use_fsspec=True, gpkg=True, n_retries=3): with this method would always returns the layer with index 0 in the geopackage file). """ code = code.upper() - import fsspec - gdf = gpd.read_file( - f"https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_{code}.gpkg", - layer=f"ADM_ADM_{level}", - ) + f"https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_{code}.gpkg", + layer=f"ADM_ADM_{level}", + ) return gdf From 902a86be8f5fa433374e12371f4e536b284e5a25 Mon Sep 17 00:00:00 2001 From: knaaptime Date: Wed, 6 Nov 2024 16:32:59 -0800 Subject: [PATCH 3/6] rm infty --- geosnap/analyze/_cluster_wrappers.py | 2 +- geosnap/tests/test_isochrones.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/geosnap/analyze/_cluster_wrappers.py b/geosnap/analyze/_cluster_wrappers.py index eda6faf8..adf7549e 100644 --- a/geosnap/analyze/_cluster_wrappers.py +++ b/geosnap/analyze/_cluster_wrappers.py @@ -279,7 +279,7 @@ def gaussian_mixture( # selection routine from # https://plot.ly/scikit-learn/plot-gmm-selection/ - lowest_bic = np.infty + lowest_bic = np.inf bic = [] maxn = max_clusters + 1 n_components_range = range(1, maxn) diff --git a/geosnap/tests/test_isochrones.py b/geosnap/tests/test_isochrones.py index 679be8f2..5e10b195 100644 --- a/geosnap/tests/test_isochrones.py +++ b/geosnap/tests/test_isochrones.py @@ -102,7 +102,7 @@ def test_isos_with_edges(): ) print(alpha.area.round(8)) # this will grow depending on the size of the OSM network when tested... - assert alpha.area.round(8).iloc[0] == 0.00026001 + assert alpha.area.round(8).iloc[0] >= 0.00036433 def test_project_network(): tracts = get_acs(DataStore(), county_fips='48301', level='tract', years=2015) From 712f4cb9abb40f46b4cdb6f5827bdd7276f9d49b Mon Sep 17 00:00:00 2001 From: knaaptime Date: Fri, 8 Nov 2024 11:02:52 -0800 Subject: [PATCH 4/6] seda v5 --- geosnap/_data.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/geosnap/_data.py b/geosnap/_data.py index 73022eb7..43135897 100644 --- a/geosnap/_data.py +++ b/geosnap/_data.py @@ -226,12 +226,13 @@ def seda( "cs", ] ), "`standardize` argument must be either 'cs' for cohort-standardized or 'gcs' for grade-cohort-standardized" + if pooling == "poolsub": - fn = f"seda_{level}_{pooling}_{standardize}_4.1_corrected" + fn = f"seda_{level}_{pooling}_{standardize}_5.0" else: - fn = f"seda_{level}_{pooling}_{standardize}_4.1" + fn = f"seda_{level}_{pooling}_{standardize}_5.0" local_path = pathlib.Path(self.data_dir, "seda", f"{fn}.parquet") - remote_path = f"https://stacks.stanford.edu/file/druid:xv742vh9296/{fn}.csv" + remote_path = f"https://stacks.stanford.edu/file/druid:cs829jn7849/{fn}.csv" msg = ( "Streaming data from SEDA archive at .\n" "Use `geosnap.io.store_seda()` to store the data locally for better performance" From 419e821b0153d38e25067206b3716181e923de9b Mon Sep 17 00:00:00 2001 From: knaaptime Date: Fri, 8 Nov 2024 11:11:55 -0800 Subject: [PATCH 5/6] rm redundant naics defs --- geosnap/_data.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/geosnap/_data.py b/geosnap/_data.py index 43135897..e9a37f83 100644 --- a/geosnap/_data.py +++ b/geosnap/_data.py @@ -653,21 +653,6 @@ def codebook(self): os.path.join(os.path.dirname(os.path.abspath(__file__)), "io/variables.csv") ) - def naics_definitions(self): - """Table of NAICS 2-digit industry classification system definitions. - - Returns - ------- - pandas.DataFrame - table that stores variable names, definitions, and formulas. - - """ - return pd.read_csv( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "io/naics2_definitions.csv" - ) - ) - def nlcd_definitions(self): """Table of NLCD land classification system definitions. From 4479a9be7e2bd232a2c30816b3e1806c69df5870 Mon Sep 17 00:00:00 2001 From: knaaptime Date: Fri, 8 Nov 2024 14:32:33 -0800 Subject: [PATCH 6/6] windows dtypes in pandana --- geosnap/tests/test_isochrones.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/geosnap/tests/test_isochrones.py b/geosnap/tests/test_isochrones.py index 5e10b195..4ee1dbc3 100644 --- a/geosnap/tests/test_isochrones.py +++ b/geosnap/tests/test_isochrones.py @@ -78,7 +78,10 @@ def test_isos_from_gdf_shapely(): ) assert_almost_equal(t.area.astype(float).round(8).tolist()[0], 0.00012474) - +@pytest.mark.skipif( + sys.platform.startswith("win"), + reason="skipping test on windows because of dtype issue", +) def test_network_constructor(): tracts = get_acs(DataStore(), county_fips='48301', level='tract', years=2015) walk_net = get_network_from_gdf(tracts) @@ -104,6 +107,10 @@ def test_isos_with_edges(): # this will grow depending on the size of the OSM network when tested... assert alpha.area.round(8).iloc[0] >= 0.00036433 +@pytest.mark.skipif( + sys.platform.startswith("win"), + reason="skipping test on windows because of dtype issue", +) def test_project_network(): tracts = get_acs(DataStore(), county_fips='48301', level='tract', years=2015) walk_net = get_network_from_gdf(tracts)