From 380b2634c35b4e3034f25e477806ca71cf207665 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Tue, 23 Jul 2019 10:32:40 +0100 Subject: [PATCH 1/8] initial commit --- src/exporters/ndvi.py | 120 +++++++++++++++++++++++++++++++++++ tests/exporters/test_ndvi.py | 0 2 files changed, 120 insertions(+) create mode 100644 src/exporters/ndvi.py create mode 100644 tests/exporters/test_ndvi.py diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py new file mode 100644 index 000000000..6fe4dcc27 --- /dev/null +++ b/src/exporters/ndvi.py @@ -0,0 +1,120 @@ +from pathlib import Path +from typing import List, Optional +from bs4 import BeautifulSoup +import urllib.request +import numpy as np +import os +import multiprocessing + +import re + +from .base import BaseExporter + + +class NDVIExporter(BaseExporter): + """Exports Normalised Difference Vegetation Index from NOAA + + https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/ + """ + + def __init__(self, data_folder: Path = Path('data')) -> None: + super().__init__(data_folder) + + self.ndvi_folder = self.raw_folder / "ndvi" + if not self.ndvi_folder.exists(): + self.ndvi_folder.mkdir() + + self.base_url = 'https://www.ncei.noaa.gov/data/' \ + 'avhrr-land-normalized-difference-vegetation-index/' \ + 'access'.replace(' ', '') + + @staticmethod + def beautiful_soup_url(url: str) -> BeautifulSoup: + # use urllib.request to read the page source + req = urllib.request.Request(url) + response = urllib.request.urlopen(req) + the_page = response.read() + + # use BeautifulSoup to parse the html source + soup = BeautifulSoup(the_page, features="lxml") + + return soup + + def get_ndvi_url_paths(self, + selected_years: Optional[List[int]] = None, + ) -> List[str]: + # use BeautifulSoup to parse the html source + soup = self.beautiful_soup_url(self.base_url) + # find all links (to the years) + years = [ + yrs.string.replace('/', '') + for yrs in soup.find_all('a') + if re.match(r'[0-9]{4}', yrs.string) + ] + + # filter for selected_years + if selected_years is not None: + years = [y for y in years if y in selected_years] + + # build the year urls + year_urls = [ + f'{self.base_url}/{y}' + for y in years + ] + + # get the urls for the .nc files + all_urls = [] + for url in year_urls: + links = self.beautiful_soup_url(url).find_all('a') + nc_links = [ + f'{url}/{l.string}' + for l in links + if '.nc' in l.string + ] + all_urls.extend(nc_links) + + return all_urls + + def wget_file(self, url) -> None: + # create year subdirectories + year = url.split('/')[-2] + out_folder = self.ndvi_folder / year + if not out_folder.exists(): + out_folder.mkdir(parents=True, exist_ok=True) + + # check if file already exists + fname = url.split('/')[-1] + if (out_folder / fname).exists(): + print(f'{fname} for {year} already donwloaded!') + return + + os.system(f'wget -np -nH {url} -P {out_folder.as_posix()}') + print(f'{fname} for {year} downloaded!') + + def export(self, years: Optional[List[int]] = None, + parallel_processes: int = 1) -> None: + """Export functionality for the NDVI product from AVHRR (NOAA) + 1981 - 2019. + Arguments + ---------- + years: Optional list of ints, default = None + The years of data to download. If None, all data will be downloaded + parallel_processes: int, default = 1 + number of processes to parallelize the downloading of data + """ + if years is not None: + valid_years = np.arange(1981, 2020) + assert np.isin(years, valid_years).all(), \ + 'Expected `years` argument to be in range 1981-2019' + + urls = self.get_ndvi_url_paths(selected_years=years) + + if parallel_processes <= 1: # sequential + for url in urls: + self.wget_file(url) + else: # parallel + pool = multiprocessing.Pool(processes=parallel_processes) + pool.map(self.wget_file, urls) + + +# diff --git a/tests/exporters/test_ndvi.py b/tests/exporters/test_ndvi.py new file mode 100644 index 000000000..e69de29bb From f212db0e563a334a63bacd45411f86df4a1ac93e Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Tue, 23 Jul 2019 11:51:56 +0100 Subject: [PATCH 2/8] awuful attempt at testing(sorry Gabi) --- src/exporters/__init__.py | 4 +- src/exporters/ndvi.py | 2 +- tests/exporters/test_ndvi.py | 104 +++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) diff --git a/src/exporters/__init__.py b/src/exporters/__init__.py index 69edf2c1b..5519c894e 100644 --- a/src/exporters/__init__.py +++ b/src/exporters/__init__.py @@ -4,8 +4,10 @@ from .planetOS import ERA5ExporterPOS from .seas5.s5 import S5Exporter from .gleam import GLEAMExporter +from .ndvi import NDVIExporter __all__ = [ 'ERA5Exporter', 'VHIExporter', 'ERA5ExporterPOS', - 'CHIRPSExporter', 'S5Exporter', 'GLEAMExporter' + 'CHIRPSExporter', 'S5Exporter', 'GLEAMExporter', + 'NDVIExporter', ] diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py index 6fe4dcc27..38b975bdc 100644 --- a/src/exporters/ndvi.py +++ b/src/exporters/ndvi.py @@ -54,7 +54,7 @@ def get_ndvi_url_paths(self, # filter for selected_years if selected_years is not None: - years = [y for y in years if y in selected_years] + years = [y for y in years if int(y) in selected_years] # build the year urls year_urls = [ diff --git a/tests/exporters/test_ndvi.py b/tests/exporters/test_ndvi.py index e69de29bb..860a661cd 100644 --- a/tests/exporters/test_ndvi.py +++ b/tests/exporters/test_ndvi.py @@ -0,0 +1,104 @@ +# from pathlib import Path +from unittest.mock import patch, MagicMock +import urllib.request +# import pytest +import numpy as np + +from src.exporters import NDVIExporter + + +class TestNDVIExporter: + def test_init(self, tmp_path): + e = NDVIExporter(tmp_path) + + assert e.ndvi_folder.name == 'ndvi' + assert (tmp_path / 'raw' / 'ndvi').exists() + + @patch('os.system', autospec=True) + def test_checkpointing(self, mock_system, tmp_path, capsys): + # checks we don't redownload files + exporter = NDVIExporter(tmp_path) + + # setup the already downloaded file + test_filename = '1981/testy_test.nc' + (tmp_path / 'raw/ndvi/1981').mkdir(parents=True, exist_ok=True) + (tmp_path / f'raw/ndvi/{test_filename}').touch() + + exporter.wget_file(test_filename) + captured = capsys.readouterr() + + expected_stdout = f'{test_filename} for 1981 already donwloaded!\n' + assert captured.out == expected_stdout, \ + f'Expected stdout to be {expected_stdout}, got {captured.out}' + mock_system.assert_not_called(), 'os.system was called! Should have been skipped' + + @patch('urllib.request.Request', autospec=True) + def test_get_filenames(self, request_patch, monkeypatch, tmp_path): + # First 1000 characters of the urllib response from the https, + # pulled on July 23 2019 + request_patch.return_value = MagicMock() + + # EXPECTED response for first page (all years) + expected_response = '\n\n \nIndex of' \ + '/data/avhrr-land-normalized-difference-vegetation-index/access\n' \ + '\n \n

Index of' \ + '/data/avhrr-land-normalized-difference-vegetation-index/access

' \ + '\n
\n\n
 NameLast modifiedSizeDescription
 Parent' \ + 'Directory  -' \ + ' 
 1981/14-Jul-2019 16:09' + + # EXPECTED response for second page (1981 all .nc files) + expected_response ='\n\n' \ + '\n Index of' \ + '/data/avhrr-land-normalized-difference-vegetation-index/access/' \ + '1981\n\n \n

Index of' \ + '/data/avhrr-land-normalized-difference-vegetation-index/access/1981' \ + '

\n
\n\n12-Jul-2019 10:37 \n12-Jul-2019 10:37 \n
 NameLast' \ + 'modifiedSizeDescription
 Parent' \ + 'Directory  -' \ + ' 
 ' \ + 'AVHRR-Land_v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc' \ + '51M 
 ' \ + 'AVHRR-Land_v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc' \ + '59M 
  Date: Tue, 23 Jul 2019 13:00:55 +0100 Subject: [PATCH 3/8] update flkae8 --- src/exporters/ndvi.py | 4 ++-- tests/exporters/test_ndvi.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py index 38b975bdc..a8e7602dc 100644 --- a/src/exporters/ndvi.py +++ b/src/exporters/ndvi.py @@ -25,8 +25,8 @@ def __init__(self, data_folder: Path = Path('data')) -> None: self.ndvi_folder.mkdir() self.base_url = 'https://www.ncei.noaa.gov/data/' \ - 'avhrr-land-normalized-difference-vegetation-index/' \ - 'access'.replace(' ', '') + 'avhrr-land-normalized-difference-vegetation-index/' \ + 'access'.replace(' ', '') @staticmethod def beautiful_soup_url(url: str) -> BeautifulSoup: diff --git a/tests/exporters/test_ndvi.py b/tests/exporters/test_ndvi.py index 860a661cd..cce837c6d 100644 --- a/tests/exporters/test_ndvi.py +++ b/tests/exporters/test_ndvi.py @@ -27,7 +27,7 @@ def test_checkpointing(self, mock_system, tmp_path, capsys): exporter.wget_file(test_filename) captured = capsys.readouterr() - expected_stdout = f'{test_filename} for 1981 already donwloaded!\n' + expected_stdout = f'testy_test.nc for 1981 already donwloaded!\n' assert captured.out == expected_stdout, \ f'Expected stdout to be {expected_stdout}, got {captured.out}' mock_system.assert_not_called(), 'os.system was called! Should have been skipped' @@ -55,7 +55,7 @@ def test_get_filenames(self, request_patch, monkeypatch, tmp_path): 'href="1981/">1981/14-Jul-2019 16:09' # EXPECTED response for second page (1981 all .nc files) - expected_response ='\n\n' \ + expected_response = '\n\n' \ '\n Index of' \ '/data/avhrr-land-normalized-difference-vegetation-index/access/' \ '1981\n\n \n

Index of' \ From 213e0eb707caf27b6780252f10ee200dc57ffa79 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Tue, 23 Jul 2019 13:20:43 +0100 Subject: [PATCH 4/8] update scripts/exporters --- scripts/export.py | 49 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/scripts/export.py b/scripts/export.py index 2bc702ed2..a516c4b41 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -4,8 +4,9 @@ sys.path.append('..') from src.exporters import (ERA5Exporter, VHIExporter, CHIRPSExporter, ERA5ExporterPOS, - GLEAMExporter) - + GLEAMExporter, NDVIExporter, + S5Exporter) +import numpy as np def export_era5(): # if the working directory is alread ml_drought don't need ../data @@ -63,9 +64,53 @@ def export_gleam(): exporter.export(['E', 'SMroot', 'SMsurf'], 'monthly') +def export_ndvi(): + if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought': + data_path = Path('data') + else: + data_path = Path('../data') + + exporter = NDVIExporter(data_path) + exporter.export() + + + +def export_s5( + granularity='hourly', pressure_level=False, + variable='total_precipitation', min_year=1993, + max_year=1994, min_month=1, max_month=12, +): + if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought': + data_path = Path('data') + else: + data_path = Path('../data') + + exporter = S5Exporter( + data_folder=data_path, + granularity=granularity, + pressure_level=pressure_level, + ) + max_leadtime = None + pressure_levels = [200, 500, 925] + selection_request = None + n_parallel_requests = 1 + + exporter.export( + variable=variable, + min_year=min_year, + max_year=max_year, + min_month=min_month, + max_month=max_month, + max_leadtime=max_leadtime, + pressure_levels=pressure_levels, + n_parallel_requests=n_parallel_requests, + ) + if __name__ == '__main__': export_era5() export_vhi() export_chirps() export_era5POS() export_gleam() + export_ndvi() + # export_s5() From 8abff278b89029ee8d4dcf7323aa1b0e0d5a9480 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Tue, 23 Jul 2019 13:21:24 +0100 Subject: [PATCH 5/8] remove scripts/export_s5 --- scripts/export_s5.py | 42 ------------------------------------------ 1 file changed, 42 deletions(-) delete mode 100644 scripts/export_s5.py diff --git a/scripts/export_s5.py b/scripts/export_s5.py deleted file mode 100644 index 86744268a..000000000 --- a/scripts/export_s5.py +++ /dev/null @@ -1,42 +0,0 @@ -import os; import sys -sys.path.append('..') - -from src.exporters.seas5.s5 import (S5Exporter) -from pathlib import Path -import numpy as np - -# %load_ext autoreload -# %autoreload 2 - -data_dir = Path('data') - -granularity = 'hourly' -pressure_level=False - -s5 = S5Exporter( - data_folder=data_dir, - granularity=granularity, - pressure_level=pressure_level, -) - -variable = 'total_precipitation' -min_year = 1993 -max_year = 2014 -min_month = 1 -max_month = 12 -max_leadtime = None -pressure_levels = [200, 500, 925] -selection_request = None -n_parallel_requests = 20 -show_api_request = True - -s5.export( - variable=variable, - min_year=min_year, - max_year=max_year, - min_month=min_month, - max_month=max_month, - max_leadtime=max_leadtime, - pressure_levels=pressure_levels, - n_parallel_requests=n_parallel_requests, -) From 2b83c5f6b1fc71faa147b708c05255de173aaa6c Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Tue, 6 Aug 2019 11:27:45 +0100 Subject: [PATCH 6/8] update tests --- src/exporters/ndvi.py | 2 +- tests/exporters/test_ndvi.py | 86 ++++++------------------------------ 2 files changed, 15 insertions(+), 73 deletions(-) diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py index a8e7602dc..dc23e565d 100644 --- a/src/exporters/ndvi.py +++ b/src/exporters/ndvi.py @@ -94,7 +94,7 @@ def wget_file(self, url) -> None: def export(self, years: Optional[List[int]] = None, parallel_processes: int = 1) -> None: """Export functionality for the NDVI product from AVHRR (NOAA) - 1981 - 2019. + 1981 - 2019 (daily). Arguments ---------- years: Optional list of ints, default = None diff --git a/tests/exporters/test_ndvi.py b/tests/exporters/test_ndvi.py index cce837c6d..98616c7de 100644 --- a/tests/exporters/test_ndvi.py +++ b/tests/exporters/test_ndvi.py @@ -1,8 +1,4 @@ -# from pathlib import Path -from unittest.mock import patch, MagicMock -import urllib.request -# import pytest -import numpy as np +from unittest.mock import patch from src.exporters import NDVIExporter @@ -32,73 +28,19 @@ def test_checkpointing(self, mock_system, tmp_path, capsys): f'Expected stdout to be {expected_stdout}, got {captured.out}' mock_system.assert_not_called(), 'os.system was called! Should have been skipped' - @patch('urllib.request.Request', autospec=True) - def test_get_filenames(self, request_patch, monkeypatch, tmp_path): - # First 1000 characters of the urllib response from the https, - # pulled on July 23 2019 - request_patch.return_value = MagicMock() - - # EXPECTED response for first page (all years) - expected_response = '\n\n \nIndex of' \ - '/data/avhrr-land-normalized-difference-vegetation-index/access\n' \ - '\n \n

Index of' \ - '/data/avhrr-land-normalized-difference-vegetation-index/access

' \ - '\n
\n\n
 NameLast modifiedSizeDescription
 Parent' \ - 'Directory  -' \ - ' 
 1981/14-Jul-2019 16:09' - - # EXPECTED response for second page (1981 all .nc files) - expected_response = '\n\n' \ - '\n Index of' \ - '/data/avhrr-land-normalized-difference-vegetation-index/access/' \ - '1981\n\n \n

Index of' \ - '/data/avhrr-land-normalized-difference-vegetation-index/access/1981' \ - '

\n
\n\n12-Jul-2019 10:37 \n12-Jul-2019 10:37 \n
 NameLast' \ - 'modifiedSizeDescription
 Parent' \ - 'Directory  -' \ - ' 
 ' \ - 'AVHRR-Land_v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc' \ - '51M 
 ' \ - 'AVHRR-Land_v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc' \ - '59M 
  Date: Fri, 25 Oct 2019 16:21:57 +0100 Subject: [PATCH 7/8] bs4 is an optional install --- src/exporters/ndvi.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py index dc23e565d..7630af5e7 100644 --- a/src/exporters/ndvi.py +++ b/src/exporters/ndvi.py @@ -1,6 +1,5 @@ from pathlib import Path from typing import List, Optional -from bs4 import BeautifulSoup import urllib.request import numpy as np import os @@ -9,6 +8,7 @@ import re from .base import BaseExporter +BeautifulSoup = None class NDVIExporter(BaseExporter): @@ -18,6 +18,10 @@ class NDVIExporter(BaseExporter): """ def __init__(self, data_folder: Path = Path('data')) -> None: + global BeautifulSoup + if BeautifulSoup is None: + from bs4 import BeautifulSoup + super().__init__(data_folder) self.ndvi_folder = self.raw_folder / "ndvi" @@ -28,6 +32,7 @@ def __init__(self, data_folder: Path = Path('data')) -> None: 'avhrr-land-normalized-difference-vegetation-index/' \ 'access'.replace(' ', '') + @staticmethod def beautiful_soup_url(url: str) -> BeautifulSoup: # use urllib.request to read the page source From b90124158169199f68cb5d784f9baee3dd3d4266 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Mon, 28 Oct 2019 09:41:01 +0000 Subject: [PATCH 8/8] update mypy errors for bs4 --- src/exporters/ndvi.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py index 7630af5e7..a69c8776b 100644 --- a/src/exporters/ndvi.py +++ b/src/exporters/ndvi.py @@ -34,14 +34,14 @@ def __init__(self, data_folder: Path = Path('data')) -> None: @staticmethod - def beautiful_soup_url(url: str) -> BeautifulSoup: + def beautiful_soup_url(url: str) -> BeautifulSoup: # type: ignore # use urllib.request to read the page source req = urllib.request.Request(url) response = urllib.request.urlopen(req) the_page = response.read() # use BeautifulSoup to parse the html source - soup = BeautifulSoup(the_page, features="lxml") + soup = BeautifulSoup(the_page, features="lxml") # type: ignore return soup @@ -53,7 +53,7 @@ def get_ndvi_url_paths(self, # find all links (to the years) years = [ yrs.string.replace('/', '') - for yrs in soup.find_all('a') + for yrs in soup.find_all('a') # type: ignore if re.match(r'[0-9]{4}', yrs.string) ] @@ -70,7 +70,7 @@ def get_ndvi_url_paths(self, # get the urls for the .nc files all_urls = [] for url in year_urls: - links = self.beautiful_soup_url(url).find_all('a') + links = self.beautiful_soup_url(url).find_all('a') # type: ignore nc_links = [ f'{url}/{l.string}' for l in links