From 380b2634c35b4e3034f25e477806ca71cf207665 Mon Sep 17 00:00:00 2001
From: tommylees112 <thomas.lees112@gmail.com>
Date: Tue, 23 Jul 2019 10:32:40 +0100
Subject: [PATCH 1/8] initial commit

---
 src/exporters/ndvi.py        | 120 +++++++++++++++++++++++++++++++++++
 tests/exporters/test_ndvi.py |   0
 2 files changed, 120 insertions(+)
 create mode 100644 src/exporters/ndvi.py
 create mode 100644 tests/exporters/test_ndvi.py

diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py
new file mode 100644
index 000000000..6fe4dcc27
--- /dev/null
+++ b/src/exporters/ndvi.py
@@ -0,0 +1,120 @@
+from pathlib import Path
+from typing import List, Optional
+from bs4 import BeautifulSoup
+import urllib.request
+import numpy as np
+import os
+import multiprocessing
+
+import re
+
+from .base import BaseExporter
+
+
+class NDVIExporter(BaseExporter):
+    """Exports Normalised Difference Vegetation Index from NOAA
+
+    https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/
+    """
+
+    def __init__(self, data_folder: Path = Path('data')) -> None:
+        super().__init__(data_folder)
+
+        self.ndvi_folder = self.raw_folder / "ndvi"
+        if not self.ndvi_folder.exists():
+            self.ndvi_folder.mkdir()
+
+        self.base_url = 'https://www.ncei.noaa.gov/data/' \
+        'avhrr-land-normalized-difference-vegetation-index/' \
+        'access'.replace(' ', '')
+
+    @staticmethod
+    def beautiful_soup_url(url: str) -> BeautifulSoup:
+        # use urllib.request to read the page source
+        req = urllib.request.Request(url)
+        response = urllib.request.urlopen(req)
+        the_page = response.read()
+
+        # use BeautifulSoup to parse the html source
+        soup = BeautifulSoup(the_page, features="lxml")
+
+        return soup
+
+    def get_ndvi_url_paths(self,
+                           selected_years: Optional[List[int]] = None,
+                           ) -> List[str]:
+        # use BeautifulSoup to parse the html source
+        soup = self.beautiful_soup_url(self.base_url)
+        # find all links (to the years)
+        years = [
+            yrs.string.replace('/', '')
+            for yrs in soup.find_all('a')
+            if re.match(r'[0-9]{4}', yrs.string)
+        ]
+
+        # filter for selected_years
+        if selected_years is not None:
+            years = [y for y in years if y in selected_years]
+
+        # build the year urls
+        year_urls = [
+            f'{self.base_url}/{y}'
+            for y in years
+        ]
+
+        # get the urls for the .nc files
+        all_urls = []
+        for url in year_urls:
+            links = self.beautiful_soup_url(url).find_all('a')
+            nc_links = [
+                f'{url}/{l.string}'
+                for l in links
+                if '.nc' in l.string
+            ]
+            all_urls.extend(nc_links)
+
+        return all_urls
+
+    def wget_file(self, url) -> None:
+        # create year subdirectories
+        year = url.split('/')[-2]
+        out_folder = self.ndvi_folder / year
+        if not out_folder.exists():
+            out_folder.mkdir(parents=True, exist_ok=True)
+
+        # check if file already exists
+        fname = url.split('/')[-1]
+        if (out_folder / fname).exists():
+            print(f'{fname} for {year} already donwloaded!')
+            return
+
+        os.system(f'wget -np -nH {url} -P {out_folder.as_posix()}')
+        print(f'{fname} for {year} downloaded!')
+
+    def export(self, years: Optional[List[int]] = None,
+               parallel_processes: int = 1) -> None:
+        """Export functionality for the NDVI product from AVHRR (NOAA)
+            1981 - 2019.
+        Arguments
+        ----------
+        years: Optional list of ints, default = None
+            The years of data to download. If None, all data will be downloaded
+        parallel_processes: int, default = 1
+            number of processes to parallelize the downloading of data
+        """
+        if years is not None:
+            valid_years = np.arange(1981, 2020)
+            assert np.isin(years, valid_years).all(), \
+                'Expected `years` argument to be in range 1981-2019'
+
+        urls = self.get_ndvi_url_paths(selected_years=years)
+
+        if parallel_processes <= 1:  # sequential
+            for url in urls:
+                self.wget_file(url)
+        else:  # parallel
+            pool = multiprocessing.Pool(processes=parallel_processes)
+            pool.map(self.wget_file, urls)
+
+
+#
diff --git a/tests/exporters/test_ndvi.py b/tests/exporters/test_ndvi.py
new file mode 100644
index 000000000..e69de29bb

From f212db0e563a334a63bacd45411f86df4a1ac93e Mon Sep 17 00:00:00 2001
From: tommylees112 <thomas.lees112@gmail.com>
Date: Tue, 23 Jul 2019 11:51:56 +0100
Subject: [PATCH 2/8] awful attempt at testing (sorry Gabi)

---
 src/exporters/__init__.py    |   4 +-
 src/exporters/ndvi.py        |   2 +-
 tests/exporters/test_ndvi.py | 104 +++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/src/exporters/__init__.py b/src/exporters/__init__.py
index 69edf2c1b..5519c894e 100644
--- a/src/exporters/__init__.py
+++ b/src/exporters/__init__.py
@@ -4,8 +4,10 @@
 from .planetOS import ERA5ExporterPOS
 from .seas5.s5 import S5Exporter
 from .gleam import GLEAMExporter
+from .ndvi import NDVIExporter
 
 __all__ = [
     'ERA5Exporter', 'VHIExporter', 'ERA5ExporterPOS',
-    'CHIRPSExporter', 'S5Exporter', 'GLEAMExporter'
+    'CHIRPSExporter', 'S5Exporter', 'GLEAMExporter',
+    'NDVIExporter',
 ]
diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py
index 6fe4dcc27..38b975bdc 100644
--- a/src/exporters/ndvi.py
+++ b/src/exporters/ndvi.py
@@ -54,7 +54,7 @@ def get_ndvi_url_paths(self,
 
         # filter for selected_years
         if selected_years is not None:
-            years = [y for y in years if y in selected_years]
+            years = [y for y in years if int(y) in selected_years]
 
         # build the year urls
         year_urls = [
diff --git a/tests/exporters/test_ndvi.py b/tests/exporters/test_ndvi.py
index e69de29bb..860a661cd 100644
--- a/tests/exporters/test_ndvi.py
+++ b/tests/exporters/test_ndvi.py
@@ -0,0 +1,104 @@
+# from pathlib import Path
+from unittest.mock import patch, MagicMock
+import urllib.request
+# import pytest
+import numpy as np
+
+from src.exporters import NDVIExporter
+
+
+class TestNDVIExporter:
+    def test_init(self, tmp_path):
+        e = NDVIExporter(tmp_path)
+
+        assert e.ndvi_folder.name == 'ndvi'
+        assert (tmp_path / 'raw' / 'ndvi').exists()
+
+    @patch('os.system', autospec=True)
+    def test_checkpointing(self, mock_system, tmp_path, capsys):
+        # checks we don't redownload files
+        exporter = NDVIExporter(tmp_path)
+
+        # setup the already downloaded file
+        test_filename = '1981/testy_test.nc'
+        (tmp_path / 'raw/ndvi/1981').mkdir(parents=True, exist_ok=True)
+        (tmp_path / f'raw/ndvi/{test_filename}').touch()
+
+        exporter.wget_file(test_filename)
+        captured = capsys.readouterr()
+
+        expected_stdout = f'{test_filename} for 1981 already donwloaded!\n'
+        assert captured.out == expected_stdout, \
+            f'Expected stdout to be {expected_stdout}, got {captured.out}'
+        mock_system.assert_not_called(), 'os.system was called! Should have been skipped'
+
+    @patch('urllib.request.Request', autospec=True)
+    def test_get_filenames(self, request_patch, monkeypatch, tmp_path):
+        # First 1000 characters of the urllib response from the https,
+        # pulled on July 23 2019
+        request_patch.return_value = MagicMock()
+
+        # EXPECTED response for first page (all years)
+        expected_response = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD ' \
+            'HTML 3.2 Final//EN">\n<html>\n <head>\n<title>Index of' \
+            '/data/avhrr-land-normalized-difference-vegetation-index/access</title>\n' \
+            '</head>\n <body>\n<h1>Index of' \
+            '/data/avhrr-land-normalized-difference-vegetation-index/access</h1>' \
+            '\n<table><tr><th>&nbsp;</th><th><a' \
+            'href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a' \
+            'href="?C=S;O=A">Size</a></th><th><a' \
+            'href="?C=D;O=A">Description</a></th></tr><tr><th' \
+            'colspan="5"><hr></th></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
+            'href="/data/avhrr-land-normalized-difference-vegetation-index/">Parent' \
+            'Directory</a></td><td>&nbsp;</td><td align="right">  -' \
+            '</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
+            'href="1981/">1981/</a></td><td align="right">14-Jul-2019 16:09'
+
+        # EXPECTED response for second page (1981 all .nc files)
+        expected_response ='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">\n<html>\n' \
+            '<head>\n  <title>Index of' \
+            '/data/avhrr-land-normalized-difference-vegetation-index/access/' \
+            '1981</title>\n</head>\n <body>\n<h1>Index of' \
+            '/data/avhrr-land-normalized-difference-vegetation-index/access/1981' \
+            '</h1>\n<table><tr><th>&nbsp;</th><th><a' \
+            'href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last' \
+            'modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a' \
+            'href="?C=D;O=A">Description</a></th></tr><tr><th' \
+            'colspan="5"><hr></th></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
+            'href="/data/avhrr-land-normalized-difference-vegetation-index/access/">Parent' \
+            'Directory</a></td><td>&nbsp;</td><td align="right">  -' \
+            '</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
+            'href="AVHRR-Land_v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc">' \
+            'AVHRR-Land_v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc</a></td><td' \
+            'align="right">12-Jul-2019 10:37  </td><td align="right">' \
+            '51M</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
+            'href="AVHRR-Land_v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc">' \
+            'AVHRR-Land_v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc</a></td><td' \
+            'align="right">12-Jul-2019 10:37  </td><td align="right">' \
+            '59M</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a'
+
+        expected_urls = [
+            'https://www.ncei.noaa.gov/data/avhrr-land-normalized-'
+            'difference-vegetation-index/access/1981/AVHRR-Land_'
+            'v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc',
+            'https://www.ncei.noaa.gov/data/avhrr-land-normalized-'
+            'difference-vegetation-index/access/1981/AVHRR-Land_'
+            'v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc',
+        ]
+
+        # HOW TO MOCK beautiful_soup_url function
+        def mockreturn(request):
+            class OpenURL:
+                def read(self):
+                    return expected_response
+            open_url = OpenURL()
+            return open_url
+
+        # i want to patch this ::L34-L36 $ the_page = response.read()
+        monkeypatch.setattr(urllib.request, 'urlopen', mockreturn)
+
+        exporter = NDVIExporter(tmp_path)
+        filenames = exporter.get_ndvi_url_paths(selected_years=np.arange(1981, 1985))
+
+        assert filenames is not None
+        assert expected_urls is not None

From 97e89f3d7b8fd83d6cad4d94093b73d042990e53 Mon Sep 17 00:00:00 2001
From: tommylees112 <thomas.lees112@gmail.com>
Date: Tue, 23 Jul 2019 13:00:55 +0100
Subject: [PATCH 3/8] update flake8

---
 src/exporters/ndvi.py        | 4 ++--
 tests/exporters/test_ndvi.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py
index 38b975bdc..a8e7602dc 100644
--- a/src/exporters/ndvi.py
+++ b/src/exporters/ndvi.py
@@ -25,8 +25,8 @@ def __init__(self, data_folder: Path = Path('data')) -> None:
             self.ndvi_folder.mkdir()
 
         self.base_url = 'https://www.ncei.noaa.gov/data/' \
-        'avhrr-land-normalized-difference-vegetation-index/' \
-        'access'.replace(' ', '')
+            'avhrr-land-normalized-difference-vegetation-index/' \
+            'access'.replace(' ', '')
 
     @staticmethod
     def beautiful_soup_url(url: str) -> BeautifulSoup:
diff --git a/tests/exporters/test_ndvi.py b/tests/exporters/test_ndvi.py
index 860a661cd..cce837c6d 100644
--- a/tests/exporters/test_ndvi.py
+++ b/tests/exporters/test_ndvi.py
@@ -27,7 +27,7 @@ def test_checkpointing(self, mock_system, tmp_path, capsys):
         exporter.wget_file(test_filename)
         captured = capsys.readouterr()
 
-        expected_stdout = f'{test_filename} for 1981 already donwloaded!\n'
+        expected_stdout = f'testy_test.nc for 1981 already donwloaded!\n'
         assert captured.out == expected_stdout, \
             f'Expected stdout to be {expected_stdout}, got {captured.out}'
         mock_system.assert_not_called(), 'os.system was called! Should have been skipped'
@@ -55,7 +55,7 @@ def test_get_filenames(self, request_patch, monkeypatch, tmp_path):
             'href="1981/">1981/</a></td><td align="right">14-Jul-2019 16:09'
 
         # EXPECTED response for second page (1981 all .nc files)
-        expected_response ='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">\n<html>\n' \
+        expected_response = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">\n<html>\n' \
             '<head>\n  <title>Index of' \
             '/data/avhrr-land-normalized-difference-vegetation-index/access/' \
             '1981</title>\n</head>\n <body>\n<h1>Index of' \

From 213e0eb707caf27b6780252f10ee200dc57ffa79 Mon Sep 17 00:00:00 2001
From: tommylees112 <thomas.lees112@gmail.com>
Date: Tue, 23 Jul 2019 13:20:43 +0100
Subject: [PATCH 4/8] update scripts/exporters

---
 scripts/export.py | 49 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/scripts/export.py b/scripts/export.py
index 2bc702ed2..a516c4b41 100644
--- a/scripts/export.py
+++ b/scripts/export.py
@@ -4,8 +4,9 @@
 sys.path.append('..')
 from src.exporters import (ERA5Exporter, VHIExporter,
                            CHIRPSExporter, ERA5ExporterPOS,
-                           GLEAMExporter)
-
+                           GLEAMExporter, NDVIExporter,
+                           S5Exporter)
+import numpy as np
 
 def export_era5():
     # if the working directory is alread ml_drought don't need ../data
@@ -63,9 +64,53 @@ def export_gleam():
     exporter.export(['E', 'SMroot', 'SMsurf'], 'monthly')
 
 
+def export_ndvi():
+    if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
+        data_path = Path('data')
+    else:
+        data_path = Path('../data')
+
+    exporter = NDVIExporter(data_path)
+    exporter.export()
+
+
+
+def export_s5(
+    granularity='hourly', pressure_level=False,
+    variable='total_precipitation', min_year=1993,
+    max_year=1994, min_month=1, max_month=12,
+):
+    if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
+        data_path = Path('data')
+    else:
+        data_path = Path('../data')
+
+    exporter = S5Exporter(
+        data_folder=data_path,
+        granularity=granularity,
+        pressure_level=pressure_level,
+    )
+    max_leadtime = None
+    pressure_levels = [200, 500, 925]
+    selection_request = None
+    n_parallel_requests = 1
+
+    exporter.export(
+        variable=variable,
+        min_year=min_year,
+        max_year=max_year,
+        min_month=min_month,
+        max_month=max_month,
+        max_leadtime=max_leadtime,
+        pressure_levels=pressure_levels,
+        n_parallel_requests=n_parallel_requests,
+    )
+
 if __name__ == '__main__':
     export_era5()
     export_vhi()
     export_chirps()
     export_era5POS()
     export_gleam()
+    export_ndvi()
+    # export_s5()

From 8abff278b89029ee8d4dcf7323aa1b0e0d5a9480 Mon Sep 17 00:00:00 2001
From: tommylees112 <thomas.lees112@gmail.com>
Date: Tue, 23 Jul 2019 13:21:24 +0100
Subject: [PATCH 5/8] remove scripts/export_s5

---
 scripts/export_s5.py | 42 ------------------------------------------
 1 file changed, 42 deletions(-)
 delete mode 100644 scripts/export_s5.py

diff --git a/scripts/export_s5.py b/scripts/export_s5.py
deleted file mode 100644
index 86744268a..000000000
--- a/scripts/export_s5.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import os; import sys
-sys.path.append('..')
-
-from src.exporters.seas5.s5 import (S5Exporter)
-from pathlib import Path
-import numpy as np
-
-# %load_ext autoreload
-# %autoreload 2
-
-data_dir = Path('data')
-
-granularity = 'hourly'
-pressure_level=False
-
-s5 = S5Exporter(
-    data_folder=data_dir,
-    granularity=granularity,
-    pressure_level=pressure_level,
-)
-
-variable = 'total_precipitation'
-min_year = 1993
-max_year = 2014
-min_month = 1
-max_month = 12
-max_leadtime = None
-pressure_levels = [200, 500, 925]
-selection_request = None
-n_parallel_requests = 20
-show_api_request = True
-
-s5.export(
-    variable=variable,
-    min_year=min_year,
-    max_year=max_year,
-    min_month=min_month,
-    max_month=max_month,
-    max_leadtime=max_leadtime,
-    pressure_levels=pressure_levels,
-    n_parallel_requests=n_parallel_requests,
-)

From 2b83c5f6b1fc71faa147b708c05255de173aaa6c Mon Sep 17 00:00:00 2001
From: tommylees112 <thomas.lees112@gmail.com>
Date: Tue, 6 Aug 2019 11:27:45 +0100
Subject: [PATCH 6/8] update tests

---
 src/exporters/ndvi.py        |  2 +-
 tests/exporters/test_ndvi.py | 86 ++++++------------------------------
 2 files changed, 15 insertions(+), 73 deletions(-)

diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py
index a8e7602dc..dc23e565d 100644
--- a/src/exporters/ndvi.py
+++ b/src/exporters/ndvi.py
@@ -94,7 +94,7 @@ def wget_file(self, url) -> None:
     def export(self, years: Optional[List[int]] = None,
                parallel_processes: int = 1) -> None:
         """Export functionality for the NDVI product from AVHRR (NOAA)
-            1981 - 2019.
+            1981 - 2019 (daily).
         Arguments
         ----------
         years: Optional list of ints, default = None
diff --git a/tests/exporters/test_ndvi.py b/tests/exporters/test_ndvi.py
index cce837c6d..98616c7de 100644
--- a/tests/exporters/test_ndvi.py
+++ b/tests/exporters/test_ndvi.py
@@ -1,8 +1,4 @@
-# from pathlib import Path
-from unittest.mock import patch, MagicMock
-import urllib.request
-# import pytest
-import numpy as np
+from unittest.mock import patch
 
 from src.exporters import NDVIExporter
 
@@ -32,73 +28,19 @@ def test_checkpointing(self, mock_system, tmp_path, capsys):
             f'Expected stdout to be {expected_stdout}, got {captured.out}'
         mock_system.assert_not_called(), 'os.system was called! Should have been skipped'
 
-    @patch('urllib.request.Request', autospec=True)
-    def test_get_filenames(self, request_patch, monkeypatch, tmp_path):
-        # First 1000 characters of the urllib response from the https,
-        # pulled on July 23 2019
-        request_patch.return_value = MagicMock()
-
-        # EXPECTED response for first page (all years)
-        expected_response = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD ' \
-            'HTML 3.2 Final//EN">\n<html>\n <head>\n<title>Index of' \
-            '/data/avhrr-land-normalized-difference-vegetation-index/access</title>\n' \
-            '</head>\n <body>\n<h1>Index of' \
-            '/data/avhrr-land-normalized-difference-vegetation-index/access</h1>' \
-            '\n<table><tr><th>&nbsp;</th><th><a' \
-            'href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a' \
-            'href="?C=S;O=A">Size</a></th><th><a' \
-            'href="?C=D;O=A">Description</a></th></tr><tr><th' \
-            'colspan="5"><hr></th></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
-            'href="/data/avhrr-land-normalized-difference-vegetation-index/">Parent' \
-            'Directory</a></td><td>&nbsp;</td><td align="right">  -' \
-            '</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
-            'href="1981/">1981/</a></td><td align="right">14-Jul-2019 16:09'
-
-        # EXPECTED response for second page (1981 all .nc files)
-        expected_response = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">\n<html>\n' \
-            '<head>\n  <title>Index of' \
-            '/data/avhrr-land-normalized-difference-vegetation-index/access/' \
-            '1981</title>\n</head>\n <body>\n<h1>Index of' \
-            '/data/avhrr-land-normalized-difference-vegetation-index/access/1981' \
-            '</h1>\n<table><tr><th>&nbsp;</th><th><a' \
-            'href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last' \
-            'modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a' \
-            'href="?C=D;O=A">Description</a></th></tr><tr><th' \
-            'colspan="5"><hr></th></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
-            'href="/data/avhrr-land-normalized-difference-vegetation-index/access/">Parent' \
-            'Directory</a></td><td>&nbsp;</td><td align="right">  -' \
-            '</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
-            'href="AVHRR-Land_v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc">' \
-            'AVHRR-Land_v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc</a></td><td' \
-            'align="right">12-Jul-2019 10:37  </td><td align="right">' \
-            '51M</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a' \
-            'href="AVHRR-Land_v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc">' \
-            'AVHRR-Land_v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc</a></td><td' \
-            'align="right">12-Jul-2019 10:37  </td><td align="right">' \
-            '59M</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a'
-
-        expected_urls = [
-            'https://www.ncei.noaa.gov/data/avhrr-land-normalized-'
-            'difference-vegetation-index/access/1981/AVHRR-Land_'
-            'v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc',
-            'https://www.ncei.noaa.gov/data/avhrr-land-normalized-'
-            'difference-vegetation-index/access/1981/AVHRR-Land_'
-            'v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc',
-        ]
-
-        # HOW TO MOCK beautiful_soup_url function
-        def mockreturn(request):
-            class OpenURL:
-                def read(self):
-                    return expected_response
-            open_url = OpenURL()
-            return open_url
+    @patch('os.system')
+    def test_beautiful_soup_regex_parse(self, mock_system, tmp_path):
+        exporter = NDVIExporter(tmp_path)
+        files = exporter.get_ndvi_url_paths(selected_years=[1981])
 
-        # i want to patch this ::L34-L36 $ the_page = response.read()
-        monkeypatch.setattr(urllib.request, 'urlopen', mockreturn)
+        # check that every returned URL points to a netCDF (.nc) file
+        assert all([f[-3:] == '.nc' for f in files])
 
-        exporter = NDVIExporter(tmp_path)
-        filenames = exporter.get_ndvi_url_paths(selected_years=np.arange(1981, 1985))
+        # check base of string
+        base_url_str = 'https://www.ncei.noaa.gov/data/' \
+            'avhrr-land-normalized-difference-vegetation-index/access/1981/'
+        assert all([f.split('AVHRR')[0] == base_url_str for f in files])
 
-        assert filenames is not None
-        assert expected_urls is not None
+        # check got 31 December
+        timestamp = '19811231'
+        assert files[-1].split('_')[-2] == timestamp

From 7066c325e9ecf840bf03952c637f1b6fa97a32e2 Mon Sep 17 00:00:00 2001
From: tommylees112 <thomas.lees112@gmail.com>
Date: Fri, 25 Oct 2019 16:21:57 +0100
Subject: [PATCH 7/8] bs4 is an optional install

---
 src/exporters/ndvi.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py
index dc23e565d..7630af5e7 100644
--- a/src/exporters/ndvi.py
+++ b/src/exporters/ndvi.py
@@ -1,6 +1,5 @@
 from pathlib import Path
 from typing import List, Optional
-from bs4 import BeautifulSoup
 import urllib.request
 import numpy as np
 import os
@@ -9,6 +8,7 @@
 import re
 
 from .base import BaseExporter
+BeautifulSoup = None
 
 
 class NDVIExporter(BaseExporter):
@@ -18,6 +18,10 @@ class NDVIExporter(BaseExporter):
     """
 
     def __init__(self, data_folder: Path = Path('data')) -> None:
+        global BeautifulSoup
+        if BeautifulSoup is None:
+            from bs4 import BeautifulSoup
+
         super().__init__(data_folder)
 
         self.ndvi_folder = self.raw_folder / "ndvi"
@@ -28,6 +32,7 @@ def __init__(self, data_folder: Path = Path('data')) -> None:
             'avhrr-land-normalized-difference-vegetation-index/' \
             'access'.replace(' ', '')
 
+
     @staticmethod
     def beautiful_soup_url(url: str) -> BeautifulSoup:
         # use urllib.request to read the page source

From b90124158169199f68cb5d784f9baee3dd3d4266 Mon Sep 17 00:00:00 2001
From: tommylees112 <thomas.lees112@gmail.com>
Date: Mon, 28 Oct 2019 09:41:01 +0000
Subject: [PATCH 8/8] update mypy errors for bs4

---
 src/exporters/ndvi.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py
index 7630af5e7..a69c8776b 100644
--- a/src/exporters/ndvi.py
+++ b/src/exporters/ndvi.py
@@ -34,14 +34,14 @@ def __init__(self, data_folder: Path = Path('data')) -> None:
 
 
     @staticmethod
-    def beautiful_soup_url(url: str) -> BeautifulSoup:
+    def beautiful_soup_url(url: str) -> BeautifulSoup:  # type: ignore
         # use urllib.request to read the page source
         req = urllib.request.Request(url)
         response = urllib.request.urlopen(req)
         the_page = response.read()
 
         # use BeautifulSoup to parse the html source
-        soup = BeautifulSoup(the_page, features="lxml")
+        soup = BeautifulSoup(the_page, features="lxml")  # type: ignore
 
         return soup
 
@@ -53,7 +53,7 @@ def get_ndvi_url_paths(self,
         # find all links (to the years)
         years = [
             yrs.string.replace('/', '')
-            for yrs in soup.find_all('a')
+            for yrs in soup.find_all('a')  # type: ignore
             if re.match(r'[0-9]{4}', yrs.string)
         ]
 
@@ -70,7 +70,7 @@ def get_ndvi_url_paths(self,
         # get the urls for the .nc files
         all_urls = []
         for url in year_urls:
-            links = self.beautiful_soup_url(url).find_all('a')
+            links = self.beautiful_soup_url(url).find_all('a')  # type: ignore
             nc_links = [
                 f'{url}/{l.string}'
                 for l in links