From 43d0bc3f11ba61eb09a0fc3e6370fd5fb0e5689a Mon Sep 17 00:00:00 2001 From: Jeff Osundwa Date: Tue, 7 Jan 2025 00:01:43 +0300 Subject: [PATCH 1/5] Add DCAS Farm Registry ingestor --- .gitignore | 2 + default.nix | 3 +- django_project/gap/ingestor/farm_registry.py | 129 +++++++++++++++++++ django_project/gap/models/ingestor.py | 5 + 4 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 django_project/gap/ingestor/farm_registry.py diff --git a/.gitignore b/.gitignore index 15196518..512b4953 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ docs/mkdocs.yml # locust auth files locust_auth.json + +.vscode-extensions/ \ No newline at end of file diff --git a/default.nix b/default.nix index ee1388fe..81c9a7ce 100644 --- a/default.nix +++ b/default.nix @@ -2,7 +2,7 @@ with import { }; let # For packages pinned to a specific version - pinnedHash = "933d7dc155096e7575d207be6fb7792bc9f34f6d"; + pinnedHash = "24.05"; pinnedPkgs = import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/${pinnedHash}.tar.gz") { }; pythonPackages = pinnedPkgs.python3Packages; in pkgs.mkShell rec { @@ -41,6 +41,7 @@ in pkgs.mkShell rec { pinnedPkgs.zlib pinnedPkgs.gnused pinnedPkgs.rpl + pinnedPkgs.vscode ]; # Run this command, only after creating the virtual environment diff --git a/django_project/gap/ingestor/farm_registry.py b/django_project/gap/ingestor/farm_registry.py new file mode 100644 index 00000000..60da11fb --- /dev/null +++ b/django_project/gap/ingestor/farm_registry.py @@ -0,0 +1,129 @@ +import csv +import os +import shutil +import uuid +import tempfile +import zipfile +from datetime import datetime, timezone +from django.contrib.gis.geos import Point +import logging +from gap.ingestor.base import BaseIngestor +from gap.models import ( + Farm, Crop, FarmRegistry, FarmRegistryGroup, Country, + IngestorSession, IngestorSessionProgress, IngestorSessionStatus, +) +from django.db import transaction + +logger = logging.getLogger(__name__) + +class DCASFarmRegistryIngestor(BaseIngestor): + """Ingestor for DCAS Farmer Registry data.""" + + def __init__(self, session: IngestorSession, working_dir='/tmp'): + """Initialize the ingestor with session and working directory. + + :param session: Ingestor session object + :type session: IngestorSession + :param working_dir: Directory to extract ZIP files temporarily + :type working_dir: str, optional + """ + super().__init__(session, working_dir) + + # Initialize the FarmRegistryGroup model + self.group_model = FarmRegistryGroup + + # Placeholder for the group created during this session + self.group = None + + def _extract_zip_file(self): + """Extract the ZIP file to a temporary directory.""" + dir_path = os.path.join(self.working_dir, str(uuid.uuid4())) + os.makedirs(dir_path, exist_ok=True) + + with self.session.file.open('rb') as zip_file: + with tempfile.NamedTemporaryFile(delete=False, dir=self.working_dir) as tmp_file: + tmp_file.write(zip_file.read()) + tmp_file_path = tmp_file.name + + with zipfile.ZipFile(tmp_file_path, 'r') as zip_ref: + zip_ref.extractall(dir_path) + + os.remove(tmp_file_path) + return dir_path + + def _create_registry_group(self): + """Create a new FarmRegistryGroup.""" + self.group = FarmRegistryGroup.objects.create( + date_time=datetime.now(timezone.utc), + country=self.session.country, + is_latest=True + ) + # Mark previous groups as not latest for this country (if applicable) + if self.session.country: + FarmRegistryGroup.objects.filter(country=self.session.country).exclude(id=self.group.id).update(is_latest=False) + + def _process_row(self, row): + """Process a single row from the input file.""" + try: + # Parse latitude and longitude to create a geometry point + latitude = float(row['FinalLatitude']) + longitude = float(row['FinalLongitude']) + point = Point(x=longitude, y=latitude, srid=4326) + + # Get or create the Farm instance + farm, _ = Farm.objects.get_or_create( + code=row['FarmerId'], + defaults={ + 'geometry': point + } + ) + + # Get the Crop instance + crop, _ = Crop.objects.get_or_create( + name=row['CropName'] + ) + + # Parse the planting date + planting_date = datetime.strptime(row['PlantingDate'], '%m/%d/%Y').date() + + # Create the FarmRegistry entry + FarmRegistry.objects.update_or_create( + group=self.group, + farm=farm, + crop=crop, + planting_date=planting_date, + defaults={ + # Add additional fields if required here + } + ) + + except Exception as e: + logger.error(f"Error processing row: {row} - {e}") + + def _run(self, dir_path): + """Run the ingestion logic.""" + self._create_registry_group() + logger.info(f"Created new registry group: {self.group.id}") + + for file_name in os.listdir(dir_path): + if file_name.endswith('.csv'): + file_path = os.path.join(dir_path, file_name) + with open(file_path, 'r') as file: + reader = csv.DictReader(file) + with transaction.atomic(): + for row in reader: + self._process_row(row) + break + else: + raise FileNotFoundError("No CSV file found in the extracted ZIP.") + + def run(self): + """Run the ingestion process.""" + if not self.session.file: + raise FileNotFoundError("No file found in the session.") + + dir_path = self._extract_zip_file() + try: + self._run(dir_path) + finally: + shutil.rmtree(dir_path) diff --git a/django_project/gap/models/ingestor.py b/django_project/gap/models/ingestor.py index de2db1c3..284e96d3 100644 --- a/django_project/gap/models/ingestor.py +++ b/django_project/gap/models/ingestor.py @@ -38,6 +38,7 @@ class IngestorType: CABI_PRISE_EXCEL = 'Cabi Prise Excel' CBAM_BIAS_ADJUST = 'CBAM Bias Adjusted' DCAS_RULE = 'DCAS Rules' + FARM_REGISTRY = 'Farm Registry' class IngestorSessionStatus: @@ -82,6 +83,7 @@ class Meta: # noqa: D106 ), (IngestorType.CBAM_BIAS_ADJUST, IngestorType.CBAM_BIAS_ADJUST), (IngestorType.DCAS_RULE, IngestorType.DCAS_RULE), + (IngestorType.FARM_REGISTRY, IngestorType.FARM_REGISTRY), ), max_length=512 ) @@ -208,6 +210,7 @@ def _run(self, working_dir): from gap.ingestor.cabi_prise import CabiPriseIngestor from gap.ingestor.cbam_bias_adjust import CBAMBiasAdjustIngestor from gap.ingestor.dcas_rule import DcasRuleIngestor + from gap.ingestor.farm_registry import DCASFarmRegistryIngestor ingestor = None if self.ingestor_type == IngestorType.TAHMO: @@ -234,6 +237,8 @@ def _run(self, working_dir): ingestor = CBAMBiasAdjustIngestor elif self.ingestor_type == IngestorType.DCAS_RULE: ingestor = DcasRuleIngestor + elif self.ingestor_type == IngestorType.FARM_REGISTRY: + ingestor = DCASFarmRegistryIngestor if ingestor: ingestor(self, working_dir).run() From d44d16ae098c7dca7cbef74b87eb466236c3918b Mon Sep 17 00:00:00 2001 From: Jeff Osundwa Date: Wed, 8 Jan 2025 08:29:45 +0300 Subject: [PATCH 2/5] Add DCAS Farm Registry ingestor tests --- django_project/gap/ingestor/farm_registry.py | 23 +++---- .../data/farm_registry/invalid_test.zip | Bin 0 -> 247 bytes .../data/farm_registry/test_farm_registry.zip | Bin 0 -> 227 bytes .../gap/tests/ingestor/test_farm_registry.py | 61 ++++++++++++++++++ 4 files changed, 71 insertions(+), 13 deletions(-) create mode 100644 django_project/gap/tests/ingestor/data/farm_registry/invalid_test.zip create mode 100644 django_project/gap/tests/ingestor/data/farm_registry/test_farm_registry.zip create mode 100644 django_project/gap/tests/ingestor/test_farm_registry.py diff --git a/django_project/gap/ingestor/farm_registry.py b/django_project/gap/ingestor/farm_registry.py index 60da11fb..cd31ddad 100644 --- a/django_project/gap/ingestor/farm_registry.py +++ b/django_project/gap/ingestor/farm_registry.py @@ -9,13 +9,14 @@ import logging from gap.ingestor.base import BaseIngestor from gap.models import ( - Farm, Crop, FarmRegistry, FarmRegistryGroup, Country, - IngestorSession, IngestorSessionProgress, IngestorSessionStatus, + Farm, Crop, FarmRegistry, FarmRegistryGroup, + IngestorSession, ) from django.db import transaction logger = logging.getLogger(__name__) + class DCASFarmRegistryIngestor(BaseIngestor): """Ingestor for DCAS Farmer Registry data.""" @@ -41,7 +42,9 @@ def _extract_zip_file(self): os.makedirs(dir_path, exist_ok=True) with self.session.file.open('rb') as zip_file: - with tempfile.NamedTemporaryFile(delete=False, dir=self.working_dir) as tmp_file: + with tempfile.NamedTemporaryFile( + delete=False, dir=self.working_dir) as tmp_file: + tmp_file.write(zip_file.read()) tmp_file_path = tmp_file.name @@ -53,14 +56,10 @@ def _extract_zip_file(self): def _create_registry_group(self): """Create a new FarmRegistryGroup.""" - self.group = FarmRegistryGroup.objects.create( + self.group = self.group_model.objects.create( date_time=datetime.now(timezone.utc), - country=self.session.country, is_latest=True ) - # Mark previous groups as not latest for this country (if applicable) - if self.session.country: - FarmRegistryGroup.objects.filter(country=self.session.country).exclude(id=self.group.id).update(is_latest=False) def _process_row(self, row): """Process a single row from the input file.""" @@ -72,7 +71,7 @@ def _process_row(self, row): # Get or create the Farm instance farm, _ = Farm.objects.get_or_create( - code=row['FarmerId'], + unique_id=row['FarmerId'], defaults={ 'geometry': point } @@ -84,7 +83,8 @@ def _process_row(self, row): ) # Parse the planting date - planting_date = datetime.strptime(row['PlantingDate'], '%m/%d/%Y').date() + planting_date = datetime.strptime( + row['PlantingDate'], '%m/%d/%Y').date() # Create the FarmRegistry entry FarmRegistry.objects.update_or_create( @@ -92,9 +92,6 @@ def _process_row(self, row): farm=farm, crop=crop, planting_date=planting_date, - defaults={ - # Add additional fields if required here - } ) except Exception as e: diff --git a/django_project/gap/tests/ingestor/data/farm_registry/invalid_test.zip b/django_project/gap/tests/ingestor/data/farm_registry/invalid_test.zip new file mode 100644 index 0000000000000000000000000000000000000000..b4fe30ade50b27ad69ea02d0823cb8329772808b GIT binary patch literal 247 zcmWIWW@Zs#00DQUdlL|f{CY!K!K;>^6V#GK3&z2xFDx5T2{)FRIm9kboGpkZ{j0}uSbqx%GD8QSMNrV} Date: Sun, 12 Jan 2025 22:19:20 +0300 Subject: [PATCH 3/5] PR fixes for farm registry --- .github/workflows/tests.yaml | 2 +- django_project/gap/ingestor/farm_registry.py | 56 ++++++++++++++++++-- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 2e2c25c5..4fd8fd35 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -15,7 +15,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v1 with: - python-version: 3.7 + python-version: 3.12 architecture: x64 - name: Checkout PyTorch uses: actions/checkout@master diff --git a/django_project/gap/ingestor/farm_registry.py b/django_project/gap/ingestor/farm_registry.py index cd31ddad..cbbe42c2 100644 --- a/django_project/gap/ingestor/farm_registry.py +++ b/django_project/gap/ingestor/farm_registry.py @@ -8,15 +8,53 @@ from django.contrib.gis.geos import Point import logging from gap.ingestor.base import BaseIngestor +from gap.ingestor.exceptions import ( + FileNotFoundException, FileIsNotCorrectException, +) from gap.models import ( Farm, Crop, FarmRegistry, FarmRegistryGroup, - IngestorSession, + IngestorSession, CropStageType ) from django.db import transaction +from django.db.models import Q logger = logging.getLogger(__name__) +class Keys: + """Keys for the data.""" + + CROP = 'crop' + PARAMETER = 'parameter' + GROWTH_STAGE = 'growth_stage' + MIN_RANGE = 'min_range' + MAX_RANGE = 'max_range' + CODE = 'code' + + @staticmethod + def check_columns(df) -> bool: + """Check if all columns exist in dataframe. + + :param df: dataframe from csv + :type df: pd.DataFrame + :raises FileIsNotCorrectException: When column is missing + """ + keys = [ + Keys.CROP, Keys.PARAMETER, Keys.GROWTH_STAGE, + Keys.MIN_RANGE, Keys.MAX_RANGE, Keys.CODE + ] + + missing = [] + for key in keys: + if key not in df.columns: + missing.append(key) + + if missing: + raise FileIsNotCorrectException( + f'Column(s) missing: {",".join(missing)}' + ) + + class DCASFarmRegistryIngestor(BaseIngestor): """Ingestor for DCAS Farmer Registry data.""" @@ -77,9 +115,17 @@ def _process_row(self, row): } ) - # Get the Crop instance + # get crop and stage type + crop_with_stage = row[Keys.CROP].lower().split('_') crop, _ = Crop.objects.get_or_create( - name=row['CropName'] + name__iexact=crop_with_stage[0], + defaults={ + 'name': crop_with_stage[0].title() + } + ) + stage_type = CropStageType.objects.get( + Q(name__iexact=crop_with_stage[1]) | + Q(alias__iexact=crop_with_stage[1]) ) # Parse the planting date @@ -91,6 +137,7 @@ def _process_row(self, row): group=self.group, farm=farm, crop=crop, + crop_stage_type=stage_type, planting_date=planting_date, ) @@ -117,8 +164,7 @@ def _run(self, dir_path): def run(self): """Run the ingestion process.""" if not self.session.file: - raise FileNotFoundError("No file found in the session.") - + raise FileNotFoundException("No file found for ingestion.") dir_path = self._extract_zip_file() try: self._run(dir_path) From 10a273b18dce03b6731c2b56a6a15fb0e3c1d5f9 Mon Sep 17 00:00:00 2001 From: Jeff Osundwa Date: Sun, 12 Jan 2025 22:38:50 +0300 Subject: [PATCH 4/5] fix public docstring --- django_project/gap/ingestor/farm_registry.py | 7 +++++++ django_project/gap/tests/ingestor/test_farm_registry.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/django_project/gap/ingestor/farm_registry.py b/django_project/gap/ingestor/farm_registry.py index cbbe42c2..9887627b 100644 --- a/django_project/gap/ingestor/farm_registry.py +++ b/django_project/gap/ingestor/farm_registry.py @@ -1,3 +1,10 @@ +# coding=utf-8 +""" +Tomorrow Now GAP + +.. note:: Ingestor for DCAS Farmer Registry data +""" + import csv import os import shutil diff --git a/django_project/gap/tests/ingestor/test_farm_registry.py b/django_project/gap/tests/ingestor/test_farm_registry.py index c7ce7026..148bfa07 100644 --- a/django_project/gap/tests/ingestor/test_farm_registry.py +++ b/django_project/gap/tests/ingestor/test_farm_registry.py @@ -1,3 +1,10 @@ +# coding: utf-8 +""" +Tomorrow Now GAP + +.. note:: Unit tests for DCASFarmRegistryIngestor. +""" + import os import logging from django.core.files.uploadedfile import SimpleUploadedFile From e55609eafa7c4bfe94ac6f09eacc088faaeb5a9e Mon Sep 17 00:00:00 2001 From: Jeff Osundwa Date: Sun, 12 Jan 2025 22:41:18 +0300 Subject: [PATCH 5/5] fix docstring 'period' --- django_project/gap/ingestor/farm_registry.py | 2 +- django_project/gap/tests/ingestor/test_farm_registry.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/django_project/gap/ingestor/farm_registry.py b/django_project/gap/ingestor/farm_registry.py index 9887627b..1d1d6ed2 100644 --- a/django_project/gap/ingestor/farm_registry.py +++ b/django_project/gap/ingestor/farm_registry.py @@ -1,6 +1,6 @@ # coding=utf-8 """ -Tomorrow Now GAP +Tomorrow Now GAP. .. note:: Ingestor for DCAS Farmer Registry data """ diff --git a/django_project/gap/tests/ingestor/test_farm_registry.py b/django_project/gap/tests/ingestor/test_farm_registry.py index 148bfa07..9d90e68a 100644 --- a/django_project/gap/tests/ingestor/test_farm_registry.py +++ b/django_project/gap/tests/ingestor/test_farm_registry.py @@ -1,6 +1,6 @@ # coding: utf-8 """ -Tomorrow Now GAP +Tomorrow Now GAP. .. note:: Unit tests for DCASFarmRegistryIngestor. """