Skip to content

Commit

Permalink
Use both location name and code for individual upload
Browse files Browse the repository at this point in the history
To minimize ambiguous location issues
  • Loading branch information
weilu committed Nov 25, 2024
1 parent 263b4ba commit 6f83281
Show file tree
Hide file tree
Showing 13 changed files with 83 additions and 57 deletions.
2 changes: 1 addition & 1 deletion individual/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
'json_ext.educated_level'
],
"individual_base_fields": [
'first_name', 'last_name', 'dob', 'location_name', 'id'
'first_name', 'last_name', 'dob', 'location_name', 'location_code', 'id'
]
}

Expand Down
3 changes: 2 additions & 1 deletion individual/management/commands/fake_individuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def generate_fake_individual(group_code, recipient_info, individual_role, locati
"number_of_elderly": fake.random_int(min=0, max=5),
"number_of_children": fake.random_int(min=0, max=10),
"beneficiary_data_source": fake.company(),
"location_name": location.name if location else ""
"location_name": location.name if location else "",
"location_code": location.code if location else "",
}


Expand Down
63 changes: 37 additions & 26 deletions individual/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,9 +584,9 @@ def process_chunk(
chunk,
properties,
unique_validations,
loc_name_dist_ids_from_db,
loc_name_code_district_ids_from_db,
user_allowed_loc_ids,
duplicate_village_names,
duplicate_village_name_code_tuples,
):
validated_dataframe = []
check_location = 'location_name' in chunk.columns
Expand All @@ -607,9 +607,10 @@ def process_chunk(
field_validation['validations']['location_name'] = (
IndividualImportService._validate_location(
row.location_name,
loc_name_dist_ids_from_db,
row.location_code,
loc_name_code_district_ids_from_db,
user_allowed_loc_ids,
duplicate_village_names,
duplicate_village_name_code_tuples,
)
)

Expand All @@ -632,62 +633,72 @@ def _validate_possible_individuals(self, dataframe: DataFrame, upload_id: uuid):
check_location = 'location_name' in dataframe.columns
if check_location:
# Issue a single DB query instead of per row for efficiency
loc_name_dist_ids_from_db = self._query_location_district_ids(dataframe.location_name)
loc_name_code_district_ids_from_db = self._query_location_district_ids(dataframe)
user_allowed_loc_ids = LocationManager().get_allowed_ids(self.user)
duplicate_village_names = self._query_dupcliate_village_names()
duplicate_village_name_code_tuples = self._query_duplicate_village_name_code()
else:
loc_name_dist_ids_from_db = None
loc_name_code_district_ids_from_db = None
user_allowed_loc_ids = None
duplicate_village_names = None
duplicate_village_name_code_tuples = None

# TODO: Use ProcessPoolExecutor after resolving django dependency loading issue
validated_dataframe = IndividualImportService.process_chunk(
dataframe,
properties,
unique_validations,
loc_name_dist_ids_from_db,
loc_name_code_district_ids_from_db,
user_allowed_loc_ids,
duplicate_village_names,
duplicate_village_name_code_tuples,
)

self.save_validation_error_in_data_source_bulk(validated_dataframe)
invalid_items = fetch_summary_of_broken_items(upload_id)
return validated_dataframe, invalid_items

@staticmethod
def _query_location_district_ids(df):
    """Map each (location_name, location_code) pair in the upload to its district id.

    Issues a single DB query for all unique (name, code) pairs in the dataframe
    instead of one query per row, for efficiency.

    Args:
        df: pandas DataFrame with 'location_name' and 'location_code' columns.

    Returns:
        dict mapping (name, code) tuples to the district id of the matching
        village-type Location (loc.parent.parent — village -> ward -> district;
        assumes a two-level parent chain exists for every matched village).
    """
    # Deduplicate first so the OR-query has one clause per distinct pair,
    # not one per row.
    unique_tuples = df[['location_name', 'location_code']].drop_duplicates()
    query = Q()
    for _, row in unique_tuples.iterrows():
        # OR together exact (name, code) matches; both must match the same row.
        query |= Q(name=row['location_name'], code=row['location_code'])
    locations = Location.objects.filter(type="V").filter(query)
    return {(loc.name, loc.code): loc.parent.parent.id for loc in locations}

@staticmethod
def _query_duplicate_village_name_code():
    """Return (name, code) pairs shared by more than one village in the system.

    Such pairs are ambiguous: even with both name and code supplied, the
    upload cannot determine which village is meant, so rows referencing them
    must be rejected by _validate_location.

    Returns:
        QuerySet of (name, code) tuples, one per ambiguous pair.
    """
    return (
        Location.objects
        .filter(type="V")
        # values() before annotate() groups by (name, code) ...
        .values('name', 'code')
        # ... so Count('id') counts villages sharing that exact pair.
        .annotate(name_count=Count('id'))
        .filter(name_count__gt=1)
        .values_list('name', 'code')
    )

@staticmethod
def _validate_location(location_name, loc_name_dist_ids_from_db, user_allowed_loc_ids, duplicate_village_names):
def _validate_location(
location_name,
location_code,
loc_name_code_district_ids_from_db,
user_allowed_loc_ids,
duplicate_village_name_code_tuples
):
result = {
'field_name': 'location_name',
}
if location_name is None or location_name == "":
if (location_name is None or location_name == "") and (location_code is None or location_code == ""):
result['success'] = True
elif loc_name_dist_ids_from_db is None and user_allowed_loc_ids is None:
elif loc_name_code_district_ids_from_db is None and user_allowed_loc_ids is None:
result['success'] = True
elif location_name not in loc_name_dist_ids_from_db:
elif (location_name, location_code) not in loc_name_code_district_ids_from_db:
result['success'] = False
result['note'] = f"'location_name' value '{location_name}' is not a valid location name. Please check the spelling against the list of locations in the system."
elif location_name in duplicate_village_names:
result['note'] = f"Location with name '{location_name}' and code '{location_code}' is not valid. Please check the spelling against the list of locations in the system."
elif (location_name, location_code) in duplicate_village_name_code_tuples:
result['success'] = False
result['note'] = f"'location_name' value '{location_name}' is ambiguous, because there are more than one location with this name found in the system."
elif loc_name_dist_ids_from_db[location_name] not in user_allowed_loc_ids:
result['note'] = f"Location with name '{location_name}' and code '{location_code}' is ambiguous, because there are more than one location with this name and code found in the system."
elif loc_name_code_district_ids_from_db[(location_name, location_code)] not in user_allowed_loc_ids:
result['success'] = False
result['note'] = f"'location_name' value '{location_name}' is outside the current user's location permissions."
result['note'] = f"Location with name '{location_name}' and code '{location_code}' is outside the current user's location permissions."
else:
result['success'] = True
return result
Expand Down
15 changes: 7 additions & 8 deletions individual/tests/fixtures/individual_upload.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
first_name,last_name,dob,location_name,individual_role,recipient_info,group_code
Mia,Foo,1989-01-01,Washington DC,HEAD,1,AA
Bob,Foo,1988-01-01,Washington DC,SPOUSE,1,AA
Eric,Foo,2020-01-01,Washington DC,SON,1,AA
Ann,Foo,2020-01-01,Washington D.C.,DAUGHTER,1,AA
Michelle,Bar,1990-01-01,Fairfax,HEAD,0,BB
Frank,Baz,1998-01-01,,HEAD,0,CC

first_name,last_name,dob,location_name,location_code,individual_role,recipient_info,group_code
Mia,Foo,1989-01-01,Washington DC,202,HEAD,1,AA
Bob,Foo,1988-01-01,Washington DC,202,SPOUSE,1,AA
Eric,Foo,2020-01-01,Washington DC,202,SON,1,AA
Ann,Foo,2020-01-01,Washington D.C.,202,DAUGHTER,1,AA
Michelle,Bar,1990-01-01,Fairfax,703,HEAD,0,BB
Frank,Baz,1998-01-01,,,HEAD,0,CC
25 changes: 14 additions & 11 deletions individual/tests/individual_import_service_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from unittest.mock import MagicMock, patch
from core.utils import filter_validity
from core.models.user import Role
from location.models import Location


def count_csv_records(file_path):
Expand Down Expand Up @@ -53,11 +54,11 @@ def setUpClass(cls):
def setUpTestData(cls):
cls.village_a = create_test_village({
'name': 'Washington DC',
'code': 'VsA',
'code': '202',
})
cls.village_b = create_test_village({
'name': 'Fairfax',
'code': 'VsB'
'code': '703'
})


Expand Down Expand Up @@ -247,7 +248,7 @@ def test_validate_import_individuals_row_level_security(self, mock_fetch_summary
self.assertEqual(loc_validation.get('field_name'), 'location_name')
self.assertEqual(
loc_validation.get('note'),
f"'location_name' value '{self.village_b.name}' is outside the current user's location permissions."
f"Location with name '{self.village_b.name}' and code '{self.village_b.code}' is outside the current user's location permissions."
)

# Unknown individual location
Expand All @@ -259,18 +260,20 @@ def test_validate_import_individuals_row_level_security(self, mock_fetch_summary
self.assertEqual(loc_validation.get('field_name'), 'location_name')
self.assertEqual(
loc_validation.get('note'),
"'location_name' value 'Washington D.C.' is not a valid location name. "
"Location with name 'Washington D.C.' and code '202' is not valid. "
"Please check the spelling against the list of locations in the system."
)


@patch('individual.services.load_dataframe')
@patch('individual.services.fetch_summary_of_broken_items')
def test_validate_import_individuals_ambiguous_location_name(self, mock_fetch_summary, mock_load_dataframe):
# set up another location also named Fairfax and save to DB
loc_dup_name = create_test_village({
# set up another location with the same named and code in DB
loc_dup = Location.objects.create(**{
'name': 'Fairfax',
'code': 'VsB2',
'code': '703',
'type': 'V',
'parent': self.village_b.parent,
})

dataframe = pd.read_csv(self.csv_file_path, na_filter=False)
Expand All @@ -296,14 +299,14 @@ def test_validate_import_individuals_ambiguous_location_name(self, mock_fetch_su
# Check that the validation flagged lack of permission and unrecognized locations
validated_rows = result['data']

rows_ambiguous_loc = [row for row in validated_rows if row['row']['location_name'] == loc_dup_name.name]
self.assertTrue(rows_ambiguous_loc, f'Expected at least one row with location_name={loc_dup_name.name}')
rows_ambiguous_loc = [row for row in validated_rows if row['row']['location_name'] == loc_dup.name]
self.assertTrue(rows_ambiguous_loc, f'Expected at least one row with location_name={loc_dup.name}')
for row in rows_ambiguous_loc:
loc_validation = row['validations']['location_name']
self.assertFalse(loc_validation.get('success'))
self.assertEqual(loc_validation.get('field_name'), 'location_name')
self.assertEqual(
loc_validation.get('note'),
"'location_name' value 'Fairfax' is ambiguous, "
"because there are more than one location with this name found in the system."
"Location with name 'Fairfax' and code '703' is ambiguous, "
"because there are more than one location with this name and code found in the system."
)
2 changes: 1 addition & 1 deletion individual/tests/test_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_download_template_file(self):
response['Content-Disposition']
)

expected_base_csv_header = f'first_name,last_name,dob,location_name,id'
expected_base_csv_header = f'first_name,last_name,dob,location_name,location_code,id'
content = b"".join(response.streaming_content).decode('utf-8')
self.assertTrue(
expected_base_csv_header in content,
Expand Down
12 changes: 8 additions & 4 deletions individual/tests/test_workflows_individual_update.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from django.db import connection
from django.test import TestCase
from core.test_helpers import create_test_interactive_user
from individual.models import (
Expand Down Expand Up @@ -80,6 +81,7 @@ def setUp(self):
"last_name": "Doe",
"dob": "1980-01-01",
"location_name": self.village.name,
"location_code": self.village.code,
}
)
self.valid_data_source.save(user=self.user)
Expand All @@ -96,8 +98,8 @@ def setUp(self):

@patch('individual.apps.IndividualConfig.enable_maker_checker_for_individual_update', False)
@skipIf(
connection.vendor != "postgresql",
"Skipping tests due to implementation usage of validate_json_schema, which is a postgres specific extension."
connection.vendor != "postgresql",
"Skipping tests due to implementation usage of validate_json_schema, which is a postgres specific extension."
)
def test_process_update_individuals_workflow_successful_execution(self):
process_update_individuals_workflow(self.user_uuid, self.upload_uuid)
Expand Down Expand Up @@ -130,15 +132,16 @@ def test_process_update_individuals_workflow_successful_execution(self):

@patch('individual.apps.IndividualConfig.enable_maker_checker_for_individual_update', False)
@skipIf(
connection.vendor != "postgresql",
"Skipping tests due to implementation usage of validate_json_schema, which is a postgres specific extension."
connection.vendor != "postgresql",
"Skipping tests due to implementation usage of validate_json_schema, which is a postgres specific extension."
)
def test_process_update_individuals_workflow_with_all_valid_entries(self):
# Update invalid entry in IndividualDataSource to valid data
self.invalid_data_source.json_ext = {
"ID": str(self.individual2.id),
"first_name": self.individual2_updated_first_name,
"location_name": None,
"location_code": None,
}
self.invalid_data_source.save(user=self.user)

Expand Down Expand Up @@ -170,6 +173,7 @@ def test_process_update_individuals_workflow_with_maker_checker_enabled(self):
"ID": str(self.individual2.id),
"first_name": "Jane",
"location_name": None,
"location_code": None,
}
self.invalid_data_source.save(user=self.user)

Expand Down
2 changes: 2 additions & 0 deletions individual/tests/test_workflows_individual_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def setUp(self):
"last_name": "Doe",
"dob": "1980-01-01",
"location_name": self.village.name,
"location_code": self.village.code,
}
)
self.valid_data_source.save(user=self.user)
Expand Down Expand Up @@ -117,6 +118,7 @@ def test_process_import_individuals_workflow_with_all_valid_entries(self):
"last_name": "Doe",
"dob": "1982-01-01",
"location_name": None,
"location_code": None,
}
self.invalid_data_source.save(user=self.user)

Expand Down
8 changes: 4 additions & 4 deletions individual/tests/test_workflows_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ def tearDown(self):
patch.stopall()

def test_validate_dataframe_headers_valid(self):
self.executor.df = pd.DataFrame(columns=['first_name', 'last_name', 'dob', 'id', 'location_name'])
self.executor.df = pd.DataFrame(columns=['first_name', 'last_name', 'dob', 'id', 'location_name', 'location_code'])
try:
self.executor.validate_dataframe_headers()
except PythonWorkflowHandlerException:
self.fail("validate_dataframe_headers() raised PythonWorkflowHandlerException unexpectedly!")

def test_validate_dataframe_headers_missing_required_fields(self):
# DataFrame missing required 'first_name' column
self.executor.df = pd.DataFrame(columns=['last_name', 'dob', 'id', 'location_name'])
self.executor.df = pd.DataFrame(columns=['last_name', 'dob', 'id', 'location_name', 'location_code'])

with self.assertRaises(PythonWorkflowHandlerException) as cm:
self.executor.validate_dataframe_headers()
Expand All @@ -49,7 +49,7 @@ def test_validate_dataframe_headers_missing_required_fields(self):

def test_validate_dataframe_headers_invalid_headers(self):
# DataFrame with an invalid column
self.executor.df = pd.DataFrame(columns=['first_name', 'last_name', 'dob', 'id', 'location_name', 'unexpected_column'])
self.executor.df = pd.DataFrame(columns=['first_name', 'last_name', 'dob', 'id', 'location_name', 'location_code', 'unexpected_column'])

with self.assertRaises(PythonWorkflowHandlerException) as cm:
self.executor.validate_dataframe_headers()
Expand All @@ -58,7 +58,7 @@ def test_validate_dataframe_headers_invalid_headers(self):

def test_validate_dataframe_headers_update_missing_id(self):
# DataFrame missing 'ID' when is_update=True
columns = ['first_name', 'last_name', 'dob', 'id', 'location_name']
columns = ['first_name', 'last_name', 'dob', 'id', 'location_name', 'location_code']
self.executor.df = pd.DataFrame(columns=columns)

with self.assertRaises(PythonWorkflowHandlerException) as cm:
Expand Down
1 change: 1 addition & 0 deletions individual/workflows/base_individual_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def process_update_individuals_workflow(user_uuid, upload_uuid):
FROM individual_individualdatasource f
LEFT JOIN "tblLocations" AS loc
ON loc."LocationName" = f."Json_ext"->>'location_name'
AND loc."LocationCode" = f."Json_ext"->>'location_code'
AND loc."LocationType"='V'
WHERE individual_individual."UUID" = (f."Json_ext" ->> 'ID')::UUID
returning individual_individual."UUID", f."UUID" as "individualdatasource_id")
Expand Down
1 change: 1 addition & 0 deletions individual/workflows/base_individual_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def process_import_individuals_workflow(user_uuid, upload_uuid):
FROM individual_individualdatasource AS ds
LEFT JOIN "tblLocations" AS loc
ON loc."LocationName" = ds."Json_ext"->>'location_name'
AND loc."LocationCode" = ds."Json_ext"->>'location_code'
AND loc."LocationType"='V'
WHERE ds.upload_id=current_upload_id
AND ds.individual_id is null
Expand Down
2 changes: 2 additions & 0 deletions individual/workflows/individual_update_valid.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def process_update_valid_individuals_workflow(user_uuid, upload_uuid, accepted=N
FROM individual_individualdatasource ids
LEFT JOIN "tblLocations" AS loc
ON loc."LocationName" = ids."Json_ext"->>'location_name'
AND loc."LocationCode" = ids."Json_ext"->>'location_code'
AND loc."LocationType"='V'
WHERE individual_individual."UUID" = (ids."Json_ext" ->> 'ID')::UUID
AND ids.upload_id = current_upload_id
Expand Down Expand Up @@ -212,6 +213,7 @@ def process_update_valid_individuals_workflow(user_uuid, upload_uuid, accepted=N
FROM individual_individualdatasource ids
LEFT JOIN "tblLocations" AS loc
ON loc."LocationName" = ids."Json_ext"->>'location_name'
AND loc."LocationCode" = ids."Json_ext"->>'location_code'
AND loc."LocationType"='V'
WHERE individual_individual."UUID" = (ids."Json_ext" ->> 'ID')::UUID
AND ids.upload_id = current_upload_id
Expand Down
Loading

0 comments on commit 6f83281

Please sign in to comment.