Merge pull request ckan#90 from qld-gov-au/QOLSVC-5123-empty-columns

[QOLSVC-5123] skip rows that are completely blank instead of erroring out
JVickery-TBS · Mar 15, 2024 · 0f79888
2 parents 7857c1a + 541a343
commit 0f79888
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 1 deletion.
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
@@ -148,6 +148,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     # Get the list of rows to skip. The rows in the tabulator stream are
     # numbered starting with 1.
     skip_rows = list(range(1, header_offset + 1))
+    skip_rows.append({'type': 'preset', 'value': 'blank'})
 
     # Get the delimiter used in the file
     delimiter = stream.dialect.get('delimiter')
@@ -360,12 +361,14 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     try:
         file_format = os.path.splitext(table_filepath)[1].strip('.')
         with UnknownEncodingStream(table_filepath, file_format, decoding_result,
+                                   skip_rows=[{'type': 'preset', 'value': 'blank'}],
                                    post_parse=[TypeConverter().convert_types]) as stream:
             header_offset, headers = headers_guess(stream.sample)
     except TabulatorException:
         try:
             file_format = mimetype.lower().split('/')[-1]
             with UnknownEncodingStream(table_filepath, file_format, decoding_result,
+                                       skip_rows=[{'type': 'preset', 'value': 'blank'}],
                                        post_parse=[TypeConverter().convert_types]) as stream:
                 header_offset, headers = headers_guess(stream.sample)
         except TabulatorException as e:
@@ -387,6 +390,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     # Get the list of rows to skip. The rows in the tabulator stream are
     # numbered starting with 1. We also want to skip the header row.
     skip_rows = list(range(1, header_offset + 2))
+    skip_rows.append({'type': 'preset', 'value': 'blank'})
 
     TYPES, TYPE_MAPPING = get_types()
     strict_guessing = p.toolkit.asbool(

diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py
@@ -7,7 +7,7 @@
 
 from ckan.model.domain_object import DomainObjectOperation
 from ckan.model.resource import Resource
-from ckan.model.package import Package
+# from ckan.model.package import Package
 
 from . import action, auth, helpers as xloader_helpers, utils
 from ckanext.xloader.utils import XLoaderFormats

diff --git a/ckanext/xloader/tests/samples/sample_with_empty_lines.csv b/ckanext/xloader/tests/samples/sample_with_empty_lines.csv
@@ -0,0 +1,10 @@
+date,temperature,place
+2011-01-01,1,Galway
+2011-01-02,-1,Galway
+2011-01-03,0,Galway
+2011-01-01,6,Berkeley
+
+,,Berkeley
+2011-01-03,5,
+
+
diff --git a/ckanext/xloader/tests/samples/sample_with_extra_blank_cells.csv b/ckanext/xloader/tests/samples/sample_with_extra_blank_cells.csv
@@ -0,0 +1,2 @@
+Agency (Dept or Stat Body),Agency address,Contract description/name,Award contract date,Contract value,Supplier name,Supplier address,Variation to contract (Yes/No),Specific confidentiality provision used,Procurement method,Reason for Limited tender,Form of contract,Number of offers sought,Evaluation criteria and weightings,Deliverables,Contract milestones,Contract performance management,,,,,,,,,,,,,,,
+State-wide Operations,"111 Easy St, Duckburg, 40000",con_12345-Social services,01/01/1970,"$123,456",LexCorp,123 Example St ELEMENT CITY 4444,No,No,Selective,,,,,,,,,,,,,,,,,,,,,,
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
@@ -632,6 +632,18 @@ def test_with_blanks(self, Session):
         )
         assert len(self._get_records(Session, resource_id)) == 3
 
+    def test_with_empty_lines(self, Session):
+        csv_filepath = get_sample_filepath("sample_with_empty_lines.csv")
+        resource = factories.Resource()
+        resource_id = resource['id']
+        loader.load_csv(
+            csv_filepath,
+            resource_id=resource_id,
+            mimetype="text/csv",
+            logger=logger,
+        )
+        assert len(self._get_records(Session, resource_id)) == 6
+
     def test_with_quoted_commas(self, Session):
         csv_filepath = get_sample_filepath("sample_with_quoted_commas.csv")
         resource = factories.Resource()
@@ -1217,6 +1229,30 @@ def test_no_entries(self):
                 logger=logger,
             )
 
+    def test_with_blanks(self, Session):
+        csv_filepath = get_sample_filepath("sample_with_blanks.csv")
+        resource = factories.Resource()
+        resource_id = resource['id']
+        loader.load_table(
+            csv_filepath,
+            resource_id=resource_id,
+            mimetype="text/csv",
+            logger=logger,
+        )
+        assert len(self._get_records(Session, resource_id)) == 3
+
+    def test_with_empty_lines(self, Session):
+        csv_filepath = get_sample_filepath("sample_with_empty_lines.csv")
+        resource = factories.Resource()
+        resource_id = resource['id']
+        loader.load_table(
+            csv_filepath,
+            resource_id=resource_id,
+            mimetype="text/csv",
+            logger=logger,
+        )
+        assert len(self._get_records(Session, resource_id)) == 6
+
     def test_with_quoted_commas(self, Session):
         csv_filepath = get_sample_filepath("sample_with_quoted_commas.csv")
         resource = factories.Resource()
@@ -1241,6 +1277,18 @@ def test_with_iso_8859_1(self, Session):
         )
         assert len(self._get_records(Session, resource_id)) == 266
 
+    def test_with_extra_blank_cells(self, Session):
+        csv_filepath = get_sample_filepath("sample_with_extra_blank_cells.csv")
+        resource = factories.Resource()
+        resource_id = resource['id']
+        loader.load_table(
+            csv_filepath,
+            resource_id=resource_id,
+            mimetype="text/csv",
+            logger=logger,
+        )
+        assert len(self._get_records(Session, resource_id)) == 1
+
     def test_with_mixed_quotes(self, Session):
         csv_filepath = get_sample_filepath("sample_with_mixed_quotes.csv")
         resource = factories.Resource()