add support for unicode header names #111

Open · wants to merge 3 commits into base: master
6 changes: 6 additions & 0 deletions README.rst
@@ -260,6 +260,12 @@ Configuration:
# not be loaded into the datastore.
ckanext.xloader.max_excerpt_lines = 100

    # If set to True, allows unicode characters in header names.
    # If set to False (default), header names are transliterated to ASCII
    # using the unidecode library.
ckanext.xloader.unicode_headers = False


------------------------
Developer installation
------------------------
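A minimal sketch of the behaviour this option toggles (Python 2 to match the codebase; the header value is hypothetical and the exact unidecode transliteration can vary by version):

    from unidecode import unidecode

    header = u'זיהוי'  # a hypothetical Hebrew column name

    # ckanext.xloader.unicode_headers = False (default): the header is
    # transliterated to an ASCII approximation before the datastore table
    # is created.
    ascii_header = unidecode(header)

    # ckanext.xloader.unicode_headers = True: the header is kept verbatim,
    # so the datastore column keeps its original Hebrew name.
    unicode_header = header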
29 changes: 21 additions & 8 deletions ckanext/xloader/loader.py
@@ -196,6 +196,10 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
# 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
# the superuser issue. <-- picked

if config.get('ckanext.xloader.unicode_headers'):
column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers])
else:
column_names = ', '.join(['"{}"'.format(h) for h in headers])
raw_connection = engine.raw_connection()
try:
cur = raw_connection.cursor()
@@ -211,8 +215,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
" ENCODING '{encoding}');"
.format(
resource_id=resource_id,
column_names=', '.join(['"{}"'.format(h)
for h in headers]),
column_names=column_names,
delimiter=delimiter,
encoding='UTF8',
),
@@ -236,7 +239,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('...copying done')

logger.info('Creating search index...')
_populate_fulltext(connection, resource_id, fields=fields)

if config.get('ckanext.xloader.unicode_headers'):
encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields]
else:
encoded_fields = fields

_populate_fulltext(connection, resource_id, fields=encoded_fields)
logger.info('...search index created')

return fields
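For orientation, a small sketch (Python 2, hypothetical headers) of the column_names value built above when the option is enabled; the quoted identifiers are then interpolated into the COPY ... FROM STDIN statement:

    # Hypothetical headers as parsed from the CSV (unicode strings under Python 2).
    headers = [u'זיהוי', u'שם']

    # With ckanext.xloader.unicode_headers = True each identifier is UTF-8
    # encoded and double-quoted before being joined into the column list.
    column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers])
    # column_names now reads '"זיהוי", "שם"' (shown decoded for readability).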
@@ -401,12 +410,16 @@ def get_types():


def encode_headers(headers):
if config.get('ckanext.xloader.unicode_headers'):
decode_func = unicode
else:
decode_func = unidecode
encoded_headers = []
for header in headers:
try:
encoded_headers.append(unidecode(header))
encoded_headers.append(decode_func(header))
except AttributeError:
encoded_headers.append(unidecode(str(header)))
encoded_headers.append(decode_func(str(header)))

return encoded_headers

@@ -514,7 +527,7 @@ def _populate_fulltext(connection, resource_id, fields):
(text/numeric/timestamp)
'''
sql = \
u'''
'''
UPDATE {table}
SET _full_text = to_tsvector({cols});
'''.format(
@@ -560,8 +573,8 @@ def _create_fulltext_trigger(connection, resource_id):
def identifier(s):
# "%" needs to be escaped, otherwise connection.execute thinks it is for
# substituting a bind parameter
return u'"' + s.replace(u'"', u'""').replace(u'\0', '').replace('%', '%%')\
+ u'"'
return '"' + s.replace('"', '""').replace('\0', '').replace('%', '%%')\
+ '"'


def literal_string(s):
1 change: 1 addition & 0 deletions ckanext/xloader/plugin.py
@@ -23,6 +23,7 @@

class XLoaderFormats(object):
formats = None

@classmethod
def is_it_an_xloader_format(cls, format_):
if cls.formats is None:
7 changes: 7 additions & 0 deletions ckanext/xloader/tests/samples/hebrew_sample.csv
@@ -0,0 +1,7 @@
זיהוי,שם,תא דיווח,שימוש,פרמטר,סוג תקן מי שתייה,ערך תקן,תאריך דיגום אחרון,ריכוז אחרון,אחוז מתקן מי השתיה
229312,פ בית העמק עמקה 3,360,פרטי,Cl,תקן ישראלי מותר,400,20/09/2018,44.85,11.20
229312,פ בית העמק עמקה 3,360,פרטי,NO3,תקן ישראלי מותר,70,20/09/2018,32.90,47.00
229319,פ כברי החוגים,350,פרטי,Cl,תקן ישראלי מותר,400,08/08/2019,44.80,11.20
229319,פ כברי החוגים,350,פרטי,NO3,תקן ישראלי מותר,70,08/08/2019,49.50,70.70
229323,פ לוחמי הגיטאות דרור,330,פרטי,Cl,תקן ישראלי מותר,400,04/09/2018,846.55,211.60
229323,פ לוחמי הגיטאות דרור,330,פרטי,NO3,תקן ישראלי מותר,70,04/09/2018,22.50,32.10
Binary file added ckanext/xloader/tests/samples/hebrew_sample.xlsx
Binary file not shown.
72 changes: 70 additions & 2 deletions ckanext/xloader/tests/test_loader.py
@@ -51,10 +51,10 @@ def _get_records(self, table_name, limit=None,
if col != '_full_text')
else:
cols = '*'
sql = 'SELECT {cols} FROM "{table_name}"' \
sql = u'SELECT {cols} FROM "{table_name}"' \
.format(cols=cols, table_name=table_name)
if limit is not None:
sql += ' LIMIT {}'.format(limit)
sql += u' LIMIT {}'.format(limit)
results = c.execute(sql)
return results.fetchall()

@@ -344,6 +344,39 @@ def test_column_names(self):
assert_equal(self._get_records('test1')[0],
(1, u'2011-01-01', u'1', u'Galway'))

@helpers.change_config('ckanext.xloader.unicode_headers', 'True')
def test_unicode_column_names(self):
csv_filepath = get_sample_filepath('hebrew_sample.csv')
resource_id = 'test_hebrew'
factories.Resource(id=resource_id)
loader.load_csv(csv_filepath, resource_id=resource_id,
mimetype='text/csv', logger=PrintLogger())
records = self._get_records('test_hebrew')
print records
assert_equal(
records[0],
(1, u'229312', u'פ בית העמק עמקה 3', u'360', u'פרטי', u'Cl', u'תקן ישראלי מותר', u'400', u'20/09/2018',
u'44.85', u'11.20')
)
print self._get_column_names('test_hebrew')
assert_equal(
self._get_column_names('test_hebrew'),
[
u'_id',
u'_full_text',
u'זיהוי',
u'שם',
u'תא דיווח',
u'שימוש',
u'פרמטר',
u'סוג תקן מי שתייה',
u'ערך תקן',
u'תאריך דיגום אחרון',
u'ריכוז אחרון',
u'אחוז מתקן מי השתיה'
]
)


class TestLoadUnhandledTypes(TestLoadBase):

@@ -478,3 +511,38 @@ def test_no_entries(self):
with assert_raises(LoaderError):
loader.load_table(csv_filepath, resource_id=resource_id,
mimetype='csv', logger=PrintLogger())

@helpers.change_config('ckanext.xloader.unicode_headers', 'True')
def test_hebrew_unicode_headers(self):
xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx')
resource_id = 'hebrew_sample_xlsx'
factories.Resource(id=resource_id)
loader.load_table(xlsx_filepath, resource_id=resource_id,
mimetype='xlsx', logger=PrintLogger())
records = self._get_records('hebrew_sample_xlsx')
print records
assert_equal(
records[0],
(1, Decimal('229312'), u'פ בית העמק עמקה 3', Decimal('360'), u'פרטי', u'Cl', u'תקן ישראלי מותר',
Decimal('400'), datetime.datetime(2018, 9, 20, 0, 0),
Decimal('44.85000000000000142108547152020037174224853515625'),
Decimal('11.199999999999999289457264239899814128875732421875'))
)
print self._get_column_names('hebrew_sample_xlsx')
assert_equal(
self._get_column_names('hebrew_sample_xlsx'),
[
u'_id',
u'_full_text',
u'זיהוי',
u'שם',
u'תא דיווח',
u'שימוש',
u'פרמטר',
u'סוג תקן מי שתייה',
u'ערך תקן',
u'תאריך דיגום אחרון',
u'ריכוז אחרון',
u'אחוז מתקן מי השתיה'
]
)