add support for unicode header names #111

Open · wants to merge 3 commits into base: master
6 changes: 6 additions & 0 deletions README.rst
@@ -260,6 +260,12 @@ Configuration:
# not be loaded into the datastore.
ckanext.xloader.max_excerpt_lines = 100

    # If set to True, allows unicode characters in header names.
    # If set to False (default), header names are transliterated to ASCII
    # using the unidecode library.
ckanext.xloader.unicode_headers = False


------------------------
Developer installation
------------------------
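A minimal sketch of the behaviour this option toggles (Python 2 to match the codebase; the header value is hypothetical and the exact unidecode transliteration can vary by version):

    from unidecode import unidecode

    header = u'זיהוי'  # a hypothetical Hebrew column name

    # ckanext.xloader.unicode_headers = False (default): the header is
    # transliterated to an ASCII approximation before the datastore table
    # is created.
    ascii_header = unidecode(header)

    # ckanext.xloader.unicode_headers = True: the header is kept verbatim,
    # so the datastore column keeps its original Hebrew name.
    unicode_header = header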
29 changes: 21 additions & 8 deletions ckanext/xloader/loader.py
@@ -196,6 +196,10 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
# 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
# the superuser issue. <-- picked

if config.get('ckanext.xloader.unicode_headers'):
column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers])
else:
column_names = ', '.join(['"{}"'.format(h) for h in headers])
raw_connection = engine.raw_connection()
try:
cur = raw_connection.cursor()
@@ -211,8 +215,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
" ENCODING '{encoding}');"
.format(
resource_id=resource_id,
column_names=', '.join(['"{}"'.format(h)
for h in headers]),
column_names=column_names,
delimiter=delimiter,
encoding='UTF8',
),
@@ -236,7 +239,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('...copying done')

logger.info('Creating search index...')
_populate_fulltext(connection, resource_id, fields=fields)

if config.get('ckanext.xloader.unicode_headers'):
encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields]
else:
encoded_fields = fields

_populate_fulltext(connection, resource_id, fields=encoded_fields)
logger.info('...search index created')

return fields
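For orientation, a small sketch (Python 2, hypothetical headers) of the column_names value built above when the option is enabled; the quoted identifiers are then interpolated into the COPY ... FROM STDIN statement:

    # Hypothetical headers as parsed from the CSV (unicode strings under Python 2).
    headers = [u'זיהוי', u'שם']

    # With ckanext.xloader.unicode_headers = True each identifier is UTF-8
    # encoded and double-quoted before being joined into the column list.
    column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers])
    # column_names now reads '"זיהוי", "שם"' (shown decoded for readability).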
@@ -401,12 +410,16 @@ def get_types():


def encode_headers(headers):
if config.get('ckanext.xloader.unicode_headers'):
decode_func = unicode
else:
decode_func = unidecode
encoded_headers = []
for header in headers:
try:
encoded_headers.append(unidecode(header))
encoded_headers.append(decode_func(header))
except AttributeError:
encoded_headers.append(unidecode(str(header)))
encoded_headers.append(decode_func(str(header)))

return encoded_headers

@@ -514,7 +527,7 @@ def _populate_fulltext(connection, resource_id, fields):
(text/numeric/timestamp)
'''
sql = \
u'''
'''
UPDATE {table}
SET _full_text = to_tsvector({cols});
'''.format(
@@ -560,8 +573,8 @@ def _create_fulltext_trigger(connection, resource_id):
def identifier(s):
# "%" needs to be escaped, otherwise connection.execute thinks it is for
# substituting a bind parameter
return u'"' + s.replace(u'"', u'""').replace(u'\0', '').replace('%', '%%')\
+ u'"'
return '"' + s.replace('"', '""').replace('\0', '').replace('%', '%%')\
+ '"'


def literal_string(s):
1 change: 1 addition & 0 deletions ckanext/xloader/plugin.py
@@ -23,6 +23,7 @@

class XLoaderFormats(object):
formats = None

@classmethod
def is_it_an_xloader_format(cls, format_):
if cls.formats is None:
7 changes: 7 additions & 0 deletions ckanext/xloader/tests/samples/hebrew_sample.csv
@@ -0,0 +1,7 @@
זיהוי,שם,תא דיווח,שימוש,פרמטר,סוג תקן מי שתייה,ערך תקן,תאריך דיגום אחרון,ריכוז אחרון,אחוז מתקן מי השתיה
229312,פ בית העמק עמקה 3,360,פרטי,Cl,תקן ישראלי מותר,400,20/09/2018,44.85,11.20
229312,פ בית העמק עמקה 3,360,פרטי,NO3,תקן ישראלי מותר,70,20/09/2018,32.90,47.00
229319,פ כברי החוגים,350,פרטי,Cl,תקן ישראלי מותר,400,08/08/2019,44.80,11.20
229319,פ כברי החוגים,350,פרטי,NO3,תקן ישראלי מותר,70,08/08/2019,49.50,70.70
229323,פ לוחמי הגיטאות דרור,330,פרטי,Cl,תקן ישראלי מותר,400,04/09/2018,846.55,211.60
229323,פ לוחמי הגיטאות דרור,330,פרטי,NO3,תקן ישראלי מותר,70,04/09/2018,22.50,32.10
Binary file added ckanext/xloader/tests/samples/hebrew_sample.xlsx
Binary file not shown.
72 changes: 70 additions & 2 deletions ckanext/xloader/tests/test_loader.py
@@ -51,10 +51,10 @@ def _get_records(self, table_name, limit=None,
if col != '_full_text')
else:
cols = '*'
sql = 'SELECT {cols} FROM "{table_name}"' \
sql = u'SELECT {cols} FROM "{table_name}"' \
.format(cols=cols, table_name=table_name)
if limit is not None:
sql += ' LIMIT {}'.format(limit)
sql += u' LIMIT {}'.format(limit)
results = c.execute(sql)
return results.fetchall()

@@ -344,6 +344,39 @@ def test_column_names(self):
assert_equal(self._get_records('test1')[0],
(1, u'2011-01-01', u'1', u'Galway'))

@helpers.change_config('ckanext.xloader.unicode_headers', 'True')
def test_unicode_column_names(self):
csv_filepath = get_sample_filepath('hebrew_sample.csv')
resource_id = 'test_hebrew'
factories.Resource(id=resource_id)
loader.load_csv(csv_filepath, resource_id=resource_id,
mimetype='text/csv', logger=PrintLogger())
records = self._get_records('test_hebrew')
print records
assert_equal(
records[0],
(1, u'229312', u'פ בית העמק עמקה 3', u'360', u'פרטי', u'Cl', u'תקן ישראלי מותר', u'400', u'20/09/2018',
u'44.85', u'11.20')
)
print self._get_column_names('test_hebrew')
assert_equal(
self._get_column_names('test_hebrew'),
[
u'_id',
u'_full_text',
u'זיהוי',
u'שם',
u'תא דיווח',
u'שימוש',
u'פרמטר',
u'סוג תקן מי שתייה',
u'ערך תקן',
u'תאריך דיגום אחרון',
u'ריכוז אחרון',
u'אחוז מתקן מי השתיה'
]
)


class TestLoadUnhandledTypes(TestLoadBase):

@@ -478,3 +511,38 @@ def test_no_entries(self):
with assert_raises(LoaderError):
loader.load_table(csv_filepath, resource_id=resource_id,
mimetype='csv', logger=PrintLogger())

@helpers.change_config('ckanext.xloader.unicode_headers', 'True')
def test_hebrew_unicode_headers(self):
xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx')
resource_id = 'hebrew_sample_xlsx'
factories.Resource(id=resource_id)
loader.load_table(xlsx_filepath, resource_id=resource_id,
mimetype='xlsx', logger=PrintLogger())
records = self._get_records('hebrew_sample_xlsx')
print records
assert_equal(
records[0],
(1, Decimal('229312'), u'פ בית העמק עמקה 3', Decimal('360'), u'פרטי', u'Cl', u'תקן ישראלי מותר',
Decimal('400'), datetime.datetime(2018, 9, 20, 0, 0),
Decimal('44.85000000000000142108547152020037174224853515625'),
Decimal('11.199999999999999289457264239899814128875732421875'))
)
print self._get_column_names('hebrew_sample_xlsx')
assert_equal(
self._get_column_names('hebrew_sample_xlsx'),
[
u'_id',
u'_full_text',
u'זיהוי',
u'שם',
u'תא דיווח',
u'שימוש',
u'פרמטר',
u'סוג תקן מי שתייה',
u'ערך תקן',
u'תאריך דיגום אחרון',
u'ריכוז אחרון',
u'אחוז מתקן מי השתיה'
]
)