From 0af4399220f284f4ff6a2ddcbb3b2deda4247226 Mon Sep 17 00:00:00 2001 From: Ori Hoch Date: Mon, 17 Aug 2020 17:22:12 +0300 Subject: [PATCH 1/3] add support for unicode header names --- README.rst | 6 ++ ckanext/xloader/loader.py | 39 ++++++---- .../xloader/tests/samples/hebrew_sample.csv | 7 ++ .../xloader/tests/samples/hebrew_sample.xlsx | Bin 0 -> 5414 bytes ckanext/xloader/tests/test_loader.py | 68 +++++++++++++++++- 5 files changed, 105 insertions(+), 15 deletions(-) create mode 100644 ckanext/xloader/tests/samples/hebrew_sample.csv create mode 100644 ckanext/xloader/tests/samples/hebrew_sample.xlsx diff --git a/README.rst b/README.rst index d8b09674..06e9fe67 100644 --- a/README.rst +++ b/README.rst @@ -260,6 +260,12 @@ Configuration: # not be loaded into the datastore. ckanext.xloader.max_excerpt_lines = 100 + # If set to True allows unicode characters in header names. + # If set to False (default), characters are encoded to ascii + # using the unidecode library. + ckanext.xloader.unicode_headers = False + + ------------------------ Developer installation ------------------------ diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 8cbd304e..d2a7af4b 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -37,7 +37,7 @@ def get_write_engine(): MAX_COLUMN_LENGTH = 63 -def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): +def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None, unicode_headers=None): '''Loads a CSV into DataStore. Does not create the indexes.''' # use messytables to determine the header row @@ -64,7 +64,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): header_offset, headers = messytables.headers_guess(row_set.sample) # Some headers might have been converted from strings to floats and such. - headers = encode_headers(headers) + headers = encode_headers(headers, unicode_headers=unicode_headers) # Guess the delimiter used in the file with open(csv_filepath, 'r') as f: @@ -196,6 +196,10 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids # the superuser issue. 
<-- picked + if unicode_headers or config.get('ckanext.xloader.unicode_headers'): + column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers]) + else: + column_names = ', '.join(['"{}"'.format(h) for h in headers]) raw_connection = engine.raw_connection() try: cur = raw_connection.cursor() @@ -211,8 +215,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): " ENCODING '{encoding}');" .format( resource_id=resource_id, - column_names=', '.join(['"{}"'.format(h) - for h in headers]), + column_names=column_names, delimiter=delimiter, encoding='UTF8', ), @@ -236,7 +239,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): logger.info('...copying done') logger.info('Creating search index...') - _populate_fulltext(connection, resource_id, fields=fields) + + if unicode_headers or config.get('ckanext.xloader.unicode_headers'): + encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields] + else: + encoded_fields = fields + + _populate_fulltext(connection, resource_id, fields=encoded_fields) logger.info('...search index created') return fields @@ -259,7 +268,7 @@ def create_column_indexes(fields, resource_id, logger): logger.info('...column indexes created.') -def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): +def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None, unicode_headers=None): '''Loads an Excel file (or other tabular data recognized by messytables) into Datastore and creates indexes. @@ -299,7 +308,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): for f in existing.get('fields', []) if 'info' in f) # Some headers might have been converted from strings to floats and such. 
- headers = encode_headers(headers) + headers = encode_headers(headers, unicode_headers=unicode_headers) row_set.register_processor(messytables.headers_processor(headers)) row_set.register_processor(messytables.offset_processor(offset + 1)) @@ -400,13 +409,17 @@ def get_types(): return _TYPES, TYPE_MAPPING -def encode_headers(headers): +def encode_headers(headers, unicode_headers=None): + if unicode_headers or config.get('ckanext.xloader.unicode_headers'): + decode_func = unicode + else: + decode_func = unidecode encoded_headers = [] for header in headers: try: - encoded_headers.append(unidecode(header)) + encoded_headers.append(decode_func(header)) except AttributeError: - encoded_headers.append(unidecode(str(header))) + encoded_headers.append(decode_func(str(header))) return encoded_headers @@ -514,7 +527,7 @@ def _populate_fulltext(connection, resource_id, fields): (text/numeric/timestamp) ''' sql = \ - u''' + ''' UPDATE {table} SET _full_text = to_tsvector({cols}); '''.format( @@ -560,8 +573,8 @@ def _create_fulltext_trigger(connection, resource_id): def identifier(s): # "%" needs to be escaped, otherwise connection.execute thinks it is for # substituting a bind parameter - return u'"' + s.replace(u'"', u'""').replace(u'\0', '').replace('%', '%%')\ - + u'"' + return '"' + s.replace('"', '""').replace('\0', '').replace('%', '%%')\ + + '"' def literal_string(s): diff --git a/ckanext/xloader/tests/samples/hebrew_sample.csv b/ckanext/xloader/tests/samples/hebrew_sample.csv new file mode 100644 index 00000000..9a951e71 --- /dev/null +++ b/ckanext/xloader/tests/samples/hebrew_sample.csv @@ -0,0 +1,7 @@ +זיהוי,שם,תא דיווח,שימוש,פרמטר,סוג תקן מי שתייה,ערך תקן,תאריך דיגום אחרון,ריכוז אחרון,אחוז מתקן מי השתיה +229312,פ בית העמק עמקה 3,360,פרטי,Cl,תקן ישראלי מותר,400,20/09/2018,44.85,11.20 +229312,פ בית העמק עמקה 3,360,פרטי,NO3,תקן ישראלי מותר,70,20/09/2018,32.90,47.00 +229319,פ כברי החוגים,350,פרטי,Cl,תקן ישראלי מותר,400,08/08/2019,44.80,11.20 +229319,פ כברי החוגים,350,פרטי,NO3,תקן ישראלי מותר,70,08/08/2019,49.50,70.70 +229323,פ לוחמי הגיטאות דרור,330,פרטי,Cl,תקן ישראלי מותר,400,04/09/2018,846.55,211.60 +229323,פ לוחמי הגיטאות דרור,330,פרטי,NO3,תקן ישראלי מותר,70,04/09/2018,22.50,32.10 diff --git a/ckanext/xloader/tests/samples/hebrew_sample.xlsx b/ckanext/xloader/tests/samples/hebrew_sample.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..7dea64359b885309f6d61ccf47a0db11ff1daa28 GIT binary patch literal 5414 zcmaJ_1z42p5~dpj=~zTzk&x~Zq+tOm>5|T+yE_D=JEXh2K|;E_K}1rz1@3arJy)*x zo-_aRKO4{g&F=TkeDAz7vLIME94I6tBq(odEIFu0Mg)29WX_;%V_wb>l(ai3Xlqs0$YxLchV;DG=m`H&h!c{0yhph(F$~M`4zAx9CLC4U}a!qIK zMMoa8A&q1ilmDo@O3T2EcxrL=jqe-(3&1RSf0=K(Ji8ANn2gCYF#M&q;@D0u$0^gfJ9q06N^tFxm!=Z1fyW zayJrPF~RhvN~*lXs}7#kDoY1?&9%$9>Pt1HGlO=3?Wgak?PUNM6x#i+T0`D7pXd|h=?)pm+D z5A{lDaufDRJRuCPobMNEx05iA*V-PNi8WotFRBS<(O%+ztk zL_(O=R=sZnXm17%Ki-8@&X|TX)p>o&M>tP8IqtGA>LdA{Z(|&cnvf_TM~)G1zNrN4 zq>APEmbmN4U6D*KKg-sMyb$Oslc(=9vaKKa)n|qR%_vCdaw;VzmTwQ_k7y_@(Zh5j zset4fQi;;35o~ygd}5iC@_pnK8ew|kNBIFe$CDdsX!uCn4{O`)ug_$_PrW0&+K{#0 zm5eB{Khxt9zOK@TJ64a>tjTAd>Yp_YAzJ9POYY|;Rj_CQ}QJq^QfI(SdHaVc1{Gr+AHjV z>XP7gEm%*I`DvbMU_Qh{a%LXy>-iFN!b|4%&x30^Q2Q zlF%l&W9yuPS=vE;=LGtdTHa0MOnXA}G*>iF6EZ5F;<05%i*%dZrDnUevcH!Z$H5m> zElA8t#{@P<3$`l%lujTUSDm1cp;E9?5owbK?bf+sE zl6pox%0!1f?9IoPT)`)DdzN-4mG|*GS0uyR)$7IRz%{OQ<5?p+114v~shu;dhS-&q zR!K6pYyNP(g0d2FRn;Y7If=vd4|%v*vSB;Fx(@`~NS^rCrQ|-EUH;)n zj#zpCto;}C_QK5{>c_GA=-1W6>>RN;xHIeC)UU4hNZ$7|xgvJkD;v(fG5aKU%O()x7LJZNrOlThT*GlWtJN9hmX}vw(_tQE<00346uA@@_TyTzc377$qo0&? 
zTk6fHM#$ez&SUvwxJhj zPr39SHf*OHI&-=VI+ME?F>A{5xS}I>!}!chrS8BfqzSz1oUfm~dXZx;r)C=}Y-zth zkzzsYx6_NgnT#mX-^)YC*)LkZnovV}6qjJlJoa+2ap4NLf)IL5supW4dz(^jLVZ!o zMy-F-a^qz(|Evo`Os5vh(4fJA4Tv!(rml2JI?rBC4M@Y}`r@G8!x+qlo#*#XqZ;i1 z?+UdvozOtJ)>%hW8=T}&3p74RoRrxvbV~cWPRa}gY)7IHVD~|EkRUV)vQiNYpe)N1 zxptSi%pEh!L90nB#7;3FPx=sU1H8_J52d5wRmxH*;bcJm>41l->RJv+9F5XxSRuo= zukB$LQff^KKaGie<_kMg+A~CBWBA5E!?sNot7za9ISG9B-9dSUSGwh?8xq< z7w|Ubb9C1S06C?kOukdH;Htx+b4xZC&$WK^Gvc)h2YF9lJv2}f}8^5l!9%`ZE9;L*%T4;qWaD}V0+q>F?D z>8!Yu!dzDXLGHm!$^Iun&bOxv)GW}gsy^L)95~}okYWV zW+#JbCfL-Y?O<2KK^BQU^B&WW?p4ZBi^xR_iliy`J&=Yv$nhn49@QowENJdy-Gp({ zVD1>1T42^2f@K+m1JSR2I<8Gi<3FRMnOE*8?71f4`6R?vGBdQlL5D2)v%CF>sdb2P z>)%-pPh3IOJ41a)9cA@$o;ygfw}<_w)ItBZaIL3yNoC$QD4bVJV%_h6hgG`uHlGN%yPhLJ$}XzakB*acu5@=hX!tTS6D;l2s*A&` zTM)F0l|pibsgo8IbI9xCIM98gfD|LjE23yc(m_P)MB0WLTT0zZ$2i^2XocG$xO?pT zy&$_C5?wi6up#qv!f`_?+qbb#vlLZdinbISjPL~@1zPvg*6OHv3oB}fSZj}m~X1n!}fP^zHKwsx(FSFQA<~gkskXteE1rWP3To6765jnQK7eQ+}@~5De z4Is#_jEe$q|Us>WhJ7tGXC3_WN$8@lIw!K)IL-!umt~el(Fg>UcI@L z=qNmp%gCwoPz@yvdnu&{AzWyZ&__`G*gSt9VbkJlp|eOr_j#Qb606oWaYm#bQ&JiQ zO+3P0w7Ane9J_5{gdhK_Zl6<#fHq)&m5t#G=T)H1q7!p%Fq06VCW_r7mwyocVV?M$< zZqKdDb7fm=_6ob|E8o~$knuV9*mYo!d1>9<9qv8cLwcJ)5zc2rdJsPe|jV6n`dnF9sz2`_|B(!&T^+i%E$U*!l;c+s= zA(XWA)bK1MMJz=}GdNzk${WBS*O-3R)r?C+4D98+NAPhfA=fB zv*oHs68(7B93{OyU_8}6_R^RiQX&wM3=4hgRw3Kw`NxJ06Y_+C4 zai1FT_AR7=?LKe|p_#9d*2$MrNJ94Gl=cyNvsn1by6)3Yg>{*wx|d$n`-4LxBiC~t zZY(_NfaeC-Zen0!QJO;<+yZ9FvK(|$ZW|vd_Vs|a9L(n#-Ws?*BLFV4 z9=04+KzrXJ&H}y>TA#!#=6w?av8+|D{623nmoF-y_o9bHc7PFqV=;|qM{Tlz$r1Of zEdK7#*v*LeCBm2~TK%s-Rkv7^shFh0dxnk?0V^n8)EdUGo;ku>5Kr^64s=gnnvlL$ z-XC37R)!UFUPHx^?;%m=mG0Sb>Dy)tp8$c^i64MkFIA}Dg}~qjA<4h5tyqtN(zCQN zctFY`QXACCh||4tivv8cP=k*gcUbi$GbhVmHIw?$IQWyiL375^a=wAn+VUdhWZl$NABHHS(9@ht&PF_@JE z`hr86AcvsRT-HN?Fkh%aF5l=vWpf@;1ad&=uy+F}c;lt~AsLKZ@L>mKTVD)s8{OFA z4M}kk$Ju;l7GZ^Un*y{#R8Wx?L4npAd7ID3O}nM|yRPhbdedi6ql%r6L*N&?02aWY zFVuz_N15N&$iBBYd=sSL(#8rH;j`jhN*6Ac%k+tMr@!BCyg_(?`D-!XCKUu?j(^NY z$SH_@vm~Ys zw%AXKy5rl8QGyW$iFEf#HnUh#*2)iGmkv#P>9-VhWfOcrOF8dpddwzOHF!?o(>%SRZ4xr( z5gJ#apL@b7lwc0(3IC7^b|o5Th*$DGx~|cacmw^0un_tM$K{4Qg;?S1Epd#2jX~A( zySCm69-k@?6s-?3s*ZcS>$(0_&$@J6f|^@A@2$Ucu9LWv_4Rm`BJplfMpl(`*6UqibUHOkz9~uS+>eoo?@kr-kr1jtS_X93j z(0}`RJWhBR0R1u|$S*_u{LeV(cMp%{?ZepNmu*6_1>)iV4k3Q`@>uCSNa0_Wfb<_P ze+lE?7eB5e4`S$-O`!a_iT_BW-6e{CcK;tz|BI`BU;a2V9!}jaQ-^r_KW9)D1ds6W5HjTAhlFM9>BFag0W|$Y AEC2ui literal 0 HcmV?d00001 diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 2cf1c7d0..05b1a46a 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -6,6 +6,7 @@ from nose.plugins.skip import SkipTest import datetime from decimal import Decimal +import mock from ckan.tests import helpers, factories from ckanext.xloader import loader @@ -51,10 +52,10 @@ def _get_records(self, table_name, limit=None, if col != '_full_text') else: cols = '*' - sql = 'SELECT {cols} FROM "{table_name}"' \ + sql = u'SELECT {cols} FROM "{table_name}"' \ .format(cols=cols, table_name=table_name) if limit is not None: - sql += ' LIMIT {}'.format(limit) + sql += u' LIMIT {}'.format(limit) results = c.execute(sql) return results.fetchall() @@ -344,6 +345,37 @@ def test_column_names(self): assert_equal(self._get_records('test1')[0], (1, u'2011-01-01', u'1', u'Galway')) + def test_unicode_column_names(self): + csv_filepath = get_sample_filepath('hebrew_sample.csv') + resource_id = 'test_hebrew' + factories.Resource(id=resource_id) + loader.load_csv(csv_filepath, resource_id=resource_id, + 
mimetype='text/csv', logger=PrintLogger(), unicode_headers=True) + records = self._get_records('test_hebrew') + print records + assert_equal( + records[0], + (1, u'229312',u'פ בית העמק עמקה 3',u'360',u'פרטי',u'Cl',u'תקן ישראלי מותר',u'400',u'20/09/2018',u'44.85',u'11.20') + ) + print self._get_column_names('test_hebrew') + assert_equal( + self._get_column_names('test_hebrew'), + [ + u'_id', + u'_full_text', + u'זיהוי', + u'שם', + u'תא דיווח', + u'שימוש', + u'פרמטר', + u'סוג תקן מי שתייה', + u'ערך תקן', + u'תאריך דיגום אחרון', + u'ריכוז אחרון', + u'אחוז מתקן מי השתיה' + ] + ) + class TestLoadUnhandledTypes(TestLoadBase): @@ -478,3 +510,35 @@ def test_no_entries(self): with assert_raises(LoaderError): loader.load_table(csv_filepath, resource_id=resource_id, mimetype='csv', logger=PrintLogger()) + + def test_hebrew_unicode_headers(self): + xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx') + resource_id = 'hebrew_sample_xlsx' + factories.Resource(id=resource_id) + loader.load_table(xlsx_filepath, resource_id=resource_id, + mimetype='xlsx', logger=PrintLogger(), unicode_headers=True) + records = self._get_records('hebrew_sample_xlsx') + print records + assert_equal( + records[0], + (1, Decimal('229312'), u'פ בית העמק עמקה 3', Decimal('360'), u'פרטי', u'Cl', u'תקן ישראלי מותר', Decimal('400'), datetime.datetime(2018, 9, 20, 0, 0), + Decimal('44.85000000000000142108547152020037174224853515625'), Decimal('11.199999999999999289457264239899814128875732421875')) + ) + print self._get_column_names('hebrew_sample_xlsx') + assert_equal( + self._get_column_names('hebrew_sample_xlsx'), + [ + u'_id', + u'_full_text', + u'זיהוי', + u'שם', + u'תא דיווח', + u'שימוש', + u'פרמטר', + u'סוג תקן מי שתייה', + u'ערך תקן', + u'תאריך דיגום אחרון', + u'ריכוז אחרון', + u'אחוז מתקן מי השתיה' + ] + ) \ No newline at end of file From 30f0e45e087dd7c47dfdbc5c6ba80449d2e8c296 Mon Sep 17 00:00:00 2001 From: Ori Hoch Date: Mon, 17 Aug 2020 17:46:26 +0300 Subject: [PATCH 2/3] flake8 fixes --- ckanext/xloader/plugin.py | 1 + ckanext/xloader/tests/test_loader.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index a595eba0..2306dc6b 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -23,6 +23,7 @@ class XLoaderFormats(object): formats = None + @classmethod def is_it_an_xloader_format(cls, format_): if cls.formats is None: diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 05b1a46a..70d32b1b 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -6,7 +6,6 @@ from nose.plugins.skip import SkipTest import datetime from decimal import Decimal -import mock from ckan.tests import helpers, factories from ckanext.xloader import loader @@ -355,7 +354,8 @@ def test_unicode_column_names(self): print records assert_equal( records[0], - (1, u'229312',u'פ בית העמק עמקה 3',u'360',u'פרטי',u'Cl',u'תקן ישראלי מותר',u'400',u'20/09/2018',u'44.85',u'11.20') + (1, u'229312', u'פ בית העמק עמקה 3', u'360', u'פרטי', u'Cl', u'תקן ישראלי מותר', u'400', u'20/09/2018', + u'44.85', u'11.20') ) print self._get_column_names('test_hebrew') assert_equal( @@ -521,8 +521,10 @@ def test_hebrew_unicode_headers(self): print records assert_equal( records[0], - (1, Decimal('229312'), u'פ בית העמק עמקה 3', Decimal('360'), u'פרטי', u'Cl', u'תקן ישראלי מותר', Decimal('400'), datetime.datetime(2018, 9, 20, 0, 0), - 
Decimal('44.85000000000000142108547152020037174224853515625'), Decimal('11.199999999999999289457264239899814128875732421875')) + (1, Decimal('229312'), u'פ בית העמק עמקה 3', Decimal('360'), u'פרטי', u'Cl', u'תקן ישראלי מותר', + Decimal('400'), datetime.datetime(2018, 9, 20, 0, 0), + Decimal('44.85000000000000142108547152020037174224853515625'), + Decimal('11.199999999999999289457264239899814128875732421875')) ) print self._get_column_names('hebrew_sample_xlsx') assert_equal( @@ -541,4 +543,4 @@ def test_hebrew_unicode_headers(self): u'ריכוז אחרון', u'אחוז מתקן מי השתיה' ] - ) \ No newline at end of file + ) From 45890cdc90bd82debb04e7abc69b2157959eabfa Mon Sep 17 00:00:00 2001 From: Ori Hoch Date: Thu, 20 Aug 2020 14:59:40 +0300 Subject: [PATCH 3/3] remove unicode_headers argument, use change_config test helper instead --- ckanext/xloader/loader.py | 16 ++++++++-------- ckanext/xloader/tests/test_loader.py | 6 ++++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index d2a7af4b..74226cf2 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -37,7 +37,7 @@ def get_write_engine(): MAX_COLUMN_LENGTH = 63 -def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None, unicode_headers=None): +def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): '''Loads a CSV into DataStore. Does not create the indexes.''' # use messytables to determine the header row @@ -64,7 +64,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None, unicod header_offset, headers = messytables.headers_guess(row_set.sample) # Some headers might have been converted from strings to floats and such. - headers = encode_headers(headers, unicode_headers=unicode_headers) + headers = encode_headers(headers) # Guess the delimiter used in the file with open(csv_filepath, 'r') as f: @@ -196,7 +196,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None, unicod # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids # the superuser issue. <-- picked - if unicode_headers or config.get('ckanext.xloader.unicode_headers'): + if config.get('ckanext.xloader.unicode_headers'): column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers]) else: column_names = ', '.join(['"{}"'.format(h) for h in headers]) @@ -240,7 +240,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None, unicod logger.info('Creating search index...') - if unicode_headers or config.get('ckanext.xloader.unicode_headers'): + if config.get('ckanext.xloader.unicode_headers'): encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields] else: encoded_fields = fields @@ -268,7 +268,7 @@ def create_column_indexes(fields, resource_id, logger): logger.info('...column indexes created.') -def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None, unicode_headers=None): +def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): '''Loads an Excel file (or other tabular data recognized by messytables) into Datastore and creates indexes. @@ -308,7 +308,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None, un for f in existing.get('fields', []) if 'info' in f) # Some headers might have been converted from strings to floats and such. 
- headers = encode_headers(headers, unicode_headers=unicode_headers) + headers = encode_headers(headers) row_set.register_processor(messytables.headers_processor(headers)) row_set.register_processor(messytables.offset_processor(offset + 1)) @@ -409,8 +409,8 @@ def get_types(): return _TYPES, TYPE_MAPPING -def encode_headers(headers, unicode_headers=None): - if unicode_headers or config.get('ckanext.xloader.unicode_headers'): +def encode_headers(headers): + if config.get('ckanext.xloader.unicode_headers'): decode_func = unicode else: decode_func = unidecode diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 70d32b1b..e18bf3ef 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -344,12 +344,13 @@ def test_column_names(self): assert_equal(self._get_records('test1')[0], (1, u'2011-01-01', u'1', u'Galway')) + @helpers.change_config('ckanext.xloader.unicode_headers', 'True') def test_unicode_column_names(self): csv_filepath = get_sample_filepath('hebrew_sample.csv') resource_id = 'test_hebrew' factories.Resource(id=resource_id) loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger(), unicode_headers=True) + mimetype='text/csv', logger=PrintLogger()) records = self._get_records('test_hebrew') print records assert_equal( @@ -511,12 +512,13 @@ def test_no_entries(self): loader.load_table(csv_filepath, resource_id=resource_id, mimetype='csv', logger=PrintLogger()) + @helpers.change_config('ckanext.xloader.unicode_headers', 'True') def test_hebrew_unicode_headers(self): xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx') resource_id = 'hebrew_sample_xlsx' factories.Resource(id=resource_id) loader.load_table(xlsx_filepath, resource_id=resource_id, - mimetype='xlsx', logger=PrintLogger(), unicode_headers=True) + mimetype='xlsx', logger=PrintLogger()) records = self._get_records('hebrew_sample_xlsx') print records assert_equal(
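The end state of the header handling after all three patches can be exercised outside CKAN with a small sketch. This is illustrative only: the plain `config` dict below stands in for CKAN's real config object, and the use of the `unicode` builtin (like the bare `print` statements in the tests) assumes Python 2, which is what the patched code targets.

    # -*- coding: utf-8 -*-
    # Illustrative sketch only (not part of the patches): mirrors what
    # loader.encode_headers() does after patch 3, with a plain dict standing
    # in for CKAN's config object. Assumes Python 2, like the patched code.
    from unidecode import unidecode

    config = {'ckanext.xloader.unicode_headers': True}  # assumed CKAN setting


    def encode_headers(headers):
        # With the option enabled, headers pass through unicode() unchanged;
        # otherwise unidecode() transliterates them (lossily) to ASCII.
        if config.get('ckanext.xloader.unicode_headers'):
            decode_func = unicode
        else:
            decode_func = unidecode
        encoded_headers = []
        for header in headers:
            try:
                encoded_headers.append(decode_func(header))
            except AttributeError:
                encoded_headers.append(decode_func(str(header)))
        return encoded_headers


    print encode_headers([u'זיהוי', u'שם', 42])
    # With the option on, the Hebrew names survive unchanged and 42 becomes
    # u'42'; with it off, unidecode would return lossy ASCII transliterations
    # for the Hebrew headers instead, which is what the new config flag and
    # the hebrew_sample.* test fixtures are guarding against.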