From 71676da64f761c56801936ddcdcfb6b0e4c559df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Szczepanik?=
Date: Mon, 13 Nov 2023 18:16:24 +0100
Subject: [PATCH 1/5] Guess encoding if default does not work

If reading a tsv file with the default encoding fails, roll out a
cannon (charset-normalizer) and try to guess the encoding to use.

By default, `Path.open()` uses `locale.getencoding()` when reading a
file, which means that we implicitly use utf-8, at least on Linux.
This would fail when reading files with non-ascii characters that were
prepared on Windows (with settings that are not uncommon there).

There is no perfect way to learn the encoding of a plain-text file,
but existing tools seem to do a good job.

This commit refactors the tabby loader, makes it use the guessed
encoding (but only after the default fails), and closes #112.

https://charset-normalizer.readthedocs.io
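
For illustration, the guessing step used below boils down to this
(a minimal sketch; the file name is hypothetical):

    from charset_normalizer import from_path

    # inspect the raw bytes and rank candidate encodings;
    # best() returns None if no plausible match was found
    match = from_path("dataset.tsv").best()
    if match is not None:
        print(match.encoding)  # e.g. 'cp1250'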
---
 datalad_tabby/io/load.py | 59 +++++++++++++++++++++++++++++++++-------
 setup.cfg                |  1 +
 2 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py
index 332c0e4..b454426 100644
--- a/datalad_tabby/io/load.py
+++ b/datalad_tabby/io/load.py
@@ -10,6 +10,8 @@
     List,
 )
 
+from charset_normalizer import from_path as cs_from_path
+
 from .load_utils import (
     _assign_context,
     _compact_obj,
@@ -95,7 +97,19 @@ def _load_single(
             trace=trace,
         )
 
-        with src.open(newline='') as tsvfile:
+        try:
+            obj.update(self._parse_tsv_single(src))
+        except UnicodeDecodeError:
+            # by default Path.open() uses locale.getencoding()
+            # that didn't work, try guessing
+            encoding = cs_from_path(src).best().encoding
+            obj.update(self._parse_tsv_single(src, encoding=encoding))
+
+        return self._postproc_obj(obj, src=src, trace=trace)
+
+    def _parse_tsv_single(self, src: Path, encoding: bool = None) -> Dict:
+        obj = {}
+        with src.open(newline='', encoding=encoding) as tsvfile:
             reader = csv.reader(tsvfile, delimiter='\t')
             # row_id is useful for error reporting
             for row_id, row in enumerate(reader):
@@ -117,8 +131,7 @@ def _load_single(
                 # we support "sequence" values via multi-column values
                 # supporting two ways just adds unnecessary complexity
                 obj[key] = val
-
-        return self._postproc_obj(obj, src=src, trace=trace)
+        return obj
 
     def _load_many(
         self,
@@ -144,26 +157,52 @@ def _load_many(
 
         # the table field/column names have purposefully _nothing_
         # to do with any possibly loaded JSON data
-        fieldnames = None
-
-        with src.open(newline='') as tsvfile:
+        try:
+            array.extend(
+                self._parse_tsv_many(src, obj_tmpl, trace=trace, fieldnames=None)
+            )
+        except UnicodeDecodeError:
+            # by default Path.open() uses locale.getencoding()
+            # that didn't work, try guessing
+            encoding = cs_from_path(src).best().encoding
+            array.extend(
+                self._parse_tsv_many(
+                    src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
+                )
+            )
+
+        return array
+
+    def _parse_tsv_many(
+        self,
+        src: Path,
+        obj_tmpl: Dict,
+        trace: List,
+        fieldnames: List | None = None,
+        encoding: str | None = None,
+    ) -> List[Dict]:
+        array = []
+        with src.open(newline="", encoding=encoding) as tsvfile:
             # we cannot use DictReader -- we need to support identically named
             # columns
-            reader = csv.reader(tsvfile, delimiter='\t')
+            reader = csv.reader(tsvfile, delimiter="\t")
             # row_id is useful for error reporting
             for row_id, row in enumerate(reader):
                 # row is a list of field, with only as many items
                 # as this particular row has columns
-                if not len(row) \
-                        or row[0].startswith('#') \
-                        or all(v is None for v in row):
+                if (
+                    not len(row)
+                    or row[0].startswith("#")
+                    or all(v is None for v in row)
+                ):
                     # skip empty rows, rows with no key, or rows with
                     # a comment key
                     continue
                 if fieldnames is None:
                     # the first non-ignored row defines the property names/keys
                     # cut `val` short and remove trailing empty items
-                    fieldnames = row[:_get_index_after_last_nonempty(row)]
+                    fieldnames = row[: _get_index_after_last_nonempty(row)]
                     continue
 
                 obj = obj_tmpl.copy()
diff --git a/setup.cfg b/setup.cfg
index 8b06c8c..fe2b49f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,6 +17,7 @@ install_requires =
     datalad >= 0.18.0
     datalad-next @ git+https://github.com/datalad/datalad-next.git@main
     datalad-metalad
+    charset-normalizer
    openpyxl
     pyld
 packages = find_namespace:

From ef7d778311f36f3b646d5c388bd929f99a231345 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Szczepanik?=
Date: Mon, 13 Nov 2023 19:05:37 +0100
Subject: [PATCH 2/5] Narrow down the try/except

This narrows down the try/except to wrap only the loader, not the
extend/append, so that it is clearer what is being tried.
---
 datalad_tabby/io/load.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py
index b454426..69597fe 100644
--- a/datalad_tabby/io/load.py
+++ b/datalad_tabby/io/load.py
@@ -98,12 +98,14 @@ def _load_single(
         )
 
         try:
-            obj.update(self._parse_tsv_single(src))
+            tsv_obj = self._parse_tsv_single(src)
         except UnicodeDecodeError:
             # by default Path.open() uses locale.getencoding()
             # that didn't work, try guessing
             encoding = cs_from_path(src).best().encoding
-            obj.update(self._parse_tsv_single(src, encoding=encoding))
+            tsv_obj = self._parse_tsv_single(src, encoding=encoding)
+
+        obj.update(tsv_obj)
 
         return self._postproc_obj(obj, src=src, trace=trace)
 
@@ -159,19 +161,19 @@ def _load_many(
         # to do with any possibly loaded JSON data
 
         try:
-            array.extend(
-                self._parse_tsv_many(src, obj_tmpl, trace=trace, fieldnames=None)
+            tsv_array = self._parse_tsv_many(
+                src, obj_tmpl, trace=trace, fieldnames=None
             )
         except UnicodeDecodeError:
             # by default Path.open() uses locale.getencoding()
             # that didn't work, try guessing
             encoding = cs_from_path(src).best().encoding
-            array.extend(
-                self._parse_tsv_many(
-                    src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
-                )
+            tsv_array = self._parse_tsv_many(
+                src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
             )
 
+        array.extend(tsv_array)
+
         return array
 
     def _parse_tsv_many(

From 8d4b6e1abacc81a1c77cd123e6e82d0803016fd9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Szczepanik?=
Date: Tue, 21 Nov 2023 12:20:21 +0100
Subject: [PATCH 3/5] Fix a type annotation
---
 datalad_tabby/io/load.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py
index 69597fe..dc43565 100644
--- a/datalad_tabby/io/load.py
+++ b/datalad_tabby/io/load.py
@@ -109,7 +109,7 @@ def _load_single(
 
         return self._postproc_obj(obj, src=src, trace=trace)
 
-    def _parse_tsv_single(self, src: Path, encoding: bool = None) -> Dict:
+    def _parse_tsv_single(self, src: Path, encoding: str | None = None) -> Dict:
         obj = {}
         with src.open(newline='', encoding=encoding) as tsvfile:
             reader = csv.reader(tsvfile, delimiter='\t')

From 070937a7c2565532fb61de29abc90d4650757ecd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Szczepanik?=
Date: Tue, 21 Nov 2023 12:35:50 +0100
Subject: [PATCH 4/5] Add an encoding argument to tabby loader

When an encoding is explicitly specified, it will be used. Otherwise,
the default encoding used by `Path.open()` will be tried first, and
charset_normalizer will be used to guess the encoding if that fails.
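
A sketch of the resulting call pattern (file name and encoding value
are made-up examples):

    from pathlib import Path

    from datalad_tabby.io.load import _TabbyLoader

    ldr = _TabbyLoader()
    # force a specific encoding, skipping the try-default-then-guess logic
    record = ldr(Path("dataset_tabby.tsv"), single=True, encoding="cp1250")
    # or leave encoding unset: try the default, guess only on failure
    record = ldr(Path("dataset_tabby.tsv"), single=True)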
---
 datalad_tabby/io/load.py | 41 +++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py
index dc43565..94ae273 100644
--- a/datalad_tabby/io/load.py
+++ b/datalad_tabby/io/load.py
@@ -75,10 +75,11 @@ def __init__(
         self._jsonld = jsonld
         self._recursive = recursive
 
-    def __call__(self, src: Path, *, single: bool = True):
+    def __call__(self, src: Path, *, single: bool = True, encoding: str | None = None):
         return (self._load_single if single else self._load_many)(
             src=src,
             trace=[],
+            encoding=encoding,
         )
 
     def _load_single(
@@ -86,6 +87,7 @@ def _load_single(
         *,
         src: Path,
         trace: List,
+        encoding: str | None = None,
     ) -> Dict:
         jfpath = self._get_corresponding_jsondata_fpath(src)
         obj = json.load(jfpath.open()) if jfpath.exists() else {}
@@ -97,13 +99,16 @@ def _load_single(
             trace=trace,
         )
 
-        try:
-            tsv_obj = self._parse_tsv_single(src)
-        except UnicodeDecodeError:
-            # by default Path.open() uses locale.getencoding()
-            # that didn't work, try guessing
-            encoding = cs_from_path(src).best().encoding
+        if encoding is not None:
             tsv_obj = self._parse_tsv_single(src, encoding=encoding)
+        else:
+            try:
+                tsv_obj = self._parse_tsv_single(src)
+            except UnicodeDecodeError:
+                # by default Path.open() uses locale.getencoding()
+                # that didn't work, try guessing
+                encoding = cs_from_path(src).best().encoding
+                tsv_obj = self._parse_tsv_single(src, encoding=encoding)
 
         obj.update(tsv_obj)
 
@@ -140,6 +145,7 @@ def _load_many(
         *,
         src: Path,
         trace: List,
+        encoding: str | None = None,
     ) -> List[Dict]:
         obj_tmpl = {}
         array = list()
@@ -160,17 +166,22 @@ def _load_many(
 
         # the table field/column names have purposefully _nothing_
         # to do with any possibly loaded JSON data
 
-        try:
-            tsv_array = self._parse_tsv_many(
-                src, obj_tmpl, trace=trace, fieldnames=None
-            )
-        except UnicodeDecodeError:
-            # by default Path.open() uses locale.getencoding()
-            # that didn't work, try guessing
-            encoding = cs_from_path(src).best().encoding
+        if encoding is not None:
             tsv_array = self._parse_tsv_many(
                 src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
             )
+        else:
+            try:
+                tsv_array = self._parse_tsv_many(
+                    src, obj_tmpl, trace=trace, fieldnames=None
+                )
+            except UnicodeDecodeError:
+                # by default Path.open() uses locale.getencoding()
+                # that didn't work, try guessing
+                encoding = cs_from_path(src).best().encoding
+                tsv_array = self._parse_tsv_many(
+                    src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
+                )
 
         array.extend(tsv_array)

From f0c44c1818c11a56c99ffe93e67cc0c688747f18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Szczepanik?=
Date: Tue, 21 Nov 2023 13:14:08 +0100
Subject: [PATCH 5/5] Make encoding a property of TabbyLoader

Because the load functions are used recursively (when load statements
are found in a tabby file), it would be too much hassle to pass the
encoding parameter around - better to use `self._encoding`.
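
With this change the encoding is given once, to the loader, and reused
for every (recursively) loaded sheet, e.g. (sketch; file name and
encoding value are made-up examples):

    from pathlib import Path

    from datalad_tabby.io.load import load_tabby

    # stored as self._encoding on the internal _TabbyLoader and applied
    # to the root sheet as well as sheets pulled in via load statements
    record = load_tabby(Path("dataset_tabby.tsv"), single=True, encoding="cp1250")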
---
 datalad_tabby/io/load.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py
index 94ae273..681cacb 100644
--- a/datalad_tabby/io/load.py
+++ b/datalad_tabby/io/load.py
@@ -30,6 +30,7 @@ def load_tabby(
     jsonld: bool = True,
     recursive: bool = True,
     cpaths: List | None = None,
+    encoding: str | None = None,
 ) -> Dict | List:
     """Load a tabby (TSV) record as structured (JSON(-LD)) data
 
@@ -50,11 +51,14 @@ def load_tabby(
 
     With the ``jsonld`` flag, a declared or default JSON-LD context is
     loaded and inserted into the record.
+
+    Encoding used when reading tsv files can be specified as ``encoding``.
     """
     ldr = _TabbyLoader(
         jsonld=jsonld,
         recursive=recursive,
         cpaths=cpaths,
+        encoding=encoding,
     )
     return ldr(src=src, single=single)
 
@@ -65,6 +69,7 @@ def __init__(
         jsonld: bool = True,
         recursive: bool = True,
         cpaths: List[Path] | None = None,
+        encoding: str | None = None,
     ):
         std_convention_path = Path(__file__).parent / 'conventions'
         if cpaths is None:
@@ -72,14 +77,14 @@ def __init__(
         else:
             cpaths.append(std_convention_path)
         self._cpaths = cpaths
+        self._encoding = encoding
         self._jsonld = jsonld
         self._recursive = recursive
 
-    def __call__(self, src: Path, *, single: bool = True, encoding: str | None = None):
+    def __call__(self, src: Path, *, single: bool = True):
         return (self._load_single if single else self._load_many)(
             src=src,
             trace=[],
-            encoding=encoding,
         )
 
     def _load_single(
@@ -87,7 +92,6 @@ def _load_single(
         *,
         src: Path,
         trace: List,
-        encoding: str | None = None,
     ) -> Dict:
         jfpath = self._get_corresponding_jsondata_fpath(src)
         obj = json.load(jfpath.open()) if jfpath.exists() else {}
@@ -98,9 +102,8 @@ def _load_single(
             src=src,
             trace=trace,
         )
-
-        if encoding is not None:
-            tsv_obj = self._parse_tsv_single(src, encoding=encoding)
+        if self._encoding is not None:
+            tsv_obj = self._parse_tsv_single(src, encoding=self._encoding)
         else:
             try:
                 tsv_obj = self._parse_tsv_single(src)
@@ -145,7 +148,6 @@ def _load_many(
         *,
         src: Path,
         trace: List,
-        encoding: str | None = None,
     ) -> List[Dict]:
         obj_tmpl = {}
         array = list()
@@ -165,10 +167,9 @@ def _load_many(
 
         # the table field/column names have purposefully _nothing_
         # to do with any possibly loaded JSON data
-
-        if encoding is not None:
+        if self._encoding is not None:
             tsv_array = self._parse_tsv_many(
-                src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
+                src, obj_tmpl, trace=trace, fieldnames=None, encoding=self._encoding
             )
         else:
             try: