From 71676da64f761c56801936ddcdcfb6b0e4c559df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Szczepanik?=
Date: Mon, 13 Nov 2023 18:16:24 +0100
Subject: [PATCH] Guess encoding if default does not work

If reading a TSV file with the default encoding fails, roll out a
cannon (charset-normalizer) and try to guess the encoding to use.

By default, `Path.open()` uses `locale.getencoding()` when reading a
file, which means that we implicitly use UTF-8, at least on Linux.
This would fail when reading files with non-ASCII characters that were
prepared (with not-uncommon settings) on Windows.

There is no perfect way to learn the encoding of a plain text file,
but existing tools seem to do a good job. This commit refactors the
tabby loader, makes it use the guessed encoding (but only after the
default fails), and closes #112.

https://charset-normalizer.readthedocs.io
---
 datalad_tabby/io/load.py | 59 +++++++++++++++++++++++++++++++++-------
 setup.cfg                |  1 +
 2 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py
index 332c0e4..b454426 100644
--- a/datalad_tabby/io/load.py
+++ b/datalad_tabby/io/load.py
@@ -10,6 +10,8 @@
     List,
 )
 
+from charset_normalizer import from_path as cs_from_path
+
 from .load_utils import (
     _assign_context,
     _compact_obj,
@@ -95,7 +97,19 @@ def _load_single(
             trace=trace,
         )
 
-        with src.open(newline='') as tsvfile:
+        try:
+            obj.update(self._parse_tsv_single(src))
+        except UnicodeDecodeError:
+            # by default, Path.open() uses locale.getencoding();
+            # that didn't work, so try guessing
+            encoding = cs_from_path(src).best().encoding
+            obj.update(self._parse_tsv_single(src, encoding=encoding))
+
+        return self._postproc_obj(obj, src=src, trace=trace)
+
+    def _parse_tsv_single(self, src: Path, encoding: str | None = None) -> Dict:
+        obj = {}
+        with src.open(newline='', encoding=encoding) as tsvfile:
             reader = csv.reader(tsvfile, delimiter='\t')
             # row_id is useful for error reporting
             for row_id, row in enumerate(reader):
@@ -117,8 +131,7 @@ def _load_single(
                 # we support "sequence" values via multi-column values
                 # supporting two ways just adds unnecessary complexity
                 obj[key] = val
-
-        return self._postproc_obj(obj, src=src, trace=trace)
+        return obj
 
     def _load_many(
         self,
@@ -144,26 +157,52 @@ def _load_many(
 
         # the table field/column names have purposefully _nothing_
         # to do with any possibly loaded JSON data
-        fieldnames = None
-        with src.open(newline='') as tsvfile:
+        try:
+            array.extend(
+                self._parse_tsv_many(src, obj_tmpl, trace=trace, fieldnames=None)
+            )
+        except UnicodeDecodeError:
+            # by default, Path.open() uses locale.getencoding();
+            # that didn't work, so try guessing
+            encoding = cs_from_path(src).best().encoding
+            array.extend(
+                self._parse_tsv_many(
+                    src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
+                )
+            )
+
+        return array
+
+    def _parse_tsv_many(
+        self,
+        src: Path,
+        obj_tmpl: Dict,
+        trace: List,
+        fieldnames: List | None = None,
+        encoding: str | None = None,
+    ) -> List[Dict]:
+        array = []
+        with src.open(newline="", encoding=encoding) as tsvfile:
             # we cannot use DictReader -- we need to support identically named
             # columns
-            reader = csv.reader(tsvfile, delimiter='\t')
+            reader = csv.reader(tsvfile, delimiter="\t")
             # row_id is useful for error reporting
             for row_id, row in enumerate(reader):
                 # row is a list of field, with only as many items
                 # as this particular row has columns
-                if not len(row) \
-                        or row[0].startswith('#') \
-                        or all(v is None for v in row):
+                if (
+                    not len(row)
+                    or row[0].startswith("#")
+                    or all(v is None for v in row)
+                ):
                     # skip empty rows, rows with no key, or rows with
                     # a comment key
                     continue
 
                 if fieldnames is None:
                     # the first non-ignored row defines the property names/keys
                     # cut `val` short and remove trailing empty items
-                    fieldnames = row[:_get_index_after_last_nonempty(row)]
+                    fieldnames = row[: _get_index_after_last_nonempty(row)]
                     continue
                 obj = obj_tmpl.copy()
 
diff --git a/setup.cfg b/setup.cfg
index 8b06c8c..fe2b49f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,6 +17,7 @@ install_requires =
     datalad >= 0.18.0
     datalad-next @ git+https://github.com/datalad/datalad-next.git@main
     datalad-metalad
+    charset-normalizer
     openpyxl
     pyld
 packages = find_namespace:
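
Note, not part of the commit: below is a minimal standalone sketch of the
fallback pattern the patch implements, assuming charset-normalizer is
installed. The file name and the `guess_encoding` helper are illustrative.
Unlike the patched code, the sketch also guards against `best()` returning
None, which charset-normalizer does when it cannot find any plausible match.

    from pathlib import Path

    from charset_normalizer import from_path

    def guess_encoding(src: Path) -> str:
        # from_path() returns a CharsetMatches container; best() picks
        # the most plausible match, or returns None if there is none
        best = from_path(src).best()
        if best is None:
            raise ValueError(f'cannot determine the encoding of {src}')
        return best.encoding

    src = Path('example.tsv')  # illustrative path
    try:
        # try the locale-default encoding first (implicitly UTF-8 on
        # most Linux systems)
        text = src.read_text()
    except UnicodeDecodeError:
        # the default failed, so retry with the guessed encoding
        text = src.read_text(encoding=guess_encoding(src))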