From 71676da64f761c56801936ddcdcfb6b0e4c559df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Szczepanik?=
Date: Mon, 13 Nov 2023 18:16:24 +0100
Subject: [PATCH] Guess encoding if default does not work

If reading a TSV file with the default encoding fails, roll out a
cannon (charset-normalizer) and try to guess the encoding to use.

By default, `Path.open()` uses `locale.getencoding()` when reading a
file, which means that we implicitly use UTF-8, at least on Linux.
This would fail when reading files with non-ASCII characters that were
prepared (with not-uncommon settings) on Windows.

There is no perfect way to learn the encoding of a plain text file,
but existing tools seem to do a good job. This commit refactors the
tabby loader, makes it use the guessed encoding (but only after the
default fails), and closes #112.

https://charset-normalizer.readthedocs.io
---
 datalad_tabby/io/load.py | 59 +++++++++++++++++++++++++++++++++-------
 setup.cfg                |  1 +
 2 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py
index 332c0e4..b454426 100644
--- a/datalad_tabby/io/load.py
+++ b/datalad_tabby/io/load.py
@@ -10,6 +10,8 @@
     List,
 )
 
+from charset_normalizer import from_path as cs_from_path
+
 from .load_utils import (
     _assign_context,
     _compact_obj,
@@ -95,7 +97,19 @@ def _load_single(
             trace=trace,
         )
 
-        with src.open(newline='') as tsvfile:
+        try:
+            obj.update(self._parse_tsv_single(src))
+        except UnicodeDecodeError:
+            # by default, Path.open() uses locale.getencoding();
+            # that didn't work, so try guessing
+            encoding = cs_from_path(src).best().encoding
+            obj.update(self._parse_tsv_single(src, encoding=encoding))
+
+        return self._postproc_obj(obj, src=src, trace=trace)
+
+    def _parse_tsv_single(self, src: Path, encoding: str | None = None) -> Dict:
+        obj = {}
+        with src.open(newline='', encoding=encoding) as tsvfile:
             reader = csv.reader(tsvfile, delimiter='\t')
             # row_id is useful for error reporting
             for row_id, row in enumerate(reader):
@@ -117,8 +131,7 @@ def _load_single(
                 # we support "sequence" values via multi-column values
                 # supporting two ways just adds unnecessary complexity
                 obj[key] = val
-
-        return self._postproc_obj(obj, src=src, trace=trace)
+        return obj
 
     def _load_many(
         self,
@@ -144,26 +157,52 @@ def _load_many(
 
         # the table field/column names have purposefully _nothing_
         # to do with any possibly loaded JSON data
-        fieldnames = None
-        with src.open(newline='') as tsvfile:
+        try:
+            array.extend(
+                self._parse_tsv_many(src, obj_tmpl, trace=trace, fieldnames=None)
+            )
+        except UnicodeDecodeError:
+            # by default, Path.open() uses locale.getencoding();
+            # that didn't work, so try guessing
+            encoding = cs_from_path(src).best().encoding
+            array.extend(
+                self._parse_tsv_many(
+                    src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
+                )
+            )
+
+        return array
+
+    def _parse_tsv_many(
+        self,
+        src: Path,
+        obj_tmpl: Dict,
+        trace: List,
+        fieldnames: List | None = None,
+        encoding: str | None = None,
+    ) -> List[Dict]:
+        array = []
+        with src.open(newline="", encoding=encoding) as tsvfile:
             # we cannot use DictReader -- we need to support identically named
             # columns
-            reader = csv.reader(tsvfile, delimiter='\t')
+            reader = csv.reader(tsvfile, delimiter="\t")
             # row_id is useful for error reporting
             for row_id, row in enumerate(reader):
                 # row is a list of field, with only as many items
                 # as this particular row has columns
-                if not len(row) \
-                        or row[0].startswith('#') \
-                        or all(v is None for v in row):
+                if (
+                    not len(row)
+                    or row[0].startswith("#")
+                    or all(v is None for v in row)
+                ):
                     # skip empty rows, rows with no key, or rows with
                     # a comment key
                     continue
 
                 if fieldnames is None:
                     # the first non-ignored row defines the property names/keys
                     # cut `val` short and remove trailing empty items
-                    fieldnames = row[:_get_index_after_last_nonempty(row)]
+                    fieldnames = row[: _get_index_after_last_nonempty(row)]
                     continue
                 obj = obj_tmpl.copy()
 
diff --git a/setup.cfg b/setup.cfg
index 8b06c8c..fe2b49f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,6 +17,7 @@ install_requires =
     datalad >= 0.18.0
     datalad-next @ git+https://github.com/datalad/datalad-next.git@main
     datalad-metalad
+    charset-normalizer
     openpyxl
     pyld
 packages = find_namespace:
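
Note, not part of the commit: below is a minimal standalone sketch of the
fallback pattern the patch implements, assuming charset-normalizer is
installed. The file name and the `guess_encoding` helper are illustrative.
Unlike the patched code, the sketch also guards against `best()` returning
None, which charset-normalizer does when it cannot find any plausible match.

    from pathlib import Path

    from charset_normalizer import from_path

    def guess_encoding(src: Path) -> str:
        # from_path() returns a CharsetMatches container; best() picks
        # the most plausible match, or returns None if there is none
        best = from_path(src).best()
        if best is None:
            raise ValueError(f'cannot determine the encoding of {src}')
        return best.encoding

    src = Path('example.tsv')  # illustrative path
    try:
        # try the locale-default encoding first (implicitly UTF-8 on
        # most Linux systems)
        text = src.read_text()
    except UnicodeDecodeError:
        # the default failed, so retry with the guessed encoding
        text = src.read_text(encoding=guess_encoding(src))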