
Guess encoding if default does not work
If reading a TSV file with the default encoding fails, roll out a
cannon (charset-normalizer) and try to guess the encoding to use.

By default, `Path.open()` uses `locale.getencoding()` when reading
a file, which means that we implicitly use UTF-8, at least on
Linux. This fails when reading files with non-ASCII characters that
were prepared on Windows (with not-uncommon settings). There is no
perfect way to determine the encoding of a plain text file, but
existing tools seem to do a good job.
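
For context, a quick way to see which default applies on a given system (note that `locale.getencoding()` exists since Python 3.11; on older versions the equivalent is `locale.getpreferredencoding(False)`):

    import locale

    # the encoding open()/Path.open() uses when none is given;
    # typically 'UTF-8' on Linux and 'cp1252' on many Windows setups
    print(locale.getpreferredencoding(False))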

This commit refactors the tabby loader, makes it use the guessed
encoding (but only after the default fails), and closes psychoinformatics-de#112

https://charset-normalizer.readthedocs.io
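
For illustration, a minimal sketch of the same try-then-guess pattern in isolation (assuming charset-normalizer is installed; the helper `read_text_with_fallback` is hypothetical and not part of this commit):

    from pathlib import Path

    from charset_normalizer import from_path

    def read_text_with_fallback(src: Path) -> str:
        # attempt the locale-default encoding first
        try:
            return src.read_text()
        except UnicodeDecodeError:
            # best() returns the most plausible match,
            # or None if no candidate encoding was found
            match = from_path(src).best()
            if match is None:
                raise
            return src.read_text(encoding=match.encoding)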
mslw committed Nov 13, 2023
1 parent ff3d225 commit 71676da
Showing 2 changed files with 50 additions and 10 deletions.
59 changes: 49 additions & 10 deletions datalad_tabby/io/load.py
@@ -10,6 +10,8 @@
     List,
 )
 
+from charset_normalizer import from_path as cs_from_path
+
 from .load_utils import (
     _assign_context,
     _compact_obj,
@@ -95,7 +97,19 @@ def _load_single(
             trace=trace,
         )
 
-        with src.open(newline='') as tsvfile:
+        try:
+            obj.update(self._parse_tsv_single(src))
+        except UnicodeDecodeError:
+            # by default Path.open() uses locale.getencoding()
+            # that didn't work, try guessing
+            encoding = cs_from_path(src).best().encoding
+            obj.update(self._parse_tsv_single(src, encoding=encoding))
+
+        return self._postproc_obj(obj, src=src, trace=trace)
+
+    def _parse_tsv_single(self, src: Path, encoding: str | None = None) -> Dict:
+        obj = {}
+        with src.open(newline='', encoding=encoding) as tsvfile:
             reader = csv.reader(tsvfile, delimiter='\t')
             # row_id is useful for error reporting
             for row_id, row in enumerate(reader):
@@ -117,8 +131,7 @@ def _load_single(
                 # we support "sequence" values via multi-column values
                 # supporting two ways just adds unnecessary complexity
                 obj[key] = val
-
-        return self._postproc_obj(obj, src=src, trace=trace)
+        return obj
 
     def _load_many(
         self,
@@ -144,26 +157,52 @@ def _load_many(
 
         # the table field/column names have purposefully _nothing_
         # to do with any possibly loaded JSON data
-        fieldnames = None
 
-        with src.open(newline='') as tsvfile:
+        try:
+            array.extend(
+                self._parse_tsv_many(src, obj_tmpl, trace=trace, fieldnames=None)
+            )
+        except UnicodeDecodeError:
+            # by default Path.open() uses locale.getencoding()
+            # that didn't work, try guessing
+            encoding = cs_from_path(src).best().encoding
+            array.extend(
+                self._parse_tsv_many(
+                    src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
+                )
+            )
+
+        return array
+
+    def _parse_tsv_many(
+        self,
+        src: Path,
+        obj_tmpl: Dict,
+        trace: List,
+        fieldnames: List | None = None,
+        encoding: str | None = None,
+    ) -> List[Dict]:
+        array = []
+        with src.open(newline="", encoding=encoding) as tsvfile:
             # we cannot use DictReader -- we need to support identically named
             # columns
-            reader = csv.reader(tsvfile, delimiter='\t')
+            reader = csv.reader(tsvfile, delimiter="\t")
             # row_id is useful for error reporting
             for row_id, row in enumerate(reader):
                 # row is a list of field, with only as many items
                 # as this particular row has columns
-                if not len(row) \
-                        or row[0].startswith('#') \
-                        or all(v is None for v in row):
+                if (
+                    not len(row)
+                    or row[0].startswith("#")
+                    or all(v is None for v in row)
+                ):
                     # skip empty rows, rows with no key, or rows with
                     # a comment key
                     continue
                 if fieldnames is None:
                     # the first non-ignored row defines the property names/keys
                     # cut `val` short and remove trailing empty items
-                    fieldnames = row[:_get_index_after_last_nonempty(row)]
+                    fieldnames = row[: _get_index_after_last_nonempty(row)]
                     continue
 
                 obj = obj_tmpl.copy()
1 change: 1 addition & 0 deletions setup.cfg
@@ -17,6 +17,7 @@ install_requires =
     datalad >= 0.18.0
     datalad-next @ git+https://github.com/datalad/datalad-next.git@main
     datalad-metalad
+    charset-normalizer
     openpyxl
     pyld
 packages = find_namespace:
