Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Guess encoding if default does not work #114

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 49 additions & 10 deletions datalad_tabby/io/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
List,
)

from charset_normalizer import from_path as cs_from_path

from .load_utils import (
_assign_context,
_compact_obj,
Expand Down Expand Up @@ -95,7 +97,19 @@ def _load_single(
trace=trace,
)

with src.open(newline='') as tsvfile:
try:
obj.update(self._parse_tsv_single(src))
except UnicodeDecodeError:
# by default Path.open() decodes with locale.getencoding();
# that failed for this file, so guess the encoding from its content
encoding = cs_from_path(src).best().encoding
obj.update(self._parse_tsv_single(src, encoding=encoding))

return self._postproc_obj(obj, src=src, trace=trace)

def _parse_tsv_single(self, src: Path, encoding: bool = None) -> Dict:
obj = {}
with src.open(newline='', encoding=encoding) as tsvfile:
reader = csv.reader(tsvfile, delimiter='\t')
# row_id is useful for error reporting
for row_id, row in enumerate(reader):
Expand All @@ -117,8 +131,7 @@ def _load_single(
# we support "sequence" values via multi-column values
# supporting two ways just adds unnecessary complexity
obj[key] = val

return self._postproc_obj(obj, src=src, trace=trace)
return obj

def _load_many(
self,
Expand All @@ -144,26 +157,52 @@ def _load_many(

# the table field/column names have purposefully _nothing_
# to do with any possibly loaded JSON data
fieldnames = None

with src.open(newline='') as tsvfile:
try:
array.extend(
self._parse_tsv_many(src, obj_tmpl, trace=trace, fieldnames=None)
)
except UnicodeDecodeError:
# by default Path.open() decodes with locale.getencoding();
# that failed for this file, so guess the encoding from its content
encoding = cs_from_path(src).best().encoding
array.extend(
self._parse_tsv_many(
src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
)
)

return array

def _parse_tsv_many(
self,
src: Path,
obj_tmpl: Dict,
trace: List,
fieldnames: List | None = None,
encoding: str | None = None,
) -> List[Dict]:
array = []
with src.open(newline="", encoding=encoding) as tsvfile:
# we cannot use DictReader -- we need to support identically named
# columns
reader = csv.reader(tsvfile, delimiter='\t')
reader = csv.reader(tsvfile, delimiter="\t")
# row_id is useful for error reporting
for row_id, row in enumerate(reader):
# row is a list of fields, with only as many items
# as this particular row has columns
if not len(row) \
or row[0].startswith('#') \
or all(v is None for v in row):
if (
not len(row)
or row[0].startswith("#")
or all(v is None for v in row)
):
# skip empty rows, rows with no key, or rows with
# a comment key
continue
if fieldnames is None:
# the first non-ignored row defines the property names/keys
# cut `row` short and remove trailing empty items
fieldnames = row[:_get_index_after_last_nonempty(row)]
fieldnames = row[: _get_index_after_last_nonempty(row)]
continue

obj = obj_tmpl.copy()
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ install_requires =
datalad >= 0.18.0
datalad-next @ git+https://github.com/datalad/datalad-next.git@main
datalad-metalad
charset-normalizer
openpyxl
pyld
packages = find_namespace:
Expand Down
Loading