From c1181925877a8da8e709fa08f707029c133e05ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Szczepanik?= Date: Tue, 21 Nov 2023 16:52:58 +0100 Subject: [PATCH] Add an encoding parameter to io.load_tabby By default, `Path.open()` uses `locale.getencoding()` when opening the file for reading. This has caused problems when loading files saved (presumably on Windows) with iso-8859-1 encoding on Linux (where utf-8 is the default), see #112 The default behaviour is maintained with `encoding=None`, and any valid encoding name can be provided as an argument to load_tabby. The encoding will be used for loading TSV files. The encoding is stored as an attribute of `_TabbyLoader` rather than passed as an input to the load functions - since they may end up being called in a few places (when sheet import statements are found), it would be too much passing around otherwise. With external libraries it might be possible to guess a file encoding that produces a correct result based on the file's content, but the success is not guaranteed when there are few non-ASCII characters in the entire file (think: list of authors). Here, we do not attempt to guess, instead expecting the user to know the encoding they need to use. Ref: https://docs.python.org/3/library/pathlib.html#pathlib.Path.open https://docs.python.org/3/library/functions.html#open --- datalad_tabby/io/load.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py index 332c0e4..ab9efe9 100644 --- a/datalad_tabby/io/load.py +++ b/datalad_tabby/io/load.py @@ -28,6 +28,7 @@ def load_tabby( jsonld: bool = True, recursive: bool = True, cpaths: List | None = None, + encoding: str | None = None, ) -> Dict | List: """Load a tabby (TSV) record as structured (JSON(-LD)) data @@ -48,11 +49,16 @@ def load_tabby( With the ``jsonld`` flag, a declared or default JSON-LD context is loaded and inserted into the record. 
+ + Tsv file encoding used when reading can be specified with the + ``encoding`` parameter. + """ ldr = _TabbyLoader( jsonld=jsonld, recursive=recursive, cpaths=cpaths, + encoding=encoding, ) return ldr(src=src, single=single) @@ -63,6 +69,7 @@ def __init__( jsonld: bool = True, recursive: bool = True, cpaths: List[Path] | None = None, + encoding: str | None = None, ): std_convention_path = Path(__file__).parent / 'conventions' if cpaths is None: @@ -70,6 +77,7 @@ def __init__( else: cpaths.append(std_convention_path) self._cpaths = cpaths + self._encoding = encoding self._jsonld = jsonld self._recursive = recursive @@ -95,7 +103,7 @@ def _load_single( trace=trace, ) - with src.open(newline='') as tsvfile: + with src.open(newline='', encoding=self._encoding) as tsvfile: reader = csv.reader(tsvfile, delimiter='\t') # row_id is useful for error reporting for row_id, row in enumerate(reader): @@ -146,7 +154,7 @@ def _load_many( # to do with any possibly loaded JSON data fieldnames = None - with src.open(newline='') as tsvfile: + with src.open(newline='', encoding=self._encoding) as tsvfile: # we cannot use DictReader -- we need to support identically named # columns reader = csv.reader(tsvfile, delimiter='\t')