From 0408aafe6b9fb29b1152adc84b7529920d77f174 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Szczepanik?= Date: Tue, 21 Nov 2023 14:11:02 +0100 Subject: [PATCH 1/2] Ignore rows where all values are empty strings This will do a better job of ignoring empty tsv lines, by checking for empty strings in addition to Nones (I think csv reader typically produces empty strings). --- datalad_tabby/io/load.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py index 332c0e4..47ff691 100644 --- a/datalad_tabby/io/load.py +++ b/datalad_tabby/io/load.py @@ -154,9 +154,12 @@ def _load_many( for row_id, row in enumerate(reader): # row is a list of field, with only as many items # as this particular row has columns - if not len(row) \ - or row[0].startswith('#') \ - or all(v is None for v in row): + if ( + not len(row) + or row[0].startswith("#") + or all(v is None for v in row) + or all(v == "" for v in row) + ): # skip empty rows, rows with no key, or rows with # a comment key continue From cae025af31ed1af393956d879e2bb4eabe8e33a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Szczepanik?= Date: Tue, 21 Nov 2023 15:24:33 +0100 Subject: [PATCH 2/2] Trim empty rows when converting xlsx to tsv With this change, excel to tabby conversion will preserve empty lines in the middle of the file (maybe a visual separation of sections in a many-objects file), but truncate empty lines at the end (maybe excel artefact). This requires double iteration over the rows (first to find where data ends, then to export), but it seems inexpensive. This should help in situations where an excel (or calc) xlsx file preserves blank lines. One test data file (tsv) used to test round-tripping is altered to remove empty lines at the end. So in the end we no longer guarantee round-tripping these empty lines, but I feel this was a non-feature. 
--- datalad_tabby/io/xlsx.py | 11 ++++++++++- .../tests/data/demorecord/tabbydemo_files.tsv | 4 ---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/datalad_tabby/io/xlsx.py b/datalad_tabby/io/xlsx.py index 1276094..38926c1 100644 --- a/datalad_tabby/io/xlsx.py +++ b/datalad_tabby/io/xlsx.py @@ -95,4 +95,13 @@ def _sheet2tsv(ws: Worksheet, dest: Path): tsvfile, delimiter='\t', ) - writer.writerows(ws.iter_rows(values_only=True)) + + # find the last nonempty row + max_idx = 1 + for i, row in enumerate(ws.iter_rows(values_only=True)): + if any(v is not None for v in row): + max_idx = i + 1 # max row is a 1-based index + + # write tsv, truncating empty rows at the end + writer.writerows(ws.iter_rows(values_only=True, max_row=max_idx)) + diff --git a/datalad_tabby/tests/data/demorecord/tabbydemo_files.tsv b/datalad_tabby/tests/data/demorecord/tabbydemo_files.tsv index 2bf7820..b1c0ba6 100644 --- a/datalad_tabby/tests/data/demorecord/tabbydemo_files.tsv +++ b/datalad_tabby/tests/data/demorecord/tabbydemo_files.tsv @@ -2,7 +2,3 @@ path[POSIX] size[bytes] checksum[md5] url raw/adelie.csv 23755 e7e2be6b203a221949f05e02fcefd853 https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff raw/gentoo.csv 11263 1549566fb97afa879dc9446edcf2015f https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381 raw/chinstrap.csv 18872 e4b0710c69297031d63866ce8b888f25 https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462 - - - -