Skip to content

Commit

Permalink
Ignore UTF-8 BOM sequences in various scenarios (#69)
Browse files Browse the repository at this point in the history
- Adapt the .open() helper to use encoding=utf-8-sig (for reading only)
- Adapt the Reader() to ignore a (single) UTF-8 BOM sequence at the
  start of a line; this handles concatenated files, non-file input, etc.
- Add exhaustive tests

Note that this deliberately does not strip multiple concatenated BOM
sequences since that's indicative of malformed input.

Fixes #68.
  • Loading branch information
wbolster authored Oct 18, 2021
1 parent 6231269 commit afabff1
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 2 deletions.
13 changes: 11 additions & 2 deletions jsonlines/jsonlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import builtins
import codecs
import enum
import io
import json
Expand Down Expand Up @@ -42,6 +43,13 @@
str,
}

# Prefix characters the reader drops from the start of a line before
# parsing it. Only a single leading character is ever removed per line,
# so every entry must be exactly one character long. This is kept as a
# tuple so it can be handed straight to ``str.startswith()``.
SKIPPABLE_SINGLE_INITIAL_CHARS = (
    # Record separator used by RFC7464 text sequences.
    "\x1e",
    # The UTF-8 byte order mark, as one decoded character.
    codecs.BOM_UTF8.decode("utf-8"),
)


class DumpsResultConversion(enum.Enum):
LeaveAsIs = enum.auto()
Expand Down Expand Up @@ -293,7 +301,7 @@ def read(
)
raise exc from orig_exc

if line.startswith("\x1e"): # RFC7464 text sequence
if line.startswith(SKIPPABLE_SINGLE_INITIAL_CHARS):
line = line[1:]

try:
Expand Down Expand Up @@ -611,7 +619,8 @@ def open(
raise ValueError("'mode' must be either 'r', 'w', or 'a'")

cls = Reader if mode == "r" else Writer
fp = builtins.open(file, mode=mode + "t", encoding="utf-8")
encoding = "utf-8-sig" if mode == "r" else "utf-8"
fp = builtins.open(file, mode=mode + "t", encoding=encoding)
kwargs = dict(
loads=loads,
dumps=dumps,
Expand Down
64 changes: 64 additions & 0 deletions tests/test_jsonlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
Tests for the jsonlines library.
"""

import codecs
import collections
import io
import json
import tempfile

import jsonlines
Expand Down Expand Up @@ -38,6 +40,48 @@ def test_reader_rfc7464_text_sequences() -> None:
assert list(reader) == ["a", "b"]


def test_reader_utf8_bom_bytes() -> None:
    """
    A UTF-8 BOM in binary input is skipped, both at the very start of the
    stream and at the start of a later line.
    """
    bom = codecs.BOM_UTF8
    # Two records, each preceded by its own BOM (as if two files with a
    # BOM had been concatenated).
    payload = bom + b"1\n" + bom + b"2\n"
    with jsonlines.Reader(io.BytesIO(payload)) as reader:
        items = list(reader)
    assert items == [1, 2]


def test_reader_utf8_bom_text() -> None:
    """
    Text version of ``test_reader_utf8_bom_bytes()``.

    The decoded BOM character is placed both at the very start of the
    stream and at the start of a later line, mirroring the bytes test, so
    that both positions are covered for text input as well.
    """
    bom = codecs.BOM_UTF8.decode()
    chunks = [
        bom,  # stream-initial BOM, as in the bytes variant
        "1\n",
        bom,  # BOM in the middle of the stream
        "2\n",
    ]
    fp = io.StringIO("".join(chunks))
    with jsonlines.Reader(fp) as reader:
        assert list(reader) == [1, 2]


def test_reader_utf8_bom_bom_bom() -> None:
    """
    Several BOM characters in a row are not tolerated: the line is
    reported as invalid, with the JSON decode error as its cause.
    """
    triple_bom_line = "".join([codecs.BOM_UTF8.decode()] * 3) + "1\n"
    reader = jsonlines.Reader([triple_bom_line])

    with pytest.raises(jsonlines.InvalidLineError) as excinfo:
        reader.read()

    error = excinfo.value
    assert isinstance(error.__cause__, json.JSONDecodeError)
    assert "invalid json" in str(error)


def test_writer_text() -> None:
fp = io.StringIO()
with jsonlines.Writer(fp) as writer:
Expand Down Expand Up @@ -78,6 +122,7 @@ def test_invalid_lines() -> None:
exc = excinfo.value
assert "invalid json" in str(exc)
assert exc.line == data
assert isinstance(exc.__cause__, json.JSONDecodeError)


def test_skip_invalid() -> None:
Expand Down Expand Up @@ -203,6 +248,18 @@ def test_open_reading() -> None:
assert list(reader) == [123]


def test_open_reading_with_utf8_bom() -> None:
    """
    A file on disk that starts with a UTF-8 BOM reads cleanly through the
    ``.open()`` helper.
    """
    with tempfile.NamedTemporaryFile("wb") as tmp:
        tmp.write(codecs.BOM_UTF8 + b"123\n")
        # Flush so the bytes are on disk before the file is reopened by name.
        tmp.flush()
        with jsonlines.open(tmp.name) as reader:
            items = list(reader)
        assert items == [123]


def test_open_writing() -> None:
with tempfile.NamedTemporaryFile("w+b") as fp:
with jsonlines.open(fp.name, mode="w") as writer:
Expand All @@ -224,3 +281,10 @@ def test_open_invalid_mode() -> None:
with pytest.raises(ValueError) as excinfo:
jsonlines.open("foo", mode="foo")
assert "mode" in str(excinfo.value)


def test_single_char_stripping() -> None:
    """
    Sanity check that a helper constant actually contains single-char strings.

    The reader removes a matched prefix by dropping exactly one character
    from the line, so a longer entry would be stripped only partially.
    """
    skippable = jsonlines.jsonlines.SKIPPABLE_SINGLE_INITIAL_CHARS
    assert all(len(s) == 1 for s in skippable)

0 comments on commit afabff1

Please sign in to comment.