Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wip: Add support for a maximum line length while reading #70

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 90 additions & 3 deletions jsonlines/jsonlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import typing
from typing import (
Any,
AnyStr,
Callable,
Dict,
Iterable,
Expand Down Expand Up @@ -182,16 +183,34 @@ class Reader(ReaderWriterBase):
decoder. If specified, it must be a callable that accepts a
(unicode) string and returns the decoded object.

:param file_or_iterable: file-like object or iterable yielding lines as
strings
The `max_line_length` argument limits the maximum line length. If
specified, this prevents reading and parsing of too large values.
When reading from an input file that has a ``.readline()`` method,
that will be used. For custom iterables, it is not possible to
limit the size of yielded items, but the limit will still prevent
JSON parsing of too large lines. Note that the limit applies per
line, not to the total amount of data.

.. warning::

Use `max_line_length` as a safety measure for untrusted input:
without a limit, (potentially malicious) large input without
newlines will be read into memory in its entirety, and parsed
afterwards. This could quickly exhaust memory and other system
resources.

:param file_or_iterable: file-like object or iterable yielding
lines as strings
:param loads: custom json decoder callable
:param max_line_length: the maximum line length to read/parse
"""

_file_or_iterable: Union[
typing.IO[str], typing.IO[bytes], Iterable[Union[str, bytes]]
]
_line_iter: Iterator[Tuple[int, Union[bytes, str]]] = attr.ib(init=False)
_loads: LoadsCallable = attr.ib(default=default_loads, kw_only=True)
_max_line_length: Optional[int] = attr.ib(default=None, kw_only=True)

def __attrs_post_init__(self) -> None:
if isinstance(self._file_or_iterable, io.IOBase):
Expand All @@ -200,7 +219,18 @@ def __attrs_post_init__(self) -> None:
self._file_or_iterable,
)

self._line_iter = enumerate(self._file_or_iterable, 1)
iterable: Iterable[Union[str, bytes]]
if (
self._fp is not None
and hasattr(self._fp, "readline")
and self._max_line_length is not None
):
self._line_iter = ReadlineIterator(
self._fp, # type: ignore[misc]
max_line_length=self._max_line_length,
)
else:
self._line_iter = enumerate(self._file_or_iterable, 1)

# No type specified, None not allowed
@overload
Expand Down Expand Up @@ -301,6 +331,10 @@ def read(
)
raise exc from orig_exc

if self._max_line_length is not None and len(line) > self._max_line_length:
# TODO: add tests for this
raise InvalidLineError("line too long", line, lineno)

if line.startswith(SKIPPABLE_SINGLE_INITIAL_CHARS):
line = line[1:]

Expand Down Expand Up @@ -643,3 +677,56 @@ def repr_for_fp(fp: typing.IO[Any]) -> str:
return repr(name)
else:
return repr(fp)


@attr.s(auto_attribs=True)
class ReadlineIterator(typing.Iterator[Tuple[int, AnyStr]]):
    """
    Iterate over file-like objects using ``.readline()`` with a maximum length.

    Each successful ``__next__()`` call returns a ``(lineno, line)`` tuple,
    with line numbers starting at 1 by default.

    This is useful to avoid reading too large values into memory. A too long
    line causes ``__next__()`` to raise an error. However, if called again
    afterwards, it reads past that line and continues with the next line.

    The length check is applied to the string or bytes returned by
    ``.readline()``, so a trailing newline counts towards the limit.

    This behaviour makes this iterator ‘special’: it can continue after an
    exception. In normal scenarios, this exception will reach the application,
    which will then abort reading from the file, and the iterator will never
    continue. However, ``Reader.iter(skip_invalid=True)`` needs to continue
    afterwards: it should skip over too long lines, but the next line may be
    fine.

    :param fp: file-like object providing ``.readline(size)``
    :param max_line_length: the maximum length of a line (as returned by
        ``.readline()``) that is accepted
    :param at_line_boundary: whether the next read starts at the beginning
        of a line; ``False`` after an over-long line was cut off mid-line
    :param lineno: the number assigned to the next line that is read
    """

    # TODO: is this desirable behaviour at all? maybe a too long line is just
    # bad bad bad and ``skip_invalid`` should not silently ignore it?

    # Note: this code is implemented as a class instead of a simpler generator
    # function, because the latter cannot continue after raising an exception.

    fp: typing.IO[AnyStr]
    max_line_length: int
    at_line_boundary: bool = True
    lineno: int = 1

    @staticmethod
    def _ends_with_newline(chunk: AnyStr) -> bool:
        """
        Tell whether a non-empty chunk read from the file ends a line.
        """
        # Indexing yields a 1-character string for str and an int for bytes,
        # so both representations of a line feed are checked.
        return chunk[-1] in (
            "\n",  # str
            0x0A,  # bytes
        )

    def __next__(self) -> Tuple[int, AnyStr]:
        """
        Read the next line.

        :return: a ``(lineno, line)`` tuple
        :raises StopIteration: when the file is exhausted
        :raises InvalidLineError: when the line exceeds ``max_line_length``
        """
        # If previously interrupted, fast-forward to a line boundary. Read at
        # least one character per call: ``readline(0)`` always returns an
        # empty result, which would be mistaken for end-of-file.
        skip_size = max(self.max_line_length, 1)
        while not self.at_line_boundary:
            skipped = self.fp.readline(skip_size)
            if not skipped:
                raise StopIteration
            self.at_line_boundary = self._ends_with_newline(skipped)

        # Ask for one character more than allowed, so that a too long line
        # can be detected without reading all of it into memory.
        line = self.fp.readline(self.max_line_length + 1)
        if not line:
            raise StopIteration

        # Number this line first and advance the counter afterwards, so that
        # numbering starts at the initial ``lineno`` value instead of one
        # past it.
        lineno = self.lineno
        self.lineno += 1

        if len(line) > self.max_line_length:
            # The over-long chunk may already contain the line terminator; in
            # that case the stream is positioned at the start of the next
            # line, and fast-forwarding would wrongly discard that line.
            self.at_line_boundary = self._ends_with_newline(line)
            raise InvalidLineError("line too long", line, lineno)

        return lineno, line
11 changes: 11 additions & 0 deletions tests/test_jsonlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,17 @@ def test_skip_invalid() -> None:
assert next(it) == 34


def test_skip_invalid_long_lines() -> None:
    """
    With ``skip_invalid=True``, a length-limited reader drops too long lines
    and carries on with the lines that follow.
    """
    source = io.StringIO("12\ninvalid\n34")
    limited_reader = jsonlines.Reader(source, max_line_length=3)
    values = limited_reader.iter(skip_invalid=True)
    # The middle line exceeds the limit, so only the two numbers come through.
    assert next(values) == 12
    assert next(values) == 34


def test_empty_strings_in_iterable() -> None:
input = ["123", "", "456"]
it = iter(jsonlines.Reader(input))
Expand Down