Skip to content

Commit

Permalink
wip: Add support for a maximum line length while reading
Browse files Browse the repository at this point in the history
Fixes #39.
  • Loading branch information
wbolster committed Oct 18, 2021
1 parent 6231269 commit 326a23c
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 3 deletions.
97 changes: 94 additions & 3 deletions jsonlines/jsonlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import typing
from typing import (
Any,
AnyStr,
Callable,
Dict,
Iterable,
Expand Down Expand Up @@ -174,16 +175,34 @@ class Reader(ReaderWriterBase):
decoder. If specified, it must be a callable that accepts a
(unicode) string and returns the decoded object.
:param file_or_iterable: file-like object or iterable yielding lines as
strings
The `max_line_length` argument limits the maximum line length. If
specified, this prevents reading and parsing of too large values.
When reading from an input file that has a ``.readline()`` method,
that method is used, so that overly long lines are never read into
memory in full. For custom iterables, it is not possible to limit
the size of yielded items, but the limit still prevents JSON
parsing of overly long lines. Note that the limit applies per
line, not to the total amount of data.
.. warning::
Use `max_line_length` as a safety measure for untrusted input:
without a limit, (potentially malicious) large input without
newlines will be read into memory in its entirety, and parsed
afterwards. This could quickly exhaust memory and other system
resources.
:param file_or_iterable: file-like object or iterable yielding
lines as strings
:param loads: custom json decoder callable
:param max_line_length: the maximum line length to read/parse
"""

_file_or_iterable: Union[
typing.IO[str], typing.IO[bytes], Iterable[Union[str, bytes]]
]
_line_iter: Iterator[Tuple[int, Union[bytes, str]]] = attr.ib(init=False)
_loads: LoadsCallable = attr.ib(default=default_loads, kw_only=True)
_max_line_length: Optional[int] = attr.ib(default=None, kw_only=True)

def __attrs_post_init__(self) -> None:
if isinstance(self._file_or_iterable, io.IOBase):
Expand All @@ -192,7 +211,18 @@ def __attrs_post_init__(self) -> None:
self._file_or_iterable,
)

self._line_iter = enumerate(self._file_or_iterable, 1)
iterable: Iterable[Union[str, bytes]]
if (
self._fp is not None
and hasattr(self._fp, "readline")
and self._max_line_length is not None
):
self._line_iter = ReadlineIterator(
self._fp, # type: ignore[misc]
max_line_length=self._max_line_length,
)
else:
self._line_iter = enumerate(self._file_or_iterable, 1)

# No type specified, None not allowed
@overload
Expand Down Expand Up @@ -293,6 +323,10 @@ def read(
)
raise exc from orig_exc

if self._max_line_length is not None and len(line) > self._max_line_length:
# TODO: add tests for this
raise InvalidLineError("line too long", line, lineno)

if line.startswith("\x1e"): # RFC7464 text sequence
line = line[1:]

Expand Down Expand Up @@ -634,3 +668,60 @@ def repr_for_fp(fp: typing.IO[Any]) -> str:
return repr(name)
else:
return repr(fp)


@attr.s(auto_attribs=True)
class ReadlineIterator(typing.Iterator[Tuple[int, AnyStr]]):
    """
    Iterator over a file-like object using ``.readline()``, enforcing a length limit.

    Each successful call to ``next()`` returns a ``(lineno, line)`` tuple.
    At most ``max_line_length + 1`` characters (or bytes) are requested
    from the file per call, which avoids reading too large values into
    memory.

    :param fp: file-like object providing ``.readline(size)``
    :param max_line_length: the maximum accepted length of a line, as
        returned by ``.readline()`` (i.e. including any trailing newline)
    :param at_line_boundary: whether the file position is at the start
        of a line; ``False`` after a too long line was cut off midway
    :param lineno: the number that will be assigned to the next line read
    :raises InvalidLineError: from ``next()`` when a line exceeds
        ``max_line_length``; the exception carries only the part of the
        line that was read, not the complete line
    """

    # TODO: add more tests

    # Note: this iterator is 'special' in the sense that it can continue after
    # a call to next() resulted in an exception. Usually this exception will
    # reach the application, which will usually abort reading from the file.
    # However, Reader.iter(skip_invalid=True) continues afterwards: a too long
    # line should not be parsed, but the next line may be fine. This is why the
    # subsequent call to ``next()`` will continue with the next line.
    #
    # This code is implemented as a class instead of a simpler generator
    # function, because the latter cannot do the above.

    fp: typing.IO[AnyStr]
    max_line_length: int
    at_line_boundary: bool = True
    lineno: int = 1

    def __next__(self) -> Tuple[int, AnyStr]:
        """
        Read the next line.

        If needed, this first reads past the remainder of a previously
        detected too long line.

        :return: a ``(lineno, line)`` tuple
        :raises StopIteration: when the end of the file is reached
        :raises InvalidLineError: when the line exceeds ``max_line_length``
        """
        # If previously interrupted, discard input up to the next line
        # boundary; hitting end-of-file while doing so ends the iteration.
        if not self.at_line_boundary and not self._skip_to_line_boundary():
            raise StopIteration

        # Request one item more than allowed: that is enough to detect a
        # too long line without reading the whole line into memory.
        line = self.fp.readline(self.max_line_length + 1)
        if not line:
            raise StopIteration

        # ``self.lineno`` holds the number of the line just read; advance it
        # only afterwards, so that numbering starts at its initial value.
        lineno = self.lineno
        self.lineno += 1

        if len(line) > self.max_line_length:
            # The over-long read may already include the line terminator. In
            # that case the file is positioned at the start of the next line,
            # and nothing must be skipped on the next call.
            self.at_line_boundary = self._ends_with_newline(line)
            raise InvalidLineError("line too long", line, lineno)

        return lineno, line

    def _skip_to_line_boundary(self) -> bool:
        """
        Discard the remainder of the current line in fixed-size chunks.

        :return: ``True`` if a line terminator was consumed, ``False`` if
            the end of the file was reached first
        """
        # TODO: make this nicer and simpler, e.g. why not read in
        # chunk of size 'max_line_length' here as well, and reduce
        # duplicated/convoluted logic.
        buf_size = 16 * 1024
        while True:
            chunk = self.fp.readline(buf_size)
            if not chunk:
                return False
            if self._ends_with_newline(chunk):
                self.at_line_boundary = True
                return True

    @staticmethod
    def _ends_with_newline(data: AnyStr) -> bool:
        """Tell whether text or bytes data ends with a line feed."""
        return data.endswith("\n" if isinstance(data, str) else b"\n")
11 changes: 11 additions & 0 deletions tests/test_jsonlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,17 @@ def test_skip_invalid() -> None:
assert next(it) == 34


def test_skip_invalid_long_lines() -> None:
    """
    With ``skip_invalid=True``, a reader using ``max_line_length`` skips
    lines exceeding the limit and continues with the lines that follow.
    """
    # The middle line is longer than the limit of 3 and must be skipped.
    stream = io.StringIO("12\ninvalid\n34")
    reader = jsonlines.Reader(stream, max_line_length=3)
    values = reader.iter(skip_invalid=True)
    for expected in (12, 34):
        assert next(values) == expected


def test_empty_strings_in_iterable() -> None:
input = ["123", "", "456"]
it = iter(jsonlines.Reader(input))
Expand Down

0 comments on commit 326a23c

Please sign in to comment.