diff --git a/jsonlines/jsonlines.py b/jsonlines/jsonlines.py
index 2d5869f..df2fbe2 100644
--- a/jsonlines/jsonlines.py
+++ b/jsonlines/jsonlines.py
@@ -12,6 +12,7 @@
 import typing
 from typing import (
     Any,
+    AnyStr,
     Callable,
     Dict,
     Iterable,
@@ -174,9 +175,26 @@ class Reader(ReaderWriterBase):
     decoder. If specified, it must be a callable that accepts a
     (unicode) string and returns the decoded object.
 
-    :param file_or_iterable: file-like object or iterable yielding lines as
-        strings
+    The `max_line_length` argument limits the maximum line length. If
+    specified, this prevents reading and parsing of too large values.
+    When reading from an input file that has a ``.readline()`` method,
+    that will be used. For custom iterables, it is not possible to
+    limit the size of yielded items, but the limit will still prevent
+    JSON parsing of too large lines. Note that the limit applies per
+    line, not to the total amount of data.
+
+    .. warning::
+
+       Use `max_line_length` as a safety measure for untrusted input:
+       without a limit, (potentially malicious) large input without
+       newlines will be read into memory in its entirety, and parsed
+       afterwards. This could quickly exhaust memory and other system
+       resources.
+
+    :param file_or_iterable: file-like object or iterable yielding
+        lines as strings
     :param loads: custom json decoder callable
+    :param max_line_length: the maximum line length to read/parse
     """
 
     _file_or_iterable: Union[
@@ -184,6 +202,7 @@ class Reader(ReaderWriterBase):
     ]
     _line_iter: Iterator[Tuple[int, Union[bytes, str]]] = attr.ib(init=False)
     _loads: LoadsCallable = attr.ib(default=default_loads, kw_only=True)
+    _max_line_length: Optional[int] = attr.ib(default=None, kw_only=True)
 
     def __attrs_post_init__(self) -> None:
         if isinstance(self._file_or_iterable, io.IOBase):
@@ -192,7 +211,17 @@ def __attrs_post_init__(self) -> None:
                 self._file_or_iterable,
             )
 
-        self._line_iter = enumerate(self._file_or_iterable, 1)
+        if (
+            self._fp is not None
+            and hasattr(self._fp, "readline")
+            and self._max_line_length is not None
+        ):
+            self._line_iter = ReadlineIterator(
+                self._fp,  # type: ignore[misc]
+                max_line_length=self._max_line_length,
+            )
+        else:
+            self._line_iter = enumerate(self._file_or_iterable, 1)
 
     # No type specified, None not allowed
     @overload
@@ -293,6 +322,10 @@ def read(
                 )
                 raise exc from orig_exc
 
+        if self._max_line_length is not None and len(line) > self._max_line_length:
+            # TODO: add tests for this
+            raise InvalidLineError("line too long", line, lineno)
+
         if line.startswith("\x1e"):  # RFC7464 text sequence
             line = line[1:]
 
@@ -634,3 +667,72 @@ def repr_for_fp(fp: typing.IO[Any]) -> str:
         return repr(name)
     else:
         return repr(fp)
+
+
+@attr.s(auto_attribs=True)
+class ReadlineIterator(typing.Iterator[Tuple[int, AnyStr]]):
+    """
+    Iterator over a file-like object using ``.readline()``, enforcing a length limit.
+
+    This can be used to avoid reading too large values into memory.
+    Yields ``(lineno, line)`` tuples, numbered like ``enumerate(fp, 1)``.
+    The length limit includes the line terminator, if any.
+    """
+
+    # TODO: add more tests
+
+    # Note: this iterator is ‘special’ in the sense that it can continue after
+    # a call to next() resulted in an exception. Usually this exception will
+    # reach the application, which will usually abort reading from the file.
+    # However, Reader.iter(skip_invalid=True) continues afterwards: a too long
+    # line should not be parsed, but the next line may be fine. This is why the
+    # subsequent call to ``next()`` will continue with the next line.
+    #
+    # This code is implemented as a class instead of a simpler generator
+    # function, because the latter cannot do the above.
+
+    fp: typing.IO[AnyStr]
+    max_line_length: int
+    at_line_boundary: bool = True
+    lineno: int = 1  # number of the next line to be read
+
+    def __next__(self) -> Tuple[int, AnyStr]:
+        """
+        Read the next line.
+
+        If needed, this reads past a previously detected too long line.
+
+        :raises InvalidLineError: if the line exceeds ``max_line_length``
+        :raises StopIteration: at end of input
+        """
+        # If previously interrupted, read until the next line boundary.
+        # TODO: make this nicer and simpler, e.g. why not read in
+        # chunks of size ‘max_line_length’ here as well, and reduce
+        # duplicated/convoluted logic.
+        if not self.at_line_boundary:
+            buf_size = 16 * 1024
+            while True:
+                line = self.fp.readline(buf_size)
+                if not line:
+                    raise StopIteration
+                if line.endswith("\n" if isinstance(line, str) else b"\n"):
+                    self.at_line_boundary = True
+                    break
+
+        line = self.fp.readline(self.max_line_length + 1)
+        if not line:
+            raise StopIteration
+
+        # Hand out the current number first, then advance, so that the
+        # first line is reported as line 1 (like ``enumerate(fp, 1)``).
+        lineno = self.lineno
+        self.lineno += 1
+        if len(line) > self.max_line_length:
+            # Only skip ahead on the next call if the remainder of this
+            # line is still unread; otherwise the next line would be lost.
+            self.at_line_boundary = line.endswith(
+                "\n" if isinstance(line, str) else b"\n"
+            )
+            raise InvalidLineError("line too long", line, lineno)
+
+        return lineno, line
diff --git a/tests/test_jsonlines.py b/tests/test_jsonlines.py
index b67bc21..35d6622 100644
--- a/tests/test_jsonlines.py
+++ b/tests/test_jsonlines.py
@@ -88,6 +88,28 @@ def test_skip_invalid() -> None:
     assert next(it) == 34
 
 
+def test_skip_invalid_long_lines() -> None:
+    """
+    A line length limited reader is able to skip over too long lines.
+    """
+    fp = io.StringIO("12\ninvalid\n34")
+    reader = jsonlines.Reader(fp, max_line_length=3)
+    it = reader.iter(skip_invalid=True)
+    assert next(it) == 12
+    assert next(it) == 34
+
+
+def test_skip_invalid_long_line_ending_at_limit() -> None:
+    """
+    Skipping a too long line whose newline was already read keeps the next line.
+    """
+    fp = io.StringIO("12\n123\n34")
+    reader = jsonlines.Reader(fp, max_line_length=3)
+    it = reader.iter(skip_invalid=True)
+    assert next(it) == 12
+    assert next(it) == 34
+
+
 def test_empty_strings_in_iterable() -> None:
     input = ["123", "", "456"]
     it = iter(jsonlines.Reader(input))