From 09a097cc7c10f9a0ac0f1430786d63e149c92245 Mon Sep 17 00:00:00 2001
From: wouter bolsterlee
Date: Mon, 18 Oct 2021 22:55:53 +0200
Subject: [PATCH] wip: Add support for a maximum line length while reading

Add a ``max_line_length`` option to ``Reader``. File-like inputs are read
through a ``.readline()`` based iterator, so that too long lines are never
read into memory in full. For other iterables the limit is checked before
JSON parsing.

Fixes #39.
---
 jsonlines/jsonlines.py  | 97 +++++++++++++++++++++++++++++++++++++++--
 tests/test_jsonlines.py | 20 +++++++++
 2 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/jsonlines/jsonlines.py b/jsonlines/jsonlines.py
index 4aafcee..cff9201 100644
--- a/jsonlines/jsonlines.py
+++ b/jsonlines/jsonlines.py
@@ -13,6 +13,7 @@
 import typing
 from typing import (
     Any,
+    AnyStr,
     Callable,
     Dict,
     Iterable,
@@ -182,9 +183,26 @@ class Reader(ReaderWriterBase):
     decoder. If specified, it must be a callable that accepts a
     (unicode) string and returns the decoded object.
 
-    :param file_or_iterable: file-like object or iterable yielding lines as
-        strings
+    The `max_line_length` argument limits the maximum line length. If
+    specified, this prevents reading and parsing of too large values.
+    When reading from an input file that has a ``.readline()`` method,
+    that will be used. For custom iterables, it is not possible to
+    limit the size of yielded items, but the limit will still prevent
+    JSON parsing of too large lines. Note that the limit applies per
+    line, not to the total amount of data.
+
+    .. warning::
+
+       Use `max_line_length` as a safety measure for untrusted input:
+       without a limit, (potentially malicious) large input without
+       newlines will be read into memory in its entirety, and parsed
+       afterwards. This could quickly exhaust memory and other system
+       resources.
+
+    :param file_or_iterable: file-like object or iterable yielding
+        lines as strings
     :param loads: custom json decoder callable
+    :param max_line_length: the maximum line length to read/parse
     """
 
     _file_or_iterable: Union[
@@ -192,6 +210,7 @@ class Reader(ReaderWriterBase):
     ]
     _line_iter: Iterator[Tuple[int, Union[bytes, str]]] = attr.ib(init=False)
     _loads: LoadsCallable = attr.ib(default=default_loads, kw_only=True)
+    _max_line_length: Optional[int] = attr.ib(default=None, kw_only=True)
 
     def __attrs_post_init__(self) -> None:
         if isinstance(self._file_or_iterable, io.IOBase):
@@ -200,7 +219,17 @@ def __attrs_post_init__(self) -> None:
                 self._file_or_iterable,
             )
 
-        self._line_iter = enumerate(self._file_or_iterable, 1)
+        if (
+            self._fp is not None
+            and hasattr(self._fp, "readline")
+            and self._max_line_length is not None
+        ):
+            self._line_iter = ReadlineIterator(
+                self._fp,  # type: ignore[misc]
+                max_line_length=self._max_line_length,
+            )
+        else:
+            self._line_iter = enumerate(self._file_or_iterable, 1)
 
     # No type specified, None not allowed
     @overload
@@ -301,6 +330,10 @@ def read(
                 )
                 raise exc from orig_exc
 
+        if self._max_line_length is not None and len(line) > self._max_line_length:
+            # TODO: add tests for this
+            raise InvalidLineError("line too long", line, lineno)
+
         if line.startswith(SKIPPABLE_SINGLE_INITIAL_CHARS):
             line = line[1:]
 
@@ -643,3 +676,61 @@ def repr_for_fp(fp: typing.IO[Any]) -> str:
         return repr(name)
     else:
         return repr(fp)
+
+
+@attr.s(auto_attribs=True)
+class ReadlineIterator(typing.Iterator[Tuple[int, AnyStr]]):
+    """
+    Iterate over file-like objects using ``.readline()`` with a maximum length.
+
+    This is useful to avoid reading too large values into memory. A too long
+    line causes ``__next__()`` to raise an error. However, if called again
+    afterwards, it reads past that line and continues with the next line.
+
+    This behaviour makes this iterator ‘special’: it can continue after an
+    exception. In normal scenarios, this exception will reach the application,
+    which will then abort reading from the file, and the iterator will never
+    continue. However, ``Reader.iter(skip_invalid=True)`` needs to continue
+    afterwards: it should skip over too long lines, but the next line may be
+    fine.
+
+    # TODO: is this desirable behaviour at all? maybe a too long line is just
+    # bad bad bad and ``skip_invalid`` should not silently ignore it?
+    """
+    # Note: this code is implemented as a class instead of a simpler generator
+    # function, because the latter cannot continue after raising an exception.
+
+    fp: typing.IO[AnyStr]
+    max_line_length: int
+    at_line_boundary: bool = True
+    # Number of the most recently read line; the first line is line 1.
+    lineno: int = 0
+
+    def __next__(self) -> Tuple[int, AnyStr]:
+        """
+        Read the next line.
+        """
+        # If previously interrupted, read until the next line boundary to catch
+        # up.
+        while not self.at_line_boundary:
+            line = self.fp.readline(self.max_line_length)
+            if not line:
+                raise StopIteration
+            self.at_line_boundary = line.endswith(
+                "\n" if isinstance(line, str) else b"\n"
+            )
+
+        line = self.fp.readline(self.max_line_length + 1)
+        if not line:
+            raise StopIteration
+
+        self.lineno += 1
+        if len(line) > self.max_line_length:
+            # The chunk may already contain the newline; in that case there is
+            # nothing to skip, and the next call must not discard a line.
+            self.at_line_boundary = line.endswith(
+                "\n" if isinstance(line, str) else b"\n"
+            )
+            raise InvalidLineError("line too long", line, self.lineno)
+
+        return self.lineno, line
diff --git a/tests/test_jsonlines.py b/tests/test_jsonlines.py
index 6a6ba6f..f2da806 100644
--- a/tests/test_jsonlines.py
+++ b/tests/test_jsonlines.py
@@ -133,6 +133,26 @@ def test_skip_invalid() -> None:
     assert next(it) == 34
 
 
+def test_skip_invalid_long_lines() -> None:
+    """
+    A line length limited reader is able to skip over too long lines.
+    """
+    fp = io.StringIO("12\ninvalid\n34")
+    reader = jsonlines.Reader(fp, max_line_length=3)
+    it = reader.iter(skip_invalid=True)
+    assert next(it) == 12
+    assert next(it) == 34
+
+
+def test_skip_invalid_long_line_ending_at_limit() -> None:
+    """
+    Skipping a too long line does not swallow the line after it.
+    """
+    fp = io.StringIO("123\n45\n")
+    reader = jsonlines.Reader(fp, max_line_length=3)
+    assert list(reader.iter(skip_invalid=True)) == [45]
+
+
 def test_empty_strings_in_iterable() -> None:
     input = ["123", "", "456"]
     it = iter(jsonlines.Reader(input))