diff --git a/src/monty/io.py b/src/monty/io.py index 5d6aba481..806bb52b7 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -164,7 +164,7 @@ def reverse_readfile( def reverse_readline( - m_file, + m_file, # TODO: expected type is unclear blk_size: int = 4096, max_mem: int = 4_000_000, ) -> Iterator[str]: @@ -179,6 +179,17 @@ def reverse_readline( Files larger than max_mem are read one block each time. + NOTE: this function expect a file stream, and m_file + should NOT be the name of the file. + + TODO: + - is it possible to support binary file stream + - Test gzip seek speed (not supported previously) + - Test bzip2 seek speed (any improvement) + https://stackoverflow.com/questions/25734252/ + why-is-seeking-from-the-end-of-a-file-allowed-for- + bzip2-files-and-not-gzip-files + Reference: Based on code by Peter Astrand , using modifications by Raymond Hettinger and Kevin German. @@ -186,7 +197,7 @@ def reverse_readline( file-backwards-yet-another-implementat/ Args: - m_file (File): File stream to read (backwards). + m_file: File stream to read (backwards). blk_size (int): The buffer size in bytes. Defaults to 4096. max_mem (int): The maximum amount of RAM to use in bytes, which determines when to reverse a file in-memory versus @@ -196,6 +207,10 @@ def reverse_readline( Yields: Lines from the back of the file. """ + # Check for illegal usage + if isinstance(m_file, str | Path): + raise TypeError("this function expect a file stream, not file name") + # Generate line ending l_end: Literal["\r\n", "\n"] = _get_line_ending(m_file) len_l_end: Literal[1, 2] = cast(Literal[1, 2], len(l_end)) @@ -224,9 +239,6 @@ def reverse_readline( # For bz2 files, seek is expensive. It is therefore in our best # interest to maximize the block size within RAM usage limit. - - # TODO: not sure if bzip2 has any improvement on seek, need test - # https://stackoverflow.com/questions/25734252/why-is-seeking-from-the-end-of-a-file-allowed-for-bzip2-files-and-not-gzip-files if isinstance(m_file, bz2.BZ2File): blk_size = min(max_mem, file_size) diff --git a/tests/test_io.py b/tests/test_io.py index 30de8171a..c5204bb4d 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -151,7 +151,7 @@ def test_empty_file(self): for _line in reverse_readline(f): pytest.fail("No error should be thrown.") - @pytest.mark.parametrize("ram", [4, 4_096, 4_0000_000]) + @pytest.mark.parametrize("ram", [4, 4096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_file_with_empty_lines(self, l_end, ram): """Empty lines should not be skipped. @@ -170,24 +170,25 @@ def test_file_with_empty_lines(self, l_end, ram): revert_contents = tuple(reverse_readline(file, max_mem=ram)) assert revert_contents[::-1] == contents - # TODO: finish following tests - # # Test gzip file - # gzip_filename = f"{filename}.gz" - # with gzip.open(gzip_filename, "w") as file_out: - # for line in contents: - # file_out.write(line.encode()) + # Test gzip file + gzip_filename = f"{filename}.gz" + with gzip.open(gzip_filename, "w") as file_out: + for line in contents: + file_out.write(line.encode()) - # revert_contents_gzip = tuple(reverse_readline(gzip_filename)) - # assert revert_contents_gzip[::-1] == contents + with gzip.open(gzip_filename) as g_file: + revert_contents_gzip = tuple(reverse_readline(g_file)) + assert revert_contents_gzip[::-1] == contents - # # Test bzip2 file - # bz2_filename = f"{filename}.bz2" - # with bz2.open(bz2_filename, "w") as file_out: - # for line in contents: - # file_out.write(line.encode()) + # Test bzip2 file + bz2_filename = f"{filename}.bz2" + with bz2.open(bz2_filename, "w") as file_out: + for line in contents: + file_out.write(line.encode()) - # revert_contents_bz2 = tuple(reverse_readline(bz2_filename)) - # assert revert_contents_bz2[::-1] == contents + with bz2.open(bz2_filename) as b_file: + revert_contents_bz2 = tuple(reverse_readline(b_file)) + assert revert_contents_bz2[::-1] == contents @pytest.mark.parametrize("ram", [4, 4096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) @@ -217,6 +218,12 @@ def test_line_ending(self, l_end, ram): # assert line == contents[len(contents) - idx - 1] # assert isinstance(line, str) + @pytest.mark.parametrize("file", ["./file", Path("./file")]) + def test_illegal_usage(self, file): + with pytest.raises(TypeError, match="this function expect a file stream"): + for _ in reverse_readline(file): + pass + class TestReverseReadfile: NUM_LINES = 3000