From e2952c4637d773e5f6612e5f6594f5cfeddeb925 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Wed, 4 Sep 2024 14:21:08 +0800 Subject: [PATCH 01/96] add l_end arg to reverse_readfile --- src/monty/io.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index f24a63dd..cb7133c6 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -22,7 +22,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from typing import IO, Generator, Union + from typing import IO, Generator, Literal, Union def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: @@ -54,7 +54,10 @@ def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: return open(filename, *args, **kwargs) -def reverse_readfile(filename: Union[str, Path]) -> Generator[str, str, None]: +def reverse_readfile( + filename: Union[str, Path], + l_end: Literal["AUTO", "\n", "\r\n"], +) -> Generator[str, str, None]: """ A much faster reverse read of file by using Python's mmap to generate a memory-mapped file. It is slower for very small files than @@ -63,6 +66,8 @@ def reverse_readfile(filename: Union[str, Path]) -> Generator[str, str, None]: Args: filename (str | Path): File to read. + l_end ("AUTO", "\n", "\r\n"): Line ending. Use "AUTO" to + automatically decide line ending based on OS. Yields: Lines from the file in reverse order. 
@@ -72,11 +77,12 @@ def reverse_readfile(filename: Union[str, Path]) -> Generator[str, str, None]: if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): for line in reversed(file.readlines()): yield line.decode("utf-8").rstrip(os.linesep) + else: filemap = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) n = len(filemap) while n > 0: - i = filemap.rfind(os.linesep.encode(), 0, n) + i = filemap.rfind(l_end.encode(), 0, n) yield filemap[i + 1 : n].decode("utf-8").rstrip(os.linesep) n = i From 619a38ddaa509b34f78219afb88f57c897b3184a Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Wed, 4 Sep 2024 14:22:01 +0800 Subject: [PATCH 02/96] rstrip remove hard coded trailing white space --- src/monty/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index cb7133c6..df8c5e00 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -76,14 +76,14 @@ def reverse_readfile( with zopen(filename, "rb") as file: if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): for line in reversed(file.readlines()): - yield line.decode("utf-8").rstrip(os.linesep) + yield line.decode("utf-8").rstrip() else: filemap = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) n = len(filemap) while n > 0: i = filemap.rfind(l_end.encode(), 0, n) - yield filemap[i + 1 : n].decode("utf-8").rstrip(os.linesep) + yield filemap[i + 1 : n].decode("utf-8").rstrip() n = i except ValueError: From 6cbea600720bebc72b535e7f8e76b1cfbd64e88f Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Wed, 4 Sep 2024 14:22:27 +0800 Subject: [PATCH 03/96] add default value for l_end --- src/monty/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/monty/io.py b/src/monty/io.py index df8c5e00..e590c8e8 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -56,7 +56,7 @@ def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: def reverse_readfile( filename: Union[str, Path], - l_end: Literal["AUTO", "\n", "\r\n"], + l_end: Literal["AUTO", 
"\n", "\r\n"] = "AUTO", ) -> Generator[str, str, None]: """ A much faster reverse read of file by using Python's mmap to generate a From 0de9696e5ab28b13b0874454cc281ca2c771787d Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Wed, 4 Sep 2024 14:37:08 +0800 Subject: [PATCH 04/96] add l_end to reverse_readline --- src/monty/io.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index e590c8e8..3e07f736 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -56,7 +56,7 @@ def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: def reverse_readfile( filename: Union[str, Path], - l_end: Literal["AUTO", "\n", "\r\n"] = "AUTO", + l_end: Literal["AUTO"] | str = "AUTO", ) -> Generator[str, str, None]: """ A much faster reverse read of file by using Python's mmap to generate a @@ -66,12 +66,15 @@ def reverse_readfile( Args: filename (str | Path): File to read. - l_end ("AUTO", "\n", "\r\n"): Line ending. Use "AUTO" to + l_end ("AUTO" | str): Line ending. Use "AUTO" to automatically decide line ending based on OS. Yields: Lines from the file in reverse order. """ + # Generate line ending + l_end = os.linesep if "AUTO" else l_end + try: with zopen(filename, "rb") as file: if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): @@ -91,7 +94,10 @@ def reverse_readfile( def reverse_readline( - m_file, blk_size: int = 4096, max_mem: int = 4000000 + m_file, + blk_size: int = 4096, + max_mem: int = 4000000, + l_end: Literal["AUTO"] | str = "AUTO", ) -> Generator[str, str, None]: """ Generator function to read a file line-by-line, but backwards. @@ -115,12 +121,17 @@ def reverse_readline( operation. This is used to determine when to reverse a file in-memory versus seeking portions of a file. For bz2 files, this sets the maximum block size. + l_end ("AUTO" | str): Line ending. Use "AUTO" to + automatically decide line ending based on OS. Returns: Generator that yields lines from the file. 
Behave similarly to the file.readline() function, except the lines are returned from the back of the file. """ + # Generate line ending + l_end = os.linesep if "AUTO" else l_end + # Check if the file stream is a bit stream or not is_text = isinstance(m_file, io.TextIOWrapper) @@ -148,17 +159,17 @@ def reverse_readline( m_file.seek(0, 2) lastchar = m_file.read(1) if is_text else m_file.read(1).decode("utf-8") - trailing_newline = lastchar == os.linesep + trailing_newline = lastchar == l_end while True: - newline_pos = buf.rfind(os.linesep) + newline_pos = buf.rfind(l_end) pos = m_file.tell() if newline_pos != -1: # Found a newline line = buf[newline_pos + 1 :] buf = buf[:newline_pos] if pos or newline_pos or trailing_newline: - line += os.linesep + line += l_end yield line elif pos: @@ -171,7 +182,7 @@ def reverse_readline( buf = m_file.read(toread).decode("utf-8") + buf m_file.seek(pos - toread, 0) if pos == toread: - buf = os.linesep + buf + buf = l_end + buf else: # Start-of-file From f114afe3a58114a8390e1443da6df1b2358a3f3c Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Wed, 4 Sep 2024 14:38:21 +0800 Subject: [PATCH 05/96] continue CI test jobs upon failure --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 24e816d9..b2c93d6f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,6 +6,7 @@ jobs: build: strategy: max-parallel: 20 + fail-fast: false matrix: os: [ubuntu-latest, macos-14, windows-latest] python-version: ["3.9", "3.x"] From deb1ad7feaba7577bdc4cd30e5829695d35f5430 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Wed, 4 Sep 2024 14:41:27 +0800 Subject: [PATCH 06/96] bump codecov to v4 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b2c93d6f..44459744 100644 --- a/.github/workflows/test.yml +++ 
b/.github/workflows/test.yml @@ -30,6 +30,6 @@ jobs: run: pytest --cov=monty --cov-report html:coverage_reports tests - name: Upload coverage reports to Codecov - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} From c4a845c357a596a1f4ef9dafefd05ec5a63a6a07 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Wed, 4 Sep 2024 14:53:50 +0800 Subject: [PATCH 07/96] tweak docstring --- src/monty/io.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 3e07f736..2ebe06fd 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -116,7 +116,7 @@ def reverse_readline( Args: m_file (File): File stream to read (backwards) - blk_size (int): The buffer size. Defaults to 4096. + blk_size (int): The buffer size in bytes. Defaults to 4096. max_mem (int): The maximum amount of memory to involve in this operation. This is used to determine when to reverse a file in-memory versus seeking portions of a file. For bz2 files, @@ -124,10 +124,9 @@ def reverse_readline( l_end ("AUTO" | str): Line ending. Use "AUTO" to automatically decide line ending based on OS. - Returns: - Generator that yields lines from the file. Behave similarly to the - file.readline() function, except the lines are returned from the back - of the file. + Yields: + Lines from the file. Behave similarly to the file.readline function, + except the lines are returned from the back of the file. 
""" # Generate line ending l_end = os.linesep if "AUTO" else l_end From afbe5739c65c1acd03017c702afca3ad4c2073f5 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Wed, 4 Sep 2024 15:33:47 +0800 Subject: [PATCH 08/96] var name and docstring tweak --- src/monty/io.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 2ebe06fd..52bf2280 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -94,7 +94,7 @@ def reverse_readfile( def reverse_readline( - m_file, + m_file, # TODO: add type blk_size: int = 4096, max_mem: int = 4000000, l_end: Literal["AUTO"] | str = "AUTO", @@ -131,34 +131,35 @@ def reverse_readline( # Generate line ending l_end = os.linesep if "AUTO" else l_end - # Check if the file stream is a bit stream or not + # Check if the file stream is a buffered text stream is_text = isinstance(m_file, io.TextIOWrapper) try: file_size = os.path.getsize(m_file.name) except AttributeError: - # Bz2 files do not have name attribute. Just set file_size to above - # max_mem for now. + # Bz2 files do not have name attribute. + # Just set file_size to max_mem for now. file_size = max_mem + 1 - # If the file size is within our desired RAM use, just reverse it in memory - # GZip files must use this method because there is no way to negative seek + # If the file size is within desired RAM limit, just reverse it in memory. + # GZip files must use this method because there is no way to negative seek. # For windows, we also read the whole file. if file_size < max_mem or isinstance(m_file, gzip.GzipFile) or os.name == "nt": for line in reversed(m_file.readlines()): yield line.rstrip() + else: if isinstance(m_file, bz2.BZ2File): - # for bz2 files, seeks are expensive. It is therefore in our best + # For bz2 files, seeks are expensive. It is therefore in our best # interest to maximize the blk_size within limits of desired RAM # use. 
blk_size = min(max_mem, file_size) buf = "" m_file.seek(0, 2) - lastchar = m_file.read(1) if is_text else m_file.read(1).decode("utf-8") + last_char = m_file.read(1) if is_text else m_file.read(1).decode("utf-8") - trailing_newline = lastchar == l_end + trailing_newline = last_char == l_end while True: newline_pos = buf.rfind(l_end) @@ -173,14 +174,14 @@ def reverse_readline( elif pos: # Need to fill buffer - toread = min(blk_size, pos) - m_file.seek(pos - toread, 0) + to_read = min(blk_size, pos) + m_file.seek(pos - to_read, 0) if is_text: - buf = m_file.read(toread) + buf + buf = m_file.read(to_read) + buf else: - buf = m_file.read(toread).decode("utf-8") + buf - m_file.seek(pos - toread, 0) - if pos == toread: + buf = m_file.read(to_read).decode("utf-8") + buf + m_file.seek(pos - to_read, 0) + if pos == to_read: buf = l_end + buf else: From a4c4fe3c961f0acc35605db1238644fd1d5d9d97 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 5 Sep 2024 17:16:38 +0800 Subject: [PATCH 09/96] add helper function to get line ending and unit test --- src/monty/io.py | 34 ++++++++++++++++++++++++++++++++++ tests/test_io.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/src/monty/io.py b/src/monty/io.py index 52bf2280..7e698d11 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -54,6 +54,40 @@ def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: return open(filename, *args, **kwargs) +def _get_line_ending( + file: str | Path | io.TextIOWrapper, +) -> Literal["\r\n", "\n", "\r"]: + """Helper function to get line ending of a file. + + This function assumes the file has a single consistent line ending. + + Returns: + "\n": Unix line ending. + "\r\n": Windows line ending. + "\r": Classic MacOS line ending. + + Raises: + ValueError: If file is empty or line ending is unknown. 
+ """ + if isinstance(file, (str, Path)): + with open(file, "rb") as f: + first_line = f.readline() + elif isinstance(file, io.TextIOWrapper): + first_line = file.buffer.readline() + + if not first_line: + raise ValueError("empty file.") + + if first_line.endswith(b"\r\n"): + return "\r\n" + elif first_line.endswith(b"\n"): + return "\n" + elif first_line.endswith(b"\r"): + return "\r" + else: + raise ValueError(f"Unknown line ending in file {repr(first_line)}.") + + def reverse_readfile( filename: Union[str, Path], l_end: Literal["AUTO"] | str = "AUTO", diff --git a/tests/test_io.py b/tests/test_io.py index 9daa17be..d55b9ad9 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -8,6 +8,7 @@ from monty.io import ( FileLock, FileLockException, + _get_line_ending, reverse_readfile, reverse_readline, zopen, @@ -17,6 +18,43 @@ TEST_DIR = os.path.join(os.path.dirname(__file__), "test_files") +class TestGetLineEnding: + @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) + def test_get_line_ending(self, l_end): + """Test file with: + Unix line ending (\n) + Windows line ending (\r\n) + Classic MacOS line ending (\r) + """ + with ScratchDir("."): + test_file = "test_file.txt" + with open(test_file, "wb") as f: + f.write(f"This is a test{l_end}Second line{l_end}".encode()) + + assert _get_line_ending(test_file) == l_end + assert _get_line_ending(Path(test_file)) == l_end + + with open(test_file, "r") as f: + assert _get_line_ending(f) == l_end + + def test_empty_file(self): + with ScratchDir("."): + test_file = "empty_file.txt" + open(test_file, "w").close() + + with pytest.raises(ValueError, match="empty file"): + _get_line_ending(test_file) + + def test_unknown_line_ending(self): + with ScratchDir("."): + test_file = "test_unknown.txt" + with open(test_file, "wb") as f: + f.write(b"This is a test\036") + + with pytest.raises(ValueError, match="Unknown line ending"): + _get_line_ending(test_file) + + class TestReverseReadline: NUMLINES = 3000 From 
86ea01b7074e56c33750e4f6b2af1854e45d2d7c Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 5 Sep 2024 17:31:46 +0800 Subject: [PATCH 10/96] add bzip2 and gzip file support and test --- src/monty/io.py | 16 +++++++--------- tests/test_io.py | 25 ++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 7e698d11..05752f66 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -74,6 +74,10 @@ def _get_line_ending( first_line = f.readline() elif isinstance(file, io.TextIOWrapper): first_line = file.buffer.readline() + elif isinstance(file, (gzip.GzipFile, bz2.BZ2File)): + first_line = file.readline() + else: + raise TypeError(f"Unknown file type {type(file).__name__}") if not first_line: raise ValueError("empty file.") @@ -90,7 +94,6 @@ def _get_line_ending( def reverse_readfile( filename: Union[str, Path], - l_end: Literal["AUTO"] | str = "AUTO", ) -> Generator[str, str, None]: """ A much faster reverse read of file by using Python's mmap to generate a @@ -100,14 +103,12 @@ def reverse_readfile( Args: filename (str | Path): File to read. - l_end ("AUTO" | str): Line ending. Use "AUTO" to - automatically decide line ending based on OS. Yields: Lines from the file in reverse order. """ # Generate line ending - l_end = os.linesep if "AUTO" else l_end + l_end = _get_line_ending(filename) try: with zopen(filename, "rb") as file: @@ -128,10 +129,9 @@ def reverse_readfile( def reverse_readline( - m_file, # TODO: add type + m_file, blk_size: int = 4096, max_mem: int = 4000000, - l_end: Literal["AUTO"] | str = "AUTO", ) -> Generator[str, str, None]: """ Generator function to read a file line-by-line, but backwards. @@ -155,15 +155,13 @@ def reverse_readline( operation. This is used to determine when to reverse a file in-memory versus seeking portions of a file. For bz2 files, this sets the maximum block size. - l_end ("AUTO" | str): Line ending. 
Use "AUTO" to - automatically decide line ending based on OS. Yields: Lines from the file. Behave similarly to the file.readline function, except the lines are returned from the back of the file. """ # Generate line ending - l_end = os.linesep if "AUTO" else l_end + l_end = _get_line_ending(m_file) # Check if the file stream is a buffered text stream is_text = isinstance(m_file, io.TextIOWrapper) diff --git a/tests/test_io.py b/tests/test_io.py index d55b9ad9..ea382b78 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1,5 +1,7 @@ from __future__ import annotations +import bz2 +import gzip import os from pathlib import Path from unittest.mock import patch @@ -28,8 +30,9 @@ def test_get_line_ending(self, l_end): """ with ScratchDir("."): test_file = "test_file.txt" + test_line = f"This is a test{l_end}Second line{l_end}".encode() with open(test_file, "wb") as f: - f.write(f"This is a test{l_end}Second line{l_end}".encode()) + f.write(test_line) assert _get_line_ending(test_file) == l_end assert _get_line_ending(Path(test_file)) == l_end @@ -37,6 +40,26 @@ def test_get_line_ending(self, l_end): with open(test_file, "r") as f: assert _get_line_ending(f) == l_end + # Test gzip file + with gzip.open(f"{test_file}.gz", "wb") as f: + f.write(test_line) + + with gzip.open(f"{test_file}.gz", "rb") as f: + assert _get_line_ending(f) == l_end + + # Test bzip2 file + with bz2.open(f"{test_file}.bz2", "wb") as f: + f.write(test_line) + + with bz2.open(f"{test_file}.bz2", "rb") as f: + assert _get_line_ending(f) == l_end + + def test_unknown_file_type(self): + unknown_file = 123 + + with pytest.raises(TypeError, match="Unknown file type int"): + _get_line_ending(unknown_file) + def test_empty_file(self): with ScratchDir("."): test_file = "empty_file.txt" From c7ec2de1f7ac887339a913adc9531afca1a69ecb Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 5 Sep 2024 17:34:38 +0800 Subject: [PATCH 11/96] sort import and tweak docstring --- src/monty/io.py | 10 +++++----- 
tests/test_io.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 05752f66..487ada91 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -9,11 +9,6 @@ import errno import gzip import io - -try: - import lzma -except ImportError: - lzma = None # type: ignore[assignment] import mmap import os import subprocess @@ -21,6 +16,11 @@ from pathlib import Path from typing import TYPE_CHECKING +try: + import lzma +except ImportError: + lzma = None # type: ignore[assignment] + if TYPE_CHECKING: from typing import IO, Generator, Literal, Union diff --git a/tests/test_io.py b/tests/test_io.py index ea382b78..ef8dd2c7 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -23,7 +23,7 @@ class TestGetLineEnding: @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) def test_get_line_ending(self, l_end): - """Test file with: + """Test files with: Unix line ending (\n) Windows line ending (\r\n) Classic MacOS line ending (\r) From ae125c340bb187bf0fb45ae3dec355c3d3d72029 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 5 Sep 2024 17:41:03 +0800 Subject: [PATCH 12/96] use unix line ending \n as default if empty --- src/monty/io.py | 4 +++- tests/test_io.py | 16 +++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 487ada91..0ffda9bb 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -13,6 +13,7 @@ import os import subprocess import time +import warnings from pathlib import Path from typing import TYPE_CHECKING @@ -80,7 +81,8 @@ def _get_line_ending( raise TypeError(f"Unknown file type {type(file).__name__}") if not first_line: - raise ValueError("empty file.") + warnings.warn("File empty, use default line ending \n.", stacklevel=2) + return "\n" if first_line.endswith(b"\r\n"): return "\r\n" diff --git a/tests/test_io.py b/tests/test_io.py index ef8dd2c7..5258d49b 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -65,8 +65,8 @@ def 
test_empty_file(self): test_file = "empty_file.txt" open(test_file, "w").close() - with pytest.raises(ValueError, match="empty file"): - _get_line_ending(test_file) + with pytest.warns(match="File empty, use default line ending \n"): + assert _get_line_ending(test_file) == "\n" def test_unknown_line_ending(self): with ScratchDir("."): @@ -119,9 +119,10 @@ def test_empty_file(self): Make sure an empty file does not throw an error when reverse_readline is called, which was a problem with an earlier implementation. """ - with open(os.path.join(TEST_DIR, "empty_file.txt")) as f: - for _line in reverse_readline(f): - raise ValueError("an empty file is being read!") + with pytest.warns(match="File empty, use default line ending \n."): + with open(os.path.join(TEST_DIR, "empty_file.txt")) as f: + for _line in reverse_readline(f): + raise ValueError("an empty file is being read!") @pytest.fixture() def test_line_ending(self): @@ -189,8 +190,9 @@ def test_empty_file(self): Make sure an empty file does not throw an error when reverse_readline is called, which was a problem with an earlier implementation. """ - for _line in reverse_readfile(os.path.join(TEST_DIR, "empty_file.txt")): - raise ValueError("an empty file is being read!") + with pytest.warns(match="File empty, use default line ending \n."): + for _line in reverse_readfile(os.path.join(TEST_DIR, "empty_file.txt")): + raise ValueError("an empty file is being read!") @pytest.fixture def test_line_ending(self): From 343e0db126c6861e81ecc0553807c9d8f9b0c7bd Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 5 Sep 2024 17:42:01 +0800 Subject: [PATCH 13/96] fix docstring --- src/monty/io.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/monty/io.py b/src/monty/io.py index 0ffda9bb..c3c7e6b8 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -68,7 +68,10 @@ def _get_line_ending( "\r": Classic MacOS line ending. Raises: - ValueError: If file is empty or line ending is unknown. 
+ ValueError: If line ending is unknown. + + Warns: + If file is empty, "\n" would be used as default. """ if isinstance(file, (str, Path)): with open(file, "rb") as f: From d288e0d7713ff643ddfb711761dd0023722158b6 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 5 Sep 2024 18:03:03 +0800 Subject: [PATCH 14/96] update unit test --- tests/test_io.py | 67 ++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 51 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index 5258d49b..f6ded461 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -4,7 +4,6 @@ import gzip import os from pathlib import Path -from unittest.mock import patch import pytest from monty.io import ( @@ -124,35 +123,17 @@ def test_empty_file(self): for _line in reverse_readline(f): raise ValueError("an empty file is being read!") - @pytest.fixture() - def test_line_ending(self): + @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) + def test_line_ending(self, l_end): contents = ("Line1", "Line2", "Line3") - # Mock Linux/MacOS - with patch("os.name", "posix"): - linux_line_end = os.linesep - assert linux_line_end == "\n" - - with ScratchDir("./test_files"): - with open("sample_unix_mac.txt", "w", newline=linux_line_end) as file: - file.write(linux_line_end.join(contents)) - - with open("sample_unix_mac.txt") as file: - for idx, line in enumerate(reverse_readfile(file)): - assert line == contents[len(contents) - idx - 1] - - # Mock Windows - with patch("os.name", "nt"): - windows_line_end = os.linesep - assert windows_line_end == "\r\n" - - with ScratchDir("./test_files"): - with open("sample_windows.txt", "w", newline=windows_line_end) as file: - file.write(windows_line_end.join(contents)) + with ScratchDir("."): + with open("test_file.txt", "wb") as file: + file.write((l_end.join(contents) + l_end).encode()) - with open("sample_windows.txt") as file: - for idx, line in enumerate(reverse_readfile(file)): - assert line == contents[len(contents) - 
idx - 1] + with open("test_file.txt", "r") as file: # + for idx, line in enumerate(reverse_readline(file)): + assert line.strip() == contents[len(contents) - idx - 1] class TestReverseReadfile: @@ -194,33 +175,17 @@ def test_empty_file(self): for _line in reverse_readfile(os.path.join(TEST_DIR, "empty_file.txt")): raise ValueError("an empty file is being read!") - @pytest.fixture - def test_line_ending(self): + @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) + def test_line_ending(self, l_end): contents = ("Line1", "Line2", "Line3") - # Mock Linux/MacOS - with patch("os.name", "posix"): - linux_line_end = os.linesep - assert linux_line_end == "\n" - - with ScratchDir("./test_files"): - with open("sample_unix_mac.txt", "w", newline=linux_line_end) as file: - file.write(linux_line_end.join(contents)) - - for idx, line in enumerate(reverse_readfile("sample_unix_mac.txt")): - assert line == contents[len(contents) - idx - 1] - - # Mock Windows - with patch("os.name", "nt"): - windows_line_end = os.linesep - assert windows_line_end == "\r\n" - - with ScratchDir("./test_files"): - with open("sample_windows.txt", "w", newline=windows_line_end) as file: - file.write(windows_line_end.join(contents)) + with ScratchDir("."): + with open("test_file.txt", "wb") as file: + file.write((l_end.join(contents) + l_end).encode()) - for idx, line in enumerate(reverse_readfile("sample_windows.txt")): - assert line == contents[len(contents) - idx - 1] + with open("test_file.txt", "r") as file: # + for idx, line in enumerate(reverse_readline(file)): + assert line.strip() == contents[len(contents) - idx - 1] class TestZopen: From 064c0648c37775f28805fc81ae799d76fb691de4 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 5 Sep 2024 18:05:12 +0800 Subject: [PATCH 15/96] remove accidental comment sign --- tests/test_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index f6ded461..b2a788f3 100644 --- 
a/tests/test_io.py +++ b/tests/test_io.py @@ -131,7 +131,7 @@ def test_line_ending(self, l_end): with open("test_file.txt", "wb") as file: file.write((l_end.join(contents) + l_end).encode()) - with open("test_file.txt", "r") as file: # + with open("test_file.txt", "r") as file: for idx, line in enumerate(reverse_readline(file)): assert line.strip() == contents[len(contents) - idx - 1] @@ -183,7 +183,7 @@ def test_line_ending(self, l_end): with open("test_file.txt", "wb") as file: file.write((l_end.join(contents) + l_end).encode()) - with open("test_file.txt", "r") as file: # + with open("test_file.txt", "r") as file: for idx, line in enumerate(reverse_readline(file)): assert line.strip() == contents[len(contents) - idx - 1] From 30624aa6eeab3e5d3c7718224d62e55eca3233bf Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 5 Sep 2024 19:52:40 +0800 Subject: [PATCH 16/96] use if after return --- src/monty/io.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index c3c7e6b8..cd1e3f65 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -83,18 +83,19 @@ def _get_line_ending( else: raise TypeError(f"Unknown file type {type(file).__name__}") + # Return Unix "\n" line ending as default if file is empty if not first_line: warnings.warn("File empty, use default line ending \n.", stacklevel=2) return "\n" if first_line.endswith(b"\r\n"): return "\r\n" - elif first_line.endswith(b"\n"): + if first_line.endswith(b"\n"): return "\n" - elif first_line.endswith(b"\r"): + if first_line.endswith(b"\r"): return "\r" - else: - raise ValueError(f"Unknown line ending in file {repr(first_line)}.") + + raise ValueError(f"Unknown line ending in line {repr(first_line)}.") def reverse_readfile( From 30841233ae8edb76180f3087b9d5843272102bdc Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 5 Sep 2024 19:53:17 +0800 Subject: [PATCH 17/96] tweak docstring --- src/monty/io.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/monty/io.py b/src/monty/io.py index cd1e3f65..cd2d45a2 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -70,7 +70,7 @@ def _get_line_ending( Raises: ValueError: If line ending is unknown. - Warns: + Warnings: If file is empty, "\n" would be used as default. """ if isinstance(file, (str, Path)): From 1639725d3cdc829095f5b46e124e5741abd00575 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 6 Sep 2024 14:05:45 +0800 Subject: [PATCH 18/96] reset pointer to fix test in windows --- src/monty/io.py | 1 + tests/test_io.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index cd2d45a2..585c0db5 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -80,6 +80,7 @@ def _get_line_ending( first_line = file.buffer.readline() elif isinstance(file, (gzip.GzipFile, bz2.BZ2File)): first_line = file.readline() + file.seek(0) # reset pointer else: raise TypeError(f"Unknown file type {type(file).__name__}") diff --git a/tests/test_io.py b/tests/test_io.py index b2a788f3..47f2f53e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -28,7 +28,7 @@ def test_get_line_ending(self, l_end): Classic MacOS line ending (\r) """ with ScratchDir("."): - test_file = "test_file.txt" + test_file = "test_l_end.txt" test_line = f"This is a test{l_end}Second line{l_end}".encode() with open(test_file, "wb") as f: f.write(test_line) @@ -36,7 +36,7 @@ def test_get_line_ending(self, l_end): assert _get_line_ending(test_file) == l_end assert _get_line_ending(Path(test_file)) == l_end - with open(test_file, "r") as f: + with open(test_file, "r", encoding="utf-8") as f: assert _get_line_ending(f) == l_end # Test gzip file @@ -86,7 +86,7 @@ def test_reverse_readline(self): order, i.e. the first line that is read corresponds to the last line. 
number """ - with open(os.path.join(TEST_DIR, "3000_lines.txt")) as f: + with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8") as f: for idx, line in enumerate(reverse_readline(f)): assert ( int(line) == self.NUMLINES - idx @@ -96,7 +96,7 @@ def test_reverse_readline_fake_big(self): """ Make sure that large text files are read properly. """ - with open(os.path.join(TEST_DIR, "3000_lines.txt")) as f: + with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8") as f: for idx, line in enumerate(reverse_readline(f, max_mem=0)): assert ( int(line) == self.NUMLINES - idx @@ -111,7 +111,7 @@ def test_reverse_readline_bz2(self): with zopen(os.path.join(TEST_DIR, "myfile_bz2.bz2"), "rb") as f: for line in reverse_readline(f): lines.append(line.strip()) - assert lines[-1].strip(), ["HelloWorld." in b"HelloWorld."] + assert lines[-1].strip() == b"HelloWorld." def test_empty_file(self): """ @@ -119,7 +119,7 @@ def test_empty_file(self): is called, which was a problem with an earlier implementation. 
""" with pytest.warns(match="File empty, use default line ending \n."): - with open(os.path.join(TEST_DIR, "empty_file.txt")) as f: + with open(os.path.join(TEST_DIR, "empty_file.txt"), encoding="utf-8") as f: for _line in reverse_readline(f): raise ValueError("an empty file is being read!") @@ -131,7 +131,7 @@ def test_line_ending(self, l_end): with open("test_file.txt", "wb") as file: file.write((l_end.join(contents) + l_end).encode()) - with open("test_file.txt", "r") as file: + with open("test_file.txt", "r", encoding="utf-8") as file: for idx, line in enumerate(reverse_readline(file)): assert line.strip() == contents[len(contents) - idx - 1] @@ -183,7 +183,7 @@ def test_line_ending(self, l_end): with open("test_file.txt", "wb") as file: file.write((l_end.join(contents) + l_end).encode()) - with open("test_file.txt", "r") as file: + with open("test_file.txt", "r", encoding="utf-8") as file: for idx, line in enumerate(reverse_readline(file)): assert line.strip() == contents[len(contents) - idx - 1] From dfe553d92cd16216fe02a993cba8b62240d06b15 Mon Sep 17 00:00:00 2001 From: Haoyu Yang Date: Fri, 6 Sep 2024 14:18:46 +0800 Subject: [PATCH 19/96] encode in linux --- tests/test_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io.py b/tests/test_io.py index 47f2f53e..4b2f5af9 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -111,7 +111,7 @@ def test_reverse_readline_bz2(self): with zopen(os.path.join(TEST_DIR, "myfile_bz2.bz2"), "rb") as f: for line in reverse_readline(f): lines.append(line.strip()) - assert lines[-1].strip() == b"HelloWorld." + assert lines[-1].strip().encode() == b"HelloWorld." 
def test_empty_file(self): """ From 3b7a81158951e8ae95c7b6e144535f03fb72dbbc Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 6 Sep 2024 14:25:42 +0800 Subject: [PATCH 20/96] reset pointer in win only --- src/monty/io.py | 3 ++- tests/test_io.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 585c0db5..1629c62b 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -80,7 +80,8 @@ def _get_line_ending( first_line = file.buffer.readline() elif isinstance(file, (gzip.GzipFile, bz2.BZ2File)): first_line = file.readline() - file.seek(0) # reset pointer + if os.name == "nt": + file.seek(0) # reset pointer else: raise TypeError(f"Unknown file type {type(file).__name__}") diff --git a/tests/test_io.py b/tests/test_io.py index 4b2f5af9..47f2f53e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -111,7 +111,7 @@ def test_reverse_readline_bz2(self): with zopen(os.path.join(TEST_DIR, "myfile_bz2.bz2"), "rb") as f: for line in reverse_readline(f): lines.append(line.strip()) - assert lines[-1].strip().encode() == b"HelloWorld." + assert lines[-1].strip() == b"HelloWorld." def test_empty_file(self): """ From b2917076130f7df1b75ba0c0530219a1c12d50c2 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 6 Sep 2024 14:39:08 +0800 Subject: [PATCH 21/96] yield str in windows --- src/monty/io.py | 5 ++--- tests/test_io.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 1629c62b..49e4980d 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -80,8 +80,7 @@ def _get_line_ending( first_line = file.buffer.readline() elif isinstance(file, (gzip.GzipFile, bz2.BZ2File)): first_line = file.readline() - if os.name == "nt": - file.seek(0) # reset pointer + file.seek(0) # reset pointer else: raise TypeError(f"Unknown file type {type(file).__name__}") @@ -186,7 +185,7 @@ def reverse_readline( # For windows, we also read the whole file. 
if file_size < max_mem or isinstance(m_file, gzip.GzipFile) or os.name == "nt": for line in reversed(m_file.readlines()): - yield line.rstrip() + yield line.rstrip() if isinstance(line, str) else line.rstrip().decode() else: if isinstance(m_file, bz2.BZ2File): diff --git a/tests/test_io.py b/tests/test_io.py index 47f2f53e..3c2f9227 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -111,7 +111,7 @@ def test_reverse_readline_bz2(self): with zopen(os.path.join(TEST_DIR, "myfile_bz2.bz2"), "rb") as f: for line in reverse_readline(f): lines.append(line.strip()) - assert lines[-1].strip() == b"HelloWorld." + assert lines[-1].strip() == "HelloWorld." def test_empty_file(self): """ From aeba5a704ccf82471e86b2abb744e6ee9d3858d0 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 6 Sep 2024 14:47:17 +0800 Subject: [PATCH 22/96] use Iterator as return type --- src/monty/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 49e4980d..0fab2201 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -23,7 +23,7 @@ lzma = None # type: ignore[assignment] if TYPE_CHECKING: - from typing import IO, Generator, Literal, Union + from typing import IO, Iterator, Literal, Union def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: @@ -101,7 +101,7 @@ def _get_line_ending( def reverse_readfile( filename: Union[str, Path], -) -> Generator[str, str, None]: +) -> Iterator[str]: """ A much faster reverse read of file by using Python's mmap to generate a memory-mapped file. It is slower for very small files than @@ -139,7 +139,7 @@ def reverse_readline( m_file, blk_size: int = 4096, max_mem: int = 4000000, -) -> Generator[str, str, None]: +) -> Iterator[str]: """ Generator function to read a file line-by-line, but backwards. This allows one to efficiently get data at the end of a file. 
From 1d7adda822bd12e7d071218b0670440ce2e67146 Mon Sep 17 00:00:00 2001 From: Haoyu Yang Date: Fri, 6 Sep 2024 16:59:26 +0800 Subject: [PATCH 23/96] assert iterator return str --- tests/test_io.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index 3c2f9227..fc0197f1 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -88,6 +88,7 @@ def test_reverse_readline(self): """ with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8") as f: for idx, line in enumerate(reverse_readline(f)): + assert isinstance(line, str) assert ( int(line) == self.NUMLINES - idx ), f"read_backwards read {line} whereas it should have read {self.NUMLINES - idx}" @@ -98,6 +99,7 @@ def test_reverse_readline_fake_big(self): """ with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8") as f: for idx, line in enumerate(reverse_readline(f, max_mem=0)): + assert isinstance(line, str) assert ( int(line) == self.NUMLINES - idx ), f"read_backwards read {line} whereas it should have read {self.NUMLINES - idx}" @@ -110,8 +112,9 @@ def test_reverse_readline_bz2(self): lines = [] with zopen(os.path.join(TEST_DIR, "myfile_bz2.bz2"), "rb") as f: for line in reverse_readline(f): - lines.append(line.strip()) - assert lines[-1].strip() == "HelloWorld." 
+ lines.append(line) + assert lines == ["\n", "\n", "HelloWorld."] # test file has two empty lines + assert all(isinstance(line, str) for line in lines) def test_empty_file(self): """ @@ -133,7 +136,8 @@ def test_line_ending(self, l_end): with open("test_file.txt", "r", encoding="utf-8") as file: for idx, line in enumerate(reverse_readline(file)): - assert line.strip() == contents[len(contents) - idx - 1] + assert line == contents[len(contents) - idx - 1] + assert isinstance(line, str) class TestReverseReadfile: @@ -146,6 +150,7 @@ def test_reverse_readfile(self): """ fname = os.path.join(TEST_DIR, "3000_lines.txt") for idx, line in enumerate(reverse_readfile(fname)): + assert isinstance(line, str) assert int(line) == self.NUMLINES - idx def test_reverse_readfile_gz(self): @@ -155,6 +160,7 @@ def test_reverse_readfile_gz(self): """ fname = os.path.join(TEST_DIR, "3000_lines.txt.gz") for idx, line in enumerate(reverse_readfile(fname)): + assert isinstance(line, str) assert int(line) == self.NUMLINES - idx def test_reverse_readfile_bz2(self): @@ -164,6 +170,7 @@ def test_reverse_readfile_bz2(self): """ fname = os.path.join(TEST_DIR, "3000_lines.txt.bz2") for idx, line in enumerate(reverse_readfile(fname)): + assert isinstance(line, str) assert int(line) == self.NUMLINES - idx def test_empty_file(self): @@ -185,7 +192,8 @@ def test_line_ending(self, l_end): with open("test_file.txt", "r", encoding="utf-8") as file: for idx, line in enumerate(reverse_readline(file)): - assert line.strip() == contents[len(contents) - idx - 1] + assert isinstance(line, str) + assert line == contents[len(contents) - idx - 1] class TestZopen: From e6312a6cd8676ec1cc62664e4bc3b8116511423b Mon Sep 17 00:00:00 2001 From: Haoyu Yang Date: Fri, 6 Sep 2024 17:30:35 +0800 Subject: [PATCH 24/96] strict usage of rstrp(line_ending) --- src/monty/io.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 0fab2201..90bb9215 100644 --- 
a/src/monty/io.py +++ b/src/monty/io.py @@ -121,14 +121,14 @@ def reverse_readfile( with zopen(filename, "rb") as file: if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): for line in reversed(file.readlines()): - yield line.decode("utf-8").rstrip() + yield line.decode("utf-8").rstrip(l_end) else: filemap = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) n = len(filemap) while n > 0: i = filemap.rfind(l_end.encode(), 0, n) - yield filemap[i + 1 : n].decode("utf-8").rstrip() + yield filemap[i + 1 : n].decode("utf-8") n = i except ValueError: @@ -185,7 +185,11 @@ def reverse_readline( # For windows, we also read the whole file. if file_size < max_mem or isinstance(m_file, gzip.GzipFile) or os.name == "nt": for line in reversed(m_file.readlines()): - yield line.rstrip() if isinstance(line, str) else line.rstrip().decode() + yield ( + line.rstrip(l_end) + if isinstance(line, str) + else line.rstrip(l_end).decode() + ) else: if isinstance(m_file, bz2.BZ2File): From 7911e0a6b013333a196ed3fb79c3a84dc5a3d4aa Mon Sep 17 00:00:00 2001 From: Haoyu Yang Date: Fri, 6 Sep 2024 17:34:46 +0800 Subject: [PATCH 25/96] better assert no error --- tests/test_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index fc0197f1..c1d495b8 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -124,7 +124,7 @@ def test_empty_file(self): with pytest.warns(match="File empty, use default line ending \n."): with open(os.path.join(TEST_DIR, "empty_file.txt"), encoding="utf-8") as f: for _line in reverse_readline(f): - raise ValueError("an empty file is being read!") + pytest.fail("No error should be thrown.") @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) def test_line_ending(self, l_end): @@ -180,7 +180,7 @@ def test_empty_file(self): """ with pytest.warns(match="File empty, use default line ending \n."): for _line in reverse_readfile(os.path.join(TEST_DIR, "empty_file.txt")): - raise ValueError("an empty file is being 
read!") + pytest.fail("No error should be thrown.") @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) def test_line_ending(self, l_end): From 6cb5349a437b99afde9e6fcbd939ff32ea7e5b21 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 6 Sep 2024 17:42:56 +0800 Subject: [PATCH 26/96] decode before strip, but tests are failing --- src/monty/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 90bb9215..8f989d5f 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -176,19 +176,19 @@ def reverse_readline( try: file_size = os.path.getsize(m_file.name) except AttributeError: - # Bz2 files do not have name attribute. + # Bz2 files do not have "name" attribute. # Just set file_size to max_mem for now. file_size = max_mem + 1 # If the file size is within desired RAM limit, just reverse it in memory. # GZip files must use this method because there is no way to negative seek. # For windows, we also read the whole file. - if file_size < max_mem or isinstance(m_file, gzip.GzipFile) or os.name == "nt": + if os.name == "nt" or file_size < max_mem or isinstance(m_file, gzip.GzipFile): for line in reversed(m_file.readlines()): yield ( line.rstrip(l_end) if isinstance(line, str) - else line.rstrip(l_end).decode() + else line.decode().rstrip(l_end) ) else: From 79f07f1cb83de60c292409037f83fb593050370f Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 6 Sep 2024 20:39:39 +0800 Subject: [PATCH 27/96] add test_file_with_empty_lines for readfile, Win still not working --- tests/test_io.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_io.py b/tests/test_io.py index c1d495b8..330fcccf 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -126,6 +126,11 @@ def test_empty_file(self): for _line in reverse_readline(f): pytest.fail("No error should be thrown.") + @pytest.mark.skip("TODO: WIP") + @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) + def 
test_file_with_empty_lines(self, l_end): + """Empty lines should not be skipped.""" + @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) def test_line_ending(self, l_end): contents = ("Line1", "Line2", "Line3") @@ -182,6 +187,32 @@ def test_empty_file(self): for _line in reverse_readfile(os.path.join(TEST_DIR, "empty_file.txt")): pytest.fail("No error should be thrown.") + @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) + def test_file_with_empty_lines(self, l_end): + """Empty lines should not be skipped. + + TODO: not working for "\r\n" for some reason. + """ + expected_contents = ("line1", "", "line3") + filename = "test_empty_line.txt" + + with ScratchDir("."): + # Test text file + with open(filename, "w", newline="", encoding="utf-8") as file: + for line in expected_contents: + file.write(line + l_end) + + # Sanity check: ensure the text file is correctly written + with open(filename, "rb") as file: + raw_content = file.read() + expected_raw_content = (l_end.join(expected_contents) + l_end).encode( + "utf-8" + ) + assert raw_content == expected_raw_content + + revert_contents = tuple(reverse_readfile(filename)) + assert revert_contents[::-1] == (*expected_contents, "") + @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) def test_line_ending(self, l_end): contents = ("Line1", "Line2", "Line3") From d2ea9898bb4a3b2e2ae22d3cb93d3b1519670314 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 11:21:44 +0800 Subject: [PATCH 28/96] allow zipped file directly into get_line_end, and add test --- src/monty/io.py | 23 ++++++++++++------ tests/test_io.py | 63 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 8f989d5f..b69018b5 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -74,7 +74,7 @@ def _get_line_ending( If file is empty, "\n" would be used as default. 
""" if isinstance(file, (str, Path)): - with open(file, "rb") as f: + with zopen(file, "rb") as f: first_line = f.readline() elif isinstance(file, io.TextIOWrapper): first_line = file.buffer.readline() @@ -114,7 +114,7 @@ def reverse_readfile( Yields: Lines from the file in reverse order. """ - # Generate line ending + # Get line ending l_end = _get_line_ending(filename) try: @@ -125,11 +125,20 @@ def reverse_readfile( else: filemap = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) - n = len(filemap) - while n > 0: - i = filemap.rfind(l_end.encode(), 0, n) - yield filemap[i + 1 : n].decode("utf-8") - n = i + file_size = len(filemap) + while file_size > 0: + line_end_pos = filemap.rfind(l_end.encode(), 0, file_size) + # The last line doesn't have a line ending + if line_end_pos == -1: + yield filemap[:file_size].decode("utf-8").rstrip(l_end) + break + + yield ( + filemap[line_end_pos + len(l_end) : file_size] + .decode("utf-8") + .rstrip(l_end) + ) + file_size = line_end_pos except ValueError: return diff --git a/tests/test_io.py b/tests/test_io.py index 330fcccf..eb9090f0 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -40,19 +40,29 @@ def test_get_line_ending(self, l_end): assert _get_line_ending(f) == l_end # Test gzip file - with gzip.open(f"{test_file}.gz", "wb") as f: + gzip_filename = f"{test_file}.gz" + with gzip.open(gzip_filename, "wb") as f: f.write(test_line) - with gzip.open(f"{test_file}.gz", "rb") as f: + # Opened file + with gzip.open(gzip_filename, "rb") as f: assert _get_line_ending(f) == l_end - # Test bzip2 file - with bz2.open(f"{test_file}.bz2", "wb") as f: + # Filename directly + assert _get_line_ending(gzip_filename) == l_end + + # Test opened bzip2 file + bz2_filename = f"{test_file}.bz2" + with bz2.open(bz2_filename, "wb") as f: f.write(test_line) - with bz2.open(f"{test_file}.bz2", "rb") as f: + # Opened file + with bz2.open(bz2_filename, "rb") as f: assert _get_line_ending(f) == l_end + # Filename directly + assert 
_get_line_ending(bz2_filename) == l_end + def test_unknown_file_type(self): unknown_file = 123 @@ -189,29 +199,40 @@ def test_empty_file(self): @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) def test_file_with_empty_lines(self, l_end): - """Empty lines should not be skipped. - - TODO: not working for "\r\n" for some reason. - """ - expected_contents = ("line1", "", "line3") + """Empty lines should not be skipped.""" + contents = ("line1", "", "line3") filename = "test_empty_line.txt" with ScratchDir("."): # Test text file - with open(filename, "w", newline="", encoding="utf-8") as file: - for line in expected_contents: + with open(filename, "w", newline=l_end, encoding="utf-8") as file: + for line in contents: file.write(line + l_end) - # Sanity check: ensure the text file is correctly written - with open(filename, "rb") as file: - raw_content = file.read() - expected_raw_content = (l_end.join(expected_contents) + l_end).encode( - "utf-8" - ) - assert raw_content == expected_raw_content - revert_contents = tuple(reverse_readfile(filename)) - assert revert_contents[::-1] == (*expected_contents, "") + assert revert_contents[::-1] == (*contents, "") + + # Test bzip2 file + bz2_filename = f"{filename}.bz2" + with bz2.open( + bz2_filename, "wt", newline=l_end, encoding="utf-8" + ) as file_out: + for line in contents: + file_out.write(line + l_end) + + revert_contents_bz2 = tuple(reverse_readfile(bz2_filename)) + assert revert_contents_bz2[::-1] == (*contents, "") + + # Test gzip file + gzip_filename = f"{filename}.gz" + with gzip.open( + gzip_filename, "wt", newline=l_end, encoding="utf-8" + ) as file_out: + for line in contents: + file_out.write(line + l_end) + + revert_contents_gzip = tuple(reverse_readfile(gzip_filename)) + assert revert_contents_gzip[::-1] == (*contents, "") @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) def test_line_ending(self, l_end): From 6803c7be4aa7fc80cdbab51753072a97f5a3fa00 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" 
Date: Mon, 9 Sep 2024 16:16:47 +0800 Subject: [PATCH 29/96] remove seemingly unused file --- tests/test_files/3000lines.txt.gz | Bin 6495 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/test_files/3000lines.txt.gz diff --git a/tests/test_files/3000lines.txt.gz b/tests/test_files/3000lines.txt.gz deleted file mode 100644 index 55e186f55936f794397a4ad09c64babbfc71cf6d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6495 zcmdT^cU%+c(+5t^oVS!s6REoNj*5s##Ih@~ViWQNW`F1VmauX>UCBMD+dR-e39r-p~B;%+53O{mwiyJNg^~!N7F7 zo6>hPY;A2r0=I36u#Sw5R5Fq--dulyI8ID+{Tkf{FK(B zR9SRtLcB|v!R(vds0PPy{g{+u7df}1&gqWwgw{I8B6Z%LzK#0vBFEIjKE2N2x~+~+ z6VA#!JM!iECH0P}7NXl>y``Ow=Ld>-^|7hJ`c*rh%fe(uvtLI&AJUK1?~8ce5!O*Z zoFa4L>8Oi@DKbaia7RLljK)iFs_o>7GU^+nt2(_!ar&Vn9mS%nseNN{-K6ySH zL?x+1I#)+R`|?Zc=cL+*M4jjAucma&Es^!t)VFSl*1nb^>L-NO|h&NtCLknYkXq4Ez3`O9IM%q zJ~j4y)v+4?SV7BylO9-2VC<=ub0@2?`^p1!vZ$WZqWqC)%R<&%Ab^7yVv9&Gxa%S3W9X;jM!WM#UhpWA_mp}gmAzjODZJmjD;LT>=kg)5_R{*Vg6EFh7_ccBX^{F{ z5(TAJ>Ybsv$DNgXw&z#-+}@x{s~Tn-4hZTxm=6wqyc=G7#`oPqjQA+=HaYTD(fv7^yHO^_2 zQg>*vqOoJG1tMYm-j*w0!^oqpv!i0qj{^S7N4g(Z)bS^0{I;$ecTG(6v*LGjy$s~> zV#{PE`uwpeRf*1i3i#s6f57SKH*<}jTlBW!sY{_eT3Wv~wUAabN~t&SR@5QY3ymvo z@4Wn3;glyUhGDE#s@Ksa$XC4XSQu&jwfNuK6or4y5URgW(ecroAALl!N<|%oH1F(y zgyX%(p6*|l9h7jk_sR~vkhn4z>DO58r0w6#b>MAon=o8IUFuUP+Lro4?<#FaT$}t^ zd%o01wEvZY;|tS!p98M7it0xBeF%(ud$dOJ=YNG`lPyz}@E7PO7J2)gL`6blC<{fq zGBp&f{@T6OR545WSu=I)GwpR|WzN!Z$aK(|?bpsfkGxwa%-tOy^HQ9ClKPpajMus?LX}eAkRov~2jC9I7wsmN4MSV+u zw4*am?O+b6p@k6RxR9sD%~{L9zBb9Ub9H%Gv9>$%nNya%tLR}xOm}3RQ&AmB=CNq{D*2~0*O16rsSpp9w+I;akyi|PV;s2-q?>H|~IDZo^8 zDgdDnV1OC`hNvN6gc@}~wZZ1}lZ71Y5U;BbPEzrBYpGzZ?ve527g_3=W@ zAGLsY!3VQ7)wMLVKrNycNsD~(3IOUt3*a;+9TY>*BKQDP2c$vCFquUsiVdM9@L{GU z_!D#!PGiwYVq<6poW*nk6Cf3Y%%+pYrqC+*1aldf1ua0**mR269CC-xF}=WRP%=X1 z(5YfeXdQfs83Mk5ZX#(M`XsS6~Ufg)AnKVl;h8WE#PYu?ji?FJ&Di z&NK>N(vwCoXShM9;MJ^L;u@oiOU{SY?XKh23F=D653s~_FY7w%ls9E8wG?|o7vU|e 
z2gEF+1xwS?=~m)(&}Dcl>pAh7QSwsq5xTY53;GR?Vzm)p7~Nc&c7$#t_J&H}1lAyN z$Vg=w`6%61yiw8eeJp^aYrJ4t+EMy6@n+~B@Bx+%iDsOK11vWmBCpoCsKm3%5pOHynDy3CTO`EcE?T}2t9#MvD`@Kj5jZTg58-Z4u+n= z`7B@3Lt}ioTgDxG@mAtqivpY6YY92 z;pi;o5f9~Eo4vn(AWe$V-54?7guTKrE5H$q!e7BpS?5WcOzy2Xo)s_$+=;)2Ygi?u z-6j)V4S4}hU<}>`H?hh{0+S`K$9VxXFb8q6ug64%F6W2f>_98OZ)STgqH^U$sCnM%;cXGaI z5TgNaMnDwTM5ZgylbVbk6_3ZN=5^}@c#j*Z9ru=DWk2fQ(vOCVC zGq{ie;=(>iHZohas^BY2;L6}ZG$fAQPwqAATHTXFSjpf+^N~Gl0)=8e-7PYgz+m7If^gWHl$qw?Zauk# zRgB}%V&o9pl(NSBqFdxC!fM7zXgQL>cA!L=ce(YPBDlrKRO^aWGIu6*6Svo>lm%ap zQSVO#n@yV5@E_hjox6s62KPWNus2ignqzC=Gr8{EbNG6sh#gL8F|S^8>rAc(HxJ){ zTxTa!ez4GThtKAEa`SN?H5uw^3eL-1JDxqi+zGK)aA3_8?`*LdBDOfxc0EU9tYga{#KYR_UJn@NcE0YsbQK#j8O^6bec_%iN5 zT*N@JLJwO`UmJOmu!V6CI*eFwJgJu~!`Jp)B=|8NK*x|7oB(R2Wo-!UnU@16oZ4bp zz4n$c*PmOCXCw1C$vHZ^ zt`sjswsZcV-nGKk!-cuqxG(UlNHnL5+G16`{#Idpw>;5%qN{Q|J4UeBI~!P4Z2j@2Bnibbbg=vVF6?%i!LJQ4W#+h2mH1Pu=BL@9)0LPv)P# z%dge{=`Ozxe`=y%x4(O$U!Q+^qThi3(?q`!e`=E7cLDB6!*k{d$Mw{zbdiJ;>^Sp; zKiHj_C;ZXw{yh8aD86I@wvR6%V2AmVAFvaA$wcfDU-Bb%n=b*dO1?xDYvfDRuwK4I z9UCu@XkZk91jLL45+XKJAR%FhKtjgW2qYA2i$Frfq6Csj*gk&(_)+mtZVZ8#0J~kegOu;C)WGZHaOCW3}E-}CmTw;i=!6io67F=SC zMd1*@#@DSOw^Hd3bYCh@yxt`F4SSSm}s_AmU8N_|%8stbRwJzqBK+h3wN8MVT9 zw5zqkL7KEyI7}O;9T~sfd74Gtzqy6ax(m(z%@iLd?wPR9nKL4M>wIQJIN*GLL^$N! 
zIx;ep=)CD)xn@#AhoQUw^tk=shP8IEQZy(L4ZaW!z7!2MiUwOmgYBZhPSIemXr$b^ Rsl&NUze4XG0k&6C`X5qlh=c$D From 351452a17c0ac321a585094a96e872ac46b26c2b Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 16:19:59 +0800 Subject: [PATCH 30/96] update test file to include line end for the last line --- tests/test_files/3000_lines.txt | 2 +- tests/test_files/3000_lines.txt.bz2 | Bin 3557 -> 3561 bytes tests/test_files/3000_lines.txt.gz | Bin 6496 -> 6497 bytes 3 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_files/3000_lines.txt b/tests/test_files/3000_lines.txt index 2f8c055b..1127304b 100644 --- a/tests/test_files/3000_lines.txt +++ b/tests/test_files/3000_lines.txt @@ -2997,4 +2997,4 @@ 2997 2998 2999 -3000 \ No newline at end of file +3000 diff --git a/tests/test_files/3000_lines.txt.bz2 b/tests/test_files/3000_lines.txt.bz2 index 3a028199ae3c64964d83cfd70aacefe2179a211e..177141f0a9b9bfad70e027d8e32dd05427cb3a56 100644 GIT binary patch literal 3561 zcmcIne{56N6~2z=SP()|pa?-o%=(g5EUxMXNN6+W6lvlqqU}nHlzLsN;bxfR#@UEs zz#(~AXd7u0wO%AwO@O6RR<)rtt0C9g9xrwzG>DQ=l&*yM&ObCdU}|j#C^jUR>^#Q~ zcJj7XRd@fe?BnBezweyyopb!`D|@%zPfFyap{VT3+Oyr_FJS|tF#zZ%iE?7#_| z-oH>S)2sN5xAb?*bCiJ_s~D9{aTKixto{`$wykqDdv_nptsN-xfpp! z&e{2eEb62BOd0A~ztQN>>gU=7A)Sq?tWgeML0P$+Ilp0>oOyl2DhR$~>3U^gsywH? 
z=LOqpeP$V|pS^W(q}`tZ7$MiGEm3!%>OOWhDS6{DmoUG z%oV)ig>xPipIDTx1-Z})W8mowS6h8i5#CXs@lo6EiYk?HYX6Dl`-d(b*!rpZ*iF}Q zipj)?Sqxh&mZm75ZOS~;{z}Cu&1GBdj?KMm$Foh@H>2P~^P^(eDhPth{GzN=WR_6; z3HbOzp-`CqeL)l{?DpovPVlP3-{3@ogswCH??A*qMy^n^6{vY<4$Xs2L2rJW&=$Ci zL;Ha;yZsMrY0up$7dIWNqFD3 zxB$1S-%gvx*`GvcT%rp?!L_B<7@@V(_b>@EycMV26h{>g*DXGvcp%mcb>rDWXBzAz zx|Y~j*aGQ36CWCi4%s=7#RO94QCTMBwbas>Qo18BwYEUI&llB5$B43hvzT}u3>vVL zWq1$F-iMgI3}3kyEZz_%mklvY#u(mk7Yy~$EwonqfY2JA)!vW)rW+5~XASr?%h(@) zVVyWeJR_~A$Yd{H*$Rei-!+O~cU`7+<7}aeLnt%69}|cFOH51*?{_1lwETt8D)Sve zH}R}FfY8d4XMCX<(0XzT>lAL8d8RO?^kW#|KpM2!IInC-m@4S?PIi_^=c#XTu|oI< z#UDARDeht!=MX)nc=1otql%fvXkMjfT7gtY@*fpBzBk>yBi^e z;1!5x1{-iE%MftL_65CUg6^YrU2Nf>!B8Kq50XK;n;OSiEr(E1jYahOotEZ@t3jql zw#vn}WR>Gukobm66QjEV_LwIkq0q>?@nicQfGI74xf{1oGbU^=@7KrK`?q*v(2eRk z*+SUTBzA*%UF>G7%&ZYV2aQUnBH`(TmXT z1_Uu}RnVbDqXbrkA!B=duqqP^MJe<^OLvc?Y-xEDL_ydXS9FaYibb!{YC_{lIcZkU>*?i7AWu?zET zuQPzql6Q|w(|iAGi#GB#-10?x0&I>BeS;!(L9d6zg~AB-^WO^sJ^k7IL)}EJ128Kh zlT24-3tSD7B+*)mok0h?$u2di?*v#Xyq}A!37`2K zc-QglpTxvX@1MkE#>nr~-jG1NO4k>?NZueRA`;yi5<~~#Tvd)SQIQHN$q*Z%aC&{M<+(++X=rc=CG@*~5O8i6 zZEDXSV1M_!-}n2S@9C#relDxpIW+m>Y2qCmbQQFce!fa z6PN4%hSufAZcuq<42g{$2b$LzHCgCTGt)Gd?9gwkN^BgQ$w_E<%2UhR3$kxg3TN&OC6Tm||Q>eDerwv%HyiK@Xnd(}T-iqIk3nk>2gl&(PC^cK^EdaL3^_i8r3{H#L>-~4%xl6baZMoE0rb`p#m58T+KZSd4=hz zBq?14W3GwXa)$skt)GcA^9R<*<|B)ZFe6u?^i-*9qsp6gw9H zup9-yO!{X<+vTtytBeFb)4E7#>D~EiihmVw`vOjQ7)lNJH{!W&NV$I8d5_}$#To&V z7O6Hs&DlY5m%%fX{;791Z7{Y+|Ku#3OcKwtc}XM4Jg~mj@ux zCU4)O<QOru~dheX}U_OKQcupR@EAUd%foK{b5DuVA^Ov`sJ0;;IT0d3DT-?@(WD< z&GIHALYigFw3FvBOyXqc1v33beK(ovLND~itucPVb;SYZZ?h(1g%nHDj*0DkX)pggdLmZse! 
zw3z48zC|de_;*%Og6xaTT8%yn8sDncAoqVofga>$V`N6If$8IfB-~jX zJjRDRMw!;*xZl2H!hIZTzX#Bgoz#JQ$OM_n;PVa!!HX2vl5R56$(!K2#$WBDrQUu_ z@>%{%5M)|~YFda-gbT!l1wz?z&fFmmRFt)X7sQetQuN6N3FOw zpS9lwyqQ)qPbR%&juy3XY2Se4|K)@OxXy&ZOf|z2~a12 zQi=s{FPCA5nr3$G3Qq5`2g>-ej=5jP_t)-qxd~|HQv9x*?V<;^@UJZvYxU9lMNQ4s zPD1M_+rpU#gDFYeUMnVxgxkVPwqQ#B?||iO!oEoYn7C8a~VQM$kW*g&?w`%kDNi)7iVj|qWG1{z>rHsNWgEavovw zV4C*z$zMXnO-s5c9++ZV@{Wd~<-bJn+z8ud3e_cU^jmbh0pPv>^dA!?%1 Date: Mon, 9 Sep 2024 16:32:26 +0800 Subject: [PATCH 31/96] skip the first match of line ending char --- src/monty/io.py | 32 ++++++++++++++++++------------- tests/test_io.py | 49 ++++++++++++++++++++++++------------------------ 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index b69018b5..568a8ee4 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -62,6 +62,10 @@ def _get_line_ending( This function assumes the file has a single consistent line ending. + WARNING: as per the POSIX standard, a line is: + A sequence of zero or more non- characters plus a terminating character. + as such this would fail if the last line is missing a terminating character. + Returns: "\n": Unix line ending. "\r\n": Windows line ending. @@ -73,6 +77,7 @@ def _get_line_ending( Warnings: If file is empty, "\n" would be used as default. 
""" + # TODO: critical, read the last N (~2) chars instead of everything if isinstance(file, (str, Path)): with zopen(file, "rb") as f: first_line = f.readline() @@ -121,23 +126,24 @@ def reverse_readfile( with zopen(filename, "rb") as file: if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): for line in reversed(file.readlines()): - yield line.decode("utf-8").rstrip(l_end) + # "readlines" would keep the line end character + yield line.decode("utf-8") else: filemap = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) file_size = len(filemap) + count = 0 # TODO: more elegant way to skip first match while file_size > 0: line_end_pos = filemap.rfind(l_end.encode(), 0, file_size) - # The last line doesn't have a line ending - if line_end_pos == -1: - yield filemap[:file_size].decode("utf-8").rstrip(l_end) - break - - yield ( - filemap[line_end_pos + len(l_end) : file_size] - .decode("utf-8") - .rstrip(l_end) - ) + # The first match is the not the last line + if count > 0: + yield ( + filemap[line_end_pos + len(l_end) : file_size].decode( + "utf-8" + ) + + l_end + ) + count += 1 file_size = line_end_pos except ValueError: @@ -195,9 +201,9 @@ def reverse_readline( if os.name == "nt" or file_size < max_mem or isinstance(m_file, gzip.GzipFile): for line in reversed(m_file.readlines()): yield ( - line.rstrip(l_end) + line.rstrip(l_end) # TODO: remove rstrip if isinstance(line, str) - else line.decode().rstrip(l_end) + else line.decode().rstrip(l_end) # TODO: remove rstrip ) else: diff --git a/tests/test_io.py b/tests/test_io.py index eb9090f0..d10f12bc 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -23,13 +23,14 @@ class TestGetLineEnding: @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) def test_get_line_ending(self, l_end): """Test files with: - Unix line ending (\n) - Windows line ending (\r\n) + Unix line ending (\n). + Windows line ending (\r\n). 
Classic MacOS line ending (\r) """ + test_file = "test_l_end.txt" + test_line = f"This is a test{l_end}Second line{l_end}".encode() + with ScratchDir("."): - test_file = "test_l_end.txt" - test_line = f"This is a test{l_end}Second line{l_end}".encode() with open(test_file, "wb") as f: f.write(test_line) @@ -156,7 +157,7 @@ def test_line_ending(self, l_end): class TestReverseReadfile: - NUMLINES = 3000 + NUM_LINES = 3000 def test_reverse_readfile(self): """ @@ -166,7 +167,7 @@ def test_reverse_readfile(self): fname = os.path.join(TEST_DIR, "3000_lines.txt") for idx, line in enumerate(reverse_readfile(fname)): assert isinstance(line, str) - assert int(line) == self.NUMLINES - idx + assert int(line) == self.NUM_LINES - idx def test_reverse_readfile_gz(self): """ @@ -176,7 +177,7 @@ def test_reverse_readfile_gz(self): fname = os.path.join(TEST_DIR, "3000_lines.txt.gz") for idx, line in enumerate(reverse_readfile(fname)): assert isinstance(line, str) - assert int(line) == self.NUMLINES - idx + assert int(line) == self.NUM_LINES - idx def test_reverse_readfile_bz2(self): """ @@ -186,7 +187,7 @@ def test_reverse_readfile_bz2(self): fname = os.path.join(TEST_DIR, "3000_lines.txt.bz2") for idx, line in enumerate(reverse_readfile(fname)): assert isinstance(line, str) - assert int(line) == self.NUMLINES - idx + assert int(line) == self.NUM_LINES - idx def test_empty_file(self): """ @@ -200,28 +201,17 @@ def test_empty_file(self): @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) def test_file_with_empty_lines(self, l_end): """Empty lines should not be skipped.""" - contents = ("line1", "", "line3") + contents = (f"line1{l_end}", f"{l_end}", f"line3{l_end}") filename = "test_empty_line.txt" with ScratchDir("."): # Test text file with open(filename, "w", newline=l_end, encoding="utf-8") as file: for line in contents: - file.write(line + l_end) + file.write(line) revert_contents = tuple(reverse_readfile(filename)) - assert revert_contents[::-1] == (*contents, "") - - # 
Test bzip2 file - bz2_filename = f"{filename}.bz2" - with bz2.open( - bz2_filename, "wt", newline=l_end, encoding="utf-8" - ) as file_out: - for line in contents: - file_out.write(line + l_end) - - revert_contents_bz2 = tuple(reverse_readfile(bz2_filename)) - assert revert_contents_bz2[::-1] == (*contents, "") + assert revert_contents[::-1] == contents # Test gzip file gzip_filename = f"{filename}.gz" @@ -229,10 +219,21 @@ def test_file_with_empty_lines(self, l_end): gzip_filename, "wt", newline=l_end, encoding="utf-8" ) as file_out: for line in contents: - file_out.write(line + l_end) + file_out.write(line) revert_contents_gzip = tuple(reverse_readfile(gzip_filename)) - assert revert_contents_gzip[::-1] == (*contents, "") + assert revert_contents_gzip[::-1] == contents + + # Test bzip2 file + bz2_filename = f"{filename}.bz2" + with bz2.open( + bz2_filename, "wt", newline=l_end, encoding="utf-8" + ) as file_out: + for line in contents: + file_out.write(line) + + revert_contents_bz2 = tuple(reverse_readfile(bz2_filename)) + assert revert_contents_bz2[::-1] == contents @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) def test_line_ending(self, l_end): From 74a451ff576e0455a3a6ae2dbd82a7958cdb6d0b Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 17:22:37 +0800 Subject: [PATCH 32/96] fix test for zipped format --- src/monty/io.py | 3 +++ tests/test_io.py | 30 +++++++++++++----------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 568a8ee4..bfab24b2 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -46,6 +46,7 @@ def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: _name, ext = os.path.splitext(filename) ext = ext.upper() + if ext == ".BZ2": return bz2.open(filename, *args, **kwargs) if ext in {".GZ", ".Z"}: @@ -146,6 +147,8 @@ def reverse_readfile( count += 1 file_size = line_end_pos + # Cannot mmap an empty file + # TODO: check file size instead, at least except 
ValueError: return diff --git a/tests/test_io.py b/tests/test_io.py index d10f12bc..8ad38a12 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -167,7 +167,7 @@ def test_reverse_readfile(self): fname = os.path.join(TEST_DIR, "3000_lines.txt") for idx, line in enumerate(reverse_readfile(fname)): assert isinstance(line, str) - assert int(line) == self.NUM_LINES - idx + assert line == f"{str(self.NUM_LINES - idx)}\n" def test_reverse_readfile_gz(self): """ @@ -177,7 +177,7 @@ def test_reverse_readfile_gz(self): fname = os.path.join(TEST_DIR, "3000_lines.txt.gz") for idx, line in enumerate(reverse_readfile(fname)): assert isinstance(line, str) - assert int(line) == self.NUM_LINES - idx + assert line == f"{str(self.NUM_LINES - idx)}\n" def test_reverse_readfile_bz2(self): """ @@ -187,7 +187,7 @@ def test_reverse_readfile_bz2(self): fname = os.path.join(TEST_DIR, "3000_lines.txt.bz2") for idx, line in enumerate(reverse_readfile(fname)): assert isinstance(line, str) - assert int(line) == self.NUM_LINES - idx + assert line == f"{str(self.NUM_LINES - idx)}\n" def test_empty_file(self): """ @@ -205,32 +205,28 @@ def test_file_with_empty_lines(self, l_end): filename = "test_empty_line.txt" with ScratchDir("."): - # Test text file - with open(filename, "w", newline=l_end, encoding="utf-8") as file: - for line in contents: - file.write(line) + # # Test text file + # with open(filename, "w", newline=l_end, encoding="utf-8") as file: + # for line in contents: + # file.write(line) - revert_contents = tuple(reverse_readfile(filename)) - assert revert_contents[::-1] == contents + # revert_contents = tuple(reverse_readfile(filename)) + # assert revert_contents[::-1] == contents # Test gzip file gzip_filename = f"{filename}.gz" - with gzip.open( - gzip_filename, "wt", newline=l_end, encoding="utf-8" - ) as file_out: + with gzip.open(gzip_filename, "w") as file_out: for line in contents: - file_out.write(line) + file_out.write(line.encode()) revert_contents_gzip = 
tuple(reverse_readfile(gzip_filename)) assert revert_contents_gzip[::-1] == contents # Test bzip2 file bz2_filename = f"{filename}.bz2" - with bz2.open( - bz2_filename, "wt", newline=l_end, encoding="utf-8" - ) as file_out: + with bz2.open(bz2_filename, "w") as file_out: for line in contents: - file_out.write(line) + file_out.write(line.encode()) revert_contents_bz2 = tuple(reverse_readfile(bz2_filename)) assert revert_contents_bz2[::-1] == contents From ef454b36e88dad5a76909cdefe985fbdd9343ac3 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 17:23:02 +0800 Subject: [PATCH 33/96] drop test for legacy MacOS \r --- tests/test_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io.py b/tests/test_io.py index 8ad38a12..7af1d6c3 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -198,7 +198,7 @@ def test_empty_file(self): for _line in reverse_readfile(os.path.join(TEST_DIR, "empty_file.txt")): pytest.fail("No error should be thrown.") - @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) + @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_file_with_empty_lines(self, l_end): """Empty lines should not be skipped.""" contents = (f"line1{l_end}", f"{l_end}", f"line3{l_end}") From 3516c275df893d360d9a6576128ec809b7bcdb0f Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 17:40:35 +0800 Subject: [PATCH 34/96] make capture of mmap valueerror more narrow --- src/monty/io.py | 49 +++++++++++++++++++++++------------------------- tests/test_io.py | 2 +- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index bfab24b2..e277b09b 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -123,34 +123,31 @@ def reverse_readfile( # Get line ending l_end = _get_line_ending(filename) - try: - with zopen(filename, "rb") as file: - if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): - for line in reversed(file.readlines()): - # "readlines" would keep the line 
end character - yield line.decode("utf-8") + with zopen(filename, "rb") as file: + if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): + for line in reversed(file.readlines()): + # "readlines" would keep the line end character + yield line.decode("utf-8") - else: + else: + try: filemap = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) - file_size = len(filemap) - count = 0 # TODO: more elegant way to skip first match - while file_size > 0: - line_end_pos = filemap.rfind(l_end.encode(), 0, file_size) - # The first match is the not the last line - if count > 0: - yield ( - filemap[line_end_pos + len(l_end) : file_size].decode( - "utf-8" - ) - + l_end - ) - count += 1 - file_size = line_end_pos - - # Cannot mmap an empty file - # TODO: check file size instead, at least - except ValueError: - return + except ValueError: + warnings.warn("trying to mmap an empty file.", stacklevel=2) + return + + file_size = len(filemap) + count = 0 # TODO: more elegant way to skip first match + while file_size > 0: + line_end_pos = filemap.rfind(l_end.encode(), 0, file_size) + # The first match is the not the last line + if count > 0: + yield ( + filemap[line_end_pos + len(l_end) : file_size].decode("utf-8") + + l_end + ) + count += 1 + file_size = line_end_pos def reverse_readline( diff --git a/tests/test_io.py b/tests/test_io.py index 7af1d6c3..813c6991 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -25,7 +25,7 @@ def test_get_line_ending(self, l_end): """Test files with: Unix line ending (\n). Windows line ending (\r\n). - Classic MacOS line ending (\r) + Classic MacOS line ending (\r). 
""" test_file = "test_l_end.txt" test_line = f"This is a test{l_end}Second line{l_end}".encode() From 5e1344cb0c1442d98282c49720de4e6b6cadbb6b Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 17:43:40 +0800 Subject: [PATCH 35/96] use platform.system for windows check --- src/monty/io.py | 13 +++++++------ tests/test_shutil.py | 2 +- tests/test_tempfile.py | 3 ++- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index e277b09b..ab51999c 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -11,6 +11,7 @@ import io import mmap import os +import platform import subprocess import time import warnings @@ -198,13 +199,13 @@ def reverse_readline( # If the file size is within desired RAM limit, just reverse it in memory. # GZip files must use this method because there is no way to negative seek. # For windows, we also read the whole file. - if os.name == "nt" or file_size < max_mem or isinstance(m_file, gzip.GzipFile): + if ( + platform.system() == "Windows" + or file_size < max_mem + or isinstance(m_file, gzip.GzipFile) + ): for line in reversed(m_file.readlines()): - yield ( - line.rstrip(l_end) # TODO: remove rstrip - if isinstance(line, str) - else line.decode().rstrip(l_end) # TODO: remove rstrip - ) + yield (line if isinstance(line, str) else line.decode()) else: if isinstance(m_file, bz2.BZ2File): diff --git a/tests/test_shutil.py b/tests/test_shutil.py index 16cd9aa7..f1b545b9 100644 --- a/tests/test_shutil.py +++ b/tests/test_shutil.py @@ -30,7 +30,7 @@ def setup_method(self): os.mkdir(os.path.join(test_dir, "cpr_src", "sub")) with open(os.path.join(test_dir, "cpr_src", "sub", "testr"), "w") as f: f.write("what2") - if os.name != "nt": + if platform.system() != "Windows": os.symlink( os.path.join(test_dir, "cpr_src", "test"), os.path.join(test_dir, "cpr_src", "mysymlink"), diff --git a/tests/test_tempfile.py b/tests/test_tempfile.py index 4dfd3b17..5b9c72b9 100644 --- a/tests/test_tempfile.py +++ 
b/tests/test_tempfile.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import platform import shutil import pytest @@ -120,7 +121,7 @@ def test_no_copy(self): assert "scratch_text" not in files def test_symlink(self): - if os.name != "nt": + if platform.system() != "Windows": with ScratchDir( self.scratch_root, copy_from_current_on_enter=False, From 283a383ecca16907370e311a8713c76848032e2e Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 17:58:32 +0800 Subject: [PATCH 36/96] WIP: test for \r\n is failing for some reason --- src/monty/io.py | 3 +-- tests/test_io.py | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index ab51999c..603f1d4c 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -144,8 +144,7 @@ def reverse_readfile( # The first match is the not the last line if count > 0: yield ( - filemap[line_end_pos + len(l_end) : file_size].decode("utf-8") - + l_end + filemap[line_end_pos + 1 : file_size].decode("utf-8") + l_end ) count += 1 file_size = line_end_pos diff --git a/tests/test_io.py b/tests/test_io.py index 813c6991..dd48a9ab 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -88,6 +88,7 @@ def test_unknown_line_ending(self): _get_line_ending(test_file) +@pytest.mark.skip("TODO: WIP") class TestReverseReadline: NUMLINES = 3000 @@ -205,13 +206,13 @@ def test_file_with_empty_lines(self, l_end): filename = "test_empty_line.txt" with ScratchDir("."): - # # Test text file - # with open(filename, "w", newline=l_end, encoding="utf-8") as file: - # for line in contents: - # file.write(line) + # Test text file + with open(filename, "w", newline=l_end, encoding="utf-8") as file: + for line in contents: + file.write(line) - # revert_contents = tuple(reverse_readfile(filename)) - # assert revert_contents[::-1] == contents + revert_contents = tuple(reverse_readfile(filename)) + assert revert_contents[::-1] == contents # Test gzip file 
gzip_filename = f"{filename}.gz" @@ -231,18 +232,17 @@ def test_file_with_empty_lines(self, l_end): revert_contents_bz2 = tuple(reverse_readfile(bz2_filename)) assert revert_contents_bz2[::-1] == contents - @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) + @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_line_ending(self, l_end): - contents = ("Line1", "Line2", "Line3") + contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") + file_name = "test_file.txt" with ScratchDir("."): - with open("test_file.txt", "wb") as file: - file.write((l_end.join(contents) + l_end).encode()) + with open(file_name, "w", newline=l_end, encoding="utf-8") as file: + for line in contents: + file.write(line) - with open("test_file.txt", "r", encoding="utf-8") as file: - for idx, line in enumerate(reverse_readline(file)): - assert isinstance(line, str) - assert line == contents[len(contents) - idx - 1] + assert tuple(reverse_readfile(file_name))[::-1] == contents class TestZopen: From ace92646289fb0add166dd6e524dd18d047b2f07 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 19:14:31 +0800 Subject: [PATCH 37/96] fix \r\n location, but it looks silly, need opt --- src/monty/io.py | 20 +++++++++++++++----- tests/test_io.py | 8 +++++--- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 603f1d4c..94581110 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -140,14 +140,24 @@ def reverse_readfile( file_size = len(filemap) count = 0 # TODO: more elegant way to skip first match while file_size > 0: - line_end_pos = filemap.rfind(l_end.encode(), 0, file_size) - # The first match is the not the last line - if count > 0: + # Find line segment start and end positions + seg_start_pos = filemap.rfind(l_end.encode(), 0, file_size) + sec_end_pos = file_size - len(l_end) + 1 + + # The first line doesn't have an ending character at its head + if seg_start_pos == -1: + yield 
(filemap[:sec_end_pos].decode("utf-8") + l_end) + + # Skip the first match (the last line ending character) + elif count > 0: yield ( - filemap[line_end_pos + 1 : file_size].decode("utf-8") + l_end + filemap[seg_start_pos + len(l_end) : sec_end_pos].decode( + "utf-8" + ) + + l_end ) count += 1 - file_size = line_end_pos + file_size = seg_start_pos def reverse_readline( diff --git a/tests/test_io.py b/tests/test_io.py index dd48a9ab..62a8477a 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -235,14 +235,16 @@ def test_file_with_empty_lines(self, l_end): @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_line_ending(self, l_end): contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") - file_name = "test_file.txt" + filename = "test_file.txt" with ScratchDir("."): - with open(file_name, "w", newline=l_end, encoding="utf-8") as file: + with open(filename, "w", newline=l_end, encoding="utf-8") as file: for line in contents: file.write(line) - assert tuple(reverse_readfile(file_name))[::-1] == contents + revert_contents = tuple(reverse_readfile(filename)) + print(revert_contents) + assert revert_contents[::-1] == contents class TestZopen: From 9e92bb9921aa3b4829b31f9f58dfa509b3cfbac4 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 19:20:37 +0800 Subject: [PATCH 38/96] add TODO tag --- src/monty/io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/monty/io.py b/src/monty/io.py index 94581110..7585b724 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -143,8 +143,10 @@ def reverse_readfile( # Find line segment start and end positions seg_start_pos = filemap.rfind(l_end.encode(), 0, file_size) sec_end_pos = file_size - len(l_end) + 1 + # TODO: -len(l_end) and then + l_end looks silly, + # but otherwise get "\r\r" at the end of each line - # The first line doesn't have an ending character at its head + # The first line doesn't have an ending character at its start if seg_start_pos == -1: yield 
(filemap[:sec_end_pos].decode("utf-8") + l_end) From 6a8f8b51fba50f1e5900be65002c49703c0b868e Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 19:31:16 +0800 Subject: [PATCH 39/96] finally fixed, avoid newline otherwise \n get overwrite by \r\n causing double \r --- src/monty/io.py | 7 ++----- tests/test_io.py | 5 ++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 7585b724..cafb8257 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -142,13 +142,11 @@ def reverse_readfile( while file_size > 0: # Find line segment start and end positions seg_start_pos = filemap.rfind(l_end.encode(), 0, file_size) - sec_end_pos = file_size - len(l_end) + 1 - # TODO: -len(l_end) and then + l_end looks silly, - # but otherwise get "\r\r" at the end of each line + sec_end_pos = file_size + len(l_end) # The first line doesn't have an ending character at its start if seg_start_pos == -1: - yield (filemap[:sec_end_pos].decode("utf-8") + l_end) + yield (filemap[:sec_end_pos].decode("utf-8")) # Skip the first match (the last line ending character) elif count > 0: @@ -156,7 +154,6 @@ def reverse_readfile( filemap[seg_start_pos + len(l_end) : sec_end_pos].decode( "utf-8" ) - + l_end ) count += 1 file_size = seg_start_pos diff --git a/tests/test_io.py b/tests/test_io.py index 62a8477a..ff5b4655 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -207,7 +207,7 @@ def test_file_with_empty_lines(self, l_end): with ScratchDir("."): # Test text file - with open(filename, "w", newline=l_end, encoding="utf-8") as file: + with open(filename, "w", newline="", encoding="utf-8") as file: for line in contents: file.write(line) @@ -238,12 +238,11 @@ def test_line_ending(self, l_end): filename = "test_file.txt" with ScratchDir("."): - with open(filename, "w", newline=l_end, encoding="utf-8") as file: + with open(filename, "w", newline="", encoding="utf-8") as file: for line in contents: file.write(line) revert_contents = 
tuple(reverse_readfile(filename)) - print(revert_contents) assert revert_contents[::-1] == contents From f69c2053e5b779a9247fdb15492058c259da59f8 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 20:20:35 +0800 Subject: [PATCH 40/96] fix line ending for text mode --- tests/test_io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_io.py b/tests/test_io.py index ff5b4655..46402ba2 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -168,7 +168,8 @@ def test_reverse_readfile(self): fname = os.path.join(TEST_DIR, "3000_lines.txt") for idx, line in enumerate(reverse_readfile(fname)): assert isinstance(line, str) - assert line == f"{str(self.NUM_LINES - idx)}\n" + # OS would automatically convert line ending in text mode + assert line == f"{str(self.NUM_LINES - idx)}{os.linesep}" def test_reverse_readfile_gz(self): """ From 28e6cc4032a595c11bfe52c321f82af692095b9a Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 20:48:58 +0800 Subject: [PATCH 41/96] drop test of \r altogether --- src/monty/io.py | 13 ++++++------- tests/test_io.py | 8 +++----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index cafb8257..f922c578 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -59,19 +59,19 @@ def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: def _get_line_ending( file: str | Path | io.TextIOWrapper, -) -> Literal["\r\n", "\n", "\r"]: +) -> Literal["\r\n", "\n"]: """Helper function to get line ending of a file. This function assumes the file has a single consistent line ending. WARNING: as per the POSIX standard, a line is: A sequence of zero or more non- characters plus a terminating character. - as such this would fail if the last line is missing a terminating character. + as such this func would fail if the last line misses a terminating character. 
+ https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html Returns: "\n": Unix line ending. "\r\n": Windows line ending. - "\r": Classic MacOS line ending. Raises: ValueError: If line ending is unknown. @@ -100,9 +100,8 @@ def _get_line_ending( return "\r\n" if first_line.endswith(b"\n"): return "\n" - if first_line.endswith(b"\r"): - return "\r" + # It's likely the line is missing a line ending for its last line raise ValueError(f"Unknown line ending in line {repr(first_line)}.") @@ -144,11 +143,11 @@ def reverse_readfile( seg_start_pos = filemap.rfind(l_end.encode(), 0, file_size) sec_end_pos = file_size + len(l_end) - # The first line doesn't have an ending character at its start + # The first line (original) doesn't have an ending character at its start if seg_start_pos == -1: yield (filemap[:sec_end_pos].decode("utf-8")) - # Skip the first match (the last line ending character) + # Skip the first match (the original last line ending character) elif count > 0: yield ( filemap[seg_start_pos + len(l_end) : sec_end_pos].decode( diff --git a/tests/test_io.py b/tests/test_io.py index 46402ba2..a025d3cf 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -20,12 +20,11 @@ class TestGetLineEnding: - @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) + @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_get_line_ending(self, l_end): """Test files with: Unix line ending (\n). Windows line ending (\r\n). - Classic MacOS line ending (\r). 
""" test_file = "test_l_end.txt" test_line = f"This is a test{l_end}Second line{l_end}".encode() @@ -88,7 +87,6 @@ def test_unknown_line_ending(self): _get_line_ending(test_file) -@pytest.mark.skip("TODO: WIP") class TestReverseReadline: NUMLINES = 3000 @@ -139,11 +137,11 @@ def test_empty_file(self): pytest.fail("No error should be thrown.") @pytest.mark.skip("TODO: WIP") - @pytest.mark.parametrize("l_end", ["\n", "\r\n", "\r"]) + @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_file_with_empty_lines(self, l_end): """Empty lines should not be skipped.""" - @pytest.mark.parametrize("l_end", ["\n", "\r", "\r\n"]) + @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_line_ending(self, l_end): contents = ("Line1", "Line2", "Line3") From bf3b90a6ef1ac5697dccb6ef2300121e38f081bb Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 20:56:13 +0800 Subject: [PATCH 42/96] remove manual counter --- src/monty/io.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index f922c578..649818a2 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -137,7 +137,6 @@ def reverse_readfile( return file_size = len(filemap) - count = 0 # TODO: more elegant way to skip first match while file_size > 0: # Find line segment start and end positions seg_start_pos = filemap.rfind(l_end.encode(), 0, file_size) @@ -148,13 +147,12 @@ def reverse_readfile( yield (filemap[:sec_end_pos].decode("utf-8")) # Skip the first match (the original last line ending character) - elif count > 0: + elif file_size != len(filemap): yield ( filemap[seg_start_pos + len(l_end) : sec_end_pos].decode( "utf-8" ) ) - count += 1 file_size = seg_start_pos From a03b301ba456c8f3582ee154a8d44f1e3f1a0ee8 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 9 Sep 2024 21:13:00 +0800 Subject: [PATCH 43/96] copy unit test, to be fixed --- src/monty/io.py | 12 +++++------ tests/test_io.py | 52 
+++++++++++++++++++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 649818a2..3b4baec2 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -210,13 +210,12 @@ def reverse_readline( or isinstance(m_file, gzip.GzipFile) ): for line in reversed(m_file.readlines()): - yield (line if isinstance(line, str) else line.decode()) + yield (line if isinstance(line, str) else line.decode("utf-8")) else: if isinstance(m_file, bz2.BZ2File): - # For bz2 files, seeks are expensive. It is therefore in our best - # interest to maximize the blk_size within limits of desired RAM - # use. + # For bz2 files, seeking is expensive. It is therefore in our best + # interest to maximize the blk_size within RAM usage limit. blk_size = min(max_mem, file_size) buf = "" @@ -228,9 +227,10 @@ def reverse_readline( while True: newline_pos = buf.rfind(l_end) pos = m_file.tell() + + # Found a newline if newline_pos != -1: - # Found a newline - line = buf[newline_pos + 1 :] + line = buf[newline_pos + len(l_end) :] buf = buf[:newline_pos] if pos or newline_pos or trailing_newline: line += l_end diff --git a/tests/test_io.py b/tests/test_io.py index a025d3cf..3f3542dd 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -99,20 +99,17 @@ def test_reverse_readline(self): with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8") as f: for idx, line in enumerate(reverse_readline(f)): assert isinstance(line, str) - assert ( - int(line) == self.NUMLINES - idx - ), f"read_backwards read {line} whereas it should have read {self.NUMLINES - idx}" + assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" def test_reverse_readline_fake_big(self): """ - Make sure that large text files are read properly. + Make sure that large text files are read properly, + by setting max_mem to 0. 
""" with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8") as f: for idx, line in enumerate(reverse_readline(f, max_mem=0)): assert isinstance(line, str) - assert ( - int(line) == self.NUMLINES - idx - ), f"read_backwards read {line} whereas it should have read {self.NUMLINES - idx}" + assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" def test_reverse_readline_bz2(self): """ @@ -136,20 +133,51 @@ def test_empty_file(self): for _line in reverse_readline(f): pytest.fail("No error should be thrown.") - @pytest.mark.skip("TODO: WIP") @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_file_with_empty_lines(self, l_end): """Empty lines should not be skipped.""" + contents = (f"line1{l_end}", f"{l_end}", f"line3{l_end}") + filename = "test_empty_line.txt" + + with ScratchDir("."): + # Test text file + with open(filename, "w", newline="", encoding="utf-8") as file: + for line in contents: + file.write(line) + + with zopen(filename) as file: + revert_contents = tuple(reverse_readline(file)) + assert revert_contents[::-1] == contents + + # Test gzip file + gzip_filename = f"{filename}.gz" + with gzip.open(gzip_filename, "w") as file_out: + for line in contents: + file_out.write(line.encode()) + + revert_contents_gzip = tuple(reverse_readline(gzip_filename)) + assert revert_contents_gzip[::-1] == contents + + # Test bzip2 file + bz2_filename = f"{filename}.bz2" + with bz2.open(bz2_filename, "w") as file_out: + for line in contents: + file_out.write(line.encode()) + + revert_contents_bz2 = tuple(reverse_readline(bz2_filename)) + assert revert_contents_bz2[::-1] == contents @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_line_ending(self, l_end): - contents = ("Line1", "Line2", "Line3") + contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") + file_name = "test_file.txt" with ScratchDir("."): - with open("test_file.txt", "wb") as file: - file.write((l_end.join(contents) + l_end).encode()) + with open(file_name, "w", newline="", 
encoding="utf-8") as file: + for line in contents: + file.write(line) - with open("test_file.txt", "r", encoding="utf-8") as file: + with open(file_name, "r", encoding="utf-8") as file: for idx, line in enumerate(reverse_readline(file)): assert line == contents[len(contents) - idx - 1] assert isinstance(line, str) From 138b7569666fadbfb20451fc6917dbec7caf8b2a Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Tue, 10 Sep 2024 11:30:22 +0800 Subject: [PATCH 44/96] update warn msg upon empty file --- src/monty/io.py | 23 ++++++++++++----------- tests/test_io.py | 34 +++++++++++++++++----------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 3b4baec2..fb41dc88 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -74,7 +74,8 @@ def _get_line_ending( "\r\n": Windows line ending. Raises: - ValueError: If line ending is unknown. + ValueError: If line ending is unknown, likely the file is + missing a terminating character. Warnings: If file is empty, "\n" would be used as default. @@ -93,7 +94,7 @@ def _get_line_ending( # Return Unix "\n" line ending as default if file is empty if not first_line: - warnings.warn("File empty, use default line ending \n.", stacklevel=2) + warnings.warn("File is empty, return Unix line ending \n.", stacklevel=2) return "\n" if first_line.endswith(b"\r\n"): @@ -162,22 +163,22 @@ def reverse_readline( max_mem: int = 4000000, ) -> Iterator[str]: """ - Generator function to read a file line-by-line, but backwards. - This allows one to efficiently get data at the end of a file. + Read a file line-by-line, but backwards. This allows one to + efficiently get data from the end of a file. Read file forwards and reverse in memory for files smaller than the - max_mem parameter, or for gzip files where reverse seeks are not supported. + max_mem parameter, or for Gzip files where reverse seeks are not supported. Files larger than max_mem are dynamically read backwards. 
Reference: - Based on code by Peter Astrand , using modifications - by Raymond Hettinger and Kevin German. - http://code.activestate.com/recipes/439045-read-a-text-file-backwards - -yet-another-implementat/ + Based on code by Peter Astrand , using + modifications by Raymond Hettinger and Kevin German. + http://code.activestate.com/recipes/439045-read-a-text- + file-backwards-yet-another-implementat/ Args: - m_file (File): File stream to read (backwards) + m_file (File): File stream to read (backwards). blk_size (int): The buffer size in bytes. Defaults to 4096. max_mem (int): The maximum amount of memory to involve in this operation. This is used to determine when to reverse a file @@ -202,7 +203,7 @@ def reverse_readline( file_size = max_mem + 1 # If the file size is within desired RAM limit, just reverse it in memory. - # GZip files must use this method because there is no way to negative seek. + # Gzip files must use this method because there is no way to negative seek. # For windows, we also read the whole file. if ( platform.system() == "Windows" diff --git a/tests/test_io.py b/tests/test_io.py index 3f3542dd..4a0e4fcc 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -74,7 +74,7 @@ def test_empty_file(self): test_file = "empty_file.txt" open(test_file, "w").close() - with pytest.warns(match="File empty, use default line ending \n"): + with pytest.warns(match="File is empty, return Unix line ending \n"): assert _get_line_ending(test_file) == "\n" def test_unknown_line_ending(self): @@ -128,7 +128,7 @@ def test_empty_file(self): Make sure an empty file does not throw an error when reverse_readline is called, which was a problem with an earlier implementation. 
""" - with pytest.warns(match="File empty, use default line ending \n."): + with pytest.warns(match="File is empty, return Unix line ending \n."): with open(os.path.join(TEST_DIR, "empty_file.txt"), encoding="utf-8") as f: for _line in reverse_readline(f): pytest.fail("No error should be thrown.") @@ -149,23 +149,23 @@ def test_file_with_empty_lines(self, l_end): revert_contents = tuple(reverse_readline(file)) assert revert_contents[::-1] == contents - # Test gzip file - gzip_filename = f"{filename}.gz" - with gzip.open(gzip_filename, "w") as file_out: - for line in contents: - file_out.write(line.encode()) + # # Test gzip file + # gzip_filename = f"{filename}.gz" + # with gzip.open(gzip_filename, "w") as file_out: + # for line in contents: + # file_out.write(line.encode()) - revert_contents_gzip = tuple(reverse_readline(gzip_filename)) - assert revert_contents_gzip[::-1] == contents + # revert_contents_gzip = tuple(reverse_readline(gzip_filename)) + # assert revert_contents_gzip[::-1] == contents - # Test bzip2 file - bz2_filename = f"{filename}.bz2" - with bz2.open(bz2_filename, "w") as file_out: - for line in contents: - file_out.write(line.encode()) + # # Test bzip2 file + # bz2_filename = f"{filename}.bz2" + # with bz2.open(bz2_filename, "w") as file_out: + # for line in contents: + # file_out.write(line.encode()) - revert_contents_bz2 = tuple(reverse_readline(bz2_filename)) - assert revert_contents_bz2[::-1] == contents + # revert_contents_bz2 = tuple(reverse_readline(bz2_filename)) + # assert revert_contents_bz2[::-1] == contents @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_line_ending(self, l_end): @@ -222,7 +222,7 @@ def test_empty_file(self): Make sure an empty file does not throw an error when reverse_readline is called, which was a problem with an earlier implementation. 
""" - with pytest.warns(match="File empty, use default line ending \n."): + with pytest.warns(match="File is empty, return Unix line ending \n."): for _line in reverse_readfile(os.path.join(TEST_DIR, "empty_file.txt")): pytest.fail("No error should be thrown.") From 29dc50dc7b570d7e1c17475361edbdb0cb04bb0e Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Tue, 10 Sep 2024 11:50:50 +0800 Subject: [PATCH 45/96] tweak docstring --- src/monty/io.py | 26 ++++++++++++++------------ tests/test_multiprocessing.py | 8 ++++---- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index fb41dc88..88d8ee76 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -160,16 +160,19 @@ def reverse_readfile( def reverse_readline( m_file, blk_size: int = 4096, - max_mem: int = 4000000, + max_mem: int = 4_000_000, ) -> Iterator[str]: """ - Read a file line-by-line, but backwards. This allows one to - efficiently get data from the end of a file. + Read a file backwards line-by-line, and behave similarly to + the file.readline function. This allows one to efficiently + get data from the end of a file. - Read file forwards and reverse in memory for files smaller than the - max_mem parameter, or for Gzip files where reverse seeks are not supported. + Cases where file would be read forwards and reversed in RAM: + - If file size is smaller than RAM usage limit (max_mem). + - In Windows. TODO: explain reason. + - For Gzip files, as reverse seeks are not supported. - Files larger than max_mem are dynamically read backwards. + Files larger than max_mem are read one segment each time. Reference: Based on code by Peter Astrand , using @@ -180,14 +183,13 @@ def reverse_readline( Args: m_file (File): File stream to read (backwards). blk_size (int): The buffer size in bytes. Defaults to 4096. - max_mem (int): The maximum amount of memory to involve in this - operation. 
This is used to determine when to reverse a file - in-memory versus seeking portions of a file. For bz2 files, - this sets the maximum block size. + max_mem (int): The maximum amount of RAM to use in bytes, + which determines when to reverse a file in-memory versus + seeking segments of a file. For bz2 files, this sets + the block size. Yields: - Lines from the file. Behave similarly to the file.readline function, - except the lines are returned from the back of the file. + Lines from the back of the file. """ # Generate line ending l_end = _get_line_ending(m_file) diff --git a/tests/test_multiprocessing.py b/tests/test_multiprocessing.py index e3fe1b9f..e507c321 100644 --- a/tests/test_multiprocessing.py +++ b/tests/test_multiprocessing.py @@ -6,12 +6,12 @@ def test_imap_tqdm(): - results = imap_tqdm(4, sqrt, range(10000)) - assert len(results) == 10000 + results = imap_tqdm(4, sqrt, range(10_000)) + assert len(results) == 10_000 assert results[0] == 0 assert results[400] == 20 assert results[9999] == 99.99499987499375 - results = imap_tqdm(4, sqrt, (i**2 for i in range(10000))) - assert len(results) == 10000 + results = imap_tqdm(4, sqrt, (i**2 for i in range(10_000))) + assert len(results) == 10_000 assert results[0] == 0 assert results[400] == 400 From 8f9496448d29812cc80e2d282a453afcc5466f8f Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Tue, 10 Sep 2024 15:51:38 +0800 Subject: [PATCH 46/96] add comments --- src/monty/io.py | 72 +++++++++++++++++++++++++++--------------------- tests/test_io.py | 13 +++++++-- 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 88d8ee76..bd61f4dc 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -170,9 +170,9 @@ def reverse_readline( Cases where file would be read forwards and reversed in RAM: - If file size is smaller than RAM usage limit (max_mem). - In Windows. TODO: explain reason. - - For Gzip files, as reverse seeks are not supported. 
+ - For Gzip files, as reverse seeks are not supported. # TODO: now supported - Files larger than max_mem are read one segment each time. + Files larger than max_mem are read one block each time. Reference: Based on code by Peter Astrand , using @@ -185,17 +185,18 @@ def reverse_readline( blk_size (int): The buffer size in bytes. Defaults to 4096. max_mem (int): The maximum amount of RAM to use in bytes, which determines when to reverse a file in-memory versus - seeking segments of a file. For bz2 files, this sets + seeking blocks of a file. For bz2 files, this sets the block size. Yields: Lines from the back of the file. """ # Generate line ending - l_end = _get_line_ending(m_file) + l_end: Literal["\r\n", "\n"] = _get_line_ending(m_file) + len_l_end: Literal[1, 2] = len(l_end) - # Check if the file stream is a buffered text stream - is_text = isinstance(m_file, io.TextIOWrapper) + # Check if the file stream is a buffered text stream (text instead of binary) + is_text: bool = isinstance(m_file, io.TextIOWrapper) try: file_size = os.path.getsize(m_file.name) @@ -208,7 +209,10 @@ def reverse_readline( # Gzip files must use this method because there is no way to negative seek. # For windows, we also read the whole file. if ( - platform.system() == "Windows" + platform.system() == "Windows" # TODO: platform is not important, len_l_end is + or ( + len_l_end != 1 + ) # TODO: the following code wouldn't work for "\r\n" as its len is 2 or file_size < max_mem or isinstance(m_file, gzip.GzipFile) ): @@ -216,43 +220,49 @@ def reverse_readline( yield (line if isinstance(line, str) else line.decode("utf-8")) else: + # For bz2 files, seek is expensive. It is therefore in our best + # interest to maximize the block size within RAM usage limit. 
+ + # TODO: not sure if bzip2 has any improvement on seek, need test + # https://stackoverflow.com/questions/25734252/why-is-seeking-from-the-end-of-a-file-allowed-for-bzip2-files-and-not-gzip-files if isinstance(m_file, bz2.BZ2File): - # For bz2 files, seeking is expensive. It is therefore in our best - # interest to maximize the blk_size within RAM usage limit. blk_size = min(max_mem, file_size) - buf = "" + buffer: str = "" m_file.seek(0, 2) - last_char = m_file.read(1) if is_text else m_file.read(1).decode("utf-8") - - trailing_newline = last_char == l_end while True: - newline_pos = buf.rfind(l_end) - pos = m_file.tell() - - # Found a newline - if newline_pos != -1: - line = buf[newline_pos + len(l_end) :] - buf = buf[:newline_pos] - if pos or newline_pos or trailing_newline: + l_end_pos: int = buffer.rfind(l_end) + pt_pos: int = ( + m_file.tell() + ) # pointer position (also size of remaining file to read) + + # Line ending found within buffer + if l_end_pos != -1: + line = buffer[l_end_pos + len_l_end :] + buffer = buffer[:l_end_pos] # buffer doesn't include l_end + if pt_pos != 0 or l_end_pos != 0: # TODO: why is this condition needed? line += l_end yield line - elif pos: - # Need to fill buffer - to_read = min(blk_size, pos) - m_file.seek(pos - to_read, 0) + # Line ending not in current buffer, load next block into the buffer + elif pt_pos > 0: + to_read: int = min(blk_size, pt_pos) + m_file.seek(pt_pos - to_read) if is_text: - buf = m_file.read(to_read) + buf + buffer += m_file.read(to_read) else: - buf = m_file.read(to_read).decode("utf-8") + buf - m_file.seek(pos - to_read, 0) - if pos == to_read: - buf = l_end + buf + buffer += m_file.read(to_read).decode("utf-8") + + # Move pointer forward # TODO: why pointer is moved forward again? 
+ m_file.seek(pt_pos - to_read) + + # Add a l_end to the start of file + if pt_pos == to_read: + buffer = l_end + buffer + # Start of file (no more l_end found, and pt_pos at the start) else: - # Start-of-file return diff --git a/tests/test_io.py b/tests/test_io.py index 4a0e4fcc..7f2b9b5e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -106,7 +106,9 @@ def test_reverse_readline_fake_big(self): Make sure that large text files are read properly, by setting max_mem to 0. """ - with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8") as f: + with open( + os.path.join(TEST_DIR, "3000_lines.txt"), mode="r", encoding="utf-8" + ) as f: for idx, line in enumerate(reverse_readline(f, max_mem=0)): assert isinstance(line, str) assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" @@ -145,7 +147,7 @@ def test_file_with_empty_lines(self, l_end): for line in contents: file.write(line) - with zopen(filename) as file: + with zopen(filename, mode="r") as file: revert_contents = tuple(reverse_readline(file)) assert revert_contents[::-1] == contents @@ -177,11 +179,18 @@ def test_line_ending(self, l_end): for line in contents: file.write(line) + # Test text mode with open(file_name, "r", encoding="utf-8") as file: for idx, line in enumerate(reverse_readline(file)): assert line == contents[len(contents) - idx - 1] assert isinstance(line, str) + # # TODO: Test binary mode + # with open(file_name, "rb") as file: + # for idx, line in enumerate(reverse_readline(file)): + # assert line == contents[len(contents) - idx - 1] + # assert isinstance(line, str) + class TestReverseReadfile: NUM_LINES = 3000 From 38047275afaffeead6e859985246d413106f60b1 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 13 Sep 2024 15:50:04 +0800 Subject: [PATCH 47/96] Fix line ending handling in reverse readline --- src/monty/io.py | 33 ++++++++++++--------------------- tests/test_io.py | 17 +++++++++++------ 2 files changed, 23 insertions(+), 27 deletions(-) diff --git 
a/src/monty/io.py b/src/monty/io.py index bd61f4dc..fb9db2bf 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -11,12 +11,11 @@ import io import mmap import os -import platform import subprocess import time import warnings from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, cast try: import lzma @@ -24,7 +23,7 @@ lzma = None # type: ignore[assignment] if TYPE_CHECKING: - from typing import IO, Iterator, Literal, Union + from typing import IO, Iterator, Union def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: @@ -169,7 +168,6 @@ def reverse_readline( Cases where file would be read forwards and reversed in RAM: - If file size is smaller than RAM usage limit (max_mem). - - In Windows. TODO: explain reason. - For Gzip files, as reverse seeks are not supported. # TODO: now supported Files larger than max_mem are read one block each time. @@ -193,7 +191,7 @@ def reverse_readline( """ # Generate line ending l_end: Literal["\r\n", "\n"] = _get_line_ending(m_file) - len_l_end: Literal[1, 2] = len(l_end) + len_l_end: Literal[1, 2] = cast(Literal[1, 2], len(l_end)) # Check if the file stream is a buffered text stream (text instead of binary) is_text: bool = isinstance(m_file, io.TextIOWrapper) @@ -207,15 +205,7 @@ def reverse_readline( # If the file size is within desired RAM limit, just reverse it in memory. # Gzip files must use this method because there is no way to negative seek. - # For windows, we also read the whole file. 
- if ( - platform.system() == "Windows" # TODO: platform is not important, len_l_end is - or ( - len_l_end != 1 - ) # TODO: the following code wouldn't work for "\r\n" as its len is 2 - or file_size < max_mem - or isinstance(m_file, gzip.GzipFile) - ): + if file_size < max_mem or isinstance(m_file, gzip.GzipFile): for line in reversed(m_file.readlines()): yield (line if isinstance(line, str) else line.decode("utf-8")) @@ -230,20 +220,21 @@ def reverse_readline( buffer: str = "" m_file.seek(0, 2) + eof_pos = m_file.tell() # need end of file to skip first empty line while True: l_end_pos: int = buffer.rfind(l_end) - pt_pos: int = ( - m_file.tell() - ) # pointer position (also size of remaining file to read) + # Pointer position (also size of remaining file to read) + pt_pos: int = m_file.tell() # Line ending found within buffer if l_end_pos != -1: line = buffer[l_end_pos + len_l_end :] buffer = buffer[:l_end_pos] # buffer doesn't include l_end - if pt_pos != 0 or l_end_pos != 0: # TODO: why is this condition needed? - line += l_end - yield line + + # Skip first match (the last line ending) + if l_end_pos != eof_pos: + yield line + l_end # Line ending not in current buffer, load next block into the buffer elif pt_pos > 0: @@ -254,7 +245,7 @@ def reverse_readline( else: buffer += m_file.read(to_read).decode("utf-8") - # Move pointer forward # TODO: why pointer is moved forward again? 
+ # Move pointer forward m_file.seek(pt_pos - to_read) # Add a l_end to the start of file diff --git a/tests/test_io.py b/tests/test_io.py index 7f2b9b5e..d13cf32d 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -143,11 +143,11 @@ def test_file_with_empty_lines(self, l_end): with ScratchDir("."): # Test text file - with open(filename, "w", newline="", encoding="utf-8") as file: + with open(filename, "wb") as file: for line in contents: - file.write(line) + file.write(line.encode()) - with zopen(filename, mode="r") as file: + with open(filename, mode="r", newline="") as file: revert_contents = tuple(reverse_readline(file)) assert revert_contents[::-1] == contents @@ -175,14 +175,19 @@ def test_line_ending(self, l_end): file_name = "test_file.txt" with ScratchDir("."): - with open(file_name, "w", newline="", encoding="utf-8") as file: + with open(file_name, "wb") as file: for line in contents: - file.write(line) + file.write(line.encode()) # Test text mode with open(file_name, "r", encoding="utf-8") as file: for idx, line in enumerate(reverse_readline(file)): - assert line == contents[len(contents) - idx - 1] + # Open text in "r" mode would trigger OS + # line ending handing + assert ( + line.rstrip(os.linesep) + l_end + == contents[len(contents) - idx - 1] + ) assert isinstance(line, str) # # TODO: Test binary mode From 76c024389abc7b310f27e28a8a4749775df2cc9b Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 13 Sep 2024 16:42:07 +0800 Subject: [PATCH 48/96] significantly increase test --- src/monty/io.py | 17 ++++++++++++----- tests/test_io.py | 30 +++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index fb9db2bf..69246b9e 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -87,10 +87,17 @@ def _get_line_ending( first_line = file.buffer.readline() elif isinstance(file, (gzip.GzipFile, bz2.BZ2File)): first_line = file.readline() - file.seek(0) # reset pointer else: raise 
TypeError(f"Unknown file type {type(file).__name__}") + # TODO: more critical: make a copy of file, otherwise pointer of a + # iterator could change + # Reset pointer + try: + file.seek(0) # type: ignore[union-attr] + except AttributeError: + pass + # Return Unix "\n" line ending as default if file is empty if not first_line: warnings.warn("File is empty, return Unix line ending \n.", stacklevel=2) @@ -207,7 +214,7 @@ def reverse_readline( # Gzip files must use this method because there is no way to negative seek. if file_size < max_mem or isinstance(m_file, gzip.GzipFile): for line in reversed(m_file.readlines()): - yield (line if isinstance(line, str) else line.decode("utf-8")) + yield line if isinstance(line, str) else line.decode("utf-8") else: # For bz2 files, seek is expensive. It is therefore in our best @@ -220,7 +227,7 @@ def reverse_readline( buffer: str = "" m_file.seek(0, 2) - eof_pos = m_file.tell() # need end of file to skip first empty line + eof_pos = m_file.tell() # Needed to skip first match while True: l_end_pos: int = buffer.rfind(l_end) @@ -232,8 +239,8 @@ def reverse_readline( line = buffer[l_end_pos + len_l_end :] buffer = buffer[:l_end_pos] # buffer doesn't include l_end - # Skip first match (the last line ending) - if l_end_pos != eof_pos: + # Skip first match (which is the last line ending) + if eof_pos != l_end_pos: yield line + l_end # Line ending not in current buffer, load next block into the buffer diff --git a/tests/test_io.py b/tests/test_io.py index d13cf32d..e77e954e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -88,6 +88,14 @@ def test_unknown_line_ending(self): class TestReverseReadline: + """WARNING to future code: + "reverse_readline" has two branches, one is the in-RAM + reverse reading for un-supported file types or small files. 
+ As the default RAM threshold is "big" at around 4 MB (usually + people just write a few lines to test), you could easily be + testing/debugging the in-RAM branch all the time (me for example). + """ + NUMLINES = 3000 def test_reverse_readline(self): @@ -101,18 +109,19 @@ def test_reverse_readline(self): assert isinstance(line, str) assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" - def test_reverse_readline_fake_big(self): + @pytest.mark.parametrize("ram", [4, 4_000, 4_000_000]) + def test_reverse_readline_fake_big(self, ram): """ Make sure that large text files are read properly, - by setting max_mem to 0. + by setting max_mem to a very small value. """ with open( os.path.join(TEST_DIR, "3000_lines.txt"), mode="r", encoding="utf-8" ) as f: - for idx, line in enumerate(reverse_readline(f, max_mem=0)): - assert isinstance(line, str) + for idx, line in enumerate(reverse_readline(f, max_mem=ram)): assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" + @pytest.mark.skip("DEBUG: TODO") def test_reverse_readline_bz2(self): """ Make sure a file containing line numbers is read in reverse order, @@ -135,8 +144,9 @@ def test_empty_file(self): for _line in reverse_readline(f): pytest.fail("No error should be thrown.") + @pytest.mark.parametrize("ram", [4_000, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) - def test_file_with_empty_lines(self, l_end): + def test_file_with_empty_lines(self, l_end, ram): """Empty lines should not be skipped.""" contents = (f"line1{l_end}", f"{l_end}", f"line3{l_end}") filename = "test_empty_line.txt" @@ -148,9 +158,10 @@ def test_file_with_empty_lines(self, l_end): file.write(line.encode()) with open(filename, mode="r", newline="") as file: - revert_contents = tuple(reverse_readline(file)) + revert_contents = tuple(reverse_readline(file, max_mem=ram)) assert revert_contents[::-1] == contents + # TODO: finish following tests # # Test gzip file # gzip_filename = f"{filename}.gz" # with gzip.open(gzip_filename, "w") 
as file_out: @@ -169,8 +180,9 @@ def test_file_with_empty_lines(self, l_end): # revert_contents_bz2 = tuple(reverse_readline(bz2_filename)) # assert revert_contents_bz2[::-1] == contents + @pytest.mark.parametrize("ram", [4, 4_000, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) - def test_line_ending(self, l_end): + def test_line_ending(self, l_end, ram): contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") file_name = "test_file.txt" @@ -181,7 +193,7 @@ def test_line_ending(self, l_end): # Test text mode with open(file_name, "r", encoding="utf-8") as file: - for idx, line in enumerate(reverse_readline(file)): + for idx, line in enumerate(reverse_readline(file, max_mem=ram)): # Open text in "r" mode would trigger OS # line ending handing assert ( @@ -190,7 +202,7 @@ def test_line_ending(self, l_end): ) assert isinstance(line, str) - # # TODO: Test binary mode + # # TODO: Support/test binary mode # with open(file_name, "rb") as file: # for idx, line in enumerate(reverse_readline(file)): # assert line == contents[len(contents) - idx - 1] From 3b68d14070d528cd695197d5c82f2a6fff537fdf Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 13 Sep 2024 17:03:14 +0800 Subject: [PATCH 49/96] add warning for overly small mem size --- src/monty/io.py | 5 +++++ tests/test_io.py | 26 +++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 69246b9e..5d6aba48 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -217,6 +217,11 @@ def reverse_readline( yield line if isinstance(line, str) else line.decode("utf-8") else: + # RAM limit should be greater than block size, + # as file as read into RAM one block each time + if max_mem < blk_size: + warnings.warn(f"{max_mem=} smaller than {blk_size=}", stacklevel=2) + # For bz2 files, seek is expensive. It is therefore in our best # interest to maximize the block size within RAM usage limit. 
diff --git a/tests/test_io.py b/tests/test_io.py index e77e954e..226baa3e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -109,16 +109,18 @@ def test_reverse_readline(self): assert isinstance(line, str) assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" - @pytest.mark.parametrize("ram", [4, 4_000, 4_000_000]) - def test_reverse_readline_fake_big(self, ram): + def test_reverse_readline_fake_big(self): """ Make sure that large text files are read properly, by setting max_mem to a very small value. """ - with open( - os.path.join(TEST_DIR, "3000_lines.txt"), mode="r", encoding="utf-8" - ) as f: - for idx, line in enumerate(reverse_readline(f, max_mem=ram)): + with ( + open( + os.path.join(TEST_DIR, "3000_lines.txt"), mode="r", encoding="utf-8" + ) as f, + pytest.warns(match="max_mem=0 smaller than blk_size="), + ): + for idx, line in enumerate(reverse_readline(f, max_mem=0)): assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" @pytest.mark.skip("DEBUG: TODO") @@ -144,7 +146,7 @@ def test_empty_file(self): for _line in reverse_readline(f): pytest.fail("No error should be thrown.") - @pytest.mark.parametrize("ram", [4_000, 4_0000_000]) + @pytest.mark.parametrize("ram", [4_096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_file_with_empty_lines(self, l_end, ram): """Empty lines should not be skipped.""" @@ -180,7 +182,7 @@ def test_file_with_empty_lines(self, l_end, ram): # revert_contents_bz2 = tuple(reverse_readline(bz2_filename)) # assert revert_contents_bz2[::-1] == contents - @pytest.mark.parametrize("ram", [4, 4_000, 4_0000_000]) + @pytest.mark.parametrize("ram", [4096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_line_ending(self, l_end, ram): contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") @@ -194,8 +196,7 @@ def test_line_ending(self, l_end, ram): # Test text mode with open(file_name, "r", encoding="utf-8") as file: for idx, line in enumerate(reverse_readline(file, 
max_mem=ram)): - # Open text in "r" mode would trigger OS - # line ending handing + # OS would automatically change line ending in text mode assert ( line.rstrip(os.linesep) + l_end == contents[len(contents) - idx - 1] @@ -248,7 +249,10 @@ def test_empty_file(self): Make sure an empty file does not throw an error when reverse_readline is called, which was a problem with an earlier implementation. """ - with pytest.warns(match="File is empty, return Unix line ending \n."): + with ( + pytest.warns(match="File is empty, return Unix line ending \n."), + pytest.warns(match="trying to mmap an empty file"), + ): for _line in reverse_readfile(os.path.join(TEST_DIR, "empty_file.txt")): pytest.fail("No error should be thrown.") From cd3bbd1bc01fc6456f75fba37a622649187c4a1b Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 13 Sep 2024 17:21:53 +0800 Subject: [PATCH 50/96] update unit test --- tests/test_io.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index 226baa3e..30de8171 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -113,6 +113,9 @@ def test_reverse_readline_fake_big(self): """ Make sure that large text files are read properly, by setting max_mem to a very small value. + + TODO: when max_mem = 0, the first item generated is "\n", + but the sequential items are correct. 
""" with ( open( @@ -123,7 +126,9 @@ def test_reverse_readline_fake_big(self): for idx, line in enumerate(reverse_readline(f, max_mem=0)): assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" - @pytest.mark.skip("DEBUG: TODO") + def test_small_blk_size(self): + """TODO: test small block size.""" + def test_reverse_readline_bz2(self): """ Make sure a file containing line numbers is read in reverse order, @@ -133,7 +138,7 @@ def test_reverse_readline_bz2(self): with zopen(os.path.join(TEST_DIR, "myfile_bz2.bz2"), "rb") as f: for line in reverse_readline(f): lines.append(line) - assert lines == ["\n", "\n", "HelloWorld."] # test file has two empty lines + assert lines == ["\n", "HelloWorld.\n"] # test file has one empty line assert all(isinstance(line, str) for line in lines) def test_empty_file(self): @@ -146,10 +151,12 @@ def test_empty_file(self): for _line in reverse_readline(f): pytest.fail("No error should be thrown.") - @pytest.mark.parametrize("ram", [4_096, 4_0000_000]) + @pytest.mark.parametrize("ram", [4, 4_096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_file_with_empty_lines(self, l_end, ram): - """Empty lines should not be skipped.""" + """Empty lines should not be skipped. + Using a very small RAM size to force non in-RAM mode. 
+ """ contents = (f"line1{l_end}", f"{l_end}", f"line3{l_end}") filename = "test_empty_line.txt" @@ -182,9 +189,10 @@ def test_file_with_empty_lines(self, l_end, ram): # revert_contents_bz2 = tuple(reverse_readline(bz2_filename)) # assert revert_contents_bz2[::-1] == contents - @pytest.mark.parametrize("ram", [4096, 4_0000_000]) + @pytest.mark.parametrize("ram", [4, 4096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_line_ending(self, l_end, ram): + """Using a very small RAM size to force non in-RAM mode.""" contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") file_name = "test_file.txt" From 36a02f4f097115f8c7b910df4240ee010f425c4b Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 13 Sep 2024 17:46:54 +0800 Subject: [PATCH 51/96] add test for illegal usage --- src/monty/io.py | 22 +++++++++++++++++----- tests/test_io.py | 39 +++++++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 5d6aba48..01210b9c 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -164,7 +164,7 @@ def reverse_readfile( def reverse_readline( - m_file, + m_file, # TODO: expected type is unclear blk_size: int = 4096, max_mem: int = 4_000_000, ) -> Iterator[str]: @@ -179,6 +179,17 @@ def reverse_readline( Files larger than max_mem are read one block each time. + NOTE: this function expect a file stream, and m_file + should NOT be the name of the file. + + TODO: + - is it possible to support binary file stream + - Test gzip seek speed (not supported previously) + - Test bzip2 seek speed (any improvement) + https://stackoverflow.com/questions/25734252/ + why-is-seeking-from-the-end-of-a-file-allowed-for- + bzip2-files-and-not-gzip-files + Reference: Based on code by Peter Astrand , using modifications by Raymond Hettinger and Kevin German. 
@@ -186,7 +197,7 @@ def reverse_readline( file-backwards-yet-another-implementat/ Args: - m_file (File): File stream to read (backwards). + m_file: File stream to read (backwards). blk_size (int): The buffer size in bytes. Defaults to 4096. max_mem (int): The maximum amount of RAM to use in bytes, which determines when to reverse a file in-memory versus @@ -196,6 +207,10 @@ def reverse_readline( Yields: Lines from the back of the file. """ + # Check for illegal usage + if isinstance(m_file, str | Path): + raise TypeError("expect a file stream, not file name") + # Generate line ending l_end: Literal["\r\n", "\n"] = _get_line_ending(m_file) len_l_end: Literal[1, 2] = cast(Literal[1, 2], len(l_end)) @@ -224,9 +239,6 @@ def reverse_readline( # For bz2 files, seek is expensive. It is therefore in our best # interest to maximize the block size within RAM usage limit. - - # TODO: not sure if bzip2 has any improvement on seek, need test - # https://stackoverflow.com/questions/25734252/why-is-seeking-from-the-end-of-a-file-allowed-for-bzip2-files-and-not-gzip-files if isinstance(m_file, bz2.BZ2File): blk_size = min(max_mem, file_size) diff --git a/tests/test_io.py b/tests/test_io.py index 30de8171..e6cab33f 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -151,7 +151,7 @@ def test_empty_file(self): for _line in reverse_readline(f): pytest.fail("No error should be thrown.") - @pytest.mark.parametrize("ram", [4, 4_096, 4_0000_000]) + @pytest.mark.parametrize("ram", [4, 4096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_file_with_empty_lines(self, l_end, ram): """Empty lines should not be skipped. 
@@ -170,24 +170,25 @@ def test_file_with_empty_lines(self, l_end, ram): revert_contents = tuple(reverse_readline(file, max_mem=ram)) assert revert_contents[::-1] == contents - # TODO: finish following tests - # # Test gzip file - # gzip_filename = f"{filename}.gz" - # with gzip.open(gzip_filename, "w") as file_out: - # for line in contents: - # file_out.write(line.encode()) + # Test gzip file + gzip_filename = f"{filename}.gz" + with gzip.open(gzip_filename, "w") as file_out: + for line in contents: + file_out.write(line.encode()) - # revert_contents_gzip = tuple(reverse_readline(gzip_filename)) - # assert revert_contents_gzip[::-1] == contents + with gzip.open(gzip_filename) as g_file: + revert_contents_gzip = tuple(reverse_readline(g_file)) + assert revert_contents_gzip[::-1] == contents - # # Test bzip2 file - # bz2_filename = f"{filename}.bz2" - # with bz2.open(bz2_filename, "w") as file_out: - # for line in contents: - # file_out.write(line.encode()) + # Test bzip2 file + bz2_filename = f"{filename}.bz2" + with bz2.open(bz2_filename, "w") as file_out: + for line in contents: + file_out.write(line.encode()) - # revert_contents_bz2 = tuple(reverse_readline(bz2_filename)) - # assert revert_contents_bz2[::-1] == contents + with bz2.open(bz2_filename) as b_file: + revert_contents_bz2 = tuple(reverse_readline(b_file)) + assert revert_contents_bz2[::-1] == contents @pytest.mark.parametrize("ram", [4, 4096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) @@ -217,6 +218,12 @@ def test_line_ending(self, l_end, ram): # assert line == contents[len(contents) - idx - 1] # assert isinstance(line, str) + @pytest.mark.parametrize("file", ["./file", Path("./file")]) + def test_illegal_usage(self, file): + with pytest.raises(TypeError, match="expect a file stream, not file name"): + for _ in reverse_readline(file): + pass + class TestReverseReadfile: NUM_LINES = 3000 From 64579221023de1b5451abc654275dd3480f9f5fb Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" 
Date: Fri, 13 Sep 2024 19:05:24 +0800 Subject: [PATCH 52/96] use tuple for instance check for now --- src/monty/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/monty/io.py b/src/monty/io.py index 01210b9c..9b0e5f6e 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -208,7 +208,7 @@ def reverse_readline( Lines from the back of the file. """ # Check for illegal usage - if isinstance(m_file, str | Path): + if isinstance(m_file, (str, Path)): raise TypeError("expect a file stream, not file name") # Generate line ending From 14dad79d568b451d827e218ff57433e48ace7367 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 13 Sep 2024 19:08:48 +0800 Subject: [PATCH 53/96] avoid repeated len(l_end) call --- src/monty/io.py | 9 ++++----- tests/test_io.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 9b0e5f6e..ae8673e9 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -129,6 +129,7 @@ def reverse_readfile( """ # Get line ending l_end = _get_line_ending(filename) + len_l_end = len(l_end) with zopen(filename, "rb") as file: if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): @@ -147,7 +148,7 @@ def reverse_readfile( while file_size > 0: # Find line segment start and end positions seg_start_pos = filemap.rfind(l_end.encode(), 0, file_size) - sec_end_pos = file_size + len(l_end) + sec_end_pos = file_size + len_l_end # The first line (original) doesn't have an ending character at its start if seg_start_pos == -1: @@ -156,9 +157,7 @@ def reverse_readfile( # Skip the first match (the original last line ending character) elif file_size != len(filemap): yield ( - filemap[seg_start_pos + len(l_end) : sec_end_pos].decode( - "utf-8" - ) + filemap[seg_start_pos + len_l_end : sec_end_pos].decode("utf-8") ) file_size = seg_start_pos @@ -185,7 +184,7 @@ def reverse_readline( TODO: - is it possible to support binary file stream - Test gzip seek speed (not supported previously) - - Test bzip2 
seek speed (any improvement) + - Test bzip2 seek speed (for any improvement?) https://stackoverflow.com/questions/25734252/ why-is-seeking-from-the-end-of-a-file-allowed-for- bzip2-files-and-not-gzip-files diff --git a/tests/test_io.py b/tests/test_io.py index e6cab33f..2119c70e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -88,7 +88,7 @@ def test_unknown_line_ending(self): class TestReverseReadline: - """WARNING to future code: + """WARNING for future coder: "reverse_readline" has two branches, one is the in-RAM reverse reading for un-supported file types or small files. As the default RAM threshold is "big" at around 4 MB (usually From 306a8f12d55d86971c7c9c9169ad1ad668f2a322 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 13 Sep 2024 19:15:25 +0800 Subject: [PATCH 54/96] tweak unit test names --- tests/test_io.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index 2119c70e..35c1af0a 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -109,11 +109,13 @@ def test_reverse_readline(self): assert isinstance(line, str) assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" - def test_reverse_readline_fake_big(self): + def test_big_file(self): """ Make sure that large text files are read properly, by setting max_mem to a very small value. + TODO: rewrite test + TODO: when max_mem = 0, the first item generated is "\n", but the sequential items are correct. """ @@ -129,7 +131,7 @@ def test_reverse_readline_fake_big(self): def test_small_blk_size(self): """TODO: test small block size.""" - def test_reverse_readline_bz2(self): + def test_read_bz2(self): """ Make sure a file containing line numbers is read in reverse order, i.e. the first line that is read corresponds to the last line number. 
@@ -141,7 +143,7 @@ def test_reverse_readline_bz2(self): assert lines == ["\n", "HelloWorld.\n"] # test file has one empty line assert all(isinstance(line, str) for line in lines) - def test_empty_file(self): + def test_read_empty_file(self): """ Make sure an empty file does not throw an error when reverse_readline is called, which was a problem with an earlier implementation. @@ -153,7 +155,7 @@ def test_empty_file(self): @pytest.mark.parametrize("ram", [4, 4096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) - def test_file_with_empty_lines(self, l_end, ram): + def test_read_file_with_empty_lines(self, l_end, ram): """Empty lines should not be skipped. Using a very small RAM size to force non in-RAM mode. """ @@ -192,7 +194,7 @@ def test_file_with_empty_lines(self, l_end, ram): @pytest.mark.parametrize("ram", [4, 4096, 4_0000_000]) @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) - def test_line_ending(self, l_end, ram): + def test_different_line_endings(self, l_end, ram): """Using a very small RAM size to force non in-RAM mode.""" contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") file_name = "test_file.txt" @@ -219,10 +221,9 @@ def test_line_ending(self, l_end, ram): # assert isinstance(line, str) @pytest.mark.parametrize("file", ["./file", Path("./file")]) - def test_illegal_usage(self, file): + def test_illegal_file_type(self, file): with pytest.raises(TypeError, match="expect a file stream, not file name"): - for _ in reverse_readline(file): - pass + next(reverse_readline(file)) class TestReverseReadfile: @@ -239,7 +240,7 @@ def test_reverse_readfile(self): # OS would automatically convert line ending in text mode assert line == f"{str(self.NUM_LINES - idx)}{os.linesep}" - def test_reverse_readfile_gz(self): + def test_read_gz(self): """ Make sure a file containing line numbers is read in reverse order, i.e. the first line that is read corresponds to the last line number. 
@@ -249,7 +250,7 @@ def test_reverse_readfile_gz(self): assert isinstance(line, str) assert line == f"{str(self.NUM_LINES - idx)}\n" - def test_reverse_readfile_bz2(self): + def test_read_bz2(self): """ Make sure a file containing line numbers is read in reverse order, i.e. the first line that is read corresponds to the last line number. @@ -259,7 +260,7 @@ def test_reverse_readfile_bz2(self): assert isinstance(line, str) assert line == f"{str(self.NUM_LINES - idx)}\n" - def test_empty_file(self): + def test_read_empty_file(self): """ Make sure an empty file does not throw an error when reverse_readline is called, which was a problem with an earlier implementation. @@ -272,7 +273,7 @@ def test_empty_file(self): pytest.fail("No error should be thrown.") @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) - def test_file_with_empty_lines(self, l_end): + def test_read_file_with_empty_lines(self, l_end): """Empty lines should not be skipped.""" contents = (f"line1{l_end}", f"{l_end}", f"line3{l_end}") filename = "test_empty_line.txt" @@ -305,7 +306,7 @@ def test_file_with_empty_lines(self, l_end): assert revert_contents_bz2[::-1] == contents @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) - def test_line_ending(self, l_end): + def test_different_line_endings(self, l_end): contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") filename = "test_file.txt" From 168bc423f41131ede38bf4127d37257e4f4fd85a Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Fri, 13 Sep 2024 20:57:57 +0800 Subject: [PATCH 55/96] supress newline convert --- tests/test_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io.py b/tests/test_io.py index 35c1af0a..318225e4 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -104,7 +104,7 @@ def test_reverse_readline(self): order, i.e. the first line that is read corresponds to the last line. 
number """ - with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8") as f: + with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8", newline="") as f: for idx, line in enumerate(reverse_readline(f)): assert isinstance(line, str) assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" From dcdfde57a8802ad19f571810b70ee49c1840d208 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Sep 2024 12:58:12 +0000 Subject: [PATCH 56/96] pre-commit auto-fixes --- tests/test_io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_io.py b/tests/test_io.py index 318225e4..bee63cde 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -104,7 +104,9 @@ def test_reverse_readline(self): order, i.e. the first line that is read corresponds to the last line. number """ - with open(os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8", newline="") as f: + with open( + os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8", newline="" + ) as f: for idx, line in enumerate(reverse_readline(f)): assert isinstance(line, str) assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" From f255121b6e48f24d14f25d5d30ba85104856efce Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sat, 14 Sep 2024 19:46:22 +0800 Subject: [PATCH 57/96] support bufferedreader and clarify m_file type --- src/monty/io.py | 43 +++++++++++++++++++++++++------------------ tests/test_io.py | 38 ++++++++++++++++++++++++++------------ 2 files changed, 51 insertions(+), 30 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index ae8673e9..4b6d1b46 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -57,7 +57,12 @@ def zopen(filename: Union[str, Path], *args, **kwargs) -> IO: def _get_line_ending( - file: str | Path | io.TextIOWrapper, + file: str + | Path + | io.TextIOWrapper + | io.BufferedReader + | gzip.GzipFile + | bz2.BZ2File, ) -> Literal["\r\n", "\n"]: """Helper 
function to get line ending of a file. @@ -79,20 +84,19 @@ def _get_line_ending( Warnings: If file is empty, "\n" would be used as default. """ - # TODO: critical, read the last N (~2) chars instead of everything + # TODO: Read the last N chars instead of the entire line? if isinstance(file, (str, Path)): with zopen(file, "rb") as f: first_line = f.readline() elif isinstance(file, io.TextIOWrapper): first_line = file.buffer.readline() - elif isinstance(file, (gzip.GzipFile, bz2.BZ2File)): + elif isinstance(file, (io.BufferedReader, gzip.GzipFile, bz2.BZ2File)): first_line = file.readline() else: raise TypeError(f"Unknown file type {type(file).__name__}") - # TODO: more critical: make a copy of file, otherwise pointer of a - # iterator could change - # Reset pointer + # TODO: Make a copy of file if possible? otherwise pointer of a iterator could change + # Reset pointer to position 0 try: file.seek(0) # type: ignore[union-attr] except AttributeError: @@ -163,7 +167,7 @@ def reverse_readfile( def reverse_readline( - m_file, # TODO: expected type is unclear + m_file: io.BufferedReader | io.TextIOWrapper | gzip.GzipFile | bz2.BZ2File, blk_size: int = 4096, max_mem: int = 4_000_000, ) -> Iterator[str]: @@ -172,6 +176,10 @@ def reverse_readline( the file.readline function. This allows one to efficiently get data from the end of a file. + Supported file stream formats: + - TextIOWrapper (text mode) | BufferedReader (binary mode) + - gzip/bzip2 file stream + Cases where file would be read forwards and reversed in RAM: - If file size is smaller than RAM usage limit (max_mem). - For Gzip files, as reverse seeks are not supported. # TODO: now supported @@ -182,7 +190,6 @@ def reverse_readline( should NOT be the name of the file. TODO: - - is it possible to support binary file stream - Test gzip seek speed (not supported previously) - Test bzip2 seek speed (for any improvement?) 
https://stackoverflow.com/questions/25734252/ @@ -218,7 +225,7 @@ def reverse_readline( is_text: bool = isinstance(m_file, io.TextIOWrapper) try: - file_size = os.path.getsize(m_file.name) + file_size: int = os.path.getsize(m_file.name) except AttributeError: # Bz2 files do not have "name" attribute. # Just set file_size to max_mem for now. @@ -228,11 +235,11 @@ def reverse_readline( # Gzip files must use this method because there is no way to negative seek. if file_size < max_mem or isinstance(m_file, gzip.GzipFile): for line in reversed(m_file.readlines()): - yield line if isinstance(line, str) else line.decode("utf-8") + yield line if isinstance(line, str) else cast(bytes, line).decode("utf-8") else: # RAM limit should be greater than block size, - # as file as read into RAM one block each time + # as file is read into RAM one block each time. if max_mem < blk_size: warnings.warn(f"{max_mem=} smaller than {blk_size=}", stacklevel=2) @@ -247,7 +254,7 @@ def reverse_readline( while True: l_end_pos: int = buffer.rfind(l_end) - # Pointer position (also size of remaining file to read) + # Pointer position (also size of remaining file) pt_pos: int = m_file.tell() # Line ending found within buffer @@ -255,8 +262,8 @@ def reverse_readline( line = buffer[l_end_pos + len_l_end :] buffer = buffer[:l_end_pos] # buffer doesn't include l_end - # Skip first match (which is the last line ending) - if eof_pos != l_end_pos: + # Skip first match (the last line ending) + if l_end_pos != eof_pos: yield line + l_end # Line ending not in current buffer, load next block into the buffer @@ -264,9 +271,9 @@ def reverse_readline( to_read: int = min(blk_size, pt_pos) m_file.seek(pt_pos - to_read) if is_text: - buffer += m_file.read(to_read) + buffer += cast(str, m_file.read(to_read)) else: - buffer += m_file.read(to_read).decode("utf-8") + buffer += cast(bytes, m_file.read(to_read)).decode("utf-8") # Move pointer forward m_file.seek(pt_pos - to_read) @@ -275,8 +282,8 @@ def 
reverse_readline( if pt_pos == to_read: buffer = l_end + buffer - # Start of file (no more l_end found, and pt_pos at the start) - else: + # Start of file (no l_end found, and pt_pos at the start) + else: # l_end_pos == -1 and pt_post == 0 return diff --git a/tests/test_io.py b/tests/test_io.py index bee63cde..bece260c 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -25,6 +25,10 @@ def test_get_line_ending(self, l_end): """Test files with: Unix line ending (\n). Windows line ending (\r\n). + + For: + - Text file: both text mode and binary mode + - gzip file and bzip2 file """ test_file = "test_l_end.txt" test_line = f"This is a test{l_end}Second line{l_end}".encode() @@ -36,27 +40,32 @@ def test_get_line_ending(self, l_end): assert _get_line_ending(test_file) == l_end assert _get_line_ending(Path(test_file)) == l_end + # Test text mode with open(test_file, "r", encoding="utf-8") as f: assert _get_line_ending(f) == l_end + # Test binary mode + with open(test_file, "rb") as f: + assert _get_line_ending(f) == l_end + # Test gzip file gzip_filename = f"{test_file}.gz" with gzip.open(gzip_filename, "wb") as f: f.write(test_line) - # Opened file + # Opened file stream with gzip.open(gzip_filename, "rb") as f: assert _get_line_ending(f) == l_end # Filename directly assert _get_line_ending(gzip_filename) == l_end - # Test opened bzip2 file + # Test bzip2 file stream bz2_filename = f"{test_file}.bz2" with bz2.open(bz2_filename, "wb") as f: f.write(test_line) - # Opened file + # Opened file stream with bz2.open(bz2_filename, "rb") as f: assert _get_line_ending(f) == l_end @@ -104,6 +113,7 @@ def test_reverse_readline(self): order, i.e. the first line that is read corresponds to the last line. 
number """ + # Test text mode with open( os.path.join(TEST_DIR, "3000_lines.txt"), encoding="utf-8", newline="" ) as f: @@ -111,14 +121,19 @@ def test_reverse_readline(self): assert isinstance(line, str) assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" + # Test binary mode + with open(os.path.join(TEST_DIR, "3000_lines.txt"), mode="rb") as f: + for idx, line in enumerate(reverse_readline(f)): + assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" + def test_big_file(self): """ Make sure that large text files are read properly, by setting max_mem to a very small value. - TODO: rewrite test + TODO: rewrite test with a real big file - TODO: when max_mem = 0, the first item generated is "\n", + DEBUG: when max_mem = 0, the first item generated is "\n", but the sequential items are correct. """ with ( @@ -130,8 +145,8 @@ def test_big_file(self): for idx, line in enumerate(reverse_readline(f, max_mem=0)): assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" - def test_small_blk_size(self): - """TODO: test small block size.""" + def test_blk_size(self): + """TODO: test different block sizes.""" def test_read_bz2(self): """ @@ -216,11 +231,10 @@ def test_different_line_endings(self, l_end, ram): ) assert isinstance(line, str) - # # TODO: Support/test binary mode - # with open(file_name, "rb") as file: - # for idx, line in enumerate(reverse_readline(file)): - # assert line == contents[len(contents) - idx - 1] - # assert isinstance(line, str) + # Test binary mode + with open(file_name, "rb") as file: + for idx, line in enumerate(reverse_readline(file)): + assert line == contents[len(contents) - idx - 1] @pytest.mark.parametrize("file", ["./file", Path("./file")]) def test_illegal_file_type(self, file): From 57157ef2973c7077b6f8f13d844eaeac8769eff9 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sat, 14 Sep 2024 19:58:28 +0800 Subject: [PATCH 58/96] clarify and test missing last l_end char --- src/monty/io.py | 5 ++--- tests/test_io.py | 12 ++++++++++++ 
2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 4b6d1b46..32dfd3db 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -95,8 +95,7 @@ def _get_line_ending( else: raise TypeError(f"Unknown file type {type(file).__name__}") - # TODO: Make a copy of file if possible? otherwise pointer of a iterator could change - # Reset pointer to position 0 + # Reset pointer to start of file try: file.seek(0) # type: ignore[union-attr] except AttributeError: @@ -112,7 +111,7 @@ def _get_line_ending( if first_line.endswith(b"\n"): return "\n" - # It's likely the line is missing a line ending for its last line + # It's likely the line is missing a line ending for the first line raise ValueError(f"Unknown line ending in line {repr(first_line)}.") diff --git a/tests/test_io.py b/tests/test_io.py index bece260c..58b3289e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -72,6 +72,18 @@ def test_get_line_ending(self, l_end): # Filename directly assert _get_line_ending(bz2_filename) == l_end + @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) + def test_miss_last_l_end(self, l_end): + """Make sure this still works if the last l_end is missing.""" + test_line = f"This is a test{l_end}Second line".encode() + test_file = "test_l_end.txt" + + with ScratchDir("."): + with open(test_file, "wb") as f: + f.write(test_line) + + assert _get_line_ending(test_file) == l_end + def test_unknown_file_type(self): unknown_file = 123 From 4276032c67b644ca9355a97d53c88e7ff905157c Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sat, 14 Sep 2024 20:10:02 +0800 Subject: [PATCH 59/96] clean up docstring --- src/monty/io.py | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 32dfd3db..861d52e3 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -68,9 +68,9 @@ def _get_line_ending( This function assumes the file has a single consistent line 
ending. - WARNING: as per the POSIX standard, a line is: - A sequence of zero or more non- characters plus a terminating character. - as such this func would fail if the last line misses a terminating character. + WARNING: as per the POSIX standard, a line is: "A sequence of zero or + more non- characters plus a terminating character.", as such this func + would fail if the only line misses a terminating character. https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html Returns: @@ -78,8 +78,7 @@ def _get_line_ending( "\r\n": Windows line ending. Raises: - ValueError: If line ending is unknown, likely the file is - missing a terminating character. + ValueError: If line ending is unknown. Warnings: If file is empty, "\n" would be used as default. @@ -151,16 +150,16 @@ def reverse_readfile( while file_size > 0: # Find line segment start and end positions seg_start_pos = filemap.rfind(l_end.encode(), 0, file_size) - sec_end_pos = file_size + len_l_end + seg_end_pos = file_size + len_l_end - # The first line (original) doesn't have an ending character at its start + # The first (originally) line doesn't have an ending character at its head if seg_start_pos == -1: - yield (filemap[:sec_end_pos].decode("utf-8")) + yield (filemap[:seg_end_pos].decode("utf-8")) - # Skip the first match (the original last line ending character) + # Skip the first match (the last line ending character) elif file_size != len(filemap): yield ( - filemap[seg_start_pos + len_l_end : sec_end_pos].decode("utf-8") + filemap[seg_start_pos + len_l_end : seg_end_pos].decode("utf-8") ) file_size = seg_start_pos @@ -181,12 +180,7 @@ def reverse_readline( Cases where file would be read forwards and reversed in RAM: - If file size is smaller than RAM usage limit (max_mem). - - For Gzip files, as reverse seeks are not supported. # TODO: now supported - - Files larger than max_mem are read one block each time. 
- - NOTE: this function expect a file stream, and m_file - should NOT be the name of the file. + - Gzip files, as reverse seeks are not supported. # TODO: now supported TODO: - Test gzip seek speed (not supported previously) @@ -206,11 +200,17 @@ def reverse_readline( blk_size (int): The buffer size in bytes. Defaults to 4096. max_mem (int): The maximum amount of RAM to use in bytes, which determines when to reverse a file in-memory versus - seeking blocks of a file. For bz2 files, this sets - the block size. + seeking blocks of a file each time. For bz2 files, + this sets the block size. Yields: Lines from the back of the file. + + Raises: + TypeError: If m_file is the name of the file (expect file stream). + + Warnings: + If max_mem is smaller than blk_size. """ # Check for illegal usage if isinstance(m_file, (str, Path)): @@ -281,8 +281,8 @@ def reverse_readline( if pt_pos == to_read: buffer = l_end + buffer - # Start of file (no l_end found, and pt_pos at the start) - else: # l_end_pos == -1 and pt_post == 0 + # Start of file + else: # l_end_pos == -1 (not found) and pt_post == 0 (start) return @@ -384,8 +384,7 @@ def get_open_fds() -> int: """ Get the number of open file descriptors for current process. - Warnings: - Will only work on UNIX-like OS-es. + Warning, this will only work on UNIX-like OS. Returns: int: The number of open file descriptors for current process. From 33257c500cbc49198e46ccceaa3f5a7793eb515d Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sat, 14 Sep 2024 20:31:16 +0800 Subject: [PATCH 60/96] clarify myth around blk size and max mem --- src/monty/io.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 861d52e3..7bd6b56f 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -183,6 +183,8 @@ def reverse_readline( - Gzip files, as reverse seeks are not supported. 
# TODO: now supported TODO: + - Could buffer get overly large (buffer += to_read) if + rfind(l_end) missed several times in a row (line longer than blk_size)? - Test gzip seek speed (not supported previously) - Test bzip2 seek speed (for any improvement?) https://stackoverflow.com/questions/25734252/ @@ -197,11 +199,11 @@ def reverse_readline( Args: m_file: File stream to read (backwards). - blk_size (int): The buffer size in bytes. Defaults to 4096. - max_mem (int): The maximum amount of RAM to use in bytes, - which determines when to reverse a file in-memory versus - seeking blocks of a file each time. For bz2 files, - this sets the block size. + blk_size (int): The block size to read each time in bytes. + Defaults to 4096. # TODO: it's unclear what this actually controls? + max_mem (int): Threshold to determine when to reverse a file + in-memory versus reading blocks of a file each time. + For bz2 files, this sets the block size. Yields: Lines from the back of the file. From f75abd305aa1fb25deff95ba73146738f98e7987 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sat, 14 Sep 2024 20:51:45 +0800 Subject: [PATCH 61/96] fix skip last l_end (temporarily) --- src/monty/io.py | 5 +++-- tests/test_io.py | 10 ++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 7bd6b56f..f37a01fb 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -251,7 +251,7 @@ def reverse_readline( buffer: str = "" m_file.seek(0, 2) - eof_pos = m_file.tell() # Needed to skip first match + count = 0 # TODO: better way to skip first match while True: l_end_pos: int = buffer.rfind(l_end) @@ -264,7 +264,8 @@ def reverse_readline( buffer = buffer[:l_end_pos] # buffer doesn't include l_end # Skip first match (the last line ending) - if l_end_pos != eof_pos: + if count != 0: + count += 1 yield line + l_end # Line ending not in current buffer, load next block into the buffer diff --git a/tests/test_io.py b/tests/test_io.py index 
58b3289e..bd9c1f92 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -138,15 +138,10 @@ def test_reverse_readline(self): for idx, line in enumerate(reverse_readline(f)): assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" - def test_big_file(self): + def test_fake_big_file(self): """ Make sure that large text files are read properly, by setting max_mem to a very small value. - - TODO: rewrite test with a real big file - - DEBUG: when max_mem = 0, the first item generated is "\n", - but the sequential items are correct. """ with ( open( @@ -169,8 +164,7 @@ def test_read_bz2(self): with zopen(os.path.join(TEST_DIR, "myfile_bz2.bz2"), "rb") as f: for line in reverse_readline(f): lines.append(line) - assert lines == ["\n", "HelloWorld.\n"] # test file has one empty line - assert all(isinstance(line, str) for line in lines) + assert lines == ["HelloWorld.\n"] # test file has one single line def test_read_empty_file(self): """ From 709ed177e77918cb005664d2feab2cec765ace4a Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sat, 14 Sep 2024 21:18:26 +0800 Subject: [PATCH 62/96] fix skip first l_end match --- src/monty/io.py | 23 ++++++++++++----------- tests/test_io.py | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index f37a01fb..8148ece4 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -181,6 +181,8 @@ def reverse_readline( Cases where file would be read forwards and reversed in RAM: - If file size is smaller than RAM usage limit (max_mem). - Gzip files, as reverse seeks are not supported. 
# TODO: now supported + # WARNING: gzip might decompress in-RAM, and be careful about + the RAM usage (compression ratio) TODO: - Could buffer get overly large (buffer += to_read) if @@ -222,15 +224,11 @@ def reverse_readline( l_end: Literal["\r\n", "\n"] = _get_line_ending(m_file) len_l_end: Literal[1, 2] = cast(Literal[1, 2], len(l_end)) - # Check if the file stream is a buffered text stream (text instead of binary) - is_text: bool = isinstance(m_file, io.TextIOWrapper) - - try: + # Bz2 files do not have "name" attribute, just set to max_mem for now + if hasattr(m_file, "name"): file_size: int = os.path.getsize(m_file.name) - except AttributeError: - # Bz2 files do not have "name" attribute. - # Just set file_size to max_mem for now. - file_size = max_mem + 1 + else: + file_size = max_mem # If the file size is within desired RAM limit, just reverse it in memory. # Gzip files must use this method because there is no way to negative seek. @@ -249,13 +247,16 @@ def reverse_readline( if isinstance(m_file, bz2.BZ2File): blk_size = min(max_mem, file_size) + # Check if the file stream is text (instead of binary) + is_text: bool = isinstance(m_file, io.TextIOWrapper) + buffer: str = "" m_file.seek(0, 2) count = 0 # TODO: better way to skip first match while True: l_end_pos: int = buffer.rfind(l_end) - # Pointer position (also size of remaining file) + # Pointer position (also size of remaining block) pt_pos: int = m_file.tell() # Line ending found within buffer @@ -264,8 +265,8 @@ def reverse_readline( buffer = buffer[:l_end_pos] # buffer doesn't include l_end # Skip first match (the last line ending) - if count != 0: - count += 1 + count += 1 + if count != 1: yield line + l_end # Line ending not in current buffer, load next block into the buffer diff --git a/tests/test_io.py b/tests/test_io.py index bd9c1f92..e07da0bd 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -164,7 +164,7 @@ def test_read_bz2(self): with zopen(os.path.join(TEST_DIR, "myfile_bz2.bz2"), 
"rb") as f: for line in reverse_readline(f): lines.append(line) - assert lines == ["HelloWorld.\n"] # test file has one single line + assert lines == ["\n", "HelloWorld.\n"] # test file has one empty line def test_read_empty_file(self): """ From 4799c2426f93763cb8aa24ca9ef1530e0a9b3871 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sat, 14 Sep 2024 21:52:36 +0800 Subject: [PATCH 63/96] update test big file, but it's failing and need to fix --- src/monty/io.py | 6 +++--- tests/test_io.py | 31 ++++++++++++++++++++----------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 8148ece4..e5a52ebb 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -252,7 +252,7 @@ def reverse_readline( buffer: str = "" m_file.seek(0, 2) - count = 0 # TODO: better way to skip first match + skip_1st_l_end = False # TODO: better way to skip first match while True: l_end_pos: int = buffer.rfind(l_end) @@ -265,9 +265,9 @@ def reverse_readline( buffer = buffer[:l_end_pos] # buffer doesn't include l_end # Skip first match (the last line ending) - count += 1 - if count != 1: + if skip_1st_l_end: yield line + l_end + skip_1st_l_end = True # Line ending not in current buffer, load next block into the buffer elif pt_pos > 0: diff --git a/tests/test_io.py b/tests/test_io.py index e07da0bd..18d8365e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -138,19 +138,28 @@ def test_reverse_readline(self): for idx, line in enumerate(reverse_readline(f)): assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" - def test_fake_big_file(self): + @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) + def test_big_file(self, l_end): """ - Make sure that large text files are read properly, - by setting max_mem to a very small value. + Test read big file. + + A file of 300,000 lines is about 2 MB, but the default max_mem + is still around 4 MB, so we have to reduce it. 
""" - with ( - open( - os.path.join(TEST_DIR, "3000_lines.txt"), mode="r", encoding="utf-8" - ) as f, - pytest.warns(match="max_mem=0 smaller than blk_size="), - ): - for idx, line in enumerate(reverse_readline(f, max_mem=0)): - assert line == f"{str(self.NUMLINES - idx)}{os.linesep}" + file_name = "big_file.txt" + num_lines = 300_000 + + with ScratchDir("."): + # Write test file (~ 2 MB) + with open(file_name, "wb") as file: + for num in range(1, num_lines + 1): + file.write(f"{num}{l_end}".encode()) + + assert os.path.getsize(file_name) > 1_000_000 # 1 MB + + with open(file_name) as file: + for idx, line in enumerate(reverse_readline(file, max_mem=4096)): + assert line == f"{str(num_lines - idx)}{os.linesep}" def test_blk_size(self): """TODO: test different block sizes.""" From 702e6ca6de1a0dea89b54e25ad23db25540922d4 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sat, 14 Sep 2024 22:24:18 +0800 Subject: [PATCH 64/96] tweak comment --- src/monty/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/monty/io.py b/src/monty/io.py index e5a52ebb..2b44ffb7 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -256,7 +256,7 @@ def reverse_readline( while True: l_end_pos: int = buffer.rfind(l_end) - # Pointer position (also size of remaining block) + # Pointer position (also size of remaining file) pt_pos: int = m_file.tell() # Line ending found within buffer From d0a0fdaf4826a637d7538f89c2e9589cd6afa5a2 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sun, 15 Sep 2024 11:01:54 +0800 Subject: [PATCH 65/96] update TODO list --- src/monty/io.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 2b44ffb7..df217efb 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -82,8 +82,11 @@ def _get_line_ending( Warnings: If file is empty, "\n" would be used as default. + + TODO: + - Read the last N chars instead of the entire line? 
+ - Unit test assert file.tell() is at start of file """ - # TODO: Read the last N chars instead of the entire line? if isinstance(file, (str, Path)): with zopen(file, "rb") as f: first_line = f.readline() @@ -94,11 +97,9 @@ def _get_line_ending( else: raise TypeError(f"Unknown file type {type(file).__name__}") - # Reset pointer to start of file - try: - file.seek(0) # type: ignore[union-attr] - except AttributeError: - pass + # Reset pointer to start of file if possible + if hasattr(file, "seek"): + file.seek(0) # Return Unix "\n" line ending as default if file is empty if not first_line: @@ -180,18 +181,15 @@ def reverse_readline( Cases where file would be read forwards and reversed in RAM: - If file size is smaller than RAM usage limit (max_mem). - - Gzip files, as reverse seeks are not supported. # TODO: now supported + - Gzip files, as reverse seeks are not supported. # WARNING: gzip might decompress in-RAM, and be careful about - the RAM usage (compression ratio) + the RAM usage (compression ratio) # TODO: confirm this TODO: - Could buffer get overly large (buffer += to_read) if - rfind(l_end) missed several times in a row (line longer than blk_size)? + rfind(l_end) missed several times in a row (line longer + than blk_size)? Need to profile RAM usage. - Test gzip seek speed (not supported previously) - - Test bzip2 seek speed (for any improvement?) 
- https://stackoverflow.com/questions/25734252/ - why-is-seeking-from-the-end-of-a-file-allowed-for- - bzip2-files-and-not-gzip-files Reference: Based on code by Peter Astrand , using From 5af7145cecdb4389fb0f2e2fca6bd20cf8ecb029 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sun, 15 Sep 2024 11:41:35 +0800 Subject: [PATCH 66/96] fix incorrect buffer refill position --- src/monty/io.py | 8 ++++---- tests/test_io.py | 8 +++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index df217efb..698ed7a3 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -200,7 +200,7 @@ def reverse_readline( Args: m_file: File stream to read (backwards). blk_size (int): The block size to read each time in bytes. - Defaults to 4096. # TODO: it's unclear what this actually controls? + Defaults to 4096. # TODO: unclear what this actually controls? max_mem (int): Threshold to determine when to reverse a file in-memory versus reading blocks of a file each time. For bz2 files, this sets the block size. 
@@ -272,9 +272,9 @@ def reverse_readline( to_read: int = min(blk_size, pt_pos) m_file.seek(pt_pos - to_read) if is_text: - buffer += cast(str, m_file.read(to_read)) + buffer = cast(str, m_file.read(to_read)) + buffer else: - buffer += cast(bytes, m_file.read(to_read)).decode("utf-8") + buffer = cast(bytes, m_file.read(to_read)).decode("utf-8") + buffer # Move pointer forward m_file.seek(pt_pos - to_read) @@ -284,7 +284,7 @@ def reverse_readline( buffer = l_end + buffer # Start of file - else: # l_end_pos == -1 (not found) and pt_post == 0 (start) + else: # l_end_pos == -1 and pt_post == 0 return diff --git a/tests/test_io.py b/tests/test_io.py index 18d8365e..c77ac8e3 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -157,10 +157,16 @@ def test_big_file(self, l_end): assert os.path.getsize(file_name) > 1_000_000 # 1 MB - with open(file_name) as file: + # Test text mode + with open(file_name, mode="r", encoding="utf-8") as file: for idx, line in enumerate(reverse_readline(file, max_mem=4096)): assert line == f"{str(num_lines - idx)}{os.linesep}" + # Test binary mode + with open(file_name, mode="rb") as file: + for idx, line in enumerate(reverse_readline(file, max_mem=4096)): + assert line == f"{str(num_lines - idx)}{l_end}" + def test_blk_size(self): """TODO: test different block sizes.""" From 384c231b5d178cb7282753be1ac8e29db2f95bcd Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sun, 15 Sep 2024 11:45:50 +0800 Subject: [PATCH 67/96] suppress OS l_end translate --- tests/test_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index c77ac8e3..fc42b117 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -158,9 +158,9 @@ def test_big_file(self, l_end): assert os.path.getsize(file_name) > 1_000_000 # 1 MB # Test text mode - with open(file_name, mode="r", encoding="utf-8") as file: + with open(file_name, mode="r", encoding="utf-8", newline="") as file: for idx, line in 
enumerate(reverse_readline(file, max_mem=4096)): - assert line == f"{str(num_lines - idx)}{os.linesep}" + assert line == f"{str(num_lines - idx)}{l_end}" # Test binary mode with open(file_name, mode="rb") as file: From 61022638a971db5fe6b166f33bb34e57073be558 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sun, 15 Sep 2024 11:54:06 +0800 Subject: [PATCH 68/96] suppress unrelated warnings --- tests/test_io.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index fc42b117..88fec84f 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -3,6 +3,7 @@ import bz2 import gzip import os +import warnings from pathlib import Path import pytest @@ -167,9 +168,6 @@ def test_big_file(self, l_end): for idx, line in enumerate(reverse_readline(file, max_mem=4096)): assert line == f"{str(num_lines - idx)}{l_end}" - def test_blk_size(self): - """TODO: test different block sizes.""" - def test_read_bz2(self): """ Make sure a file containing line numbers is read in reverse order, @@ -197,6 +195,10 @@ def test_read_file_with_empty_lines(self, l_end, ram): """Empty lines should not be skipped. Using a very small RAM size to force non in-RAM mode. 
""" + warnings.filterwarnings( + "ignore", message="max_mem=4 smaller than blk_size=4096" + ) + contents = (f"line1{l_end}", f"{l_end}", f"line3{l_end}") filename = "test_empty_line.txt" @@ -234,6 +236,10 @@ def test_read_file_with_empty_lines(self, l_end, ram): @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_different_line_endings(self, l_end, ram): """Using a very small RAM size to force non in-RAM mode.""" + warnings.filterwarnings( + "ignore", message="max_mem=4 smaller than blk_size=4096" + ) + contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") file_name = "test_file.txt" From e9c22ed93f72f6c4745e1f8fe58b03b86a827be1 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sun, 15 Sep 2024 11:58:04 +0800 Subject: [PATCH 69/96] check file pointer reset --- src/monty/io.py | 3 +-- tests/test_io.py | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 698ed7a3..99440353 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -85,7 +85,6 @@ def _get_line_ending( TODO: - Read the last N chars instead of the entire line? - - Unit test assert file.tell() is at start of file """ if isinstance(file, (str, Path)): with zopen(file, "rb") as f: @@ -200,7 +199,7 @@ def reverse_readline( Args: m_file: File stream to read (backwards). blk_size (int): The block size to read each time in bytes. - Defaults to 4096. # TODO: unclear what this actually controls? + Defaults to 4096. max_mem (int): Threshold to determine when to reverse a file in-memory versus reading blocks of a file each time. For bz2 files, this sets the block size. 
diff --git a/tests/test_io.py b/tests/test_io.py index 88fec84f..69d21cd1 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -43,11 +43,15 @@ def test_get_line_ending(self, l_end): # Test text mode with open(test_file, "r", encoding="utf-8") as f: + start_pot = f.tell() assert _get_line_ending(f) == l_end + assert f.tell() == start_pot # Test binary mode with open(test_file, "rb") as f: + start_pot = f.tell() assert _get_line_ending(f) == l_end + assert f.tell() == start_pot # Test gzip file gzip_filename = f"{test_file}.gz" @@ -56,7 +60,9 @@ def test_get_line_ending(self, l_end): # Opened file stream with gzip.open(gzip_filename, "rb") as f: + start_pot = f.tell() assert _get_line_ending(f) == l_end + assert f.tell() == start_pot # Filename directly assert _get_line_ending(gzip_filename) == l_end @@ -68,7 +74,9 @@ def test_get_line_ending(self, l_end): # Opened file stream with bz2.open(bz2_filename, "rb") as f: + start_pot = f.tell() assert _get_line_ending(f) == l_end + assert f.tell() == start_pot # Filename directly assert _get_line_ending(bz2_filename) == l_end From 0be5a85e430e335d5efae8b6cc0375aad4ca83d8 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sun, 15 Sep 2024 17:07:35 +0800 Subject: [PATCH 70/96] track benchmark script in case we need it someday --- benchmark_monty.py | 186 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 benchmark_monty.py diff --git a/benchmark_monty.py b/benchmark_monty.py new file mode 100644 index 00000000..f8f260eb --- /dev/null +++ b/benchmark_monty.py @@ -0,0 +1,186 @@ +"""Utility script for monty reverse reader speed benchmark. + +Test matrix: +- Different Python versions. +- File of various sizes. +- Find the last line, 75 % line and 50 % line, and compare with reading from forwards. 
+""" + +from __future__ import annotations + +import os +import platform +import subprocess +import time + +from monty.io import reverse_readline + +# Test config +FILE_SIZES_MB = (1, 10, 100, 500, 1000, 5000) +PYTHON_VERS = ("3.12",) + +# Env config +CONDA_PATH = "/opt/anaconda3" +PR_URL = "git+https://github.com/DanielYang59/monty.git@readline-line-ending" + +ENV_NAME = "monty_benchmark_env" + + +def prepare_conda_env(python_version, from_url=False): + """Create conda environment, install monty, and get Python version.""" + subprocess.run( + [ + f"{CONDA_PATH}/bin/conda", + "create", + "-y", + "-n", + ENV_NAME, + f"python={python_version}", + ], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + print(f"Conda environment {ENV_NAME} created with Python {python_version}.") + + # Install monty + install_cmd = PR_URL if from_url else "monty" + subprocess.run( + [f"{CONDA_PATH}/envs/{ENV_NAME}/bin/pip", "install", install_cmd], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + print(f"monty installed {'from URL' if from_url else 'from PyPI'}.") + + # Get Python version + result = subprocess.run( + [f"{CONDA_PATH}/envs/{ENV_NAME}/bin/python", "--version"], + capture_output=True, + text=True, + ) + return result.stdout.strip() + + +def create_test_file(file_path, target_size_mb): + """Creates a text file with lines until the target size is reached.""" + target_size = target_size_mb * 1024 * 1024 # Convert MB to bytes + line_number = 1 + + with open(file_path, "w") as f: + while os.path.getsize(file_path) < target_size: + f.write(f"This is line number {line_number}\n") + line_number += 1 + + total_lines = line_number - 1 + print(f"Test file of size {target_size_mb} MB created with {total_lines} lines.") + return total_lines + + +def test_readline(file_path, total_lines, readline_func, func_name="readline"): + """General function to test reading lines using a given readline function.""" + + # Read the last line + 
start = time.perf_counter() + with open(file_path, "r") as f: + _last_line = ( + next(readline_func(f)) + if func_name == "reverse_readline" + else f.readlines()[-1] + ) + last_time = time.perf_counter() - start + + # Calculate the 75% and 50% line numbers + line_75_idx = int(0.75 * total_lines) + line_50_idx = int(0.5 * total_lines) + + # Read the 75% line + start = time.perf_counter() # More accurate timer + with open(file_path, "r") as f: + if func_name == "reverse_readline": + for idx, _line in enumerate(readline_func(f), 1): + if idx == total_lines - line_75_idx: + break + else: + _line = f.readlines()[line_75_idx] + time_75 = time.perf_counter() - start + + # Read the 50% line + start = time.perf_counter() # More accurate timer + with open(file_path, "r") as f: + if func_name == "reverse_readline": + for idx, _line in enumerate(readline_func(f), 1): + if idx == total_lines - line_50_idx: + break + else: + _line = f.readlines()[line_50_idx] + time_50 = time.perf_counter() - start + + print( + f"{func_name.capitalize()} - Last line {total_lines} read, time taken: {last_time:.8f} s." + ) + print( + f"{func_name.capitalize()} - 75% line {line_75_idx} read, time taken: {time_75:.8f} s." + ) + print( + f"{func_name.capitalize()} - 50% line {line_50_idx} read, time taken: {time_50:.8f} s." + ) + + return last_time, time_75, time_50 + + +def run_benchmark(file_size_mb, python_version): + """Run benchmark for both monty and built-in readline.""" + print( + f"\nRunning benchmark for Python {python_version} and file size {file_size_mb} MB." 
+ ) + + test_file = f"test_file_{file_size_mb}MB.txt" + total_lines = create_test_file(test_file, file_size_mb) + + print(f"\nTesting reverse_readline with file size {file_size_mb} MB...") + test_readline(test_file, total_lines, reverse_readline, "reverse_readline") + + print(f"\nTesting built-in readline with file size {file_size_mb} MB...") + test_readline(test_file, total_lines, iter, "readline") + + os.remove(test_file) + + +if __name__ == "__main__": + # Show OS info + os_info = platform.platform() + print(f"\nRunning on OS: {os_info}") + + for python_version in PYTHON_VERS: + for from_url in (False, True): + try: + source_type = "from URL" if from_url else "from PyPI" + print( + f"\n--- Test started for Python {python_version} ({source_type}) ---" + ) + + # Prepare the environment (create conda env and install monty) + installed_python_version = prepare_conda_env( + python_version, from_url=from_url + ) + + for file_size_mb in FILE_SIZES_MB: + # Run benchmark + run_benchmark(file_size_mb, installed_python_version) + + finally: + subprocess.run( + [ + f"{CONDA_PATH}/bin/conda", + "remove", + "-y", + "--name", + ENV_NAME, + "--all", + ], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + print(f"Conda environment {ENV_NAME} removed.") From edfa0cb0a87fd36d48ad6765342e136e52db64af Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sun, 15 Sep 2024 17:07:35 +0800 Subject: [PATCH 71/96] track benchmark script in case we need it someday --- benchmark/benchmark_monty.py | 190 +++++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 benchmark/benchmark_monty.py diff --git a/benchmark/benchmark_monty.py b/benchmark/benchmark_monty.py new file mode 100644 index 00000000..fedcc124 --- /dev/null +++ b/benchmark/benchmark_monty.py @@ -0,0 +1,190 @@ +"""Utility script for monty reverse reader speed benchmark. + +Test matrix: +- Different Python versions. +- File of various sizes. 
+- Find the last line, 75 % line and 50 % line, and compare with reading from forwards. +""" + +from __future__ import annotations + +import importlib +import os +import platform +import subprocess +import time + +import monty.io + +# Test config +FILE_SIZES_MB = (1, 10, 100, 500, 1000, 5000) +PYTHON_VERS = ("3.12",) + +# Env config +CONDA_PATH = "/opt/anaconda3" +PR_URL = "git+https://github.com/DanielYang59/monty.git@readline-line-ending" + +ENV_NAME = "monty_benchmark_env" + + +def prepare_conda_env(python_version, from_url=False): + """Create conda environment, install monty, and get Python version.""" + subprocess.run( + [ + f"{CONDA_PATH}/bin/conda", + "create", + "-y", + "-n", + ENV_NAME, + f"python={python_version}", + ], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + print(f"Conda environment {ENV_NAME} created with Python {python_version}.") + + # Install monty + install_cmd = PR_URL if from_url else "monty" + subprocess.run( + [f"{CONDA_PATH}/envs/{ENV_NAME}/bin/pip", "install", install_cmd], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + print(f"monty installed {'from URL' if from_url else 'from PyPI'}.") + + # Get Python version + result = subprocess.run( + [f"{CONDA_PATH}/envs/{ENV_NAME}/bin/python", "--version"], + capture_output=True, + text=True, + ) + return result.stdout.strip() + + +def create_test_file(file_path, target_size_mb): + """Creates a text file with lines until the target size is reached.""" + target_size = target_size_mb * 1024 * 1024 # Convert MB to bytes + line_number = 1 + + with open(file_path, "w") as f: + while os.path.getsize(file_path) < target_size: + f.write(f"This is line number {line_number}\n") + line_number += 1 + + total_lines = line_number - 1 + print(f"Test file of size {target_size_mb} MB created with {total_lines} lines.") + return total_lines + + +def test_readline(file_path, total_lines, readline_func, func_name="readline"): + """General function to 
test reading lines using a given readline function.""" + + # Read the last line + start = time.perf_counter() + with open(file_path, "r") as f: + _last_line = ( + next(readline_func(f)) + if func_name == "reverse_readline" + else f.readlines()[-1] + ) + last_time = time.perf_counter() - start + + # Calculate the 75% and 50% line numbers + line_75_idx = int(0.75 * total_lines) + line_50_idx = int(0.5 * total_lines) + + # Read the 75% line + start = time.perf_counter() # More accurate timer + with open(file_path, "r") as f: + if func_name == "reverse_readline": + for idx, _line in enumerate(readline_func(f), 1): + if idx == total_lines - line_75_idx: + break + else: + _line = f.readlines()[line_75_idx] + time_75 = time.perf_counter() - start + + # Read the 50% line + start = time.perf_counter() # More accurate timer + with open(file_path, "r") as f: + if func_name == "reverse_readline": + for idx, _line in enumerate(readline_func(f), 1): + if idx == total_lines - line_50_idx: + break + else: + _line = f.readlines()[line_50_idx] + time_50 = time.perf_counter() - start + + print( + f"{func_name.capitalize()} - Last line {total_lines} read, time taken: {last_time:.8f} s." + ) + print( + f"{func_name.capitalize()} - 75% line {line_75_idx} read, time taken: {time_75:.8f} s." + ) + print( + f"{func_name.capitalize()} - 50% line {line_50_idx} read, time taken: {time_50:.8f} s." + ) + + return last_time, time_75, time_50 + + +def run_benchmark(file_size_mb, python_version): + """Run benchmark for both monty and built-in readline.""" + print( + f"\nRunning benchmark for Python {python_version} and file size {file_size_mb} MB." 
+ ) + + importlib.reload(monty.io) + from monty.io import reverse_readline + + test_file = f"test_file_{file_size_mb}MB.txt" + total_lines = create_test_file(test_file, file_size_mb) + + print(f"\nTesting reverse_readline with file size {file_size_mb} MB...") + test_readline(test_file, total_lines, reverse_readline, "reverse_readline") + + print(f"\nTesting built-in readline with file size {file_size_mb} MB...") + test_readline(test_file, total_lines, iter, "readline") + + os.remove(test_file) + + +if __name__ == "__main__": + # Show OS info + os_info = platform.platform() + print(f"\nRunning on OS: {os_info}") + + for python_version in PYTHON_VERS: + for from_url in (False, True): + try: + source_type = "from URL" if from_url else "from PyPI" + print( + f"\n--- Test started for Python {python_version} ({source_type}) ---" + ) + + # Prepare the environment (create conda env and install monty) + installed_python_version = prepare_conda_env( + python_version, from_url=from_url + ) + + for file_size_mb in FILE_SIZES_MB: + # Run benchmark + run_benchmark(file_size_mb, installed_python_version) + + finally: + subprocess.run( + [ + f"{CONDA_PATH}/bin/conda", + "remove", + "-y", + "--name", + ENV_NAME, + "--all", + ], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + print(f"Conda environment {ENV_NAME} removed.") From 8714cb4c055734b305efb454445440783a903738 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sun, 15 Sep 2024 19:47:55 +0800 Subject: [PATCH 72/96] simplify test script, make env install manual --- benchmark/benchmark.py | 153 ++++++++++++++++++++++++++++ benchmark/benchmark_monty.py | 190 ----------------------------------- 2 files changed, 153 insertions(+), 190 deletions(-) create mode 100644 benchmark/benchmark.py delete mode 100644 benchmark/benchmark_monty.py diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py new file mode 100644 index 00000000..dfdf55ec --- /dev/null +++ b/benchmark/benchmark.py @@ -0,0 +1,153 
@@ +"""Utility script for monty reverse reader speed benchmark. +- File of various sizes. +- Find the last line, 75 % line and 50 % line, and compare with reading from forwards. +""" + +from __future__ import annotations + +import os +import platform +import sys +import time + +from monty.io import reverse_readfile, reverse_readline + +# Test config +FILE_SIZES_MB = (1, 10, 100, 500, 1000, 5000) + + +def create_test_file(file_path, target_size_mb): + """Creates a text file with lines until the target size is reached.""" + target_size = target_size_mb * 1024 * 1024 # Convert MB to bytes + line_number = 1 + + with open(file_path, "w") as f: + while os.path.getsize(file_path) < target_size: + f.write(f"This is line number {line_number}\n") + line_number += 1 + + total_lines = line_number - 1 + print(f"Test file of size {target_size_mb} MB created with {total_lines} lines.") + return total_lines + + +def print_separator(title: str): + print(f"{title}".center(80, "=")) + print("") + + +def test_builtin_readline(file_path, total_lines): + """Test built-in readline function.""" + start = time.perf_counter() + with open(file_path, "r") as f: + _last_line = f.readlines()[-1] + last_time = time.perf_counter() - start + + line_75_idx = int(0.75 * total_lines) + line_50_idx = int(0.5 * total_lines) + + start = time.perf_counter() + with open(file_path, "r") as f: + _line = f.readlines()[line_75_idx] + time_75 = time.perf_counter() - start + + start = time.perf_counter() + with open(file_path, "r") as f: + _line = f.readlines()[line_50_idx] + time_50 = time.perf_counter() - start + + print_separator("Built-in readline") + print(f"Last line {total_lines} read, time taken: {last_time:.8f} s.") + print(f"75% line {line_75_idx} read, time taken: {time_75:.8f} s.") + print(f"50% line {line_50_idx} read, time taken: {time_50:.8f} s.") + print_separator("End of Built-in readline") + + return last_time, time_75, time_50 + + +def test_reverse_readline(file_path, total_lines): + """Test 
reverse_readline function.""" + start = time.perf_counter() + with open(file_path, "r") as f: + _last_line = next(reverse_readline(f)) + last_time = time.perf_counter() - start + + line_75_idx = int(0.75 * total_lines) + line_50_idx = int(0.5 * total_lines) + + start = time.perf_counter() + with open(file_path, "r") as f: + for idx, _line in enumerate(reverse_readline(f), 1): + if idx == total_lines - line_75_idx: + break + time_75 = time.perf_counter() - start + + start = time.perf_counter() + with open(file_path, "r") as f: + for idx, _line in enumerate(reverse_readline(f), 1): + if idx == total_lines - line_50_idx: + break + time_50 = time.perf_counter() - start + + print_separator("reverse_readline") + print(f"Last line {total_lines} read, time taken: {last_time:.8f} s.") + print(f"75% line {line_75_idx} read, time taken: {time_75:.8f} s.") + print(f"50% line {line_50_idx} read, time taken: {time_50:.8f} s.") + print_separator("End of reverse_readline") + + return last_time, time_75, time_50 + + +def test_reverse_readfile(file_path, total_lines): + """Test reverse_readfile function.""" + start = time.perf_counter() + _last_line = next(reverse_readfile(file_path)) + last_time = time.perf_counter() - start + + line_75_idx = int(0.75 * total_lines) + line_50_idx = int(0.5 * total_lines) + + start = time.perf_counter() + for idx, _line in enumerate(reverse_readfile(file_path), 1): + if idx == total_lines - line_75_idx: + break + time_75 = time.perf_counter() - start + + start = time.perf_counter() + for idx, _line in enumerate(reverse_readfile(file_path), 1): + if idx == total_lines - line_50_idx: + break + time_50 = time.perf_counter() - start + + print_separator("reverse_readfile") + print(f"Last line {total_lines} read, time taken: {last_time:.8f} s.") + print(f"75% line {line_75_idx} read, time taken: {time_75:.8f} s.") + print(f"50% line {line_50_idx} read, time taken: {time_50:.8f} s.") + print_separator("End of reverse_readfile") + + return last_time, 
time_75, time_50 + + +def run_benchmark(file_size_mb): + """Run benchmark for all test functions.""" + print_separator(f"Benchmarking file size: {file_size_mb} MB") + + test_file = f"test_file_{file_size_mb}MB.txt" + total_lines = create_test_file(test_file, file_size_mb) + + test_builtin_readline(test_file, total_lines) + test_reverse_readline(test_file, total_lines) + test_reverse_readfile(test_file, total_lines) + + os.remove(test_file) + + +if __name__ == "__main__": + # Show OS info + os_info = platform.platform() + python_version = sys.version.split()[0] + print(f"\nRunning on OS: {os_info}, Python {python_version}") + + # Run benchmark for each file size + for file_size_mb in FILE_SIZES_MB: + run_benchmark(file_size_mb) diff --git a/benchmark/benchmark_monty.py b/benchmark/benchmark_monty.py deleted file mode 100644 index fedcc124..00000000 --- a/benchmark/benchmark_monty.py +++ /dev/null @@ -1,190 +0,0 @@ -"""Utility script for monty reverse reader speed benchmark. - -Test matrix: -- Different Python versions. -- File of various sizes. -- Find the last line, 75 % line and 50 % line, and compare with reading from forwards. 
-""" - -from __future__ import annotations - -import importlib -import os -import platform -import subprocess -import time - -import monty.io - -# Test config -FILE_SIZES_MB = (1, 10, 100, 500, 1000, 5000) -PYTHON_VERS = ("3.12",) - -# Env config -CONDA_PATH = "/opt/anaconda3" -PR_URL = "git+https://github.com/DanielYang59/monty.git@readline-line-ending" - -ENV_NAME = "monty_benchmark_env" - - -def prepare_conda_env(python_version, from_url=False): - """Create conda environment, install monty, and get Python version.""" - subprocess.run( - [ - f"{CONDA_PATH}/bin/conda", - "create", - "-y", - "-n", - ENV_NAME, - f"python={python_version}", - ], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - print(f"Conda environment {ENV_NAME} created with Python {python_version}.") - - # Install monty - install_cmd = PR_URL if from_url else "monty" - subprocess.run( - [f"{CONDA_PATH}/envs/{ENV_NAME}/bin/pip", "install", install_cmd], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - print(f"monty installed {'from URL' if from_url else 'from PyPI'}.") - - # Get Python version - result = subprocess.run( - [f"{CONDA_PATH}/envs/{ENV_NAME}/bin/python", "--version"], - capture_output=True, - text=True, - ) - return result.stdout.strip() - - -def create_test_file(file_path, target_size_mb): - """Creates a text file with lines until the target size is reached.""" - target_size = target_size_mb * 1024 * 1024 # Convert MB to bytes - line_number = 1 - - with open(file_path, "w") as f: - while os.path.getsize(file_path) < target_size: - f.write(f"This is line number {line_number}\n") - line_number += 1 - - total_lines = line_number - 1 - print(f"Test file of size {target_size_mb} MB created with {total_lines} lines.") - return total_lines - - -def test_readline(file_path, total_lines, readline_func, func_name="readline"): - """General function to test reading lines using a given readline function.""" - - # Read the last line - start = 
time.perf_counter() - with open(file_path, "r") as f: - _last_line = ( - next(readline_func(f)) - if func_name == "reverse_readline" - else f.readlines()[-1] - ) - last_time = time.perf_counter() - start - - # Calculate the 75% and 50% line numbers - line_75_idx = int(0.75 * total_lines) - line_50_idx = int(0.5 * total_lines) - - # Read the 75% line - start = time.perf_counter() # More accurate timer - with open(file_path, "r") as f: - if func_name == "reverse_readline": - for idx, _line in enumerate(readline_func(f), 1): - if idx == total_lines - line_75_idx: - break - else: - _line = f.readlines()[line_75_idx] - time_75 = time.perf_counter() - start - - # Read the 50% line - start = time.perf_counter() # More accurate timer - with open(file_path, "r") as f: - if func_name == "reverse_readline": - for idx, _line in enumerate(readline_func(f), 1): - if idx == total_lines - line_50_idx: - break - else: - _line = f.readlines()[line_50_idx] - time_50 = time.perf_counter() - start - - print( - f"{func_name.capitalize()} - Last line {total_lines} read, time taken: {last_time:.8f} s." - ) - print( - f"{func_name.capitalize()} - 75% line {line_75_idx} read, time taken: {time_75:.8f} s." - ) - print( - f"{func_name.capitalize()} - 50% line {line_50_idx} read, time taken: {time_50:.8f} s." - ) - - return last_time, time_75, time_50 - - -def run_benchmark(file_size_mb, python_version): - """Run benchmark for both monty and built-in readline.""" - print( - f"\nRunning benchmark for Python {python_version} and file size {file_size_mb} MB." 
- ) - - importlib.reload(monty.io) - from monty.io import reverse_readline - - test_file = f"test_file_{file_size_mb}MB.txt" - total_lines = create_test_file(test_file, file_size_mb) - - print(f"\nTesting reverse_readline with file size {file_size_mb} MB...") - test_readline(test_file, total_lines, reverse_readline, "reverse_readline") - - print(f"\nTesting built-in readline with file size {file_size_mb} MB...") - test_readline(test_file, total_lines, iter, "readline") - - os.remove(test_file) - - -if __name__ == "__main__": - # Show OS info - os_info = platform.platform() - print(f"\nRunning on OS: {os_info}") - - for python_version in PYTHON_VERS: - for from_url in (False, True): - try: - source_type = "from URL" if from_url else "from PyPI" - print( - f"\n--- Test started for Python {python_version} ({source_type}) ---" - ) - - # Prepare the environment (create conda env and install monty) - installed_python_version = prepare_conda_env( - python_version, from_url=from_url - ) - - for file_size_mb in FILE_SIZES_MB: - # Run benchmark - run_benchmark(file_size_mb, installed_python_version) - - finally: - subprocess.run( - [ - f"{CONDA_PATH}/bin/conda", - "remove", - "-y", - "--name", - ENV_NAME, - "--all", - ], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - print(f"Conda environment {ENV_NAME} removed.") From 174d940e8d29cced3083673109cad06d57dcbf6c Mon Sep 17 00:00:00 2001 From: Haoyu Yang Date: Sun, 15 Sep 2024 20:13:05 +0800 Subject: [PATCH 73/96] save test log on wsl2 --- benchmark/develop_wsl2.txt | 146 +++++++++++++++++++++++++++++++++++++ benchmark/pypi_wsl2.txt | 146 +++++++++++++++++++++++++++++++++++++ 2 files changed, 292 insertions(+) create mode 100644 benchmark/develop_wsl2.txt create mode 100644 benchmark/pypi_wsl2.txt diff --git a/benchmark/develop_wsl2.txt b/benchmark/develop_wsl2.txt new file mode 100644 index 00000000..40a354c4 --- /dev/null +++ b/benchmark/develop_wsl2.txt @@ -0,0 +1,146 @@ + +Running on OS: 
Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35, Python 3.12.5 +==========================Benchmarking file size: 1 MB========================== + +Test file of size 1 MB created with 40857 lines. +===============================Built-in readline================================ + +Last line 40857 read, time taken: 0.00239880 s. +75% line 30642 read, time taken: 0.00195120 s. +50% line 20428 read, time taken: 0.00195210 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 40857 read, time taken: 0.00202920 s. +75% line 30642 read, time taken: 0.00318070 s. +50% line 20428 read, time taken: 0.00377320 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 40857 read, time taken: 0.00004810 s. +75% line 30642 read, time taken: 0.00371660 s. +50% line 20428 read, time taken: 0.00748130 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 10 MB========================== + +Test file of size 10 MB created with 392703 lines. +===============================Built-in readline================================ + +Last line 392703 read, time taken: 0.02359850 s. +75% line 294527 read, time taken: 0.02372470 s. +50% line 196351 read, time taken: 0.02274880 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 392703 read, time taken: 0.00009730 s. +75% line 294527 read, time taken: 0.08482720 s. +50% line 196351 read, time taken: 0.16897170 s. 
+============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 392703 read, time taken: 0.00004930 s. +75% line 294527 read, time taken: 0.03652330 s. +50% line 196351 read, time taken: 0.07156690 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 100 MB========================= + +Test file of size 100 MB created with 3784763 lines. +===============================Built-in readline================================ + +Last line 3784763 read, time taken: 0.22865050 s. +75% line 2838572 read, time taken: 0.22531470 s. +50% line 1892381 read, time taken: 0.22160930 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 3784763 read, time taken: 0.00008430 s. +75% line 2838572 read, time taken: 0.81884029 s. +50% line 1892381 read, time taken: 1.63674209 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 3784763 read, time taken: 0.00005140 s. +75% line 2838572 read, time taken: 0.34554400 s. +50% line 1892381 read, time taken: 0.69299879 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 500 MB========================= + +Test file of size 500 MB created with 18462149 lines. +===============================Built-in readline================================ + +Last line 18462149 read, time taken: 1.20599558 s. +75% line 13846611 read, time taken: 1.10270878 s. +50% line 9231074 read, time taken: 1.15081589 s. 
+============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 18462149 read, time taken: 0.00008520 s. +75% line 13846611 read, time taken: 4.03234395 s. +50% line 9231074 read, time taken: 8.07400641 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 18462149 read, time taken: 0.00005670 s. +75% line 13846611 read, time taken: 1.68253850 s. +50% line 9231074 read, time taken: 3.37220081 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 1000 MB========================= + +Test file of size 1000 MB created with 36541038 lines. +===============================Built-in readline================================ + +Last line 36541038 read, time taken: 2.42040830 s. +75% line 27405778 read, time taken: 2.23429120 s. +50% line 18270519 read, time taken: 2.19824180 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 36541038 read, time taken: 0.00008510 s. +75% line 27405778 read, time taken: 8.57079262 s. +50% line 18270519 read, time taken: 15.74267347 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 36541038 read, time taken: 0.00005760 s. +75% line 27405778 read, time taken: 3.35192980 s. +50% line 18270519 read, time taken: 6.65849068 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 5000 MB========================= + +Test file of size 5000 MB created with 178466581 lines. 
+===============================Built-in readline================================ + +Last line 178466581 read, time taken: 12.07876923 s. +75% line 133849935 read, time taken: 12.38710699 s. +50% line 89233290 read, time taken: 12.44974215 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 178466581 read, time taken: 0.00009340 s. +75% line 133849935 read, time taken: 41.36420749 s. +50% line 89233290 read, time taken: 78.00581623 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 178466581 read, time taken: 0.00005820 s. +75% line 133849935 read, time taken: 17.17381484 s. +50% line 89233290 read, time taken: 34.45728315 s. +============================End of reverse_readfile============================= + diff --git a/benchmark/pypi_wsl2.txt b/benchmark/pypi_wsl2.txt new file mode 100644 index 00000000..0f694f2c --- /dev/null +++ b/benchmark/pypi_wsl2.txt @@ -0,0 +1,146 @@ + +Running on OS: Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35, Python 3.12.5 +==========================Benchmarking file size: 1 MB========================== + +Test file of size 1 MB created with 40857 lines. +===============================Built-in readline================================ + +Last line 40857 read, time taken: 0.00258330 s. +75% line 30642 read, time taken: 0.00183970 s. +50% line 20428 read, time taken: 0.00157660 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 40857 read, time taken: 0.00183750 s. +75% line 30642 read, time taken: 0.00282650 s. +50% line 20428 read, time taken: 0.00351460 s. 
+============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 40857 read, time taken: 0.00004430 s. +75% line 30642 read, time taken: 0.00387870 s. +50% line 20428 read, time taken: 0.00677220 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 10 MB========================== + +Test file of size 10 MB created with 392703 lines. +===============================Built-in readline================================ + +Last line 392703 read, time taken: 0.02373670 s. +75% line 294527 read, time taken: 0.02375020 s. +50% line 196351 read, time taken: 0.02245770 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 392703 read, time taken: 0.00010220 s. +75% line 294527 read, time taken: 0.10093190 s. +50% line 196351 read, time taken: 0.16555280 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 392703 read, time taken: 0.00005150 s. +75% line 294527 read, time taken: 0.03424240 s. +50% line 196351 read, time taken: 0.06756620 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 100 MB========================= + +Test file of size 100 MB created with 3784763 lines. +===============================Built-in readline================================ + +Last line 3784763 read, time taken: 0.22927611 s. +75% line 2838572 read, time taken: 0.22576380 s. +50% line 1892381 read, time taken: 0.22026761 s. 
+============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 3784763 read, time taken: 0.00008330 s. +75% line 2838572 read, time taken: 0.80485262 s. +50% line 1892381 read, time taken: 1.60336534 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 3784763 read, time taken: 0.00005080 s. +75% line 2838572 read, time taken: 0.34910371 s. +50% line 1892381 read, time taken: 0.69678711 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 500 MB========================= + +Test file of size 500 MB created with 18462149 lines. +===============================Built-in readline================================ + +Last line 18462149 read, time taken: 1.18820380 s. +75% line 13846611 read, time taken: 1.11525410 s. +50% line 9231074 read, time taken: 1.18887050 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 18462149 read, time taken: 0.00008390 s. +75% line 13846611 read, time taken: 3.99457921 s. +50% line 9231074 read, time taken: 7.92854667 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 18462149 read, time taken: 0.00006540 s. +75% line 13846611 read, time taken: 1.61485489 s. +50% line 9231074 read, time taken: 3.26113968 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 1000 MB========================= + +Test file of size 1000 MB created with 36541038 lines. 
+===============================Built-in readline================================ + +Last line 36541038 read, time taken: 2.44387221 s. +75% line 27405778 read, time taken: 2.33452851 s. +50% line 18270519 read, time taken: 2.24032031 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 36541038 read, time taken: 0.00008650 s. +75% line 27405778 read, time taken: 8.55044989 s. +50% line 18270519 read, time taken: 15.59317884 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 36541038 read, time taken: 0.00005770 s. +75% line 27405778 read, time taken: 3.30868029 s. +50% line 18270519 read, time taken: 6.42568587 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 5000 MB========================= + +Test file of size 5000 MB created with 178466581 lines. +===============================Built-in readline================================ + +Last line 178466581 read, time taken: 12.40419862 s. +75% line 133849935 read, time taken: 12.18679081 s. +50% line 89233290 read, time taken: 13.93613912 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 178466581 read, time taken: 0.00011390 s. +75% line 133849935 read, time taken: 41.70395402 s. +50% line 89233290 read, time taken: 77.89940445 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 178466581 read, time taken: 0.00005320 s. +75% line 133849935 read, time taken: 16.80750758 s. 
+50% line 89233290 read, time taken: 33.59096583 s. +============================End of reverse_readfile============================= + From 1f7476e1494c2b19793d52a093e31dd914b25f75 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 15 Sep 2024 12:13:16 +0000 Subject: [PATCH 74/96] pre-commit auto-fixes --- benchmark/develop_wsl2.txt | 1 - benchmark/pypi_wsl2.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/benchmark/develop_wsl2.txt b/benchmark/develop_wsl2.txt index 40a354c4..42230d0e 100644 --- a/benchmark/develop_wsl2.txt +++ b/benchmark/develop_wsl2.txt @@ -143,4 +143,3 @@ Last line 178466581 read, time taken: 0.00005820 s. 75% line 133849935 read, time taken: 17.17381484 s. 50% line 89233290 read, time taken: 34.45728315 s. ============================End of reverse_readfile============================= - diff --git a/benchmark/pypi_wsl2.txt b/benchmark/pypi_wsl2.txt index 0f694f2c..1a1be93d 100644 --- a/benchmark/pypi_wsl2.txt +++ b/benchmark/pypi_wsl2.txt @@ -143,4 +143,3 @@ Last line 178466581 read, time taken: 0.00005320 s. 75% line 133849935 read, time taken: 16.80750758 s. 50% line 89233290 read, time taken: 33.59096583 s. 
============================End of reverse_readfile============================= - From 0d2af6037c8ed1e82b7f15fe865a47ff46b85ada Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 16 Sep 2024 21:17:21 +0800 Subject: [PATCH 75/96] also track test file create time --- benchmark/benchmark.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index dfdf55ec..1203835a 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -13,7 +13,7 @@ from monty.io import reverse_readfile, reverse_readline # Test config -FILE_SIZES_MB = (1, 10, 100, 500, 1000, 5000) +FILE_SIZES_MB = (1, 10, 100, 500, 1000,) def create_test_file(file_path, target_size_mb): @@ -21,13 +21,15 @@ def create_test_file(file_path, target_size_mb): target_size = target_size_mb * 1024 * 1024 # Convert MB to bytes line_number = 1 - with open(file_path, "w") as f: + start = time.perf_counter() + with open(file_path, "w", encoding="utf-8", newline="") as f: while os.path.getsize(file_path) < target_size: f.write(f"This is line number {line_number}\n") line_number += 1 + last_time = time.perf_counter() - start total_lines = line_number - 1 - print(f"Test file of size {target_size_mb} MB created with {total_lines} lines.") + print(f"Test file of size {target_size_mb} MB created with {total_lines} lines, time used {last_time} seconds.") return total_lines From 7cddd29c51132571ec3c45c01336f83e1873b888 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 16 Sep 2024 22:27:58 +0800 Subject: [PATCH 76/96] get obj size as getsize is slow in win --- benchmark/benchmark.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 1203835a..ed5fb0d2 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -13,7 +13,7 @@ from monty.io import reverse_readfile, reverse_readline # Test config -FILE_SIZES_MB = (1, 10, 100, 500, 
1000,) +FILE_SIZES_MB = (1, 10, 100, 500, 1000, 5000) def create_test_file(file_path, target_size_mb): @@ -22,14 +22,29 @@ def create_test_file(file_path, target_size_mb): line_number = 1 start = time.perf_counter() - with open(file_path, "w", encoding="utf-8", newline="") as f: - while os.path.getsize(file_path) < target_size: - f.write(f"This is line number {line_number}\n") - line_number += 1 + + # Create a list of lines and concatenate them at the end + lines = [] + total_bytes_written = 0 + + while total_bytes_written < target_size: + line = f"This is line number {line_number}\n" + line_bytes = line.encode('utf-8') + + if total_bytes_written + len(line_bytes) > target_size: + break + + lines.append(line) + total_bytes_written += len(line_bytes) + line_number += 1 + + with open(file_path, "wb") as f: + f.write("".join(lines).encode('utf-8')) last_time = time.perf_counter() - start + total_lines = line_number - 1 - print(f"Test file of size {target_size_mb} MB created with {total_lines} lines, time used {last_time} seconds.") + print(f"Test file of size {target_size_mb} MB created with {total_lines} lines, time used {last_time:.2f} seconds.") return total_lines From 0b478dd06e6511fc6a3a605f5fa1931a6987f00f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:28:26 +0000 Subject: [PATCH 77/96] pre-commit auto-fixes --- benchmark/benchmark.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index ed5fb0d2..d60c39f0 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -29,7 +29,7 @@ def create_test_file(file_path, target_size_mb): while total_bytes_written < target_size: line = f"This is line number {line_number}\n" - line_bytes = line.encode('utf-8') + line_bytes = line.encode("utf-8") if total_bytes_written + len(line_bytes) > target_size: break @@ -39,12 +39,14 @@ def create_test_file(file_path, 
target_size_mb): line_number += 1 with open(file_path, "wb") as f: - f.write("".join(lines).encode('utf-8')) + f.write("".join(lines).encode("utf-8")) last_time = time.perf_counter() - start total_lines = line_number - 1 - print(f"Test file of size {target_size_mb} MB created with {total_lines} lines, time used {last_time:.2f} seconds.") + print( + f"Test file of size {target_size_mb} MB created with {total_lines} lines, time used {last_time:.2f} seconds." + ) return total_lines From 3053e87d6f8f04878750d99bcba6502a5b6bef3b Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 16 Sep 2024 23:07:41 +0800 Subject: [PATCH 78/96] update builtin readline test not to read entire file --- benchmark/benchmark.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index d60c39f0..12043bff 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -59,7 +59,8 @@ def test_builtin_readline(file_path, total_lines): """Test built-in readline function.""" start = time.perf_counter() with open(file_path, "r") as f: - _last_line = f.readlines()[-1] + for _ in range(total_lines): + _last_line = f.readline() last_time = time.perf_counter() - start line_75_idx = int(0.75 * total_lines) @@ -67,12 +68,14 @@ def test_builtin_readline(file_path, total_lines): start = time.perf_counter() with open(file_path, "r") as f: - _line = f.readlines()[line_75_idx] + for _ in range(line_75_idx + 1): + _line_75 = f.readline() time_75 = time.perf_counter() - start start = time.perf_counter() with open(file_path, "r") as f: - _line = f.readlines()[line_50_idx] + for _ in range(line_50_idx + 1): + _line_50 = f.readline() time_50 = time.perf_counter() - start print_separator("Built-in readline") From 097ae651c8ceea01e98a1d1ffc2fb4a4480bf5a7 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 16 Sep 2024 23:08:21 +0800 Subject: [PATCH 79/96] remove outdated test log --- benchmark/develop_wsl2.txt | 145 
------------------------------------- benchmark/pypi_wsl2.txt | 145 ------------------------------------- 2 files changed, 290 deletions(-) delete mode 100644 benchmark/develop_wsl2.txt delete mode 100644 benchmark/pypi_wsl2.txt diff --git a/benchmark/develop_wsl2.txt b/benchmark/develop_wsl2.txt deleted file mode 100644 index 42230d0e..00000000 --- a/benchmark/develop_wsl2.txt +++ /dev/null @@ -1,145 +0,0 @@ - -Running on OS: Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35, Python 3.12.5 -==========================Benchmarking file size: 1 MB========================== - -Test file of size 1 MB created with 40857 lines. -===============================Built-in readline================================ - -Last line 40857 read, time taken: 0.00239880 s. -75% line 30642 read, time taken: 0.00195120 s. -50% line 20428 read, time taken: 0.00195210 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 40857 read, time taken: 0.00202920 s. -75% line 30642 read, time taken: 0.00318070 s. -50% line 20428 read, time taken: 0.00377320 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 40857 read, time taken: 0.00004810 s. -75% line 30642 read, time taken: 0.00371660 s. -50% line 20428 read, time taken: 0.00748130 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 10 MB========================== - -Test file of size 10 MB created with 392703 lines. -===============================Built-in readline================================ - -Last line 392703 read, time taken: 0.02359850 s. -75% line 294527 read, time taken: 0.02372470 s. -50% line 196351 read, time taken: 0.02274880 s. 
-============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 392703 read, time taken: 0.00009730 s. -75% line 294527 read, time taken: 0.08482720 s. -50% line 196351 read, time taken: 0.16897170 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 392703 read, time taken: 0.00004930 s. -75% line 294527 read, time taken: 0.03652330 s. -50% line 196351 read, time taken: 0.07156690 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 100 MB========================= - -Test file of size 100 MB created with 3784763 lines. -===============================Built-in readline================================ - -Last line 3784763 read, time taken: 0.22865050 s. -75% line 2838572 read, time taken: 0.22531470 s. -50% line 1892381 read, time taken: 0.22160930 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 3784763 read, time taken: 0.00008430 s. -75% line 2838572 read, time taken: 0.81884029 s. -50% line 1892381 read, time taken: 1.63674209 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 3784763 read, time taken: 0.00005140 s. -75% line 2838572 read, time taken: 0.34554400 s. -50% line 1892381 read, time taken: 0.69299879 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 500 MB========================= - -Test file of size 500 MB created with 18462149 lines. 
-===============================Built-in readline================================ - -Last line 18462149 read, time taken: 1.20599558 s. -75% line 13846611 read, time taken: 1.10270878 s. -50% line 9231074 read, time taken: 1.15081589 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 18462149 read, time taken: 0.00008520 s. -75% line 13846611 read, time taken: 4.03234395 s. -50% line 9231074 read, time taken: 8.07400641 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 18462149 read, time taken: 0.00005670 s. -75% line 13846611 read, time taken: 1.68253850 s. -50% line 9231074 read, time taken: 3.37220081 s. -============================End of reverse_readfile============================= - -========================Benchmarking file size: 1000 MB========================= - -Test file of size 1000 MB created with 36541038 lines. -===============================Built-in readline================================ - -Last line 36541038 read, time taken: 2.42040830 s. -75% line 27405778 read, time taken: 2.23429120 s. -50% line 18270519 read, time taken: 2.19824180 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 36541038 read, time taken: 0.00008510 s. -75% line 27405778 read, time taken: 8.57079262 s. -50% line 18270519 read, time taken: 15.74267347 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 36541038 read, time taken: 0.00005760 s. -75% line 27405778 read, time taken: 3.35192980 s. -50% line 18270519 read, time taken: 6.65849068 s. 
-============================End of reverse_readfile============================= - -========================Benchmarking file size: 5000 MB========================= - -Test file of size 5000 MB created with 178466581 lines. -===============================Built-in readline================================ - -Last line 178466581 read, time taken: 12.07876923 s. -75% line 133849935 read, time taken: 12.38710699 s. -50% line 89233290 read, time taken: 12.44974215 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 178466581 read, time taken: 0.00009340 s. -75% line 133849935 read, time taken: 41.36420749 s. -50% line 89233290 read, time taken: 78.00581623 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 178466581 read, time taken: 0.00005820 s. -75% line 133849935 read, time taken: 17.17381484 s. -50% line 89233290 read, time taken: 34.45728315 s. -============================End of reverse_readfile============================= diff --git a/benchmark/pypi_wsl2.txt b/benchmark/pypi_wsl2.txt deleted file mode 100644 index 1a1be93d..00000000 --- a/benchmark/pypi_wsl2.txt +++ /dev/null @@ -1,145 +0,0 @@ - -Running on OS: Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35, Python 3.12.5 -==========================Benchmarking file size: 1 MB========================== - -Test file of size 1 MB created with 40857 lines. -===============================Built-in readline================================ - -Last line 40857 read, time taken: 0.00258330 s. -75% line 30642 read, time taken: 0.00183970 s. -50% line 20428 read, time taken: 0.00157660 s. 
-============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 40857 read, time taken: 0.00183750 s. -75% line 30642 read, time taken: 0.00282650 s. -50% line 20428 read, time taken: 0.00351460 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 40857 read, time taken: 0.00004430 s. -75% line 30642 read, time taken: 0.00387870 s. -50% line 20428 read, time taken: 0.00677220 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 10 MB========================== - -Test file of size 10 MB created with 392703 lines. -===============================Built-in readline================================ - -Last line 392703 read, time taken: 0.02373670 s. -75% line 294527 read, time taken: 0.02375020 s. -50% line 196351 read, time taken: 0.02245770 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 392703 read, time taken: 0.00010220 s. -75% line 294527 read, time taken: 0.10093190 s. -50% line 196351 read, time taken: 0.16555280 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 392703 read, time taken: 0.00005150 s. -75% line 294527 read, time taken: 0.03424240 s. -50% line 196351 read, time taken: 0.06756620 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 100 MB========================= - -Test file of size 100 MB created with 3784763 lines. 
-===============================Built-in readline================================ - -Last line 3784763 read, time taken: 0.22927611 s. -75% line 2838572 read, time taken: 0.22576380 s. -50% line 1892381 read, time taken: 0.22026761 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 3784763 read, time taken: 0.00008330 s. -75% line 2838572 read, time taken: 0.80485262 s. -50% line 1892381 read, time taken: 1.60336534 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 3784763 read, time taken: 0.00005080 s. -75% line 2838572 read, time taken: 0.34910371 s. -50% line 1892381 read, time taken: 0.69678711 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 500 MB========================= - -Test file of size 500 MB created with 18462149 lines. -===============================Built-in readline================================ - -Last line 18462149 read, time taken: 1.18820380 s. -75% line 13846611 read, time taken: 1.11525410 s. -50% line 9231074 read, time taken: 1.18887050 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 18462149 read, time taken: 0.00008390 s. -75% line 13846611 read, time taken: 3.99457921 s. -50% line 9231074 read, time taken: 7.92854667 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 18462149 read, time taken: 0.00006540 s. -75% line 13846611 read, time taken: 1.61485489 s. -50% line 9231074 read, time taken: 3.26113968 s. 
-============================End of reverse_readfile============================= - -========================Benchmarking file size: 1000 MB========================= - -Test file of size 1000 MB created with 36541038 lines. -===============================Built-in readline================================ - -Last line 36541038 read, time taken: 2.44387221 s. -75% line 27405778 read, time taken: 2.33452851 s. -50% line 18270519 read, time taken: 2.24032031 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 36541038 read, time taken: 0.00008650 s. -75% line 27405778 read, time taken: 8.55044989 s. -50% line 18270519 read, time taken: 15.59317884 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 36541038 read, time taken: 0.00005770 s. -75% line 27405778 read, time taken: 3.30868029 s. -50% line 18270519 read, time taken: 6.42568587 s. -============================End of reverse_readfile============================= - -========================Benchmarking file size: 5000 MB========================= - -Test file of size 5000 MB created with 178466581 lines. -===============================Built-in readline================================ - -Last line 178466581 read, time taken: 12.40419862 s. -75% line 133849935 read, time taken: 12.18679081 s. -50% line 89233290 read, time taken: 13.93613912 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 178466581 read, time taken: 0.00011390 s. -75% line 133849935 read, time taken: 41.70395402 s. -50% line 89233290 read, time taken: 77.89940445 s. 
-============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 178466581 read, time taken: 0.00005320 s. -75% line 133849935 read, time taken: 16.80750758 s. -50% line 89233290 read, time taken: 33.59096583 s. -============================End of reverse_readfile============================= From cf542a85239c5431541d7407f7cc32f2117f7efd Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Mon, 16 Sep 2024 23:33:12 +0800 Subject: [PATCH 80/96] update test log on windows --- benchmark/develop-win11.txt | 146 ++++++++++++++++++++++++++++++++++ benchmark/pypi-7.12-win11.txt | 146 ++++++++++++++++++++++++++++++++++ 2 files changed, 292 insertions(+) create mode 100644 benchmark/develop-win11.txt create mode 100644 benchmark/pypi-7.12-win11.txt diff --git a/benchmark/develop-win11.txt b/benchmark/develop-win11.txt new file mode 100644 index 00000000..9f690fc3 --- /dev/null +++ b/benchmark/develop-win11.txt @@ -0,0 +1,146 @@ + +Running on OS: Windows-11-10.0.22631-SP0, Python 3.12.5 +==========================Benchmarking file size: 1 MB========================== + +Test file of size 1 MB created with 40757 lines, time used 0.03 seconds. +===============================Built-in readline================================ + +Last line 40757 read, time taken: 0.01786360 s. +75% line 30567 read, time taken: 0.00689310 s. +50% line 20378 read, time taken: 0.00435630 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 40757 read, time taken: 0.00715640 s. +75% line 30567 read, time taken: 0.00793530 s. +50% line 20378 read, time taken: 0.00999800 s. 
+============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 40757 read, time taken: 0.00021470 s. +75% line 30567 read, time taken: 0.00589890 s. +50% line 20378 read, time taken: 0.01156540 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 10 MB========================== + +Test file of size 10 MB created with 392476 lines, time used 0.16 seconds. +===============================Built-in readline================================ + +Last line 392476 read, time taken: 0.08994260 s. +75% line 294357 read, time taken: 0.06407060 s. +50% line 196238 read, time taken: 0.04269250 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 392476 read, time taken: 0.00021460 s. +75% line 294357 read, time taken: 0.20066430 s. +50% line 196238 read, time taken: 0.40051340 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 392476 read, time taken: 0.00023090 s. +75% line 294357 read, time taken: 0.05876890 s. +50% line 196238 read, time taken: 0.11659210 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 100 MB========================= + +Test file of size 100 MB created with 3784596 lines, time used 1.79 seconds. +===============================Built-in readline================================ + +Last line 3784596 read, time taken: 0.89517130 s. +75% line 2838447 read, time taken: 0.66395980 s. +50% line 1892298 read, time taken: 0.43874380 s. 
+============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 3784596 read, time taken: 0.00024390 s. +75% line 2838447 read, time taken: 2.01433790 s. +50% line 1892298 read, time taken: 3.89665920 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 3784596 read, time taken: 0.00024540 s. +75% line 2838447 read, time taken: 0.55314700 s. +50% line 1892298 read, time taken: 1.10856910 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 500 MB========================= + +Test file of size 500 MB created with 18462038 lines, time used 8.46 seconds. +===============================Built-in readline================================ + +Last line 18462038 read, time taken: 4.11462000 s. +75% line 13846528 read, time taken: 3.03727910 s. +50% line 9231019 read, time taken: 2.00691610 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 18462038 read, time taken: 0.00023220 s. +75% line 13846528 read, time taken: 9.28227760 s. +50% line 9231019 read, time taken: 18.47311200 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 18462038 read, time taken: 0.00022390 s. +75% line 13846528 read, time taken: 2.60500570 s. +50% line 9231019 read, time taken: 5.27204600 s. 
+============================End of reverse_readfile============================= + +========================Benchmarking file size: 1000 MB========================= + +Test file of size 1000 MB created with 36540934 lines, time used 16.16 seconds. +===============================Built-in readline================================ + +Last line 36540934 read, time taken: 8.21203530 s. +75% line 27405700 read, time taken: 6.17786950 s. +50% line 18270467 read, time taken: 4.08797350 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 36540934 read, time taken: 0.00021340 s. +75% line 27405700 read, time taken: 18.49044130 s. +50% line 18270467 read, time taken: 37.14956300 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 36540934 read, time taken: 0.00022620 s. +75% line 27405700 read, time taken: 5.27717680 s. +50% line 18270467 read, time taken: 10.61577650 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 5000 MB========================= + +Test file of size 5000 MB created with 178466370 lines, time used 82.48 seconds. +===============================Built-in readline================================ + +Last line 178466370 read, time taken: 40.74100250 s. +75% line 133849777 read, time taken: 30.38311240 s. +50% line 89233185 read, time taken: 20.11819820 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 178466370 read, time taken: 0.00158920 s. +75% line 133849777 read, time taken: 90.74261630 s. +50% line 89233185 read, time taken: 182.34117580 s. 
+============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 178466370 read, time taken: 0.00046110 s. +75% line 133849777 read, time taken: 27.03316890 s. +50% line 89233185 read, time taken: 53.99668290 s. +============================End of reverse_readfile============================= + diff --git a/benchmark/pypi-7.12-win11.txt b/benchmark/pypi-7.12-win11.txt new file mode 100644 index 00000000..711eb3cb --- /dev/null +++ b/benchmark/pypi-7.12-win11.txt @@ -0,0 +1,146 @@ + +Running on OS: Windows-11-10.0.22631-SP0, Python 3.12.5 +==========================Benchmarking file size: 1 MB========================== + +Test file of size 1 MB created with 40757 lines, time used 0.02 seconds. +===============================Built-in readline================================ + +Last line 40757 read, time taken: 0.01602000 s. +75% line 30567 read, time taken: 0.00678060 s. +50% line 20378 read, time taken: 0.00440010 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 40757 read, time taken: 0.00682100 s. +75% line 30567 read, time taken: 0.00861640 s. +50% line 20378 read, time taken: 0.01032880 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 40757 read, time taken: 0.00035980 s. +75% line 30567 read, time taken: 0.01145030 s. +50% line 20378 read, time taken: 0.01462530 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 10 MB========================== + +Test file of size 10 MB created with 392476 lines, time used 0.17 seconds. 
+===============================Built-in readline================================ + +Last line 392476 read, time taken: 0.08988620 s. +75% line 294357 read, time taken: 0.06274470 s. +50% line 196238 read, time taken: 0.04201110 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 392476 read, time taken: 0.07084520 s. +75% line 294357 read, time taken: 0.08927400 s. +50% line 196238 read, time taken: 0.10541760 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 392476 read, time taken: 0.00015940 s. +75% line 294357 read, time taken: 0.05308200 s. +50% line 196238 read, time taken: 0.09734290 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 100 MB========================= + +Test file of size 100 MB created with 3784596 lines, time used 1.71 seconds. +===============================Built-in readline================================ + +Last line 3784596 read, time taken: 0.88680830 s. +75% line 2838447 read, time taken: 0.66664480 s. +50% line 1892298 read, time taken: 0.44697960 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 3784596 read, time taken: 0.71368880 s. +75% line 2838447 read, time taken: 0.86291260 s. +50% line 1892298 read, time taken: 1.04332270 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 3784596 read, time taken: 0.00015020 s. +75% line 2838447 read, time taken: 0.51161870 s. +50% line 1892298 read, time taken: 0.90967440 s. 
+============================End of reverse_readfile============================= + +=========================Benchmarking file size: 500 MB========================= + +Test file of size 500 MB created with 18462038 lines, time used 8.34 seconds. +===============================Built-in readline================================ + +Last line 18462038 read, time taken: 4.18576000 s. +75% line 13846528 read, time taken: 3.04948250 s. +50% line 9231019 read, time taken: 2.00662010 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 18462038 read, time taken: 3.52498280 s. +75% line 13846528 read, time taken: 4.23446110 s. +50% line 9231019 read, time taken: 5.16286300 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 18462038 read, time taken: 0.00017130 s. +75% line 13846528 read, time taken: 2.23264890 s. +50% line 9231019 read, time taken: 4.44377030 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 1000 MB========================= + +Test file of size 1000 MB created with 36540934 lines, time used 16.35 seconds. +===============================Built-in readline================================ + +Last line 36540934 read, time taken: 8.15072480 s. +75% line 27405700 read, time taken: 6.11769640 s. +50% line 18270467 read, time taken: 4.03451900 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 36540934 read, time taken: 6.99752910 s. +75% line 27405700 read, time taken: 8.55939080 s. +50% line 18270467 read, time taken: 10.08266420 s. 
+============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 36540934 read, time taken: 0.00017990 s. +75% line 27405700 read, time taken: 4.50390560 s. +50% line 18270467 read, time taken: 8.96961540 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 5000 MB========================= + +Test file of size 5000 MB created with 178466370 lines, time used 85.79 seconds. +===============================Built-in readline================================ + +Last line 178466370 read, time taken: 40.56061480 s. +75% line 133849777 read, time taken: 30.29256250 s. +50% line 89233185 read, time taken: 20.17005980 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 178466370 read, time taken: 34.63317840 s. +75% line 133849777 read, time taken: 41.73865510 s. +50% line 89233185 read, time taken: 50.30206010 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 178466370 read, time taken: 0.00019910 s. +75% line 133849777 read, time taken: 22.48041550 s. +50% line 89233185 read, time taken: 44.89090870 s. 
+============================End of reverse_readfile============================= + From 8dc894e6ccdf7d812a9bbd8bdcbbce2064622b33 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Sep 2024 15:34:54 +0000 Subject: [PATCH 81/96] pre-commit auto-fixes --- benchmark/develop-win11.txt | 1 - benchmark/pypi-7.12-win11.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/benchmark/develop-win11.txt b/benchmark/develop-win11.txt index 9f690fc3..c55682ce 100644 --- a/benchmark/develop-win11.txt +++ b/benchmark/develop-win11.txt @@ -143,4 +143,3 @@ Last line 178466370 read, time taken: 0.00046110 s. 75% line 133849777 read, time taken: 27.03316890 s. 50% line 89233185 read, time taken: 53.99668290 s. ============================End of reverse_readfile============================= - diff --git a/benchmark/pypi-7.12-win11.txt b/benchmark/pypi-7.12-win11.txt index 711eb3cb..b47fd915 100644 --- a/benchmark/pypi-7.12-win11.txt +++ b/benchmark/pypi-7.12-win11.txt @@ -143,4 +143,3 @@ Last line 178466370 read, time taken: 0.00019910 s. 75% line 133849777 read, time taken: 22.48041550 s. 50% line 89233185 read, time taken: 44.89090870 s. 
============================End of reverse_readfile============================= - From e90da640996fb639c66095400a1b62fd901ca4b2 Mon Sep 17 00:00:00 2001 From: Haoyu Yang Date: Wed, 18 Sep 2024 19:17:34 +0800 Subject: [PATCH 82/96] test on Ubuntu 22.04 WSL2 --- benchmark/develop-ubuntu2204.txt | 146 +++++++++++++++++++++++++++++ benchmark/pypi-7.12-ubuntu2204.txt | 146 +++++++++++++++++++++++++++++ 2 files changed, 292 insertions(+) create mode 100644 benchmark/develop-ubuntu2204.txt create mode 100644 benchmark/pypi-7.12-ubuntu2204.txt diff --git a/benchmark/develop-ubuntu2204.txt b/benchmark/develop-ubuntu2204.txt new file mode 100644 index 00000000..16c59299 --- /dev/null +++ b/benchmark/develop-ubuntu2204.txt @@ -0,0 +1,146 @@ + +Running on OS: Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35, Python 3.12.5 +==========================Benchmarking file size: 1 MB========================== + +Test file of size 1 MB created with 40757 lines, time used 0.01 seconds. +===============================Built-in readline================================ + +Last line 40757 read, time taken: 0.00299960 s. +75% line 30567 read, time taken: 0.00225260 s. +50% line 20378 read, time taken: 0.00147340 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 40757 read, time taken: 0.00215450 s. +75% line 30567 read, time taken: 0.00299860 s. +50% line 20378 read, time taken: 0.00347230 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 40757 read, time taken: 0.00007200 s. +75% line 30567 read, time taken: 0.00393440 s. +50% line 20378 read, time taken: 0.00788680 s. 
+============================End of reverse_readfile============================= + +=========================Benchmarking file size: 10 MB========================== + +Test file of size 10 MB created with 392476 lines, time used 0.11 seconds. +===============================Built-in readline================================ + +Last line 392476 read, time taken: 0.02891790 s. +75% line 294357 read, time taken: 0.02203730 s. +50% line 196238 read, time taken: 0.01453720 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 392476 read, time taken: 0.00010690 s. +75% line 294357 read, time taken: 0.08467931 s. +50% line 196238 read, time taken: 0.16308102 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 392476 read, time taken: 0.00007110 s. +75% line 294357 read, time taken: 0.03886231 s. +50% line 196238 read, time taken: 0.07676581 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 100 MB========================= + +Test file of size 100 MB created with 3784596 lines, time used 1.11 seconds. +===============================Built-in readline================================ + +Last line 3784596 read, time taken: 0.28840513 s. +75% line 2838447 read, time taken: 0.22593426 s. +50% line 1892298 read, time taken: 0.14234112 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 3784596 read, time taken: 0.00010290 s. +75% line 2838447 read, time taken: 0.80831332 s. +50% line 1892298 read, time taken: 1.62029043 s. 
+============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 3784596 read, time taken: 0.00007960 s. +75% line 2838447 read, time taken: 0.35162211 s. +50% line 1892298 read, time taken: 0.70727881 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 500 MB========================= + +Test file of size 500 MB created with 18462038 lines, time used 5.24 seconds. +===============================Built-in readline================================ + +Last line 18462038 read, time taken: 1.38365400 s. +75% line 13846528 read, time taken: 1.11287734 s. +50% line 9231019 read, time taken: 0.69148150 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 18462038 read, time taken: 0.00009210 s. +75% line 13846528 read, time taken: 3.98585572 s. +50% line 9231019 read, time taken: 7.93171154 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 18462038 read, time taken: 0.00007860 s. +75% line 13846528 read, time taken: 1.75920299 s. +50% line 9231019 read, time taken: 3.45924340 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 1000 MB========================= + +Test file of size 1000 MB created with 36540934 lines, time used 10.33 seconds. +===============================Built-in readline================================ + +Last line 36540934 read, time taken: 2.79645362 s. +75% line 27405700 read, time taken: 2.20113669 s. +50% line 18270467 read, time taken: 1.37504352 s. 
+============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 36540934 read, time taken: 0.00009140 s. +75% line 27405700 read, time taken: 8.09242921 s. +50% line 18270467 read, time taken: 15.43773309 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 36540934 read, time taken: 0.00010630 s. +75% line 27405700 read, time taken: 3.53976401 s. +50% line 18270467 read, time taken: 7.18230309 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 5000 MB========================= + +Test file of size 5000 MB created with 178466370 lines, time used 59.52 seconds. +===============================Built-in readline================================ + +Last line 178466370 read, time taken: 14.19376238 s. +75% line 133849777 read, time taken: 10.26362936 s. +50% line 89233185 read, time taken: 6.69335968 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 178466370 read, time taken: 0.00255530 s. +75% line 133849777 read, time taken: 39.30825986 s. +50% line 89233185 read, time taken: 76.32949390 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 178466370 read, time taken: 0.00073260 s. +75% line 133849777 read, time taken: 18.99764580 s. +50% line 89233185 read, time taken: 37.87064222 s. 
+============================End of reverse_readfile============================= + diff --git a/benchmark/pypi-7.12-ubuntu2204.txt b/benchmark/pypi-7.12-ubuntu2204.txt new file mode 100644 index 00000000..39e0c4aa --- /dev/null +++ b/benchmark/pypi-7.12-ubuntu2204.txt @@ -0,0 +1,146 @@ + +Running on OS: Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35, Python 3.12.5 +==========================Benchmarking file size: 1 MB========================== + +Test file of size 1 MB created with 40757 lines, time used 0.01 seconds. +===============================Built-in readline================================ + +Last line 40757 read, time taken: 0.00308300 s. +75% line 30567 read, time taken: 0.00226110 s. +50% line 20378 read, time taken: 0.00148280 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 40757 read, time taken: 0.00208500 s. +75% line 30567 read, time taken: 0.00314750 s. +50% line 20378 read, time taken: 0.00374380 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 40757 read, time taken: 0.00006290 s. +75% line 30567 read, time taken: 0.00327840 s. +50% line 20378 read, time taken: 0.00682650 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 10 MB========================== + +Test file of size 10 MB created with 392476 lines, time used 0.11 seconds. +===============================Built-in readline================================ + +Last line 392476 read, time taken: 0.03117800 s. +75% line 294357 read, time taken: 0.02239440 s. +50% line 196238 read, time taken: 0.01462070 s. 
+============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 392476 read, time taken: 0.00008240 s. +75% line 294357 read, time taken: 0.08691510 s. +50% line 196238 read, time taken: 0.17316620 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 392476 read, time taken: 0.00005700 s. +75% line 294357 read, time taken: 0.03377610 s. +50% line 196238 read, time taken: 0.06763100 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 100 MB========================= + +Test file of size 100 MB created with 3784596 lines, time used 1.10 seconds. +===============================Built-in readline================================ + +Last line 3784596 read, time taken: 0.34282561 s. +75% line 2838447 read, time taken: 0.25080561 s. +50% line 1892298 read, time taken: 0.15123111 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 3784596 read, time taken: 0.00008080 s. +75% line 2838447 read, time taken: 0.84473163 s. +50% line 1892298 read, time taken: 1.67904916 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 3784596 read, time taken: 0.00008110 s. +75% line 2838447 read, time taken: 0.31592141 s. +50% line 1892298 read, time taken: 0.64531112 s. +============================End of reverse_readfile============================= + +=========================Benchmarking file size: 500 MB========================= + +Test file of size 500 MB created with 18462038 lines, time used 5.47 seconds. 
+===============================Built-in readline================================ + +Last line 18462038 read, time taken: 1.65052795 s. +75% line 13846528 read, time taken: 1.15872567 s. +50% line 9231019 read, time taken: 0.69172220 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 18462038 read, time taken: 0.00008290 s. +75% line 13846528 read, time taken: 4.07981196 s. +50% line 9231019 read, time taken: 8.28308262 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 18462038 read, time taken: 0.00007250 s. +75% line 13846528 read, time taken: 1.56064874 s. +50% line 9231019 read, time taken: 3.14020898 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 1000 MB========================= + +Test file of size 1000 MB created with 36540934 lines, time used 11.63 seconds. +===============================Built-in readline================================ + +Last line 36540934 read, time taken: 3.14226262 s. +75% line 27405700 read, time taken: 2.52962214 s. +50% line 18270467 read, time taken: 1.50383193 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 36540934 read, time taken: 0.00008620 s. +75% line 27405700 read, time taken: 8.48286770 s. +50% line 18270467 read, time taken: 17.16495857 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 36540934 read, time taken: 0.00007400 s. +75% line 27405700 read, time taken: 3.23107996 s. 
+50% line 18270467 read, time taken: 6.59040188 s. +============================End of reverse_readfile============================= + +========================Benchmarking file size: 5000 MB========================= + +Test file of size 5000 MB created with 178466370 lines, time used 68.08 seconds. +===============================Built-in readline================================ + +Last line 178466370 read, time taken: 14.81192763 s. +75% line 133849777 read, time taken: 10.86625818 s. +50% line 89233185 read, time taken: 6.82962298 s. +============================End of Built-in readline============================ + +================================reverse_readline================================ + +Last line 178466370 read, time taken: 0.00107610 s. +75% line 133849777 read, time taken: 41.98991041 s. +50% line 89233185 read, time taken: 88.86566979 s. +============================End of reverse_readline============================= + +================================reverse_readfile================================ + +Last line 178466370 read, time taken: 0.00072650 s. +75% line 133849777 read, time taken: 16.51870077 s. +50% line 89233185 read, time taken: 32.09695029 s. +============================End of reverse_readfile============================= + From df0288b581bc290c2925c689cf1912a5e777cead Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 11:17:45 +0000 Subject: [PATCH 83/96] pre-commit auto-fixes --- benchmark/develop-ubuntu2204.txt | 1 - benchmark/pypi-7.12-ubuntu2204.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/benchmark/develop-ubuntu2204.txt b/benchmark/develop-ubuntu2204.txt index 16c59299..a9f882fc 100644 --- a/benchmark/develop-ubuntu2204.txt +++ b/benchmark/develop-ubuntu2204.txt @@ -143,4 +143,3 @@ Last line 178466370 read, time taken: 0.00073260 s. 75% line 133849777 read, time taken: 18.99764580 s. 50% line 89233185 read, time taken: 37.87064222 s. 
============================End of reverse_readfile============================= - diff --git a/benchmark/pypi-7.12-ubuntu2204.txt b/benchmark/pypi-7.12-ubuntu2204.txt index 39e0c4aa..ded5db11 100644 --- a/benchmark/pypi-7.12-ubuntu2204.txt +++ b/benchmark/pypi-7.12-ubuntu2204.txt @@ -143,4 +143,3 @@ Last line 178466370 read, time taken: 0.00072650 s. 75% line 133849777 read, time taken: 16.51870077 s. 50% line 89233185 read, time taken: 32.09695029 s. ============================End of reverse_readfile============================= - From 0ea6828d715415a47a675f4aaff76b27f871d3e6 Mon Sep 17 00:00:00 2001 From: Haoyu Yang Date: Wed, 18 Sep 2024 19:20:01 +0800 Subject: [PATCH 84/96] remove dup test script --- benchmark_monty.py | 186 --------------------------------------------- 1 file changed, 186 deletions(-) delete mode 100644 benchmark_monty.py diff --git a/benchmark_monty.py b/benchmark_monty.py deleted file mode 100644 index f8f260eb..00000000 --- a/benchmark_monty.py +++ /dev/null @@ -1,186 +0,0 @@ -"""Utility script for monty reverse reader speed benchmark. - -Test matrix: -- Different Python versions. -- File of various sizes. -- Find the last line, 75 % line and 50 % line, and compare with reading from forwards. 
-""" - -from __future__ import annotations - -import os -import platform -import subprocess -import time - -from monty.io import reverse_readline - -# Test config -FILE_SIZES_MB = (1, 10, 100, 500, 1000, 5000) -PYTHON_VERS = ("3.12",) - -# Env config -CONDA_PATH = "/opt/anaconda3" -PR_URL = "git+https://github.com/DanielYang59/monty.git@readline-line-ending" - -ENV_NAME = "monty_benchmark_env" - - -def prepare_conda_env(python_version, from_url=False): - """Create conda environment, install monty, and get Python version.""" - subprocess.run( - [ - f"{CONDA_PATH}/bin/conda", - "create", - "-y", - "-n", - ENV_NAME, - f"python={python_version}", - ], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - print(f"Conda environment {ENV_NAME} created with Python {python_version}.") - - # Install monty - install_cmd = PR_URL if from_url else "monty" - subprocess.run( - [f"{CONDA_PATH}/envs/{ENV_NAME}/bin/pip", "install", install_cmd], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - print(f"monty installed {'from URL' if from_url else 'from PyPI'}.") - - # Get Python version - result = subprocess.run( - [f"{CONDA_PATH}/envs/{ENV_NAME}/bin/python", "--version"], - capture_output=True, - text=True, - ) - return result.stdout.strip() - - -def create_test_file(file_path, target_size_mb): - """Creates a text file with lines until the target size is reached.""" - target_size = target_size_mb * 1024 * 1024 # Convert MB to bytes - line_number = 1 - - with open(file_path, "w") as f: - while os.path.getsize(file_path) < target_size: - f.write(f"This is line number {line_number}\n") - line_number += 1 - - total_lines = line_number - 1 - print(f"Test file of size {target_size_mb} MB created with {total_lines} lines.") - return total_lines - - -def test_readline(file_path, total_lines, readline_func, func_name="readline"): - """General function to test reading lines using a given readline function.""" - - # Read the last line - 
start = time.perf_counter() - with open(file_path, "r") as f: - _last_line = ( - next(readline_func(f)) - if func_name == "reverse_readline" - else f.readlines()[-1] - ) - last_time = time.perf_counter() - start - - # Calculate the 75% and 50% line numbers - line_75_idx = int(0.75 * total_lines) - line_50_idx = int(0.5 * total_lines) - - # Read the 75% line - start = time.perf_counter() # More accurate timer - with open(file_path, "r") as f: - if func_name == "reverse_readline": - for idx, _line in enumerate(readline_func(f), 1): - if idx == total_lines - line_75_idx: - break - else: - _line = f.readlines()[line_75_idx] - time_75 = time.perf_counter() - start - - # Read the 50% line - start = time.perf_counter() # More accurate timer - with open(file_path, "r") as f: - if func_name == "reverse_readline": - for idx, _line in enumerate(readline_func(f), 1): - if idx == total_lines - line_50_idx: - break - else: - _line = f.readlines()[line_50_idx] - time_50 = time.perf_counter() - start - - print( - f"{func_name.capitalize()} - Last line {total_lines} read, time taken: {last_time:.8f} s." - ) - print( - f"{func_name.capitalize()} - 75% line {line_75_idx} read, time taken: {time_75:.8f} s." - ) - print( - f"{func_name.capitalize()} - 50% line {line_50_idx} read, time taken: {time_50:.8f} s." - ) - - return last_time, time_75, time_50 - - -def run_benchmark(file_size_mb, python_version): - """Run benchmark for both monty and built-in readline.""" - print( - f"\nRunning benchmark for Python {python_version} and file size {file_size_mb} MB." 
- ) - - test_file = f"test_file_{file_size_mb}MB.txt" - total_lines = create_test_file(test_file, file_size_mb) - - print(f"\nTesting reverse_readline with file size {file_size_mb} MB...") - test_readline(test_file, total_lines, reverse_readline, "reverse_readline") - - print(f"\nTesting built-in readline with file size {file_size_mb} MB...") - test_readline(test_file, total_lines, iter, "readline") - - os.remove(test_file) - - -if __name__ == "__main__": - # Show OS info - os_info = platform.platform() - print(f"\nRunning on OS: {os_info}") - - for python_version in PYTHON_VERS: - for from_url in (False, True): - try: - source_type = "from URL" if from_url else "from PyPI" - print( - f"\n--- Test started for Python {python_version} ({source_type}) ---" - ) - - # Prepare the environment (create conda env and install monty) - installed_python_version = prepare_conda_env( - python_version, from_url=from_url - ) - - for file_size_mb in FILE_SIZES_MB: - # Run benchmark - run_benchmark(file_size_mb, installed_python_version) - - finally: - subprocess.run( - [ - f"{CONDA_PATH}/bin/conda", - "remove", - "-y", - "--name", - ENV_NAME, - "--all", - ], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - print(f"Conda environment {ENV_NAME} removed.") From f354756ff254e5a12607e3258cd8de8da50c949c Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 19 Sep 2024 11:41:38 +0800 Subject: [PATCH 85/96] clear finished TODO tag --- src/monty/io.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 99440353..ecbdcff2 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -82,9 +82,6 @@ def _get_line_ending( Warnings: If file is empty, "\n" would be used as default. - - TODO: - - Read the last N chars instead of the entire line? 
""" if isinstance(file, (str, Path)): with zopen(file, "rb") as f: @@ -181,14 +178,6 @@ def reverse_readline( Cases where file would be read forwards and reversed in RAM: - If file size is smaller than RAM usage limit (max_mem). - Gzip files, as reverse seeks are not supported. - # WARNING: gzip might decompress in-RAM, and be careful about - the RAM usage (compression ratio) # TODO: confirm this - - TODO: - - Could buffer get overly large (buffer += to_read) if - rfind(l_end) missed several times in a row (line longer - than blk_size)? Need to profile RAM usage. - - Test gzip seek speed (not supported previously) Reference: Based on code by Peter Astrand , using @@ -249,7 +238,7 @@ def reverse_readline( buffer: str = "" m_file.seek(0, 2) - skip_1st_l_end = False # TODO: better way to skip first match + skip_1st_l_end = False while True: l_end_pos: int = buffer.rfind(l_end) From 96c91080a516c0e56dc38ca3da329ca7d6065f6e Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 19 Sep 2024 13:28:54 +0800 Subject: [PATCH 86/96] tweak var name --- src/monty/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index ecbdcff2..1d7b9c4d 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -238,7 +238,7 @@ def reverse_readline( buffer: str = "" m_file.seek(0, 2) - skip_1st_l_end = False + skipped_1st_l_end = False while True: l_end_pos: int = buffer.rfind(l_end) @@ -251,9 +251,9 @@ def reverse_readline( buffer = buffer[:l_end_pos] # buffer doesn't include l_end # Skip first match (the last line ending) - if skip_1st_l_end: + if skipped_1st_l_end: yield line + l_end - skip_1st_l_end = True + skipped_1st_l_end = True # Line ending not in current buffer, load next block into the buffer elif pt_pos > 0: From e1786efa8645ee51404918c1910b30806eeab467 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 19 Sep 2024 13:30:57 +0800 Subject: [PATCH 87/96] fix missing newline char in comment --- src/monty/io.py | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 1d7b9c4d..2232d5de 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -69,8 +69,8 @@ def _get_line_ending( This function assumes the file has a single consistent line ending. WARNING: as per the POSIX standard, a line is: "A sequence of zero or - more non- characters plus a terminating character.", as such this func - would fail if the only line misses a terminating character. + more non- characters plus a terminating char.", + as such this func might fail if the only line misses a terminating character. https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html Returns: From f90074c7bd3308771854c1611cf440a499dafd48 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 19 Sep 2024 13:35:33 +0800 Subject: [PATCH 88/96] guard warning filter with context manager --- tests/test_io.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index 69d21cd1..a28ef760 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -203,14 +203,13 @@ def test_read_file_with_empty_lines(self, l_end, ram): """Empty lines should not be skipped. Using a very small RAM size to force non in-RAM mode. 
""" - warnings.filterwarnings( - "ignore", message="max_mem=4 smaller than blk_size=4096" - ) - contents = (f"line1{l_end}", f"{l_end}", f"line3{l_end}") filename = "test_empty_line.txt" - with ScratchDir("."): + with ScratchDir("."), warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message="max_mem=4 smaller than blk_size=4096" + ) # Test text file with open(filename, "wb") as file: for line in contents: @@ -244,14 +243,14 @@ def test_read_file_with_empty_lines(self, l_end, ram): @pytest.mark.parametrize("l_end", ["\n", "\r\n"]) def test_different_line_endings(self, l_end, ram): """Using a very small RAM size to force non in-RAM mode.""" - warnings.filterwarnings( - "ignore", message="max_mem=4 smaller than blk_size=4096" - ) - contents = (f"Line1{l_end}", f"Line2{l_end}", f"Line3{l_end}") file_name = "test_file.txt" - with ScratchDir("."): + with ScratchDir("."), warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message="max_mem=4 smaller than blk_size=4096" + ) + with open(file_name, "wb") as file: for line in contents: file.write(line.encode()) From 27b28a69e5378295fd05f06bde8cbfba3b897f2e Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Thu, 19 Sep 2024 13:36:29 +0800 Subject: [PATCH 89/96] add type annotation --- src/monty/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/monty/io.py b/src/monty/io.py index 2232d5de..7c6b264d 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -238,7 +238,7 @@ def reverse_readline( buffer: str = "" m_file.seek(0, 2) - skipped_1st_l_end = False + skipped_1st_l_end: bool = False while True: l_end_pos: int = buffer.rfind(l_end) From 1cdc75b4b786033cdbc4d588f0484bebda8f3bb1 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sat, 21 Sep 2024 10:58:51 +0800 Subject: [PATCH 90/96] put tag into condition branch --- src/monty/io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/monty/io.py b/src/monty/io.py index 7c6b264d..56a1ff95 100644 --- 
a/src/monty/io.py +++ b/src/monty/io.py @@ -253,7 +253,8 @@ def reverse_readline( # Skip first match (the last line ending) if skipped_1st_l_end: yield line + l_end - skipped_1st_l_end = True + else: + skipped_1st_l_end = True # Line ending not in current buffer, load next block into the buffer elif pt_pos > 0: From e4940e04f3d0d01c05dc561db97253b624b4850a Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Sun, 22 Sep 2024 13:02:26 +0800 Subject: [PATCH 91/96] untrack benchmark script and results --- benchmark/benchmark.py | 175 ----------------------------- benchmark/develop-ubuntu2204.txt | 145 ------------------------ benchmark/develop-win11.txt | 145 ------------------------ benchmark/pypi-7.12-ubuntu2204.txt | 145 ------------------------ benchmark/pypi-7.12-win11.txt | 145 ------------------------ 5 files changed, 755 deletions(-) delete mode 100644 benchmark/benchmark.py delete mode 100644 benchmark/develop-ubuntu2204.txt delete mode 100644 benchmark/develop-win11.txt delete mode 100644 benchmark/pypi-7.12-ubuntu2204.txt delete mode 100644 benchmark/pypi-7.12-win11.txt diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py deleted file mode 100644 index 12043bff..00000000 --- a/benchmark/benchmark.py +++ /dev/null @@ -1,175 +0,0 @@ -"""Utility script for monty reverse reader speed benchmark. -- File of various sizes. -- Find the last line, 75 % line and 50 % line, and compare with reading from forwards. 
-""" - -from __future__ import annotations - -import os -import platform -import sys -import time - -from monty.io import reverse_readfile, reverse_readline - -# Test config -FILE_SIZES_MB = (1, 10, 100, 500, 1000, 5000) - - -def create_test_file(file_path, target_size_mb): - """Creates a text file with lines until the target size is reached.""" - target_size = target_size_mb * 1024 * 1024 # Convert MB to bytes - line_number = 1 - - start = time.perf_counter() - - # Create a list of lines and concatenate them at the end - lines = [] - total_bytes_written = 0 - - while total_bytes_written < target_size: - line = f"This is line number {line_number}\n" - line_bytes = line.encode("utf-8") - - if total_bytes_written + len(line_bytes) > target_size: - break - - lines.append(line) - total_bytes_written += len(line_bytes) - line_number += 1 - - with open(file_path, "wb") as f: - f.write("".join(lines).encode("utf-8")) - - last_time = time.perf_counter() - start - - total_lines = line_number - 1 - print( - f"Test file of size {target_size_mb} MB created with {total_lines} lines, time used {last_time:.2f} seconds." 
- ) - return total_lines - - -def print_separator(title: str): - print(f"{title}".center(80, "=")) - print("") - - -def test_builtin_readline(file_path, total_lines): - """Test built-in readline function.""" - start = time.perf_counter() - with open(file_path, "r") as f: - for _ in range(total_lines): - _last_line = f.readline() - last_time = time.perf_counter() - start - - line_75_idx = int(0.75 * total_lines) - line_50_idx = int(0.5 * total_lines) - - start = time.perf_counter() - with open(file_path, "r") as f: - for _ in range(line_75_idx + 1): - _line_75 = f.readline() - time_75 = time.perf_counter() - start - - start = time.perf_counter() - with open(file_path, "r") as f: - for _ in range(line_50_idx + 1): - _line_50 = f.readline() - time_50 = time.perf_counter() - start - - print_separator("Built-in readline") - print(f"Last line {total_lines} read, time taken: {last_time:.8f} s.") - print(f"75% line {line_75_idx} read, time taken: {time_75:.8f} s.") - print(f"50% line {line_50_idx} read, time taken: {time_50:.8f} s.") - print_separator("End of Built-in readline") - - return last_time, time_75, time_50 - - -def test_reverse_readline(file_path, total_lines): - """Test reverse_readline function.""" - start = time.perf_counter() - with open(file_path, "r") as f: - _last_line = next(reverse_readline(f)) - last_time = time.perf_counter() - start - - line_75_idx = int(0.75 * total_lines) - line_50_idx = int(0.5 * total_lines) - - start = time.perf_counter() - with open(file_path, "r") as f: - for idx, _line in enumerate(reverse_readline(f), 1): - if idx == total_lines - line_75_idx: - break - time_75 = time.perf_counter() - start - - start = time.perf_counter() - with open(file_path, "r") as f: - for idx, _line in enumerate(reverse_readline(f), 1): - if idx == total_lines - line_50_idx: - break - time_50 = time.perf_counter() - start - - print_separator("reverse_readline") - print(f"Last line {total_lines} read, time taken: {last_time:.8f} s.") - print(f"75% line 
{line_75_idx} read, time taken: {time_75:.8f} s.") - print(f"50% line {line_50_idx} read, time taken: {time_50:.8f} s.") - print_separator("End of reverse_readline") - - return last_time, time_75, time_50 - - -def test_reverse_readfile(file_path, total_lines): - """Test reverse_readfile function.""" - start = time.perf_counter() - _last_line = next(reverse_readfile(file_path)) - last_time = time.perf_counter() - start - - line_75_idx = int(0.75 * total_lines) - line_50_idx = int(0.5 * total_lines) - - start = time.perf_counter() - for idx, _line in enumerate(reverse_readfile(file_path), 1): - if idx == total_lines - line_75_idx: - break - time_75 = time.perf_counter() - start - - start = time.perf_counter() - for idx, _line in enumerate(reverse_readfile(file_path), 1): - if idx == total_lines - line_50_idx: - break - time_50 = time.perf_counter() - start - - print_separator("reverse_readfile") - print(f"Last line {total_lines} read, time taken: {last_time:.8f} s.") - print(f"75% line {line_75_idx} read, time taken: {time_75:.8f} s.") - print(f"50% line {line_50_idx} read, time taken: {time_50:.8f} s.") - print_separator("End of reverse_readfile") - - return last_time, time_75, time_50 - - -def run_benchmark(file_size_mb): - """Run benchmark for all test functions.""" - print_separator(f"Benchmarking file size: {file_size_mb} MB") - - test_file = f"test_file_{file_size_mb}MB.txt" - total_lines = create_test_file(test_file, file_size_mb) - - test_builtin_readline(test_file, total_lines) - test_reverse_readline(test_file, total_lines) - test_reverse_readfile(test_file, total_lines) - - os.remove(test_file) - - -if __name__ == "__main__": - # Show OS info - os_info = platform.platform() - python_version = sys.version.split()[0] - print(f"\nRunning on OS: {os_info}, Python {python_version}") - - # Run benchmark for each file size - for file_size_mb in FILE_SIZES_MB: - run_benchmark(file_size_mb) diff --git a/benchmark/develop-ubuntu2204.txt 
b/benchmark/develop-ubuntu2204.txt deleted file mode 100644 index a9f882fc..00000000 --- a/benchmark/develop-ubuntu2204.txt +++ /dev/null @@ -1,145 +0,0 @@ - -Running on OS: Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35, Python 3.12.5 -==========================Benchmarking file size: 1 MB========================== - -Test file of size 1 MB created with 40757 lines, time used 0.01 seconds. -===============================Built-in readline================================ - -Last line 40757 read, time taken: 0.00299960 s. -75% line 30567 read, time taken: 0.00225260 s. -50% line 20378 read, time taken: 0.00147340 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 40757 read, time taken: 0.00215450 s. -75% line 30567 read, time taken: 0.00299860 s. -50% line 20378 read, time taken: 0.00347230 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 40757 read, time taken: 0.00007200 s. -75% line 30567 read, time taken: 0.00393440 s. -50% line 20378 read, time taken: 0.00788680 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 10 MB========================== - -Test file of size 10 MB created with 392476 lines, time used 0.11 seconds. -===============================Built-in readline================================ - -Last line 392476 read, time taken: 0.02891790 s. -75% line 294357 read, time taken: 0.02203730 s. -50% line 196238 read, time taken: 0.01453720 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 392476 read, time taken: 0.00010690 s. 
-75% line 294357 read, time taken: 0.08467931 s. -50% line 196238 read, time taken: 0.16308102 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 392476 read, time taken: 0.00007110 s. -75% line 294357 read, time taken: 0.03886231 s. -50% line 196238 read, time taken: 0.07676581 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 100 MB========================= - -Test file of size 100 MB created with 3784596 lines, time used 1.11 seconds. -===============================Built-in readline================================ - -Last line 3784596 read, time taken: 0.28840513 s. -75% line 2838447 read, time taken: 0.22593426 s. -50% line 1892298 read, time taken: 0.14234112 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 3784596 read, time taken: 0.00010290 s. -75% line 2838447 read, time taken: 0.80831332 s. -50% line 1892298 read, time taken: 1.62029043 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 3784596 read, time taken: 0.00007960 s. -75% line 2838447 read, time taken: 0.35162211 s. -50% line 1892298 read, time taken: 0.70727881 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 500 MB========================= - -Test file of size 500 MB created with 18462038 lines, time used 5.24 seconds. -===============================Built-in readline================================ - -Last line 18462038 read, time taken: 1.38365400 s. -75% line 13846528 read, time taken: 1.11287734 s. 
-50% line 9231019 read, time taken: 0.69148150 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 18462038 read, time taken: 0.00009210 s. -75% line 13846528 read, time taken: 3.98585572 s. -50% line 9231019 read, time taken: 7.93171154 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 18462038 read, time taken: 0.00007860 s. -75% line 13846528 read, time taken: 1.75920299 s. -50% line 9231019 read, time taken: 3.45924340 s. -============================End of reverse_readfile============================= - -========================Benchmarking file size: 1000 MB========================= - -Test file of size 1000 MB created with 36540934 lines, time used 10.33 seconds. -===============================Built-in readline================================ - -Last line 36540934 read, time taken: 2.79645362 s. -75% line 27405700 read, time taken: 2.20113669 s. -50% line 18270467 read, time taken: 1.37504352 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 36540934 read, time taken: 0.00009140 s. -75% line 27405700 read, time taken: 8.09242921 s. -50% line 18270467 read, time taken: 15.43773309 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 36540934 read, time taken: 0.00010630 s. -75% line 27405700 read, time taken: 3.53976401 s. -50% line 18270467 read, time taken: 7.18230309 s. 
-============================End of reverse_readfile============================= - -========================Benchmarking file size: 5000 MB========================= - -Test file of size 5000 MB created with 178466370 lines, time used 59.52 seconds. -===============================Built-in readline================================ - -Last line 178466370 read, time taken: 14.19376238 s. -75% line 133849777 read, time taken: 10.26362936 s. -50% line 89233185 read, time taken: 6.69335968 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 178466370 read, time taken: 0.00255530 s. -75% line 133849777 read, time taken: 39.30825986 s. -50% line 89233185 read, time taken: 76.32949390 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 178466370 read, time taken: 0.00073260 s. -75% line 133849777 read, time taken: 18.99764580 s. -50% line 89233185 read, time taken: 37.87064222 s. -============================End of reverse_readfile============================= diff --git a/benchmark/develop-win11.txt b/benchmark/develop-win11.txt deleted file mode 100644 index c55682ce..00000000 --- a/benchmark/develop-win11.txt +++ /dev/null @@ -1,145 +0,0 @@ - -Running on OS: Windows-11-10.0.22631-SP0, Python 3.12.5 -==========================Benchmarking file size: 1 MB========================== - -Test file of size 1 MB created with 40757 lines, time used 0.03 seconds. -===============================Built-in readline================================ - -Last line 40757 read, time taken: 0.01786360 s. -75% line 30567 read, time taken: 0.00689310 s. -50% line 20378 read, time taken: 0.00435630 s. 
-============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 40757 read, time taken: 0.00715640 s. -75% line 30567 read, time taken: 0.00793530 s. -50% line 20378 read, time taken: 0.00999800 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 40757 read, time taken: 0.00021470 s. -75% line 30567 read, time taken: 0.00589890 s. -50% line 20378 read, time taken: 0.01156540 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 10 MB========================== - -Test file of size 10 MB created with 392476 lines, time used 0.16 seconds. -===============================Built-in readline================================ - -Last line 392476 read, time taken: 0.08994260 s. -75% line 294357 read, time taken: 0.06407060 s. -50% line 196238 read, time taken: 0.04269250 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 392476 read, time taken: 0.00021460 s. -75% line 294357 read, time taken: 0.20066430 s. -50% line 196238 read, time taken: 0.40051340 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 392476 read, time taken: 0.00023090 s. -75% line 294357 read, time taken: 0.05876890 s. -50% line 196238 read, time taken: 0.11659210 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 100 MB========================= - -Test file of size 100 MB created with 3784596 lines, time used 1.79 seconds. 
-===============================Built-in readline================================ - -Last line 3784596 read, time taken: 0.89517130 s. -75% line 2838447 read, time taken: 0.66395980 s. -50% line 1892298 read, time taken: 0.43874380 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 3784596 read, time taken: 0.00024390 s. -75% line 2838447 read, time taken: 2.01433790 s. -50% line 1892298 read, time taken: 3.89665920 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 3784596 read, time taken: 0.00024540 s. -75% line 2838447 read, time taken: 0.55314700 s. -50% line 1892298 read, time taken: 1.10856910 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 500 MB========================= - -Test file of size 500 MB created with 18462038 lines, time used 8.46 seconds. -===============================Built-in readline================================ - -Last line 18462038 read, time taken: 4.11462000 s. -75% line 13846528 read, time taken: 3.03727910 s. -50% line 9231019 read, time taken: 2.00691610 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 18462038 read, time taken: 0.00023220 s. -75% line 13846528 read, time taken: 9.28227760 s. -50% line 9231019 read, time taken: 18.47311200 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 18462038 read, time taken: 0.00022390 s. -75% line 13846528 read, time taken: 2.60500570 s. -50% line 9231019 read, time taken: 5.27204600 s. 
-============================End of reverse_readfile============================= - -========================Benchmarking file size: 1000 MB========================= - -Test file of size 1000 MB created with 36540934 lines, time used 16.16 seconds. -===============================Built-in readline================================ - -Last line 36540934 read, time taken: 8.21203530 s. -75% line 27405700 read, time taken: 6.17786950 s. -50% line 18270467 read, time taken: 4.08797350 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 36540934 read, time taken: 0.00021340 s. -75% line 27405700 read, time taken: 18.49044130 s. -50% line 18270467 read, time taken: 37.14956300 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 36540934 read, time taken: 0.00022620 s. -75% line 27405700 read, time taken: 5.27717680 s. -50% line 18270467 read, time taken: 10.61577650 s. -============================End of reverse_readfile============================= - -========================Benchmarking file size: 5000 MB========================= - -Test file of size 5000 MB created with 178466370 lines, time used 82.48 seconds. -===============================Built-in readline================================ - -Last line 178466370 read, time taken: 40.74100250 s. -75% line 133849777 read, time taken: 30.38311240 s. -50% line 89233185 read, time taken: 20.11819820 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 178466370 read, time taken: 0.00158920 s. -75% line 133849777 read, time taken: 90.74261630 s. -50% line 89233185 read, time taken: 182.34117580 s. 
-============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 178466370 read, time taken: 0.00046110 s. -75% line 133849777 read, time taken: 27.03316890 s. -50% line 89233185 read, time taken: 53.99668290 s. -============================End of reverse_readfile============================= diff --git a/benchmark/pypi-7.12-ubuntu2204.txt b/benchmark/pypi-7.12-ubuntu2204.txt deleted file mode 100644 index ded5db11..00000000 --- a/benchmark/pypi-7.12-ubuntu2204.txt +++ /dev/null @@ -1,145 +0,0 @@ - -Running on OS: Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35, Python 3.12.5 -==========================Benchmarking file size: 1 MB========================== - -Test file of size 1 MB created with 40757 lines, time used 0.01 seconds. -===============================Built-in readline================================ - -Last line 40757 read, time taken: 0.00308300 s. -75% line 30567 read, time taken: 0.00226110 s. -50% line 20378 read, time taken: 0.00148280 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 40757 read, time taken: 0.00208500 s. -75% line 30567 read, time taken: 0.00314750 s. -50% line 20378 read, time taken: 0.00374380 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 40757 read, time taken: 0.00006290 s. -75% line 30567 read, time taken: 0.00327840 s. -50% line 20378 read, time taken: 0.00682650 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 10 MB========================== - -Test file of size 10 MB created with 392476 lines, time used 0.11 seconds. 
-===============================Built-in readline================================ - -Last line 392476 read, time taken: 0.03117800 s. -75% line 294357 read, time taken: 0.02239440 s. -50% line 196238 read, time taken: 0.01462070 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 392476 read, time taken: 0.00008240 s. -75% line 294357 read, time taken: 0.08691510 s. -50% line 196238 read, time taken: 0.17316620 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 392476 read, time taken: 0.00005700 s. -75% line 294357 read, time taken: 0.03377610 s. -50% line 196238 read, time taken: 0.06763100 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 100 MB========================= - -Test file of size 100 MB created with 3784596 lines, time used 1.10 seconds. -===============================Built-in readline================================ - -Last line 3784596 read, time taken: 0.34282561 s. -75% line 2838447 read, time taken: 0.25080561 s. -50% line 1892298 read, time taken: 0.15123111 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 3784596 read, time taken: 0.00008080 s. -75% line 2838447 read, time taken: 0.84473163 s. -50% line 1892298 read, time taken: 1.67904916 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 3784596 read, time taken: 0.00008110 s. -75% line 2838447 read, time taken: 0.31592141 s. -50% line 1892298 read, time taken: 0.64531112 s. 
-============================End of reverse_readfile============================= - -=========================Benchmarking file size: 500 MB========================= - -Test file of size 500 MB created with 18462038 lines, time used 5.47 seconds. -===============================Built-in readline================================ - -Last line 18462038 read, time taken: 1.65052795 s. -75% line 13846528 read, time taken: 1.15872567 s. -50% line 9231019 read, time taken: 0.69172220 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 18462038 read, time taken: 0.00008290 s. -75% line 13846528 read, time taken: 4.07981196 s. -50% line 9231019 read, time taken: 8.28308262 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 18462038 read, time taken: 0.00007250 s. -75% line 13846528 read, time taken: 1.56064874 s. -50% line 9231019 read, time taken: 3.14020898 s. -============================End of reverse_readfile============================= - -========================Benchmarking file size: 1000 MB========================= - -Test file of size 1000 MB created with 36540934 lines, time used 11.63 seconds. -===============================Built-in readline================================ - -Last line 36540934 read, time taken: 3.14226262 s. -75% line 27405700 read, time taken: 2.52962214 s. -50% line 18270467 read, time taken: 1.50383193 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 36540934 read, time taken: 0.00008620 s. -75% line 27405700 read, time taken: 8.48286770 s. -50% line 18270467 read, time taken: 17.16495857 s. 
-============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 36540934 read, time taken: 0.00007400 s. -75% line 27405700 read, time taken: 3.23107996 s. -50% line 18270467 read, time taken: 6.59040188 s. -============================End of reverse_readfile============================= - -========================Benchmarking file size: 5000 MB========================= - -Test file of size 5000 MB created with 178466370 lines, time used 68.08 seconds. -===============================Built-in readline================================ - -Last line 178466370 read, time taken: 14.81192763 s. -75% line 133849777 read, time taken: 10.86625818 s. -50% line 89233185 read, time taken: 6.82962298 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 178466370 read, time taken: 0.00107610 s. -75% line 133849777 read, time taken: 41.98991041 s. -50% line 89233185 read, time taken: 88.86566979 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 178466370 read, time taken: 0.00072650 s. -75% line 133849777 read, time taken: 16.51870077 s. -50% line 89233185 read, time taken: 32.09695029 s. -============================End of reverse_readfile============================= diff --git a/benchmark/pypi-7.12-win11.txt b/benchmark/pypi-7.12-win11.txt deleted file mode 100644 index b47fd915..00000000 --- a/benchmark/pypi-7.12-win11.txt +++ /dev/null @@ -1,145 +0,0 @@ - -Running on OS: Windows-11-10.0.22631-SP0, Python 3.12.5 -==========================Benchmarking file size: 1 MB========================== - -Test file of size 1 MB created with 40757 lines, time used 0.02 seconds. 
-===============================Built-in readline================================ - -Last line 40757 read, time taken: 0.01602000 s. -75% line 30567 read, time taken: 0.00678060 s. -50% line 20378 read, time taken: 0.00440010 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 40757 read, time taken: 0.00682100 s. -75% line 30567 read, time taken: 0.00861640 s. -50% line 20378 read, time taken: 0.01032880 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 40757 read, time taken: 0.00035980 s. -75% line 30567 read, time taken: 0.01145030 s. -50% line 20378 read, time taken: 0.01462530 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 10 MB========================== - -Test file of size 10 MB created with 392476 lines, time used 0.17 seconds. -===============================Built-in readline================================ - -Last line 392476 read, time taken: 0.08988620 s. -75% line 294357 read, time taken: 0.06274470 s. -50% line 196238 read, time taken: 0.04201110 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 392476 read, time taken: 0.07084520 s. -75% line 294357 read, time taken: 0.08927400 s. -50% line 196238 read, time taken: 0.10541760 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 392476 read, time taken: 0.00015940 s. -75% line 294357 read, time taken: 0.05308200 s. -50% line 196238 read, time taken: 0.09734290 s. 
-============================End of reverse_readfile============================= - -=========================Benchmarking file size: 100 MB========================= - -Test file of size 100 MB created with 3784596 lines, time used 1.71 seconds. -===============================Built-in readline================================ - -Last line 3784596 read, time taken: 0.88680830 s. -75% line 2838447 read, time taken: 0.66664480 s. -50% line 1892298 read, time taken: 0.44697960 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 3784596 read, time taken: 0.71368880 s. -75% line 2838447 read, time taken: 0.86291260 s. -50% line 1892298 read, time taken: 1.04332270 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 3784596 read, time taken: 0.00015020 s. -75% line 2838447 read, time taken: 0.51161870 s. -50% line 1892298 read, time taken: 0.90967440 s. -============================End of reverse_readfile============================= - -=========================Benchmarking file size: 500 MB========================= - -Test file of size 500 MB created with 18462038 lines, time used 8.34 seconds. -===============================Built-in readline================================ - -Last line 18462038 read, time taken: 4.18576000 s. -75% line 13846528 read, time taken: 3.04948250 s. -50% line 9231019 read, time taken: 2.00662010 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 18462038 read, time taken: 3.52498280 s. -75% line 13846528 read, time taken: 4.23446110 s. -50% line 9231019 read, time taken: 5.16286300 s. 
-============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 18462038 read, time taken: 0.00017130 s. -75% line 13846528 read, time taken: 2.23264890 s. -50% line 9231019 read, time taken: 4.44377030 s. -============================End of reverse_readfile============================= - -========================Benchmarking file size: 1000 MB========================= - -Test file of size 1000 MB created with 36540934 lines, time used 16.35 seconds. -===============================Built-in readline================================ - -Last line 36540934 read, time taken: 8.15072480 s. -75% line 27405700 read, time taken: 6.11769640 s. -50% line 18270467 read, time taken: 4.03451900 s. -============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 36540934 read, time taken: 6.99752910 s. -75% line 27405700 read, time taken: 8.55939080 s. -50% line 18270467 read, time taken: 10.08266420 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 36540934 read, time taken: 0.00017990 s. -75% line 27405700 read, time taken: 4.50390560 s. -50% line 18270467 read, time taken: 8.96961540 s. -============================End of reverse_readfile============================= - -========================Benchmarking file size: 5000 MB========================= - -Test file of size 5000 MB created with 178466370 lines, time used 85.79 seconds. -===============================Built-in readline================================ - -Last line 178466370 read, time taken: 40.56061480 s. -75% line 133849777 read, time taken: 30.29256250 s. -50% line 89233185 read, time taken: 20.17005980 s. 
-============================End of Built-in readline============================ - -================================reverse_readline================================ - -Last line 178466370 read, time taken: 34.63317840 s. -75% line 133849777 read, time taken: 41.73865510 s. -50% line 89233185 read, time taken: 50.30206010 s. -============================End of reverse_readline============================= - -================================reverse_readfile================================ - -Last line 178466370 read, time taken: 0.00019910 s. -75% line 133849777 read, time taken: 22.48041550 s. -50% line 89233185 read, time taken: 44.89090870 s. -============================End of reverse_readfile============================= From 6613d3366d682ca797cae505502823cb6cb6a84a Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Tue, 22 Oct 2024 10:43:24 +0800 Subject: [PATCH 92/96] fix typo in test var name --- tests/test_io.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index 1e57a255..2e710d83 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -44,15 +44,15 @@ def test_get_line_ending(self, l_end): # Test text mode with open(test_file, "r", encoding="utf-8") as f: - start_pot = f.tell() + start_pos = f.tell() assert _get_line_ending(f) == l_end - assert f.tell() == start_pot + assert f.tell() == start_pos # Test binary mode with open(test_file, "rb") as f: - start_pot = f.tell() + start_pos = f.tell() assert _get_line_ending(f) == l_end - assert f.tell() == start_pot + assert f.tell() == start_pos # Test gzip file gzip_filename = f"{test_file}.gz" @@ -61,9 +61,9 @@ def test_get_line_ending(self, l_end): # Opened file stream with gzip.open(gzip_filename, "rb") as f: - start_pot = f.tell() + start_pos = f.tell() assert _get_line_ending(f) == l_end - assert f.tell() == start_pot + assert f.tell() == start_pos # Filename directly assert _get_line_ending(gzip_filename) == l_end @@ -75,9 +75,9 
@@ def test_get_line_ending(self, l_end): # Opened file stream with bz2.open(bz2_filename, "rb") as f: - start_pot = f.tell() + start_pos = f.tell() assert _get_line_ending(f) == l_end - assert f.tell() == start_pot + assert f.tell() == start_pos # Filename directly assert _get_line_ending(bz2_filename) == l_end From 3baea3adb1ae68ee807cbdd8d8506b3ce21cfc7c Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Tue, 22 Oct 2024 11:20:00 +0800 Subject: [PATCH 93/96] remove merge issue --- .github/workflows/test.yml | 1 - src/monty/io.py | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4901cfd9..905fc976 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,7 +7,6 @@ jobs: strategy: fail-fast: false max-parallel: 20 - fail-fast: false matrix: os: [ubuntu-latest, macos-14, windows-latest] python-version: ["3.9", "3.12"] diff --git a/src/monty/io.py b/src/monty/io.py index 8a8a1c5a..07fd58d7 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -9,6 +9,7 @@ import errno import gzip import io +import lzma import mmap import os import subprocess @@ -17,11 +18,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal, cast -try: - import lzma -except ImportError: - lzma = None # type: ignore[assignment] - if TYPE_CHECKING: from typing import IO, Iterator, Union From f1fd669346b9d4d7f37be0d32583c9c90f80e822 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Tue, 22 Oct 2024 11:25:05 +0800 Subject: [PATCH 94/96] revise comment --- src/monty/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index 07fd58d7..f029e324 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -65,8 +65,8 @@ def _get_line_ending( This function assumes the file has a single consistent line ending. 
WARNING: as per the POSIX standard, a line is: "A sequence of zero or - more non- <newline> characters plus a terminating <newline> char.", - as such this func might fail if the only line misses a terminating character. + more non- <newline> characters plus a terminating <newline> char.", as such + this func might fail if the only line misses a terminating newline character. https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html Returns: From d70e80e0760b465649e5c5b5c86e276c7fa70659 Mon Sep 17 00:00:00 2001 From: "Haoyu (Daniel)" Date: Tue, 22 Oct 2024 11:30:49 +0800 Subject: [PATCH 95/96] suppress mypy errors --- src/monty/io.py | 2 +- src/monty/json.py | 2 +- src/monty/re.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/monty/io.py b/src/monty/io.py index f029e324..6c55bb17 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -83,7 +83,7 @@ def _get_line_ending( with zopen(file, "rb") as f: first_line = f.readline() elif isinstance(file, io.TextIOWrapper): - first_line = file.buffer.readline() + first_line = file.buffer.readline() # type: ignore[attr-defined] elif isinstance(file, (io.BufferedReader, gzip.GzipFile, bz2.BZ2File)): first_line = file.readline() else: diff --git a/src/monty/json.py b/src/monty/json.py index d0626d3e..e6e59258 100644 --- a/src/monty/json.py +++ b/src/monty/json.py @@ -35,7 +35,7 @@ try: import orjson except ImportError: - orjson = None + orjson = None # type: ignore[assignment] __version__ = "3.0.0" diff --git a/src/monty/re.py b/src/monty/re.py index 0dfb75a3..5b9f3425 100644 --- a/src/monty/re.py +++ b/src/monty/re.py @@ -58,5 +58,5 @@ def regrep( with contextlib.suppress(Exception): # Try to close open file handle. Pass if it is a generator.
- gen.close() # type: ignore[attr-defined] + gen.close() # type: ignore[attr-defined, union-attr] return matches From 3df6709ca03e8cb26c91c5d4bc246408daaebcdb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 01:46:06 +0000 Subject: [PATCH 96/96] pre-commit auto-fixes --- src/monty/io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/monty/io.py b/src/monty/io.py index a8f12fc7..81110534 100644 --- a/src/monty/io.py +++ b/src/monty/io.py @@ -112,6 +112,7 @@ def zopen( return open(filename, mode, **kwargs) + def _get_line_ending( file: str | Path