Merge pull request #712 from DanielYang59/readline-line-ending
Fix line ending handling in `reverse_readfile`/`readline` across operating systems, and stop skipping empty lines
shyuep authored Dec 11, 2024
2 parents ae78429 + 3df6709 commit bf66f0d
Showing 11 changed files with 482 additions and 185 deletions.
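To illustrate the behavioral change this commit targets, here is a minimal usage sketch (not part of the diff; the temporary file and its contents are made up). Per the new implementation below, `reverse_readfile` detects the file's own line ending instead of relying on `os.linesep`, yields each line with its ending attached, and no longer skips empty lines.

import tempfile
from pathlib import Path

from monty.io import reverse_readfile

# A small Windows-style (CRLF) file with an empty line in the middle.
tmp = Path(tempfile.mkdtemp()) / "example.txt"
tmp.write_bytes(b"first\r\n\r\nlast\r\n")

for line in reverse_readfile(tmp):
    print(repr(line))
# Expected, per the new code: 'last\r\n', '\r\n', 'first\r\n'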
265 changes: 184 additions & 81 deletions src/monty/io.py
@@ -16,7 +16,7 @@
import time
import warnings
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal, cast

if TYPE_CHECKING:
from typing import IO, Any, Generator, Union
@@ -90,6 +90,7 @@ def zopen(
kwargs["encoding"] = "utf-8"

_name, ext = os.path.splitext(filename)

ext = ext.lower()

if ext == ".bz2":
@@ -112,7 +113,64 @@
return open(filename, mode, **kwargs)


def reverse_readfile(filename: Union[str, Path]) -> Generator[str, str, None]:
def _get_line_ending(
file: str
| Path
| io.TextIOWrapper
| io.BufferedReader
| gzip.GzipFile
| bz2.BZ2File,
) -> Literal["\r\n", "\n"]:
"""Helper function to get line ending of a file.
This function assumes the file has a single consistent line ending.
WARNING: per the POSIX standard, a line is "a sequence of zero or
more non-<newline> characters plus a terminating <newline> character",
so this function may fail if the first line lacks a terminating newline.
https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html
Returns:
"\n": Unix line ending.
"\r\n": Windows line ending.
Raises:
ValueError: If line ending is unknown.
Warnings:
If the file is empty, "\n" is returned as the default.
"""
if isinstance(file, (str, Path)):
with zopen(file, "rb") as f:
first_line = f.readline()
elif isinstance(file, io.TextIOWrapper):
first_line = file.buffer.readline() # type: ignore[attr-defined]
elif isinstance(file, (io.BufferedReader, gzip.GzipFile, bz2.BZ2File)):
first_line = file.readline()
else:
raise TypeError(f"Unknown file type {type(file).__name__}")

# Reset pointer to start of file if possible
if hasattr(file, "seek"):
file.seek(0)

# Return Unix "\n" line ending as default if file is empty
if not first_line:
warnings.warn("File is empty, return Unix line ending \n.", stacklevel=2)
return "\n"

if first_line.endswith(b"\r\n"):
return "\r\n"
if first_line.endswith(b"\n"):
return "\n"

# The first line is likely missing a terminating line ending
raise ValueError(f"Unknown line ending in line {repr(first_line)}.")


def reverse_readfile(
filename: Union[str, Path],
) -> Iterator[str]:
"""
A much faster reverse read of file by using Python's mmap to generate a
memory-mapped file. It is slower for very small files than
@@ -125,108 +183,154 @@ def reverse_readfile(filename: Union[str, Path]) -> Generator[str, str, None]:
Yields:
Lines from the file in reverse order.
"""
try:
with zopen(filename, "rb") as file:
if isinstance(file, (gzip.GzipFile, bz2.BZ2File)):
for line in reversed(file.readlines()):
yield line.decode("utf-8").rstrip(os.linesep)
else:
# Get line ending
l_end = _get_line_ending(filename)
len_l_end = len(l_end)

with zopen(filename, "rb") as file:
if isinstance(file, (gzip.GzipFile, bz2.BZ2File)):
for line in reversed(file.readlines()):
# "readlines" would keep the line end character
yield line.decode("utf-8")

else:
try:
filemap = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ)
n = len(filemap)
while n > 0:
i = filemap.rfind(os.linesep.encode(), 0, n)
yield filemap[i + 1 : n].decode("utf-8").rstrip(os.linesep)
n = i
except ValueError:
warnings.warn("trying to mmap an empty file.", stacklevel=2)
return

except ValueError:
return
file_size = len(filemap)
while file_size > 0:
# Find line segment start and end positions
seg_start_pos = filemap.rfind(l_end.encode(), 0, file_size)
seg_end_pos = file_size + len_l_end

# The first line of the file has no line ending before it
if seg_start_pos == -1:
yield (filemap[:seg_end_pos].decode("utf-8"))

# Skip the first match (the last line ending character)
elif file_size != len(filemap):
yield (
filemap[seg_start_pos + len_l_end : seg_end_pos].decode("utf-8")
)
file_size = seg_start_pos


def reverse_readline(
m_file, blk_size: int = 4096, max_mem: int = 4000000
) -> Generator[str, str, None]:
m_file: io.BufferedReader | io.TextIOWrapper | gzip.GzipFile | bz2.BZ2File,
blk_size: int = 4096,
max_mem: int = 4_000_000,
) -> Iterator[str]:
"""
Generator function to read a file line-by-line, but backwards.
This allows one to efficiently get data at the end of a file.
Read a file backwards line-by-line, behaving similarly to
file.readline. This allows one to efficiently
get data from the end of a file.
Read file forwards and reverse in memory for files smaller than the
max_mem parameter, or for gzip files where reverse seeks are not supported.
Supported file stream formats:
- TextIOWrapper (text mode) | BufferedReader (binary mode)
- gzip/bzip2 file stream
Files larger than max_mem are dynamically read backwards.
Cases where the file is read forwards and reversed in RAM:
- If file size is smaller than RAM usage limit (max_mem).
- Gzip files, as reverse seeks are not supported.
Reference:
Based on code by Peter Astrand <[email protected]>, using modifications
by Raymond Hettinger and Kevin German.
http://code.activestate.com/recipes/439045-read-a-text-file-backwards
-yet-another-implementat/
Based on code by Peter Astrand <[email protected]>, using
modifications by Raymond Hettinger and Kevin German.
http://code.activestate.com/recipes/439045-read-a-text-
file-backwards-yet-another-implementat/
Args:
m_file (File): File stream to read (backwards)
blk_size (int): The buffer size. Defaults to 4096.
max_mem (int): The maximum amount of memory to involve in this
operation. This is used to determine when to reverse a file
in-memory versus seeking portions of a file. For bz2 files,
this sets the maximum block size.
m_file: File stream to read (backwards).
blk_size (int): The block size to read each time in bytes.
Defaults to 4096.
max_mem (int): Threshold to determine when to reverse a file
in-memory versus reading blocks of a file each time.
For bz2 files, this sets the block size.
Returns:
Generator that yields lines from the file. Behave similarly to the
file.readline() function, except the lines are returned from the back
of the file.
Yields:
Lines from the back of the file.
Raises:
TypeError: If m_file is a file name rather than a file stream.
Warnings:
If max_mem is smaller than blk_size.
"""
# Check if the file stream is a bit stream or not
is_text = isinstance(m_file, io.TextIOWrapper)

try:
file_size = os.path.getsize(m_file.name)
except AttributeError:
# Bz2 files do not have name attribute. Just set file_size to above
# max_mem for now.
file_size = max_mem + 1

# If the file size is within our desired RAM use, just reverse it in memory
# GZip files must use this method because there is no way to negative seek
# For windows, we also read the whole file.
if file_size < max_mem or isinstance(m_file, gzip.GzipFile) or os.name == "nt":
# Check for illegal usage
if isinstance(m_file, (str, Path)):
raise TypeError("expect a file stream, not file name")

# Detect the line ending
l_end: Literal["\r\n", "\n"] = _get_line_ending(m_file)
len_l_end: Literal[1, 2] = cast(Literal[1, 2], len(l_end))

# Bz2 files do not have "name" attribute, just set to max_mem for now
if hasattr(m_file, "name"):
file_size: int = os.path.getsize(m_file.name)
else:
file_size = max_mem

# If the file size is within desired RAM limit, just reverse it in memory.
# Gzip files must use this method because there is no way to negative seek.
if file_size < max_mem or isinstance(m_file, gzip.GzipFile):
for line in reversed(m_file.readlines()):
yield line.rstrip()
yield line if isinstance(line, str) else cast(bytes, line).decode("utf-8")

else:
# RAM limit should be greater than block size,
# as the file is read into RAM one block at a time.
if max_mem < blk_size:
warnings.warn(f"{max_mem=} smaller than {blk_size=}", stacklevel=2)

# For bz2 files, seek is expensive. It is therefore in our best
# interest to maximize the block size within RAM usage limit.
if isinstance(m_file, bz2.BZ2File):
# for bz2 files, seeks are expensive. It is therefore in our best
# interest to maximize the blk_size within limits of desired RAM
# use.
blk_size = min(max_mem, file_size)

buf = ""
m_file.seek(0, 2)
lastchar = m_file.read(1) if is_text else m_file.read(1).decode("utf-8")
# Check if the file stream is text (instead of binary)
is_text: bool = isinstance(m_file, io.TextIOWrapper)

trailing_newline = lastchar == os.linesep
buffer: str = ""
m_file.seek(0, 2)
skipped_1st_l_end: bool = False

while True:
newline_pos = buf.rfind(os.linesep)
pos = m_file.tell()
if newline_pos != -1:
# Found a newline
line = buf[newline_pos + 1 :]
buf = buf[:newline_pos]
if pos or newline_pos or trailing_newline:
line += os.linesep
yield line

elif pos:
# Need to fill buffer
toread = min(blk_size, pos)
m_file.seek(pos - toread, 0)
l_end_pos: int = buffer.rfind(l_end)
# Pointer position (also size of remaining file)
pt_pos: int = m_file.tell()

# Line ending found within buffer
if l_end_pos != -1:
line = buffer[l_end_pos + len_l_end :]
buffer = buffer[:l_end_pos] # buffer doesn't include l_end

# Skip first match (the last line ending)
if skipped_1st_l_end:
yield line + l_end
else:
skipped_1st_l_end = True

# Line ending not in current buffer, load next block into the buffer
elif pt_pos > 0:
to_read: int = min(blk_size, pt_pos)
m_file.seek(pt_pos - to_read)
if is_text:
buf = m_file.read(toread) + buf
buffer = cast(str, m_file.read(to_read)) + buffer
else:
buf = m_file.read(toread).decode("utf-8") + buf
m_file.seek(pos - toread, 0)
if pos == toread:
buf = os.linesep + buf
buffer = cast(bytes, m_file.read(to_read)).decode("utf-8") + buffer

# Move the pointer back to the start of the block just read
m_file.seek(pt_pos - to_read)

else:
# Start-of-file
# Add a l_end to the start of file
if pt_pos == to_read:
buffer = l_end + buffer

# Start of file
else: # l_end_pos == -1 and pt_pos == 0
return
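A minimal usage sketch of the reworked `reverse_readline` (illustrative; the log file name is an assumption). Per the new code above, it expects an open file stream rather than a file name and yields lines from the end of the file with their line endings attached:

from monty.io import reverse_readline

# Grab the last three lines of a text file; files larger than max_mem are
# read backwards block by block instead of being loaded into RAM at once.
with open("large_log.txt") as f:  # a TextIOWrapper stream, not a file name
    last_three = []
    for line in reverse_readline(f):
        last_three.append(line)
        if len(last_three) == 3:
            break
print(last_three)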


@@ -328,8 +432,7 @@ def get_open_fds() -> int:
"""
Get the number of open file descriptors for current process.
Warnings:
Will only work on UNIX-like OS-es.
Warning, this will only work on UNIX-like OS.
Returns:
int: The number of open file descriptors for current process.
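A quick usage sketch (illustrative; per the docstring this only works on UNIX-like systems, and the file path is a made-up example):

from monty.io import get_open_fds

print("open file descriptors:", get_open_fds())

fh = open("/etc/hostname")  # hypothetical path, for illustration only
print("after opening one more file:", get_open_fds())
fh.close()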
2 changes: 1 addition & 1 deletion src/monty/json.py
@@ -37,7 +37,7 @@
try:
import orjson
except ImportError:
orjson = None
orjson = None # type: ignore[assignment]


__version__ = "3.0.0"
2 changes: 1 addition & 1 deletion src/monty/re.py
@@ -62,5 +62,5 @@ def regrep(

with contextlib.suppress(Exception):
# Try to close open file handle. Pass if it is a generator.
gen.close() # type: ignore[attr-defined]
gen.close() # type: ignore[attr-defined, union-attr]
return matches
2 changes: 1 addition & 1 deletion tests/test_files/3000_lines.txt
@@ -2997,4 +2997,4 @@
2997
2998
2999
3000
3000
Binary file modified tests/test_files/3000_lines.txt.bz2
Binary file modified tests/test_files/3000_lines.txt.gz
Binary file removed tests/test_files/3000lines.txt.gz
