-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #712 from DanielYang59/readline-line-ending
Fix line ending handling in `reverse_readfile/readline` across OS, and not skipping empty lines
- Loading branch information
Showing
11 changed files
with
482 additions
and
185 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,7 @@ | |
import time | ||
import warnings | ||
from pathlib import Path | ||
from typing import TYPE_CHECKING | ||
from typing import TYPE_CHECKING, Literal, cast | ||
|
||
if TYPE_CHECKING: | ||
from typing import IO, Any, Generator, Union | ||
|
@@ -90,6 +90,7 @@ def zopen( | |
kwargs["encoding"] = "utf-8" | ||
|
||
_name, ext = os.path.splitext(filename) | ||
|
||
ext = ext.lower() | ||
|
||
if ext == ".bz2": | ||
|
@@ -112,7 +113,64 @@ def zopen( | |
return open(filename, mode, **kwargs) | ||
|
||
|
||
def reverse_readfile(filename: Union[str, Path]) -> Generator[str, str, None]: | ||
def _get_line_ending( | ||
file: str | ||
| Path | ||
| io.TextIOWrapper | ||
| io.BufferedReader | ||
| gzip.GzipFile | ||
| bz2.BZ2File, | ||
) -> Literal["\r\n", "\n"]: | ||
"""Helper function to get line ending of a file. | ||
This function assumes the file has a single consistent line ending. | ||
WARNING: as per the POSIX standard, a line is: "A sequence of zero or | ||
more non-<newline> characters plus a terminating <newline> char.", as such | ||
this func might fail if the only line misses a terminating newline character. | ||
https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html | ||
Returns: | ||
"\n": Unix line ending. | ||
"\r\n": Windows line ending. | ||
Raises: | ||
ValueError: If line ending is unknown. | ||
Warnings: | ||
If file is empty, "\n" would be used as default. | ||
""" | ||
if isinstance(file, (str, Path)): | ||
with zopen(file, "rb") as f: | ||
first_line = f.readline() | ||
elif isinstance(file, io.TextIOWrapper): | ||
first_line = file.buffer.readline() # type: ignore[attr-defined] | ||
elif isinstance(file, (io.BufferedReader, gzip.GzipFile, bz2.BZ2File)): | ||
first_line = file.readline() | ||
else: | ||
raise TypeError(f"Unknown file type {type(file).__name__}") | ||
|
||
# Reset pointer to start of file if possible | ||
if hasattr(file, "seek"): | ||
file.seek(0) | ||
|
||
# Return Unix "\n" line ending as default if file is empty | ||
if not first_line: | ||
warnings.warn("File is empty, return Unix line ending \n.", stacklevel=2) | ||
return "\n" | ||
|
||
if first_line.endswith(b"\r\n"): | ||
return "\r\n" | ||
if first_line.endswith(b"\n"): | ||
return "\n" | ||
|
||
# It's likely the line is missing a line ending for the first line | ||
raise ValueError(f"Unknown line ending in line {repr(first_line)}.") | ||
|
||
|
||
def reverse_readfile( | ||
filename: Union[str, Path], | ||
) -> Iterator[str]: | ||
""" | ||
A much faster reverse read of file by using Python's mmap to generate a | ||
memory-mapped file. It is slower for very small files than | ||
|
@@ -125,108 +183,154 @@ def reverse_readfile(filename: Union[str, Path]) -> Generator[str, str, None]: | |
Yields: | ||
Lines from the file in reverse order. | ||
""" | ||
try: | ||
with zopen(filename, "rb") as file: | ||
if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): | ||
for line in reversed(file.readlines()): | ||
yield line.decode("utf-8").rstrip(os.linesep) | ||
else: | ||
# Get line ending | ||
l_end = _get_line_ending(filename) | ||
len_l_end = len(l_end) | ||
|
||
with zopen(filename, "rb") as file: | ||
if isinstance(file, (gzip.GzipFile, bz2.BZ2File)): | ||
for line in reversed(file.readlines()): | ||
# "readlines" would keep the line end character | ||
yield line.decode("utf-8") | ||
|
||
else: | ||
try: | ||
filemap = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) | ||
n = len(filemap) | ||
while n > 0: | ||
i = filemap.rfind(os.linesep.encode(), 0, n) | ||
yield filemap[i + 1 : n].decode("utf-8").rstrip(os.linesep) | ||
n = i | ||
except ValueError: | ||
warnings.warn("trying to mmap an empty file.", stacklevel=2) | ||
return | ||
|
||
except ValueError: | ||
return | ||
file_size = len(filemap) | ||
while file_size > 0: | ||
# Find line segment start and end positions | ||
seg_start_pos = filemap.rfind(l_end.encode(), 0, file_size) | ||
seg_end_pos = file_size + len_l_end | ||
|
||
# The first (originally) line doesn't have an ending character at its head | ||
if seg_start_pos == -1: | ||
yield (filemap[:seg_end_pos].decode("utf-8")) | ||
|
||
# Skip the first match (the last line ending character) | ||
elif file_size != len(filemap): | ||
yield ( | ||
filemap[seg_start_pos + len_l_end : seg_end_pos].decode("utf-8") | ||
) | ||
file_size = seg_start_pos | ||
|
||
|
||
def reverse_readline( | ||
m_file, blk_size: int = 4096, max_mem: int = 4000000 | ||
) -> Generator[str, str, None]: | ||
m_file: io.BufferedReader | io.TextIOWrapper | gzip.GzipFile | bz2.BZ2File, | ||
blk_size: int = 4096, | ||
max_mem: int = 4_000_000, | ||
) -> Iterator[str]: | ||
""" | ||
Generator function to read a file line-by-line, but backwards. | ||
This allows one to efficiently get data at the end of a file. | ||
Read a file backwards line-by-line, and behave similarly to | ||
the file.readline function. This allows one to efficiently | ||
get data from the end of a file. | ||
Read file forwards and reverse in memory for files smaller than the | ||
max_mem parameter, or for gzip files where reverse seeks are not supported. | ||
Supported file stream formats: | ||
- TextIOWrapper (text mode) | BufferedReader (binary mode) | ||
- gzip/bzip2 file stream | ||
Files larger than max_mem are dynamically read backwards. | ||
Cases where file would be read forwards and reversed in RAM: | ||
- If file size is smaller than RAM usage limit (max_mem). | ||
- Gzip files, as reverse seeks are not supported. | ||
Reference: | ||
Based on code by Peter Astrand <[email protected]>, using modifications | ||
by Raymond Hettinger and Kevin German. | ||
http://code.activestate.com/recipes/439045-read-a-text-file-backwards | ||
-yet-another-implementat/ | ||
Based on code by Peter Astrand <[email protected]>, using | ||
modifications by Raymond Hettinger and Kevin German. | ||
http://code.activestate.com/recipes/439045-read-a-text- | ||
file-backwards-yet-another-implementat/ | ||
Args: | ||
m_file (File): File stream to read (backwards) | ||
blk_size (int): The buffer size. Defaults to 4096. | ||
max_mem (int): The maximum amount of memory to involve in this | ||
operation. This is used to determine when to reverse a file | ||
in-memory versus seeking portions of a file. For bz2 files, | ||
this sets the maximum block size. | ||
m_file: File stream to read (backwards). | ||
blk_size (int): The block size to read each time in bytes. | ||
Defaults to 4096. | ||
max_mem (int): Threshold to determine when to reverse a file | ||
in-memory versus reading blocks of a file each time. | ||
For bz2 files, this sets the block size. | ||
Returns: | ||
Generator that yields lines from the file. Behave similarly to the | ||
file.readline() function, except the lines are returned from the back | ||
of the file. | ||
Yields: | ||
Lines from the back of the file. | ||
Raises: | ||
TypeError: If m_file is the name of the file (expect file stream). | ||
Warnings: | ||
If max_mem is smaller than blk_size. | ||
""" | ||
# Check if the file stream is a bit stream or not | ||
is_text = isinstance(m_file, io.TextIOWrapper) | ||
|
||
try: | ||
file_size = os.path.getsize(m_file.name) | ||
except AttributeError: | ||
# Bz2 files do not have name attribute. Just set file_size to above | ||
# max_mem for now. | ||
file_size = max_mem + 1 | ||
|
||
# If the file size is within our desired RAM use, just reverse it in memory | ||
# GZip files must use this method because there is no way to negative seek | ||
# For windows, we also read the whole file. | ||
if file_size < max_mem or isinstance(m_file, gzip.GzipFile) or os.name == "nt": | ||
# Check for illegal usage | ||
if isinstance(m_file, (str, Path)): | ||
raise TypeError("expect a file stream, not file name") | ||
|
||
# Generate line ending | ||
l_end: Literal["\r\n", "\n"] = _get_line_ending(m_file) | ||
len_l_end: Literal[1, 2] = cast(Literal[1, 2], len(l_end)) | ||
|
||
# Bz2 files do not have "name" attribute, just set to max_mem for now | ||
if hasattr(m_file, "name"): | ||
file_size: int = os.path.getsize(m_file.name) | ||
else: | ||
file_size = max_mem | ||
|
||
# If the file size is within desired RAM limit, just reverse it in memory. | ||
# Gzip files must use this method because there is no way to negative seek. | ||
if file_size < max_mem or isinstance(m_file, gzip.GzipFile): | ||
for line in reversed(m_file.readlines()): | ||
yield line.rstrip() | ||
yield line if isinstance(line, str) else cast(bytes, line).decode("utf-8") | ||
|
||
else: | ||
# RAM limit should be greater than block size, | ||
# as file is read into RAM one block each time. | ||
if max_mem < blk_size: | ||
warnings.warn(f"{max_mem=} smaller than {blk_size=}", stacklevel=2) | ||
|
||
# For bz2 files, seek is expensive. It is therefore in our best | ||
# interest to maximize the block size within RAM usage limit. | ||
if isinstance(m_file, bz2.BZ2File): | ||
# for bz2 files, seeks are expensive. It is therefore in our best | ||
# interest to maximize the blk_size within limits of desired RAM | ||
# use. | ||
blk_size = min(max_mem, file_size) | ||
|
||
buf = "" | ||
m_file.seek(0, 2) | ||
lastchar = m_file.read(1) if is_text else m_file.read(1).decode("utf-8") | ||
# Check if the file stream is text (instead of binary) | ||
is_text: bool = isinstance(m_file, io.TextIOWrapper) | ||
|
||
trailing_newline = lastchar == os.linesep | ||
buffer: str = "" | ||
m_file.seek(0, 2) | ||
skipped_1st_l_end: bool = False | ||
|
||
while True: | ||
newline_pos = buf.rfind(os.linesep) | ||
pos = m_file.tell() | ||
if newline_pos != -1: | ||
# Found a newline | ||
line = buf[newline_pos + 1 :] | ||
buf = buf[:newline_pos] | ||
if pos or newline_pos or trailing_newline: | ||
line += os.linesep | ||
yield line | ||
|
||
elif pos: | ||
# Need to fill buffer | ||
toread = min(blk_size, pos) | ||
m_file.seek(pos - toread, 0) | ||
l_end_pos: int = buffer.rfind(l_end) | ||
# Pointer position (also size of remaining file) | ||
pt_pos: int = m_file.tell() | ||
|
||
# Line ending found within buffer | ||
if l_end_pos != -1: | ||
line = buffer[l_end_pos + len_l_end :] | ||
buffer = buffer[:l_end_pos] # buffer doesn't include l_end | ||
|
||
# Skip first match (the last line ending) | ||
if skipped_1st_l_end: | ||
yield line + l_end | ||
else: | ||
skipped_1st_l_end = True | ||
|
||
# Line ending not in current buffer, load next block into the buffer | ||
elif pt_pos > 0: | ||
to_read: int = min(blk_size, pt_pos) | ||
m_file.seek(pt_pos - to_read) | ||
if is_text: | ||
buf = m_file.read(toread) + buf | ||
buffer = cast(str, m_file.read(to_read)) + buffer | ||
else: | ||
buf = m_file.read(toread).decode("utf-8") + buf | ||
m_file.seek(pos - toread, 0) | ||
if pos == toread: | ||
buf = os.linesep + buf | ||
buffer = cast(bytes, m_file.read(to_read)).decode("utf-8") + buffer | ||
|
||
# Move pointer forward | ||
m_file.seek(pt_pos - to_read) | ||
|
||
else: | ||
# Start-of-file | ||
# Add a l_end to the start of file | ||
if pt_pos == to_read: | ||
buffer = l_end + buffer | ||
|
||
# Start of file | ||
else: # l_end_pos == -1 and pt_post == 0 | ||
return | ||
|
||
|
||
|
@@ -328,8 +432,7 @@ def get_open_fds() -> int: | |
""" | ||
Get the number of open file descriptors for current process. | ||
Warnings: | ||
Will only work on UNIX-like OS-es. | ||
Warning, this will only work on UNIX-like OS. | ||
Returns: | ||
int: The number of open file descriptors for current process. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2997,4 +2997,4 @@ | |
2997 | ||
2998 | ||
2999 | ||
3000 | ||
3000 |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Oops, something went wrong.