From 2a632932e4ebc61ec9ef03d914c400cf9ac807e7 Mon Sep 17 00:00:00 2001 From: Edward Hope-Morley Date: Sun, 27 Aug 2023 13:38:49 +0100 Subject: [PATCH] Consistency cleanup --- searchkit/constraints.py | 292 +++++++++++++++++---------------------- 1 file changed, 123 insertions(+), 169 deletions(-) diff --git a/searchkit/constraints.py b/searchkit/constraints.py index e0cf055..6b823ac 100644 --- a/searchkit/constraints.py +++ b/searchkit/constraints.py @@ -153,23 +153,23 @@ def _line_date_is_valid(self, extracted_datetime): return True -class NoMatchingLogLineWithDate(Exception): +class ValidLinesNotFound(Exception): """Raised when a log file contains proper timestamps but no log lines after the since date.""" -class NoDateFoundInLogs(Exception): +class ValidFormattedDateNotFound(Exception): """Raised when a log file does not contain any line with date suitable to specified date format""" -class DateSearchFailedAtOffset(Exception): +class DateNotFoundInLine(Exception): """Raised when searcher has encountered a line with no date and performed forward-backward searches, but still yet, could not found a line with date.""" -class UncheckedAccess(Exception): +class InvalidSearchState(Exception): """Raised when a variable dependent on another variable (e.g. the variable x only has value when y is True) is accessed without checking the prerequisite variable.""" @@ -181,11 +181,15 @@ class FindTokenStatus(Enum): FAILED = 3 -class FindTokenResult(object): - def __init__(self, status: FindTokenStatus, found_offset=0, read_bytes=0): +class SearchState(object): + def __init__(self, status: FindTokenStatus, offset=0): + """ + @param status: current status of search + @param offset: current position in file from which next search will be + started. + """ self._status = status - self._found_offset = found_offset - self._read_bytes = read_bytes + self._offset = offset @property def status(self): @@ -194,21 +198,14 @@ def status(self): @property def offset(self): if self.status == FindTokenStatus.FAILED: - raise UncheckedAccess() - return self._found_offset + raise InvalidSearchState() - @property - def read_bytes(self): - if self.status == FindTokenStatus.FAILED: - raise UncheckedAccess() - return self._read_bytes + return self._offset -class PeekFile(): - """A context manager class that allows peeking the - file by keeping the initial read position and returning - to it back. - """ +class FileReadNonDestructive(object): + """ Context manager class that saves current position at start and restores + once finished. """ def __init__(self, file): self.file = file self.original_position = file.tell() @@ -221,11 +218,10 @@ def __exit__(self, exc_type, exc_value, exc_traceback): class LogLine(object): - """A class that represents a line in a log file. + """Class representing a line in a log file. - The class only keeps the start/end offsets of the line and - the line itself is lazily loaded on demand (i.e. by calling - the `text` function). + Keeps the start/end offsets of the line and line content is lazy-loaded on + demand (i.e. by calling the `text` function). 
""" def __init__(self, file, constraint, line_start_lf, line_end_lf): @@ -249,6 +245,7 @@ def start_offset(self): # (being the \n) if self.start_lf.status == FindTokenStatus.FOUND: return self.start_lf.offset + 1 + return self.start_lf.offset @property @@ -259,6 +256,7 @@ def end_offset(self): # the \n) if self.end_lf.status == FindTokenStatus.FOUND: return self.end_lf.offset - 1 + return self.end_lf.offset @property @@ -278,9 +276,7 @@ def date(self): The function will use extracted_datetime function to parse the date/time. - Returns: - datetime: if `text` contains a valid datetime - None: otherwise + @return: datetime: if `text` contains a valid datetime otherwise None. """ return self._constraint.extracted_datetime(self.text) @@ -292,105 +288,95 @@ def text(self): on demand. The function will revert the file offset back after reading to where it was before. - Returns: - str: The line text + @return: the line text string """ - with PeekFile(self._file) as f: + with FileReadNonDestructive(self._file) as f: f.seek(self.start_offset) line_text = f.read(len(self)) return line_text -class LogFileDateSinceOffsetSeeker: - """This class allows user to perform `since` date lookups with - file offsets. This is useful for performing line-based binary - date searches on a log file. +class LogFileDateSinceOffsetSeeker(object): + """ This class performs `since` date lookups with file offsets. This is + useful for performing line-based binary date searches on a log file. - The class implements __len__ and __getitem__ methods in order to - behave like a list. When __getitem__ is called with ane offset, - the algorithm locates the rightmost and leftmost line feed (`\n`) - to form a line. Assume the following file contents: + Implements __len__ and __getitem__ methods in order to behave like a list. + When __getitem__ is called with an offset the algorithm locates the + rightmost and leftmost line feed '\n' to form a line. For example with the + following file contents: 13:15 AAAAAA\n13:16 BBBBBBB\n13:17 CCCCCC - ... and let's assume that the __getitem__ function is called with - offset `19`: + and assuming __getitem__ is called with offset 19 i.e. 13:15 AAAAAA\n13:16 BBBBBBB\n13:17 CCCCCC ^19 - The algorithm first will read SEEK_HORIZON bytes forward, starting + The algorithm will first read SEEK_HORIZON bytes forward, starting from offset `19`, and then try to find the first line feed: 13:15 AAAAAA\n13:16 BBBBBBB\n13:17 CCCCCC ^19 ^r-lf - Consequently, the algorithm will seek SEEK_HORIZON bytes backward, - starting from offset `19`, read SEEK_HORIZON bytes and then try to - find the first line feed, scanning in reverse: + Then the algorithm will seek SEEK_HORIZON bytes backward, starting from + offset 19, read SEEK_HORIZON bytes and then try to find the first line feed + scanning in reverse: 13:15 AAAAAA\n13:16 BBBBBBB\n13:17 CCCCCC ^l-lf ^19 ^r-lf - Then, the algoritm will extract the characters between l-lf and r-lf + Then, the algorithm will extract the characters between l-lf and r-lf to form a line. The line will be checked against the date matcher - to extract the date. If the date matcher yields a valid date, the - __getitem__ function will return that date. Otherwise, the search will - be extended to other nearby lines, prioritizing the lines prior to the - current, until either: + to extract the date. If the date matcher yields a valid date, __getitem__ + will return that date. 
Otherwise, the search will be extended to other + nearby lines, prioritising the lines prior to the current, until either of + the following is true: - - a line with timestamp found, or - - MAX_*_FALLBACK_LINES has reached. + - a line with a timestamp is found + - MAX_*_FALLBACK_LINES has been reached """ - # Amount of characters to read while searching + # Number of characters to read while searching SEEK_HORIZON = 256 - # How many times we can expand the search - # horizon while trying to find a line feed. - # This means the search will read SEEK_HORIZON - # times MAX_SEEK_HORIZON_EXPAND bytes in total - # when a line feed character is not found. + # How many times we can expand the search horizon while trying to find a + # line feed. This means the search will read SEEK_HORIZON times + # MAX_SEEK_HORIZON_EXPAND bytes in total when a line feed character is not + # found. MAX_SEEK_HORIZON_EXPAND = 100 - # How many lines should we search forwards utmost - # when the algorithm encounters lines with no date. + # Number of lines to search forwards when the algorithm encounters lines + # with no date. MAX_FWD_FALLBACK_LINES = 500 - # How many lines should we search backwards utmost - # when the algorithm encounters lines with no date. + # Number of lines to search backwards when the algorithm encounters lines + # with no date. MAX_RWD_FALLBACK_LINES = 500 - def __init__(self, fd, c) -> None: + LF_TOKEN = b"\n" + + def __init__(self, fd, c): self.file = fd self.constraint = c self.line_info = None self.found_any_date = False - def find_token_reverse( - self, - file, - start_offset, - horizon, - token=b"\n", - max_iterations=MAX_SEEK_HORIZON_EXPAND, - ): - r"""Find `token` in `file` starting from `start_offset` and backing off + def find_token_reverse(self, file, start_offset, horizon, + max_iterations=MAX_SEEK_HORIZON_EXPAND): + r"""Find LF in `file` starting from `start_offset` and backing off `horizon`bytes on each iteration for maximum of `max_iterations` times. - Args: - file (file): File descriptor, open in read mode - start_offset (int): start offset of search - horizon (int): Amount of bytes to be processed on each step. - token (str, optional): Search token. Defaults to `\n`. - max_iterations (int, optional): Maximum amount of search - iterations. Defaults to 100. + @param file (file): File descriptor, open in read mode + @param start_offset (int): start offset of search + @param horizon (int): Number of bytes to be processed on each step. + @param max_iterations (int, optional): Maximum number of search + iterations. Defaults to 100. - Returns: - FindDelimiterResult(FindTokenStatus.FOUND, ...) if token is found - FindDelimiterResult(FindTokenStatus.REACHED_EOF, ...) if token is + @return: + SearchState(FindTokenStatus.FOUND, ...) if LF is found + SearchState(FindTokenStatus.REACHED_EOF, ...) if LF is not found because the scan reached the EOF - FindDelimiterResult(FindTokenStatus.FAILED, ...) if token is + SearchState(FindTokenStatus.FAILED, ...) if LF is not found because scan exhausted `max_iterations` """ @@ -405,58 +391,42 @@ def find_token_reverse( file.seek(read_offset) chunk = file.read(read_size) if not chunk or len(chunk) == 0: - # We've reached to start of the file and - # could not find the token. - return FindTokenResult( - status=FindTokenStatus.REACHED_EOF, - found_offset=0, - read_bytes=0) - - chunk_offset = chunk.rfind(token) + # We've reached the start of the file and could not find the + # token.
+ return SearchState(status=FindTokenStatus.REACHED_EOF, + offset=0) + chunk_offset = chunk.rfind(self.LF_TOKEN) if chunk_offset == -1: current_offset = current_offset - len(chunk) max_iterations -= 1 if (start_offset + current_offset) < 0: - return FindTokenResult( - status=FindTokenStatus.REACHED_EOF, - found_offset=0, - read_bytes=start_offset + current_offset) + return SearchState(status=FindTokenStatus.REACHED_EOF, + offset=0) continue - read_bytes = start_offset + current_offset - len(chunk) - return FindTokenResult( - status=FindTokenStatus.FOUND, - found_offset=read_offset + chunk_offset, - read_bytes=read_bytes if read_bytes > 0 else 0) - - return FindTokenResult(FindTokenStatus.FAILED) - - def find_token( - self, - file, - start_offset, - horizon, - token=b"\n", - max_iterations=MAX_SEEK_HORIZON_EXPAND, - ): + return SearchState(status=FindTokenStatus.FOUND, + offset=read_offset + chunk_offset) + + return SearchState(FindTokenStatus.FAILED) + + def find_token(self, file, start_offset, horizon, + max_iterations=MAX_SEEK_HORIZON_EXPAND): r"""Find `token` in `file` starting from `start_offset` and moving forward `horizon` bytes on each iteration for maximum of `max_iterations` times. - Args: - file (file): File descriptor, open in read mode - start_offset (int): start offset of search - horizon (int): Amount of bytes to be processed on each step. - token (str, optional): Search token. Defaults to "\n". - max_iterations (int, optional): Maximum amount of search - iterations. Defaults to 100. + @param file (file): File descriptor, open in read mode + @param start_offset (int): start offset of search + @param horizon (int): Number of bytes to be processed on each step. + @param max_iterations (int, optional): Maximum number of search + iterations. Defaults to 100. - Returns: - FindDelimiterResult(FindTokenStatus.FOUND, ...) if token is found - FindDelimiterResult(FindTokenStatus.REACHED_EOF, ...) if token is + @return: + SearchState(FindTokenStatus.FOUND, ...) if LF is found + SearchState(FindTokenStatus.REACHED_EOF, ...) if LF is not found because the scan reached the EOF - FindDelimiterResult(FindTokenStatus.FAILED, ...) if token is + SearchState(FindTokenStatus.FAILED, ...) if LF is not found because scan exhausted `max_iterations` """ @@ -469,33 +439,30 @@ def find_token( if not chunk or len(chunk) == 0: # Reached end of file - return FindTokenResult( - status=FindTokenStatus.REACHED_EOF, - found_offset=len(self), - read_bytes=len(self) - ) + return SearchState(status=FindTokenStatus.REACHED_EOF, + offset=len(self)) - chunk_offset = chunk.find(token) + chunk_offset = chunk.find(self.LF_TOKEN) if chunk_offset == -1: - # We failed to find the token in the chunk. + # We failed to find the LF in the chunk. # Progress the current offset forward by # chunk's length. current_offset = current_offset + len(chunk) max_iterations -= 1 continue - # We've found the token in the chunk. + + # We've found the LF in the chunk. # As the chunk_offset is a relative offset to the chunk # translate it to file offset while returning. - return FindTokenResult( - status=FindTokenStatus.FOUND, - found_offset=start_offset + current_offset + chunk_offset, - read_bytes=start_offset + current_offset + len(chunk) - ) + return SearchState(status=FindTokenStatus.FOUND, + offset=(start_offset + current_offset + + chunk_offset)) + # Reached max_iterations and found nothing.
- return FindTokenResult(FindTokenStatus.FAILED) + return SearchState(FindTokenStatus.FAILED) def try_find_line(self, epicenter, slf_off=None, elf_off=None): - r"""Try to find a line at `epicenter`. This function allows extracting + """ Try to find a line at `epicenter`. This function allows extracting the corresponding line from a file offset. "Line" is a string between two line feed characters i.e.; @@ -576,8 +543,7 @@ def try_find_line(self, epicenter, slf_off=None, elf_off=None): # ^epicenter ^line end lf line_end_lf = self.find_token( self.file, epicenter, LogFileDateSinceOffsetSeeker.SEEK_HORIZON - ) if elf_off is None else FindTokenResult( - FindTokenStatus.FOUND, elf_off, elf_off) + ) if elf_off is None else SearchState(FindTokenStatus.FOUND, elf_off) if line_end_lf.status == FindTokenStatus.FAILED: raise ValueError("Could not find ending line feed " @@ -588,8 +554,7 @@ def try_find_line(self, epicenter, slf_off=None, elf_off=None): # line start lf ^ ^epicenter line_start_lf = self.find_token_reverse( self.file, epicenter, LogFileDateSinceOffsetSeeker.SEEK_HORIZON - ) if slf_off is None else FindTokenResult( - FindTokenStatus.FOUND, slf_off, slf_off) + ) if slf_off is None else SearchState(FindTokenStatus.FOUND, slf_off) if line_start_lf.status == FindTokenStatus.FAILED: raise ValueError("Could not find start line feed " @@ -603,15 +568,11 @@ def try_find_line(self, epicenter, slf_off=None, elf_off=None): # Ensure that end lf offset is >= start lf offset assert line_end_lf.offset >= line_start_lf.offset - return LogLine( - file=self.file, - constraint=self.constraint, - line_start_lf=line_start_lf, - line_end_lf=line_end_lf) + return LogLine(file=self.file, constraint=self.constraint, + line_start_lf=line_start_lf, line_end_lf=line_end_lf) - def try_find_line_w_date_for( - self, how_many_lines, start_offset, prev_offset=None, forwards=True - ): + def try_find_line_w_date_for(self, how_many_lines, start_offset, + prev_offset=None, forwards=True): """Try to fetch a line with date, starting from `start_offset`. The algorithm will try to fetch a new line searching for a valid date @@ -638,11 +599,9 @@ def try_find_line_w_date_for( offset = start_offset log_line = None while how_many_lines > 0: - log_line = self.try_find_line( - offset, - (None, prev_offset)[forwards], - (None, prev_offset)[not forwards] - ) + log_line = self.try_find_line(offset, + (None, prev_offset)[forwards], + (None, prev_offset)[not forwards]) log.debug( " > TRY_FETCH, REMAINING_ATTEMPTS:%d, START_LF_OFFSET: %d, " @@ -668,7 +627,7 @@ def try_find_line_w_date_for( return None def __len__(self): - with PeekFile(self.file) as f: + with FileReadNonDestructive(self.file) as f: return f.seek(0, 2) def __getitem__(self, offset): @@ -750,15 +709,10 @@ def __getitem__(self, offset): functions, so therefore it only returns the `date` for the comparison. - Args: - offset (int): Lookup offset - - Raises: - DateSearchFailedAtOffset: When a line with a date could not - be found. - - Returns: - date: Date of the line at `offset` + @param offset (int): Lookup offset + @raise DateNotFoundInLine: When a line with a date could not + be found. + @return: Date of the line at `offset` """ log.debug("-------------------------------------------") @@ -799,7 +753,7 @@ def __getitem__(self, offset): log.debug("######### FORWARDS SEARCH END #########") if not result or result.date is None: - raise DateSearchFailedAtOffset( + raise DateNotFoundInLine( f"Date search failed at offset `{offset}`") # This is mostly for diagnostics. 
If we could not find @@ -826,9 +780,9 @@ def run(self): bisect.bisect_left(self, self.constraint._since_date) if not self.found_any_date: - raise NoDateFoundInLogs + raise ValidFormattedDateNotFound if not self.line_info: - raise NoMatchingLogLineWithDate + raise ValidLinesNotFound log.debug( "RUN END, FOUND LINE(START:%d, END:%d, CONTENT:%s)", @@ -979,15 +933,15 @@ def apply_to_file(self, fd, destructive=True): self._results[fd.name] = None else: self._results[fd.name] = result[0] - except NoDateFoundInLogs: + except ValidFormattedDateNotFound: log.debug("c:%s No timestamp found in file", self.id) fd.seek(0) return fd.tell() - except NoMatchingLogLineWithDate: + except ValidLinesNotFound: log.debug("c:%s No date after found in file", self.id) fd.seek(0, 2) return fd.tell() - except DateSearchFailedAtOffset as ed: + except DateNotFoundInLine as ed: log.debug("c:%s Expanded date search failed for a line: %s", self.id, ed) fd.seek(0)
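
Reviewer note: the idea behind LogFileDateSinceOffsetSeeker is that a chronologically ordered log file can be exposed as a bisect-able sequence, where __len__ reports the file size in bytes and __getitem__(offset) reports the timestamp of the line containing that byte offset, so bisect.bisect_left finds the first line at or after the since date. The sketch below is a minimal standalone illustration of that idea under stated assumptions, not searchkit code: the DateSeeker name, the fixed "%Y-%m-%d %H:%M:%S" timestamp format and the byte-by-byte backtracking are illustrative simplifications (the patched class scans in SEEK_HORIZON-sized chunks and falls back to neighbouring lines when a line carries no date).

import bisect
from datetime import datetime


class DateSeeker(object):
    """Expose a chronologically ordered log file as a bisect-able sequence.

    Each index is a byte offset; its value is the timestamp of the line
    containing that offset, parsed from the start of the line.
    """

    def __init__(self, path, fmt="%Y-%m-%d %H:%M:%S", date_len=19):
        self.path = path
        self.fmt = fmt
        self.date_len = date_len

    def __len__(self):
        # File size in bytes i.e. the range of offsets bisect may probe.
        with open(self.path, 'rb') as f:
            return f.seek(0, 2)

    def __getitem__(self, offset):
        with open(self.path, 'rb') as f:
            # Walk backwards from `offset` to the start of the current line.
            start = offset
            while start > 0:
                f.seek(start - 1)
                if f.read(1) == b'\n':
                    break

                start -= 1

            f.seek(start)
            line = f.readline().decode()

        # Assumes every line starts with a timestamp; the patched class
        # instead falls back to neighbouring lines when that is not the case.
        return datetime.strptime(line[:self.date_len], self.fmt)


# Byte offset of the first line whose timestamp is >= since:
#   seeker = DateSeeker('example.log')
#   offset = bisect.bisect_left(seeker, datetime(2023, 8, 27, 13, 16, 0))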
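
Similarly, find_token/find_token_reverse are bounded, chunked scans for the next/previous line feed around a probe offset. The sketch below shows the forward variant in isolation and is an illustration only: find_lf_forward is a hypothetical name, it expects a file opened in binary mode, and it returns a plain (status, offset) tuple rather than the SearchState class introduced by this patch.

from enum import Enum


class FindTokenStatus(Enum):
    FOUND = 1
    REACHED_EOF = 2
    FAILED = 3


def find_lf_forward(f, start_offset, horizon=256, max_iterations=100):
    r"""Scan forward from `start_offset` for the next b'\n', reading
    `horizon` bytes per iteration for at most `max_iterations` iterations."""
    current = 0
    while max_iterations > 0:
        f.seek(start_offset + current)
        chunk = f.read(horizon)
        if not chunk:
            # Ran off the end of the file without finding a line feed.
            return FindTokenStatus.REACHED_EOF, f.seek(0, 2)

        pos = chunk.find(b'\n')
        if pos == -1:
            # No line feed in this chunk; extend the search window.
            current += len(chunk)
            max_iterations -= 1
            continue

        # Translate the chunk-relative hit into an absolute file offset.
        return FindTokenStatus.FOUND, start_offset + current + pos

    return FindTokenStatus.FAILED, None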