Skip to content

Commit

Permalink
fixup logs and some code style
Browse files Browse the repository at this point in the history
  • Loading branch information
dosaboy committed Sep 16, 2023
1 parent 12a6c36 commit d8dd0cd
Showing 1 changed file with 75 additions and 76 deletions.
151 changes: 75 additions & 76 deletions searchkit/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,18 @@ class ValidFormattedDateNotFound(Exception):
date suitable to specified date format"""


class DateNotFoundInLine(Exception):
class LineWithDateNotFound(Exception):
"""Raised when searcher has encountered a line with no date
and performed forward-backward searches, but still yet, could
not found a line with date."""


class MaxSearchableLineLengthReached(Exception):
""" Raises when we exceed the number of characters we can search in a line
before finding a line feed.
"""


class InvalidSearchState(Exception):
"""Raised when a variable dependent on another variable (e.g.
the variable x only has value when y is True) is accessed without
Expand Down Expand Up @@ -356,14 +362,15 @@ class LogFileDateSinceSeeker(object):
# with no date.
MAX_TRY_FIND_WITH_DATE_ATTEMPTS = 500

MAX_SEARCHABLE_LINE_LENGTH = MAX_SEEK_HORIZON_EXPAND * SEEK_HORIZON
LINE_FEED_TOKEN = b'\n'

def __init__(self, fd, c):
self.file = fd
self.constraint = c
self.line_info = None
self.found_any_date = False
self.lookup_times = 0
self.lookup_count = 0
with NonDestructiveFileRead(self.file) as f:
self.length = f.seek(0, 2)

Expand Down Expand Up @@ -407,7 +414,9 @@ def find_token_reverse(self, start_offset):
if (start_offset + current_offset) < 0:
return SearchState(status=FindTokenStatus.REACHED_EOF,
offset=0)
log.debug("reached max line length search without finding a line feed")

log.debug("reached max line length (%s) search without finding a line "
"feed", self.MAX_SEARCHABLE_LINE_LENGTH)
return SearchState(FindTokenStatus.FAILED)

def find_token(self, start_offset):
Expand Down Expand Up @@ -446,8 +455,10 @@ def find_token(self, start_offset):
# chunk's length.
current_offset = current_offset + len(chunk)
attempts -= 1

# Exhausted all the attempts and found nothing.
log.debug("reached max line length search without finding a line feed")
log.debug("reached max line length (%s) search without finding a line "
"feed", self.MAX_SEARCHABLE_LINE_LENGTH)
return SearchState(FindTokenStatus.FAILED)

def try_find_line(self, epicenter, slf_off=None, elf_off=None):
Expand Down Expand Up @@ -516,37 +527,36 @@ def try_find_line(self, epicenter, slf_off=None, elf_off=None):
to None.
@param elf_off: Optional ending line feed offset, if known. Defaults to
None.
@raise ValueError: when ending line feed offset could not be found or
when starting line feed offset could not be found.
@raise MaxSearchableLineLengthReached
@return: found logline
"""
log.debug(" > EPICENTER: %d", epicenter)
log.debug("searching either side of pos=%d", epicenter)

# Find the first LF token from the right of the epicenter
# e.g. \nThis is a line\n
# ^epicenter ^line end lf
line_end_lf = self.find_token(
epicenter
) if elf_off is None else SearchState(
FindTokenStatus.FOUND, elf_off)
if elf_off is None:
line_end_lf = self.find_token(epicenter)
else:
line_end_lf = SearchState(FindTokenStatus.FOUND, elf_off)

if line_end_lf.status == FindTokenStatus.FAILED:
raise ValueError("Could not find ending line feed "
f"offset at epicenter {epicenter}")
raise MaxSearchableLineLengthReached("Could not find ending line "
"feed offset at epicenter "
f"{epicenter}")

# Find the first LF token from the left of the epicenter
# e.g. \nThis is a line\n
# line start lf ^ ^epicenter
line_start_lf = self.find_token_reverse(
epicenter
) if slf_off is None else SearchState(
FindTokenStatus.FOUND, slf_off)
if slf_off is None:
line_start_lf = self.find_token_reverse(epicenter)
else:
line_start_lf = SearchState(FindTokenStatus.FOUND, slf_off)

if line_start_lf.status == FindTokenStatus.FAILED:
raise ValueError("Could not find start line feed "
f"offset at epicenter {epicenter}")
raise MaxSearchableLineLengthReached("Could not find start line "
"feed offset at epicenter "
f"{epicenter}")

# Ensure that found offsets are in file range
assert line_start_lf.offset <= len(self)
Expand Down Expand Up @@ -580,6 +590,13 @@ def try_find_line_with_date(self, start_offset, line_feed_offset=None,
@return: line if found otherwise None.
"""
if line_feed_offset is None:
direction = 'reverse'
else:
direction = 'forward'

log.debug(f"starting {direction} search")

attempts = LogFileDateSinceSeeker.MAX_TRY_FIND_WITH_DATE_ATTEMPTS
offset = start_offset
log_line = None
Expand All @@ -590,29 +607,27 @@ def try_find_line_with_date(self, start_offset, line_feed_offset=None,
(None, line_feed_offset)[not forwards]
)

log.debug(
" > TRY_FETCH, REMAINING_ATTEMPTS:%d, START_LF_OFFSET: %d, "
"END_LF_OFFSET: %d >: on line -> %s",
attempts,
log_line.start_lf.offset,
log_line.end_lf.offset,
log_line.text,
)

# If the line has a valid date, return it.
if log_line.date:
log.debug(f"finished {direction} search - date found")
return log_line

log.debug("looking further for date: attempts_remaining=%d, "
"start=%d, end=%d", attempts, log_line.start_lf.offset,
log_line.end_lf.offset)

# Set offset of the found line feed
line_feed_offset = (log_line.start_lf,
log_line.end_lf)[forwards].offset
# Set the next search starting point
offset = line_feed_offset + (-1, +1)[forwards]
if offset < 0 or offset > len(self):
log.debug(" > TRY_FETCH EXIT EOF/SOF")
log.debug("search hit eof/sof - exiting")
break

attempts -= 1

log.debug(f"finished {direction} search - no date found")
return None

def __len__(self):
Expand Down Expand Up @@ -699,40 +714,27 @@ def __getitem__(self, offset):
the comparison.
@param offset: integer lookup offset
@raise DateNotFoundInLine: When a line with a date could not
@raise LineWithDateNotFound: When a line with a date could not
be found.
@return: Date of the line at `offset`
"""

self.lookup_times += 1
log.debug("-------------------------------------------")
log.debug("-------------------------------------------")
log.debug("-------------------------------------------")
log.debug("-------------------------------------------")
log.debug("LOOKUP (#%d) AT OFFSET: %d", self.lookup_times, offset)
self.lookup_count += 1
log.debug("timestamp lookup attempt=%d, offset=%d", self.lookup_count,
offset)
result = None

# Try to search backwards first.
# First call will effectively be a regular forward search given
# that we're not passing a line feed offset to the function.
# Any subsequent search attempt will be made backwards.
log.debug("######### BACKWARDS SEARCH START #########")
result = self.try_find_line_with_date(
offset,
None,
False,
)
log.debug("######### BACKWARDS SEARCH END #########")

result = self.try_find_line_with_date(offset, None, False)
# ... then, forwards.
if not result or result.date is None:
log.debug("######### FORWARDS SEARCH START #########")
result = self.try_find_line_with_date(offset + 1, offset, True)

log.debug("######### FORWARDS SEARCH END #########")

if not result or result.date is None:
raise DateNotFoundInLine(
raise LineWithDateNotFound(
f"Date search failed at offset `{offset}`")

# This is mostly for diagnostics. If we could not find
Expand All @@ -746,24 +748,21 @@ def __getitem__(self, offset):
# lookup.
self.line_info = result

log.debug(
" > EXTRACTED_DATE: `%s` >= SINCE DATE: `%s` == %s",
result.date,
self.constraint._since_date,
(result.date >= self.constraint._since_date)
if result.date else False,
)
log.debug("found date='%s' >= '%s' == %s", result.date,
self.constraint._since_date,
(result.date is not False and
result.date >= self.constraint._since_date))
return result.date

def run(self):
# bisect_left will give us the first occurenct of the date
# that satisfies the constraint.
# Similarly, bisect_right would allow the last occurence of
# a date that satisfies the criteria.

"""
Bisect_left will give us the first occurrence of the date
that satisfies the constraint. Similarly, bisect_right would allow the
last occurrence of a date that satisfies the criteria.
"""
try:
bisect.bisect_left(self, self.constraint._since_date)
except DateNotFoundInLine as exc:
except LineWithDateNotFound as exc:
if not self.found_any_date:
raise ValidFormattedDateNotFound from exc

Expand All @@ -772,14 +771,10 @@ def run(self):
if not self.line_info:
raise ValidLinesNotFound

log.debug(
"RUN END, FOUND LINE(START:%d, END:%d, CONTENT:%s)"
" IN %d LOOKUP(S)",
self.line_info.start_offset,
self.line_info.end_offset,
self.line_info.text,
self.lookup_times
)
log.debug("run end: found line start=%d, end=%d, content=%s in %d "
"lookup(s)", self.line_info.start_offset,
self.line_info.end_offset, self.line_info.text,
self.lookup_count)

return (self.line_info.start_offset, self.line_info.end_offset)

Expand Down Expand Up @@ -931,11 +926,16 @@ def apply_to_file(self, fd, destructive=True):
log.debug("c:%s No date after found in file", self.id)
fd.seek(0, 2)
return fd.tell()
except DateNotFoundInLine as ed:
log.debug("c:%s Expanded date search failed for a line: %s",
self.id, ed)
except LineWithDateNotFound as exc:
log.warning("c:%s failed to find a line containing a date: %s",
self.id, exc)
fd.seek(0)
return fd.tell()
except MaxSearchableLineLengthReached as exc:
log.warning("c:%s exceeded allowed line length search limit "
"before finding line feed: %s", self.id, exc)
fd.seek(0)
return fd.tell()

log.debug("c:%s: finished binary seek search in file %s, offset %d",
self.id, fd.name, self._results[fd.name])
Expand All @@ -947,5 +947,4 @@ def stats(self):
return _stats

def __repr__(self):
return ("id={}, since={}, current={}".
format(self.id, self._since_date, self.current_date))
return ("id={}, since={}".format(self.id, self._since_date))

0 comments on commit d8dd0cd

Please sign in to comment.