Skip to content

Commit

Permalink
print as CSV when DRYRUN; add total count during range
Browse files Browse the repository at this point in the history
  • Loading branch information
elrayle committed Jul 24, 2024
1 parent 291234c commit 392f8f6
Showing 1 changed file with 37 additions and 16 deletions.
53 changes: 37 additions & 16 deletions tools/analyze_data_synchronization/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,32 +242,47 @@ def create_months(start_month, end_month):
months.append(f"{year}-{str(month).zfill(2)}")
return months

def page_count_and_setup(collection, query, range_label, invalid_data):
def page_count_and_setup(collection, all_query, missing_query, range_label, invalid_data):
"""Get the count of pages and set up the stats for the range"""
all_docs_count = collection.count_documents(query,
max_time_ms=10000000)
initialize_stats(range_label, all_docs_count, invalid_data)
if all_docs_count == 0:
print(f"No documents found with missing licenses in {range_label}.")
all_docs_count = collection.count_documents(
all_query,
max_time_ms=10000000
)

docs_with_missing_count = collection.count_documents(
missing_query,
max_time_ms=10000000
)
initialize_stats(range_label, docs_with_missing_count, invalid_data)
if docs_with_missing_count == 0:
if DRYRUN:
print(f"{range_label}, {all_docs_count}, 0%, 0, 0, 0")
else:
print(f"No documents found with missing licenses out of {all_docs_count} total in {range_label}.")
return 0

if INITIAL_SKIP > 0:
print(f"Skipping {INITIAL_SKIP} documents")
all_docs_count -= INITIAL_SKIP
docs_with_missing_count -= INITIAL_SKIP

page_count = all_docs_count // PAGE_SIZE
if all_docs_count % PAGE_SIZE:
page_count = docs_with_missing_count // PAGE_SIZE
if docs_with_missing_count % PAGE_SIZE:
page_count += 1

est_hours_to_complete = round(page_count * 2.5 / 60)
est_completion_time = datetime.now() + timedelta(hours=est_hours_to_complete)
print(f"Found {all_docs_count} documents missing licenses in {range_label}. Estimated time to complete is {est_hours_to_complete} hours ending at {est_completion_time}.")

if DRYRUN:
print(f"{range_label}, {all_docs_count}, {docs_with_missing_count}, {round(docs_with_missing_count/all_docs_count, 4)*100}%, {est_hours_to_complete}, {round(est_hours_to_complete / 24, 2)}")
else:
print(f"Found {docs_with_missing_count} documents missing licenses out of {all_docs_count} total in {range_label}. Estimated time to complete is {est_hours_to_complete} hours ending at {est_completion_time}.")

return page_count

def analyze_docs(collection, query, range_label, invalid_data, one_pass=False):
"""Analyze the documents in the collection for the given query"""
page_count = page_count_and_setup(collection, query, range_label, invalid_data)
missing_query = {**query, "licensed.declared": {"$exists": False}}
page_count = page_count_and_setup(collection, query, missing_query, range_label, invalid_data)
if page_count == 0 or DRYRUN:
return

Expand All @@ -279,7 +294,7 @@ def analyze_docs(collection, query, range_label, invalid_data, one_pass=False):
skip = INITIAL_SKIP
while True:
print(f"Processing page {page+1} of {page_count} in {range_label} starting at offset {skip} - {datetime.now()}")
docs = collection.find(query).skip(skip).limit(PAGE_SIZE).max_time_ms(10000000)
docs = collection.find(missing_query).skip(skip).limit(PAGE_SIZE).max_time_ms(10000000)
new_docs_count, new_invalid_count = analyze_page_of_docs(docs, running_count_docs, running_count_invalid, range_label, invalid_data)
running_count_invalid += new_invalid_count
running_count_docs += new_docs_count
Expand Down Expand Up @@ -395,13 +410,15 @@ def analyze_page_of_docs(docs, running_count_docs, running_count_invalid, range_
print("Processing custom date range")
print(f" START_DATE: {START_DATE}")
print(f" END_DATE: {END_DATE}")


if DRYRUN:
print("Range, # all docs, # missing, % missing, est hours to complete, est days to complete")

label = custom_range_label() if not DRYRUN else f"{custom_range_label()}_dryrun"
analyze_docs(
collection,
{
"_meta.updated": {"$gte": START_DATE, "$lte": END_DATE},
"licensed.declared": {"$exists": False},
},
label,
invalid_data,
Expand All @@ -415,15 +432,19 @@ def analyze_page_of_docs(docs, running_count_docs, running_count_invalid, range_
months = create_months(START_MONTH, END_MONTH)
print(f" {months}")

if DRYRUN:
print("Range, # all docs, # missing, % missing, est hours to complete, est days to complete")

for month in months:
print(f"Processing {month}")
if not DRYRUN:
print(f"Processing {month}")

label = month if not DRYRUN else f"{month}_dryrun"

analyze_docs(
collection,
{
"_meta.updated": {"$gte": f"{month}-01", "$lte": f"{month}-31"},
"licensed.declared": {"$exists": False},
},
label,
invalid_data,
Expand Down

0 comments on commit 392f8f6

Please sign in to comment.