This repository has been archived by the owner on Oct 20, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
zip_results.py
65 lines (56 loc) · 2.81 KB
/
zip_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import argparse
import re
from openpyxl import load_workbook
from utils import open_xls_as_xlsx
def zip_results(filename: str, score_type: str, column_idx: str):
    """Zip the scores stored in ``results/<name>.csv`` back into a spreadsheet.

    The result file is read sequentially, one line per valid spreadsheet row,
    where each line is ``domain,segment,score``. For every row whose cell in
    ``column_idx`` looks like a domain/mail, the next result line is consumed
    and its segment and score are written into two new columns appended to
    the right of the sheet. Rows with an empty or invalid cell are skipped
    without consuming a result line.

    Args:
        filename: Path of the ``.xlsx`` or ``.xls`` workbook to enrich.
        score_type: Accepted so the CLI arguments can be forwarded as-is;
            not used by this function.
        column_idx: Letter(s) of the column holding the domain/mail
            (e.g. ``"BQ"``).

    Raises:
        SystemExit: With code 1 when the file extension is unsupported or
            when a result line does not match the spreadsheet row it is
            being zipped with.
    """
    # Raw string: '\w' and '\-' are regex escapes, not Python string escapes.
    mail_pattern = re.compile(r'(?:@)?(?P<tld>[\w\-]+\.\w+)')
    print("Welcome to the results zipper! Wait for the xlsx to load.")

    # Only the trailing extension is replaced, so a ".xls" appearing earlier
    # in the path (e.g. in a directory name) is left untouched.
    if filename.endswith(".xlsx"):
        extension = ".xlsx"
        stem = filename[:-len(extension)]
        workbook = load_workbook(filename=filename, keep_vba=False)
    elif filename.endswith(".xls"):
        extension = ".xls"
        stem = filename[:-len(extension)]
        workbook = open_xls_as_xlsx(filename)
    else:
        print("Unsupported file format!")
        # SystemExit is what exit() raises, without depending on the
        # site-provided builtin being available.
        raise SystemExit(1)
    result_filename = stem + ".csv"

    sheet = workbook.active
    print("File loaded. results/{} will be zipped to it.".format(result_filename))
    with open("results/" + result_filename, "rb") as result:
        # Capture the sheet dimensions BEFORE touching any cell: accessing
        # cells below may create them and change max_row / max_column.
        last_row = sheet.max_row
        columns = sheet.max_column

        # Row 1 is skipped as the header; leading rows with an empty first
        # column are skipped as well.
        first_row = 2
        for row in sheet['A2:B256']:
            if row[0].value:
                break
            first_row += 1

        # Result line waiting to be matched with a valid row. It is kept
        # across iterations while invalid rows are being skipped.
        pending = None
        # max_row is inclusive, hence the "+ 1" so the last row is zipped too.
        for line in range(first_row, last_row + 1):
            if line % 100 == 0:
                print("Zipped {0:.3f}%".format(line / (last_row * 1.) * 100.))
            if pending is None:
                pending = result.readline()
            if not pending:
                # xlsx had some invalid lines, some were skipped, resulting
                # in a discrepancy between the line counts of the two files
                break
            mail = sheet['{}{}'.format(column_idx, line)].value
            if not mail or not mail_pattern.search(mail):
                continue  # skip invalid entries, like in bulk_score
            domain, segment, score = pending.decode("utf-8").split(",")
            if domain not in mail:
                print("ERROR! Zipping on weird data (trying to zip {} with {})".format(mail, domain))
                raise SystemExit(1)
            sheet.cell(line, columns + 1).value = segment
            sheet.cell(line, columns + 2).value = int(score)
            pending = None  # consumed; fetch the next result line

    # NOTE(review): a workbook loaded from ".xls" is saved under a ".xls"
    # name, as before; confirm the written format matches that extension.
    filename_with_results = stem + "_with-results" + extension
    print("Now saving to {}, this might take several minutes...".format(filename_with_results))
    workbook.save(filename_with_results)
    print("You're good to go!")
if __name__ == "__main__":
    # Command-line entry point: every parsed option is forwarded to
    # zip_results() as a keyword argument, so option names must match its
    # parameter names.
    parser = argparse.ArgumentParser(
        description='Zips the scores from results/<name>.csv back into the scored spreadsheet.')
    parser.add_argument("--filename", help="xlsx/xls file containing the persons that were scored", required=True)
    parser.add_argument("--score_type", help="which score to use: either by domain or by personal email", required=True, choices=['domain', 'mail'])
    parser.add_argument("--column_idx", help="domain/mail column idx (i.e: BQ)", required=True)
    zip_results(**vars(parser.parse_args()))