-
Notifications
You must be signed in to change notification settings - Fork 0
/
symbol_downloader.py
158 lines (122 loc) · 6.37 KB
/
symbol_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import concurrent.futures
import itertools
import csv
import requests
from constraint import Problem
from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn
import time
from argparse import ArgumentParser
def grouper(n, iterable, fillvalue=None):
    """Collect the items of `iterable` into consecutive tuples of length `n`.

    The last tuple is padded with `fillvalue` when the input runs out:
    grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx

    Returns a lazy `itertools.zip_longest` iterator, not a list.
    """
    shared_iterator = iter(iterable)
    # Every positional argument is the *same* iterator object, so each output
    # tuple is filled by pulling n consecutive items from it.
    return itertools.zip_longest(
        *itertools.repeat(shared_iterator, n),
        fillvalue=fillvalue,
    )
def get_symbols(url):
    """Request `url` and return the documents of its first lookup result.

    The request is a GET with a 3 second timeout, redirects disabled and a
    desktop-browser user-agent header. The response body is parsed as JSON and
    the value at ['finance']['result'][0]['documents'] is returned.

    Any exception raised by the request, the JSON parsing or the lookups
    (e.g. a missing key, or a TypeError when an intermediate value is not
    subscriptable) propagates to the caller.
    """
    browser_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
    }
    response = requests.get(
        url,
        headers=browser_headers,
        allow_redirects=False,
        timeout=3,
    )
    payload = response.json()
    first_result = payload['finance']['result'][0]
    return first_result['documents']
# Column order of the output CSV; these are the keys merge_symbol() stores.
CSV_FIELDS = ['symbol', 'shortName', 'exchange', 'type', 'rank']


def build_lookup_urls(length, alphabet, symbol_types):
    """Return one lookup url per (combination, symbol type) pair.

    Every string of `length` characters drawn from `alphabet` is generated
    with a `constraint.Problem` (one variable per position, no constraints)
    and combined with every entry of `symbol_types`.
    """
    prob = Problem()
    prob.addVariables(range(1, length + 1), alphabet)

    urls = []
    for solution in prob.getSolutionIter():
        combination = ''.join(solution.values())
        for symbol_type in symbol_types:
            urls.append(
                "https://query1.finance.yahoo.com/v1/finance/lookup?"
                "formatted=true&"
                "lang=en-US&"
                "region=US&"
                f"query={combination}&"
                f"type={symbol_type}&"
                "count=10000&"
                "start=0&"
                "corsDomain=finance.yahoo.com"
            )
    return urls


def merge_symbol(results, r):
    """Merge one lookup document `r` into `results`, keyed by its symbol.

    A missing 'shortName' never overwrites a name stored earlier; it falls
    back to the symbol itself only when no name is known yet. A missing
    'rank' is stored as -1. A missing 'symbol', 'exchange' or 'quoteType'
    raises KeyError.
    """
    symbol = r['symbol']
    entry = results.setdefault(symbol, {})
    entry['symbol'] = symbol
    if 'shortName' in r:
        entry['shortName'] = r['shortName']
    else:
        entry.setdefault('shortName', symbol)
    entry['exchange'] = r['exchange']
    entry['type'] = r['quoteType']
    entry['rank'] = r.get('rank', -1)


def write_symbols_csv(path, results):
    """Write the collected symbols to `path` as CSV with a header row.

    The header is the fixed CSV_FIELDS list rather than the keys of the first
    record, so an empty `results` produces a header-only file instead of an
    IndexError, and an incomplete first record cannot truncate the columns.
    """
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, CSV_FIELDS, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(results.values())


def main():
    """Download symbols for every query combination and save them as CSV.

    For each combination length from 1 to --clength, every lookup url is
    requested in batches of --batchsize on a thread pool. Urls whose request
    raised are collected and retried until none are left; urls that raise a
    TypeError are reported and not retried.
    """
    alphabet = list('abcdefghijklmnopqrstuvwxyz0123456789.=')
    # if the alphabet is not reversed "==" will be first and "aa" last
    alphabet.reverse()

    parser = ArgumentParser(description='Collects almost all symbols from Yahoo Finance.')
    parser.add_argument("-b", "--batchsize", dest="batchsize", default=400, type=int,
                        help="Number of urls in one batch")
    parser.add_argument("-l", "--clength", dest="clength", default=2, type=int,
                        help="The maximum length of combinations to search for")
    parser.add_argument("-t", "--types", dest="types", default="equity,mutualfund,etf,index,future,currency,cryptocurrency",
                        help="The types of symbols to download (equity,mutualfund,etf,index,future,currency,cryptocurrency)")
    parser.add_argument("-o", "--outfile", dest="outfile", default="symbols.csv",
                        help="The path of the output file")
    args = parser.parse_args()

    symbols_types = args.types.split(",")

    # final results dict, keyed by symbol
    results = {}

    # The with-block shuts the pool down (joining its threads) even when the
    # run is aborted by an exception.
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.batchsize) as executor:
        for length in range(1, args.clength + 1):
            urls_to_complete = build_lookup_urls(length, alphabet, symbols_types)
            errors = []

            print(f"Requesting data from {len(urls_to_complete)} urls")

            with Progress(
                TimeElapsedColumn(),
                BarColumn(),
                "{task.percentage:>3.0f}%",
                TimeRemainingColumn(),
                " {task.completed} / {task.total}"
            ) as progress:
                task = progress.add_task("[red]Getting Symbols...", total=len(urls_to_complete))

                # keep going until a full pass leaves no url to retry
                while urls_to_complete:
                    for url_batch in grouper(args.batchsize, urls_to_complete):
                        time1 = time.time()
                        # grouper pads the last batch with None; skip the padding
                        future_to_url = {executor.submit(get_symbols, url): url
                                         for url in url_batch if url is not None}

                        for future in concurrent.futures.as_completed(future_to_url):
                            done_url = future_to_url[future]
                            # a retried url leaves the error list here and is
                            # put back below if it fails again
                            if done_url in errors:
                                errors.remove(done_url)
                            try:
                                for r in future.result():
                                    merge_symbol(results, r)
                                progress.update(task, advance=1)
                            except TypeError:
                                print(f"Internal error for: {done_url}")
                                # this url is not retried, so it counts as
                                # finished; otherwise the bar never completes
                                progress.update(task, advance=1)
                            except Exception:
                                # deliberate catch-all: any other failure
                                # (timeout, bad JSON, missing key) is retried
                                if done_url not in errors:
                                    errors.append(done_url)

                        time2 = time.time()
                        print("\nBatch completed: ")
                        if len(future_to_url) > 0:
                            print(
                                f"\tTook {round(time2 - time1, 3)}s ({round((time2 - time1) / (len(future_to_url)), 3)}s per url)")
                        print(f"\tTotal Errors: {len(errors)}")
                        print(f"\tTotal symbols: {len(results)}")

                    # Retry a snapshot of the failures; `errors` itself is
                    # mutated while the next pass runs.
                    urls_to_complete = list(errors)

    write_symbols_csv(args.outfile, results)


if __name__ == '__main__':
    main()