forked from teamdandelion/RoboBuffett
-
Notifications
You must be signed in to change notification settings - Fork 0
/
yahoo_parser.py
121 lines (94 loc) · 3.06 KB
/
yahoo_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
#
from BeautifulSoup import BeautifulSoup as bs
from collections import defaultdict
from multiprocessing import Pool
import urllib2
try: import cPickle as pickle
except: import pickle
def main(download_list=0, validate_list=0, compile_list=0):
    """Build the stock ticker list in up to three optional stages.

    With every flag off (the default) the function only loads the raw
    stock list pickle and returns.

    Args:
        download_list: if truthy, scrape the Yahoo screener pages and pickle
            the ticker -> company-name mapping to 'raw_stocks_list.dat'.
        validate_list: if truthy, truncate 'raw_name_validation.txt' and run
            ``validator`` on every ticker of the raw list (16 processes).
        compile_list: if truthy, run ``validator`` on the tickers missing
            from 'raw_name_validation.txt' (8 processes), then pickle the
            accepted tickers to 'stocks_list.dat' and print a summary.

    Raises:
        KeyError: in the compile stage, if the validation record names a
            ticker that is not in the raw stock list.
        Errors from opening/unpickling the data files and from the
        download/validation workers propagate unchanged.
    """
    fulllistdat = 'raw_stocks_list.dat'
    # validator() appends to this file and the compile stage reads it back,
    # so it is also the file the validate stage has to truncate.
    validation_record = 'raw_name_validation.txt'

    # --- Stage 1: download the raw ticker list ---------------------------
    if download_list:
        _download_stock_list(fulllistdat)

    # Load pickled stocks data.  saver() writes it with 'wb', so it must be
    # read back in binary mode as well.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(fulllistdat, 'rb') as raw_file:
        d = pickle.load(raw_file)

    # --- Stage 2: validate every ticker ----------------------------------
    if validate_list:
        # Start from an empty validation record.
        open(validation_record, 'w').close()
        _validate_in_pool(list(d), 16)

    # --- Stage 3: compile the final list ---------------------------------
    if compile_list:
        collector = _read_validation_record(validation_record)
        notdone = list(set(d).difference(collector))
        if notdone:
            _validate_in_pool(notdone, 8)
            # The workers appended their results to the record file; read
            # it again so those tickers are part of the compiled list.
            collector = _read_validation_record(validation_record)

        final_list, counts = _compile_final_list(d, collector)
        with open('stocks_list.dat', 'wb') as final_file:
            pickle.dump(final_list, final_file)

        # Single-argument print(...) is valid as a Python 2 statement and
        # as a Python 3 function call.
        print('Total list: %s' % len(final_list))
        print('Nothing: %s | Fail: %s | Pass: %s | Replace: %s' % counts)


def _download_stock_list(fulllistdat):
    """Scrape ticker/name pairs from the Yahoo screener and pickle them."""
    base_url = 'http://screener.finance.yahoo.com/b?pr=0/&s=tk&vw=1&db=stocks&b='
    maximum = 23686  # hard-coded upper bound for the screener's `b=` offset
    collector = {}
    fcnt = 1
    # The offset advances by 20 per request; rows 1..20 of the result table
    # are read from each page.
    for cnt in range(1, maximum, 20):
        soup = bs(urllib2.urlopen(base_url + str(cnt)))
        table = soup.findAll("table")[1].contents[1].contents[1].contents[1]
        for n in range(1, 21):
            try:
                row = table.contents[n]
                ticker = str(row.find('a').string).replace(';', '')
                # Decode the HTML-escaped ampersand in company names.
                name = str(row.findAll('font')[1].string).replace('&amp;', '&')
            except Exception:
                # Row missing or unparsable: save what has been collected so
                # far and carry on with the next row.  `Exception` (not a
                # bare except) lets Ctrl-C / SystemExit stop the download.
                saver(collector, fulllistdat)
                continue
            collector[ticker] = name
            print('%s of %s \t%s \t%s' % (fcnt, maximum, ticker, name))
            fcnt += 1
    saver(collector, fulllistdat)


def _validate_in_pool(tickers, processes):
    """Run validator() over `tickers` in a worker pool, then shut it down."""
    if not tickers:
        return
    pool = Pool(processes=processes)
    try:
        pool.map(validator, tickers)
    finally:
        # Release the worker processes even if a worker raised.
        pool.close()
        pool.join()


def _read_validation_record(path):
    """Parse 'ticker<TAB>outcome' lines into a dict; later lines win."""
    record = {}
    with open(path, 'r') as record_file:
        for line in record_file.read().split('\n'):
            fields = line.split('\t')
            if len(fields) < 2:
                # Blank line (e.g. after the final newline) or malformed.
                continue
            record[fields[0]] = fields[1]
    return record


def _compile_final_list(stocks, record):
    """Return (final ticker -> name dict, (nothing, fail, pass, replace))."""
    final = {}
    nothing = failed = passed = replaced = 0
    for ticker, outcome in record.items():
        if outcome == '':
            nothing += 1
        elif outcome == 'FAIL':
            failed += 1
        elif outcome == 'PASS':
            final[ticker] = stocks[ticker]
            passed += 1
        else:
            # Any other outcome is the symbol to file the company under.
            final[outcome] = stocks[ticker]
            replaced += 1
    return final, (nothing, failed, passed, replaced)
def validator(ticker):
    """Check one ticker against its Yahoo Finance quote page and record it.

    The result is appended to 'raw_name_validation.txt' as
    ``ticker<TAB>outcome`` and echoed to stdout.  Possible outcomes:

    * ``''``     -- none of the page patterns below matched
    * ``'FAIL'`` -- the page says there are no results for the ticker
    * ``'PASS'`` -- the symbol parsed from the page heading equals ``ticker``
    * otherwise  -- a different symbol taken from the page

    Args:
        ticker: ticker symbol string; '&' is percent-encoded for the URL.

    Returns:
        None.

    Raises:
        Errors from fetching the page or opening the record file propagate.
    """
    soup = bs(urllib2.urlopen('http://finance.yahoo.com/q?s=' + ticker.replace('&','%26') ))
    outcome = ''

    # The three probes run in order and a later match overrides an earlier
    # one.  A probe whose expected element is absent raises and is skipped.
    # `except Exception` (instead of a bare except) keeps that best-effort
    # behaviour while still letting KeyboardInterrupt / SystemExit through.

    # Probe 1: "changed ticker symbol" notice; take the value from the
    # second <p> element.
    try:
        if str(soup.find('h3').contents[0]) == 'Changed Ticker Symbol':
            outcome = str(soup.findAll('p')[1].contents[1].contents[0])
    except Exception:
        pass

    # Probe 2: "no results" notice.
    try:
        if str(soup.findAll('h2')[2].contents[0]) == 'There are no All Markets results for':
            outcome = 'FAIL'
    except Exception:
        pass

    # Probe 3: symbol in the fourth <h2>: the text after the last '(' with
    # the final character dropped (presumably "Name (SYMBOL)").
    try:
        tname = str(soup.findAll('h2')[3].contents[0]).split('(')[-1][:-1]
        if ticker == tname:
            outcome = 'PASS'
        elif ticker in tname:
            # Heading symbol contains the ticker but is longer.
            outcome = tname
    except Exception:
        pass

    with open('raw_name_validation.txt', 'a') as record_file:
        record_file.write(ticker + '\t' + outcome + '\n')
    # Single-argument print(...) works in both Python 2 and Python 3.
    print('%s \t%s' % (ticker, outcome))
def saver(collector, fname):
    """Pickle a plain-dict copy of `collector` into the file `fname`.

    Args:
        collector: mapping (or anything ``dict()`` accepts) to store.
        fname: path of the output file; it is overwritten.
    """
    with open(fname, 'wb') as sink:
        # Convert first so dict subclasses are stored as an ordinary dict.
        snapshot = dict(collector)
        pickle.dump(snapshot, sink)
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()