# testScrape.py
import re

import requests
from bs4 import BeautifulSoup
# DDBST pages used for testing
page_main = 'http://www.ddbst.com/free-data.html'          # overview of all free data sets
page_1 = 'http://www.ddbst.com/en/EED/PCP/CMPS_C3.php'     # single pure-component page
ex_DB = 'http://www.ddbst.com/en/EED/PCP/PCPindex.php'     # pure-component property index
base = 'http://www.ddbst.com/'
excessH_DB = 'http://www.ddbst.com/en/EED/HE/HEindex.php'  # excess-enthalpy index
DB_multi_set = 'http://www.ddbst.com/en/EED/VLE/VLE%20Acetonitrile%3BAcetic%20acid.php'  # page with multiple data sets

def DDBST_table(DDBST_url):
    '''Scrapes every data table and "Data Set" header from a DDBST page into
    one nested list; header rows are flagged with a leading $$$.'''
    page = requests.get(DDBST_url).text
    soup = BeautifulSoup(page, 'html.parser')
    tab = []
    for rel_item in soup.find_all(['table', 'h3']):
        if re.search('Data Set', rel_item.get_text()):
            tab.append(['$$$' + rel_item.get_text()])
        elif rel_item.find_all('a'):  # skip tables that only hold links/pictures
            continue
        else:
            for line in rel_item.find_all('tr'):
                fetchline = []
                for elem in line.find_all(['th', 'td']):
                    # cell text without leading/trailing tabs and spaces;
                    # get_text() already decodes unusual letters (e.g. German umlauts)
                    item = elem.get_text().strip('\t ')
                    try:
                        if item != '\xa0':  # a non-breaking space marks an empty cell
                            fetchline.append(item)
                        else:
                            fetchline.append('<empty>')
                    except Exception:
                        fetchline.append('PARSING_ERROR_HERE')
                tab.append(fetchline)
    return tab

def DDBST_DB_free(prnt):
    '''Scrapes the overview table of free DDBST data sets from page_main.
    Each data row starts with the list of hyperlinks it contains, followed
    by the cell texts. Prints the result when prnt is truthy.'''
    page = requests.get(page_main).text
    soup = BeautifulSoup(page, 'html.parser')
    tab = soup.find_all('table')[0]
    # header row
    res = ['url']
    for fline in tab.find_all('tr')[0].find_all('strong'):
        res.append(fline.get_text())
    res = [res]
    # data rows: hyperlinks first, then the cell texts
    for line in tab.find_all('tr')[1:]:
        li = []
        for hlink in line.find_all('a'):
            li.append(hlink.get('href'))
        qlist = [li]
        for elem in line.find_all(['td', 'th']):  # anchor text is already part of its cell
            qlist.append(str(elem.get_text()))
        res.append(qlist)
    if prnt:
        for hline in res:
            print(hline)
    return res

def All_links(url):
    '''Fetches the href target of every anchor tag on the page.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html.parser')
    h = []
    for link in soup.find_all('a'):
        h.append(link.get('href'))
    return h

def DB(DDBST_url):
    '''Fetches the [url, name] pairs of every database linked from a main
    database index page (pure component properties, excess enthalpy, etc.).'''
    page = requests.get(DDBST_url).text
    soup = BeautifulSoup(page, 'html.parser')
    # the base url is everything before the last '/' of the index page itself
    base_url = DDBST_url.rsplit('/', 1)[0]
    ret_elem = []
    for link in soup.find_all('p')[3:-1]:  # skip the surrounding navigation paragraphs
        header = str(link.find('a').get('href'))
        name = str(link.find('a').getText())
        ret_elem.append([base_url + '/' + header, name])
    return ret_elem
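
# A minimal usage sketch, not part of the original script: it exercises each
# helper against the DDBST pages defined above. Live network access is
# assumed, and the output slicing below is illustrative only.
if __name__ == '__main__':
    # list every free data set advertised on the main page
    free_sets = DDBST_DB_free(prnt=False)
    print('free data sets:', len(free_sets) - 1)

    # enumerate the database pages behind the pure-component index
    for url, name in DB(ex_DB)[:5]:
        print(name, '->', url)

    # pull the raw tables from a single page holding several data sets
    for row in DDBST_table(DB_multi_set)[:10]:
        print(row)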