parser.py
import urllib.request

from bs4 import BeautifulSoup

BASE_URL = 'https://www.dramteatr.ru/seasons'

def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()
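
# Some servers reject urllib's default Python user agent. If the request
# above fails, a minimal sketch of get_html that sends a browser-like
# User-Agent header (the header value is an illustrative assumption,
# not part of this script):
#
#     def get_html(url):
#         req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
#         with urllib.request.urlopen(req) as response:
#             return response.read()
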
def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    content = soup.find('div', class_='region region-content')
    rows = content.find_all('div', class_='views-field views-field-title')
    # Day numbers and month names sit in separate <span> elements;
    # read their text directly instead of regex-stripping str(dates).
    days = soup.find_all("span", class_="dt1")
    months = soup.find_all("span", class_="dt2")

    projects = []
    for row, day, month in zip(rows, days, months):
        projects.append({
            'title': row.a.text,
            'date': day.get_text(strip=True),
            'month': month.get_text(strip=True),
        })

    for project in projects:
        print(project)
def get_links(dirty_list, start, end):
    # Pull clean URLs out of the "dirty" list of raw strings.
    links = []
    for row in dirty_list:
        if row != 'None':
            i_beg = row.find(start)
            i_end = row.rfind(end)
            # Use `and`, not bitwise `&`: operator precedence made the
            # original condition evaluate as `i_beg != (-1 & i_end)`.
            if i_beg != -1 and i_end != -1:
                links.append(row[i_beg:i_end])
    return links
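
# get_links is never called in this script. A minimal usage sketch with
# hypothetical input, assuming the "dirty" rows are stringified tags:
#
#     raw = ['<a href="https://www.dramteatr.ru/node/1">...</a>', 'None']
#     get_links(raw, 'https://', '">')
#     # -> ['https://www.dramteatr.ru/node/1']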

def main():
    parse(get_html(BASE_URL))


if __name__ == '__main__':
    main()