-
Notifications
You must be signed in to change notification settings - Fork 32
/
ziroom.py
136 lines (125 loc) · 4.17 KB
/
ziroom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import re
from lxml import etree
import time
import json
import pymongo
import requests
# Name of the MongoDB database every function in this file reads and writes.
DB = "shziroom"

# Site root; page paths are appended to it when building request URLs.
base_url = "http://sh.ziroom.com"

# Request headers sent with every HTTP request (a desktop Chrome User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/68.0.3440.106 Safari/537.36"
    ),
}
def fix_url(url):
    """Turn a protocol-relative URL into an absolute ``http:`` URL.

    Args:
        url: A URL string, e.g. an ``href`` attribute value.

    Returns:
        ``url`` prefixed with ``http:`` when it starts with ``//``;
        otherwise ``url`` unchanged.
    """
    # A leading "//" means the URL carries no scheme; add one so the result
    # can be requested directly.
    if url.startswith('//'):
        return 'http:{}'.format(url)
    return url
def get_sub_districts(node):
    """Collect the sub-district links found under one district node.

    Args:
        node: An element exposing ``xpath()`` for a single district entry
            (callers in this file pass an lxml element).

    Returns:
        A list of ``{"sub_district": <link text>, "url": <url>}`` dicts, one
        per ``div.con > span > a`` link, in the order ``xpath()`` returns
        them. The "全部" ("all") link is skipped, and so are links that have
        no ``href``. Each URL has been passed through ``fix_url``.
    """
    result = []
    for sub_node in node.xpath('.//div[@class="con"]/span/a'):
        sub_district = sub_node.text
        # "全部" is the catch-all entry, not a real sub-district.
        if sub_district == "全部":
            continue
        # An anchor without href cannot be crawled; skip it instead of
        # letting a KeyError abort the whole scrape.
        href = sub_node.attrib.get("href")
        if not href:
            continue
        result.append({"sub_district": sub_district, "url": fix_url(href)})
    return result
def get_disctricts():
    """Scrape the district / sub-district index and store it in MongoDB.

    Fetches ``base_url + "/z/nl/z1.html"``, walks the district list items,
    and inserts one document per sub-district into the ``sub_districts``
    collection of the ``DB`` database::

        {"district": ..., "sub_district": ..., "url": ...}

    Documents are inserted unconditionally, so running this twice stores
    every sub-district twice.

    Returns:
        None.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = base_url + "/z/nl/z1.html"
    # The timeout keeps the script from hanging forever on a stalled
    # connection; requests has no default timeout.
    r = requests.get(url, headers=headers, verify=False, timeout=30)
    # Fail loudly on an error page rather than silently storing nothing.
    r.raise_for_status()
    root = etree.HTML(r.content.decode("utf-8"))
    distr_nodes = root.xpath('.//dl[contains(@class, "zIndex6")]/dd/ul/li')

    # The context manager closes the client's connections when we are done.
    with pymongo.MongoClient() as client:
        sub_districts = client[DB].sub_districts
        for distr_node in distr_nodes:
            nodes = distr_node.xpath('.//span[@class="tag"]/a')
            # List items without a tagged link are not districts.
            if not nodes:
                continue
            district = nodes[0].text
            for sub_distr in get_sub_districts(distr_node):
                sub_districts.insert_one({
                    "district": district,
                    "sub_district": sub_distr["sub_district"],
                    "url": sub_distr["url"],
                })
# Maps the pixel offset found in a digit span's
# ``background-position:-<offset>px`` style to the digit it stands for
# (presumably positions inside a digit image -- confirm against the site).
# NOTE(review): the keys 1, 6, 3, 7 and 5 do not follow the multiple-of-30
# pattern of the other keys; verify them against the live page.
_PRICE_DIGIT_BY_OFFSET = {
    1: 6,
    30: 5,
    6: 3,
    90: 2,
    120: 1,
    3: 4,
    7: 8,
    210: 9,
    5: 0,
    270: 7,
}
_PRICE_OFFSET_RE = re.compile(r'background-position:-(\d+)px')


def get_price(price_node):
    """Decode the price encoded in the digit spans of a price node.

    Each ``span.num`` child carries one decimal digit, identified by the
    pixel offset in its ``style`` attribute. The digits are combined in the
    order ``xpath()`` returns them, most significant first.

    Args:
        price_node: An element exposing ``xpath()`` whose ``span.num``
            children expose ``attrib`` (an lxml element in this file).

    Returns:
        The price as an int; 0 when there are no digit spans.

    Raises:
        ValueError: If a digit span has no recognisable
            ``background-position`` style, or its offset is not in the
            offset-to-digit table.
    """
    price = 0
    for num_node in price_node.xpath('./span[@class="num"]'):
        matched = _PRICE_OFFSET_RE.match(num_node.attrib.get("style", ""))
        if not matched:
            raise ValueError("error getting price")
        # The regex group is a str but the table is keyed by int; looking up
        # the raw string would always raise KeyError.
        offset = int(matched.group(1))
        digit = _PRICE_DIGIT_BY_OFFSET.get(offset)
        if digit is None:
            raise ValueError(
                "error getting price: unknown offset {}".format(offset))
        price = price * 10 + digit
    return price


def _parse_house(house_node):
    """Extract one listing's fields from its list item, or None to skip it."""
    title_nodes = house_node.xpath('.//div[@class="txt"]/h3/a')
    price_nodes = house_node.xpath(
        './/div[@class="priceDetail"]/p[@class="price"]')
    # A listing needs both a title link and a price node.
    if not title_nodes or not price_nodes:
        return None

    area = 0
    room_type = ""
    for node in house_node.xpath('.//div[@class="detail"]/p/span'):
        # .text is None for a span that has no leading text.
        text = node.text or ""
        matched = re.search(r'(\d+) ㎡', text)
        if matched:
            area = int(matched.group(1))
        elif '室' in text:
            room_type = text

    return {
        "title": title_nodes[0].text,
        "area": area,
        "room_type": room_type,
        "price": get_price(price_nodes[0]),
    }


def get_houses_by_sub_district(sub_distr_id, entry_url):
    """Crawl every result page of one sub-district and parse its listings.

    Requests ``entry_url + "?p=<n>"`` for n = 1, 2, ... and stops at the
    first page that contains no ``ul#houseList > li.clearfix`` items. Each
    parsed price is printed as it is found. Nothing is written to the
    database here; the parsed listings are returned to the caller.

    NOTE(review): if the site answers an out-of-range page number with a
    non-empty listing, this loop does not terminate -- confirm how the site
    behaves past the last page.

    Args:
        sub_distr_id: Identifier of the sub-district; copied into every
            returned listing under ``"sub_district_id"``.
        entry_url: URL of the sub-district's first listing page.

    Returns:
        A list of dicts with the keys ``title``, ``area`` (int, 0 when not
        found), ``room_type`` (``""`` when not found), ``price`` (int) and
        ``sub_district_id``.

    Raises:
        requests.RequestException: If a request fails or times out.
        ValueError: If a listing's price cannot be decoded.
    """
    houses = []
    page = 1
    while True:
        # Built with two arguments so braces inside entry_url cannot be
        # mistaken for format fields.
        url = "{}?p={}".format(entry_url, page)
        r = requests.get(url, headers=headers, verify=False, timeout=30)
        root = etree.HTML(r.content.decode("utf-8"))
        house_nodes = root.xpath(
            './/ul[@id="houseList"]/li[@class="clearfix"]')
        if not house_nodes:
            break
        for house_node in house_nodes:
            house = _parse_house(house_node)
            if house is None:
                continue
            house["sub_district_id"] = sub_distr_id
            print(house["price"])
            houses.append(house)
        page += 1
    return houses
def get_all_houses():
    """Crawl the house listings of every stored sub-district.

    Reads all documents from the ``sub_districts`` collection of the ``DB``
    database. For each one it prints the district and sub-district name,
    then calls ``get_houses_by_sub_district`` with the document's ``_id``
    and ``url``.

    Returns:
        None.
    """
    # Load all rows up front and release the connection: the crawl below
    # makes many HTTP requests, and there is no reason to keep a database
    # cursor open while it runs.
    with pymongo.MongoClient() as client:
        sub_distr_rows = list(client[DB].sub_districts.find())

    # Every sub-district is processed; the loop no longer stops after the
    # first row.
    for sub_distr in sub_distr_rows:
        print(sub_distr["district"], sub_distr["sub_district"])
        get_houses_by_sub_district(sub_distr["_id"], sub_distr["url"])
# Script entry point. NOTE: get_all_houses() reads the ``sub_districts``
# collection, which get_disctricts() populates; get_disctricts() is not
# invoked here, so it must have been run beforehand.
if __name__ == "__main__":
    get_all_houses()