-
Notifications
You must be signed in to change notification settings - Fork 5
/
Web Scrapping Hurriyet Emlak_v3.py
125 lines (78 loc) · 2.81 KB
/
Web Scrapping Hurriyet Emlak_v3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# coding: utf-8
# In this project the scraped results are deliberately kept as BeautifulSoup tag objects rather than converted to strings.
# If you would rather extract the features from plain strings, convert each result with str(), for example:
# for i in range(0,len(results)):
# results[i] = str(results[i])
# str_res
# In[65]:
import pandas as pd
import csv
import requests
from bs4 import BeautifulSoup
# Scrape the villa-for-sale catalogue pages of hurriyetemlak.com and store one
# row per listing in "<last_page>hurriyet.txt" (CSV format despite the suffix).

# Inclusive page range to download.  The original loop bounds were left blank
# (`range()` raises TypeError), so these values are an assumption: last_page is
# 520 because the clean-up step further down reads "520hurriyet.txt".
# NOTE(review): confirm first_page; adjust both to scrape a different slice.
first_page = 1
last_page = 520

list_id = []
price = []
date = []
area = []
owner = []
room = []
seller = []
adres = []
title = []

# Output column -> (accumulator list, attribute read from the listing's <a> tag).
# The insertion order here is the column order of the resulting DataFrame.
columns = {
    "list_id": (list_id, 'data-listing-id'),
    "title": (title, 'title'),
    "price": (price, 'data-price'),
    "date": (date, 'data-date'),
    "area-m2": (area, 'data-meter'),
    "owner": (owner, 'data-owner'),
    "room": (room, 'data-room'),
    "seller": (seller, 'data-seller-type'),
    "adres": (adres, 'href'),
}

for j in range(first_page, last_page + 1):
    # A timeout keeps one stalled connection from hanging the whole run.
    r = requests.get(
        "https://www.hurriyetemlak.com/konut-satilik/villa/listeleme?pageSize=50&view=catalog&page={}".format(j),
        timeout=30,
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all("a", attrs={'class': 'overlay-link'})
    for tag in results:
        # tag.get() yields None for a missing attribute, so every list grows by
        # exactly one entry per listing and the columns stay aligned.
        for values, attribute in columns.values():
            values.append(tag.get(attribute))

records = {name: values for name, (values, _) in columns.items()}
df = pd.DataFrame(records)
# Name the file after the configured last page rather than the leaked loop
# variable, which would be undefined for an empty range.
df.to_csv("{}hurriyet.txt".format(last_page))
#df.tail(20)
# In[16]:
#df.isnull().sum()
# In[17]:
import pandas as pd
# Clean the scraped listing links and derive location columns from them, then
# write the result to "520hurriyet_inorder.csv".
df = pd.read_csv("520hurriyet.txt")

# The link column is addressed by name, not by position: the CSV starts with
# the saved index ("Unnamed: 0") followed by the columns in `records` order,
# so position 1 is only "adres" when the dict keys happen to be sorted.
for i in df.index:
    href = df.loc[i, "adres"]

    if not isinstance(href, str):
        # Missing link (NaN after the CSV round-trip): nothing to parse.
        continue

    if href.startswith("https"):
        # Absolute URLs do not follow the relative listing-path pattern that
        # the parsing below relies on.
        df.loc[i, "adres"] = "New project without adres info"
        continue

    # Strip the fixed path prefix and seller-type suffix, flatten the remaining
    # "/" separators to "-", then cut the trailing 9 characters.
    # NOTE(review): the 9 characters are presumably "-" plus an 8-digit listing
    # id -- confirm against real hrefs.
    cleaned = (
        href.replace("/konut-satilik/", "")
        .replace("-emlakcidan-villa/detay", "")
        .replace("-sahibinden-villa/detay", "")
        .replace("/", "-")
    )[0:-9]
    df.loc[i, "adres"] = cleaned

    # Fill as many of city / district / neighbourhood as the link provides;
    # zip stops at the shorter sequence, so short links leave the rest unset
    # instead of raising IndexError.
    for column, value in zip(("şehir", "ilçe", "mahalle"), cleaned.split("-")):
        df.loc[i, column] = value

df = df.dropna(subset=['list_id'])
df = df.drop(["Unnamed: 0"], axis=1)
# reset_index returns a new frame; assign it so the renumbering takes effect
# (drop=True keeps the old index out of the data, so the CSV is unchanged).
df = df.reset_index(drop=True)
df.to_csv("520hurriyet_inorder.csv", index=False)
# In[42]:
#df.isnull().sum()
# In[11]:
#df.to_csv("50hurriyet_inorder.csv", index=False)
#
# #import pandas as pd
# df1=pd.read_csv("50hurriyet_inorder.csv")
# df2=pd.read_csv("100hurriyet_inorder.csv")
# df3=pd.read_csv("150hurriyet_inorder.csv")
# df4=pd.read_csv("250hurriyet_inorder.csv")
# df5=pd.read_csv("350hurriyet_inorder.csv")
# df6=pd.read_csv("450hurriyet_inorder.csv")
# df7=pd.read_csv("520hurriyet_inorder.csv")
# In[1]:
#df2.sort_values(by=["şehir"])