-
Notifications
You must be signed in to change notification settings - Fork 0
/
bgg_scraper.py
120 lines (102 loc) · 4.68 KB
/
bgg_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import csv
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
try:
from googlesearch import search
except ImportError:
print("No module named 'google' found")
from class_boardgames import Boardgames
# Ask whether a fresh CSV should be created or an existing one resumed.
answer = ''
while answer not in ('Y', 'n'):
    answer = input('Do you want to create a new csv file? [Y/n] ')

# Column layout shared by both the new-file and the resume paths.
header = ['Name', 'Board game url', 'BGG Rating', 'User rating', 'Total reviews', 'Max # players',
          'Optimal # players', 'Min playing time', 'Max playing time', 'Weight/5', 'Min price', 'Skroutz url']

if answer == 'Y':
    # New file: start from the top of the ranking and write the header row.
    start = 1
    f = open('Board_games.csv', 'w', encoding='UTF8', newline='')
    wr = csv.writer(f)
    wr.writerow(header)
else:
    # Resume: load the existing CSV and continue from a user-chosen line.
    # Subtract 1 because line 1 of the file holds the header.
    start = int(input('From which line should the csv start? The number must be in the range [3-3000] ')) - 1
    df = pd.read_csv('Board_games.csv')

# Paging bookkeeping for walking the BGG ranking pages.
flag = int(input('How many board games do you want to retrieve from the list? '))
ppgc = (start - 1) % 100            # per-page game counter (100 games per ranking page)
blanks = ppgc // 15                 # each page shows 6 blank rows, one per 15 games
page = (start - 1) // 100
blank_count = blanks + page * 6     # blank rows skipped so far, across all pages
game_counter = (start - 1) + blank_count
init_gc = game_counter              # constant starting index
# ----------- retrieve data for each board game ----------------------
# Cache the last fetched ranking page so BGG is hit once per 100 games,
# not once per game (the original re-downloaded the page every iteration).
loaded_page = None
results = []
while game_counter < init_gc + flag:
    if page != loaded_page:
        # Each ranking page lists 100 games (plus 6 blank separator rows).
        url = 'https://boardgamegeek.com/browse/boardgame/page/' + str(page + 1) + '?sort=rank'
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, 'html.parser')
        results = soup('tr')[1:]  # skip the first <tr> (table header)
        loaded_page = page

    row = results[ppgc + blanks]
    game_name = row.find(class_='primary').text
    init_url = row.find(class_='primary')['href']
    bgg_url = 'https://boardgamegeek.com' + init_url

    # Rating columns: [0] geek rating, [1] average user rating, [2] vote count.
    res_ratings = row.find_all(class_='collection_bggrating')
    bgg_rating = float(res_ratings[0].text.split()[0])
    user_rating = float(res_ratings[1].text.split()[0])
    votes = int(res_ratings[2].text.split()[0])

    # Open the game's own BGG page and scrape the extra attributes.
    p = Boardgames(bgg_url)
    max_p = p._maxplayers()
    opt_p = p._players()
    min_t, max_t = p._playingtime()
    dif = p._weight()

    # Google for the game's skroutz.gr listing and read its displayed price.
    query = f'{game_name} skroutz'
    driver = None
    try:
        # Reset before searching: previously a stale skroutz_url from the
        # PREVIOUS game could be reused when no usable result was found,
        # recording a wrong price for this game.
        skroutz_url = '-'
        for web_url in search(query, tld="co.in", num=4, stop=4, pause=4):
            # Take the first real listing; 'keyphrase' URLs are search pages.
            if 'keyphrase' not in web_url:
                skroutz_url = web_url
                break
        if skroutz_url == '-':
            raise ValueError('no usable search result')
        # Selenium is needed because the price is rendered client-side.
        driver = webdriver.Chrome()
        driver.get(skroutz_url)
        price = driver.find_element(By.CLASS_NAME, 'dominant-price').text.split()[0].replace(',', '.')
    except Exception:
        # Best effort: any failure (search quota, WebDriver error, missing
        # price element) records placeholders instead of aborting the run.
        # Was a bare `except:`, which also swallowed KeyboardInterrupt.
        price = '-'
        skroutz_url = '-'
    finally:
        if driver is not None:
            driver.quit()  # release the Chrome process (was leaked once per game)
    # -----------------------------------------------------------------------
    # ------------------ Print the results in the csv file --------------------
    new_row = [game_name, bgg_url, bgg_rating, user_rating, votes, max_p,
               opt_p, min_t, max_t, dif, price, skroutz_url]
    if start == 1:
        wr.writerow(new_row)
    else:
        row_idx = game_counter - blank_count
        if len(df) > row_idx:
            # Update the existing row in place (replaces write-through-.values,
            # which bypassed pandas' alignment and copy semantics).
            df.loc[row_idx, header] = new_row
        else:
            # DataFrame.append was removed in pandas 2.0; concatenate a
            # one-row frame with the same column order instead.
            df = pd.concat([df, pd.DataFrame([new_row], columns=header)], ignore_index=True)

    # Advance the counters: the page layout inserts one blank row per 15
    # games, and the per-page counter resets every 100 games (new page).
    game_counter += 1
    blanks = (ppgc + 1) // 15
    if (ppgc + 1) % 15 == 0:
        game_counter += 1  # skip the blank separator row
        blank_count += 1
    ppgc = (game_counter - blank_count) % 100
    page = (game_counter - blank_count) // 100
    if ppgc == 0 and blanks != 0:
        blanks = 0  # a fresh page starts with zero blank rows consumed
# ----------------------------------------------------------------------
# Persist the results: resume mode rewrites the CSV from the DataFrame;
# new-file mode wrote rows incrementally, so just flush and close the handle
# (it was previously never closed).
if start > 1:
    df.to_csv('Board_games.csv', index=False)
else:
    f.close()