google_scholar.py
#!/usr/bin/env python
"""
google_scholar: This module contains the GoogleScholarUser class, which
scrapes the articles listed on the Google Scholar profile of a given user.
"""
from __future__ import print_function

import time

import requests
from bs4 import BeautifulSoup


class GoogleScholarUser:
    """
    This class scrapes the articles of a given user on Google Scholar,
    organises them into possible categories and makes them available
    under the attribute `articles`.
    """
    start_page = 0
    end_page = 100
    base_url = "https://scholar.google.co.in/citations?"
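    # Present a regular desktop-browser User-Agent so the request looks like an
    # ordinary browser visit rather than a script.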
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
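    # Asking for json=1 makes Scholar answer with a JSON document; the HTML
    # fragment holding the article rows is returned under the 'B' key
    # (see get_scholar_articles below).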
    payload = {
        'json': 1
    }
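    # One shared HTTP session so cookies and connections are reused across the
    # paginated requests.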
    session = requests.Session()

    def __init__(self, user_id):
        """
        Initialize the necessary variables.
        Definitions:
            - user_id: User ID of the user to scrape
            - url: URL used for POST/GET requests
            - articles: Raw scraped data
            - response: Response object after a GET/POST request
        """
        self.user_id = user_id
        self.url = self.base_url + 'user=' + self.user_id
        self.articles = None
        self.response = None

    # Generate the URL based on the current page range
    def __frame_url(self):
        """
        Generate the appropriate URL based on the start and
        end pages to scrape.
        """
        self.url = self.base_url + 'user=' + self.user_id
        self.url += '&hl=en&oi=ao&cstart=' + str(self.start_page)
        self.url += '&pagesize=' + str(self.end_page)

    # This function fetches all Google Scholar articles
    def get_scholar_articles(self):
        """
        Populate the raw articles.
        """
        # Generate the URL
        self.__frame_url()
        # Fetch the data
        self.response = self.session.post(self.url, headers=self.headers, data=self.payload)
        if not self.response.ok:
            raise Exception('Request failed with status code ' + str(self.response.status_code))
        # Sanitize the fetched data
        self.response = self.response.json()
        self.articles = BeautifulSoup(self.response['B'], 'html.parser')
        self.articles = self.articles.find_all('tr')
        # Google Scholar returns at most 100 articles per request,
        # so we repeat the loop for every further batch of 100 articles.
        while len(self.articles) == self.end_page:
            self.start_page += 100
            self.end_page += 100
            time.sleep(2)  # Make sure we don't query too often
            self.__frame_url()
            self.response = self.session.post(self.url, headers=self.headers, data=self.payload)
            if not self.response.ok:
                raise Exception('Request failed with status code ' + str(self.response.status_code))
            self.response = self.response.json()
            current_fetch = BeautifulSoup(self.response['B'], 'html.parser')
            # Append the newly fetched articles to the ones collected so far
            for article in current_fetch.find_all('tr'):
                self.articles.append(article)
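

# A minimal usage sketch, not part of the original module: it assumes you have
# a real Google Scholar profile ID; 'EXAMPLE_USER_ID' below is a placeholder.
if __name__ == '__main__':
    scholar_user = GoogleScholarUser('EXAMPLE_USER_ID')  # hypothetical profile ID
    scholar_user.get_scholar_articles()
    # Each entry in `articles` is a BeautifulSoup <tr> tag from the citations table.
    print('Fetched', len(scholar_user.articles), 'article rows')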