test_scraper_windows.py
import bs4
import requests
import selenium
from selenium import webdriver
import os
import time
from pymongo import MongoClient
from selenium.webdriver.common.by import By
'''
Install selenium and chromedriver (make sure chromedriver matches your version of chrome)
'''
# pip install --user pymongo
# pip install --user "pymongo[srv]"
# download chromedriver and place it somewhere
# chromeDriverPath = "C:\\Users\\Emma\\Desktop\\HW\\Senior Year\\Big Data\\chromedriver"
# driver = webdriver.Chrome(chromeDriverPath)
option = webdriver.ChromeOptions()
# Prevent some useless logs
option.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
# driver = webdriver.Chrome(chrome_options=option)
chromePath = "C:\\Users\\Emma\\Desktop\\HW\\Senior Year\\Big Data\\chromedriver"
driver = webdriver.Chrome(executable_path=chromePath, options=option)
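# Note: executable_path was removed in Selenium 4.10+. If the line above fails on a newer Selenium,
# a sketch of the Service-based equivalent (reusing the same chromePath) would be:
# from selenium.webdriver.chrome.service import Service
# driver = webdriver.Chrome(service=Service(chromePath), options=option)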
# driver.get("https://www.google.co.in")

try:
    # Open the website
    driver.get('https://images.google.com/')
    # Find cam button
    cam_button = driver.find_elements(by=By.XPATH, value="//div[@aria-label=\"Search by image\" and @role=\"button\"]")[0]
    cam_button.click()
    # Find upload tab
    upload_tab = driver.find_elements(by=By.XPATH, value="//*[contains(text(), 'Upload an image')]")[0]
    upload_tab.click()
    # Find image input
    upload_btn = driver.find_elements(by=By.NAME, value='encoded_image')[0]
    upload_btn.send_keys(os.getcwd() + "/test_image.jpg")
    # Click on "visually similar images"
    driver.find_elements(by=By.XPATH, value="""//*[@id="rso"]/div[2]/div/div[2]/g-section-with-header/div[1]/title-with-lhs-icon/a/div[2]/h3""")[0].click()
    time.sleep(2)
except Exception as e:
    print(e)
# If we want to search by term
# search_URL = "https://www.google.com/search?q=cute+puppies&source=lnms&tbm=isch"
# driver.get(search_URL)
num_images = 20
# scroll down until we have enough images
while True:
    page_html = driver.page_source
    pageSoup = bs4.BeautifulSoup(page_html, 'html.parser')
    containers = pageSoup.findAll('div', {'class': "isv-r PNCib MSM1fd BUooTd"})
    driver.execute_script("window.scrollBy(0,1000);")
    len_containers = len(containers)
    # once we have enough containers to scrape the # of images we want
    if len_containers > num_images:
        break
# scroll back to the top of the page
driver.execute_script("window.scrollTo(0,0);")
print("found %s image containers"%(len(containers)))

for i in range(1, num_images + 1):
    # in google images every 25th result is 'related images'; clicking it would break the scraper, so skip it
    if i % 25 == 0:
        continue
    xpath = """//*[@id="islrg"]/div[1]/div[%s]""" % (i)
    image_website_xpath = """//*[@id="islrg"]/div[1]/div[%s]/a[2]""" % (i)
    preview_image_xpath = """//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img""" % (i)
    image_website_element = driver.find_elements(by=By.XPATH, value=image_website_xpath)[0]
    preview_image_element = driver.find_elements(by=By.XPATH, value=preview_image_xpath)[0]
    preview_image_url = preview_image_element.get_attribute("src")
    driver.find_elements(by=By.XPATH, value=xpath)[0].click()
    timeStarted = time.time()
    while True:
        image_element1 = driver.find_elements(by=By.XPATH, value="""//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div[1]/a/img""")[0]
        # gather image URL, description, and source website
        imageURL = image_element1.get_attribute("src")
        image_description = image_element1.get_attribute("alt")
        image_website = image_website_element.get_attribute("href")
        # if the new high-res image has loaded, stop waiting
        if imageURL != preview_image_url:
            break
        # if the high-res image has not loaded within 3 seconds, give up and keep the preview URL
        if time.time() - timeStarted > 3:
            break
    # use PyMongo to insert the scraped data into MongoDB directly
    my_client = MongoClient("mongodb+srv://team_andrew:Green91%40%[email protected]/test")
    db = my_client.ImageSearch  # connect to the "ImageSearch" database
    collection = db.get_collection("ImageData")  # connect to the "ImageData" collection
    image_element = {"imageLink": imageURL, "description": image_description, "websiteLink": image_website}  # create the document to insert
    data = [image_element]
    result = collection.insert_many(data)  # insert the saved data into the collection
    print("image description:", image_description)
    print("image URL:", imageURL)
    print("image website:", image_website)