-
Notifications
You must be signed in to change notification settings - Fork 0
/
Scrape_IMDb.py
69 lines (59 loc) · 2.38 KB
/
Scrape_IMDb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# ================================== Note =======================================
# If you encounter an error indicating that the IMDb module is not found, it means
# that the IMDb library is not installed. To install the library, use the following
# command in your terminal (Command Prompt or Windows PowerShell):
#
# pip install imdbpy
#
# After installation, it is recommended to restart your system to ensure proper
# configuration of the library.
#
# The scraped data is saved to movies.csv in the folder the script is run from;
# pandas creates the file, or overwrites it if it already exists. If the program
# reports an error, the cause may be the IMDb service itself, which sometimes
# rejects requests. In that case, wait a few minutes and try again. Be aware that
# repeated failed attempts may get your IP address blocked, because many websites
# restrict data scraping to protect privacy and avoid copyright issues.
#
# Group Members:
# - Zulqarnain Ahmed (BSE-23S-088)
# - Syed Zubair Sarwar (BSE-23S-086)
# ===============================================================================
import time
from imdb import IMDb
import pandas as pd # Importing pandas library
# Initialize IMDb instance
ia = IMDb()
# Function to fetch movie details
def fetch_movie_details(movie_id, *, client=None):
    """Fetch the title and genres of one IMDb movie.

    Args:
        movie_id: IMDb movie identifier, passed unchanged to ``get_movie``.
        client: Optional object exposing ``get_movie(movie_id)``. When omitted,
            the module-level ``ia`` instance is used, so existing one-argument
            calls behave as before.

    Returns:
        A dict with the keys ``'movie_id'``, ``'title'`` and ``'genres'``
        (genres joined into one comma-separated string, empty when the movie
        lists none), or ``None`` if the lookup raised an exception.
    """
    imdb_client = ia if client is None else client
    try:
        # Retrieve movie details from IMDb
        movie = imdb_client.get_movie(movie_id)
        # ``or []`` also covers a 'genres' entry that is present but None,
        # which would otherwise make ``join`` raise and drop the whole record.
        genres = movie.get('genres') or []
        return {
            'movie_id': movie_id,
            'title': movie.get('title'),
            'genres': ', '.join(genres),
        }
    except Exception as e:
        # Deliberate best-effort boundary: one failed lookup must not abort a
        # long scraping run, so report it and let the caller skip this ID.
        print(f"Error fetching movie {movie_id}: {e}")
        return None
# Inclusive range of IMDb IDs to scrape (62,424 IDs in total)
FIRST_MOVIE_ID = 1
LAST_MOVIE_ID = 62424
# Pause between requests, in seconds, to avoid rate limiting
REQUEST_DELAY_SECONDS = 0.5
# Column order of the output file; passing it explicitly keeps the CSV header
# present even when no movie could be fetched
CSV_COLUMNS = ['movie_id', 'title', 'genres']

# List to store movie data
movies_data = []
interrupted = False
try:
    for movie_id in range(FIRST_MOVIE_ID, LAST_MOVIE_ID + 1):
        # Fetch details for each movie and add to list; failed lookups
        # return None and are skipped
        movie_details = fetch_movie_details(movie_id)
        if movie_details:
            movies_data.append(movie_details)
        # Print progress every 100 IDs. The ID counter and the number of
        # movies actually fetched differ whenever a lookup failed.
        if movie_id % 100 == 0:
            print(f"Processed {movie_id} IDs, fetched {len(movies_data)} movies")
        # Respectful delay to avoid rate limiting
        time.sleep(REQUEST_DELAY_SECONDS)
except KeyboardInterrupt:
    # The full run takes hours; on Ctrl-C fall through to the save step so
    # the movies collected so far are not lost.
    interrupted = True

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(movies_data, columns=CSV_COLUMNS)
# Write data to CSV file
df.to_csv('movies.csv', index=False)
# Print completion message
if interrupted:
    print(f"Scraping interrupted. Saved {len(df)} movies to movies.csv")
else:
    print("Data scraping complete. Saved to movies.csv")