143 lines
5.2 KiB
Python
143 lines
5.2 KiB
Python
|
import json
import os
import time

import requests
from tqdm import tqdm  # progress bar library
|
||
|
|
||
|
# TMDb API key. Prefer supplying it through the TMDB_API_KEY environment
# variable so the secret does not have to live in the source file; the literal
# below is kept only as a backward-compatible fallback.
# NOTE(review): a key that has been committed to a repository should be
# treated as exposed and rotated.
api_key = os.environ.get("TMDB_API_KEY") or "96f3424d6fe55c2982e6e094416607f5"

# Output file where results are saved incrementally.
output_filename = "top_movies.json"
|
||
|
|
||
|
def write_movies(movies, filename=output_filename):
    """Write the movies list to a JSON file without clobbering a good copy.

    The data is first serialized to ``<filename>.tmp`` and only moved over
    ``filename`` once the dump has finished, so an interrupted or failed save
    leaves the previously written file intact instead of a truncated one.

    Args:
        movies: JSON-serializable list of movie records.
        filename: Destination path; defaults to the module-level
            ``output_filename``.

    Returns:
        None. Errors are reported on stdout rather than raised, so a failed
        save does not abort the caller.
    """
    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(movies, f, indent=4, ensure_ascii=False)
        # Swap the completed file into place only after the dump succeeded.
        os.replace(tmp_filename, filename)
    except Exception as e:
        print(f"Error saving data to JSON file: {e}")
        # Best-effort cleanup of a partially written temp file.
        try:
            os.remove(tmp_filename)
        except OSError:
            pass
|
||
|
|
||
|
def get_movie_details_tmdb(movie_id, timeout=10):
    """Fetch additional details for a movie using the TMDb API.

    Args:
        movie_id: TMDb movie identifier, interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        dict: ``{"runtime": ..., "genres": [...]}`` when the API answers with
        HTTP 200, otherwise an empty dict. Failures are printed, never raised.
    """
    details = {}
    details_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "language": "en-US",
    }
    try:
        response = requests.get(details_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            # Build the genre list before storing anything so a malformed
            # payload cannot leave a half-filled result; "or []" tolerates an
            # explicit null value for the key.
            genres = [g["name"] for g in data.get("genres") or []]
            details["runtime"] = data.get("runtime")  # runtime in minutes
            details["genres"] = genres
        else:
            print(f"Failed to get details for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        # Deliberately broad: a single bad movie must not stop the whole run.
        print(f"Exception while fetching details for movie {movie_id}: {e}")
    return details
|
||
|
|
||
|
def get_movie_keywords(movie_id, timeout=10):
    """Fetch movie keywords (tags) using the TMDb API.

    Args:
        movie_id: TMDb movie identifier, interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        list: Keyword names when the API answers with HTTP 200, otherwise an
        empty list. Failures are printed, never raised.
    """
    keywords = []
    keywords_url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords"
    params = {
        "api_key": api_key,
    }
    try:
        response = requests.get(keywords_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            # "or []" tolerates an explicit null value for the key.
            keywords = [kw["name"] for kw in data.get("keywords") or []]
        else:
            print(f"Failed to get keywords for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        # Deliberately broad: a single bad movie must not stop the whole run.
        print(f"Exception while fetching keywords for movie {movie_id}: {e}")
    return keywords
|
||
|
|
||
|
def _fetch_top_rated_page(base_url, params, timeout=10):
    """Return the decoded JSON body of one listing page, or None on failure."""
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers an undecodable JSON body.
        print(f"Exception while fetching top rated page {params.get('page')}: {e}")
        return None


def _build_movie_record(movie, ranking):
    """Merge one listing entry with its detail and keyword lookups."""
    movie_id = movie.get("id")
    release_date = movie.get("release_date", "")
    poster_path = movie.get("poster_path")

    # Additional details: runtime and genres.
    details = get_movie_details_tmdb(movie_id)

    return {
        "ranking": ranking,
        "title": movie.get("title"),
        "year": release_date.split("-")[0] if release_date else None,
        "runtime": details.get("runtime"),
        "content_rating": None,  # Not available via TMDb by default.
        "metascore": None,  # Not applicable.
        "imdb_rating": movie.get("vote_average"),  # Using TMDb's vote average.
        "vote_count": movie.get("vote_count"),
        "description": movie.get("overview"),
        "poster": f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None,
        "url": f"https://www.themoviedb.org/movie/{movie_id}",
        "genres": details.get("genres", []),
        "tags": get_movie_keywords(movie_id),  # keywords (tags)
    }


def get_top_movies():
    """Retrieve every page of TMDb's top rated movies with per-movie extras.

    The first listing request supplies ``total_pages`` and is reused as page 1
    instead of being downloaded a second time. For each movie, additional
    details and keywords are fetched. After processing each page, the current
    movies list is saved to the JSON file via ``write_movies``.

    Returns:
        list: One dict per movie (ranking, title, year, runtime, ratings,
        description, poster, url, genres, tags). An empty list is returned
        when the first listing request fails. Pages that fail later are
        reported on stdout and skipped.
    """
    movies = []
    base_url = "https://api.themoviedb.org/3/movie/top_rated"
    params = {
        "api_key": api_key,
        "language": "en-US",
        "page": 1,
    }

    # Initial request to determine total pages.
    first_page = _fetch_top_rated_page(base_url, params)
    if first_page is None:
        print("Failed to retrieve top rated movies")
        return []
    total_pages = first_page.get("total_pages", 1)

    # Rank offsets must use the size of a full page. Multiplying by the
    # current page's own length mis-numbers a short final page, producing
    # ranks that collide with earlier ones.
    # NOTE(review): assumes every page except possibly the last has as many
    # entries as page 1 - confirm against the TMDb API documentation.
    page_size = len(first_page.get("results") or [])

    # Loop through all pages.
    for page in tqdm(range(1, total_pages + 1), desc="Scraping top rated movies"):
        if page == 1:
            data = first_page  # already downloaded above
        else:
            params["page"] = page
            data = _fetch_top_rated_page(base_url, params)
            if data is None:
                print(f"Failed to retrieve page {page}")
                continue

        results = data.get("results") or []
        for idx, movie in enumerate(results):
            # Ranking is computed by overall order.
            ranking = (page - 1) * (page_size or len(results)) + idx + 1
            movies.append(_build_movie_record(movie, ranking))
            # Pause a bit between detail requests to be courteous.
            time.sleep(0.2)

        # After processing each page, write the current movies list to the JSON file.
        write_movies(movies)
        # Pause between pages.
        time.sleep(0.5)
    return movies
|
||
|
|
||
|
if __name__ == "__main__":
    top_movies = get_top_movies()
    # get_top_movies() returns an empty list when the listing could not be
    # retrieved, so only announce a saved file when there is data behind it.
    if top_movies:
        print(f"\nData saved to {output_filename}")
    else:
        print("\nNo movies were retrieved.")
|