Movie-Me-Now/test.py
2025-04-02 13:49:04 -05:00

143 lines
5.2 KiB
Python

import json
import os
import time

import requests
from tqdm import tqdm  # progress bar library
# TMDb API key. Prefer supplying it through the TMDB_API_KEY environment
# variable so the credential does not have to live in source control; the
# literal is kept only as a fallback so existing setups keep working.
# An empty environment value is treated the same as an unset one.
api_key = os.environ.get("TMDB_API_KEY") or "96f3424d6fe55c2982e6e094416607f5"
# Output file where results are saved incrementally
output_filename = "top_movies.json"
def write_movies(movies, filename=output_filename):
    """Write the movies list to a JSON file without clobbering the last save.

    The data is dumped to a sibling ``<filename>.tmp`` file first and only
    moved over ``filename`` once the dump has completed. Opening the
    destination directly with ``"w"`` would truncate the previous save
    before the new one is known to be good, so an interrupted or failed
    write would lose everything saved so far.

    Args:
        movies: JSON-serializable list of movie dictionaries.
        filename: Destination path; defaults to the module-level
            ``output_filename``.

    Failures are reported on stdout instead of being raised, so a failed
    save does not abort the caller.
    """
    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(movies, f, indent=4, ensure_ascii=False)
        # Swap the finished file into place in a single step.
        os.replace(tmp_filename, filename)
    except (OSError, TypeError, ValueError) as e:
        print(f"Error saving data to JSON file: {e}")
        try:
            os.remove(tmp_filename)
        except OSError:
            # The temp file may never have been created; nothing to clean up.
            pass
def get_movie_details_tmdb(movie_id, timeout=10):
    """Fetch additional details for a movie using the TMDb API.

    Args:
        movie_id: TMDb identifier of the movie.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with ``runtime`` (minutes) and ``genres`` (list of genre
        names) when the request succeeds. On a non-200 status the dict is
        empty; if an exception occurs the dict holds whatever had been
        filled in so far. The problem is reported on stdout either way.
    """
    details = {}
    details_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "language": "en-US",
    }
    try:
        # requests waits indefinitely unless a timeout is given, which would
        # stall the caller on a single unresponsive connection.
        response = requests.get(details_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            details["runtime"] = data.get("runtime")  # runtime in minutes
            # "genres" can be present but null; treat that like a missing key.
            details["genres"] = [g["name"] for g in data.get("genres") or []]
        else:
            print(f"Failed to get details for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        # Deliberately broad: this is a best-effort enrichment step and a
        # single bad movie must not abort the caller.
        print(f"Exception while fetching details for movie {movie_id}: {e}")
    return details
def get_movie_keywords(movie_id, timeout=10):
    """Fetch movie keywords (tags) using the TMDb API.

    Args:
        movie_id: TMDb identifier of the movie.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of keyword names, or an empty list when the request fails or
        the response cannot be processed (the problem is reported on stdout).
    """
    keywords = []
    keywords_url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords"
    params = {
        "api_key": api_key,
    }
    try:
        # requests waits indefinitely unless a timeout is given, which would
        # stall the caller on a single unresponsive connection.
        response = requests.get(keywords_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            # "keywords" can be present but null; treat that like a missing key.
            keywords = [kw["name"] for kw in data.get("keywords") or []]
        else:
            print(f"Failed to get keywords for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        # Deliberately broad: this is a best-effort enrichment step and a
        # single bad movie must not abort the caller.
        print(f"Exception while fetching keywords for movie {movie_id}: {e}")
    return keywords
def _fetch_top_rated_page(url, params, timeout):
    """Return the decoded JSON object for one listing page, or None on failure."""
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Request error for page {params.get('page')}: {e}")
        return None
    if response.status_code != 200:
        return None
    try:
        data = response.json()
    except ValueError as e:
        print(f"Invalid JSON for page {params.get('page')}: {e}")
        return None
    return data if isinstance(data, dict) else None


def _build_movie_record(movie, ranking):
    """Merge one listing entry with its details and keywords into an output dict."""
    movie_id = movie.get("id")
    release_date = movie.get("release_date") or ""
    poster_path = movie.get("poster_path")
    # Get additional details: runtime and genres.
    details = get_movie_details_tmdb(movie_id)
    return {
        "ranking": ranking,
        "title": movie.get("title"),
        "year": release_date.split("-")[0] if release_date else None,
        "runtime": details.get("runtime"),
        "content_rating": None,  # Not available via TMDb by default.
        "metascore": None,  # Not applicable.
        "imdb_rating": movie.get("vote_average"),  # Using TMDb's vote average.
        "vote_count": movie.get("vote_count"),
        "description": movie.get("overview"),
        "poster": f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None,
        "url": f"https://www.themoviedb.org/movie/{movie_id}",
        "genres": details.get("genres", []),
        # Keywords (tags).
        "tags": get_movie_keywords(movie_id),
    }


def get_top_movies(timeout=10):
    """Retrieve every top rated movie from TMDb, enriched with details and tags.

    The first listing page is requested once to learn the page count and page
    size, then reused instead of being fetched again. For each movie,
    additional details and keywords are fetched. After processing each page,
    the current movies list is saved to the JSON file via ``write_movies``.

    Args:
        timeout: Seconds to wait for each listing request before giving up.

    Returns:
        The list of movie dicts collected, or an empty list when the first
        listing request fails. Pages that fail later are reported and
        skipped; the rankings of the remaining pages are unaffected.
    """
    movies = []
    base_url = "https://api.themoviedb.org/3/movie/top_rated"
    params = {
        "api_key": api_key,
        "language": "en-US",
        "page": 1,
    }
    # TMDb rejects page numbers above 500 even when total_pages is larger.
    max_page = 500

    # Initial request to determine total pages.
    first_page = _fetch_top_rated_page(base_url, params, timeout)
    if first_page is None:
        print("Failed to retrieve top rated movies")
        return []
    total_pages = min(first_page.get("total_pages") or 1, max_page)
    # Rank from the size of a full page: the last page is usually shorter,
    # and using its own length would produce wrong rankings for it.
    page_size = len(first_page.get("results") or [])

    # Loop through all pages.
    for page in tqdm(range(1, total_pages + 1), desc="Scraping top rated movies"):
        if page == 1:
            data = first_page
        else:
            params["page"] = page
            data = _fetch_top_rated_page(base_url, params, timeout)
            if data is None:
                print(f"Failed to retrieve page {page}")
                continue
        results = data.get("results") or []
        for idx, movie in enumerate(results):
            # Ranking is computed by overall order.
            ranking = (page - 1) * page_size + idx + 1
            movies.append(_build_movie_record(movie, ranking))
            # Pause a bit between detail requests to be courteous.
            time.sleep(0.2)
        # After processing each page, write the current movies list to the JSON file.
        write_movies(movies)
        # Pause between pages.
        time.sleep(0.5)
    return movies
if __name__ == "__main__":
    top_movies = get_top_movies()
    # get_top_movies() returns an empty list when nothing could be scraped;
    # do not claim a successful save in that case.
    if top_movies:
        print(f"\nData saved to {output_filename}")
    else:
        print("\nNo movies were retrieved.")