Movie-Me-Now/scrapper.py
2025-04-03 15:13:40 -05:00

162 lines
5.6 KiB
Python

import requests
import json
import time
from tqdm import tqdm # progress bar library
import concurrent.futures
# Replace with your actual TMDb API key
# SECURITY NOTE(review): a live-looking API key is committed in source control.
# Prefer reading it from an environment variable, and rotate this key.
api_key = "96f3424d6fe55c2982e6e094416607f5"
# Output file where results are saved incrementally
output_filename = "movies.json"
def write_movies(movies, filename=output_filename):
    """Persist *movies* to *filename* as pretty-printed UTF-8 JSON.

    Failures are reported to stdout instead of raised, so a transient
    disk error never aborts a long-running scrape.
    """
    try:
        with open(filename, "w", encoding="utf-8") as fh:
            json.dump(movies, fh, indent=4, ensure_ascii=False)
    except Exception as exc:
        print(f"Error saving data to JSON file: {exc}")
def get_movie_details_tmdb(movie_id):
    """
    Fetch additional details for a movie using the TMDb API.

    Parameters:
        movie_id: TMDb numeric movie id.

    Returns:
        dict with "runtime" (minutes, may be None) and "genres" (list of
        genre names) on success; an empty dict on any failure. Errors are
        printed, never raised, so one bad movie cannot abort the scrape.
    """
    details = {}
    details_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "language": "en-US"
    }
    try:
        # timeout= keeps a stalled connection from hanging a worker thread forever
        response = requests.get(details_url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            details["runtime"] = data.get("runtime")  # runtime in minutes
            details["genres"] = [g["name"] for g in data.get("genres", [])]
        else:
            print(f"Failed to get details for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        print(f"Exception while fetching details for movie {movie_id}: {e}")
    return details
def get_movie_keywords(movie_id):
    """
    Fetch movie keywords (tags) using the TMDb API.

    Parameters:
        movie_id: TMDb numeric movie id.

    Returns:
        list of keyword names; empty list on any failure. Errors are
        printed, never raised.
    """
    keywords = []
    keywords_url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords"
    params = {
        "api_key": api_key
    }
    try:
        # timeout= keeps a stalled connection from hanging a worker thread forever
        response = requests.get(keywords_url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            keywords = [kw["name"] for kw in data.get("keywords", [])]
        else:
            print(f"Failed to get keywords for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        print(f"Exception while fetching keywords for movie {movie_id}: {e}")
    return keywords
def process_movie(movie, page, idx, results_per_page):
    """
    Build one normalized record from a single TMDb result entry.

    Derives the movie's overall ranking from its page and position, copies
    the basic fields, then makes two extra API calls (details for
    runtime/genres, keywords for tags). Sleeps briefly at the end to help
    throttle the request rate.
    """
    movie_id = movie.get("id")
    release_date = movie.get("release_date", "")
    poster_path = movie.get("poster_path")

    # Two additional lookups per movie: runtime/genres, then keyword tags.
    details = get_movie_details_tmdb(movie_id)
    tags = get_movie_keywords(movie_id)

    record = {
        "ranking": (page - 1) * results_per_page + idx + 1,
        "title": movie.get("title"),
        "year": release_date.split("-")[0] if release_date else None,
        "runtime": details.get("runtime"),
        "content_rating": None,  # Not available via TMDb by default.
        "metascore": None,  # Not applicable.
        "imdb_rating": movie.get("vote_average"),  # Using TMDb's vote average.
        "vote_count": movie.get("vote_count"),
        "description": movie.get("overview"),
        "poster": f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None,
        "url": f"https://www.themoviedb.org/movie/{movie_id}",
        "genres": details.get("genres", []),
        "tags": tags
    }
    # Brief sleep to help throttle requests
    time.sleep(0.2)
    return record
def get_top_movies():
    """
    Retrieve every page of TMDb's top-rated movies and process them.

    Each page's entries are enriched concurrently via a thread pool, and
    the accumulated list is flushed to disk after every page so a crash
    mid-run loses at most one page of work.

    Returns:
        list of movie record dicts. Order within a page is not guaranteed,
        because results are collected as futures complete.
    """
    movies = []
    base_url = "https://api.themoviedb.org/3/movie/top_rated"
    params = {
        "api_key": api_key,
        "language": "en-US",
        "page": 1
    }
    # Initial request to determine total pages.
    response = requests.get(base_url, params=params, timeout=10)
    if response.status_code != 200:
        print("Failed to retrieve top rated movies")
        return []
    data = response.json()
    total_pages = data.get("total_pages", 1)
    # Loop through all pages.
    for page in tqdm(range(1, total_pages + 1), desc="Scraping top rated movies"):
        params["page"] = page
        try:
            # timeout= keeps one stalled request from hanging the whole scrape;
            # a network error skips this page instead of crashing a long run.
            response = requests.get(base_url, params=params, timeout=10)
        except requests.RequestException as e:
            print(f"Failed to retrieve page {page}: {e}")
            continue
        if response.status_code != 200:
            print(f"Failed to retrieve page {page}")
            continue
        data = response.json()
        results = data.get("results", [])
        results_per_page = len(results)
        # Process each movie concurrently using a thread pool.
        # NOTE(review): 100 workers may exceed TMDb's rate limits — consider lowering.
        with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
            futures = [
                executor.submit(process_movie, movie, page, idx, results_per_page)
                for idx, movie in enumerate(results)
            ]
            # Collect results as they complete.
            for future in concurrent.futures.as_completed(futures):
                try:
                    movies.append(future.result())
                except Exception as e:
                    print(f"Error processing movie: {e}")
        # Write movies to JSON file incrementally after each page.
        write_movies(movies)
        # Pause between pages.
        time.sleep(0.5)
    return movies
if __name__ == "__main__":
    # Run the full scrape; results are written to disk incrementally as it goes.
    top_movies = get_top_movies()
    print(f"\nData saved to {output_filename}")