"""Scrape TMDb's top-rated movies (details + keywords) into a JSON file.

For every page of the /movie/top_rated endpoint, each movie's extra
details (runtime, genres) and keywords are fetched concurrently, and the
accumulated list is flushed to disk after each page so partial progress
survives a crash.
"""

import concurrent.futures
import json
import os
import time

import requests
from tqdm import tqdm  # progress bar library

# TMDb API key.
# NOTE(security): a key was hard-coded in this file; prefer supplying it via
# the TMDB_API_KEY environment variable — the literal remains only as a
# backward-compatible fallback and should be rotated.
api_key = os.environ.get("TMDB_API_KEY", "96f3424d6fe55c2982e6e094416607f5")

# Output file where results are saved incrementally
output_filename = "top_movies.json"

# TMDb rejects requests with page > 500 on paginated endpoints, even when
# the reported total_pages is larger.
MAX_PAGES = 500

# Seconds before a stuck HTTP request is abandoned instead of hanging forever.
REQUEST_TIMEOUT = 10


def write_movies(movies, filename=output_filename):
    """Write *movies* to *filename* as pretty-printed UTF-8 JSON.

    Errors are reported but not raised, so a transient disk problem does not
    abort a long-running scrape.
    """
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(movies, f, indent=4, ensure_ascii=False)
    except Exception as e:
        print(f"Error saving data to JSON file: {e}")


def get_movie_details_tmdb(movie_id):
    """Fetch additional details for a movie using the TMDb API.

    Returns a dict with "runtime" (minutes) and "genres" (list of names);
    the dict is empty when the request fails.
    """
    details = {}
    details_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "language": "en-US"
    }
    try:
        response = requests.get(details_url, params=params, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            data = response.json()
            details["runtime"] = data.get("runtime")  # runtime in minutes
            details["genres"] = [g["name"] for g in data.get("genres", [])]
        else:
            print(f"Failed to get details for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        print(f"Exception while fetching details for movie {movie_id}: {e}")
    return details


def get_movie_keywords(movie_id):
    """Fetch movie keywords (tags) using the TMDb API.

    Returns a list of keyword names; empty on failure.
    """
    keywords = []
    keywords_url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords"
    params = {
        "api_key": api_key
    }
    try:
        response = requests.get(keywords_url, params=params, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            data = response.json()
            keywords = [kw["name"] for kw in data.get("keywords", [])]
        else:
            print(f"Failed to get keywords for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        print(f"Exception while fetching keywords for movie {movie_id}: {e}")
    return keywords


def process_movie(movie, page, idx, results_per_page):
    """Build the full output record for one movie.

    - Computes its 1-based ranking across all pages,
    - Extracts basic information from the listing payload,
    - Fetches additional details and keywords (two extra API calls).
    """
    ranking = (page - 1) * results_per_page + idx + 1
    movie_id = movie.get("id")
    title = movie.get("title")
    release_date = movie.get("release_date", "")
    year = release_date.split("-")[0] if release_date else None
    vote_average = movie.get("vote_average")
    vote_count = movie.get("vote_count")
    overview = movie.get("overview")
    poster_path = movie.get("poster_path")
    poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
    tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"

    # Get additional details and keywords.
    details = get_movie_details_tmdb(movie_id)
    runtime = details.get("runtime")
    genres = details.get("genres", [])
    tags = get_movie_keywords(movie_id)

    movie_data = {
        "ranking": ranking,
        "title": title,
        "year": year,
        "runtime": runtime,
        "content_rating": None,  # Not available via TMDb by default.
        "metascore": None,       # Not applicable.
        "imdb_rating": vote_average,  # Using TMDb's vote average.
        "vote_count": vote_count,
        "description": overview,
        "poster": poster,
        "url": tmdb_url,
        "genres": genres,
        "tags": tags
    }
    # Brief sleep to help throttle requests
    time.sleep(0.2)
    return movie_data


def get_top_movies():
    """Retrieve all top-rated movies from TMDb, processing pages concurrently.

    After each page the accumulated list is written to the JSON file, so the
    output file is always a valid (if partial) snapshot in ranking order.
    Returns the full list of movie records.
    """
    movies = []
    base_url = "https://api.themoviedb.org/3/movie/top_rated"
    params = {
        "api_key": api_key,
        "language": "en-US",
        "page": 1
    }

    # Initial request to determine total pages.
    try:
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
    except Exception as e:
        print(f"Failed to retrieve top rated movies: {e}")
        return []
    if response.status_code != 200:
        print("Failed to retrieve top rated movies")
        return []
    data = response.json()
    # The API may report more pages than it will actually serve; clamp to the
    # documented 500-page maximum to avoid guaranteed request failures.
    total_pages = min(data.get("total_pages", 1), MAX_PAGES)

    # Loop through all pages.
    for page in tqdm(range(1, total_pages + 1), desc="Scraping top rated movies"):
        params["page"] = page
        try:
            response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
        except Exception as e:
            print(f"Failed to retrieve page {page}: {e}")
            continue
        if response.status_code != 200:
            print(f"Failed to retrieve page {page}")
            continue
        data = response.json()
        results = data.get("results", [])
        results_per_page = len(results)

        # Process each movie concurrently. Each movie costs two API calls, so
        # keep the pool small enough to stay within TMDb's rate limits
        # (100 workers previously triggered mass throttling failures).
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = [
                executor.submit(process_movie, movie, page, idx, results_per_page)
                for idx, movie in enumerate(results)
            ]
            # Collect results in submission order so the output list (and the
            # incrementally written JSON) stays sorted by ranking.
            for future in futures:
                try:
                    movies.append(future.result())
                except Exception as e:
                    print(f"Error processing movie: {e}")

        # Write movies to JSON file incrementally after each page.
        write_movies(movies)
        # Pause between pages.
        time.sleep(0.5)

    return movies


if __name__ == "__main__":
    top_movies = get_top_movies()
    print(f"\nData saved to {output_filename}")