"""Scrape TMDb's top rated movies and save them incrementally to a JSON file."""

import json
import os
import time

import requests
from tqdm import tqdm  # progress bar library

# TMDb API key. The TMDB_API_KEY environment variable takes precedence; the
# literal is kept only as a backward-compatible fallback.
# NOTE(review): a key committed to source control should be rotated and the
# fallback removed once every caller sets the environment variable.
api_key = os.environ.get("TMDB_API_KEY", "96f3424d6fe55c2982e6e094416607f5")

# Output file where results are saved incrementally.
output_filename = "top_movies.json"

TMDB_API_BASE = "https://api.themoviedb.org/3"
TOP_RATED_URL = f"{TMDB_API_BASE}/movie/top_rated"

# Seconds to wait for a response before giving up; without a timeout a
# stalled connection would block the scraper forever.
REQUEST_TIMEOUT = 10

# TMDb rejects list requests for page numbers above this value.
MAX_TMDB_PAGE = 500

# Page size assumed only if the first page unexpectedly has no results.
DEFAULT_PAGE_SIZE = 20


def write_movies(movies, filename=output_filename):
    """Write the movies list to a JSON file.

    The data is written to a sibling ``<filename>.tmp`` file first and then
    moved over the target, so an interruption in the middle of a write cannot
    destroy the results saved by an earlier call.

    Args:
        movies: JSON-serializable list of movie dictionaries.
        filename: Destination path; defaults to ``output_filename``.

    Errors are reported on stdout instead of being raised, so a failed save
    does not abort the scraping run.
    """
    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(movies, f, indent=4, ensure_ascii=False)
        os.replace(tmp_filename, filename)
    except (OSError, TypeError, ValueError) as e:
        print(f"Error saving data to JSON file: {e}")


def get_movie_details_tmdb(movie_id):
    """Fetch additional details for a movie using the TMDb API.

    Args:
        movie_id: TMDb identifier of the movie.

    Returns:
        A dict with ``runtime`` (as reported by TMDb, in minutes) and
        ``genres`` (list of genre names). The dict is empty when the request
        fails; failures are printed, not raised.
    """
    details = {}
    details_url = f"{TMDB_API_BASE}/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "language": "en-US",
    }
    try:
        response = requests.get(
            details_url, params=params, timeout=REQUEST_TIMEOUT
        )
        if response.status_code == 200:
            data = response.json()
            details["runtime"] = data.get("runtime")  # runtime in minutes
            # ``or []`` also covers a key that is present but null.
            details["genres"] = [g["name"] for g in data.get("genres") or []]
        else:
            print(
                f"Failed to get details for movie {movie_id}: "
                f"status code {response.status_code}"
            )
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f"Exception while fetching details for movie {movie_id}: {e}")
    return details


def get_movie_keywords(movie_id):
    """Fetch movie keywords (tags) using the TMDb API.

    Args:
        movie_id: TMDb identifier of the movie.

    Returns:
        A list of keyword names; empty when the request fails. Failures are
        printed, not raised.
    """
    keywords = []
    keywords_url = f"{TMDB_API_BASE}/movie/{movie_id}/keywords"
    params = {"api_key": api_key}
    try:
        response = requests.get(
            keywords_url, params=params, timeout=REQUEST_TIMEOUT
        )
        if response.status_code == 200:
            data = response.json()
            keywords = [kw["name"] for kw in data.get("keywords") or []]
        else:
            print(
                f"Failed to get keywords for movie {movie_id}: "
                f"status code {response.status_code}"
            )
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f"Exception while fetching keywords for movie {movie_id}: {e}")
    return keywords


def _fetch_top_rated_page(page):
    """Return the decoded JSON of one top rated page, or None on failure."""
    params = {
        "api_key": api_key,
        "language": "en-US",
        "page": page,
    }
    try:
        response = requests.get(
            TOP_RATED_URL, params=params, timeout=REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Exception while fetching top rated page {page}: {e}")
        return None


def _build_movie_record(movie, ranking):
    """Combine one list entry with its details and keywords into a record."""
    movie_id = movie.get("id")
    release_date = movie.get("release_date", "")
    poster_path = movie.get("poster_path")

    # Additional details: runtime and genres.
    details = get_movie_details_tmdb(movie_id)
    # Keywords (tags).
    tags = get_movie_keywords(movie_id)

    return {
        "ranking": ranking,
        "title": movie.get("title"),
        "year": release_date.split("-")[0] if release_date else None,
        "runtime": details.get("runtime"),
        "content_rating": None,  # Not available via TMDb by default.
        "metascore": None,  # Not applicable.
        "imdb_rating": movie.get("vote_average"),  # TMDb's vote average.
        "vote_count": movie.get("vote_count"),
        "description": movie.get("overview"),
        "poster": (
            f"https://image.tmdb.org/t/p/w500{poster_path}"
            if poster_path
            else None
        ),
        "url": f"https://www.themoviedb.org/movie/{movie_id}",
        "genres": details.get("genres", []),
        "tags": tags,
    }


def get_top_movies():
    """Retrieve every top rated movie from TMDb, page by page.

    For each movie, additional details and keywords are fetched. After each
    page is processed, the movies collected so far are saved to the JSON
    file, so an interrupted run keeps its progress.

    Returns:
        The list of movie dictionaries, or an empty list when the first page
        cannot be retrieved. Pages that fail later are reported and skipped.
    """
    movies = []

    # The first request provides both the page count and the first page of
    # results, so it is reused below instead of being fetched a second time.
    first_page = _fetch_top_rated_page(1)
    if first_page is None:
        print("Failed to retrieve top rated movies")
        return []

    total_pages = min(first_page.get("total_pages") or 1, MAX_TMDB_PAGE)

    # Rankings must be based on the size of a full page. Using the length of
    # the current page would mis-number a short (typically the last) page.
    page_size = len(first_page.get("results") or []) or DEFAULT_PAGE_SIZE

    for page in tqdm(
        range(1, total_pages + 1), desc="Scraping top rated movies"
    ):
        data = first_page if page == 1 else _fetch_top_rated_page(page)
        if data is None:
            print(f"Failed to retrieve page {page}")
            continue

        for idx, movie in enumerate(data.get("results") or []):
            # Ranking is computed by overall order across pages.
            ranking = (page - 1) * page_size + idx + 1
            movies.append(_build_movie_record(movie, ranking))
            # Pause a bit between detail requests to be courteous.
            time.sleep(0.2)

        # After processing each page, persist the current movies list.
        write_movies(movies)
        # Pause between pages.
        time.sleep(0.5)

    return movies


if __name__ == "__main__":
    top_movies = get_top_movies()
    if top_movies:
        print(f"\nData saved to {output_filename}")
    else:
        # Nothing is written when no page could be processed, so do not
        # claim that data was saved.
        print("\nNo movies were retrieved; nothing was saved.")