Movie-Me-Now/scrapper.py
2025-04-03 15:13:40 -05:00

162 lines
5.6 KiB
Python

import requests
import json
import time
from tqdm import tqdm # progress bar library
import concurrent.futures
# Replace with your actual TMDb API key
# SECURITY NOTE(review): a live-looking API key is committed in source control.
# Prefer reading it from an environment variable, and rotate this key.
api_key = "96f3424d6fe55c2982e6e094416607f5"
# Output file where results are saved incrementally
output_filename = "movies.json"
def write_movies(movies, filename=output_filename):
    """Persist *movies* to *filename* as pretty-printed UTF-8 JSON.

    Failures are reported to stdout instead of raised, so a transient
    disk error never aborts a long-running scrape.
    """
    try:
        with open(filename, "w", encoding="utf-8") as fh:
            json.dump(movies, fh, indent=4, ensure_ascii=False)
    except Exception as exc:
        print(f"Error saving data to JSON file: {exc}")
def get_movie_details_tmdb(movie_id):
    """
    Fetch additional details for a movie using the TMDb API.

    Parameters:
        movie_id: TMDb numeric movie id.

    Returns:
        dict with "runtime" (minutes, may be None) and "genres" (list of
        genre names) on success; an empty dict on any failure. Errors are
        printed, never raised, so one bad movie cannot abort the scrape.
    """
    details = {}
    details_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "language": "en-US"
    }
    try:
        # timeout= keeps a stalled connection from hanging a worker thread forever
        response = requests.get(details_url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            details["runtime"] = data.get("runtime")  # runtime in minutes
            details["genres"] = [g["name"] for g in data.get("genres", [])]
        else:
            print(f"Failed to get details for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        print(f"Exception while fetching details for movie {movie_id}: {e}")
    return details
def get_movie_keywords(movie_id):
    """
    Fetch movie keywords (tags) using the TMDb API.

    Parameters:
        movie_id: TMDb numeric movie id.

    Returns:
        list of keyword names; empty list on any failure. Errors are
        printed, never raised.
    """
    keywords = []
    keywords_url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords"
    params = {
        "api_key": api_key
    }
    try:
        # timeout= keeps a stalled connection from hanging a worker thread forever
        response = requests.get(keywords_url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            keywords = [kw["name"] for kw in data.get("keywords", [])]
        else:
            print(f"Failed to get keywords for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        print(f"Exception while fetching keywords for movie {movie_id}: {e}")
    return keywords
def process_movie(movie, page, idx, results_per_page):
    """
    Build one normalized record from a single TMDb result entry.

    Derives the movie's overall ranking from its page and position, copies
    the basic fields, then makes two extra API calls (details for
    runtime/genres, keywords for tags). Sleeps briefly at the end to help
    throttle the request rate.
    """
    movie_id = movie.get("id")
    release_date = movie.get("release_date", "")
    poster_path = movie.get("poster_path")

    # Two additional lookups per movie: runtime/genres, then keyword tags.
    details = get_movie_details_tmdb(movie_id)
    tags = get_movie_keywords(movie_id)

    record = {
        "ranking": (page - 1) * results_per_page + idx + 1,
        "title": movie.get("title"),
        "year": release_date.split("-")[0] if release_date else None,
        "runtime": details.get("runtime"),
        "content_rating": None,  # Not available via TMDb by default.
        "metascore": None,  # Not applicable.
        "imdb_rating": movie.get("vote_average"),  # Using TMDb's vote average.
        "vote_count": movie.get("vote_count"),
        "description": movie.get("overview"),
        "poster": f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None,
        "url": f"https://www.themoviedb.org/movie/{movie_id}",
        "genres": details.get("genres", []),
        "tags": tags
    }
    # Brief sleep to help throttle requests
    time.sleep(0.2)
    return record
def get_top_movies():
    """
    Retrieve every page of TMDb's top-rated movies and process them.

    Each page's entries are enriched concurrently via a thread pool, and
    the accumulated list is flushed to disk after every page so a crash
    mid-run loses at most one page of work.

    Returns:
        list of movie record dicts. Order within a page is not guaranteed,
        because results are collected as futures complete.
    """
    movies = []
    base_url = "https://api.themoviedb.org/3/movie/top_rated"
    params = {
        "api_key": api_key,
        "language": "en-US",
        "page": 1
    }
    # Initial request to determine total pages.
    response = requests.get(base_url, params=params, timeout=10)
    if response.status_code != 200:
        print("Failed to retrieve top rated movies")
        return []
    data = response.json()
    total_pages = data.get("total_pages", 1)
    # Loop through all pages.
    for page in tqdm(range(1, total_pages + 1), desc="Scraping top rated movies"):
        params["page"] = page
        try:
            # timeout= keeps one stalled request from hanging the whole scrape;
            # a network error skips this page instead of crashing a long run.
            response = requests.get(base_url, params=params, timeout=10)
        except requests.RequestException as e:
            print(f"Failed to retrieve page {page}: {e}")
            continue
        if response.status_code != 200:
            print(f"Failed to retrieve page {page}")
            continue
        data = response.json()
        results = data.get("results", [])
        results_per_page = len(results)
        # Process each movie concurrently using a thread pool.
        # NOTE(review): 100 workers may exceed TMDb's rate limits — consider lowering.
        with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
            futures = [
                executor.submit(process_movie, movie, page, idx, results_per_page)
                for idx, movie in enumerate(results)
            ]
            # Collect results as they complete.
            for future in concurrent.futures.as_completed(futures):
                try:
                    movies.append(future.result())
                except Exception as e:
                    print(f"Error processing movie: {e}")
        # Write movies to JSON file incrementally after each page.
        write_movies(movies)
        # Pause between pages.
        time.sleep(0.5)
    return movies
if __name__ == "__main__":
    # Run the full scrape; results are written to disk incrementally as it goes.
    top_movies = get_top_movies()
    print(f"\nData saved to {output_filename}")