Commit 914c085cac ("bad")
Parent: 547b161138

app.py (206 lines changed)
@@ -2,6 +2,7 @@ from flask import Flask, request, render_template, redirect, url_for, session
 import json
 import numpy as np
 import random
+import math
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity

@@ -12,67 +13,141 @@ app.secret_key = 'your_secret_key_here'  # Replace with a secure key in production
 with open('top_movies.json', 'r', encoding='utf-8') as f:
     movies = json.load(f)

-# Assign a unique ID and preprocess features for each movie
+# Preprocess each movie
 for i, movie in enumerate(movies):
-    movie['id'] = i  # Unique id for each movie
-    # Combine genres and tags into a feature string (could add description etc.)
+    movie['id'] = i  # Unique ID
+    # Combine genres and tags into one feature string.
     movie['features'] = ' '.join(movie.get('genres', [])) + ' ' + ' '.join(movie.get('tags', []))
-    # Ensure numeric values for year and runtime if possible:
+    # Ensure numeric values for year and runtime:
     try:
         movie['year_num'] = int(movie.get('year', '0'))
     except:
         movie['year_num'] = 0
     try:
-        # runtime might be a number already or a string; if string, try to convert.
         movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
     except:
         movie['runtime_num'] = 0
+    # Ensure vote_count is numeric.
+    try:
+        count = movie.get('vote_count', 0)
+        if isinstance(count, str):
+            count = count.replace(',', '')
+            if 'M' in count:
+                count = float(count.replace('M', '')) * 1e6
+            else:
+                count = int(count)
+        movie['vote_count'] = int(count)
+    except:
+        movie['vote_count'] = 0

 # Build the TF‑IDF vectorizer on movie features.
 vectorizer = TfidfVectorizer(stop_words='english')
 movie_features = [movie['features'] for movie in movies]
 movie_vectors = vectorizer.fit_transform(movie_features)

-# Precompute overall ranges for numeric features across the dataset.
+# Precompute overall ranges for numeric features.
 years = [m['year_num'] for m in movies if m['year_num'] > 0]
 runtimes = [m['runtime_num'] for m in movies if m['runtime_num'] > 0]
+max_vote = max([m['vote_count'] for m in movies]) if movies else 1

 min_year, max_year = (min(years), max(years)) if years else (0, 1)
 min_runtime, max_runtime = (min(runtimes), max(runtimes)) if runtimes else (0, 1)
 year_range = max_year - min_year if max_year != min_year else 1
 runtime_range = max_runtime - min_runtime if max_runtime != min_runtime else 1
+rating_range = 10.0  # Assuming ratings are on a 0–10 scale

-def get_diverse_movies(num=10):
+def get_predicted_movies(num=10):
     """
-    Pick up to `num` movies that have not been shown yet, trying to cover different genres.
+    Return up to `num` movies that haven't been shown yet.
+    Uses the user's past ratings to predict which unseen movies they might like.
+    If no ratings exist, falls back to random selection.
     """
     asked = session.get('asked_movies', [])
     available = [m for m in movies if m['id'] not in asked]
     if not available:
         return []
-    selected = []
-    # List of desired genres to cover
-    desired_genres = ["Action", "Adventure", "Comedy", "Drama", "Horror",
-                      "Romance", "Sci-Fi", "Thriller", "Animation", "Documentary"]
-    # Try to pick one movie per desired genre.
-    for genre in desired_genres:
-        for m in available:
-            if genre in m.get('genres', []) and m not in selected:
-                selected.append(m)
-                break
-        if len(selected) >= num:
-            break
-    # If we still need more movies, fill the remainder randomly.
-    if len(selected) < num:
-        remaining = [m for m in available if m not in selected]
-        random.shuffle(remaining)
-        selected.extend(remaining[:(num - len(selected))])
-    return selected[:num]
+    rated = session.get('rated_movies', {})
+    # Fallback to random selection if there are no like/dislike ratings.
+    if not rated or not any(r in ['like', 'dislike'] for r in rated.values()):
+        random.shuffle(available)
+        return available[:num]
+
+    # Build prediction profiles.
+    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
+    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']
+
+    if liked_ids:
+        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
+    else:
+        liked_profile = np.zeros((1, movie_vectors.shape[1]))
+    if disliked_ids:
+        disliked_profile = np.asarray(movie_vectors[disliked_ids].mean(axis=0))
+    else:
+        disliked_profile = np.zeros((1, movie_vectors.shape[1]))
+
+    # Compute numeric averages for liked movies.
+    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
+    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
+    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
+
+    avg_year = np.mean(liked_years) if liked_years else None
+    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
+    avg_rating = np.mean(liked_ratings) if liked_ratings else None
+
+    predictions = []
+    # Tunable weights.
+    w_text = 0.5
+    w_year = 0.1
+    w_runtime = 0.1
+    w_rating = 0.15
+    w_popularity = 0.15
+
+    for movie in available:
+        i = movie['id']
+        # TEXT SIMILARITY.
+        movie_vector = movie_vectors[i].toarray()
+        like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
+        dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
+        text_score = like_sim - dislike_sim

+        # YEAR SIMILARITY.
+        year_score = 0
+        if avg_year is not None and movie['year_num'] > 0:
+            diff_year = abs(movie['year_num'] - avg_year)
+            year_score = 1 - (diff_year / year_range)
+
+        # RUNTIME SIMILARITY.
+        runtime_score = 0
+        if avg_runtime is not None and movie['runtime_num'] > 0:
+            diff_runtime = abs(movie['runtime_num'] - avg_runtime)
+            runtime_score = 1 - (diff_runtime / runtime_range)
+
+        # RATING SIMILARITY.
+        rating_score = 0
+        movie_rating = movie.get('imdb_rating', 0)
+        if avg_rating is not None and movie_rating:
+            diff_rating = abs(movie_rating - avg_rating)
+            rating_score = 1 - (diff_rating / rating_range)
+
+        # POPULARITY SCORE.
+        popularity_score = 0
+        if movie['vote_count'] > 0:
+            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
+
+        # Final prediction score.
+        final_score = (w_text * text_score +
+                       w_year * year_score +
+                       w_runtime * runtime_score +
+                       w_rating * rating_score +
+                       w_popularity * popularity_score)
+        predictions.append((movie, final_score))
+
+    predictions.sort(key=lambda x: x[1], reverse=True)
+    return [pred[0] for pred in predictions[:num]]

 def enough_info():
     """
-    Determines whether we have collected enough ratings.
-    In this example, we require that the user has given a 'like' or 'dislike'
-    to at least 3 movies.
+    Check if the user has rated at least 3 movies (like/dislike).
     """
     rated = session.get('rated_movies', {})
     count = sum(1 for rating in rated.values() if rating in ['like', 'dislike'])
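The liked/disliked profiles introduced above are just row means of the TF-IDF matrix, compared against each unseen movie with cosine similarity. A minimal standalone sketch of that mechanism, using toy feature strings in place of the real genre/tag data (illustration only, not part of the commit):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy stand-ins for the movie['features'] strings built above.
docs = ["Action Adventure hero", "Romance Drama love", "Action Thriller spy"]
vectors = TfidfVectorizer(stop_words='english').fit_transform(docs)

liked_ids = [0, 2]  # pretend the user liked the two action titles
liked_profile = np.asarray(vectors[liked_ids].mean(axis=0))

# Score the remaining title against the liked profile.
sim = cosine_similarity(vectors[1].toarray(), liked_profile)[0][0]
print(sim)  # 0.0 here: the romance title shares no terms with the liked profile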
@@ -80,15 +155,13 @@ def enough_info():

 @app.route('/')
 def home():
-    # Initialize session variables
     session.setdefault('rated_movies', {})  # {movie_id: rating}
-    session.setdefault('asked_movies', [])  # list of movie ids already asked
+    session.setdefault('asked_movies', [])  # list of movie IDs already shown
     return redirect(url_for('questionnaire'))

 @app.route('/questionnaire', methods=['GET', 'POST'])
 def questionnaire():
     if request.method == 'POST':
-        # Process ratings from the current round.
         current_ids = request.form.getlist("movie_id")
         for movie_id in current_ids:
             rating = request.form.get(f"rating_{movie_id}")
@@ -101,25 +174,34 @@ def questionnaire():
         else:
             return redirect(url_for('questionnaire'))
     else:
-        selected_movies = get_diverse_movies(num=10)
+        # Use prediction to select movies for the questionnaire.
+        selected_movies = get_predicted_movies(num=10)
         if not selected_movies:
             return redirect(url_for('recommend'))
         return render_template('questionnaire.html', movies=selected_movies)

 def advanced_recommendations():
     """
-    Build an advanced recommendation score for movies not rated by the user.
+    Compute an advanced hybrid recommendation score on unseen movies.
+    Only movies not already shown (asked) are considered.
     Combines:
-      1. Text similarity (from TF-IDF features on genres/tags).
-      2. Year similarity: movies with similar release years to liked movies.
-      3. Runtime similarity: movies with similar runtime to liked movies.
-    The final score is a weighted sum of these signals.
+      1. Text similarity (TF‑IDF) between liked/disliked profiles.
+      2. Year similarity.
+      3. Runtime similarity.
+      4. Rating similarity.
+      5. Popularity (log-scaled vote count).
+    Returns the top 20 recommendations.
     """
     rated = session.get('rated_movies', {})
+    asked = set(session.get('asked_movies', []))
+    # Only consider movies that haven't been shown to the user.
+    available = [m for m in movies if m['id'] not in asked]
+    if not available:
+        available = movies  # Fallback if all movies have been shown.
+
     liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
     disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']

-    # Build text profiles for liked/disliked movies.
     if liked_ids:
         liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
     else:
@@ -129,48 +211,56 @@ def advanced_recommendations():
     else:
         disliked_profile = np.zeros((1, movie_vectors.shape[1]))

-    # Compute numeric averages for liked movies (for year and runtime).
     liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
     liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
+    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
     avg_year = np.mean(liked_years) if liked_years else None
     avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
+    avg_rating = np.mean(liked_ratings) if liked_ratings else None

     recommendations = []
-    # Weights for each component – adjust these to tune the algorithm.
-    w_text = 0.70
-    w_year = 0.15
-    w_runtime = 0.15
+    w_text = 0.5
+    w_year = 0.1
+    w_runtime = 0.1
+    w_rating = 0.15
+    w_popularity = 0.15

-    for i, movie in enumerate(movies):
-        movie_id = str(movie['id'])
-        if rated.get(movie_id, "not seen") != "not seen":
-            continue  # Skip movies already rated.
+    for movie in available:
+        i = movie['id']

-        # TEXT SIMILARITY: difference between similarity to liked and disliked profiles.
         movie_vector = movie_vectors[i].toarray()
         like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
         dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
         text_score = like_sim - dislike_sim

-        # NUMERIC SIMILARITY for Year.
         year_score = 0
         if avg_year is not None and movie['year_num'] > 0:
             diff_year = abs(movie['year_num'] - avg_year)
-            year_score = 1 - (diff_year / year_range)  # normalized similarity (1 means identical)
+            year_score = 1 - (diff_year / year_range)

-        # NUMERIC SIMILARITY for Runtime.
         runtime_score = 0
         if avg_runtime is not None and movie['runtime_num'] > 0:
             diff_runtime = abs(movie['runtime_num'] - avg_runtime)
             runtime_score = 1 - (diff_runtime / runtime_range)

-        # Final combined score.
-        final_score = w_text * text_score + w_year * year_score + w_runtime * runtime_score
+        rating_score = 0
+        movie_rating = movie.get('imdb_rating', 0)
+        if avg_rating is not None and movie_rating:
+            diff_rating = abs(movie_rating - avg_rating)
+            rating_score = 1 - (diff_rating / rating_range)
+
+        popularity_score = 0
+        if movie['vote_count'] > 0:
+            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
+
+        final_score = (w_text * text_score +
+                       w_year * year_score +
+                       w_runtime * runtime_score +
+                       w_rating * rating_score +
+                       w_popularity * popularity_score)
         recommendations.append((movie, final_score))

-    # Sort recommendations by final score in descending order.
     recommendations.sort(key=lambda x: x[1], reverse=True)
-    return recommendations
+    return recommendations[:20]

 @app.route('/recommend')
 def recommend():
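Both scoring paths added in this commit (get_predicted_movies and advanced_recommendations) blend the same five normalized signals with weights 0.5 / 0.1 / 0.1 / 0.15 / 0.15. A small sketch with made-up component values, just to show how the weighting plays out (illustration only, not part of the commit):

# Toy component values; in app.py these come from TF-IDF similarity,
# the liked-movie averages, and the log-scaled vote count.
text_score = 0.42        # like_sim - dislike_sim
year_score = 0.80        # 1 - |year - avg_year| / year_range
runtime_score = 0.75     # 1 - |runtime - avg_runtime| / runtime_range
rating_score = 0.90      # 1 - |imdb_rating - avg_rating| / rating_range
popularity_score = 0.65  # log(vote_count + 1) / log(max_vote + 1)

w_text, w_year, w_runtime, w_rating, w_popularity = 0.5, 0.1, 0.1, 0.15, 0.15
final_score = (w_text * text_score + w_year * year_score + w_runtime * runtime_score +
               w_rating * rating_score + w_popularity * popularity_score)
print(final_score)  # ≈ 0.5975; the text-similarity term dominates under these weights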
@@ -179,3 +269,5 @@ def recommend():

 if __name__ == '__main__':
     app.run(debug=True)
+
+
test.py (105 lines changed)
@@ -2,6 +2,7 @@ import requests
 import json
 import time
 from tqdm import tqdm  # progress bar library
+import concurrent.futures

 # Replace with your actual TMDb API key
 api_key = "96f3424d6fe55c2982e6e094416607f5"
@@ -60,11 +61,55 @@ def get_movie_keywords(movie_id):
         print(f"Exception while fetching keywords for movie {movie_id}: {e}")
     return keywords

+def process_movie(movie, page, idx, results_per_page):
+    """
+    Processes a single movie record:
+      - Computes its ranking,
+      - Extracts basic information,
+      - Fetches additional details and keywords.
+    """
+    ranking = (page - 1) * results_per_page + idx + 1
+    movie_id = movie.get("id")
+    title = movie.get("title")
+    release_date = movie.get("release_date", "")
+    year = release_date.split("-")[0] if release_date else None
+    vote_average = movie.get("vote_average")
+    vote_count = movie.get("vote_count")
+    overview = movie.get("overview")
+    poster_path = movie.get("poster_path")
+    poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
+    tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"
+
+    # Get additional details and keywords.
+    details = get_movie_details_tmdb(movie_id)
+    runtime = details.get("runtime")
+    genres = details.get("genres", [])
+
+    tags = get_movie_keywords(movie_id)
+
+    movie_data = {
+        "ranking": ranking,
+        "title": title,
+        "year": year,
+        "runtime": runtime,
+        "content_rating": None,  # Not available via TMDb by default.
+        "metascore": None,  # Not applicable.
+        "imdb_rating": vote_average,  # Using TMDb's vote average.
+        "vote_count": vote_count,
+        "description": overview,
+        "poster": poster,
+        "url": tmdb_url,
+        "genres": genres,
+        "tags": tags
+    }
+    # Brief sleep to help throttle requests
+    time.sleep(0.2)
+    return movie_data
+
 def get_top_movies():
     """
-    Uses the TMDb API to retrieve top rated movies, then iterates through all pages.
-    For each movie, additional details and keywords are fetched.
-    After processing each page, the current movies list is saved to a JSON file.
+    Uses the TMDb API to retrieve top-rated movies and processes them concurrently.
+    After processing each page, the current list of movies is written to a JSON file.
     """
     movies = []
     base_url = "https://api.themoviedb.org/3/movie/top_rated"
@@ -91,47 +136,21 @@ def get_top_movies():
             continue
         data = response.json()
         results = data.get("results", [])
-        for idx, movie in enumerate(results):
-            # Ranking is computed by overall order.
-            ranking = (page - 1) * len(results) + idx + 1
-            movie_id = movie.get("id")
-            title = movie.get("title")
-            release_date = movie.get("release_date", "")
-            year = release_date.split("-")[0] if release_date else None
-            vote_average = movie.get("vote_average")
-            vote_count = movie.get("vote_count")
-            overview = movie.get("overview")
-            poster_path = movie.get("poster_path")
-            poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
-            tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"
-
-            # Get additional details: runtime and genres.
-            details = get_movie_details_tmdb(movie_id)
-            runtime = details.get("runtime")
-            genres = details.get("genres", [])
-
-            # Get keywords (tags).
-            tags = get_movie_keywords(movie_id)
-
-            movie_data = {
-                "ranking": ranking,
-                "title": title,
-                "year": year,
-                "runtime": runtime,
-                "content_rating": None,  # Not available via TMDb by default.
-                "metascore": None,  # Not applicable.
-                "imdb_rating": vote_average,  # Using TMDb's vote average.
-                "vote_count": vote_count,
-                "description": overview,
-                "poster": poster,
-                "url": tmdb_url,
-                "genres": genres,
-                "tags": tags
-            }
-            movies.append(movie_data)
-            # Pause a bit between detail requests to be courteous.
-            time.sleep(0.2)
-        # After processing each page, write the current movies list to the JSON file.
+        results_per_page = len(results)
+
+        # Process each movie concurrently using a thread pool.
+        with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
+            futures = []
+            for idx, movie in enumerate(results):
+                futures.append(executor.submit(process_movie, movie, page, idx, results_per_page))
+            # Collect results as they complete.
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    movie_data = future.result()
+                    movies.append(movie_data)
+                except Exception as e:
+                    print(f"Error processing movie: {e}")
+        # Write movies to JSON file incrementally after each page.
         write_movies(movies)
         # Pause between pages.
         time.sleep(0.5)
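The rewritten page loop fans each movie out to a thread pool and collects results with as_completed. A stripped-down, self-contained sketch of that pattern, with a dummy worker standing in for process_movie (illustration only, not part of the commit):

import concurrent.futures
import time

def fake_process(idx):
    # Stand-in for process_movie: simulate a slow network call, then return a record.
    time.sleep(0.1)
    return {"idx": idx}

records = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(fake_process, i) for i in range(20)]
    for future in concurrent.futures.as_completed(futures):
        try:
            records.append(future.result())
        except Exception as e:
            print(f"Error processing item: {e}")

print(len(records))  # 20; completion order is not submission order

Because as_completed yields futures in completion order, the collected list is unordered; that is fine here since each record carries its own precomputed ranking.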
top_movies.json (96832 lines changed)
File diff suppressed because it is too large