This commit is contained in:
OusmBlueNinja 2025-04-02 14:26:37 -05:00
parent 547b161138
commit 914c085cac
3 changed files with 94981 additions and 2180 deletions

206
app.py
View File

@ -2,6 +2,7 @@ from flask import Flask, request, render_template, redirect, url_for, session
import json
import numpy as np
import random
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
@ -12,67 +13,141 @@ app.secret_key = 'your_secret_key_here' # Replace with a secure key in producti
# Load the movie catalogue once at startup.
with open('top_movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# Preprocess each movie
for i, movie in enumerate(movies):
    movie['id'] = i  # Unique ID
    # Combine genres and tags into one feature string.
    movie['features'] = ' '.join(movie.get('genres', [])) + ' ' + ' '.join(movie.get('tags', []))
    # Ensure numeric values for year and runtime:
    try:
        movie['year_num'] = int(movie.get('year', '0'))
    except (TypeError, ValueError):
        movie['year_num'] = 0
    try:
        # runtime might be a number already or a string; if string, try to convert.
        movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
    except (TypeError, ValueError):
        movie['runtime_num'] = 0
    # Ensure vote_count is numeric (handles "1,234" and "1.2M" style strings).
    try:
        count = movie.get('vote_count', 0)
        if isinstance(count, str):
            count = count.replace(',', '')
            if 'M' in count:
                count = float(count.replace('M', '')) * 1e6
            else:
                count = int(count)
        movie['vote_count'] = int(count)
    except (TypeError, ValueError):
        movie['vote_count'] = 0

# Build the TF-IDF vectorizer on movie features.
vectorizer = TfidfVectorizer(stop_words='english')
movie_features = [movie['features'] for movie in movies]
movie_vectors = vectorizer.fit_transform(movie_features)

# Precompute overall ranges for numeric features.
years = [m['year_num'] for m in movies if m['year_num'] > 0]
runtimes = [m['runtime_num'] for m in movies if m['runtime_num'] > 0]
max_vote = max([m['vote_count'] for m in movies]) if movies else 1
min_year, max_year = (min(years), max(years)) if years else (0, 1)
min_runtime, max_runtime = (min(runtimes), max(runtimes)) if runtimes else (0, 1)
year_range = max_year - min_year if max_year != min_year else 1
runtime_range = max_runtime - min_runtime if max_runtime != min_runtime else 1
rating_range = 10.0  # Assuming ratings are on a 0-10 scale
def get_predicted_movies(num=10):
    """
    Return up to `num` movies that haven't been shown yet.

    Uses the user's past ratings to predict which unseen movies they might
    like. If no like/dislike ratings exist, falls back to random selection.
    """
    asked = session.get('asked_movies', [])
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        return []
    rated = session.get('rated_movies', {})
    # Fallback to random selection if there are no like/dislike ratings.
    if not rated or not any(r in ['like', 'dislike'] for r in rated.values()):
        random.shuffle(available)
        return available[:num]
    # Build prediction profiles from the user's likes and dislikes.
    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']
    if liked_ids:
        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
    else:
        liked_profile = np.zeros((1, movie_vectors.shape[1]))
    if disliked_ids:
        disliked_profile = np.asarray(movie_vectors[disliked_ids].mean(axis=0))
    else:
        disliked_profile = np.zeros((1, movie_vectors.shape[1]))
    # Compute numeric averages for liked movies.
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    avg_rating = np.mean(liked_ratings) if liked_ratings else None
    predictions = []
    # Tunable weights.
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15
    for movie in available:
        i = movie['id']
        # TEXT SIMILARITY: similarity to liked profile minus similarity to disliked.
        movie_vector = movie_vectors[i].toarray()
        like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
        dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
        text_score = like_sim - dislike_sim
        # YEAR SIMILARITY: 1 when identical to the liked average, 0 at the dataset extreme.
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            diff_year = abs(movie['year_num'] - avg_year)
            year_score = 1 - (diff_year / year_range)
        # RUNTIME SIMILARITY.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            diff_runtime = abs(movie['runtime_num'] - avg_runtime)
            runtime_score = 1 - (diff_runtime / runtime_range)
        # RATING SIMILARITY.
        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            diff_rating = abs(movie_rating - avg_rating)
            rating_score = 1 - (diff_rating / rating_range)
        # POPULARITY SCORE: log-scaled vote count, normalized by the dataset maximum.
        popularity_score = 0
        if movie['vote_count'] > 0:
            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
        # Final prediction score: weighted sum of all signals.
        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        predictions.append((movie, final_score))
    predictions.sort(key=lambda x: x[1], reverse=True)
    return [pred[0] for pred in predictions[:num]]
def enough_info():
"""
Determines whether we have collected enough ratings.
In this example, we require that the user has given a 'like' or 'dislike'
to at least 3 movies.
Check if the user has rated at least 3 movies (like/dislike).
"""
rated = session.get('rated_movies', {})
count = sum(1 for rating in rated.values() if rating in ['like', 'dislike'])
@ -80,15 +155,13 @@ def enough_info():
@app.route('/')
def home():
    """Entry point: seed per-user session state, then start the questionnaire."""
    # Initialize session variables
    session.setdefault('rated_movies', {})  # {movie_id: rating}
    session.setdefault('asked_movies', [])  # list of movie IDs already shown
    return redirect(url_for('questionnaire'))
@app.route('/questionnaire', methods=['GET', 'POST'])
def questionnaire():
if request.method == 'POST':
# Process ratings from the current round.
current_ids = request.form.getlist("movie_id")
for movie_id in current_ids:
rating = request.form.get(f"rating_{movie_id}")
@ -101,25 +174,34 @@ def questionnaire():
else:
return redirect(url_for('questionnaire'))
else:
selected_movies = get_diverse_movies(num=10)
# Use prediction to select movies for the questionnaire.
selected_movies = get_predicted_movies(num=10)
if not selected_movies:
return redirect(url_for('recommend'))
return render_template('questionnaire.html', movies=selected_movies)
def advanced_recommendations():
"""
Build an advanced recommendation score for movies not rated by the user.
Compute an advanced hybrid recommendation score on unseen movies.
Only movies not already shown (asked) are considered.
Combines:
1. Text similarity (from TF-IDF features on genres/tags).
2. Year similarity: movies with similar release years to liked movies.
3. Runtime similarity: movies with similar runtime to liked movies.
The final score is a weighted sum of these signals.
1. Text similarity (TFIDF) between liked/disliked profiles.
2. Year similarity.
3. Runtime similarity.
4. Rating similarity.
5. Popularity (log-scaled vote count).
Returns the top 20 recommendations.
"""
rated = session.get('rated_movies', {})
asked = set(session.get('asked_movies', []))
# Only consider movies that haven't been shown to the user.
available = [m for m in movies if m['id'] not in asked]
if not available:
available = movies # Fallback if all movies have been shown.
liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']
# Build text profiles for liked/disliked movies.
if liked_ids:
liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
else:
@ -129,48 +211,56 @@ def advanced_recommendations():
else:
disliked_profile = np.zeros((1, movie_vectors.shape[1]))
# Compute numeric averages for liked movies (for year and runtime).
liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
avg_year = np.mean(liked_years) if liked_years else None
avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
avg_rating = np.mean(liked_ratings) if liked_ratings else None
recommendations = []
# Weights for each component -- adjust these to tune the algorithm.
w_text = 0.70
w_year = 0.15
w_runtime = 0.15
w_text = 0.5
w_year = 0.1
w_runtime = 0.1
w_rating = 0.15
w_popularity = 0.15
for i, movie in enumerate(movies):
movie_id = str(movie['id'])
if rated.get(movie_id, "not seen") != "not seen":
continue # Skip movies already rated.
# TEXT SIMILARITY: difference between similarity to liked and disliked profiles.
for movie in available:
i = movie['id']
movie_vector = movie_vectors[i].toarray()
like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
text_score = like_sim - dislike_sim
# NUMERIC SIMILARITY for Year.
year_score = 0
if avg_year is not None and movie['year_num'] > 0:
diff_year = abs(movie['year_num'] - avg_year)
year_score = 1 - (diff_year / year_range) # normalized similarity (1 means identical)
year_score = 1 - (diff_year / year_range)
# NUMERIC SIMILARITY for Runtime.
runtime_score = 0
if avg_runtime is not None and movie['runtime_num'] > 0:
diff_runtime = abs(movie['runtime_num'] - avg_runtime)
runtime_score = 1 - (diff_runtime / runtime_range)
# Final combined score.
final_score = w_text * text_score + w_year * year_score + w_runtime * runtime_score
rating_score = 0
movie_rating = movie.get('imdb_rating', 0)
if avg_rating is not None and movie_rating:
diff_rating = abs(movie_rating - avg_rating)
rating_score = 1 - (diff_rating / rating_range)
popularity_score = 0
if movie['vote_count'] > 0:
popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
final_score = (w_text * text_score +
w_year * year_score +
w_runtime * runtime_score +
w_rating * rating_score +
w_popularity * popularity_score)
recommendations.append((movie, final_score))
# Sort recommendations by final score in descending order.
recommendations.sort(key=lambda x: x[1], reverse=True)
return recommendations
return recommendations[:20]
@app.route('/recommend')
def recommend():
@ -179,3 +269,5 @@ def recommend():
if __name__ == '__main__':
app.run(debug=True)

103
test.py
View File

@ -2,6 +2,7 @@ import requests
import json
import time
from tqdm import tqdm # progress bar library
import concurrent.futures
# Replace with your actual TMDb API key
api_key = "96f3424d6fe55c2982e6e094416607f5"
@ -60,11 +61,55 @@ def get_movie_keywords(movie_id):
print(f"Exception while fetching keywords for movie {movie_id}: {e}")
return keywords
def process_movie(movie, page, idx, results_per_page):
    """
    Turn one raw TMDb result into the app's movie record.

    Computes the movie's overall ranking from its page position, extracts
    the basic payload fields, fetches additional details (runtime, genres)
    and keywords, then sleeps briefly to throttle API traffic.
    """
    movie_id = movie.get("id")
    release_date = movie.get("release_date", "")
    poster_path = movie.get("poster_path")
    # Get additional details and keywords.
    details = get_movie_details_tmdb(movie_id)
    tags = get_movie_keywords(movie_id)
    movie_data = {
        "ranking": (page - 1) * results_per_page + idx + 1,
        "title": movie.get("title"),
        "year": release_date.split("-")[0] if release_date else None,
        "runtime": details.get("runtime"),
        "content_rating": None,  # Not available via TMDb by default.
        "metascore": None,  # Not applicable.
        "imdb_rating": movie.get("vote_average"),  # Using TMDb's vote average.
        "vote_count": movie.get("vote_count"),
        "description": movie.get("overview"),
        "poster": f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None,
        "url": f"https://www.themoviedb.org/movie/{movie_id}",
        "genres": details.get("genres", []),
        "tags": tags,
    }
    # Brief sleep to help throttle requests
    time.sleep(0.2)
    return movie_data
def get_top_movies():
"""
Uses the TMDb API to retrieve top rated movies, then iterates through all pages.
For each movie, additional details and keywords are fetched.
After processing each page, the current movies list is saved to a JSON file.
Uses the TMDb API to retrieve top-rated movies and processes them concurrently.
After processing each page, the current list of movies is written to a JSON file.
"""
movies = []
base_url = "https://api.themoviedb.org/3/movie/top_rated"
@ -91,47 +136,21 @@ def get_top_movies():
continue
data = response.json()
results = data.get("results", [])
results_per_page = len(results)
# Process each movie concurrently using a thread pool.
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
futures = []
for idx, movie in enumerate(results):
# Ranking is computed by overall order.
ranking = (page - 1) * len(results) + idx + 1
movie_id = movie.get("id")
title = movie.get("title")
release_date = movie.get("release_date", "")
year = release_date.split("-")[0] if release_date else None
vote_average = movie.get("vote_average")
vote_count = movie.get("vote_count")
overview = movie.get("overview")
poster_path = movie.get("poster_path")
poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"
# Get additional details: runtime and genres.
details = get_movie_details_tmdb(movie_id)
runtime = details.get("runtime")
genres = details.get("genres", [])
# Get keywords (tags).
tags = get_movie_keywords(movie_id)
movie_data = {
"ranking": ranking,
"title": title,
"year": year,
"runtime": runtime,
"content_rating": None, # Not available via TMDb by default.
"metascore": None, # Not applicable.
"imdb_rating": vote_average, # Using TMDb's vote average.
"vote_count": vote_count,
"description": overview,
"poster": poster,
"url": tmdb_url,
"genres": genres,
"tags": tags
}
futures.append(executor.submit(process_movie, movie, page, idx, results_per_page))
# Collect results as they complete.
for future in concurrent.futures.as_completed(futures):
try:
movie_data = future.result()
movies.append(movie_data)
# Pause a bit between detail requests to be courteous.
time.sleep(0.2)
# After processing each page, write the current movies list to the JSON file.
except Exception as e:
print(f"Error processing movie: {e}")
# Write movies to JSON file incrementally after each page.
write_movies(movies)
# Pause between pages.
time.sleep(0.5)

File diff suppressed because it is too large Load Diff