Starting on Prediction Algorithm

2025-04-02 13:49:04 -05:00 · 2025-04-02 13:49:04 -05:00 · 547b161138
commit 547b161138
7 changed files with 6406 additions and 0 deletions
--- a/app.py
+++ b/app.py
@ -0,0 +1,181 @@
 from flask import Flask, request, render_template, redirect, url_for, session
 import json
 import numpy as np
 import random
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import os
 app = Flask(__name__)
 # The secret key signs the session cookie. Read it from the SECRET_KEY
 # environment variable when set; the placeholder is only a development
 # fallback and must not be relied on in production.
 app.secret_key = os.environ.get('SECRET_KEY') or 'your_secret_key_here'
 # Load the movie catalogue from top_movies.json (UTF-8 encoded).
 with open('top_movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)
 # Give every movie a positional id and derive the fields used for scoring.
 for i, movie in enumerate(movies):
    movie['id'] = i  # Unique id; also the movie's index in `movies`.
    # Combine genres and tags into one feature string. `or []` covers entries
    # where the key is present but stored as null.
    movie['features'] = (
        ' '.join(movie.get('genres') or []) + ' ' + ' '.join(movie.get('tags') or [])
    )
    # Numeric year/runtime; 0 marks "unknown" and is excluded from the ranges later.
    try:
        movie['year_num'] = int(movie.get('year', '0'))
    except (TypeError, ValueError, OverflowError):
        movie['year_num'] = 0
    try:
        # runtime might be a number already or a string; if string, try to convert.
        movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
    except (TypeError, ValueError, OverflowError):
        movie['runtime_num'] = 0
 # Fit a TF-IDF model over every movie's feature string (English stop words removed).
 movie_features = [entry['features'] for entry in movies]
 vectorizer = TfidfVectorizer(stop_words='english')
 movie_vectors = vectorizer.fit_transform(movie_features)
 # Dataset-wide spans of the numeric features; non-positive values mean "unknown".
 years = [y for y in (m['year_num'] for m in movies) if y > 0]
 runtimes = [r for r in (m['runtime_num'] for m in movies) if r > 0]
 if years:
    min_year, max_year = min(years), max(years)
 else:
    min_year, max_year = 0, 1
 if runtimes:
    min_runtime, max_runtime = min(runtimes), max(runtimes)
 else:
    min_runtime, max_runtime = 0, 1
 # A zero-width span falls back to 1 so it can be used as a divisor.
 year_range = (max_year - min_year) or 1
 runtime_range = (max_runtime - min_runtime) or 1
 def get_diverse_movies(num=10):
    """
    Pick up to `num` movies that have not been shown yet, trying to cover different genres.

    One movie is taken for each desired genre (first match in catalogue
    order); any remaining slots are filled with a random sample of the other
    unseen movies.

    Args:
        num: Maximum number of movies to return. Values <= 0 yield [].

    Returns:
        A list of at most `num` movie dicts, empty when nothing is left to show.
    """
    if num <= 0:
        return []
    # Sets give O(1) membership tests instead of scanning lists / comparing dicts.
    asked = set(session.get('asked_movies', []))
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        return []
    # List of desired genres to cover
    desired_genres = ["Action", "Adventure", "Comedy", "Drama", "Horror",
                      "Romance", "Sci-Fi", "Thriller", "Animation", "Documentary"]
    selected = []
    selected_ids = set()
    # Try to pick one movie per desired genre.
    for genre in desired_genres:
        if len(selected) >= num:
            break
        for m in available:
            if m['id'] not in selected_ids and genre in (m.get('genres') or []):
                selected.append(m)
                selected_ids.add(m['id'])
                break
    # If we still need more movies, fill the remainder randomly.
    if len(selected) < num:
        remaining = [m for m in available if m['id'] not in selected_ids]
        random.shuffle(remaining)
        selected.extend(remaining[:num - len(selected)])
    return selected[:num]
 def enough_info(min_ratings=3):
    """
    Report whether the user has expressed enough opinions to recommend from.

    Only 'like' and 'dislike' ratings stored in the session count; any other
    value (such as 'not seen') is ignored.

    Args:
        min_ratings: Number of like/dislike ratings required (default 3).

    Returns:
        True when at least `min_ratings` movies have a like/dislike rating.
    """
    rated = session.get('rated_movies', {})
    decisive = sum(1 for rating in rated.values() if rating in ('like', 'dislike'))
    return decisive >= min_ratings
@app.route('/')
 def home():
    """
    Entry point: make sure the session containers exist, then go to the questionnaire.
    """
    # rated_movies maps movie id -> rating; asked_movies lists ids already shown.
    for key, empty in (('rated_movies', {}), ('asked_movies', [])):
        session.setdefault(key, empty)
    return redirect(url_for('questionnaire'))
@app.route('/questionnaire', methods=['GET', 'POST'])
 def questionnaire():
    """
    Show one round of movies to rate (GET) or store the submitted round (POST).

    GET renders `questionnaire.html` with up to 10 movies from
    `get_diverse_movies`, or redirects to the recommendations page when no
    unseen movies are left.

    POST reads every `movie_id` form field plus its `rating_<id>` companion,
    records them in the session, and redirects to the recommendations page
    once `enough_info()` is satisfied or every movie has been asked;
    otherwise it redirects back here for another round.
    """
    if request.method == 'POST':
        # Tolerate a POST from a client that never went through home().
        rated = session.setdefault('rated_movies', {})
        asked = session.setdefault('asked_movies', [])
        for raw_id in request.form.getlist("movie_id"):
            # Form data is untrusted: skip ids that are not valid movie indices.
            try:
                movie_id = int(raw_id)
            except ValueError:
                continue
            if not 0 <= movie_id < len(movies):
                continue
            rating = request.form.get(f"rating_{raw_id}")
            if rating not in ('like', 'dislike', 'not seen'):
                rating = 'not seen'
            # String keys match the str(movie['id']) lookup used when scoring.
            rated[str(movie_id)] = rating
            if movie_id not in asked:
                asked.append(movie_id)
        # In-place changes to the nested dict/list are not detected by the
        # session object, so flag it explicitly or the ratings are not saved.
        session.modified = True
        all_asked = set(asked).issuperset(m['id'] for m in movies)
        if enough_info() or all_asked:
            return redirect(url_for('recommend'))
        return redirect(url_for('questionnaire'))
    selected_movies = get_diverse_movies(num=10)
    if not selected_movies:
        return redirect(url_for('recommend'))
    return render_template('questionnaire.html', movies=selected_movies)
 def _ids_with_rating(rated, wanted):
    """Return the valid movie indices whose stored rating equals `wanted`."""
    ids = []
    for mid, rating in rated.items():
        if rating != wanted:
            continue
        try:
            idx = int(mid)
        except (TypeError, ValueError):
            continue  # Ignore malformed keys instead of failing the request.
        if 0 <= idx < len(movies):
            ids.append(idx)
    return ids
 def _profile_similarity(ids):
    """Cosine similarity of every movie to the mean TF-IDF vector of `ids` (all zeros if unusable)."""
    n_movies = movie_vectors.shape[0]
    if not ids:
        return np.zeros(n_movies)
    profile = np.asarray(movie_vectors[ids].mean(axis=0))
    if np.linalg.norm(profile) == 0:
        return np.zeros(n_movies)
    # One call scores the whole catalogue against the (1, n_features) profile.
    return np.asarray(cosine_similarity(movie_vectors, profile)).ravel()
 def _closeness(value, target, span):
    """Return 1 - |value - target| / span, or 0 when either side is unknown."""
    if target is None or value <= 0:
        return 0
    return 1 - abs(value - target) / span
 def advanced_recommendations(w_text=0.70, w_year=0.15, w_runtime=0.15):
    """
    Build an advanced recommendation score for movies not rated by the user.
    Combines:
      1. Text similarity (from TF-IDF features on genres/tags): similarity to
         the liked profile minus similarity to the disliked profile.
      2. Year similarity: movies with similar release years to liked movies.
      3. Runtime similarity: movies with similar runtime to liked movies.
    The final score is a weighted sum of these signals.

    Args:
        w_text: Weight of the text-similarity component.
        w_year: Weight of the release-year component.
        w_runtime: Weight of the runtime component.

    Returns:
        A list of (movie, score) tuples sorted by score, highest first.
        Movies whose stored rating is anything other than "not seen" are left out.
    """
    rated = session.get('rated_movies', {})
    liked_ids = _ids_with_rating(rated, 'like')
    disliked_ids = _ids_with_rating(rated, 'dislike')
    # Text similarity of every movie to the liked / disliked profiles.
    like_sims = _profile_similarity(liked_ids)
    dislike_sims = _profile_similarity(disliked_ids)
    # Compute numeric averages for liked movies (for year and runtime).
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    recommendations = []
    for i, movie in enumerate(movies):
        if rated.get(str(movie['id']), "not seen") != "not seen":
            continue  # Skip movies already rated.
        text_score = like_sims[i] - dislike_sims[i]
        year_score = _closeness(movie['year_num'], avg_year, year_range)
        runtime_score = _closeness(movie['runtime_num'], avg_runtime, runtime_range)
        # Final combined score.
        final_score = w_text * text_score + w_year * year_score + w_runtime * runtime_score
        recommendations.append((movie, final_score))
    # Sort recommendations by final score in descending order.
    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations
@app.route('/recommend')
 def recommend():
    """
    Render the recommendations page from the scores computed for this session.
    """
    return render_template(
        'recommendations.html',
        recommendations=advanced_recommendations(),
    )
 # Start the app's built-in server only when this file is run as a script.
 # NOTE(review): debug=True looks like a development-only setting — confirm it
 # is turned off for any real deployment.
 if __name__ == '__main__':
    app.run(debug=True)
--- a/out.html
+++ b/out.html
--- a/templates/index.html
+++ b/templates/index.html
@ -0,0 +1,77 @@
 <!DOCTYPE html>
 <html>
 <head>
    <meta charset="UTF-8">
    <title>Movie Slideshow</title>
    <style>
        /* Basic styling for slideshow */
        #movie-container {
            text-align: center;
            margin-top: 30px;
        }
        #movie-poster {
            width: 200px;
            margin: 20px;
        }
        .rating-buttons button {
            margin: 10px;
            padding: 10px 20px;
            font-size: 16px;
        }
    </style>
 </head>
 <body>
    <h1 style="text-align: center;">Rate Movies</h1>
    <!-- Rating form: one hidden input per movie, filled in by recordRating() and
         submitted by showMovie() after the last movie. -->
    <!-- NOTE(review): this form POSTs to /recommend, but app.py registers that route
         without a methods list — confirm the route accepts POST. -->
    <!-- NOTE(review): inputs are named by movie title, so two movies sharing a title
         would share a field name — verify against the data set. -->
    <form id="ratingForm" method="POST" action="/recommend">
        <!-- Hidden inputs for movie ratings; one per movie -->
        {% for movie in movies %}
            <input type="hidden" name="{{ movie.title }}" id="rating-{{ loop.index0 }}" value="not seen">
        {% endfor %}
        <div id="movie-container">
            <img id="movie-poster" src="" alt="Movie Poster">
            <h2 id="movie-title"></h2>
            <p id="movie-description"></p>
        </div>
        <div class="rating-buttons" style="text-align: center;">
            <button type="button" onclick="recordRating('like')">Like</button>
            <button type="button" onclick="recordRating('dislike')">Dislike</button>
            <button type="button" onclick="recordRating('not seen')">Not Seen</button>
        </div>
    </form>
    <script>
        const movies = {{ movies | tojson }};
        let currentIndex = 0;
        const posterEl = document.getElementById("movie-poster");
        const titleEl = document.getElementById("movie-title");
        const descriptionEl = document.getElementById("movie-description");
        // Display the movie at `index`; once past the last movie, submit the form.
        function showMovie(index) {
            if (index >= movies.length) {
                // All movies rated; submit the form
                document.getElementById("ratingForm").submit();
                return;
            }
            const movie = movies[index];
            // The scraper stores null for a missing poster or year; assigning
            // null to src/text would render the literal string "null".
            if (movie.poster) {
                posterEl.src = movie.poster;
            } else {
                posterEl.removeAttribute("src");
            }
            posterEl.alt = movie.title;
            titleEl.textContent = movie.year
                ? movie.title + " (" + movie.year + ")"
                : movie.title;
            descriptionEl.textContent = movie.description || "";
        }
        // Record the rating for the current movie and show the next one
        function recordRating(rating) {
            // Ignore clicks that arrive after the last movie (the form is
            // already being submitted and no hidden input exists for this index).
            if (currentIndex >= movies.length) {
                return;
            }
            // Update the hidden input for the current movie with the chosen rating
            document.getElementById("rating-" + currentIndex).value = rating;
            currentIndex++;
            showMovie(currentIndex);
        }
        // Initialize the slideshow with the first movie
        showMovie(currentIndex);
    </script>
 </body>
 </html>
--- a/templates/questionnaire.html
+++ b/templates/questionnaire.html
@ -0,0 +1,83 @@
 <!DOCTYPE html>
 <html>
 <head>
    <meta charset="UTF-8">
    <title>Movie Questionnaire</title>
    <style>
        /* Styling for the slideshow */
        #movie-container {
            text-align: center;
            margin-top: 30px;
        }
        #movie-poster {
            width: 200px;
            margin: 20px;
        }
        .rating-buttons button {
            margin: 10px;
            padding: 10px 20px;
            font-size: 16px;
        }
    </style>
 </head>
 <body>
    <h1 style="text-align: center;">Rate Movies</h1>
    <form id="questionForm" method="POST">
        <!-- Container where hidden inputs will be added for the round -->
        <div id="hiddenFields"></div>
        <div id="movie-container">
            <img id="movie-poster" src="" alt="Movie Poster">
            <h2 id="movie-title"></h2>
            <p id="movie-description"></p>
        </div>
        <div class="rating-buttons" style="text-align: center;">
            <button type="button" onclick="recordRating('like')">Like</button>
            <button type="button" onclick="recordRating('dislike')">Dislike</button>
            <button type="button" onclick="recordRating('not seen')">Not Seen</button>
        </div>
    </form>
    <script>
        // Movies for the current round are passed from the server.
        const movies = {{ movies | tojson }};
        let currentIndex = 0;
        let movieRatings = {};  // To store ratings for this batch
        // Show the movie at `index`; once past the last one, post this round's ratings.
        function showMovie(index) {
            if (index >= movies.length) {
                // All movies rated in this round—append hidden fields and submit the form.
                const container = document.getElementById("hiddenFields");
                movies.forEach(movie => {
                    // Hidden input for movie id
                    const movieIdInput = document.createElement("input");
                    movieIdInput.type = "hidden";
                    movieIdInput.name = "movie_id";
                    movieIdInput.value = movie.id;
                    container.appendChild(movieIdInput);
                    // Hidden input for its rating
                    const ratingInput = document.createElement("input");
                    ratingInput.type = "hidden";
                    ratingInput.name = "rating_" + movie.id;
                    ratingInput.value = movieRatings[movie.id] || "not seen";
                    container.appendChild(ratingInput);
                });
                document.getElementById("questionForm").submit();
                return;
            }
            // Use the requested index rather than the global counter.
            const movie = movies[index];
            const posterEl = document.getElementById("movie-poster");
            // The scraper stores null for a missing poster or year; assigning
            // null to src/text would render the literal string "null".
            if (movie.poster) {
                posterEl.src = movie.poster;
            } else {
                posterEl.removeAttribute("src");
            }
            posterEl.alt = movie.title;
            document.getElementById("movie-title").textContent = movie.year
                ? movie.title + " (" + movie.year + ")"
                : movie.title;
            document.getElementById("movie-description").textContent = movie.description || "";
        }
        // Store the rating for the movie on screen, then advance to the next one.
        function recordRating(rating) {
            // Ignore clicks that arrive after the last movie (the form is
            // already being submitted and there is no current movie).
            if (currentIndex >= movies.length) {
                return;
            }
            movieRatings[movies[currentIndex].id] = rating;
            currentIndex++;
            showMovie(currentIndex);
        }
        showMovie(currentIndex);
    </script>
 </body>
 </html>
--- a/templates/recommendations.html
+++ b/templates/recommendations.html
@ -0,0 +1,21 @@
 <!DOCTYPE html>
 <html>
 <head>
    <meta charset="UTF-8">
    <title>Movie Recommendations</title>
 </head>
 <body>
    <h1>Your Movie Recommendations</h1>
    {# One card per (movie, score) pair, in the order supplied by the view. #}
    {% for movie, score in recommendations %}
    <div style="margin-bottom: 20px;">
        {# Poster and year can be missing in the data set; skip them rather than print "None". #}
        {% if movie.poster %}
        <img src="{{ movie.poster }}" alt="{{ movie.title }}" width="70" style="vertical-align: middle;" />
        {% endif %}
        <strong>{{ movie.title }}{% if movie.year %} ({{ movie.year }}){% endif %}</strong>
        <p>{{ movie.description }}</p>
        {# rel="noopener noreferrer" keeps the new tab from reaching back into this page. #}
        <a href="{{ movie.url }}" target="_blank" rel="noopener noreferrer">More Info</a>
        <p>Recommendation Score: {{ score | round(3) }}</p>
    </div>
    <hr>
    {% else %}
    <p>No recommendations available yet.</p>
    {% endfor %}
    <a href="/">Back to Questionnaire</a>
 </body>
 </html>
--- a/test.py
+++ b/test.py
@ -0,0 +1,142 @@
 import requests
 import json
 import time
 from tqdm import tqdm  # progress bar library
 import os
 # TMDb API key: taken from the TMDB_API_KEY environment variable when set.
 # NOTE(review): the fallback literal is a credential committed to source
 # control — it should be rotated and removed once the environment variable
 # is in use.
 api_key = os.environ.get("TMDB_API_KEY") or "96f3424d6fe55c2982e6e094416607f5"
 # Output file where results are saved incrementally
 output_filename = "top_movies.json"
 def write_movies(movies, filename=output_filename):
    """
    Write the movies list to a JSON file (UTF-8, 4-space indent, non-ASCII kept as-is).

    The list is serialized before the file is opened, so a serialization
    error leaves the previously saved file untouched instead of truncated.
    Any failure is reported on stdout rather than raised, so a long scrape
    is not aborted by a save error.

    Args:
        movies: JSON-serializable list of movie records.
        filename: Destination path; defaults to the module's output file.
    """
    try:
        payload = json.dumps(movies, indent=4, ensure_ascii=False)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(payload)
    except Exception as e:
        print(f"Error saving data to JSON file: {e}")
 def get_movie_details_tmdb(movie_id, timeout=10):
    """
    Fetch additional details for a movie using the TMDb API.

    Args:
        movie_id: TMDb id of the movie.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the scrape.

    Returns:
        A dict with "runtime" (minutes) and "genres" (list of genre names)
        on success; an empty dict when the request fails, in which case the
        failure is printed instead of raised.
    """
    details = {}
    details_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "language": "en-US"
    }
    try:
        response = requests.get(details_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            details["runtime"] = data.get("runtime")  # runtime in minutes
            details["genres"] = [g["name"] for g in data.get("genres", [])]
        else:
            print(f"Failed to get details for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        # Deliberately broad: one bad movie must not stop the whole run.
        print(f"Exception while fetching details for movie {movie_id}: {e}")
    return details
 def get_movie_keywords(movie_id, timeout=10):
    """
    Fetch movie keywords (tags) using the TMDb API.

    Args:
        movie_id: TMDb id of the movie.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the scrape.

    Returns:
        A list of keyword names; empty when the request fails, in which case
        the failure is printed instead of raised.
    """
    keywords = []
    keywords_url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords"
    params = {
        "api_key": api_key
    }
    try:
        response = requests.get(keywords_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            keywords = [kw["name"] for kw in data.get("keywords", [])]
        else:
            print(f"Failed to get keywords for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        # Deliberately broad: one bad movie must not stop the whole run.
        print(f"Exception while fetching keywords for movie {movie_id}: {e}")
    return keywords
 def _fetch_top_rated_page(base_url, params, timeout):
    """Return the decoded JSON of one results page, or None on any failure."""
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Exception while fetching page {params.get('page')}: {e}")
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        print(f"Invalid JSON for page {params.get('page')}: {e}")
        return None
 def _build_movie_record(movie, ranking):
    """Merge one top-rated list entry with its details and keywords into an output record."""
    movie_id = movie.get("id")
    release_date = movie.get("release_date", "")
    poster_path = movie.get("poster_path")
    # Get additional details: runtime and genres.
    details = get_movie_details_tmdb(movie_id)
    return {
        "ranking": ranking,
        "title": movie.get("title"),
        "year": release_date.split("-")[0] if release_date else None,
        "runtime": details.get("runtime"),
        "content_rating": None,   # Not available via TMDb by default.
        "metascore": None,        # Not applicable.
        "imdb_rating": movie.get("vote_average"),  # Using TMDb's vote average.
        "vote_count": movie.get("vote_count"),
        "description": movie.get("overview"),
        "poster": f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None,
        "url": f"https://www.themoviedb.org/movie/{movie_id}",
        "genres": details.get("genres", []),
        # Get keywords (tags).
        "tags": get_movie_keywords(movie_id),
    }
 def get_top_movies():
    """
    Uses the TMDb API to retrieve top rated movies, then iterates through all pages.
    For each movie, additional details and keywords are fetched.
    After processing each page, the current movies list is saved to a JSON file.

    Returns:
        The list of movie records collected; [] when the first page cannot be
        retrieved. Pages that fail later are reported and skipped.
    """
    movies = []
    base_url = "https://api.themoviedb.org/3/movie/top_rated"
    params = {
        "api_key": api_key,
        "language": "en-US",
        "page": 1
    }
    request_timeout = 10  # seconds; keeps a stalled connection from hanging the run
    # Initial request to determine total pages; its results are reused for page 1.
    first_page = _fetch_top_rated_page(base_url, params, request_timeout)
    if first_page is None:
        print("Failed to retrieve top rated movies")
        return []
    total_pages = first_page.get("total_pages", 1)
    # Rankings are based on the first page's size so that a shorter later page
    # does not produce ranks that collide with earlier ones.
    page_size = len(first_page.get("results", []))
    # Loop through all pages.
    for page in tqdm(range(1, total_pages + 1), desc="Scraping top rated movies"):
        if page == 1:
            data = first_page
        else:
            params["page"] = page
            data = _fetch_top_rated_page(base_url, params, request_timeout)
            if data is None:
                print(f"Failed to retrieve page {page}")
                continue
        results = data.get("results", [])
        for idx, movie in enumerate(results):
            # Ranking is computed by overall order.
            ranking = (page - 1) * (page_size or len(results)) + idx + 1
            movies.append(_build_movie_record(movie, ranking))
            # Pause a bit between detail requests to be courteous.
            time.sleep(0.2)
        # After processing each page, write the current movies list to the JSON file.
        write_movies(movies)
        # Pause between pages.
        time.sleep(0.5)
    return movies
 # Run the scrape when executed as a script and report the outcome.
 if __name__ == "__main__":
    top_movies = get_top_movies()
    if top_movies:
        print(f"\nData saved to {output_filename}")
    else:
        # Avoid claiming success when the scrape produced no records.
        print("\nNo movies were retrieved.")
--- a/top_movies.json
+++ b/top_movies.json