"""Movie questionnaire and recommendation web app.

On import, the module loads ``top_movies.json``, derives a text feature string
and numeric year/runtime values for every movie, and fits a TF-IDF model on the
feature strings. The Flask routes then show batches of movies to rate and rank
the remaining movies against the user's likes and dislikes.
"""
import json
import os
import random

import numpy as np
from flask import Flask, request, render_template, redirect, url_for, session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

app = Flask(__name__)
# The session cookie is signed with this key. Set SECRET_KEY in the environment
# for production; the placeholder fallback is only suitable for local use.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key_here')


def _parse_number(value, cast, default=0):
    """Return ``cast(value)``, or ``default`` when the value cannot be converted."""
    try:
        return cast(value)
    except (TypeError, ValueError, OverflowError):
        return default


# Load movies from top_movies.json with UTF-8 encoding.
with open('top_movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# Assign a unique ID and preprocess features for each movie.
for i, movie in enumerate(movies):
    # The id doubles as the movie's row index in `movie_vectors`.
    movie['id'] = i
    # Combine genres and tags into one feature string; a missing or null list
    # is treated as empty.
    movie['features'] = (
        ' '.join(movie.get('genres') or [])
        + ' '
        + ' '.join(movie.get('tags') or [])
    )
    # Numeric year and runtime; 0 marks "unknown" and is ignored downstream.
    movie['year_num'] = _parse_number(movie.get('year', '0'), int)
    raw_runtime = movie.get('runtime')
    # Runtime may already be a number or may be a numeric string.
    movie['runtime_num'] = _parse_number(raw_runtime, float) if raw_runtime else 0

# Build the TF-IDF vectorizer on movie features.
vectorizer = TfidfVectorizer(stop_words='english')
movie_features = [movie['features'] for movie in movies]
movie_vectors = vectorizer.fit_transform(movie_features)

# Precompute overall ranges for numeric features across the dataset.
years = [m['year_num'] for m in movies if m['year_num'] > 0]
runtimes = [m['runtime_num'] for m in movies if m['runtime_num'] > 0]
min_year, max_year = (min(years), max(years)) if years else (0, 1)
min_runtime, max_runtime = (min(runtimes), max(runtimes)) if runtimes else (0, 1)
# A range of 1 avoids division by zero when all known values are identical.
year_range = max_year - min_year if max_year != min_year else 1
runtime_range = max_runtime - min_runtime if max_runtime != min_runtime else 1


def get_diverse_movies(num=10):
    """Pick up to ``num`` movies that have not been shown yet, covering varied genres.

    The first not-yet-asked movie of each desired genre is chosen (in dataset
    order); any remaining slots are filled with a random sample of the other
    not-yet-asked movies.

    Returns:
        A list of at most ``num`` movie dicts; empty when every movie has
        already been asked.
    """
    asked = set(session.get('asked_movies', []))
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        return []

    # List of desired genres to cover.
    desired_genres = ["Action", "Adventure", "Comedy", "Drama", "Horror",
                      "Romance", "Sci-Fi", "Thriller", "Animation", "Documentary"]

    selected = []
    selected_ids = set()  # O(1) "already picked" checks

    # Try to pick one movie per desired genre.
    for genre in desired_genres:
        for m in available:
            if m['id'] not in selected_ids and genre in (m.get('genres') or []):
                selected.append(m)
                selected_ids.add(m['id'])
                break
        if len(selected) >= num:
            break

    # If we still need more movies, fill the remainder randomly.
    if len(selected) < num:
        remaining = [m for m in available if m['id'] not in selected_ids]
        random.shuffle(remaining)
        selected.extend(remaining[:num - len(selected)])

    return selected[:num]


def enough_info():
    """Return True once the user has liked or disliked at least 3 movies."""
    rated = session.get('rated_movies', {})
    count = sum(1 for rating in rated.values() if rating in ('like', 'dislike'))
    return count >= 3


@app.route('/')
def home():
    """Initialize the session's rating state and redirect to the questionnaire."""
    session.setdefault('rated_movies', {})   # {movie_id (str): rating}
    session.setdefault('asked_movies', [])   # list of movie ids already asked
    return redirect(url_for('questionnaire'))


@app.route('/questionnaire', methods=['GET', 'POST'])
def questionnaire():
    """Show a batch of movies to rate (GET) or record the submitted ratings (POST).

    After a POST the user is sent to the recommendations once ``enough_info()``
    holds or no unasked movies remain; otherwise another batch is shown.
    """
    if request.method == 'POST':
        # Work on copies and reassign them below: Flask only notices changes
        # made through the session object itself, not in-place mutation of the
        # containers stored inside it.
        rated = dict(session.get('rated_movies', {}))
        asked = list(session.get('asked_movies', []))
        asked_ids = set(asked)

        # Process ratings from the current round.
        for raw_id in request.form.getlist("movie_id"):
            try:
                movie_id = int(raw_id)
            except (TypeError, ValueError):
                continue  # ignore malformed ids instead of failing the request
            if not 0 <= movie_id < len(movies):
                continue  # ignore ids that do not refer to a loaded movie

            # A missing rating field is treated like an explicit "not seen",
            # the same default advanced_recommendations() uses.
            rating = request.form.get(f"rating_{raw_id}", "not seen")
            rated[str(movie_id)] = rating
            if movie_id not in asked_ids:
                asked_ids.add(movie_id)
                asked.append(movie_id)

        session['rated_movies'] = rated
        session['asked_movies'] = asked

        has_remaining = any(m['id'] not in asked_ids for m in movies)
        if enough_info() or not has_remaining:
            return redirect(url_for('recommend'))
        return redirect(url_for('questionnaire'))

    selected_movies = get_diverse_movies(num=10)
    if not selected_movies:
        return redirect(url_for('recommend'))
    return render_template('questionnaire.html', movies=selected_movies)


def _ids_with_rating(rated, wanted):
    """Return the valid movie indices in ``rated`` whose rating equals ``wanted``."""
    ids = []
    for key, rating in rated.items():
        if rating != wanted:
            continue
        try:
            idx = int(key)
        except (TypeError, ValueError):
            continue
        if 0 <= idx < len(movies):
            ids.append(idx)
    return ids


def _profile_similarities(ids):
    """Return each movie's cosine similarity to the mean TF-IDF vector of ``ids``.

    The result is all zeros when ``ids`` is empty or the mean vector has zero norm.
    """
    if not ids:
        return np.zeros(len(movies))
    profile = np.asarray(movie_vectors[ids].mean(axis=0))
    if np.linalg.norm(profile) == 0:
        return np.zeros(len(movies))
    # One batched call for all movies instead of one call per movie.
    return cosine_similarity(movie_vectors, profile)[:, 0]


def _closeness(value, target, span):
    """Return ``1 - |value - target| / span``, or 0 when either side is unknown."""
    if target is None or value <= 0:
        return 0
    return 1 - abs(value - target) / span


def advanced_recommendations():
    """Score every movie the user has not rated and return them best-first.

    Combines:
      1. Text similarity (TF-IDF on genres/tags): similarity to the liked
         profile minus similarity to the disliked profile.
      2. Year similarity to the average release year of liked movies.
      3. Runtime similarity to the average runtime of liked movies.
    The final score is a weighted sum of these signals.

    Returns:
        A list of ``(movie, score)`` tuples sorted by score, descending.
        Movies whose stored rating is anything other than "not seen" are
        excluded.
    """
    rated = session.get('rated_movies', {})
    liked_ids = _ids_with_rating(rated, 'like')
    disliked_ids = _ids_with_rating(rated, 'dislike')

    # Text similarity of every movie to the liked / disliked profiles.
    like_sims = _profile_similarities(liked_ids)
    dislike_sims = _profile_similarities(disliked_ids)

    # Compute numeric averages for liked movies (for year and runtime).
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None

    # Weights for each component - adjust these to tune the algorithm.
    w_text = 0.70
    w_year = 0.15
    w_runtime = 0.15

    recommendations = []
    for i, movie in enumerate(movies):
        if rated.get(str(movie['id']), "not seen") != "not seen":
            continue  # Skip movies already rated.

        text_score = like_sims[i] - dislike_sims[i]
        year_score = _closeness(movie['year_num'], avg_year, year_range)
        runtime_score = _closeness(movie['runtime_num'], avg_runtime, runtime_range)

        # Final combined score, as a plain float for the template.
        final_score = float(
            w_text * text_score + w_year * year_score + w_runtime * runtime_score
        )
        recommendations.append((movie, final_score))

    # Sort recommendations by final score in descending order.
    recommendations.sort(key=lambda pair: pair[1], reverse=True)
    return recommendations


@app.route('/recommend')
def recommend():
    """Render the ranked recommendations for the current session."""
    recommendations = advanced_recommendations()
    return render_template('recommendations.html', recommendations=recommendations)


if __name__ == '__main__':
    app.run(debug=True)