import json
import math
import os
import random

import numpy as np
from flask import Flask, request, render_template, redirect, url_for, session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

app = Flask(__name__)
# Take the signing key from the environment when it is provided. The literal
# fallback is a development placeholder only: replace it with a secure key in
# production, because the session cookie is signed with this value.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key_here')

# Multipliers for abbreviated vote counts such as "935K" or "2.9M".
_VOTE_SUFFIXES = {'K': 1e3, 'M': 1e6}


def _to_int(value, default=0):
    """Return ``int(value)``, or ``default`` when the value cannot be converted."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


def _to_float(value, default=0):
    """Return ``float(value)``, or ``default`` when the value is falsy or invalid."""
    if not value:
        return default
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _parse_vote_count(raw):
    """Normalise a vote count to an ``int``.

    Accepts numbers as well as strings with thousands separators
    ("1,234,567") and a trailing ``K``/``M`` suffix ("935K", "2.9M").
    Anything that cannot be parsed yields 0.
    """
    try:
        if isinstance(raw, str):
            text = raw.replace(',', '').strip()
            multiplier = _VOTE_SUFFIXES.get(text[-1:].upper())
            if multiplier is not None:
                return int(float(text[:-1]) * multiplier)
            return int(text)
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        return 0


# Load movies from top_movies.json with UTF-8 encoding
with open('top_movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# Preprocess each movie
for i, movie in enumerate(movies):
    # Unique ID; it equals the movie's index in ``movies`` (and therefore its
    # row in ``movie_vectors``), which the rest of the file relies on.
    movie['id'] = i
    # Combine genres and tags into one feature string. ``or []`` also covers
    # an explicit null in the JSON, which ``.get(key, [])`` would pass through.
    genres = movie.get('genres') or []
    tags = movie.get('tags') or []
    movie['features'] = ' '.join(genres) + ' ' + ' '.join(tags)
    # Ensure numeric values for year and runtime (0 means "unknown").
    movie['year_num'] = _to_int(movie.get('year', '0'))
    movie['runtime_num'] = _to_float(movie.get('runtime'))
    # Ensure vote_count is numeric.
    movie['vote_count'] = _parse_vote_count(movie.get('vote_count', 0))

# Build the TF-IDF vectorizer on movie features.
vectorizer = TfidfVectorizer(stop_words='english')
movie_features = [movie['features'] for movie in movies]
movie_vectors = vectorizer.fit_transform(movie_features)

# Precompute overall ranges for numeric features.
years = [m['year_num'] for m in movies if m['year_num'] > 0]
runtimes = [m['runtime_num'] for m in movies if m['runtime_num'] > 0]
max_vote = max((m['vote_count'] for m in movies), default=1)
min_year, max_year = (min(years), max(years)) if years else (0, 1)
min_runtime, max_runtime = (min(runtimes), max(runtimes)) if runtimes else (0, 1)
# A zero span would divide by zero when differences are normalised below.
year_range = (max_year - min_year) or 1
runtime_range = (max_runtime - min_runtime) or 1
rating_range = 10.0  # Assuming ratings are on a 0-10 scale

# Tunable weights of the hybrid score components.
_W_TEXT = 0.5
_W_YEAR = 0.1
_W_RUNTIME = 0.1
_W_RATING = 0.15
_W_POPULARITY = 0.15


def _imdb_rating(movie):
    """Return the movie's ``imdb_rating`` as a float, or 0.0 if absent/invalid.

    The value is coerced here because it is used in arithmetic below; a
    non-numeric value (e.g. a string from the JSON file) would otherwise
    raise as soon as a liked movie carries one.
    """
    try:
        return float(movie.get('imdb_rating') or 0)
    except (TypeError, ValueError):
        return 0.0


def _rated_ids(rated, label):
    """Return the valid movie indices in ``rated`` whose rating equals ``label``.

    Keys that are not integers, or that fall outside ``movies``, are skipped
    so that a malformed session entry cannot raise during scoring.
    """
    ids = []
    for raw_id, rating in rated.items():
        if rating != label:
            continue
        try:
            idx = int(raw_id)
        except (TypeError, ValueError):
            continue
        if 0 <= idx < len(movies):
            ids.append(idx)
    return ids


def _mean_profile(ids):
    """Return the mean TF-IDF vector of ``ids`` as a (1, n_features) array."""
    if ids:
        return np.asarray(movie_vectors[ids].mean(axis=0))
    return np.zeros((1, movie_vectors.shape[1]))


def _profile_similarities(candidate_vectors, profile):
    """Return the cosine similarity of every candidate row to ``profile``."""
    if np.linalg.norm(profile) == 0:
        # An all-zero profile carries no preference signal.
        return np.zeros(candidate_vectors.shape[0])
    return cosine_similarity(candidate_vectors, profile).ravel()


def _closeness(value, target, span):
    """Return 1 minus the absolute difference, normalised by ``span``."""
    return 1 - (abs(value - target) / span)


def _score_movies(candidates, rated):
    """Score ``candidates`` against the user's ratings.

    The score is a weighted sum of:
      1. Text similarity (TF-IDF): similarity to the liked profile minus
         similarity to the disliked profile.
      2. Year closeness to the average year of liked movies.
      3. Runtime closeness to the average runtime of liked movies.
      4. Rating closeness to the average IMDb rating of liked movies.
      5. Popularity (log-scaled vote count relative to ``max_vote``).

    Returns a list of ``(movie, score)`` tuples sorted by descending score.
    """
    if not candidates:
        return []

    liked_ids = _rated_ids(rated, 'like')
    disliked_ids = _rated_ids(rated, 'dislike')

    # Compute both similarity columns for all candidates in one call each
    # instead of calling cosine_similarity once per movie.
    candidate_vectors = movie_vectors[[m['id'] for m in candidates]]
    like_sims = _profile_similarities(candidate_vectors, _mean_profile(liked_ids))
    dislike_sims = _profile_similarities(candidate_vectors, _mean_profile(disliked_ids))

    # Numeric averages over liked movies; non-positive values mean "unknown".
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    liked_ratings = [r for r in (_imdb_rating(movies[i]) for i in liked_ids) if r]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    avg_rating = np.mean(liked_ratings) if liked_ratings else None

    # Loop-invariant denominator of the popularity score.
    log_max_vote = math.log(max_vote + 1) if max_vote > 0 else None

    scored = []
    for movie, like_sim, dislike_sim in zip(candidates, like_sims, dislike_sims):
        text_score = like_sim - dislike_sim

        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            year_score = _closeness(movie['year_num'], avg_year, year_range)

        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            runtime_score = _closeness(movie['runtime_num'], avg_runtime, runtime_range)

        rating_score = 0
        movie_rating = _imdb_rating(movie)
        if avg_rating is not None and movie_rating:
            rating_score = _closeness(movie_rating, avg_rating, rating_range)

        popularity_score = 0
        if log_max_vote and movie['vote_count'] > 0:
            popularity_score = math.log(movie['vote_count'] + 1) / log_max_vote

        final_score = (_W_TEXT * text_score
                       + _W_YEAR * year_score
                       + _W_RUNTIME * runtime_score
                       + _W_RATING * rating_score
                       + _W_POPULARITY * popularity_score)
        scored.append((movie, float(final_score)))

    # list.sort is stable, so ties keep the order of ``candidates``.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored


def get_predicted_movies(num=10):
    """
    Return up to `num` movies that haven't been shown yet.
    Uses the user's past ratings to predict which unseen movies they might like.
    If no like/dislike ratings exist, falls back to random selection.
    """
    asked = set(session.get('asked_movies', []))
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        return []

    rated = session.get('rated_movies', {})
    # Fallback to random selection if there are no like/dislike ratings.
    if not any(r in ('like', 'dislike') for r in rated.values()):
        random.shuffle(available)
        return available[:num]

    return [movie for movie, _score in _score_movies(available, rated)[:num]]


def enough_info():
    """
    Check if the user has rated at least 3 movies (like/dislike).
    """
    rated = session.get('rated_movies', {})
    count = sum(1 for rating in rated.values() if rating in ('like', 'dislike'))
    return count >= 3


@app.route('/')
def home():
    """Initialise the session containers and redirect to the questionnaire."""
    session.setdefault('rated_movies', {})  # {movie_id: rating}
    session.setdefault('asked_movies', [])  # list of movie IDs already shown
    return redirect(url_for('questionnaire'))


@app.route('/questionnaire', methods=['GET', 'POST'])
def questionnaire():
    """Show a batch of movies to rate (GET) or record submitted ratings (POST).

    After a POST the user is sent to the recommendations once enough
    like/dislike ratings exist or no unseen movies remain; otherwise another
    questionnaire page is served.
    """
    if request.method == 'POST':
        # Work on copies and assign them back: Flask only notices top-level
        # changes to the session, so mutating the nested dict/list in place
        # would not be saved. ``.get`` also covers a POST that arrives before
        # the session was initialised by ``home``.
        rated = dict(session.get('rated_movies', {}))
        asked = list(session.get('asked_movies', []))
        asked_set = set(asked)

        for raw_id in request.form.getlist("movie_id"):
            # Form data is untrusted: ignore ids that are not valid indices.
            try:
                idx = int(raw_id)
            except ValueError:
                continue
            if not 0 <= idx < len(movies):
                continue
            rated[str(idx)] = request.form.get(f"rating_{raw_id}")
            if idx not in asked_set:
                asked_set.add(idx)
                asked.append(idx)

        session['rated_movies'] = rated
        session['asked_movies'] = asked

        has_remaining = any(m['id'] not in asked_set for m in movies)
        if enough_info() or not has_remaining:
            return redirect(url_for('recommend'))
        return redirect(url_for('questionnaire'))

    # Use prediction to select movies for the questionnaire.
    selected_movies = get_predicted_movies(num=10)
    if not selected_movies:
        return redirect(url_for('recommend'))
    return render_template('questionnaire.html', movies=selected_movies)


def advanced_recommendations():
    """
    Compute an advanced hybrid recommendation score on unseen movies.
    Only movies not already shown (asked) are considered; if every movie has
    been shown, all movies are scored instead.
    Combines:
      1. Text similarity (TF-IDF) between liked/disliked profiles.
      2. Year similarity.
      3. Runtime similarity.
      4. Rating similarity.
      5. Popularity (log-scaled vote count).
    Returns the top 20 recommendations as (movie, score) tuples.
    """
    rated = session.get('rated_movies', {})
    asked = set(session.get('asked_movies', []))
    # Only consider movies that haven't been shown to the user.
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        available = movies  # Fallback if all movies have been shown.
    return _score_movies(available, rated)[:20]


@app.route('/recommend')
def recommend():
    """Render the recommendations page for the current session."""
    recommendations = advanced_recommendations()
    return render_template('recommendations.html', recommendations=recommendations)


if __name__ == '__main__':
    # debug=True is meant for local development only; do not use it when the
    # app is reachable by untrusted clients.
    app.run(debug=True)