274 lines
11 KiB
Python
274 lines
11 KiB
Python
import json
import math
import os
import random
import secrets

import numpy as np
from flask import Flask, request, render_template, redirect, url_for, session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
|
||
|
||
app = Flask(__name__)
# Flask signs the session cookie with this key, so a hard-coded placeholder
# would let anyone forge session contents. Read the key from the
# FLASK_SECRET_KEY environment variable; when it is unset, fall back to a
# random per-process key (sessions then do not survive a restart and are not
# shared between worker processes, so set the variable for real deployments).
app.secret_key = os.environ.get('FLASK_SECRET_KEY') or secrets.token_hex(32)
|
||
|
||
# Read the movie catalogue from top_movies.json (decoded as UTF-8) into `movies`.
with open('top_movies.json', 'r', encoding='utf-8') as catalogue_file:
    movies = json.loads(catalogue_file.read())
|
||
|
||
# Errors that mean "this raw value is not a usable number"; anything else is
# a real bug and should surface instead of being silently swallowed.
_NUMERIC_PARSE_ERRORS = (TypeError, ValueError, OverflowError)

# Multipliers for abbreviated vote counts such as "2.9M" or "850K".
_VOTE_SUFFIXES = {'K': 1e3, 'M': 1e6}


def _parse_vote_count(raw):
    """Return `raw` as an integer vote count, or 0 when it cannot be parsed.

    Accepts numbers as well as strings with thousands separators ("1,234")
    or a K/M suffix in either case ("850K", "2.9m").
    """
    try:
        if isinstance(raw, str):
            text = raw.replace(',', '').strip().upper()
            if text[-1:] in _VOTE_SUFFIXES:
                return int(float(text[:-1]) * _VOTE_SUFFIXES[text[-1]])
            return int(text)
        return int(raw)
    except _NUMERIC_PARSE_ERRORS:
        return 0


# Preprocess each movie: assign an ID and derive the fields used for scoring.
for i, movie in enumerate(movies):
    movie['id'] = i  # Unique ID; doubles as the row index into the TF-IDF matrix.

    # Combine genres and tags into one feature string. `or []` also covers
    # keys that are present but null in the JSON.
    genres = movie.get('genres') or []
    tags = movie.get('tags') or []
    movie['features'] = ' '.join(genres) + ' ' + ' '.join(tags)

    # Ensure numeric values for year and runtime; 0 means "unknown".
    try:
        movie['year_num'] = int(movie.get('year', '0'))
    except _NUMERIC_PARSE_ERRORS:
        movie['year_num'] = 0
    try:
        movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
    except _NUMERIC_PARSE_ERRORS:
        movie['runtime_num'] = 0

    # Ensure vote_count is numeric.
    movie['vote_count'] = _parse_vote_count(movie.get('vote_count', 0))
|
||
|
||
# Fit a TF-IDF model (English stop words removed) over every movie's feature
# string; row i of `movie_vectors` belongs to movies[i].
vectorizer = TfidfVectorizer(stop_words='english')
movie_features = [entry['features'] for entry in movies]
movie_vectors = vectorizer.fit_transform(movie_features)

# Collect the known (positive) years/runtimes and the largest vote count.
years = [entry['year_num'] for entry in movies if entry['year_num'] > 0]
runtimes = [entry['runtime_num'] for entry in movies if entry['runtime_num'] > 0]
max_vote = max((entry['vote_count'] for entry in movies), default=1)

# Bounds of each numeric feature, with (0, 1) as the placeholder when no
# movie has a usable value.
if years:
    min_year, max_year = min(years), max(years)
else:
    min_year, max_year = 0, 1

if runtimes:
    min_runtime, max_runtime = min(runtimes), max(runtimes)
else:
    min_runtime, max_runtime = 0, 1

# Spans used as divisors when normalising differences; a zero-width span is
# replaced by 1 so the division is always defined.
year_range = 1 if max_year == min_year else max_year - min_year
runtime_range = 1 if max_runtime == min_runtime else max_runtime - min_runtime
rating_range = 10.0  # Assuming ratings are on a 0-10 scale
|
||
|
||
def get_predicted_movies(num=10):
    """
    Return up to `num` movies that haven't been shown yet, best guesses first.

    Candidates are every movie whose ID is not in the session's
    'asked_movies'. When the session holds no 'like'/'dislike' ratings the
    candidates are shuffled and the first `num` returned. Otherwise each
    candidate gets a weighted score built from:

    1. Text similarity: cosine similarity of its TF-IDF vector to the mean
       vector of liked movies, minus the same for disliked movies.
    2. Closeness of its year, runtime and IMDb rating to the liked averages.
    3. Popularity (log-scaled vote count relative to the catalogue maximum).

    Args:
        num: Maximum number of movies to return.

    Returns:
        A list of movie dicts, highest score first (empty when every movie
        has already been shown).
    """
    # Tunable weights.
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15

    # A set keeps the "already shown" test O(1) per movie.
    asked = set(session.get('asked_movies', []))
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        return []

    rated = session.get('rated_movies', {})
    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']

    # Fallback to random selection if there are no like/dislike ratings.
    if not liked_ids and not disliked_ids:
        random.shuffle(available)
        return available[:num]

    # TF-IDF rows of all candidates, in the same order as `available`.
    candidate_vectors = movie_vectors[[m['id'] for m in available]]

    def similarity_to_profile(rated_ids):
        """Cosine similarity of every candidate to the mean vector of `rated_ids`."""
        if not rated_ids:
            return np.zeros(len(available))
        profile = np.asarray(movie_vectors[rated_ids].mean(axis=0)).reshape(1, -1)
        # An all-zero profile carries no signal; score it as 0 similarity.
        if np.linalg.norm(profile) == 0:
            return np.zeros(len(available))
        return cosine_similarity(candidate_vectors, profile).ravel()

    # TEXT SIMILARITY, computed once for all candidates instead of per movie.
    text_scores = similarity_to_profile(liked_ids) - similarity_to_profile(disliked_ids)

    def mean_or_none(values):
        """Average of `values`, or None when there is nothing to average."""
        return np.mean(values) if values else None

    # Numeric averages over liked movies (unknown values are skipped).
    liked_movies = [movies[i] for i in liked_ids]
    avg_year = mean_or_none([m['year_num'] for m in liked_movies if m['year_num'] > 0])
    avg_runtime = mean_or_none([m['runtime_num'] for m in liked_movies if m['runtime_num'] > 0])
    avg_rating = mean_or_none([m.get('imdb_rating', 0) for m in liked_movies if m.get('imdb_rating', 0)])

    # Loop-invariant denominator of the popularity score; only defined when
    # at least one movie has a positive vote count.
    log_max_vote = math.log(max_vote + 1) if max_vote > 0 else None

    predictions = []
    for movie, text_score in zip(available, text_scores):
        # YEAR SIMILARITY.
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            year_score = 1 - abs(movie['year_num'] - avg_year) / year_range

        # RUNTIME SIMILARITY.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            runtime_score = 1 - abs(movie['runtime_num'] - avg_runtime) / runtime_range

        # RATING SIMILARITY.
        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            rating_score = 1 - abs(movie_rating - avg_rating) / rating_range

        # POPULARITY SCORE.
        popularity_score = 0
        if movie['vote_count'] > 0 and log_max_vote:
            popularity_score = math.log(movie['vote_count'] + 1) / log_max_vote

        # Final prediction score.
        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        predictions.append((movie, final_score))

    # Stable sort: equally scored movies keep their catalogue order.
    predictions.sort(key=lambda pair: pair[1], reverse=True)
    return [movie for movie, _ in predictions[:num]]
|
||
|
||
def enough_info():
    """
    Tell whether the session holds at least 3 'like'/'dislike' ratings.
    """
    decisive = ('like', 'dislike')
    all_ratings = session.get('rated_movies', {}).values()
    decisive_votes = [vote for vote in all_ratings if vote in decisive]
    return len(decisive_votes) >= 3
|
||
|
||
@app.route('/')
def home():
    """Make sure the session has its rating containers, then go to the questionnaire."""
    session_defaults = (
        ('rated_movies', {}),  # {movie_id: rating}
        ('asked_movies', []),  # list of movie IDs already shown
    )
    for key, empty_value in session_defaults:
        session.setdefault(key, empty_value)
    return redirect(url_for('questionnaire'))
|
||
|
||
@app.route('/questionnaire', methods=['GET', 'POST'])
def questionnaire():
    """
    Show a batch of movies to rate (GET) or store the submitted ratings (POST).

    POST: every submitted "movie_id" that is a valid catalogue ID is recorded
    in the session together with its "rating_<movie_id>" form value and
    marked as already shown. The user is then sent to the recommendations
    once `enough_info()` is true or no unseen movie remains; otherwise back
    here for another batch.

    GET: renders 'questionnaire.html' with up to 10 predicted movies, or
    redirects to the recommendations when nothing is left to show.
    """
    if request.method == 'POST':
        # setdefault: the user may POST here without having visited '/' first.
        rated = session.setdefault('rated_movies', {})
        asked = session.setdefault('asked_movies', [])

        for raw_id in request.form.getlist("movie_id"):
            # Form data is untrusted: skip IDs that are not integers or that
            # do not index into the catalogue, since later code uses them to
            # index `movies` and the TF-IDF matrix.
            try:
                movie_index = int(raw_id)
            except ValueError:
                continue
            if not 0 <= movie_index < len(movies):
                continue

            rated[str(movie_index)] = request.form.get(f"rating_{raw_id}")
            if movie_index not in asked:
                asked.append(movie_index)

        # Flask does not detect in-place changes to mutable values stored in
        # the session, so flag it explicitly or the ratings are never saved.
        session.modified = True

        asked_lookup = set(asked)
        has_remaining = any(m['id'] not in asked_lookup for m in movies)
        if enough_info() or not has_remaining:
            return redirect(url_for('recommend'))
        return redirect(url_for('questionnaire'))

    # Use prediction to select movies for the questionnaire.
    selected_movies = get_predicted_movies(num=10)
    if not selected_movies:
        return redirect(url_for('recommend'))
    return render_template('questionnaire.html', movies=selected_movies)
|
||
|
||
def advanced_recommendations():
    """
    Compute an advanced hybrid recommendation score on unseen movies.

    Only movies not already shown (asked) are considered; if every movie has
    been shown, the whole catalogue is scored instead.
    Combines:
    1. Text similarity (TF-IDF) between liked/disliked profiles.
    2. Year similarity.
    3. Runtime similarity.
    4. Rating similarity.
    5. Popularity (log-scaled vote count).

    Returns:
        The top 20 recommendations as a list of (movie, score) tuples,
        highest score first.
    """
    # Tunable weights.
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15

    rated = session.get('rated_movies', {})
    asked = set(session.get('asked_movies', []))

    # Only consider movies that haven't been shown to the user.
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        available = movies  # Fallback if all movies have been shown.

    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']

    # TF-IDF rows of all candidates, in the same order as `available`.
    candidate_vectors = movie_vectors[[m['id'] for m in available]]

    def similarity_to_profile(rated_ids):
        """Cosine similarity of every candidate to the mean vector of `rated_ids`."""
        if not rated_ids:
            return np.zeros(len(available))
        profile = np.asarray(movie_vectors[rated_ids].mean(axis=0)).reshape(1, -1)
        # An all-zero profile carries no signal; score it as 0 similarity.
        if np.linalg.norm(profile) == 0:
            return np.zeros(len(available))
        return cosine_similarity(candidate_vectors, profile).ravel()

    # Text similarity, computed once for all candidates instead of per movie.
    text_scores = similarity_to_profile(liked_ids) - similarity_to_profile(disliked_ids)

    def mean_or_none(values):
        """Average of `values`, or None when there is nothing to average."""
        return np.mean(values) if values else None

    # Numeric averages over liked movies (unknown values are skipped).
    liked_movies = [movies[i] for i in liked_ids]
    avg_year = mean_or_none([m['year_num'] for m in liked_movies if m['year_num'] > 0])
    avg_runtime = mean_or_none([m['runtime_num'] for m in liked_movies if m['runtime_num'] > 0])
    avg_rating = mean_or_none([m.get('imdb_rating', 0) for m in liked_movies if m.get('imdb_rating', 0)])

    # Loop-invariant denominator of the popularity score; only defined when
    # at least one movie has a positive vote count.
    log_max_vote = math.log(max_vote + 1) if max_vote > 0 else None

    recommendations = []
    for movie, text_score in zip(available, text_scores):
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            year_score = 1 - abs(movie['year_num'] - avg_year) / year_range

        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            runtime_score = 1 - abs(movie['runtime_num'] - avg_runtime) / runtime_range

        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            rating_score = 1 - abs(movie_rating - avg_rating) / rating_range

        popularity_score = 0
        if movie['vote_count'] > 0 and log_max_vote:
            popularity_score = math.log(movie['vote_count'] + 1) / log_max_vote

        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        recommendations.append((movie, final_score))

    # Stable sort: equally scored movies keep their catalogue order.
    recommendations.sort(key=lambda pair: pair[1], reverse=True)
    return recommendations[:20]
|
||
|
||
@app.route('/recommend')
def recommend():
    """Render 'recommendations.html' with the output of advanced_recommendations()."""
    return render_template(
        'recommendations.html',
        recommendations=advanced_recommendations(),
    )
|
||
|
||
# Start the Flask server only when this file is executed directly.
# NOTE(review): debug=True switches on Flask's debug mode; presumably meant
# for local development only - confirm it is not used for a real deployment.
if __name__ == '__main__':
    app.run(debug=True)
|
||
|
||
|