Movie-Me-Now/app.py
OusmBlueNinja 914c085cac bad
2025-04-02 14:26:37 -05:00

274 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import math
import os
import random

import numpy as np
from flask import Flask, request, render_template, redirect, url_for, session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
app = Flask(__name__)
# The secret key signs the session cookie that stores each visitor's ratings.
# Read it from the SECRET_KEY environment variable so a real key never has to
# be committed to the repository; the placeholder fallback keeps local
# development working but must not be relied on in production.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key_here')
# Load movies from top_movies.json with UTF-8 encoding.
with open('top_movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# Multipliers for abbreviated vote counts such as "1.2M" or "850K".
_VOTE_SUFFIXES = (('M', 1e6), ('K', 1e3))

# Preprocess each movie in place: assign an ID, build the text feature string,
# and coerce year / runtime / vote_count to numbers (0 when unparseable).
for i, movie in enumerate(movies):
    # Unique ID; the rest of the file also uses it as the movie's list index.
    movie['id'] = i

    # Combine genres and tags into one feature string. `or []` tolerates an
    # explicit null in the JSON as well as a missing key.
    genres = movie.get('genres') or []
    tags = movie.get('tags') or []
    movie['features'] = ' '.join(genres) + ' ' + ' '.join(tags)

    # Ensure numeric values for year and runtime. Only conversion errors are
    # swallowed; anything else (e.g. KeyboardInterrupt) propagates.
    try:
        movie['year_num'] = int(movie.get('year', '0'))
    except (TypeError, ValueError, OverflowError):
        movie['year_num'] = 0

    try:
        movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
    except (TypeError, ValueError, OverflowError):
        movie['runtime_num'] = 0

    # Ensure vote_count is numeric. Strings may carry thousands separators
    # ("12,345") or a magnitude suffix ("1.2M", "850K").
    try:
        count = movie.get('vote_count', 0)
        if isinstance(count, str):
            count = count.replace(',', '')
            for suffix, multiplier in _VOTE_SUFFIXES:
                if suffix in count:
                    count = float(count.replace(suffix, '')) * multiplier
                    break
            else:
                # No magnitude suffix: the string must be a plain integer.
                count = int(count)
        movie['vote_count'] = int(count)
    except (TypeError, ValueError, OverflowError):
        movie['vote_count'] = 0
# Fit a TF-IDF model over every movie's combined genre/tag feature string.
# Row k of `movie_vectors` corresponds to movies[k].
movie_features = [entry['features'] for entry in movies]
vectorizer = TfidfVectorizer(stop_words='english')
movie_vectors = vectorizer.fit_transform(movie_features)

# Precompute overall ranges for the numeric features. Only positive values
# are collected, so zero entries do not affect the minima.
years = [value for value in (m['year_num'] for m in movies) if value > 0]
runtimes = [value for value in (m['runtime_num'] for m in movies) if value > 0]

if movies:
    max_vote = max(m['vote_count'] for m in movies)
else:
    max_vote = 1

if years:
    min_year, max_year = min(years), max(years)
else:
    min_year, max_year = 0, 1

if runtimes:
    min_runtime, max_runtime = min(runtimes), max(runtimes)
else:
    min_runtime, max_runtime = 0, 1

# A zero-width range would cause a division by zero later, so use 1 instead.
year_range = 1 if max_year == min_year else max_year - min_year
runtime_range = 1 if max_runtime == min_runtime else max_runtime - min_runtime
rating_range = 10.0  # Assuming ratings are on a 0-10 scale
def get_predicted_movies(num=10):
    """
    Return up to `num` movies that haven't been shown yet.

    Uses the user's past ratings (read from the Flask session) to predict
    which unseen movies they might like. If there are no usable like/dislike
    ratings, falls back to a random selection.

    Args:
        num: Maximum number of movies to return.

    Returns:
        A list of movie dicts, highest predicted score first (random order in
        the fallback case); empty when every movie has already been shown.
    """
    # A set keeps the membership test O(1) per movie.
    asked = set(session.get('asked_movies', []))
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        return []

    rated = session.get('rated_movies', {})

    def ids_rated(label):
        """Return the valid movie indices whose stored rating equals `label`."""
        ids = []
        for mid, rating in rated.items():
            if rating != label:
                continue
            try:
                idx = int(mid)
            except (TypeError, ValueError):
                continue  # Ignore malformed session keys instead of crashing.
            if 0 <= idx < len(movies):
                ids.append(idx)
        return ids

    liked_ids = ids_rated('like')
    disliked_ids = ids_rated('dislike')

    # Fallback to random selection if there are no like/dislike ratings.
    if not liked_ids and not disliked_ids:
        random.shuffle(available)
        return available[:num]

    # Build prediction profiles: the mean TF-IDF vector of each rated group,
    # or a zero vector when the group is empty.
    n_terms = movie_vectors.shape[1]
    if liked_ids:
        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
    else:
        liked_profile = np.zeros((1, n_terms))
    if disliked_ids:
        disliked_profile = np.asarray(movie_vectors[disliked_ids].mean(axis=0))
    else:
        disliked_profile = np.zeros((1, n_terms))

    # The profiles do not change per candidate, so test them for emptiness once.
    has_liked_profile = np.linalg.norm(liked_profile) != 0
    has_disliked_profile = np.linalg.norm(disliked_profile) != 0

    # Compute numeric averages for liked movies (None when nothing usable).
    # NOTE(review): assumes imdb_rating is numeric when present - confirm
    # against top_movies.json.
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    avg_rating = np.mean(liked_ratings) if liked_ratings else None

    # Tunable weights.
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15

    predictions = []
    for movie in available:
        # TEXT SIMILARITY: closeness to liked movies minus closeness to disliked ones.
        movie_vector = movie_vectors[movie['id']].toarray()
        like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if has_liked_profile else 0
        dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if has_disliked_profile else 0
        text_score = like_sim - dislike_sim

        # YEAR SIMILARITY: 1 at the liked average, falling linearly with distance.
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            year_score = 1 - (abs(movie['year_num'] - avg_year) / year_range)

        # RUNTIME SIMILARITY.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            runtime_score = 1 - (abs(movie['runtime_num'] - avg_runtime) / runtime_range)

        # RATING SIMILARITY.
        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            rating_score = 1 - (abs(movie_rating - avg_rating) / rating_range)

        # POPULARITY SCORE: log-scaled vote count relative to the most-voted movie.
        popularity_score = 0
        if movie['vote_count'] > 0:
            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)

        # Final prediction score.
        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        predictions.append((movie, final_score))

    predictions.sort(key=lambda x: x[1], reverse=True)
    return [pred[0] for pred in predictions[:num]]
def enough_info():
    """
    Report whether the session holds at least 3 like/dislike ratings.

    Ratings with any other value are not counted.
    """
    decisive = [
        verdict
        for verdict in session.get('rated_movies', {}).values()
        if verdict in ('like', 'dislike')
    ]
    return len(decisive) >= 3
@app.route('/')
def home():
    """Seed the session's rating containers, then redirect to the questionnaire."""
    # rated_movies: {movie_id: rating}; asked_movies: list of movie IDs already shown.
    for key, empty in (('rated_movies', {}), ('asked_movies', [])):
        session.setdefault(key, empty)
    return redirect(url_for('questionnaire'))
@app.route('/questionnaire', methods=['GET', 'POST'])
def questionnaire():
    """
    Show a batch of movies to rate (GET) or record the submitted ratings (POST).

    GET renders questionnaire.html with up to 10 predicted movies, or redirects
    to the recommendations page when nothing is left to show.

    POST reads every submitted "movie_id" and its "rating_<movie_id>" field,
    stores them in the session, then redirects to the recommendations once
    enough ratings exist or no movies remain; otherwise back to the
    questionnaire for another batch.
    """
    if request.method != 'POST':
        # Use prediction to select movies for the questionnaire.
        selected_movies = get_predicted_movies(num=10)
        if not selected_movies:
            return redirect(url_for('recommend'))
        return render_template('questionnaire.html', movies=selected_movies)

    # setdefault keeps a direct POST (without a prior visit to '/') from
    # raising KeyError on a fresh session.
    rated = session.setdefault('rated_movies', {})
    asked = session.setdefault('asked_movies', [])

    for movie_id in request.form.getlist("movie_id"):
        # Form data is untrusted: skip IDs that are not integers or that do
        # not refer to a known movie.
        try:
            idx = int(movie_id)
        except ValueError:
            continue
        if not 0 <= idx < len(movies):
            continue
        # Keys are stored as strings; the scoring code converts them with int().
        rated[str(idx)] = request.form.get(f"rating_{movie_id}")
        if idx not in asked:
            asked.append(idx)

    # Flask does not detect in-place changes to mutable values stored in the
    # session, so flag it explicitly or the ratings are never written back.
    session.modified = True

    asked_ids = set(asked)
    has_remaining = any(m['id'] not in asked_ids for m in movies)
    if enough_info() or not has_remaining:
        return redirect(url_for('recommend'))
    return redirect(url_for('questionnaire'))
def advanced_recommendations():
    """
    Compute an advanced hybrid recommendation score on unseen movies.

    Only movies not already shown (asked) are considered; if every movie has
    been shown, all movies are scored instead.

    Combines:
        1. Text similarity (TF-IDF) between liked/disliked profiles.
        2. Year similarity.
        3. Runtime similarity.
        4. Rating similarity.
        5. Popularity (log-scaled vote count).

    Returns:
        The top 20 recommendations as a list of (movie, score) tuples,
        highest score first.
    """
    rated = session.get('rated_movies', {})
    asked = set(session.get('asked_movies', []))

    # Only consider movies that haven't been shown to the user.
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        available = movies  # Fallback if all movies have been shown.

    def ids_rated(label):
        """Return the valid movie indices whose stored rating equals `label`."""
        ids = []
        for mid, rating in rated.items():
            if rating != label:
                continue
            try:
                idx = int(mid)
            except (TypeError, ValueError):
                continue  # Ignore malformed session keys instead of crashing.
            if 0 <= idx < len(movies):
                ids.append(idx)
        return ids

    liked_ids = ids_rated('like')
    disliked_ids = ids_rated('dislike')

    # Profiles are the mean TF-IDF vector of each rated group, or a zero
    # vector when the group is empty.
    n_terms = movie_vectors.shape[1]
    if liked_ids:
        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
    else:
        liked_profile = np.zeros((1, n_terms))
    if disliked_ids:
        disliked_profile = np.asarray(movie_vectors[disliked_ids].mean(axis=0))
    else:
        disliked_profile = np.zeros((1, n_terms))

    # The profiles do not change per candidate, so test them for emptiness once.
    has_liked_profile = np.linalg.norm(liked_profile) != 0
    has_disliked_profile = np.linalg.norm(disliked_profile) != 0

    # Numeric averages over the liked movies (None when nothing usable).
    # NOTE(review): assumes imdb_rating is numeric when present - confirm
    # against top_movies.json.
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    avg_rating = np.mean(liked_ratings) if liked_ratings else None

    # Tunable weights.
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15

    recommendations = []
    for movie in available:
        # Text: closeness to liked movies minus closeness to disliked ones.
        movie_vector = movie_vectors[movie['id']].toarray()
        like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if has_liked_profile else 0
        dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if has_disliked_profile else 0
        text_score = like_sim - dislike_sim

        # Year: 1 at the liked average, falling linearly with distance.
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            year_score = 1 - (abs(movie['year_num'] - avg_year) / year_range)

        # Runtime.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            runtime_score = 1 - (abs(movie['runtime_num'] - avg_runtime) / runtime_range)

        # Rating.
        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            rating_score = 1 - (abs(movie_rating - avg_rating) / rating_range)

        # Popularity: log-scaled vote count relative to the most-voted movie.
        popularity_score = 0
        if movie['vote_count'] > 0:
            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)

        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        recommendations.append((movie, final_score))

    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations[:20]
@app.route('/recommend')
def recommend():
    """Render recommendations.html with the output of advanced_recommendations()."""
    return render_template(
        'recommendations.html',
        recommendations=advanced_recommendations(),
    )
if __name__ == '__main__':
    # Debug mode turns on the auto-reloader and the interactive in-browser
    # debugger, which must never be exposed publicly. It stays on by default
    # (as before) for local runs, but FLASK_DEBUG=0/false/no switches it off.
    debug_enabled = os.environ.get('FLASK_DEBUG', '1').lower() not in ('0', 'false', 'no')
    app.run(debug=debug_enabled)