This commit is contained in:
OusmBlueNinja 2025-04-02 14:26:37 -05:00
parent 547b161138
commit 914c085cac
3 changed files with 94981 additions and 2180 deletions

206
app.py
View File

@ -2,6 +2,7 @@ from flask import Flask, request, render_template, redirect, url_for, session
import json import json
import numpy as np import numpy as np
import random import random
import math
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
@ -12,67 +13,141 @@ app.secret_key = 'your_secret_key_here' # Replace with a secure key in producti
# Load the movie catalogue once at startup.
with open('top_movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# Preprocess each movie: assign an ID and normalize numeric fields.
for i, movie in enumerate(movies):
    movie['id'] = i  # Unique ID (also the row index into movie_vectors)
    # Combine genres and tags into one feature string for TF-IDF.
    movie['features'] = ' '.join(movie.get('genres', [])) + ' ' + ' '.join(movie.get('tags', []))
    # Ensure numeric values for year and runtime.
    # NOTE: only conversion failures are expected here, so catch
    # (TypeError, ValueError) — a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit.
    try:
        movie['year_num'] = int(movie.get('year', '0'))
    except (TypeError, ValueError):
        movie['year_num'] = 0
    try:
        movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
    except (TypeError, ValueError):
        movie['runtime_num'] = 0
    # Ensure vote_count is numeric (accepts plain ints, "1,234", or "1.2M").
    try:
        count = movie.get('vote_count', 0)
        if isinstance(count, str):
            count = count.replace(',', '')
            if 'M' in count:
                count = float(count.replace('M', '')) * 1e6
            else:
                count = int(count)
        movie['vote_count'] = int(count)
    except (TypeError, ValueError):
        movie['vote_count'] = 0

# Build the TF-IDF vectorizer on movie features.
vectorizer = TfidfVectorizer(stop_words='english')
movie_features = [movie['features'] for movie in movies]
movie_vectors = vectorizer.fit_transform(movie_features)

# Precompute overall ranges for numeric features (used to normalize the
# per-signal scores; ranges fall back to 1 to avoid division by zero).
years = [m['year_num'] for m in movies if m['year_num'] > 0]
runtimes = [m['runtime_num'] for m in movies if m['runtime_num'] > 0]
max_vote = max(m['vote_count'] for m in movies) if movies else 1
min_year, max_year = (min(years), max(years)) if years else (0, 1)
min_runtime, max_runtime = (min(runtimes), max(runtimes)) if runtimes else (0, 1)
year_range = max_year - min_year if max_year != min_year else 1
runtime_range = max_runtime - min_runtime if max_runtime != min_runtime else 1
rating_range = 10.0  # Assuming ratings are on a 0-10 scale
def get_predicted_movies(num=10):
    """
    Return up to `num` movies that haven't been shown to the user yet.

    Scores each unseen movie with a weighted blend of: TF-IDF text similarity
    to the user's liked/disliked profiles, closeness in year, runtime and
    rating to the average liked movie, and log-scaled popularity. Falls back
    to a random sample when the session holds no like/dislike ratings.
    """
    asked = session.get('asked_movies', [])
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        return []

    rated = session.get('rated_movies', {})
    # Fallback to random selection if there are no like/dislike ratings.
    if not rated or not any(r in ['like', 'dislike'] for r in rated.values()):
        random.shuffle(available)
        return available[:num]

    # Build prediction profiles: mean TF-IDF vector of each group,
    # or a zero vector when the group is empty.
    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']

    if liked_ids:
        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
    else:
        liked_profile = np.zeros((1, movie_vectors.shape[1]))
    if disliked_ids:
        disliked_profile = np.asarray(movie_vectors[disliked_ids].mean(axis=0))
    else:
        disliked_profile = np.zeros((1, movie_vectors.shape[1]))

    # Numeric averages over liked movies (None when no usable data).
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    avg_rating = np.mean(liked_ratings) if liked_ratings else None

    # Tunable weights.
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15

    # Loop-invariant work hoisted out of the scoring loop: the profile norms
    # never change per candidate, and a single vectorized cosine_similarity
    # call over all candidates replaces one call per movie.
    avail_ids = [m['id'] for m in available]
    if np.linalg.norm(liked_profile) != 0:
        like_sims = cosine_similarity(movie_vectors[avail_ids], liked_profile).ravel()
    else:
        like_sims = np.zeros(len(available))
    if np.linalg.norm(disliked_profile) != 0:
        dislike_sims = cosine_similarity(movie_vectors[avail_ids], disliked_profile).ravel()
    else:
        dislike_sims = np.zeros(len(available))

    predictions = []
    for pos, movie in enumerate(available):
        # TEXT SIMILARITY: attraction to liked minus attraction to disliked.
        text_score = like_sims[pos] - dislike_sims[pos]
        # YEAR SIMILARITY (1.0 == identical year, normalized by dataset range).
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            year_score = 1 - (abs(movie['year_num'] - avg_year) / year_range)
        # RUNTIME SIMILARITY.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            runtime_score = 1 - (abs(movie['runtime_num'] - avg_runtime) / runtime_range)
        # RATING SIMILARITY.
        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            rating_score = 1 - (abs(movie_rating - avg_rating) / rating_range)
        # POPULARITY: log-scaled vote count, normalized by the dataset maximum.
        popularity_score = 0
        if movie['vote_count'] > 0:
            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
        # Final prediction score: weighted sum of all signals.
        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        predictions.append((movie, final_score))

    # Highest predicted score first.
    predictions.sort(key=lambda x: x[1], reverse=True)
    return [pred[0] for pred in predictions[:num]]
def enough_info(): def enough_info():
""" """
Determines whether we have collected enough ratings. Check if the user has rated at least 3 movies (like/dislike).
In this example, we require that the user has given a 'like' or 'dislike'
to at least 3 movies.
""" """
rated = session.get('rated_movies', {}) rated = session.get('rated_movies', {})
count = sum(1 for rating in rated.values() if rating in ['like', 'dislike']) count = sum(1 for rating in rated.values() if rating in ['like', 'dislike'])
@ -80,15 +155,13 @@ def enough_info():
@app.route('/')
def home():
    """Initialize the per-user session containers, then hand off to the questionnaire."""
    # setdefault leaves existing values untouched on repeat visits.
    session.setdefault('rated_movies', {})  # maps movie_id -> 'like' / 'dislike' / other
    session.setdefault('asked_movies', [])  # movie IDs already shown to this user
    return redirect(url_for('questionnaire'))
@app.route('/questionnaire', methods=['GET', 'POST']) @app.route('/questionnaire', methods=['GET', 'POST'])
def questionnaire(): def questionnaire():
if request.method == 'POST': if request.method == 'POST':
# Process ratings from the current round.
current_ids = request.form.getlist("movie_id") current_ids = request.form.getlist("movie_id")
for movie_id in current_ids: for movie_id in current_ids:
rating = request.form.get(f"rating_{movie_id}") rating = request.form.get(f"rating_{movie_id}")
@ -101,25 +174,34 @@ def questionnaire():
else: else:
return redirect(url_for('questionnaire')) return redirect(url_for('questionnaire'))
else: else:
selected_movies = get_diverse_movies(num=10) # Use prediction to select movies for the questionnaire.
selected_movies = get_predicted_movies(num=10)
if not selected_movies: if not selected_movies:
return redirect(url_for('recommend')) return redirect(url_for('recommend'))
return render_template('questionnaire.html', movies=selected_movies) return render_template('questionnaire.html', movies=selected_movies)
def advanced_recommendations():
    """
    Compute an advanced hybrid recommendation score on unseen movies.

    Only movies not already shown (asked) are considered. Combines:
      1. Text similarity (TF-IDF) between liked/disliked profiles.
      2. Year similarity.
      3. Runtime similarity.
      4. Rating similarity.
      5. Popularity (log-scaled vote count).
    Returns the top 20 (movie, score) pairs, best first.
    """
    rated = session.get('rated_movies', {})
    asked = set(session.get('asked_movies', []))
    # Only consider movies that haven't been shown to the user.
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        available = movies  # Fallback if all movies have been shown.

    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']

    # Build text profiles for liked/disliked movies (zero vector when empty).
    if liked_ids:
        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
    else:
        liked_profile = np.zeros((1, movie_vectors.shape[1]))
    if disliked_ids:
        disliked_profile = np.asarray(movie_vectors[disliked_ids].mean(axis=0))
    else:
        disliked_profile = np.zeros((1, movie_vectors.shape[1]))

    # Numeric averages over liked movies (None when no usable data).
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    avg_rating = np.mean(liked_ratings) if liked_ratings else None

    # Tunable weights.
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15

    # Loop-invariant work hoisted: profile norms checked once, and one
    # vectorized cosine_similarity call over every candidate instead of
    # one call per movie.
    avail_ids = [m['id'] for m in available]
    if np.linalg.norm(liked_profile) != 0:
        like_sims = cosine_similarity(movie_vectors[avail_ids], liked_profile).ravel()
    else:
        like_sims = np.zeros(len(available))
    if np.linalg.norm(disliked_profile) != 0:
        dislike_sims = cosine_similarity(movie_vectors[avail_ids], disliked_profile).ravel()
    else:
        dislike_sims = np.zeros(len(available))

    recommendations = []
    for pos, movie in enumerate(available):
        # TEXT SIMILARITY: attraction to liked minus attraction to disliked.
        text_score = like_sims[pos] - dislike_sims[pos]
        # YEAR SIMILARITY, normalized by the dataset-wide range.
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            year_score = 1 - (abs(movie['year_num'] - avg_year) / year_range)
        # RUNTIME SIMILARITY.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            runtime_score = 1 - (abs(movie['runtime_num'] - avg_runtime) / runtime_range)
        # RATING SIMILARITY.
        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            rating_score = 1 - (abs(movie_rating - avg_rating) / rating_range)
        # POPULARITY: log-scaled vote count, normalized by the dataset maximum.
        popularity_score = 0
        if movie['vote_count'] > 0:
            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        recommendations.append((movie, final_score))

    # Sort recommendations by final score in descending order.
    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations[:20]
@app.route('/recommend') @app.route('/recommend')
def recommend(): def recommend():
@ -179,3 +269,5 @@ def recommend():
if __name__ == '__main__': if __name__ == '__main__':
app.run(debug=True) app.run(debug=True)

103
test.py
View File

@ -2,6 +2,7 @@ import requests
import json import json
import time import time
from tqdm import tqdm # progress bar library from tqdm import tqdm # progress bar library
import concurrent.futures
# Replace with your actual TMDb API key # Replace with your actual TMDb API key
api_key = "96f3424d6fe55c2982e6e094416607f5" api_key = "96f3424d6fe55c2982e6e094416607f5"
@ -60,11 +61,55 @@ def get_movie_keywords(movie_id):
print(f"Exception while fetching keywords for movie {movie_id}: {e}") print(f"Exception while fetching keywords for movie {movie_id}: {e}")
return keywords return keywords
def process_movie(movie, page, idx, results_per_page):
    """
    Build the full record for a single movie from a top-rated listing page.

    Computes the movie's overall ranking from its page/position, pulls the
    basic fields out of the listing entry, then fetches extra details
    (runtime, genres) and keywords via the TMDb helpers. Sleeps briefly
    before returning to throttle request volume.
    """
    movie_id = movie.get("id")
    release_date = movie.get("release_date", "")
    poster_path = movie.get("poster_path")

    # Additional lookups: details first, then keywords (two extra API calls).
    details = get_movie_details_tmdb(movie_id)

    movie_data = {
        # Overall ranking follows listing order across pages.
        "ranking": (page - 1) * results_per_page + idx + 1,
        "title": movie.get("title"),
        "year": release_date.split("-")[0] if release_date else None,
        "runtime": details.get("runtime"),
        "content_rating": None,  # Not available via TMDb by default.
        "metascore": None,  # Not applicable.
        "imdb_rating": movie.get("vote_average"),  # Using TMDb's vote average.
        "vote_count": movie.get("vote_count"),
        "description": movie.get("overview"),
        "poster": f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None,
        "url": f"https://www.themoviedb.org/movie/{movie_id}",
        "genres": details.get("genres", []),
        "tags": get_movie_keywords(movie_id),
    }

    # Brief sleep to help throttle requests
    time.sleep(0.2)
    return movie_data
def get_top_movies(): def get_top_movies():
""" """
Uses the TMDb API to retrieve top rated movies, then iterates through all pages. Uses the TMDb API to retrieve top-rated movies and processes them concurrently.
For each movie, additional details and keywords are fetched. After processing each page, the current list of movies is written to a JSON file.
After processing each page, the current movies list is saved to a JSON file.
""" """
movies = [] movies = []
base_url = "https://api.themoviedb.org/3/movie/top_rated" base_url = "https://api.themoviedb.org/3/movie/top_rated"
@ -91,47 +136,21 @@ def get_top_movies():
continue continue
data = response.json() data = response.json()
results = data.get("results", []) results = data.get("results", [])
results_per_page = len(results)
# Process each movie concurrently using a thread pool.
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
futures = []
for idx, movie in enumerate(results): for idx, movie in enumerate(results):
# Ranking is computed by overall order. futures.append(executor.submit(process_movie, movie, page, idx, results_per_page))
ranking = (page - 1) * len(results) + idx + 1 # Collect results as they complete.
movie_id = movie.get("id") for future in concurrent.futures.as_completed(futures):
title = movie.get("title") try:
release_date = movie.get("release_date", "") movie_data = future.result()
year = release_date.split("-")[0] if release_date else None
vote_average = movie.get("vote_average")
vote_count = movie.get("vote_count")
overview = movie.get("overview")
poster_path = movie.get("poster_path")
poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"
# Get additional details: runtime and genres.
details = get_movie_details_tmdb(movie_id)
runtime = details.get("runtime")
genres = details.get("genres", [])
# Get keywords (tags).
tags = get_movie_keywords(movie_id)
movie_data = {
"ranking": ranking,
"title": title,
"year": year,
"runtime": runtime,
"content_rating": None, # Not available via TMDb by default.
"metascore": None, # Not applicable.
"imdb_rating": vote_average, # Using TMDb's vote average.
"vote_count": vote_count,
"description": overview,
"poster": poster,
"url": tmdb_url,
"genres": genres,
"tags": tags
}
movies.append(movie_data) movies.append(movie_data)
# Pause a bit between detail requests to be courteous. except Exception as e:
time.sleep(0.2) print(f"Error processing movie: {e}")
# After processing each page, write the current movies list to the JSON file. # Write movies to JSON file incrementally after each page.
write_movies(movies) write_movies(movies)
# Pause between pages. # Pause between pages.
time.sleep(0.5) time.sleep(0.5)

File diff suppressed because it is too large Load Diff