This commit is contained in:
OusmBlueNinja 2025-04-02 14:26:37 -05:00
parent 547b161138
commit 914c085cac
3 changed files with 94981 additions and 2180 deletions

206
app.py
View File

@ -2,6 +2,7 @@ from flask import Flask, request, render_template, redirect, url_for, session
import json
import numpy as np
import random
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
@ -12,67 +13,141 @@ app.secret_key = 'your_secret_key_here' # Replace with a secure key in producti
# Load the movie catalogue once at startup.
with open('top_movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# Preprocess each movie
for i, movie in enumerate(movies):
    movie['id'] = i  # Unique ID
    # Combine genres and tags into one feature string.
    movie['features'] = ' '.join(movie.get('genres', [])) + ' ' + ' '.join(movie.get('tags', []))
    # Ensure numeric values for year and runtime:
    try:
        movie['year_num'] = int(movie.get('year', '0'))
    except (TypeError, ValueError):
        movie['year_num'] = 0
    try:
        # runtime might be a number already or a string; if string, try to convert.
        movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
    except (TypeError, ValueError):
        movie['runtime_num'] = 0
    # Ensure vote_count is numeric (handles "1,234" and "1.2M" style strings).
    try:
        count = movie.get('vote_count', 0)
        if isinstance(count, str):
            count = count.replace(',', '')
            if 'M' in count:
                count = float(count.replace('M', '')) * 1e6
            else:
                count = int(count)
        movie['vote_count'] = int(count)
    except (TypeError, ValueError):
        movie['vote_count'] = 0

# Build the TF-IDF vectorizer on movie features.
vectorizer = TfidfVectorizer(stop_words='english')
movie_features = [movie['features'] for movie in movies]
movie_vectors = vectorizer.fit_transform(movie_features)

# Precompute overall ranges for numeric features.
years = [m['year_num'] for m in movies if m['year_num'] > 0]
runtimes = [m['runtime_num'] for m in movies if m['runtime_num'] > 0]
max_vote = max([m['vote_count'] for m in movies]) if movies else 1
min_year, max_year = (min(years), max(years)) if years else (0, 1)
min_runtime, max_runtime = (min(runtimes), max(runtimes)) if runtimes else (0, 1)
year_range = max_year - min_year if max_year != min_year else 1
runtime_range = max_runtime - min_runtime if max_runtime != min_runtime else 1
rating_range = 10.0  # Assuming ratings are on a 0-10 scale
def get_predicted_movies(num=10):
    """
    Return up to `num` movies that haven't been shown yet.

    Uses the user's past ratings to predict which unseen movies they might
    like. If no like/dislike ratings exist, falls back to random selection.
    """
    asked = session.get('asked_movies', [])
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        return []
    rated = session.get('rated_movies', {})
    # Fallback to random selection if there are no like/dislike ratings.
    if not rated or not any(r in ['like', 'dislike'] for r in rated.values()):
        random.shuffle(available)
        return available[:num]
    # Build prediction profiles from the user's likes and dislikes.
    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']
    if liked_ids:
        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
    else:
        liked_profile = np.zeros((1, movie_vectors.shape[1]))
    if disliked_ids:
        disliked_profile = np.asarray(movie_vectors[disliked_ids].mean(axis=0))
    else:
        disliked_profile = np.zeros((1, movie_vectors.shape[1]))
    # Compute numeric averages for liked movies.
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    avg_rating = np.mean(liked_ratings) if liked_ratings else None
    predictions = []
    # Tunable weights.
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15
    for movie in available:
        i = movie['id']
        # TEXT SIMILARITY: similarity to liked profile minus similarity to disliked.
        movie_vector = movie_vectors[i].toarray()
        like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
        dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
        text_score = like_sim - dislike_sim
        # YEAR SIMILARITY: 1 when identical to the liked average, 0 at the dataset extreme.
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            diff_year = abs(movie['year_num'] - avg_year)
            year_score = 1 - (diff_year / year_range)
        # RUNTIME SIMILARITY.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            diff_runtime = abs(movie['runtime_num'] - avg_runtime)
            runtime_score = 1 - (diff_runtime / runtime_range)
        # RATING SIMILARITY.
        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            diff_rating = abs(movie_rating - avg_rating)
            rating_score = 1 - (diff_rating / rating_range)
        # POPULARITY SCORE: log-scaled vote count, normalized by the dataset maximum.
        popularity_score = 0
        if movie['vote_count'] > 0:
            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
        # Final prediction score: weighted sum of all signals.
        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        predictions.append((movie, final_score))
    predictions.sort(key=lambda x: x[1], reverse=True)
    return [pred[0] for pred in predictions[:num]]
def enough_info():
"""
Determines whether we have collected enough ratings.
In this example, we require that the user has given a 'like' or 'dislike'
to at least 3 movies.
Check if the user has rated at least 3 movies (like/dislike).
"""
rated = session.get('rated_movies', {})
count = sum(1 for rating in rated.values() if rating in ['like', 'dislike'])
@ -80,15 +155,13 @@ def enough_info():
@app.route('/')
def home():
    """Entry point: seed per-user session state, then start the questionnaire."""
    # Initialize session variables
    session.setdefault('rated_movies', {})  # {movie_id: rating}
    session.setdefault('asked_movies', [])  # list of movie IDs already shown
    return redirect(url_for('questionnaire'))
@app.route('/questionnaire', methods=['GET', 'POST'])
def questionnaire():
if request.method == 'POST':
# Process ratings from the current round.
current_ids = request.form.getlist("movie_id")
for movie_id in current_ids:
rating = request.form.get(f"rating_{movie_id}")
@ -101,25 +174,34 @@ def questionnaire():
else:
return redirect(url_for('questionnaire'))
else:
selected_movies = get_diverse_movies(num=10)
# Use prediction to select movies for the questionnaire.
selected_movies = get_predicted_movies(num=10)
if not selected_movies:
return redirect(url_for('recommend'))
return render_template('questionnaire.html', movies=selected_movies)
def advanced_recommendations():
"""
Build an advanced recommendation score for movies not rated by the user.
Compute an advanced hybrid recommendation score on unseen movies.
Only movies not already shown (asked) are considered.
Combines:
1. Text similarity (from TF-IDF features on genres/tags).
2. Year similarity: movies with similar release years to liked movies.
3. Runtime similarity: movies with similar runtime to liked movies.
The final score is a weighted sum of these signals.
1. Text similarity (TFIDF) between liked/disliked profiles.
2. Year similarity.
3. Runtime similarity.
4. Rating similarity.
5. Popularity (log-scaled vote count).
Returns the top 20 recommendations.
"""
rated = session.get('rated_movies', {})
asked = set(session.get('asked_movies', []))
# Only consider movies that haven't been shown to the user.
available = [m for m in movies if m['id'] not in asked]
if not available:
available = movies # Fallback if all movies have been shown.
liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']
# Build text profiles for liked/disliked movies.
if liked_ids:
liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
else:
@ -129,48 +211,56 @@ def advanced_recommendations():
else:
disliked_profile = np.zeros((1, movie_vectors.shape[1]))
# Compute numeric averages for liked movies (for year and runtime).
liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
avg_year = np.mean(liked_years) if liked_years else None
avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
avg_rating = np.mean(liked_ratings) if liked_ratings else None
recommendations = []
# Weights for each component -- adjust these to tune the algorithm.
w_text = 0.70
w_year = 0.15
w_runtime = 0.15
w_text = 0.5
w_year = 0.1
w_runtime = 0.1
w_rating = 0.15
w_popularity = 0.15
for i, movie in enumerate(movies):
movie_id = str(movie['id'])
if rated.get(movie_id, "not seen") != "not seen":
continue # Skip movies already rated.
# TEXT SIMILARITY: difference between similarity to liked and disliked profiles.
for movie in available:
i = movie['id']
movie_vector = movie_vectors[i].toarray()
like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
text_score = like_sim - dislike_sim
# NUMERIC SIMILARITY for Year.
year_score = 0
if avg_year is not None and movie['year_num'] > 0:
diff_year = abs(movie['year_num'] - avg_year)
year_score = 1 - (diff_year / year_range) # normalized similarity (1 means identical)
year_score = 1 - (diff_year / year_range)
# NUMERIC SIMILARITY for Runtime.
runtime_score = 0
if avg_runtime is not None and movie['runtime_num'] > 0:
diff_runtime = abs(movie['runtime_num'] - avg_runtime)
runtime_score = 1 - (diff_runtime / runtime_range)
# Final combined score.
final_score = w_text * text_score + w_year * year_score + w_runtime * runtime_score
rating_score = 0
movie_rating = movie.get('imdb_rating', 0)
if avg_rating is not None and movie_rating:
diff_rating = abs(movie_rating - avg_rating)
rating_score = 1 - (diff_rating / rating_range)
popularity_score = 0
if movie['vote_count'] > 0:
popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
final_score = (w_text * text_score +
w_year * year_score +
w_runtime * runtime_score +
w_rating * rating_score +
w_popularity * popularity_score)
recommendations.append((movie, final_score))
# Sort recommendations by final score in descending order.
recommendations.sort(key=lambda x: x[1], reverse=True)
return recommendations
return recommendations[:20]
@app.route('/recommend')
def recommend():
@ -179,3 +269,5 @@ def recommend():
if __name__ == '__main__':
app.run(debug=True)

103
test.py
View File

@ -2,6 +2,7 @@ import requests
import json
import time
from tqdm import tqdm # progress bar library
import concurrent.futures
# Replace with your actual TMDb API key
api_key = "96f3424d6fe55c2982e6e094416607f5"
@ -60,11 +61,55 @@ def get_movie_keywords(movie_id):
print(f"Exception while fetching keywords for movie {movie_id}: {e}")
return keywords
def process_movie(movie, page, idx, results_per_page):
    """
    Turn one raw TMDb result into the app's movie record.

    Computes the movie's overall ranking from its page position, extracts
    the basic payload fields, fetches additional details (runtime, genres)
    and keywords, then sleeps briefly to throttle API traffic.
    """
    movie_id = movie.get("id")
    release_date = movie.get("release_date", "")
    poster_path = movie.get("poster_path")
    # Get additional details and keywords.
    details = get_movie_details_tmdb(movie_id)
    tags = get_movie_keywords(movie_id)
    movie_data = {
        "ranking": (page - 1) * results_per_page + idx + 1,
        "title": movie.get("title"),
        "year": release_date.split("-")[0] if release_date else None,
        "runtime": details.get("runtime"),
        "content_rating": None,  # Not available via TMDb by default.
        "metascore": None,  # Not applicable.
        "imdb_rating": movie.get("vote_average"),  # Using TMDb's vote average.
        "vote_count": movie.get("vote_count"),
        "description": movie.get("overview"),
        "poster": f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None,
        "url": f"https://www.themoviedb.org/movie/{movie_id}",
        "genres": details.get("genres", []),
        "tags": tags,
    }
    # Brief sleep to help throttle requests
    time.sleep(0.2)
    return movie_data
def get_top_movies():
"""
Uses the TMDb API to retrieve top rated movies, then iterates through all pages.
For each movie, additional details and keywords are fetched.
After processing each page, the current movies list is saved to a JSON file.
Uses the TMDb API to retrieve top-rated movies and processes them concurrently.
After processing each page, the current list of movies is written to a JSON file.
"""
movies = []
base_url = "https://api.themoviedb.org/3/movie/top_rated"
@ -91,47 +136,21 @@ def get_top_movies():
continue
data = response.json()
results = data.get("results", [])
results_per_page = len(results)
# Process each movie concurrently using a thread pool.
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
futures = []
for idx, movie in enumerate(results):
# Ranking is computed by overall order.
ranking = (page - 1) * len(results) + idx + 1
movie_id = movie.get("id")
title = movie.get("title")
release_date = movie.get("release_date", "")
year = release_date.split("-")[0] if release_date else None
vote_average = movie.get("vote_average")
vote_count = movie.get("vote_count")
overview = movie.get("overview")
poster_path = movie.get("poster_path")
poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"
# Get additional details: runtime and genres.
details = get_movie_details_tmdb(movie_id)
runtime = details.get("runtime")
genres = details.get("genres", [])
# Get keywords (tags).
tags = get_movie_keywords(movie_id)
movie_data = {
"ranking": ranking,
"title": title,
"year": year,
"runtime": runtime,
"content_rating": None, # Not available via TMDb by default.
"metascore": None, # Not applicable.
"imdb_rating": vote_average, # Using TMDb's vote average.
"vote_count": vote_count,
"description": overview,
"poster": poster,
"url": tmdb_url,
"genres": genres,
"tags": tags
}
futures.append(executor.submit(process_movie, movie, page, idx, results_per_page))
# Collect results as they complete.
for future in concurrent.futures.as_completed(futures):
try:
movie_data = future.result()
movies.append(movie_data)
# Pause a bit between detail requests to be courteous.
time.sleep(0.2)
# After processing each page, write the current movies list to the JSON file.
except Exception as e:
print(f"Error processing movie: {e}")
# Write movies to JSON file incrementally after each page.
write_movies(movies)
# Pause between pages.
time.sleep(0.5)

File diff suppressed because it is too large Load Diff