Commit: 914c085cac (parent: 547b161138)
Commit message: bad

app.py: 206 lines changed
@@ -2,6 +2,7 @@ from flask import Flask, request, render_template, redirect, url_for, session
import json
import numpy as np
import random
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

@@ -12,67 +13,141 @@ app.secret_key = 'your_secret_key_here'  # Replace with a secure key in production
with open('top_movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# Assign a unique ID and preprocess features for each movie
# Preprocess each movie
for i, movie in enumerate(movies):
    movie['id'] = i  # Unique id for each movie
    # Combine genres and tags into a feature string (could add description etc.)
    movie['id'] = i  # Unique ID
    # Combine genres and tags into one feature string.
    movie['features'] = ' '.join(movie.get('genres', [])) + ' ' + ' '.join(movie.get('tags', []))
    # Ensure numeric values for year and runtime if possible:
    # Ensure numeric values for year and runtime:
    try:
        movie['year_num'] = int(movie.get('year', '0'))
    except:
        movie['year_num'] = 0
    try:
        # runtime might be a number already or a string; if string, try to convert.
        movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
    except:
        movie['runtime_num'] = 0
    # Ensure vote_count is numeric.
    try:
        count = movie.get('vote_count', 0)
        if isinstance(count, str):
            count = count.replace(',', '')
            if 'M' in count:
                count = float(count.replace('M', '')) * 1e6
            else:
                count = int(count)
        movie['vote_count'] = int(count)
    except:
        movie['vote_count'] = 0

# Build the TF-IDF vectorizer on movie features.
vectorizer = TfidfVectorizer(stop_words='english')
movie_features = [movie['features'] for movie in movies]
movie_vectors = vectorizer.fit_transform(movie_features)
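
A minimal sketch of how these TF-IDF vectors are used further down: averaging the rows of liked movies gives a profile vector, and cosine similarity between that profile and a single movie's row gives the text signal. The toy strings and names below are hypothetical, not from this codebase.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

toy_features = ["Action Adventure hero", "Romance Drama love", "Action Sci-Fi space"]
toy_matrix = TfidfVectorizer(stop_words='english').fit_transform(toy_features)
toy_profile = np.asarray(toy_matrix[[0, 2]].mean(axis=0))   # mean TF-IDF vector of "liked" rows 0 and 2
toy_sim = cosine_similarity(toy_matrix[1].toarray(), toy_profile)[0][0]
print(round(toy_sim, 3))   # 0.0 here, since row 1 shares no terms with rows 0 and 2
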
# Precompute overall ranges for numeric features across the dataset.
# Precompute overall ranges for numeric features.
years = [m['year_num'] for m in movies if m['year_num'] > 0]
runtimes = [m['runtime_num'] for m in movies if m['runtime_num'] > 0]
max_vote = max([m['vote_count'] for m in movies]) if movies else 1

min_year, max_year = (min(years), max(years)) if years else (0, 1)
min_runtime, max_runtime = (min(runtimes), max(runtimes)) if runtimes else (0, 1)
year_range = max_year - min_year if max_year != min_year else 1
runtime_range = max_runtime - min_runtime if max_runtime != min_runtime else 1
rating_range = 10.0  # Assuming ratings are on a 0–10 scale
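
The year and runtime signals below rely on this simple range normalization. As a hedged worked example with made-up numbers: if liked movies average out to the year 2005 and the dataset spans 1950 to 2020, a 1998 release scores 1 - |1998 - 2005| / 70 = 0.9.

example_year_range = 2020 - 1950                      # hypothetical dataset span
example_year_score = 1 - abs(1998 - 2005) / example_year_range
print(round(example_year_score, 3))                   # 0.9
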

def get_diverse_movies(num=10):
def get_predicted_movies(num=10):
    """
    Pick up to `num` movies that have not been shown yet, trying to cover different genres.
    Return up to `num` movies that haven't been shown yet.
    Uses the user's past ratings to predict which unseen movies they might like.
    If no ratings exist, falls back to random selection.
    """
    asked = session.get('asked_movies', [])
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        return []
    selected = []
    # List of desired genres to cover
    desired_genres = ["Action", "Adventure", "Comedy", "Drama", "Horror",
                      "Romance", "Sci-Fi", "Thriller", "Animation", "Documentary"]
    # Try to pick one movie per desired genre.
    for genre in desired_genres:
        for m in available:
            if genre in m.get('genres', []) and m not in selected:
                selected.append(m)
                break
        if len(selected) >= num:
            break
    # If we still need more movies, fill the remainder randomly.
    if len(selected) < num:
        remaining = [m for m in available if m not in selected]
        random.shuffle(remaining)
        selected.extend(remaining[:(num - len(selected))])
    return selected[:num]
    rated = session.get('rated_movies', {})
    # Fallback to random selection if there are no like/dislike ratings.
    if not rated or not any(r in ['like', 'dislike'] for r in rated.values()):
        random.shuffle(available)
        return available[:num]

    # Build prediction profiles.
    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']

    if liked_ids:
        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
    else:
        liked_profile = np.zeros((1, movie_vectors.shape[1]))
    if disliked_ids:
        disliked_profile = np.asarray(movie_vectors[disliked_ids].mean(axis=0))
    else:
        disliked_profile = np.zeros((1, movie_vectors.shape[1]))

    # Compute numeric averages for liked movies.
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]

    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    avg_rating = np.mean(liked_ratings) if liked_ratings else None

    predictions = []
    # Tunable weights.
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15

    for movie in available:
        i = movie['id']
        # TEXT SIMILARITY.
        movie_vector = movie_vectors[i].toarray()
        like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
        dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
        text_score = like_sim - dislike_sim

        # YEAR SIMILARITY.
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            diff_year = abs(movie['year_num'] - avg_year)
            year_score = 1 - (diff_year / year_range)

        # RUNTIME SIMILARITY.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            diff_runtime = abs(movie['runtime_num'] - avg_runtime)
            runtime_score = 1 - (diff_runtime / runtime_range)

        # RATING SIMILARITY.
        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            diff_rating = abs(movie_rating - avg_rating)
            rating_score = 1 - (diff_rating / rating_range)

        # POPULARITY SCORE.
        popularity_score = 0
        if movie['vote_count'] > 0:
            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)

        # Final prediction score.
        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        predictions.append((movie, final_score))

    predictions.sort(key=lambda x: x[1], reverse=True)
    return [pred[0] for pred in predictions[:num]]
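
To make the weighting above concrete, here is a worked example with hypothetical component scores (text 0.4, year 0.9, runtime 0.8, rating 0.7, popularity 0.6) and the weights defined in the loop.

example_final = 0.5 * 0.4 + 0.1 * 0.9 + 0.1 * 0.8 + 0.15 * 0.7 + 0.15 * 0.6
print(round(example_final, 3))   # 0.565; the text-similarity term contributes the largest share
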

def enough_info():
    """
    Determines whether we have collected enough ratings.
    In this example, we require that the user has given a 'like' or 'dislike'
    to at least 3 movies.
    Check if the user has rated at least 3 movies (like/dislike).
    """
    rated = session.get('rated_movies', {})
    count = sum(1 for rating in rated.values() if rating in ['like', 'dislike'])
@@ -80,15 +155,13 @@ def enough_info():

@app.route('/')
def home():
    # Initialize session variables
    session.setdefault('rated_movies', {})   # {movie_id: rating}
    session.setdefault('asked_movies', [])   # list of movie ids already asked
    session.setdefault('asked_movies', [])   # list of movie IDs already shown
    return redirect(url_for('questionnaire'))

@app.route('/questionnaire', methods=['GET', 'POST'])
def questionnaire():
    if request.method == 'POST':
        # Process ratings from the current round.
        current_ids = request.form.getlist("movie_id")
        for movie_id in current_ids:
            rating = request.form.get(f"rating_{movie_id}")
@@ -101,25 +174,34 @@ def questionnaire():
        else:
            return redirect(url_for('questionnaire'))
    else:
        selected_movies = get_diverse_movies(num=10)
        # Use prediction to select movies for the questionnaire.
        selected_movies = get_predicted_movies(num=10)
        if not selected_movies:
            return redirect(url_for('recommend'))
        return render_template('questionnaire.html', movies=selected_movies)
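
For reference, the POST branch above expects one movie_id field per movie shown plus a matching rating_<id> field. A minimal sketch of such a payload; the IDs and values are hypothetical.

example_form = {
    "movie_id": ["3", "17"],        # one entry per movie shown in this round
    "rating_3": "like",
    "rating_17": "not seen",        # "like", "dislike", or "not seen"
}
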

def advanced_recommendations():
    """
    Build an advanced recommendation score for movies not rated by the user.
    Compute an advanced hybrid recommendation score on unseen movies.
    Only movies not already shown (asked) are considered.
    Combines:
    1. Text similarity (from TF-IDF features on genres/tags).
    2. Year similarity: movies with similar release years to liked movies.
    3. Runtime similarity: movies with similar runtime to liked movies.
    The final score is a weighted sum of these signals.
    1. Text similarity (TF-IDF) between liked/disliked profiles.
    2. Year similarity.
    3. Runtime similarity.
    4. Rating similarity.
    5. Popularity (log-scaled vote count).
    Returns the top 20 recommendations.
    """
    rated = session.get('rated_movies', {})
    asked = set(session.get('asked_movies', []))
    # Only consider movies that haven't been shown to the user.
    available = [m for m in movies if m['id'] not in asked]
    if not available:
        available = movies  # Fallback if all movies have been shown.

    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']

    # Build text profiles for liked/disliked movies.
    if liked_ids:
        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
    else:
@@ -129,48 +211,56 @@ def advanced_recommendations():
    else:
        disliked_profile = np.zeros((1, movie_vectors.shape[1]))

    # Compute numeric averages for liked movies (for year and runtime).
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
    avg_rating = np.mean(liked_ratings) if liked_ratings else None

    recommendations = []
    # Weights for each component – adjust these to tune the algorithm.
    w_text = 0.70
    w_year = 0.15
    w_runtime = 0.15
    w_text = 0.5
    w_year = 0.1
    w_runtime = 0.1
    w_rating = 0.15
    w_popularity = 0.15

    for i, movie in enumerate(movies):
        movie_id = str(movie['id'])
        if rated.get(movie_id, "not seen") != "not seen":
            continue  # Skip movies already rated.

        # TEXT SIMILARITY: difference between similarity to liked and disliked profiles.
    for movie in available:
        i = movie['id']
        movie_vector = movie_vectors[i].toarray()
        like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
        dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
        text_score = like_sim - dislike_sim

        # NUMERIC SIMILARITY for Year.
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            diff_year = abs(movie['year_num'] - avg_year)
            year_score = 1 - (diff_year / year_range)  # normalized similarity (1 means identical)
            year_score = 1 - (diff_year / year_range)

        # NUMERIC SIMILARITY for Runtime.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            diff_runtime = abs(movie['runtime_num'] - avg_runtime)
            runtime_score = 1 - (diff_runtime / runtime_range)

        # Final combined score.
        final_score = w_text * text_score + w_year * year_score + w_runtime * runtime_score
        rating_score = 0
        movie_rating = movie.get('imdb_rating', 0)
        if avg_rating is not None and movie_rating:
            diff_rating = abs(movie_rating - avg_rating)
            rating_score = 1 - (diff_rating / rating_range)

        popularity_score = 0
        if movie['vote_count'] > 0:
            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)

        final_score = (w_text * text_score +
                       w_year * year_score +
                       w_runtime * runtime_score +
                       w_rating * rating_score +
                       w_popularity * popularity_score)
        recommendations.append((movie, final_score))

    # Sort recommendations by final score in descending order.
    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations
    return recommendations[:20]

@app.route('/recommend')
def recommend():
@@ -179,3 +269,5 @@ def recommend():

if __name__ == '__main__':
    app.run(debug=True)

test.py: 103 lines changed

@@ -2,6 +2,7 @@ import requests
import json
import time
from tqdm import tqdm  # progress bar library
import concurrent.futures

# Replace with your actual TMDb API key
api_key = "96f3424d6fe55c2982e6e094416607f5"
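
Since the comment above calls for a personal key, one hedged alternative is to read it from an environment variable rather than hard-coding it; the variable name below is an assumption, not something used elsewhere in this repository.

import os

api_key = os.environ.get("TMDB_API_KEY", "")   # hypothetical variable name
if not api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable before running this script")
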

@@ -60,11 +61,55 @@ def get_movie_keywords(movie_id):
        print(f"Exception while fetching keywords for movie {movie_id}: {e}")
    return keywords

def process_movie(movie, page, idx, results_per_page):
    """
    Processes a single movie record:
    - Computes its ranking,
    - Extracts basic information,
    - Fetches additional details and keywords.
    """
    ranking = (page - 1) * results_per_page + idx + 1
    movie_id = movie.get("id")
    title = movie.get("title")
    release_date = movie.get("release_date", "")
    year = release_date.split("-")[0] if release_date else None
    vote_average = movie.get("vote_average")
    vote_count = movie.get("vote_count")
    overview = movie.get("overview")
    poster_path = movie.get("poster_path")
    poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
    tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"

    # Get additional details and keywords.
    details = get_movie_details_tmdb(movie_id)
    runtime = details.get("runtime")
    genres = details.get("genres", [])

    tags = get_movie_keywords(movie_id)

    movie_data = {
        "ranking": ranking,
        "title": title,
        "year": year,
        "runtime": runtime,
        "content_rating": None,        # Not available via TMDb by default.
        "metascore": None,             # Not applicable.
        "imdb_rating": vote_average,   # Using TMDb's vote average.
        "vote_count": vote_count,
        "description": overview,
        "poster": poster,
        "url": tmdb_url,
        "genres": genres,
        "tags": tags
    }
    # Brief sleep to help throttle requests
    time.sleep(0.2)
    return movie_data

def get_top_movies():
    """
    Uses the TMDb API to retrieve top rated movies, then iterates through all pages.
    For each movie, additional details and keywords are fetched.
    After processing each page, the current movies list is saved to a JSON file.
    Uses the TMDb API to retrieve top-rated movies and processes them concurrently.
    After processing each page, the current list of movies is written to a JSON file.
    """
    movies = []
    base_url = "https://api.themoviedb.org/3/movie/top_rated"

@@ -91,47 +136,21 @@ def get_top_movies():
            continue
        data = response.json()
        results = data.get("results", [])
        results_per_page = len(results)

        # Process each movie concurrently using a thread pool.
        with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
            futures = []
            for idx, movie in enumerate(results):
                # Ranking is computed by overall order.
                ranking = (page - 1) * len(results) + idx + 1
                movie_id = movie.get("id")
                title = movie.get("title")
                release_date = movie.get("release_date", "")
                year = release_date.split("-")[0] if release_date else None
                vote_average = movie.get("vote_average")
                vote_count = movie.get("vote_count")
                overview = movie.get("overview")
                poster_path = movie.get("poster_path")
                poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
                tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"

                # Get additional details: runtime and genres.
                details = get_movie_details_tmdb(movie_id)
                runtime = details.get("runtime")
                genres = details.get("genres", [])

                # Get keywords (tags).
                tags = get_movie_keywords(movie_id)

                movie_data = {
                    "ranking": ranking,
                    "title": title,
                    "year": year,
                    "runtime": runtime,
                    "content_rating": None,        # Not available via TMDb by default.
                    "metascore": None,             # Not applicable.
                    "imdb_rating": vote_average,   # Using TMDb's vote average.
                    "vote_count": vote_count,
                    "description": overview,
                    "poster": poster,
                    "url": tmdb_url,
                    "genres": genres,
                    "tags": tags
                }
                futures.append(executor.submit(process_movie, movie, page, idx, results_per_page))
            # Collect results as they complete.
            for future in concurrent.futures.as_completed(futures):
                try:
                    movie_data = future.result()
                    movies.append(movie_data)
                    # Pause a bit between detail requests to be courteous.
                    time.sleep(0.2)
                    # After processing each page, write the current movies list to the JSON file.
                except Exception as e:
                    print(f"Error processing movie: {e}")
        # Write movies to JSON file incrementally after each page.
        write_movies(movies)
        # Pause between pages.
        time.sleep(0.5)
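
write_movies() is called above but its body falls outside the hunks shown in this diff. A minimal sketch consistent with the docstring ("the current list of movies is written to a JSON file") and with the file that app.py loads; the actual implementation in the repository may differ.

def write_movies(movies):
    # Hypothetical sketch: dump the accumulated list to the JSON file that app.py reads.
    with open("top_movies.json", "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)
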

top_movies.json: 96832 lines changed (diff suppressed because it is too large)