Starting on Prediction Algorithm

This commit is contained in:
OusmBlueNinja 2025-04-02 13:49:04 -05:00
commit 547b161138
7 changed files with 6406 additions and 0 deletions

181
app.py Normal file
View File

@ -0,0 +1,181 @@
import json
import os
import random
import secrets

import numpy as np
from flask import Flask, request, render_template, redirect, url_for, session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
app = Flask(__name__)
# Flask signs the session cookie with this key, so a value committed to source
# control lets anyone forge sessions. Read it from the SECRET_KEY environment
# variable; when that is unset (local development) fall back to a random
# per-process key, which only means sessions do not survive a restart.
app.secret_key = os.environ.get('SECRET_KEY') or secrets.token_hex(32)

# Load movies from top_movies.json with UTF-8 encoding.
with open('top_movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# Assign a unique ID and preprocess features for each movie.
for i, movie in enumerate(movies):
    # The id is the movie's position in `movies`, which is also its row in
    # `movie_vectors` built below.
    movie['id'] = i
    # `or []` covers entries where the key is present but holds null.
    genres = movie.get('genres') or []
    tags = movie.get('tags') or []
    # Combine genres and tags into a single feature string for TF-IDF.
    movie['features'] = ' '.join(genres) + ' ' + ' '.join(tags)
    # Numeric year/runtime; 0 marks "unknown" and is excluded from the ranges
    # and from the similarity scores.
    try:
        movie['year_num'] = int(movie.get('year', '0'))
    except (TypeError, ValueError, OverflowError):
        movie['year_num'] = 0
    try:
        # runtime might be a number already or a string; if string, try to convert.
        movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
    except (TypeError, ValueError):
        movie['runtime_num'] = 0

# Build the TF-IDF vectorizer on movie features.
vectorizer = TfidfVectorizer(stop_words='english')
movie_features = [movie['features'] for movie in movies]
movie_vectors = vectorizer.fit_transform(movie_features)

# Precompute overall ranges for numeric features across the dataset.
years = [m['year_num'] for m in movies if m['year_num'] > 0]
runtimes = [m['runtime_num'] for m in movies if m['runtime_num'] > 0]
min_year, max_year = (min(years), max(years)) if years else (0, 1)
min_runtime, max_runtime = (min(runtimes), max(runtimes)) if runtimes else (0, 1)
# A zero-width range would divide by zero when normalising, so fall back to 1.
year_range = max_year - min_year if max_year != min_year else 1
runtime_range = max_runtime - min_runtime if max_runtime != min_runtime else 1
def get_diverse_movies(num=10):
    """
    Pick up to `num` movies that have not been shown yet, trying to cover different genres.

    For each desired genre, the first not-yet-shown movie (in dataset order)
    carrying that genre is taken; any remaining slots are filled with a random
    sample of the other unseen movies.

    Returns a list of movie dicts, empty when every movie was already shown.
    """
    # Sets of ids give O(1) membership tests instead of scanning lists and
    # comparing whole movie dicts for equality.
    asked_ids = set(session.get('asked_movies', []))
    available = [m for m in movies if m['id'] not in asked_ids]
    if not available:
        return []

    # List of desired genres to cover.
    desired_genres = ["Action", "Adventure", "Comedy", "Drama", "Horror",
                      "Romance", "Sci-Fi", "Thriller", "Animation", "Documentary"]

    selected = []
    selected_ids = set()
    # Try to pick one movie per desired genre.
    for genre in desired_genres:
        for m in available:
            # `or []` tolerates a movie whose genres value is null.
            if m['id'] not in selected_ids and genre in (m.get('genres') or []):
                selected.append(m)
                selected_ids.add(m['id'])
                break
        if len(selected) >= num:
            break

    # If we still need more movies, fill the remainder randomly.
    if len(selected) < num:
        remaining = [m for m in available if m['id'] not in selected_ids]
        random.shuffle(remaining)
        selected.extend(remaining[:num - len(selected)])
    return selected[:num]
def enough_info():
    """
    Report whether the session holds enough ratings to recommend from.

    Only 'like' and 'dislike' count; once at least 3 such ratings are stored
    in the session the function returns True.
    """
    stored_ratings = session.get('rated_movies', {}).values()
    decisive = [r for r in stored_ratings if r == 'like' or r == 'dislike']
    return len(decisive) >= 3
@app.route('/')
def home():
    """Make sure the session containers exist, then send the user to the questionnaire."""
    session_defaults = (
        ('rated_movies', {}),   # {movie_id: rating}
        ('asked_movies', []),   # movie ids that were already shown
    )
    for key, empty_value in session_defaults:
        session.setdefault(key, empty_value)
    return redirect(url_for('questionnaire'))
@app.route('/questionnaire', methods=['GET', 'POST'])
def questionnaire():
    """
    Show one round of movies to rate (GET) or store a submitted round (POST).

    GET renders `questionnaire.html` with up to 10 unseen movies, or redirects
    to the recommendations when nothing is left to show.

    POST reads the repeated `movie_id` fields and their `rating_<id>` values,
    records them in the session, then redirects to the recommendations once
    enough ratings exist (or no movies remain), otherwise to the next round.
    """
    if request.method != 'POST':
        selected_movies = get_diverse_movies(num=10)
        if not selected_movies:
            return redirect(url_for('recommend'))
        return render_template('questionnaire.html', movies=selected_movies)

    # setdefault so a POST that arrives without a prior visit to '/' does not
    # raise KeyError.
    rated = session.setdefault('rated_movies', {})
    asked = session.setdefault('asked_movies', [])
    allowed_ratings = ('like', 'dislike', 'not seen')

    # Process ratings from the current round. Form data is untrusted, so ids
    # that are not valid indexes into `movies` are ignored.
    for raw_id in request.form.getlist("movie_id"):
        try:
            movie_idx = int(raw_id)
        except ValueError:
            continue
        if not 0 <= movie_idx < len(movies):
            continue
        rating = request.form.get(f"rating_{raw_id}")
        if rating not in allowed_ratings:
            rating = 'not seen'
        # Keys are stored as strings, matching how they are looked up later.
        rated[str(movie_idx)] = rating
        if movie_idx not in asked:
            asked.append(movie_idx)

    # Flask only notices assignments on the session object itself; changes made
    # inside the nested dict/list must be flagged or they are never saved.
    session.modified = True

    asked_ids = set(asked)
    has_remaining = any(m['id'] not in asked_ids for m in movies)
    if enough_info() or not has_remaining:
        return redirect(url_for('recommend'))
    return redirect(url_for('questionnaire'))
def advanced_recommendations():
    """
    Build an advanced recommendation score for movies not rated by the user.
    Combines:
    1. Text similarity (from TF-IDF features on genres/tags): cosine similarity
       to the mean vector of liked movies minus similarity to the mean vector
       of disliked movies.
    2. Year similarity: movies with similar release years to liked movies.
    3. Runtime similarity: movies with similar runtime to liked movies.
    The final score is a weighted sum of these signals.

    Returns a list of `(movie, score)` tuples sorted by score, highest first.
    Movies whose stored rating is anything other than "not seen" are left out.
    """
    rated = session.get('rated_movies', {})
    n_movies = len(movies)

    def ids_rated(label):
        # Session data comes from a client-side cookie and may be stale, so
        # keep only ids that are valid indexes into `movies`.
        ids = []
        for mid, rating in rated.items():
            if rating != label:
                continue
            try:
                idx = int(mid)
            except (TypeError, ValueError):
                continue
            if 0 <= idx < n_movies:
                ids.append(idx)
        return ids

    def similarity_to_profile(ids):
        # Cosine similarity of every movie to the mean TF-IDF vector of `ids`,
        # computed in one call rather than once per movie. All zeros when
        # there is no usable profile.
        if not ids:
            return np.zeros(n_movies)
        profile = np.asarray(movie_vectors[ids].mean(axis=0))
        if np.linalg.norm(profile) == 0:
            return np.zeros(n_movies)
        return np.asarray(cosine_similarity(movie_vectors, profile)).ravel()

    liked_ids = ids_rated('like')
    disliked_ids = ids_rated('dislike')

    # TEXT SIMILARITY: difference between similarity to liked and disliked profiles.
    text_scores = similarity_to_profile(liked_ids) - similarity_to_profile(disliked_ids)

    # Compute numeric averages for liked movies (for year and runtime);
    # a value of 0 means "unknown" and is left out of the average.
    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
    avg_year = np.mean(liked_years) if liked_years else None
    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None

    # Weights for each component; adjust these to tune the algorithm.
    w_text = 0.70
    w_year = 0.15
    w_runtime = 0.15

    recommendations = []
    for i, movie in enumerate(movies):
        if rated.get(str(movie['id']), "not seen") != "not seen":
            continue  # Skip movies already rated.

        # NUMERIC SIMILARITY for Year.
        year_score = 0
        if avg_year is not None and movie['year_num'] > 0:
            diff_year = abs(movie['year_num'] - avg_year)
            year_score = 1 - (diff_year / year_range)  # normalized similarity (1 means identical)

        # NUMERIC SIMILARITY for Runtime.
        runtime_score = 0
        if avg_runtime is not None and movie['runtime_num'] > 0:
            diff_runtime = abs(movie['runtime_num'] - avg_runtime)
            runtime_score = 1 - (diff_runtime / runtime_range)

        # Final combined score.
        final_score = w_text * text_scores[i] + w_year * year_score + w_runtime * runtime_score
        recommendations.append((movie, final_score))

    # Sort recommendations by final score in descending order.
    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations
@app.route('/recommend')
def recommend():
    """Render the recommendations page with the scored movies for this session."""
    return render_template(
        'recommendations.html',
        recommendations=advanced_recommendations(),
    )
if __name__ == '__main__':
    # Debug mode turns on the interactive Werkzeug debugger, which must never
    # be reachable outside local development. Opt in with FLASK_DEBUG=1.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')

1584
out.html Normal file

File diff suppressed because it is too large Load Diff

77
templates/index.html Normal file
View File

@ -0,0 +1,77 @@
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <title>Movie Slideshow</title>
  <style>
    /* Basic styling for slideshow */
    #movie-container {
      text-align: center;
      margin-top: 30px;
    }

    #movie-poster {
      width: 200px;
      margin: 20px;
    }

    .rating-buttons button {
      margin: 10px;
      padding: 10px 20px;
      font-size: 16px;
    }
  </style>
</head>
<body>
  <h1 style="text-align: center;">Rate Movies</h1>

  <form id="ratingForm" method="POST" action="/recommend">
    <!-- Hidden inputs for movie ratings; one per movie -->
    {% for movie in movies %}
    <input type="hidden" name="{{ movie.title }}" id="rating-{{ loop.index0 }}" value="not seen">
    {% endfor %}

    <div id="movie-container">
      <img id="movie-poster" src="" alt="Movie Poster">
      <h2 id="movie-title"></h2>
      <p id="movie-description"></p>
    </div>

    <div class="rating-buttons" style="text-align: center;">
      <button type="button" onclick="recordRating('like')">Like</button>
      <button type="button" onclick="recordRating('dislike')">Dislike</button>
      <button type="button" onclick="recordRating('not seen')">Not Seen</button>
    </div>
  </form>

  <script>
    // Movie list rendered by the server into the page.
    const movies = {{ movies | tojson }};
    let currentIndex = 0;

    const posterEl = document.getElementById("movie-poster");
    const titleEl = document.getElementById("movie-title");
    const descriptionEl = document.getElementById("movie-description");

    // Display the movie at the given index, or submit the form once the
    // index has run past the end of the list.
    function showMovie(index) {
      if (index < movies.length) {
        const movie = movies[index];
        posterEl.src = movie.poster;
        posterEl.alt = movie.title;
        titleEl.textContent = `${movie.title} (${movie.year})`;
        descriptionEl.textContent = movie.description;
        return;
      }
      // All movies rated; submit the form.
      document.getElementById("ratingForm").submit();
    }

    // Store the chosen rating in the current movie's hidden input, then
    // advance the slideshow.
    function recordRating(rating) {
      const hiddenInput = document.getElementById("rating-" + currentIndex);
      hiddenInput.value = rating;
      currentIndex += 1;
      showMovie(currentIndex);
    }

    // Initialize the slideshow with the first movie.
    showMovie(currentIndex);
  </script>
</body>
</html>

View File

@ -0,0 +1,83 @@
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <title>Movie Questionnaire</title>
  <style>
    /* Styling for the slideshow */
    #movie-container {
      text-align: center;
      margin-top: 30px;
    }

    #movie-poster {
      width: 200px;
      margin: 20px;
    }

    .rating-buttons button {
      margin: 10px;
      padding: 10px 20px;
      font-size: 16px;
    }
  </style>
</head>
<body>
  <h1 style="text-align: center;">Rate Movies</h1>

  <form id="questionForm" method="POST">
    <!-- Container where hidden inputs will be added for the round -->
    <div id="hiddenFields"></div>

    <div id="movie-container">
      <img id="movie-poster" src="" alt="Movie Poster">
      <h2 id="movie-title"></h2>
      <p id="movie-description"></p>
    </div>

    <div class="rating-buttons" style="text-align: center;">
      <button type="button" onclick="recordRating('like')">Like</button>
      <button type="button" onclick="recordRating('dislike')">Dislike</button>
      <button type="button" onclick="recordRating('not seen')">Not Seen</button>
    </div>
  </form>

  <script>
    // Movies for the current round are passed from the server.
    const movies = {{ movies | tojson }};
    let currentIndex = 0;
    const movieRatings = {}; // Ratings for this batch, keyed by movie id.

    // Append one hidden <input> with the given name/value to the container.
    function addHiddenField(container, name, value) {
      const input = document.createElement("input");
      input.type = "hidden";
      input.name = name;
      input.value = value;
      container.appendChild(input);
    }

    // Emit a `movie_id` field and a matching `rating_<id>` field for every
    // movie of the round, then submit the form.
    function submitRound() {
      const container = document.getElementById("hiddenFields");
      movies.forEach(movie => {
        addHiddenField(container, "movie_id", movie.id);
        addHiddenField(container, "rating_" + movie.id, movieRatings[movie.id] || "not seen");
      });
      document.getElementById("questionForm").submit();
    }

    // Display the movie at `index`; past the end of the list the round is
    // complete and gets submitted.
    function showMovie(index) {
      if (index >= movies.length) {
        submitRound();
        return;
      }
      // Use the parameter rather than the global counter so the function
      // shows the movie it was asked to show.
      const movie = movies[index];
      const posterEl = document.getElementById("movie-poster");
      // Poster and year can be null in the dataset; avoid requesting the
      // literal URL "null" and printing "(null)" after the title.
      if (movie.poster) {
        posterEl.src = movie.poster;
      } else {
        posterEl.removeAttribute("src");
      }
      posterEl.alt = movie.title;
      document.getElementById("movie-title").textContent =
        movie.year ? movie.title + " (" + movie.year + ")" : movie.title;
      document.getElementById("movie-description").textContent = movie.description;
    }

    // Remember the rating for the current movie and move to the next one.
    function recordRating(rating) {
      movieRatings[movies[currentIndex].id] = rating;
      currentIndex++;
      showMovie(currentIndex);
    }

    showMovie(currentIndex);
  </script>
</body>
</html>

View File

@ -0,0 +1,21 @@
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <title>Movie Recommendations</title>
</head>
<body>
  <h1>Your Movie Recommendations</h1>
  {# `recommendations` is a list of (movie, score) pairs, best score first. #}
  {% for movie, score in recommendations %}
  <div style="margin-bottom: 20px;">
    {# Poster and year may be missing in the dataset; skip them rather than render a broken image or "(None)". #}
    {% if movie.poster %}
    <img src="{{ movie.poster }}" alt="{{ movie.title }}" width="70" style="vertical-align: middle;" />
    {% endif %}
    <strong>{{ movie.title }}{% if movie.year %} ({{ movie.year }}){% endif %}</strong>
    <p>{{ movie.description }}</p>
    {# rel="noopener noreferrer" keeps the external page from reaching back into this tab via window.opener. #}
    <a href="{{ movie.url }}" target="_blank" rel="noopener noreferrer">More Info</a>
    <p>Recommendation Score: {{ score | round(3) }}</p>
  </div>
  <hr>
  {% endfor %}
  <a href="/">Back to Questionnaire</a>
</body>
</html>

142
test.py Normal file
View File

@ -0,0 +1,142 @@
import requests
import json
import time
from tqdm import tqdm # progress bar library
import os

# TMDb API key. Credentials do not belong in source control, so the key is
# read from the TMDB_API_KEY environment variable; when it is unset the value
# is an empty string and TMDb will reject the requests.
api_key = os.environ.get("TMDB_API_KEY", "")
# Output file where results are saved incrementally
output_filename = "top_movies.json"
def write_movies(movies, filename=output_filename):
    """
    Helper function to write the movies list to a JSON file.

    The data is written to `<filename>.tmp` first and then moved over the
    target, so an interruption in the middle of a save leaves the previous
    file intact instead of a truncated JSON document. Saving is best-effort:
    any failure is reported on stdout and not raised.
    """
    import os  # local import: only this helper needs it

    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(movies, f, indent=4, ensure_ascii=False)
        # Swap the finished file into place in a single step.
        os.replace(tmp_filename, filename)
    except Exception as e:
        print(f"Error saving data to JSON file: {e}")
def get_movie_details_tmdb(movie_id, timeout=10):
    """
    Fetch additional details for a movie using the TMDb API.

    Args:
        movie_id: TMDb id of the movie.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        A dict with "runtime" and "genres" (list of genre names) on success,
        or an empty dict when the request fails. Failures are printed, never
        raised.
    """
    details = {}
    details_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "language": "en-US"
    }
    try:
        response = requests.get(details_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            details["runtime"] = data.get("runtime")  # runtime in minutes
            details["genres"] = [g["name"] for g in data.get("genres", [])]
        else:
            print(f"Failed to get details for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        # Deliberately broad: one bad movie must not abort the whole scrape.
        print(f"Exception while fetching details for movie {movie_id}: {e}")
    return details
def get_movie_keywords(movie_id, timeout=10):
    """
    Fetch movie keywords (tags) using the TMDb API.

    Args:
        movie_id: TMDb id of the movie.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        A list of keyword names, empty when the request fails. Failures are
        printed, never raised.
    """
    keywords = []
    keywords_url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords"
    params = {
        "api_key": api_key
    }
    try:
        response = requests.get(keywords_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            keywords = [kw["name"] for kw in data.get("keywords", [])]
        else:
            print(f"Failed to get keywords for movie {movie_id}: status code {response.status_code}")
    except Exception as e:
        # Deliberately broad: one bad movie must not abort the whole scrape.
        print(f"Exception while fetching keywords for movie {movie_id}: {e}")
    return keywords
def _fetch_json(url, params, timeout=10):
    """Return the decoded JSON body of a GET request, or None on any failure."""
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Request to {url} failed: {e}")
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        print(f"Invalid JSON from {url}: {e}")
        return None


def get_top_movies():
    """
    Uses the TMDb API to retrieve top rated movies, then iterates through all pages.
    For each movie, additional details and keywords are fetched.
    After processing each page, the current movies list is saved to a JSON file.

    Returns the list of movie dicts collected, or an empty list when the first
    page cannot be retrieved. A page that fails later on is reported and
    skipped.
    """
    movies = []
    base_url = "https://api.themoviedb.org/3/movie/top_rated"
    params = {
        "api_key": api_key,
        "language": "en-US",
        "page": 1
    }

    # Initial request to determine total pages.
    data = _fetch_json(base_url, params)
    if data is None:
        print("Failed to retrieve top rated movies")
        return []
    total_pages = data.get("total_pages", 1)
    # Length of the first page is taken as the page size for ranking, so a
    # shorter page later on does not shift the rank offset.
    page_size = len(data.get("results", []))

    # Loop through all pages.
    for page in tqdm(range(1, total_pages + 1), desc="Scraping top rated movies"):
        # Page 1 was already fetched above; only request the later pages.
        if page > 1:
            params["page"] = page
            data = _fetch_json(base_url, params)
            if data is None:
                print(f"Failed to retrieve page {page}")
                continue

        results = data.get("results", [])
        for idx, movie in enumerate(results):
            # Ranking is computed by overall order.
            ranking = (page - 1) * (page_size or len(results)) + idx + 1
            movie_id = movie.get("id")
            title = movie.get("title")
            release_date = movie.get("release_date", "")
            year = release_date.split("-")[0] if release_date else None
            vote_average = movie.get("vote_average")
            vote_count = movie.get("vote_count")
            overview = movie.get("overview")
            poster_path = movie.get("poster_path")
            poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
            tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"

            # Get additional details: runtime and genres.
            details = get_movie_details_tmdb(movie_id)
            runtime = details.get("runtime")
            genres = details.get("genres", [])

            # Get keywords (tags).
            tags = get_movie_keywords(movie_id)

            movie_data = {
                "ranking": ranking,
                "title": title,
                "year": year,
                "runtime": runtime,
                "content_rating": None,  # Not available via TMDb by default.
                "metascore": None,  # Not applicable.
                "imdb_rating": vote_average,  # Using TMDb's vote average.
                "vote_count": vote_count,
                "description": overview,
                "poster": poster,
                "url": tmdb_url,
                "genres": genres,
                "tags": tags
            }
            movies.append(movie_data)
            # Pause a bit between detail requests to be courteous.
            time.sleep(0.2)

        # After processing each page, write the current movies list to the JSON file.
        write_movies(movies)
        # Pause between pages.
        time.sleep(0.5)
    return movies
if __name__ == "__main__":
    top_movies = get_top_movies()
    # get_top_movies returns an empty list when nothing could be retrieved;
    # do not claim a save happened in that case.
    if top_movies:
        print(f"\nData saved to {output_filename}")
    else:
        print("\nNo movies were retrieved; nothing was saved.")

4318
top_movies.json Normal file

File diff suppressed because it is too large Load Diff