Commit 914c085cac ("bad")
Parent: 547b161138

app.py (206 lines changed)
@@ -2,6 +2,7 @@ from flask import Flask, request, render_template, redirect, url_for, session
 import json
 import numpy as np
 import random
+import math
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity

@@ -12,67 +13,141 @@ app.secret_key = 'your_secret_key_here'  # Replace with a secure key in production
 with open('top_movies.json', 'r', encoding='utf-8') as f:
     movies = json.load(f)

-# Assign a unique ID and preprocess features for each movie
+# Preprocess each movie
 for i, movie in enumerate(movies):
-    movie['id'] = i  # Unique id for each movie
-    # Combine genres and tags into a feature string (could add description etc.)
+    movie['id'] = i  # Unique ID
+    # Combine genres and tags into one feature string.
     movie['features'] = ' '.join(movie.get('genres', [])) + ' ' + ' '.join(movie.get('tags', []))
-    # Ensure numeric values for year and runtime if possible:
+    # Ensure numeric values for year and runtime:
     try:
         movie['year_num'] = int(movie.get('year', '0'))
     except:
         movie['year_num'] = 0
     try:
-        # runtime might be a number already or a string; if string, try to convert.
         movie['runtime_num'] = float(movie.get('runtime')) if movie.get('runtime') else 0
     except:
         movie['runtime_num'] = 0
+    # Ensure vote_count is numeric.
+    try:
+        count = movie.get('vote_count', 0)
+        if isinstance(count, str):
+            count = count.replace(',', '')
+            if 'M' in count:
+                count = float(count.replace('M', '')) * 1e6
+            else:
+                count = int(count)
+        movie['vote_count'] = int(count)
+    except:
+        movie['vote_count'] = 0

 # Build the TF‑IDF vectorizer on movie features.
 vectorizer = TfidfVectorizer(stop_words='english')
 movie_features = [movie['features'] for movie in movies]
 movie_vectors = vectorizer.fit_transform(movie_features)

-# Precompute overall ranges for numeric features across the dataset.
+# Precompute overall ranges for numeric features.
 years = [m['year_num'] for m in movies if m['year_num'] > 0]
 runtimes = [m['runtime_num'] for m in movies if m['runtime_num'] > 0]
+max_vote = max([m['vote_count'] for m in movies]) if movies else 1

 min_year, max_year = (min(years), max(years)) if years else (0, 1)
 min_runtime, max_runtime = (min(runtimes), max(runtimes)) if runtimes else (0, 1)
 year_range = max_year - min_year if max_year != min_year else 1
 runtime_range = max_runtime - min_runtime if max_runtime != min_runtime else 1
+rating_range = 10.0  # Assuming ratings are on a 0–10 scale

-def get_diverse_movies(num=10):
+def get_predicted_movies(num=10):
     """
-    Pick up to `num` movies that have not been shown yet, trying to cover different genres.
+    Return up to `num` movies that haven't been shown yet.
+    Uses the user's past ratings to predict which unseen movies they might like.
+    If no ratings exist, falls back to random selection.
     """
     asked = session.get('asked_movies', [])
     available = [m for m in movies if m['id'] not in asked]
     if not available:
         return []
-    selected = []
-    # List of desired genres to cover
-    desired_genres = ["Action", "Adventure", "Comedy", "Drama", "Horror",
-                      "Romance", "Sci-Fi", "Thriller", "Animation", "Documentary"]
-    # Try to pick one movie per desired genre.
-    for genre in desired_genres:
-        for m in available:
-            if genre in m.get('genres', []) and m not in selected:
-                selected.append(m)
-                break
-        if len(selected) >= num:
-            break
-    # If we still need more movies, fill the remainder randomly.
-    if len(selected) < num:
-        remaining = [m for m in available if m not in selected]
-        random.shuffle(remaining)
-        selected.extend(remaining[:(num - len(selected))])
-    return selected[:num]
+    rated = session.get('rated_movies', {})
+    # Fallback to random selection if there are no like/dislike ratings.
+    if not rated or not any(r in ['like', 'dislike'] for r in rated.values()):
+        random.shuffle(available)
+        return available[:num]
+
+    # Build prediction profiles.
+    liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
+    disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']
+
+    if liked_ids:
+        liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
+    else:
+        liked_profile = np.zeros((1, movie_vectors.shape[1]))
+    if disliked_ids:
+        disliked_profile = np.asarray(movie_vectors[disliked_ids].mean(axis=0))
+    else:
+        disliked_profile = np.zeros((1, movie_vectors.shape[1]))
+
+    # Compute numeric averages for liked movies.
+    liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
+    liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
+    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
+
+    avg_year = np.mean(liked_years) if liked_years else None
+    avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
+    avg_rating = np.mean(liked_ratings) if liked_ratings else None
+
+    predictions = []
+    # Tunable weights.
+    w_text = 0.5
+    w_year = 0.1
+    w_runtime = 0.1
+    w_rating = 0.15
+    w_popularity = 0.15
+
+    for movie in available:
+        i = movie['id']
+        # TEXT SIMILARITY.
+        movie_vector = movie_vectors[i].toarray()
+        like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
+        dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
+        text_score = like_sim - dislike_sim

+        # YEAR SIMILARITY.
+        year_score = 0
+        if avg_year is not None and movie['year_num'] > 0:
+            diff_year = abs(movie['year_num'] - avg_year)
+            year_score = 1 - (diff_year / year_range)
+
+        # RUNTIME SIMILARITY.
+        runtime_score = 0
+        if avg_runtime is not None and movie['runtime_num'] > 0:
+            diff_runtime = abs(movie['runtime_num'] - avg_runtime)
+            runtime_score = 1 - (diff_runtime / runtime_range)
+
+        # RATING SIMILARITY.
+        rating_score = 0
+        movie_rating = movie.get('imdb_rating', 0)
+        if avg_rating is not None and movie_rating:
+            diff_rating = abs(movie_rating - avg_rating)
+            rating_score = 1 - (diff_rating / rating_range)
+
+        # POPULARITY SCORE.
+        popularity_score = 0
+        if movie['vote_count'] > 0:
+            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
+
+        # Final prediction score.
+        final_score = (w_text * text_score +
+                       w_year * year_score +
+                       w_runtime * runtime_score +
+                       w_rating * rating_score +
+                       w_popularity * popularity_score)
+        predictions.append((movie, final_score))
+
+    predictions.sort(key=lambda x: x[1], reverse=True)
+    return [pred[0] for pred in predictions[:num]]

 def enough_info():
     """
-    Determines whether we have collected enough ratings.
-    In this example, we require that the user has given a 'like' or 'dislike'
-    to at least 3 movies.
+    Check if the user has rated at least 3 movies (like/dislike).
     """
     rated = session.get('rated_movies', {})
     count = sum(1 for rating in rated.values() if rating in ['like', 'dislike'])
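The liked/disliked profiles introduced above are just row means of the TF-IDF matrix, compared against each unseen movie with cosine similarity. A minimal standalone sketch of that mechanism, using toy feature strings in place of the real genre/tag data (illustration only, not part of the commit):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy stand-ins for the movie['features'] strings built above.
docs = ["Action Adventure hero", "Romance Drama love", "Action Thriller spy"]
vectors = TfidfVectorizer(stop_words='english').fit_transform(docs)

liked_ids = [0, 2]  # pretend the user liked the two action titles
liked_profile = np.asarray(vectors[liked_ids].mean(axis=0))

# Score the remaining title against the liked profile.
sim = cosine_similarity(vectors[1].toarray(), liked_profile)[0][0]
print(sim)  # 0.0 here: the romance title shares no terms with the liked profile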
@@ -80,15 +155,13 @@ def enough_info():

 @app.route('/')
 def home():
-    # Initialize session variables
     session.setdefault('rated_movies', {})  # {movie_id: rating}
-    session.setdefault('asked_movies', [])  # list of movie ids already asked
+    session.setdefault('asked_movies', [])  # list of movie IDs already shown
     return redirect(url_for('questionnaire'))

 @app.route('/questionnaire', methods=['GET', 'POST'])
 def questionnaire():
     if request.method == 'POST':
-        # Process ratings from the current round.
         current_ids = request.form.getlist("movie_id")
         for movie_id in current_ids:
             rating = request.form.get(f"rating_{movie_id}")
@@ -101,25 +174,34 @@ def questionnaire():
         else:
             return redirect(url_for('questionnaire'))
     else:
-        selected_movies = get_diverse_movies(num=10)
+        # Use prediction to select movies for the questionnaire.
+        selected_movies = get_predicted_movies(num=10)
         if not selected_movies:
             return redirect(url_for('recommend'))
         return render_template('questionnaire.html', movies=selected_movies)

 def advanced_recommendations():
     """
-    Build an advanced recommendation score for movies not rated by the user.
+    Compute an advanced hybrid recommendation score on unseen movies.
+    Only movies not already shown (asked) are considered.
     Combines:
-      1. Text similarity (from TF-IDF features on genres/tags).
-      2. Year similarity: movies with similar release years to liked movies.
-      3. Runtime similarity: movies with similar runtime to liked movies.
-    The final score is a weighted sum of these signals.
+      1. Text similarity (TF‑IDF) between liked/disliked profiles.
+      2. Year similarity.
+      3. Runtime similarity.
+      4. Rating similarity.
+      5. Popularity (log-scaled vote count).
+    Returns the top 20 recommendations.
     """
     rated = session.get('rated_movies', {})
+    asked = set(session.get('asked_movies', []))
+    # Only consider movies that haven't been shown to the user.
+    available = [m for m in movies if m['id'] not in asked]
+    if not available:
+        available = movies  # Fallback if all movies have been shown.
+
     liked_ids = [int(mid) for mid, rating in rated.items() if rating == 'like']
     disliked_ids = [int(mid) for mid, rating in rated.items() if rating == 'dislike']

-    # Build text profiles for liked/disliked movies.
     if liked_ids:
         liked_profile = np.asarray(movie_vectors[liked_ids].mean(axis=0))
     else:
@@ -129,48 +211,56 @@ def advanced_recommendations():
     else:
         disliked_profile = np.zeros((1, movie_vectors.shape[1]))

-    # Compute numeric averages for liked movies (for year and runtime).
     liked_years = [movies[i]['year_num'] for i in liked_ids if movies[i]['year_num'] > 0]
     liked_runtimes = [movies[i]['runtime_num'] for i in liked_ids if movies[i]['runtime_num'] > 0]
+    liked_ratings = [movies[i].get('imdb_rating', 0) for i in liked_ids if movies[i].get('imdb_rating', 0)]
     avg_year = np.mean(liked_years) if liked_years else None
     avg_runtime = np.mean(liked_runtimes) if liked_runtimes else None
+    avg_rating = np.mean(liked_ratings) if liked_ratings else None

     recommendations = []
-    # Weights for each component – adjust these to tune the algorithm.
-    w_text = 0.70
-    w_year = 0.15
-    w_runtime = 0.15
+    w_text = 0.5
+    w_year = 0.1
+    w_runtime = 0.1
+    w_rating = 0.15
+    w_popularity = 0.15

-    for i, movie in enumerate(movies):
-        movie_id = str(movie['id'])
-        if rated.get(movie_id, "not seen") != "not seen":
-            continue  # Skip movies already rated.
+    for movie in available:
+        i = movie['id']

-        # TEXT SIMILARITY: difference between similarity to liked and disliked profiles.
         movie_vector = movie_vectors[i].toarray()
         like_sim = cosine_similarity(movie_vector, liked_profile)[0][0] if np.linalg.norm(liked_profile) != 0 else 0
         dislike_sim = cosine_similarity(movie_vector, disliked_profile)[0][0] if np.linalg.norm(disliked_profile) != 0 else 0
         text_score = like_sim - dislike_sim

-        # NUMERIC SIMILARITY for Year.
         year_score = 0
         if avg_year is not None and movie['year_num'] > 0:
             diff_year = abs(movie['year_num'] - avg_year)
-            year_score = 1 - (diff_year / year_range)  # normalized similarity (1 means identical)
+            year_score = 1 - (diff_year / year_range)

-        # NUMERIC SIMILARITY for Runtime.
         runtime_score = 0
         if avg_runtime is not None and movie['runtime_num'] > 0:
             diff_runtime = abs(movie['runtime_num'] - avg_runtime)
             runtime_score = 1 - (diff_runtime / runtime_range)

-        # Final combined score.
-        final_score = w_text * text_score + w_year * year_score + w_runtime * runtime_score
+        rating_score = 0
+        movie_rating = movie.get('imdb_rating', 0)
+        if avg_rating is not None and movie_rating:
+            diff_rating = abs(movie_rating - avg_rating)
+            rating_score = 1 - (diff_rating / rating_range)
+
+        popularity_score = 0
+        if movie['vote_count'] > 0:
+            popularity_score = math.log(movie['vote_count'] + 1) / math.log(max_vote + 1)
+
+        final_score = (w_text * text_score +
+                       w_year * year_score +
+                       w_runtime * runtime_score +
+                       w_rating * rating_score +
+                       w_popularity * popularity_score)
         recommendations.append((movie, final_score))

-    # Sort recommendations by final score in descending order.
     recommendations.sort(key=lambda x: x[1], reverse=True)
-    return recommendations
+    return recommendations[:20]

 @app.route('/recommend')
 def recommend():
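Both scoring paths added in this commit (get_predicted_movies and advanced_recommendations) blend the same five normalized signals with weights 0.5 / 0.1 / 0.1 / 0.15 / 0.15. A small sketch with made-up component values, just to show how the weighting plays out (illustration only, not part of the commit):

# Toy component values; in app.py these come from TF-IDF similarity,
# the liked-movie averages, and the log-scaled vote count.
text_score = 0.42        # like_sim - dislike_sim
year_score = 0.80        # 1 - |year - avg_year| / year_range
runtime_score = 0.75     # 1 - |runtime - avg_runtime| / runtime_range
rating_score = 0.90      # 1 - |imdb_rating - avg_rating| / rating_range
popularity_score = 0.65  # log(vote_count + 1) / log(max_vote + 1)

w_text, w_year, w_runtime, w_rating, w_popularity = 0.5, 0.1, 0.1, 0.15, 0.15
final_score = (w_text * text_score + w_year * year_score + w_runtime * runtime_score +
               w_rating * rating_score + w_popularity * popularity_score)
print(final_score)  # ≈ 0.5975; the text-similarity term dominates under these weights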
@@ -179,3 +269,5 @@ def recommend():

 if __name__ == '__main__':
     app.run(debug=True)
+
+
test.py (105 lines changed)
@@ -2,6 +2,7 @@ import requests
 import json
 import time
 from tqdm import tqdm  # progress bar library
+import concurrent.futures

 # Replace with your actual TMDb API key
 api_key = "96f3424d6fe55c2982e6e094416607f5"
@@ -60,11 +61,55 @@ def get_movie_keywords(movie_id):
         print(f"Exception while fetching keywords for movie {movie_id}: {e}")
     return keywords

+def process_movie(movie, page, idx, results_per_page):
+    """
+    Processes a single movie record:
+      - Computes its ranking,
+      - Extracts basic information,
+      - Fetches additional details and keywords.
+    """
+    ranking = (page - 1) * results_per_page + idx + 1
+    movie_id = movie.get("id")
+    title = movie.get("title")
+    release_date = movie.get("release_date", "")
+    year = release_date.split("-")[0] if release_date else None
+    vote_average = movie.get("vote_average")
+    vote_count = movie.get("vote_count")
+    overview = movie.get("overview")
+    poster_path = movie.get("poster_path")
+    poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
+    tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"
+
+    # Get additional details and keywords.
+    details = get_movie_details_tmdb(movie_id)
+    runtime = details.get("runtime")
+    genres = details.get("genres", [])
+
+    tags = get_movie_keywords(movie_id)
+
+    movie_data = {
+        "ranking": ranking,
+        "title": title,
+        "year": year,
+        "runtime": runtime,
+        "content_rating": None,  # Not available via TMDb by default.
+        "metascore": None,  # Not applicable.
+        "imdb_rating": vote_average,  # Using TMDb's vote average.
+        "vote_count": vote_count,
+        "description": overview,
+        "poster": poster,
+        "url": tmdb_url,
+        "genres": genres,
+        "tags": tags
+    }
+    # Brief sleep to help throttle requests
+    time.sleep(0.2)
+    return movie_data
+
 def get_top_movies():
     """
-    Uses the TMDb API to retrieve top rated movies, then iterates through all pages.
-    For each movie, additional details and keywords are fetched.
-    After processing each page, the current movies list is saved to a JSON file.
+    Uses the TMDb API to retrieve top-rated movies and processes them concurrently.
+    After processing each page, the current list of movies is written to a JSON file.
     """
     movies = []
     base_url = "https://api.themoviedb.org/3/movie/top_rated"
@@ -91,47 +136,21 @@ def get_top_movies():
             continue
         data = response.json()
         results = data.get("results", [])
-        for idx, movie in enumerate(results):
-            # Ranking is computed by overall order.
-            ranking = (page - 1) * len(results) + idx + 1
-            movie_id = movie.get("id")
-            title = movie.get("title")
-            release_date = movie.get("release_date", "")
-            year = release_date.split("-")[0] if release_date else None
-            vote_average = movie.get("vote_average")
-            vote_count = movie.get("vote_count")
-            overview = movie.get("overview")
-            poster_path = movie.get("poster_path")
-            poster = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else None
-            tmdb_url = f"https://www.themoviedb.org/movie/{movie_id}"
-
-            # Get additional details: runtime and genres.
-            details = get_movie_details_tmdb(movie_id)
-            runtime = details.get("runtime")
-            genres = details.get("genres", [])
-
-            # Get keywords (tags).
-            tags = get_movie_keywords(movie_id)
-
-            movie_data = {
-                "ranking": ranking,
-                "title": title,
-                "year": year,
-                "runtime": runtime,
-                "content_rating": None,  # Not available via TMDb by default.
-                "metascore": None,  # Not applicable.
-                "imdb_rating": vote_average,  # Using TMDb's vote average.
-                "vote_count": vote_count,
-                "description": overview,
-                "poster": poster,
-                "url": tmdb_url,
-                "genres": genres,
-                "tags": tags
-            }
-            movies.append(movie_data)
-            # Pause a bit between detail requests to be courteous.
-            time.sleep(0.2)
-        # After processing each page, write the current movies list to the JSON file.
+        results_per_page = len(results)
+
+        # Process each movie concurrently using a thread pool.
+        with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
+            futures = []
+            for idx, movie in enumerate(results):
+                futures.append(executor.submit(process_movie, movie, page, idx, results_per_page))
+            # Collect results as they complete.
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    movie_data = future.result()
+                    movies.append(movie_data)
+                except Exception as e:
+                    print(f"Error processing movie: {e}")
+        # Write movies to JSON file incrementally after each page.
         write_movies(movies)
         # Pause between pages.
         time.sleep(0.5)
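The rewritten page loop fans each movie out to a thread pool and collects results with as_completed. A stripped-down, self-contained sketch of that pattern, with a dummy worker standing in for process_movie (illustration only, not part of the commit):

import concurrent.futures
import time

def fake_process(idx):
    # Stand-in for process_movie: simulate a slow network call, then return a record.
    time.sleep(0.1)
    return {"idx": idx}

records = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(fake_process, i) for i in range(20)]
    for future in concurrent.futures.as_completed(futures):
        try:
            records.append(future.result())
        except Exception as e:
            print(f"Error processing item: {e}")

print(len(records))  # 20; completion order is not submission order

Because as_completed yields futures in completion order, the collected list is unordered; that is fine here since each record carries its own precomputed ranking.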
top_movies.json (96832 lines changed)
File diff suppressed because it is too large