Sentiment-based Movie Recommender Python
👤 Sharing: AI
```python
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# Make sure the VADER lexicon is available before any sentiment analysis runs.
# nltk.download() never raises LookupError (it reports failure through its
# return value), so probe the local NLTK data path instead and only download
# when the resource is genuinely missing. This also avoids a network round
# trip on every import once the lexicon is installed.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    print("Vader lexicon not found. Downloading...")
    nltk.download('vader_lexicon')
def load_movie_data(filepath):
    """Read the movie CSV at ``filepath`` into a pandas DataFrame.

    Returns the DataFrame on success. If the file is missing or cannot be
    read, an error message is printed and None is returned instead.
    """
    movies = None
    try:
        movies = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except Exception as exc:
        # Any other read failure is reported but not propagated
        print(f"Error loading data: {exc}")
    return movies
def clean_data(df):
    """Fill missing values in every column of ``df`` and return it.

    Numeric columns have their gaps replaced by the column mean; all other
    columns get an empty string so they can be fed to TF-IDF vectorization.
    The DataFrame is modified in place and the same object is returned.
    """
    for name in df.columns:
        column = df[name]
        if pd.api.types.is_numeric_dtype(column):
            # The mean is used here; the median is an alternative if appropriate
            replacement = column.mean()
        else:
            replacement = ''
        df[name] = column.fillna(replacement)
    return df
# Shared VADER analyzer, created on first use. Constructing a
# SentimentIntensityAnalyzer reloads the lexicon, so building one per call
# (this function is applied row by row) is needlessly expensive.
_SENTIMENT_ANALYZER = None


def calculate_sentiment(text):
    """Calculate VADER sentiment scores for a given text.

    Args:
        text (str): The text to score.

    Returns:
        dict: The scores produced by ``polarity_scores`` for ``text``; callers
            in this file read the 'compound' entry.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER.polarity_scores(text)
def extract_features(df, text_column='overview'):
    """Fit a TF-IDF vectorizer on ``df[text_column]``.

    Returns a ``(matrix, vectorizer)`` tuple: the TF-IDF matrix for the column
    and the fitted vectorizer that produced it.
    """
    # English stop words are dropped so common filler words do not dominate
    vectorizer = TfidfVectorizer(stop_words='english')
    documents = df[text_column]
    features = vectorizer.fit_transform(documents)
    return features, vectorizer
def calculate_similarity(tfidf_matrix):
    """Return the pairwise cosine similarity matrix for the rows of ``tfidf_matrix``."""
    return cosine_similarity(tfidf_matrix)
def get_recommendations(movie_title, df, similarity_matrix, num_recommendations=10, sentiment_column='overview'):
    """
    Recommends movies based on similarity and sentiment.

    Args:
        movie_title (str): The title of the movie to base recommendations on.
        df (pd.DataFrame): The DataFrame containing movie data. Must have a
            'title' column and the column named by ``sentiment_column``.
        similarity_matrix (np.ndarray): The cosine similarity matrix, whose
            rows/columns follow the row order of ``df``.
        num_recommendations (int, optional): The maximum number of
            recommendations to return. Defaults to 10. Fewer rows are returned
            when the dataset does not contain that many other movies.
        sentiment_column (str, optional): The column containing the text to use
            for sentiment analysis. Defaults to 'overview'.

    Returns:
        pd.DataFrame: A copy of the recommended rows with added
            'sentiment_scores', 'compound_sentiment' and 'combined_score'
            columns, sorted by 'combined_score' in descending order.
            Returns None (after printing a message) if the target movie is
            not found.
    """
    # Locate the movie by row POSITION: the similarity matrix is positional,
    # so an index label would be wrong whenever df has a non-default index.
    title_matches = (df['title'] == movie_title).fillna(False).to_numpy(dtype=bool)
    matching_positions = np.flatnonzero(title_matches)
    if matching_positions.size == 0:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return None
    movie_position = int(matching_positions[0])

    # Similarity of every movie to the target movie, as a flat float array
    similarities = np.asarray(similarity_matrix[movie_position], dtype=float).ravel()

    # Rank by descending similarity (stable, so ties keep dataset order) and
    # drop the target movie explicitly rather than assuming it sorts first.
    ranked_positions = np.argsort(-similarities, kind='stable')
    ranked_positions = ranked_positions[ranked_positions != movie_position]
    top_positions = ranked_positions[:max(int(num_recommendations), 0)]

    # .copy() so the added columns never write back into the caller's frame
    recommended_movies = df.iloc[top_positions].copy()

    # Calculate sentiment scores for the recommended movies
    recommended_movies['sentiment_scores'] = recommended_movies[sentiment_column].apply(calculate_sentiment)
    recommended_movies['compound_sentiment'] = (
        recommended_movies['sentiment_scores'].apply(lambda scores: scores['compound']).astype(float)
    )

    # Combine similarity and sentiment scores to rank recommendations.
    # Adjust weights as needed to prioritize similarity or sentiment.
    similarity_weight = 0.7
    sentiment_weight = 0.3
    # Both operands are NumPy arrays in row order, so the arithmetic is
    # element-wise and independent of the DataFrame's index labels.
    recommended_movies['combined_score'] = (
        similarity_weight * similarities[top_positions]
        + sentiment_weight * recommended_movies['compound_sentiment'].to_numpy(dtype=float)
    )

    # Sort the recommended movies by the combined score
    return recommended_movies.sort_values(by='combined_score', ascending=False)
def main(filepath, movie_title):
    """
    Run the full pipeline: load, clean, vectorize, compare, recommend, print.

    Args:
        filepath (str): The path to the CSV file containing movie data.
        movie_title (str): The title of the movie for which to get recommendations.
    """
    movies = load_movie_data(filepath)
    if movies is None:
        # Loading failed; the loader has already reported why
        return

    movies = clean_data(movies)
    features, _vectorizer = extract_features(movies)
    similarities = calculate_similarity(features)

    results = get_recommendations(movie_title, movies, similarities)
    if results is None:
        return

    print(f"Recommendations for '{movie_title}':")
    # Only the columns relevant to the ranking are displayed
    print(results[['title', 'compound_sentiment', 'combined_score']])
if __name__ == "__main__":
    # Point FILEPATH at your movie dataset (a CSV file) and set MOVIE_TITLE to
    # the movie you want recommendations for.
    FILEPATH = 'movies_metadata.csv'
    MOVIE_TITLE = 'The Dark Knight'
    main(filepath=FILEPATH, movie_title=MOVIE_TITLE)
```
👁️ Viewed: 10
Comments