Sentiment-Based Movie Recommender in Python

👤 Sharing: AI
```python
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Ensure the VADER lexicon is installed locally. Probe for it first and only
# download when the lookup fails: nltk.download() reports a failure through its
# return value rather than by raising LookupError, so wrapping the download
# itself in try/except would re-run the downloader on every start.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    print("Vader lexicon not found. Downloading...")
    nltk.download('vader_lexicon')


def load_movie_data(filepath):
    """Read the movie CSV at ``filepath`` into a pandas DataFrame.

    Args:
        filepath (str): Location of the CSV file.

    Returns:
        pd.DataFrame: The parsed data, or None when the file is missing or
        cannot be read (a message is printed in either case).
    """
    movies = None
    try:
        movies = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except Exception as err:
        # Any other read/parse problem is reported and treated as "no data".
        print(f"Error loading data: {err}")
    return movies


def clean_data(df):
    """Fill missing values column by column, modifying ``df`` in place.

    Numeric columns receive their own mean; every other column receives an
    empty string so that text columns can be passed to TF-IDF vectorization.

    Args:
        df (pd.DataFrame): The movie data to clean.

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in.
    """
    for name in df.columns:
        column = df[name]
        # The mean is only computed for numeric columns.
        replacement = column.mean() if pd.api.types.is_numeric_dtype(column) else ''
        df[name] = column.fillna(replacement)

    return df



# Lazily created analyzer shared by every calculate_sentiment() call.
_SENTIMENT_ANALYZER = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def calculate_sentiment(text):
    """Calculate VADER sentiment scores for a given text.

    The analyzer is built once and reused: this function is applied row by
    row, and constructing a new analyzer per call would repeat the lexicon
    setup for every row.

    Args:
        text (str): The text to score.

    Returns:
        dict: The scores returned by ``polarity_scores``; callers in this
        file read the 'compound' entry.
    """
    return _get_sentiment_analyzer().polarity_scores(text)


def extract_features(df, text_column='overview'):
    """Build a TF-IDF representation of one text column.

    Args:
        df (pd.DataFrame): The movie data.
        text_column (str, optional): Column holding the text to vectorize.
            Defaults to 'overview'.

    Returns:
        tuple: ``(tfidf_matrix, vectorizer)`` - the fitted-and-transformed
        matrix and the TfidfVectorizer that produced it.
    """
    # English stop words are dropped so common words do not dominate the features.
    vectorizer = TfidfVectorizer(stop_words='english')
    return vectorizer.fit_transform(df[text_column]), vectorizer


def calculate_similarity(tfidf_matrix):
    """Return the pairwise cosine similarity of the rows of ``tfidf_matrix``."""
    return cosine_similarity(tfidf_matrix)


def get_recommendations(movie_title, df, similarity_matrix, num_recommendations=10, sentiment_column='overview'):
    """
    Recommends movies based on similarity and sentiment.

    The most similar movies to ``movie_title`` are selected first, then
    re-ranked by ``0.7 * similarity + 0.3 * compound sentiment``.

    Args:
        movie_title (str): The title of the movie to base recommendations on.
        df (pd.DataFrame): The DataFrame containing movie data. Row ``i`` of
            ``df`` must correspond to row ``i`` of ``similarity_matrix``.
        similarity_matrix (np.ndarray): The cosine similarity matrix.
        num_recommendations (int, optional): The maximum number of recommendations to return. Defaults to 10.
        sentiment_column (str, optional): The column containing the text to use for sentiment analysis. Defaults to 'overview'.

    Returns:
        pd.DataFrame: The recommended movies with added 'sentiment_scores',
        'compound_sentiment' and 'combined_score' columns, sorted by
        'combined_score' descending. Fewer rows than requested are returned
        when the dataset is small. Returns None if the target movie is not found.
    """
    # Locate the movie by *position*, not index label: the similarity matrix
    # and df.iloc are positional, so a non-default index must not leak in.
    matches = np.flatnonzero((df['title'] == movie_title).to_numpy())
    if matches.size == 0:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return None
    movie_pos = int(matches[0])

    # Similarity of every movie to the target movie.
    scores = np.asarray(similarity_matrix[movie_pos], dtype=float).ravel()

    # Rank by descending similarity (stable, so ties keep dataset order) and
    # drop the target movie explicitly instead of assuming it sorts first.
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != movie_pos]
    top_positions = ranked[:max(num_recommendations, 0)]

    # .copy() prevents SettingWithCopyWarning when adding columns below.
    recommended_movies = df.iloc[top_positions].copy()

    # Sentiment scores for the recommended movies.
    recommended_movies['sentiment_scores'] = recommended_movies[sentiment_column].apply(calculate_sentiment)
    recommended_movies['compound_sentiment'] = recommended_movies['sentiment_scores'].apply(lambda x: x['compound'])

    # Combine similarity and sentiment; adjust the weights to prioritize one
    # or the other. Plain arrays are used so the values are matched by
    # position and the lengths always agree with the selected rows.
    similarity_weight = 0.7
    sentiment_weight = 0.3
    recommended_movies['combined_score'] = (
        similarity_weight * scores[top_positions]
        + sentiment_weight * recommended_movies['compound_sentiment'].to_numpy(dtype=float)
    )

    return recommended_movies.sort_values(by='combined_score', ascending=False)



def main(filepath, movie_title):
    """
    Run the full pipeline: load, clean, vectorize, compare, recommend, print.

    Args:
        filepath (str): The path to the CSV file containing movie data.
        movie_title (str): The title of the movie for which to get recommendations.
    """
    movies = load_movie_data(filepath)
    if movies is None:
        # Loading already reported the problem; nothing more to do.
        return

    movies = clean_data(movies)

    # The fitted vectorizer is returned too, but only the matrix is needed here.
    tfidf_matrix, _ = extract_features(movies)
    recommendations = get_recommendations(
        movie_title,
        movies,
        calculate_similarity(tfidf_matrix),
    )
    if recommendations is None:
        return

    print(f"Recommendations for '{movie_title}':")
    # Show only the columns relevant to the ranking.
    print(recommendations[['title', 'compound_sentiment', 'combined_score']])


if __name__ == "__main__":
    # Path to the movie dataset CSV; replace 'movies_metadata.csv' with your
    # own file. The pipeline reads its 'title' and 'overview' columns.
    FILEPATH = 'movies_metadata.csv'
    # Title of the movie to base the recommendations on.
    MOVIE_TITLE = 'The Dark Knight'

    main(filepath=FILEPATH, movie_title=MOVIE_TITLE)
```
👁️ Viewed: 10

Comments