AI-Based Email Classifier in Python

👤 Sharing: AI
```python
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Check for each NLTK resource in turn and download it only when the local
# lookup fails. Each entry pairs the lookup path with the download package name.
for _resource_path, _package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)



from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# --- 1. Data Loading and Preprocessing ---

def load_data(filepath):
    """Load the email dataset from a CSV file.

    Args:
        filepath: Path to a CSV file that must contain 'text' and 'label'
            columns.

    Returns:
        The loaded DataFrame, or None when the file does not exist, cannot be
        read, or lacks a required column. An error message is printed in each
        failure case.
    """
    try:
        data = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

    # read_csv does not raise KeyError for absent columns, so the schema has
    # to be checked explicitly for the promised error message to ever appear.
    if not {'text', 'label'}.issubset(data.columns):
        print("Error: CSV file must contain 'text' and 'label' columns.")
        return None

    return data



# Filled on first use so the stopword corpus is read and the lemmatizer is
# constructed once per process rather than once per processed email.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop_words, lemmatizer) pair, building it if needed."""
    if not _TEXT_RESOURCES:
        # Build both before storing either, so a failed lookup leaves the
        # cache empty and the next call retries from scratch.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = stop_words
        _TEXT_RESOURCES['lemmatizer'] = lemmatizer
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean and normalize one text string for vectorization.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the result is split on whitespace, English
    stopwords are dropped, and each remaining token is lemmatized.

    Args:
        text: The raw text to clean; must be a str.

    Returns:
        The processed tokens joined by single spaces.
    """
    stop_words, lemmatizer = _get_text_resources()

    # \w keeps letters, digits and underscores; everything else that is not
    # whitespace is stripped.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in cleaned.split()
        if word not in stop_words
    )



# --- 2. Feature Extraction ---

def create_tfidf_features(train_texts, test_texts, max_features=5000):
    """Turn training and test texts into TF-IDF matrices.

    The vectorizer is fitted on the training texts only; the test texts are
    transformed with that already-fitted vectorizer.

    Args:
        train_texts: Texts used to fit the vectorizer.
        test_texts: Texts transformed with the fitted vectorizer.
        max_features: Passed to TfidfVectorizer as its max_features limit.

    Returns:
        A tuple (train matrix, test matrix, fitted vectorizer). The vectorizer
        is returned so new texts can be transformed later.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)
    return train_matrix, test_matrix, vectorizer


# --- 3. Model Training ---

def train_model(train_tfidf, train_labels, alpha=1.0):
    """Fit a Multinomial Naive Bayes classifier.

    Args:
        train_tfidf: Feature matrix for the training samples.
        train_labels: Labels for the training samples.
        alpha: Smoothing parameter passed to MultinomialNB.

    Returns:
        The fitted MultinomialNB model.
    """
    classifier = MultinomialNB(alpha=alpha)
    classifier.fit(train_tfidf, train_labels)
    return classifier


# --- 4. Model Evaluation ---

def evaluate_model(model, test_tfidf, test_labels):
    """Report how the model performs on the test set.

    Prints the accuracy and a classification report, then displays the
    confusion matrix as a heatmap.

    Args:
        model: A fitted classifier exposing predict().
        test_tfidf: Feature matrix for the test samples.
        test_labels: True labels for the test samples.
    """
    predicted = model.predict(test_tfidf)

    print(f"Accuracy: {accuracy_score(test_labels, predicted):.4f}")

    print("\nClassification Report:")
    print(classification_report(test_labels, predicted))

    # Plot the confusion matrix (computed with true labels first, predictions second).
    matrix = confusion_matrix(test_labels, predicted)
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()


# --- 5. Prediction ---

def predict_email(email_text, tfidf_vectorizer, model):
    """Classify a single email.

    Args:
        email_text: Raw text of the email.
        tfidf_vectorizer: Fitted vectorizer used to transform the text.
        model: Fitted classifier exposing predict().

    Returns:
        The predicted label for the email.
    """
    cleaned = preprocess_text(email_text)
    # The vectorizer expects a collection of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([cleaned])
    return model.predict(features)[0]


# --- Main Function ---

def main(filepath):
    """Load the dataset, train the email classifier, evaluate it, and run a demo.

    Steps: load the CSV, keep rows that have both a text and a label,
    preprocess the text, split 80/20 into train and test sets, build TF-IDF
    features, train the Naive Bayes model, print and plot the evaluation, and
    finally classify one example email.

    Args:
        filepath: Path to a CSV file with 'text' and 'label' columns.

    Returns:
        None. Returns early, after printing an error, when the data cannot be
        loaded, lacks a required column, or has too few usable rows.
    """
    data = load_data(filepath)
    if data is None:
        return  # Exit if loading failed

    required_columns = ['text', 'label']
    if any(column not in data.columns for column in required_columns):
        print("Error: CSV file must contain 'text' and 'label' columns.")
        return

    # Only the columns the classifier uses decide whether a row is usable;
    # a gap in some unrelated column must not discard a valid example.
    data = data.dropna(subset=required_columns)

    # A train/test split needs at least one row on each side.
    if len(data) < 2:
        print("Error: At least two rows with both 'text' and 'label' values are required.")
        return

    data['text'] = data['text'].astype(str)  # Ensure text column is string type
    data['label'] = data['label'].astype(str)  # Ensure label column is string type

    data['processed_text'] = data['text'].apply(preprocess_text)

    X = data['processed_text']
    y = data['label']

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train_tfidf, X_test_tfidf, tfidf_vectorizer = create_tfidf_features(X_train, X_test)

    model = train_model(X_train_tfidf, y_train)

    evaluate_model(model, X_test_tfidf, y_test)

    # Example Prediction
    example_email = "This is a test email about a great offer.  Click here to claim your prize!"
    prediction = predict_email(example_email, tfidf_vectorizer, model)
    print(f"\nExample email prediction: {prediction}")



if __name__ == "__main__":
    # Placeholder path: point this at the CSV file holding your email data.
    csv_path = 'your_email_data.csv'
    main(csv_path)
```
👁️ Viewed: 10

Comments