AI-Based Email Classifier in Python
👤 Sharing: AI
```python
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Download necessary NLTK resources (if not already downloaded)
# Each pair is (path checked in the local NLTK data directory, package to fetch).
# A resource is downloaded only when the lookup for it fails.
for resource_path, package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
# --- 1. Data Loading and Preprocessing ---
def load_data(filepath):
    """Load the email dataset from a CSV file.

    Args:
        filepath: Path to a CSV file that must contain 'text' and 'label'
            columns.

    Returns:
        The loaded pandas DataFrame, or None when the file does not exist,
        cannot be read, or lacks a required column. An error message is
        printed in each failure case.
    """
    required_columns = ('text', 'label')
    try:
        data = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except Exception as e:  # best-effort loader: report any other read failure
        print(f"An unexpected error occurred: {e}")
        return None
    # read_csv does not raise KeyError for absent columns, so the required
    # columns have to be checked explicitly here.
    if any(column not in data.columns for column in required_columns):
        print("Error: CSV file must contain 'text' and 'label' columns.")
        return None
    return data
# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
# Filled on first use so the stop-word list is read and the lemmatizer is
# built once, instead of once per processed text.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Store both entries together so a failure above leaves the cache empty.
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean and normalize a piece of text.

    The text is lowercased, stripped of every character that is neither a
    word character nor whitespace, split on whitespace, filtered against the
    English stop-word list, and each remaining word is lemmatized.

    Args:
        text: The string to normalize.

    Returns:
        The surviving lemmatized words joined by single spaces.
    """
    stop_words, lemmatizer = _get_preprocess_resources()
    cleaned = _PUNCTUATION_PATTERN.sub('', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in cleaned.split()
        if word not in stop_words
    )
# --- 2. Feature Extraction ---
def create_tfidf_features(train_texts, test_texts, max_features=5000):
    """Turn training and test texts into TF-IDF feature matrices.

    Args:
        train_texts: Texts the vectorizer is fitted on and then transformed.
        test_texts: Texts transformed with the already-fitted vectorizer.
        max_features: Upper bound on the vocabulary size given to the
            vectorizer.

    Returns:
        A tuple (train matrix, test matrix, fitted vectorizer). The
        vectorizer is returned so later inputs can be transformed the same way.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    # Fitting happens on the training texts only; the test texts are merely
    # transformed with what was learned there.
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)
    return train_matrix, test_matrix, vectorizer
# --- 3. Model Training ---
def train_model(train_tfidf, train_labels, alpha=1.0):
    """Fit a Multinomial Naive Bayes classifier on the training features.

    Args:
        train_tfidf: Feature matrix for the training samples.
        train_labels: Labels matching the rows of train_tfidf.
        alpha: Smoothing parameter forwarded to MultinomialNB.

    Returns:
        The fitted MultinomialNB model.
    """
    # fit() hands back the estimator itself, so build and train in one step.
    return MultinomialNB(alpha=alpha).fit(train_tfidf, train_labels)
# --- 4. Model Evaluation ---
def evaluate_model(model, test_tfidf, test_labels):
    """Report how the model performs on the test set.

    Prints the accuracy and the classification report, then shows the
    confusion matrix as a heatmap.

    Args:
        model: Fitted classifier exposing predict().
        test_tfidf: Feature matrix for the test samples.
        test_labels: True labels matching the rows of test_tfidf.
    """
    predicted = model.predict(test_tfidf)

    print(f"Accuracy: {accuracy_score(test_labels, predicted):.4f}")
    print("\nClassification Report:")
    print(classification_report(test_labels, predicted))

    # Draw the confusion matrix on an explicit Axes object.
    matrix = confusion_matrix(test_labels, predicted)
    _, axes = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=axes)
    axes.set_xlabel('Predicted')
    axes.set_ylabel('Actual')
    axes.set_title('Confusion Matrix')
    plt.show()
# --- 5. Prediction ---
def predict_email(email_text, tfidf_vectorizer, model):
    """Classify one email and return its predicted label.

    Args:
        email_text: Raw text of the email.
        tfidf_vectorizer: Fitted vectorizer used to transform the cleaned text.
        model: Fitted classifier exposing predict().

    Returns:
        The first (and only) element of the model's prediction for this email.
    """
    cleaned = preprocess_text(email_text)
    # The vectorizer works on a collection of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([cleaned])
    predicted_labels = model.predict(features)
    return predicted_labels[0]
# --- Main Function ---
def main(filepath):
    """Load the dataset, train the email classifier, evaluate it, and run a demo prediction.

    Args:
        filepath: Path to a CSV file with 'text' and 'label' columns.

    Returns:
        None. The function returns early, after printing a message, when the
        data cannot be loaded, lacks a required column, or has too few usable
        rows to split.
    """
    data = load_data(filepath)
    if data is None:
        return  # Exit if loading failed

    required_columns = ['text', 'label']
    if any(column not in data.columns for column in required_columns):
        print("Error: CSV file must contain 'text' and 'label' columns.")
        return

    # Only the columns the model uses decide whether a row is usable; a gap in
    # an unrelated column must not discard the row.
    data = data.dropna(subset=required_columns)
    if len(data) < 2:
        # train_test_split needs at least one row on each side of the split.
        print("Error: Not enough usable rows to split into training and test sets.")
        return

    # Coerce both columns to strings before any text processing or training.
    X = data['text'].astype(str).apply(preprocess_text)
    y = data['label'].astype(str)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_tfidf, X_test_tfidf, tfidf_vectorizer = create_tfidf_features(X_train, X_test)
    model = train_model(X_train_tfidf, y_train)
    evaluate_model(model, X_test_tfidf, y_test)

    # Example prediction on a hand-written email.
    example_email = "This is a test email about a great offer. Click here to claim your prize!"
    prediction = predict_email(example_email, tfidf_vectorizer, model)
    print(f"\nExample email prediction: {prediction}")
if __name__ == "__main__":
    import sys

    # Use the CSV path given as the first command-line argument when present;
    # otherwise fall back to the placeholder path, which should be replaced
    # with the actual path to your CSV file.
    main(sys.argv[1] if len(sys.argv) > 1 else 'your_email_data.csv')
```
👁️ Viewed: 10
Comments