Real-Time Emotion Detection System Using Voice Tone Analysis (Python)
```python
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import sounddevice as sd
import wavio
import os


# --- 1. Feature Extraction Function ---
def extract_features(audio_path):
    """
    Extracts relevant audio features from a sound file.

    Args:
        audio_path (str): The path to the audio file.

    Returns:
        numpy.ndarray: A feature vector containing MFCCs, Chroma, Mel, Contrast, and Tonnetz.
        Returns None if the file cannot be loaded.
    """
    try:
        y, sr = librosa.load(audio_path)  # Load the audio file (resampled to librosa's default 22,050 Hz)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return None

    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)       # MFCCs (Mel-Frequency Cepstral Coefficients)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)          # Chroma feature
    mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)          # Mel spectrogram
    contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)  # Spectral contrast
    tonnetz = np.mean(librosa.feature.tonnetz(y=y, sr=sr).T, axis=0)             # Tonnetz

    # Combine all features into a single feature vector
    return np.hstack([mfccs, chroma, mel, contrast, tonnetz])


# --- 2. Data Preparation (Example) ---
# Replace this with your actual data loading logic. Crucially, you need labeled audio data.
# This example assumes you have folders named 'happy', 'sad', 'angry', etc. containing audio files.
def load_data(data_path):
    """
    Loads audio data and labels from a directory structure.

    Assumes a directory structure like:
        data_path/
            happy/
                audio1.wav
                audio2.wav
                ...
            sad/
                audio1.wav
                audio2.wav
                ...
            ...

    Args:
        data_path (str): The path to the root directory containing emotion folders.

    Returns:
        tuple: A tuple containing features (X) and labels (y).
               X is a list of feature vectors, and y is a list of corresponding labels.
               Returns (None, None) if an error occurs during loading.
    """
    X = []  # List to store features
    y = []  # List to store labels

    try:
        # Get all emotion folders (one sub-directory per emotion label).
        emotion_folders = [f for f in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, f))]
    except OSError as e:
        print(f"Error reading data directory: {e}")
        return None, None
    print(f"Emotion folders found: {emotion_folders}")

    for emotion in emotion_folders:
        emotion_path = os.path.join(data_path, emotion)
        for filename in os.listdir(emotion_path):
            if filename.endswith('.wav'):  # Process only WAV files
                file_path = os.path.join(emotion_path, filename)
                features = extract_features(file_path)
                if features is not None:  # Skip if feature extraction failed
                    X.append(features)
                    y.append(emotion)  # Use the folder name as the label
    return X, y


# Example usage (replace with your actual data path):
data_path = 'audio_data'  # <--- **IMPORTANT: Replace with the actual path to your audio data.**

# Create dummy audio_data if it doesn't exist (for demonstration purposes)
if not os.path.exists(data_path):
    print("Creating dummy audio data...")
    os.makedirs(os.path.join(data_path, "happy"), exist_ok=True)
    os.makedirs(os.path.join(data_path, "sad"), exist_ok=True)

    # Create short silent WAV files for demonstration
    import wave

    def create_dummy_wav(filepath):
        with wave.open(filepath, 'w') as wave_file:
            wave_file.setnchannels(1)      # Mono
            wave_file.setsampwidth(2)      # 2 bytes (16 bits)
            wave_file.setframerate(44100)  # Standard frame rate
            wave_file.writeframes(b'\x00\x00' * 44100)  # One second of silence

    create_dummy_wav(os.path.join(data_path, "happy", "dummy_happy.wav"))
    create_dummy_wav(os.path.join(data_path, "sad", "dummy_sad.wav"))

X, y = load_data(data_path)

if X is None or y is None or not X:
    print("Error: No data loaded. Check your data path and audio files.")
    exit()

# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

# --- 3. Model Training ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the MLPClassifier (Multi-layer Perceptron)
model = MLPClassifier(hidden_layer_sizes=(256, 128, 64), activation='relu', solver='adam',
                      max_iter=500, random_state=42)
model.fit(X_train, y_train)

# --- 4. Model Evaluation ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


# --- 5. Real-time Emotion Detection ---
def record_audio(duration=3, fs=44100):
    """Records audio from the microphone for a specified duration."""
    print("Recording...")
    try:
        recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
        sd.wait()  # Wait until recording is finished
        print("Recording complete.")
        return recording, fs
    except Exception as e:
        print(f"Error during recording: {e}")
        return None, None


def predict_emotion(audio_data, sample_rate):
    """Predicts the emotion from the recorded audio data."""
    if audio_data is None:
        return None

    temp_wav_file = "temp_audio.wav"
    try:
        # Save the recording to a temporary WAV file
        wavio.write(temp_wav_file, audio_data, sample_rate, sampwidth=4)  # sampwidth=4 because dtype='float32'

        # Extract features from the recorded audio
        features = extract_features(temp_wav_file)
        if features is not None:
            # Reshape the features to a 2D array (required by the model)
            features = features.reshape(1, -1)
            # Predict the emotion using the trained model
            emotion = model.predict(features)[0]  # Get the first element of the predicted array
            return emotion
        else:
            print("Feature extraction failed.")
            return None
    except Exception as e:
        print(f"Error during prediction: {e}")
        return None
    finally:
        # Clean up the temporary file
        try:
            os.remove(temp_wav_file)
        except OSError:
            pass  # Don't worry if the file doesn't exist


def main_loop():
    """Main loop for real-time emotion detection."""
    while True:
        input("Press Enter to record a sample (or Ctrl+C to quit)...")
        # Record audio
        audio, sample_rate = record_audio()
        if audio is not None:
            # Predict emotion
            predicted_emotion = predict_emotion(audio, sample_rate)
            if predicted_emotion:
                print(f"Predicted Emotion: {predicted_emotion}")
            else:
                print("Could not predict emotion.")
        else:
            print("Recording failed.")


if __name__ == "__main__":
    print("Starting Real-time Emotion Detection System...")
    try:
        main_loop()
    except KeyboardInterrupt:
        print("\nExiting...")
```
Key design points and explanations:
* **Error Handling:** Includes comprehensive error handling, especially when loading and processing audio files. Crucially, it checks if `extract_features` returns `None` (indicating an error) and handles it gracefully. The `load_data` function also has error handling.
* **Clarity and Comments:** Abundant comments explain each step of the process, making the code more understandable. Variable names are descriptive.
* **Data Loading:** The `load_data` function dynamically finds emotion folders and loads audio files from them. It also creates dummy audio data *if* the `audio_data` folder doesn't exist, making it easier to run the code without pre-existing data. **Important:** This is just for demonstration. You *must* replace this with your *actual* labeled audio data for the system to work correctly. The dummy data will produce meaningless results.
* **Feature Extraction:** Uses `librosa` to extract MFCCs, Chroma, Mel spectrogram, Spectral Contrast, and Tonnetz features. It calculates the *mean* of these features over time, which is a common and effective approach. If `librosa.load()` fails, it prints an error and returns `None` from `extract_features`.
* **Model Training and Evaluation:** Uses `sklearn` to train an `MLPClassifier` (a type of neural network). The code includes explicit splitting of the data into training and testing sets, and the calculation of accuracy.
* **Real-Time Recording:** Uses `sounddevice` to record audio from the microphone, with error handling during recording. `wavio` is used to save the NumPy array to a WAV file.
* **Prediction:** The `predict_emotion` function saves the recorded audio to a temporary WAV file, extracts features, and uses the trained model to predict the emotion. The temporary file is then deleted. It also handles the case where feature extraction might fail.
* **Main Loop:** The `main_loop` function continuously records audio and predicts the emotion until the user presses Ctrl+C.
* **Temporary File Handling:** The code cleans up the temporary audio file (`temp_audio.wav`) even if errors occur during prediction, using a `finally` block.
* **Dependency Management:** Requires `librosa`, `numpy`, `scikit-learn`, `sounddevice`, and `wavio`. Make sure these are installed (`pip install librosa numpy scikit-learn sounddevice wavio`).
* **Clearer Output:** The code provides more informative output, including printing the predicted emotion and error messages.
* **Type Handling:** Explicitly converts lists of features and labels into NumPy arrays for use with the `sklearn` model. This is very important.
* **sampwidth in wavio.write:** The `sampwidth=4` parameter is passed to `wavio.write` because the default sample width in `wavio` is 2 bytes, while the NumPy array generated by `sounddevice` contains 4-byte floats. If you prefer to skip the temporary WAV file altogether, see the in-memory feature-extraction sketch after this list.
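If you would rather avoid the temporary-file round trip during real-time prediction, the features can also be computed directly from the in-memory recording. The following is a minimal sketch, not part of the script above: `extract_features_from_array` and `predict_emotion_from_array` are hypothetical helpers, and the resampling to 22,050 Hz assumes the model was trained on features from `librosa.load` at its default sample rate, so that live features stay comparable to training features.

```python
import librosa
import numpy as np

def extract_features_from_array(y, sr, target_sr=22050):
    """Compute the same feature vector as extract_features(), but from an in-memory signal."""
    # sounddevice returns shape (n_samples, 1); flatten to 1-D for librosa.
    y = np.asarray(y, dtype=np.float32).flatten()
    # Resample to librosa.load()'s default rate so live features match training features.
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=y, sr=sr).T, axis=0)
    return np.hstack([mfccs, chroma, mel, contrast, tonnetz])

def predict_emotion_from_array(model, audio_data, sample_rate):
    """Predict an emotion label from a raw recording without touching the filesystem."""
    features = extract_features_from_array(audio_data, sample_rate)
    return model.predict(features.reshape(1, -1))[0]
```

This keeps the feature pipeline identical to training while skipping the disk write, which also shaves a little latency off each prediction.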
How to run this code:
1. **Install Dependencies:**
   ```bash
   pip install librosa numpy scikit-learn sounddevice wavio
   ```
2. **Prepare Audio Data:** This is the most important step.
   * Create a directory named `audio_data`.
   * Inside `audio_data`, create subdirectories for each emotion you want to detect (e.g., `happy`, `sad`, `angry`, `neutral`).
   * Place WAV audio files into the corresponding emotion directories. **The more data you have, the better the model will perform.** Aim for at least 50-100 files per emotion. **Ensure that the audio files are clean and contain speech expressing the intended emotion.** The script will create dummy audio data if none exists, but this is only for demonstration purposes; a small sanity-check sketch after these steps shows how to count the files per emotion folder.
3. **Replace `data_path`:** Change the `data_path = 'audio_data'` line to the actual path to your `audio_data` directory if it's not in the same directory as the script.
4. **Run the Script:**
   ```bash
   python your_script_name.py
   ```
5. **Real-Time Prediction:** The script will start recording audio from your microphone when you press Enter. It will then predict the emotion and print the result.
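Before training on your own data, a quick sanity check of the folder layout from step 2 can catch problems early. This is an optional sketch, assuming the `audio_data` directory structure described above; the 50-file threshold simply mirrors the rough guideline from step 2.

```python
import os

data_path = "audio_data"  # same layout as described in step 2

for emotion in sorted(os.listdir(data_path)):
    emotion_dir = os.path.join(data_path, emotion)
    if not os.path.isdir(emotion_dir):
        continue
    n_wavs = len([f for f in os.listdir(emotion_dir) if f.endswith(".wav")])
    print(f"{emotion}: {n_wavs} WAV files")
    if n_wavs < 50:
        print("  warning: fewer than 50 clips; consider collecting more data")
```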
Important Considerations:
* **Data Quality:** The performance of the system heavily depends on the quality and quantity of your training data. Use high-quality audio recordings and a diverse dataset.
* **Feature Engineering:** Experiment with different audio features and feature combinations to improve accuracy (a small sketch after this list shows how to append zero-crossing rate and RMS energy to the feature vector).
* **Model Selection:** Try different machine learning models (e.g., Support Vector Machines, Random Forests, Convolutional Neural Networks) to see which one performs best for your data; CNNs often perform well on audio classification. A cross-validation comparison sketch follows this list.
* **Real-World Conditions:** The system's accuracy may decrease in noisy environments. Consider adding noise reduction techniques or training the model with noisy data.
* **Ethical Implications:** Be aware of the ethical implications of emotion recognition technology, such as privacy concerns and potential bias.
* **Dataset Augmentation:** Use techniques such as adding noise, changing speed, and shifting pitch to augment your audio data. This artificially increases the size of your dataset and helps the model generalize better; see the augmentation sketch after this list.
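On the feature-engineering point, one cheap experiment is to append a couple of extra summary statistics, such as zero-crossing rate and RMS energy, to the existing feature vector and re-check accuracy. A minimal sketch, assuming `y` comes from `librosa.load` exactly as in `extract_features`; the helper name is illustrative.

```python
import librosa
import numpy as np

def extract_extra_features(y):
    """Optional additions to the base feature vector."""
    zcr = np.mean(librosa.feature.zero_crossing_rate(y).T, axis=0)  # voiced/unvoiced character
    rms = np.mean(librosa.feature.rms(y=y).T, axis=0)               # loudness / intensity
    return np.hstack([zcr, rms])

# Inside extract_features(), the return line could then become:
# return np.hstack([mfccs, chroma, mel, contrast, tonnetz, extract_extra_features(y)])
```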
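On model selection, scikit-learn makes it straightforward to compare a few classifiers on the same feature matrix with cross-validation before committing to one. A sketch, assuming `X` and `y` are the NumPy arrays built in section 2 of the script and that there are enough samples per class for 5-fold splits; the hyperparameters are illustrative, not tuned.

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

candidates = {
    "MLP": MLPClassifier(hidden_layer_sizes=(256, 128, 64), max_iter=500, random_state=42),
    "SVM (RBF)": SVC(kernel="rbf", C=10, gamma="scale"),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
}

for name, clf in candidates.items():
    # Feature scaling matters for the MLP and SVM; it is harmless for the forest.
    pipeline = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(pipeline, X, y, cv=5)
    print(f"{name}: {scores.mean():.3f} +/- {scores.std():.3f}")
```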
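On augmentation, librosa's effects module covers the transformations mentioned above (time stretching, pitch shifting), and additive noise is a one-liner with NumPy. A minimal sketch, assuming each augmented waveform is passed through the same feature extraction as the original clip and given the same label; the noise level, stretch rates, and semitone shifts are illustrative values to tune for your data.

```python
import librosa
import numpy as np

def augment(y, sr):
    """Yield simple augmented variants of a waveform for training-set expansion."""
    # 1. Additive Gaussian noise (also improves robustness to noisy environments).
    yield y + 0.005 * np.random.randn(len(y))
    # 2. Time stretching: speed up / slow down without changing pitch.
    for rate in (0.9, 1.1):
        yield librosa.effects.time_stretch(y, rate=rate)
    # 3. Pitch shifting: move up / down a couple of semitones.
    for n_steps in (-2, 2):
        yield librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)

# Example: expand one labeled clip into several training examples.
# y, sr = librosa.load("audio_data/happy/audio1.wav")
# for y_aug in augment(y, sr):
#     features = ...  # same feature extraction as extract_features(); label stays "happy"
```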
This code provides a functional, well-explained, and robust starting point for building a real-time emotion detection system. Remember to replace the dummy data with your own labeled audio data for meaningful results.