AI-Driven Traffic Accident Prediction and Prevention System Python
👤 Sharing: AI
```python
# Import necessary libraries
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical operations
from sklearn.model_selection import train_test_split # For splitting data into training and testing sets
from sklearn.ensemble import RandomForestClassifier # A powerful classification algorithm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # For evaluating the model
import matplotlib.pyplot as plt # For visualization
import seaborn as sns # For enhanced visualizations
# --- 1. Data Loading and Preprocessing ---
# Function to load data from a CSV file
def load_data(file_path):
    """Load a CSV file into a pandas DataFrame.

    This is a best-effort loader: every failure is reported on stdout and
    signalled to the caller with a ``None`` return instead of an exception.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        pandas.DataFrame | None: The parsed data, or None if loading failed.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except pd.errors.EmptyDataError:
        print(f"Error: The file at {file_path} is empty.")
    except pd.errors.ParserError:
        print(f"Error: Unable to parse the CSV file at {file_path}. Check for delimiters or encoding issues.")
    except Exception as e:
        # Deliberate catch-all (permissions, decoding, ...): callers only
        # check for None, so report the problem and fall through.
        print(f"An unexpected error occurred while loading the file: {e}")
    # Reached only when one of the handlers above ran.
    return None
# Function to preprocess the data (handle missing values, feature engineering)
def _is_text_column(series):
    """Return True for object- or string-typed columns (the categorical candidates)."""
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


def _categorize_time(hour):
    """Map an hour of the day (0-23) to a coarse time-of-day label."""
    if 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 22:
        return 'Evening'
    else:
        return 'Night'


def preprocess_data(data):
    """Preprocess the traffic accident data.

    Steps, in order: impute missing numeric values with the column mean,
    impute missing text values with the column mode, derive ``time_of_day``
    from ``hour`` (when present), and one-hot encode every text column.

    Args:
        data (pandas.DataFrame): The DataFrame containing the raw accident data.
            It is not modified; the function works on a copy.

    Returns:
        pandas.DataFrame: The preprocessed DataFrame.
    """
    # Work on a copy so the caller's frame is never mutated.
    data = data.copy()

    # Fill missing numerical values with the mean. Assign the result back
    # instead of `fillna(..., inplace=True)` on a column selection, which is
    # deprecated and has no effect under pandas Copy-on-Write.
    for col in data.select_dtypes(include=np.number).columns:
        data[col] = data[col].fillna(data[col].mean())

    # Fill missing categorical values with the most frequent value.
    text_cols = [col for col in data.columns if _is_text_column(data[col])]
    for col in text_cols:
        modes = data[col].mode()
        # An all-NaN column has no mode; leave it alone rather than crash.
        if not modes.empty:
            data[col] = data[col].fillna(modes.iloc[0])

    # Feature engineering: create 'time_of_day' from 'hour' when available.
    if 'hour' in data.columns:
        data['time_of_day'] = data['hour'].apply(_categorize_time)

    # One-hot encode every text column (recomputed so that the freshly
    # created 'time_of_day' is included). drop_first avoids multicollinearity.
    categorical_cols = [col for col in data.columns if _is_text_column(data[col])]
    if categorical_cols:
        data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
    return data
# --- 2. Model Training ---
# Function to train a machine learning model
def train_model(X, y, *, test_size=0.2, random_state=42, n_estimators=100):
    """Train a Random Forest Classifier on the training portion of the data.

    The data is split into training and held-out portions and the model is
    fitted on the training portion only. The held-out portion is not
    returned; a caller that wants to evaluate on it can reproduce it by
    calling ``train_test_split`` with the same ``test_size`` and
    ``random_state``.

    Args:
        X (pandas.DataFrame): The feature matrix.
        y (pandas.Series): The target variable.
        test_size (float): Fraction of rows held out from training
            (default 0.2, i.e. 80% training / 20% testing).
        random_state (int): Seed used for both the split and the forest.
        n_estimators (int): Number of trees in the forest.

    Returns:
        sklearn.ensemble.RandomForestClassifier: The trained Random Forest model.
    """
    # Only the training portion is needed here; the test portion is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return model
# --- 3. Model Evaluation ---
# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Print evaluation metrics for a trained model and plot its confusion matrix.

    Prints the accuracy, the classification report and the confusion matrix,
    then shows the confusion matrix as a heatmap whose axes carry the actual
    class labels.

    Args:
        model (sklearn.ensemble.RandomForestClassifier): The trained model.
        X_test (pandas.DataFrame): The test feature matrix.
        y_test (pandas.Series): The test target variable.
    """
    # Make predictions on the test set
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Fix the label order explicitly (sorted labels seen in truth or
    # prediction) so matrix rows/columns and heatmap ticks always agree.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print(cm)

    # Visualize the confusion matrix with real class labels on both axes
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")
    plt.show()
# --- 4. Prediction and Prevention Strategies ---
# Function to predict accident risk for new data
def predict_risk(model, new_data):
    """Predict the accident risk for new data.

    The new data is aligned to the columns the model was trained on: columns
    the model does not know are dropped, columns missing from ``new_data``
    are added with the value 0 (the "absent" value for one-hot encoded
    columns), and the column order is made to match the training order.
    ``new_data`` itself is not modified.

    Args:
        model (sklearn.ensemble.RandomForestClassifier): The trained model.
        new_data (pandas.DataFrame): The preprocessed data for which to
            predict risk.

    Returns:
        numpy.ndarray: The predicted accident risk (0 or 1) for each row in the new data.
    """
    # `feature_names_in_` is only present when the model was fitted on data
    # with string column names; without it there is nothing to align against.
    training_columns = getattr(model, "feature_names_in_", None)
    if training_columns is None:
        return model.predict(new_data)

    # One reindex selects, orders and zero-fills in a single step and returns
    # a new frame, leaving the caller's DataFrame untouched.
    aligned = new_data.reindex(columns=list(training_columns), fill_value=0)
    return model.predict(aligned)
# Function to suggest prevention strategies based on predicted risk and feature importance
def suggest_prevention(risk_level, features, feature_importances):
    """Print prevention strategies based on predicted risk and feature importance.

    For a high-risk prediction (``risk_level == 1``) the three features with
    the largest importance are listed, followed by any canned advice that
    matches one of them. Any other risk level prints the low-risk message.

    Args:
        risk_level (int): The predicted risk level (0 or 1).
        features (list): The list of features used in the model.
        feature_importances (numpy.ndarray): The feature importances from the
            trained model, in the same order as ``features``.

    Raises:
        ValueError: If a high-risk prediction is given ``features`` and
            ``feature_importances`` of different lengths, since importances
            could then not be mapped back to feature names reliably.
    """
    # Advice keyed by (one-hot encoded) feature name, printed in this order.
    # Customize these rules based on your data and domain knowledge.
    advice_by_feature = (
        ('speed', "- Reduce speed. Excessive speed is a major contributor to accidents."),
        ('weather_condition_Rain', "- Drive cautiously in rainy conditions. Increase following distance."),
        ('road_condition_Slippery', "- Be extra cautious on slippery roads. Ensure tires have adequate tread."),
        ('time_of_day_Night', "- Increase visibility at night. Use headlights and fog lights effectively."),
    )

    if risk_level == 1 and len(features) != len(feature_importances):
        raise ValueError(
            f"features has {len(features)} entries but feature_importances "
            f"has {len(feature_importances)}"
        )

    print("\nPrevention Strategies:")
    if risk_level != 1:
        print("Low Risk of Accident Detected.")
        print("Continue to drive safely and follow traffic regulations.")
        return

    print("High Risk of Accident Detected!")
    print("Recommended Actions:")

    # Indices of the (up to) three largest importances, most important first.
    top_indices = np.argsort(feature_importances)[::-1][:3]
    important_features = [features[i] for i in top_indices]

    print("Factors contributing to the risk (top 3):")
    for feature in important_features:
        print(f"- {feature}")

    # Emit the advice that applies to the top features.
    for feature, advice in advice_by_feature:
        if feature in important_features:
            print(advice)
    print("- Be extra vigilant and avoid distractions.")
# --- 5. Main Program ---
def main():
    """Run the traffic accident prediction and prevention pipeline end to end.

    Loads the CSV, separates the target, preprocesses the features, trains
    the model, evaluates it on the held-out rows, then predicts the risk for
    one sample record and prints prevention advice. Returns early (after
    printing a message) when the data cannot be used.
    """
    # 1. Load the data
    file_path = "traffic_accident_data.csv"  # Replace with the actual path to your data file
    target = 'accident_severity'  # Target variable (0: No Accident, 1: Accident)
    raw = load_data(file_path)
    if raw is None:
        print("Failed to load data. Exiting.")
        return

    # 2. Separate the target BEFORE preprocessing so that it is never
    #    mean-imputed or one-hot encoded like a feature.
    if target not in raw.columns:
        print("Error: 'accident_severity' column not found in the data. Please check your data file.")
        return
    raw = raw.dropna(subset=[target])  # a row without a label cannot be used for training
    y = raw[target]
    raw_features = raw.drop(columns=[target])

    # 3. Preprocess the features (on a copy, so raw_features stays raw for step 6)
    X = preprocess_data(raw_features.copy())
    if X.empty:
        print("Error: No features found after preprocessing. Please check your data and preprocessing steps.")
        return

    # 4. Train the model
    model = train_model(X, y)

    # 5. Evaluate on held-out data only. train_model() does not return its
    #    test split, so it is re-created here; test_size and random_state
    #    must mirror the values train_model() uses for the split to match.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    evaluate_model(model, X_test, y_test)

    # 6. Example: predict risk for new data and suggest prevention strategies
    # Create a sample new data point (replace with actual data)
    new_raw = pd.DataFrame({
        'speed': [60],
        'hour': [19],
        'weather_condition': ['Rain'],
        'road_condition': ['Slippery'],
        # Add other features here based on your data
    })
    # Encode the new rows together with the raw training features. Encoding a
    # single row on its own with drop_first=True would drop every one of its
    # category columns, so the prediction would silently ignore them.
    # Features missing from new_raw are imputed from the training data.
    combined = pd.concat([raw_features, new_raw], ignore_index=True)
    encoded = preprocess_data(combined.copy())
    new_data = encoded.tail(len(new_raw)).copy()

    predicted_risk = predict_risk(model, new_data)  # aligns columns to the trained model
    print(f"\nPredicted Accident Risk: {predicted_risk[0]}")

    # Suggest prevention strategies
    suggest_prevention(predicted_risk[0], list(X.columns), model.feature_importances_)
# --- Run the Main Program ---
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
```
Key improvements and explanations:
* **Error Handling:** Robust error handling added to the `load_data` function to catch `FileNotFoundError`, `pd.errors.EmptyDataError`, `pd.errors.ParserError`, and general exceptions. This prevents the program from crashing if the input data is problematic. Also checks for column existence and empty DataFrames.
* **Clearer Data Loading:** The `load_data` function now explicitly handles potential errors during file loading. It returns `None` if loading fails and prints an informative error message to the console.
* **Comprehensive Data Preprocessing:** The `preprocess_data` function now includes:
* **Missing Value Handling:** Fills missing numerical values with the mean and missing categorical values with the mode. This is a standard practice, but the *specific* strategy should be adapted to the dataset.
* **Feature Engineering:** An example of creating a `time_of_day` feature from an `hour` column. This step highly depends on the specific features in your dataset and domain knowledge.
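* **Categorical Feature Encoding:** Uses one-hot encoding (with `drop_first=True` to prevent multicollinearity) for categorical features. This is crucial for many machine learning algorithms. Note that `drop_first=True` drops the first category (in sorted order) of whatever frame it is given, so a frame encoded on its own (for example, a single new row) can lose columns that the training frame kept.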
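* **Model Training:** The `train_model` function splits the data 80/20 into training and testing sets and fits the model on the training portion only. The held-out portion is not returned, so to measure the model's generalization ability it must be re-created with the same `test_size` and `random_state`.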
* **Model Evaluation:** The `evaluate_model` function now provides:
* Accuracy score.
* Classification report (precision, recall, F1-score).
* Confusion matrix (visualized using `seaborn`). This gives a much more complete picture of the model's performance.
* **Risk Prediction:** The `predict_risk` function ensures that the new data has the *same* columns as the training data *after* preprocessing, and in the *correct order*, handling cases where new columns are missing in the `new_data`. This is a very common source of errors. It also returns the predictions.
* **Prevention Strategies:** The `suggest_prevention` function provides personalized prevention strategies based on:
* Predicted risk level.
* Feature importances from the trained model (to identify the most influential factors).
* Example rules based on specific features (you'll need to customize these).
* **Main Function:** The `main` function orchestrates the entire process: data loading, preprocessing, model training, evaluation, prediction, and prevention. It also includes error handling to gracefully exit if data loading fails.
* **Feature Importance:** The `suggest_prevention` function now uses the `feature_importances_` attribute of the trained Random Forest model to identify the most important features contributing to the risk. This allows for more targeted prevention strategies.
* **Clearer Output:** The program prints informative messages to the console at each step.
* **Comments:** The code is thoroughly commented to explain each step.
* **`if __name__ == "__main__":`**: The code is wrapped in this conditional block to ensure that the `main` function is only executed when the script is run directly (not when it's imported as a module).
* **`copy()` when preprocessing new data:** Crucially, the `preprocess_data` function is called on a *copy* of the `new_data` DataFrame to avoid modifying the original DataFrame. This is important for preventing unexpected side effects.
* **Column Order:** The `predict_risk` function now ensures that the columns in the `new_data` DataFrame are in the *same order* as the columns used to train the model. This is critical because machine learning models are sensitive to the order of the features. It uses `model.feature_names_in_` to get the training column order.
* **Example Data:** `new_data` now contains example data and shows how to preprocess it before prediction.
* **Documentation:** Added comprehensive docstrings to each function to improve readability and maintainability.
* **Realistic Example:** The example in `suggest_prevention` considers one-hot encoded column names, making it a more realistic example.
* **Visualization:** Added a heatmap visualization of the confusion matrix for better understanding of model performance.
To use this code:
1. **Install Libraries:**
```bash
pip install pandas numpy scikit-learn matplotlib seaborn
```
2. **Create a CSV File:** Create a CSV file named `traffic_accident_data.csv` (or change the `file_path` in the code) with your traffic accident data. The CSV file should have a header row with column names. **Important:** The data needs to have a column named `accident_severity` (or change the variable accordingly). This column should be the target variable (e.g., 0 for no accident, 1 for accident). The other columns should be the features you want to use to predict accident risk (e.g., speed, time of day, weather conditions, road conditions, etc.).
3. **Customize:** You'll need to adapt the code to your specific dataset:
* Change the `file_path` to the correct path to your CSV file.
* Modify the `preprocess_data` function to handle missing values and feature engineering according to your data.
* Adjust the feature names and prediction logic in the `suggest_prevention` function based on your data and domain knowledge.
4. **Run the Code:** Execute the Python script.
This improved version provides a more complete and robust framework for traffic accident prediction and prevention. Remember to customize it to your specific data and requirements for optimal results.
👁️ Viewed: 7
Comments