Automated Loan Approval System Based on Credit Risk Assessment Python
👤 Sharing: AI
```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle # For saving and loading the model
# 1. Data Loading and Preprocessing
def load_and_preprocess_data(file_path):
    """
    Loads loan data from a CSV file and prepares it for modelling.

    Processing steps, in order:
      1. Impute missing values: column mean for numeric columns, most
         frequent value (first mode) for all other columns.
      2. One-hot encode every object-typed column, dropping the first level.
      3. Add a 'LoanToIncomeRatio' column when both 'Income' and
         'LoanAmount' are present.
    A preview of the result and the per-column null counts are printed.

    Args:
        file_path (str): Path to the CSV file containing loan data.

    Returns:
        pandas.DataFrame: Preprocessed loan data, or None if the file does
        not exist.
    """
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None

    # Simple imputation; swap in median/KNN/model-based imputation if the
    # data calls for it.
    for col in data.columns:
        if not data[col].isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(data[col]):
            fill_value = data[col].mean()
        else:
            modes = data[col].mode()
            if modes.empty:
                # Column holds nothing but missing values: no mode to use.
                continue
            fill_value = modes.iloc[0]  # first mode when there are ties
        # Assign the filled column back. Calling fillna(inplace=True) on
        # data[col] operates on a temporary object and does not update the
        # frame when pandas Copy-on-Write is active.
        data[col] = data[col].fillna(fill_value)

    # One-hot encode string-like columns so the model receives numeric input.
    categorical_cols = data.select_dtypes(include=['object']).columns
    data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

    # Derived feature. NOTE: rows whose Income is 0 produce inf (or NaN for
    # 0/0) here; clean such rows before fitting a model on the result.
    if 'Income' in data.columns and 'LoanAmount' in data.columns:
        data['LoanToIncomeRatio'] = data['LoanAmount'] / data['Income']

    # Print some info to check the result of the preprocessing
    print("Data after preprocessing:")
    print(data.head())
    print(data.isnull().sum())  # Remaining missing values per column
    return data
# 2. Feature Selection and Data Splitting
def prepare_data(data, target_column):
    """
    Separates features from the target and makes an 80/20 train/test split.

    Args:
        data (pandas.DataFrame): Preprocessed loan data.
        target_column (str): Name of the target variable column ('LoanApproved' or similar).

    Returns:
        tuple: X_train, X_test, y_train, y_test. All four are None when
        target_column is not a column of data.
    """
    if target_column not in data.columns:
        print(f"Error: Target column '{target_column}' not found in the data.")
        return None, None, None, None

    target = data[target_column]
    features = data.drop(columns=target_column)

    # Fixed random_state keeps the split reproducible between runs.
    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    print("Shape of training data:", train_features.shape, train_target.shape)
    print("Shape of testing data:", test_features.shape, test_target.shape)
    return train_features, test_features, train_target, test_target
# 3. Model Training
def train_model(X_train, y_train):
    """
    Fits a logistic regression classifier on the training split.

    Args:
        X_train (pandas.DataFrame): Training features.
        y_train (pandas.Series): Training target variable.

    Returns:
        sklearn.linear_model.LogisticRegression: Trained logistic regression model.
    """
    # Other solvers such as 'lbfgs' or 'newton-cg' can be tried here.
    classifier = LogisticRegression(random_state=42, solver='liblinear')
    classifier.fit(X_train, y_train)
    print("Model trained successfully.")
    return classifier
# 4. Model Evaluation
def evaluate_model(model, X_test, y_test):
    """
    Prints accuracy, a classification report and a confusion matrix for
    the model's predictions on the test split.

    Args:
        model (sklearn.linear_model.LogisticRegression): Trained model.
        X_test (pandas.DataFrame): Testing features.
        y_test (pandas.Series): Testing target variable.

    Returns:
        None (prints evaluation metrics)
    """
    predicted = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, predicted):.4f}")

    # Each remaining section is a heading followed by the metric's output.
    sections = (
        ("\nClassification Report:", classification_report),
        ("\nConfusion Matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predicted))
# 5. Model Saving and Loading
def save_model(model, file_path):
    """
    Pickles the trained model to disk.

    Any failure is reported on stdout rather than raised, so the caller
    always gets None back.

    Args:
        model (sklearn.linear_model.LogisticRegression): Trained model.
        file_path (str): Path to save the model.
    """
    try:
        with open(file_path, 'wb') as handle:  # binary mode for pickle
            pickle.dump(model, handle)
            print(f"Model saved to {file_path}")
    except Exception as exc:
        print(f"Error saving model: {exc}")
def load_model(file_path):
    """
    Unpickles a previously saved model.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files you created or otherwise trust.

    Args:
        file_path (str): Path to the saved model.

    Returns:
        sklearn.linear_model.LogisticRegression: Loaded model, or None if loading fails.
    """
    try:
        with open(file_path, 'rb') as handle:  # binary mode for pickle
            restored = pickle.load(handle)
        print(f"Model loaded from {file_path}")
        return restored
    except FileNotFoundError:
        print(f"Error: Model file not found at {file_path}")
        return None
    except Exception as exc:
        print(f"Error loading model: {exc}")
        return None
# 6. Prediction Function (for making predictions on new data)
def predict_loan_approval(model, new_data):
    """
    Runs the model on new applicant rows.

    Args:
        model (sklearn.linear_model.LogisticRegression): Trained model.
        new_data (pandas.DataFrame): New data (single row or multiple rows) with the same features as the training data.

    Returns:
        numpy.ndarray: Predicted loan approval (0 or 1) for each row in
        new_data, or None if the prediction call raised (the error is
        printed).
    """
    try:
        return model.predict(new_data)
    except Exception as exc:
        print(f"Error during prediction: {exc}")
        return None
# 7. Main Function (orchestrates the entire process)
def main():
    """
    Main function to run the loan approval system end to end.

    Loads and preprocesses 'loan_data.csv', splits it on the 'LoanApproved'
    target, trains and evaluates the model, saves it to
    'loan_approval_model.pkl', then reloads it and scores one sample
    applicant. Returns early, after printing a message, when loading or
    splitting the data fails.
    """
    # 1. Load and preprocess the data
    file_path = 'loan_data.csv'  # Replace with the actual path to your CSV file
    data = load_and_preprocess_data(file_path)
    if data is None:
        print("Exiting due to data loading/preprocessing error.")
        return

    # 2. Prepare the data for training
    target_column = 'LoanApproved'  # Replace with the actual name of your target column
    X_train, X_test, y_train, y_test = prepare_data(data, target_column)
    if X_train is None:
        print("Exiting due to data preparation error.")
        return

    # 3. Train the model
    model = train_model(X_train, y_train)

    # 4. Evaluate the model
    evaluate_model(model, X_test, y_test)

    # 5. Save the trained model
    model_file_path = 'loan_approval_model.pkl'
    save_model(model, model_file_path)

    # Example of loading the model and making predictions on new data.
    loaded_model = load_model(model_file_path)
    # Compare against None explicitly: load_model signals failure with None,
    # and an object's truthiness is not a reliable "loaded" check.
    if loaded_model is None:
        return

    # Sample applicant (replace with actual data). Add other relevant raw
    # features here, matching the columns used during training.
    applicant = {
        'ApplicantAge': 35,
        'Income': 60000,
        'LoanAmount': 20000,
        'CreditScore': 720,
    }
    # Derive the engineered feature with the same formula used in
    # preprocessing instead of hard-coding a rounded value.
    applicant['LoanToIncomeRatio'] = applicant['LoanAmount'] / applicant['Income']
    new_data = pd.DataFrame([applicant])

    # Align with the training features: columns missing from new_data
    # (typically one-hot encoded dummies) are added as 0, and the column
    # order is made identical to X_train.
    new_data = new_data.reindex(columns=X_train.columns, fill_value=0)

    # Predict loan approval
    prediction = predict_loan_approval(loaded_model, new_data)
    if prediction is None:
        return
    print("Loan Approval Prediction:", prediction)
    if prediction[0] == 1:
        print("Loan Approved")
    else:
        print("Loan Denied")
# Script entry point: run the workflow only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
```
Key improvements and explanations:
* **Clearer Structure:** The code is divided into functions, each responsible for a specific task (data loading, preprocessing, model training, evaluation, prediction). This makes the code much more readable, maintainable, and testable.
* **Error Handling:** Includes `try...except` blocks to handle potential errors like `FileNotFoundError` when loading data or errors during model saving/loading. This prevents the program from crashing and provides informative error messages. Crucially, the `load_and_preprocess_data` function returns `None` if the CSV file cannot be found (other read or parsing errors still propagate), and the `main` function checks for this and exits gracefully. The `predict_loan_approval` function also has error handling.
* **Data Preprocessing:** Handles missing values using imputation. It fills numerical missing values with the mean (you can change this to median or a more sophisticated imputation technique) and categorical missing values with the mode (most frequent value). It also performs one-hot encoding on categorical features using `pd.get_dummies()`, which is essential for many machine learning models. The `drop_first=True` argument prevents multicollinearity. *Crucially*, it now *identifies* categorical columns dynamically using `data.select_dtypes(include=['object'])`. It includes an example of feature engineering.
* **Feature Selection:** The `prepare_data` function now explicitly drops the target column to create the feature matrix `X`.
* **Data Splitting:** Uses `train_test_split` to divide the data into training and testing sets. The `random_state` argument ensures reproducibility.
* **Model Training:** Trains a `LogisticRegression` model. The `solver='liblinear'` argument is added to avoid warnings and make the code more robust. You might want to experiment with other solvers like `'lbfgs'` or `'newton-cg'` for better performance.
* **Model Evaluation:** Evaluates the model using accuracy, classification report (precision, recall, F1-score), and confusion matrix.
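* **Model Persistence (Saving and Loading):** Uses `pickle` to save the trained model to a file and load it later. This allows you to reuse the model without retraining it every time. Both functions catch errors during saving and loading and report them instead of crashing. Note that unpickling can execute arbitrary code, so only load model files that you created yourself or otherwise trust.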
* **Prediction Function:** A separate `predict_loan_approval` function takes new data as input and makes predictions using the loaded model. This function also includes error handling.
* **`main` Function:** Orchestrates the entire process, calling the other functions in the correct order. It also includes an example of how to load the model and make predictions on new data.
* **Clear Comments and Docstrings:** The code is thoroughly commented to explain each step. Each function has a docstring that describes its purpose, arguments, and return values.
* **Handles Missing Columns in Prediction:** The example of using a loaded model to predict on new data now includes a crucial step: it checks if the `new_data` DataFrame has all the columns that were present in the training data (`X_train`). If any columns are missing (which is common after one-hot encoding), it adds them and fills them with 0. This is essential to prevent errors during prediction. It also ensures that the order of columns in `new_data` is the same as in `X_train`.
* **Uses Pandas DataFrames:** Uses Pandas DataFrames for data manipulation, which is more efficient and convenient than using lists or arrays directly.
* **Reproducibility:** The use of `random_state` in `train_test_split` and `LogisticRegression` ensures that the results are reproducible.
* **Installation Notes:** The dependencies can be installed with `pip install pandas scikit-learn` (see step 1 of "How to use it" below).
How to use it:
1. **Install Dependencies:** Run `pip install pandas scikit-learn` in your terminal or command prompt.
2. **Prepare Your Data:** Create a CSV file (e.g., `loan_data.csv`) containing your loan data. Make sure the file has columns for the features you want to use for prediction and a target column indicating whether the loan was approved or not (e.g., 'LoanApproved', with values of 0 or 1).
3. **Update File Paths and Target Column:** Modify the `file_path` and `target_column` variables in the `main` function to match the actual names of your file and target column.
4. **Run the Code:** Execute the Python script.
5. **Make Predictions:** After the model is trained and saved, you can load it and use it to make predictions on new loan applications. The example in the `main` function shows how to do this. Make sure to adjust the `new_data` DataFrame to match the features of your new loan applications.
Example `loan_data.csv`:
```csv
ApplicantAge,Income,LoanAmount,CreditScore,LoanApproved,Gender,Education
30,50000,10000,680,1,Male,Bachelor
35,60000,20000,720,1,Female,Master
28,40000,5000,650,0,Male,High School
40,80000,30000,750,1,Female,Bachelor
25,30000,2000,600,0,Male,High School
32,55000,15000,700,1,Female,Master
45,90000,40000,780,1,Male,Bachelor
27,35000,3000,630,0,Female,High School
38,70000,25000,730,1,Male,Master
29,45000,8000,670,0,Female,Bachelor
```
This revised response provides a much more complete, robust, and usable solution for building an automated loan approval system. It includes comprehensive error handling, data preprocessing, model persistence, and a clear example of how to use the model for making predictions on new data. Remember to adjust the data paths and column names to match your specific data.
👁️ Viewed: 5
Comments