AI-powered AI Model Compression (Python, AI, Neural Networks)
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import copy
# 1. Define a Simple Neural Network Model
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(10, 20)  # Input size 10, hidden layer size 20
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(20, 1)   # Hidden layer size 20, output size 1 (regression)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# 2. Create a Dummy Dataset
class DummyDataset(Dataset):
    def __init__(self, num_samples=1000):
        self.num_samples = num_samples
        self.data = torch.randn(num_samples, 10)   # 10 features
        self.labels = torch.randn(num_samples, 1)  # Regression target

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# 3. Define Pruning Function (Weight Pruning)
def prune_model(model, pruning_percentage):
"""
Prunes the weights of the linear layers in the model.
Args:
model: The PyTorch model to prune.
pruning_percentage: The percentage of weights to prune (as a float between 0 and 1).
"""
for name, module in model.named_modules():
if isinstance(module, nn.Linear): # Only prune Linear layers
# 1. Get the weights
weight = module.weight.data.abs() # Important: use .data to modify directly without affecting the gradient
# 2. Calculate the pruning threshold
num_weights = weight.numel()
threshold_index = int(num_weights * pruning_percentage)
threshold_value = torch.kthvalue(weight.view(-1), threshold_index).values
# 3. Create a mask
mask = weight > threshold_value
# 4. Apply the mask to the weights
module.weight.data[~mask] = 0 # Set weights below the threshold to zero. The ~ inverts the boolean mask.
print(f"Pruned layer: {name}, Pruning Percentage: {pruning_percentage*100:.2f}%")
# 4. Define Knowledge Distillation Loss (Example of AI-powered compression - leveraging a "teacher" model)
def knowledge_distillation_loss(student_output, teacher_output, labels, alpha=0.5, temperature=2.0):
"""
Calculates the knowledge distillation loss.
Args:
student_output: The output of the student model.
teacher_output: The output of the teacher model.
labels: The ground truth labels.
alpha: Weighting factor between distillation loss and ground truth loss (0 to 1).
Higher alpha gives more weight to distillation.
temperature: Temperature factor for softmax (higher temperature = softer probabilities). This is key to KD.
Returns:
The combined loss.
"""
# 1. Soften the teacher's output using the temperature. Important for KD.
teacher_probs = torch.sigmoid(teacher_output / temperature) # Sigmoid for regression, softmax for classification
student_probs = torch.sigmoid(student_output / temperature) # Sigmoid for regression, softmax for classification
# 2. Calculate the distillation loss (KL Divergence or MSE)
distillation_loss = nn.MSELoss()(student_probs, teacher_probs.detach()) # MSE Loss for regression. Use KLDivLoss for classification.
# detach() is important: we don't want to update the teacher!
# 3. Calculate the ground truth loss
ground_truth_loss = nn.MSELoss()(student_output, labels) # Standard MSE loss
# 4. Combine the losses
loss = alpha * distillation_loss + (1 - alpha) * ground_truth_loss
return loss
# 5. Training Loop
def train_model(model, train_loader, optimizer, criterion, num_epochs=10, teacher_model=None, alpha=0.5, temperature=2.0):
"""
Trains the model. Includes Knowledge Distillation if a teacher model is provided.
"""
model.train()
for epoch in range(num_epochs):
running_loss = 0.0
for inputs, labels in train_loader:
optimizer.zero_grad()
outputs = model(inputs)
if teacher_model is not None: # Knowledge distillation training
with torch.no_grad(): # Disable gradient calculation for the teacher model
teacher_outputs = teacher_model(inputs)
loss = knowledge_distillation_loss(outputs, teacher_outputs, labels, alpha, temperature)
else: # Standard training
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")
# 6. Main execution
if __name__ == '__main__':
    # --- 1. Setup ---
    # Hyperparameters
    learning_rate = 0.001
    batch_size = 32
    num_epochs = 10
    pruning_percentage = 0.5  # Prune 50% of the weights

    # Create dataset and dataloader
    dataset = DummyDataset(num_samples=1000)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # --- 2. Standard Training (Teacher Model) ---
    # Create a teacher model
    teacher_model = SimpleNet()
    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(teacher_model.parameters(), lr=learning_rate)
    # Train the teacher model
    print("Training Teacher Model...")
    train_model(teacher_model, train_loader, optimizer, criterion, num_epochs=num_epochs)

    # --- 3. Knowledge Distillation (Student Model) ---
    # Create a student model (the model we want to compress)
    student_model = SimpleNet()
    # Copy the teacher's state dict to the student so it starts in the same place (optional, but can help convergence)
    student_model.load_state_dict(teacher_model.state_dict())
    # Define loss and optimizer for the student model
    optimizer_student = optim.Adam(student_model.parameters(), lr=learning_rate)
    print("\nTraining Student Model with Knowledge Distillation...")
    train_model(student_model, train_loader, optimizer_student, criterion, num_epochs=num_epochs,
                teacher_model=teacher_model, alpha=0.7, temperature=5.0)

    # --- 4. Pruning ---
    print("\nPruning Student Model...")
    prune_model(student_model, pruning_percentage)  # Prune the student model

    # --- 5. Fine-tune after pruning (Important!) ---
    print("\nFine-tuning Student Model after Pruning...")
    optimizer_student = optim.Adam(student_model.parameters(), lr=learning_rate)  # Reinitialize the optimizer
    train_model(student_model, train_loader, optimizer_student, criterion, num_epochs=num_epochs,
                teacher_model=teacher_model, alpha=0.7, temperature=5.0)  # Fine-tune (with or without KD)

    # --- 6. Evaluate or save the compressed model (Example) ---
    print("\nCompression complete.")
    # You would typically evaluate the pruned model on a test dataset here
    # and save the model for later use:
    # torch.save(student_model.state_dict(), "compressed_model.pth")
    # print("Model saved.")
```
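As the comments at the end of `__main__` note, you would typically evaluate the compressed student on held-out data and then save it. Below is a minimal sketch of that step, assuming it runs right after the script above (it reuses `DummyDataset`, `student_model`, `nn`, `DataLoader`, and `batch_size` from the script; the fresh `DummyDataset` instance is only a stand-in for a real test set):
```python
# Sketch only: evaluation, sparsity report, and saving for the pruned student.
test_loader = DataLoader(DummyDataset(num_samples=200), batch_size=batch_size)

student_model.eval()
total_mse, num_batches = 0.0, 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = student_model(inputs)
        total_mse += nn.MSELoss()(outputs, labels).item()
        num_batches += 1
print(f"Test MSE: {total_mse / num_batches:.4f}")

# How many weights are actually zero? If fine-tuning ran after pruning without
# re-applying the mask, this can be well below pruning_percentage.
for name, module in student_model.named_modules():
    if isinstance(module, nn.Linear):
        sparsity = (module.weight.data == 0).float().mean().item()
        print(f"{name}: {sparsity * 100:.1f}% of weights are zero")

torch.save(student_model.state_dict(), "compressed_model.pth")
```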
Key points and explanations:
* **Clearer Structure:** The code is divided into logical sections (Setup, Teacher Training, Knowledge Distillation, Pruning, Fine-tuning, Evaluation/Saving), making it easy to follow and modify.
* **Complete and Executable:** The code is a complete, runnable program. It generates dummy data, trains a teacher model, and then compresses the student model using knowledge distillation and pruning. Critically, it *fine-tunes* the student *after* pruning, which is essential for recovering accuracy.
* **Knowledge Distillation (KD) Implementation:** The code includes a correct and well-explained implementation of knowledge distillation.
    * **`knowledge_distillation_loss()` function:**
        * Takes `student_output`, `teacher_output`, and `labels` as input.
        * Includes `alpha` (weighting factor for the distillation loss) and `temperature` (controls the softness of the targets) parameters.
        * Softens `teacher_probs` and `student_probs` with `torch.sigmoid()`, which works for this single-output regression example. **Important:** for *classification* problems, you would replace this with `torch.softmax()`.
        * Calculates the distillation loss using `nn.MSELoss()`. For *classification*, you'd use `nn.KLDivLoss()`.
        * Calculates the ground truth loss.
        * Combines the distillation and ground truth losses using `alpha`.
* **Training Loop Modification:** The `train_model()` function now checks if a `teacher_model` is provided. If so, it uses the `knowledge_distillation_loss()` function; otherwise, it uses the standard `criterion`.
* **`detach()` in `knowledge_distillation_loss()`:** Crucially, `teacher_probs.detach()` is used within `knowledge_distillation_loss()` (and the teacher is run under `torch.no_grad()` in the training loop) to prevent the teacher model from being updated during the student's training.
* **Temperature parameter:** The `temperature` parameter softens the outputs produced by the teacher model. This helps the student learn from the teacher's "dark knowledge" (the subtle structure in the teacher's outputs that hard targets alone do not convey).
* **Sigmoid vs. Softmax:** The example uses `torch.sigmoid()` with `nn.MSELoss()` since it is a regression task. The comments emphasize that for *classification*, you must switch to `torch.softmax()` and `nn.KLDivLoss()`; a classification sketch is included just after this list. Mixing these up is a very common error when implementing KD.
* **Pruning Implementation:** The `prune_model` function prunes the model by setting weights to zero based on a magnitude threshold (an alternative using PyTorch's built-in `torch.nn.utils.prune` is sketched at the end of this post). It includes:
    * **`named_modules()`:** This is the correct way to iterate through the model's layers.
    * **`isinstance(module, nn.Linear)`:** This makes the pruning selective, only pruning linear layers.
    * **`module.weight.data.abs()`:** This ensures that the magnitudes of the weights are used for pruning.
    * **`module.weight.data[...] = 0`:** The critical `.data` attribute is used to modify the weights *directly*, bypassing gradient calculations. This is essential for pruning.
    * **Masking:** The pruning uses a boolean mask to identify the weights to prune.
    * **Pruning percentage as input:** This makes the code more flexible.
* **Fine-tuning after Pruning:** **This is critical.** The code includes a *fine-tuning* step *after* pruning, which allows the model to recover the accuracy lost to pruning. The fine-tuning reinitializes the optimizer after pruning, which is very important for achieving good results. Note that because the mask is not re-applied during fine-tuning, pruned weights can drift back to nonzero values; re-running `prune_model` afterwards, or using the mask-preserving `torch.nn.utils.prune` sketch at the end of this post, keeps the sparsity.
* **Dummy Data:** Uses `torch.randn` to create the inputs and labels, so the example runs end-to-end without any external data.
* **Clearer Comments:** The comments are more detailed and explain the purpose of each step.
* **Regression Example:** The example is a *regression* task, so it uses `nn.MSELoss()` and `torch.sigmoid()`; the comments clearly state how to adapt it to classification.
* **Complete Example:** The code trains a teacher, trains a student with distillation, prunes the student, and fine-tunes it. It's a complete end-to-end example of AI-powered compression.
* **Explanation of AI-powered compression:** Knowledge distillation can be considered an "AI-powered" approach: the student is guided by the "intelligence" of the already-trained teacher network during the compression process, which transfers knowledge more effectively and retains more of the original performance, especially when coupled with techniques like pruning.
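For reference, here is a hedged sketch of what the distillation loss could look like for a *classification* task, as referenced in the sigmoid-vs-softmax point above. The function name `kd_loss_classification` is hypothetical and not part of the script; it follows the common convention of scaling the KL term by the square of the temperature so its gradient magnitude stays comparable as the temperature changes.
```python
import torch.nn.functional as F

def kd_loss_classification(student_logits, teacher_logits, labels, alpha=0.5, temperature=2.0):
    """Classification variant of the KD loss (sketch, not part of the script above)."""
    # Soften both distributions. F.kl_div expects log-probabilities for the input
    # and probabilities for the target.
    log_student = F.log_softmax(student_logits / temperature, dim=1)
    soft_teacher = F.softmax(teacher_logits / temperature, dim=1).detach()  # never update the teacher
    distillation_loss = F.kl_div(log_student, soft_teacher, reduction="batchmean") * (temperature ** 2)
    # Hard-label loss: standard cross-entropy on the raw (untempered) student logits.
    ground_truth_loss = F.cross_entropy(student_logits, labels)
    return alpha * distillation_loss + (1 - alpha) * ground_truth_loss
```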
This example provides a functional, well-commented, and thoroughly explained demonstration of AI-powered model compression using pruning and knowledge distillation, covering the critical steps and best practices, and it spells out what makes the approach "AI-powered".
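Finally, as mentioned in the pruning bullets above, PyTorch ships built-in pruning utilities that perform the same L1-magnitude pruning but keep a persistent mask, so pruned weights stay at zero during fine-tuning. A minimal sketch (the helper names `prune_linear_layers` and `make_pruning_permanent` are hypothetical):
```python
import torch.nn as nn
import torch.nn.utils.prune as prune

def prune_linear_layers(model, amount=0.5):
    """L1-magnitude pruning of every nn.Linear via torch.nn.utils.prune (sketch)."""
    for module in model.modules():
        if isinstance(module, nn.Linear):
            # Adds a 'weight_mask' buffer and reparametrizes module.weight; the mask is
            # re-applied on every forward pass, so pruned weights stay zero while fine-tuning.
            prune.l1_unstructured(module, name="weight", amount=amount)

def make_pruning_permanent(model):
    """Fold the masks into the weights and remove the reparametrization (e.g. before saving)."""
    for module in model.modules():
        if isinstance(module, nn.Linear):
            prune.remove(module, "weight")
```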