Increasing VRAM consumption when using Accelerate with the Gradient Release hook
Here is my implementation: https://github.com/kohya-ss/sd-scripts/pull/1381
I'm not sure if it is correct: https://github.com/kohya-ss/sd-scripts/blob/ed99b2180148258cde955106ce988781eca03006/sdxl_train.py#L502-L510
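Roughly, the combination looks like the sketch below. This is a simplified stand-in for the actual sd-scripts code, with a hypothetical toy model; the ordering of `prepare_for_gradient_release` relative to `accelerator.prepare` is one of the things I'm unsure about.

```python
# Simplified sketch of combining Accelerate with optimi's gradient release;
# NOT the actual sd-scripts code, just an illustration of the setup.
import torch
import torch.nn as nn
from accelerate import Accelerator
from optimi import AdamW
from optimi.gradientrelease import prepare_for_gradient_release

accelerator = Accelerator()
model = nn.Linear(128, 128)  # hypothetical stand-in for the real network
optimizer = AdamW(model.parameters(), lr=1e-3, gradient_release=True)

# Register optimi's per-parameter backward hooks, then hand both objects to
# Accelerate; whether this ordering is correct is part of the question.
prepare_for_gradient_release(model, optimizer)
model, optimizer = accelerator.prepare(model, optimizer)

for step in range(100):
    x = torch.randn(32, 128, device=accelerator.device)
    loss = model(x).pow(2).mean()
    # With gradient release, the optimizer steps inside backward; if
    # accelerator.backward keeps gradients alive, VRAM could grow here.
    accelerator.backward(loss)
    if step % 10 == 0:
        print(f"{torch.cuda.max_memory_allocated() / 2**20:.1f} MB")
```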
@sdbds Can you create a minimal reproduction of the memory leak? When I test with just PyTorch and optimi, memory usage is steady. See the example below.
Minimal gradient release training example
```python
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from optimi import AdamW
from optimi.gradientrelease import prepare_for_gradient_release, remove_gradient_release

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper-parameters
num_epochs = 100
batch_size = 256
learning_rate = 0.001

# Image preprocessing modules
transform = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ]
)

# CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)

# Data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Convolutional neural network (VGG-like)
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(4096, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(512, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

model = ConvNet().to(device)

# Loss and optimizer: gradient_release=True lets the optimizer step run inside
# per-parameter backward hooks; prepare_for_gradient_release registers the hooks
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate, gradient_release=True)
prepare_for_gradient_release(model, optimizer)

# Train the model
for epoch in range(num_epochs):
    epoch_max_memory = 0
    epoch_max_reserved_memory = 0
    # Reset peak stats so the maxima below are per-epoch, making any
    # epoch-over-epoch growth visible
    torch.cuda.reset_peak_memory_stats()
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimize with gradient release: the optimizer
        # steps during backward, so there is no explicit optimizer.step()
        loss.backward()

        # Track maximum memory usage
        epoch_max_memory = max(epoch_max_memory, torch.cuda.max_memory_allocated())
        epoch_max_reserved_memory = max(epoch_max_reserved_memory, torch.cuda.max_memory_reserved())

    # Convert bytes to megabytes
    epoch_max_memory = epoch_max_memory / (1024 * 1024)
    epoch_max_reserved_memory = epoch_max_reserved_memory / (1024 * 1024)
    print(
        f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, "
        f"Max Allocated Memory: {epoch_max_memory:.2f} MB, Max Reserved Memory: {epoch_max_reserved_memory:.2f} MB"
    )
```
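For completeness: `remove_gradient_release` is imported above but not called. Continuing from the example, the hooks can be detached once training is done:

```python
# Detach optimi's gradient release hooks so the model behaves like a
# plain nn.Module again (e.g., for evaluation or standard training).
remove_gradient_release(model)
```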
Thanks for the reply. I will minimize the code as soon as possible to reproduce the issue.