Quantization and pruning from DeepSpeed Compression not working
I would like to use DeepSpeed for post-training compression with CUDA, using quantization or pruning.
As a simple test case to learn how DeepSpeed works, I'm using a pretrained ResNet, following this.
However, I'm not able to achieve any inference speedup at all, whether with weight quantization, activation quantization, or sparse/row pruning. With pruning, I checked that the weights are actually modified (see the sketch below), but there is no gain in performance.
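For reference, this is roughly the check I used to confirm that pruning zeroes out weights (a rough sketch; conv1 is just an example of a module named in my config, and model refers to the code further down):

# Sketch: fraction of weights that are exactly zero after sparse pruning
w = model.conv1.weight.detach()
print("zero fraction:", (w == 0).float().mean().item())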
Here is the full code I'm using:
import torch
import torchvision
import numpy as np
import matplotlib.pyplot as plt
import deepspeed
from deepspeed.compression.compress import init_compression, redundancy_clean
import argparse

# use GPUs if available
if torch.cuda.is_available():
    print("CUDA Available")
    device = torch.device('cuda')
else:
    print('CUDA Not Available')
    device = torch.device('cpu')

# Routine to compute the inference time with CUDA events
def checktime(model, ndata=500):
    timelist = []
    for i in range(ndata):
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        out = model(data)
        end.record()
        torch.cuda.synchronize()
        timelist.append(start.elapsed_time(end))
    timelist = timelist[30:]  # remove the first warm-up calls
    timelist = np.array(timelist)
    print("Inference time [ms]. Mean: {:.1f}, Std: {:.1f}".format(timelist.mean(), timelist.std()))
    return timelist

# An instance of the ResNet model
model = torchvision.models.resnet18().to(device)
model.eval()

"""
# Check names of layers
for name, param in model.named_parameters():
    print(name)
"""

# An example input
data = torch.rand(4, 3, 224, 224, device=device)

# Compute the inference time of the standard, uncompressed model
timelist_standard = checktime(model)
# out: Inference time [ms]. Mean: 2.7, Std: 0.1

# Get arguments for DeepSpeed
parser = argparse.ArgumentParser(description='Deepspeed')
parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()
print("\n", args, "\n")

deepspeed.init_distributed()  # I think this line is not required

# Compress the model
model = init_compression(model, args.deepspeed_config)
model = redundancy_clean(model, args.deepspeed_config)
model.eval()

# Compute the inference time of the compressed model
timelist_compressed = checktime(model)
# out: Inference time [ms]. Mean: 2.7, Std: 0.1
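For completeness: since deepspeed.add_config_arguments adds the --deepspeed_config flag, I pass the config file on the command line, e.g. (script and file names here are just placeholders):

deepspeed compress_test.py --deepspeed_config ds_config.json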
And this is an example of the config file (although I also tried several variants):
{
    "compression_training": {
        "weight_quantization": {
            "shared_parameters": {
                "enabled": true,
                "quantizer_kernel": false,
                "schedule_offset": 0,
                "quantize_groups": 1,
                "quantize_verbose": true,
                "quantization_type": "asymmetric",
                "quantize_weight_in_forward": false,
                "rounding": "nearest",
                "fp16_mixed_quantize": {
                    "enabled": false,
                    "quantize_change_ratio": 0.001
                }
            },
            "different_groups": {
                "wq1": {
                    "params": {
                        "start_bits": 12,
                        "target_bits": 8,
                        "quantization_period": 50
                    },
                    "modules": [
                        "conv1",
                        "conv2"
                    ]
                }
            }
        }
    }
}
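To see what the compression actually changed, I also inspected the weights after redundancy_clean (a quick sketch; conv1 matches the modules listed in the config above):

# Sketch: inspect a targeted layer's weights after compression
w = model.conv1.weight.detach()
print("dtype:", w.dtype)                            # if this is still torch.float32, no low-precision kernels are used
print("distinct values:", torch.unique(w).numel())  # small (e.g. <= 2**8) if the weights were quantized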
Am I doing something wrong when applying the DeepSpeed tools for post-training compression? Am I missing something?
Thanks in advance, Pablo.