Quantization and pruning from DeepSpeed Compression not working
I would like to use DeepSpeed for post-training compression with CUDA, using quantization or pruning.
As a simple test case to learn how DeepSpeed works, I'm using a pretrained ResNet, following this.
However, I'm not able to achieve any inference speedup at all, whether with weight quantization, activation quantization, or sparse/row pruning. With pruning, I checked that the weights are actually modified (see the sketch below), but there is no gain in performance.
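For reference, this is roughly the check I used to confirm that pruning zeroes out weights (a rough sketch; conv1 is just an example of a module named in my config, and model refers to the code further down):

# Sketch: fraction of weights that are exactly zero after sparse pruning
w = model.conv1.weight.detach()
print("zero fraction:", (w == 0).float().mean().item())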
Here is the full code I'm using:
import torch
import torchvision
import numpy as np
import matplotlib.pyplot as plt
import deepspeed
from deepspeed.compression.compress import init_compression, redundancy_clean
import argparse

# use GPUs if available
if torch.cuda.is_available():
    print("CUDA Available")
    device = torch.device('cuda')
else:
    print('CUDA Not Available')
    device = torch.device('cpu')

# Routine to compute the inference time with CUDA events
def checktime(model, ndata=500):
    timelist = []
    for i in range(ndata):
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        out = model(data)
        end.record()
        torch.cuda.synchronize()
        timelist.append(start.elapsed_time(end))
    timelist = timelist[30:]  # remove the first warm-up calls
    timelist = np.array(timelist)
    print("Inference time [ms]. Mean: {:.1f}, Std: {:.1f}".format(timelist.mean(), timelist.std()))
    return timelist

# An instance of the ResNet model
model = torchvision.models.resnet18().to(device)
model.eval()

"""
# Check names of layers
for name, param in model.named_parameters():
    print(name)
"""

# An example input
data = torch.rand(4, 3, 224, 224, device=device)

# Compute the inference time of the standard, uncompressed model
timelist_standard = checktime(model)
# out: Inference time [ms]. Mean: 2.7, Std: 0.1

# Get arguments for DeepSpeed
parser = argparse.ArgumentParser(description='Deepspeed')
parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()
print("\n", args, "\n")

deepspeed.init_distributed()  # I think this line is not required

# Compress the model
model = init_compression(model, args.deepspeed_config)
model = redundancy_clean(model, args.deepspeed_config)
model.eval()

# Compute the inference time of the compressed model
timelist_compressed = checktime(model)
# out: Inference time [ms]. Mean: 2.7, Std: 0.1
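For completeness: since deepspeed.add_config_arguments adds the --deepspeed_config flag, I pass the config file on the command line, e.g. (script and file names here are just placeholders):

deepspeed compress_test.py --deepspeed_config ds_config.json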
And this is an example of the config file (although I also tried several variants):
{
    "compression_training": {
        "weight_quantization": {
            "shared_parameters": {
                "enabled": true,
                "quantizer_kernel": false,
                "schedule_offset": 0,
                "quantize_groups": 1,
                "quantize_verbose": true,
                "quantization_type": "asymmetric",
                "quantize_weight_in_forward": false,
                "rounding": "nearest",
                "fp16_mixed_quantize": {
                    "enabled": false,
                    "quantize_change_ratio": 0.001
                }
            },
            "different_groups": {
                "wq1": {
                    "params": {
                        "start_bits": 12,
                        "target_bits": 8,
                        "quantization_period": 50
                    },
                    "modules": [
                        "conv1",
                        "conv2"
                    ]
                }
            }
        }
    }
}
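To see what the compression actually changed, I also inspected the weights after redundancy_clean (a quick sketch; conv1 matches the modules listed in the config above):

# Sketch: inspect a targeted layer's weights after compression
w = model.conv1.weight.detach()
print("dtype:", w.dtype)                            # if this is still torch.float32, no low-precision kernels are used
print("distinct values:", torch.unique(w).numel())  # small (e.g. <= 2**8) if the weights were quantized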
Am I doing something wrong when applying the DeepSpeed tools for post-training compression? Am I missing something?
Thanks in advance, Pablo.