How to convert standard densenet121 pretrained model to efficient version?
I tried to convert the pretrained DenseNet-121 model provided in
https://github.com/shicai/DenseNet-Caffe
to the efficient version, following your DenseBlock naming convention. I have the following prototxt (efficient_densenet121.prototxt) and a script that copies the params (from the standard DenseNet_121.prototxt and its corresponding caffemodel) up to the end of the first transition layer:
name: "densenet121_efficient"
input: "data"
input_dim: 1
input_dim: 3
input_dim: 224
input_dim: 224
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
convolution_param {
num_output: 64
bias_term: false
pad: 3
kernel_size: 7
stride: 2
}
}
layer {
name: "conv1/bn"
type: "BatchNorm"
bottom: "conv1"
top: "conv1/bn"
batch_norm_param {
eps: 1e-5
}
}
layer {
name: "conv1/scale"
type: "Scale"
bottom: "conv1/bn"
top: "conv1/bn"
scale_param {
bias_term: true
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1/bn"
top: "conv1/bn"
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1/bn"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
pad: 1
ceil_mode: false
}
}
# dense block 1
layer {
name: "denseblock1"
type: "DenseBlock"
bottom: "pool1"
top: "concat_2_6"
denseblock_param {
numTransition: 6
initChannel: 64
growthRate: 32
use_dropout: false
use_BC: true
BC_ultra_space_efficient: true
}
}
# transition layer 1
layer {
name: "conv2_blk/bn"
type: "BatchNorm"
bottom: "concat_2_6"
top: "conv2_blk/bn"
batch_norm_param {
eps: 1e-5
}
}
layer {
name: "conv2_blk/scale"
type: "Scale"
bottom: "conv2_blk/bn"
top: "conv2_blk/bn"
scale_param {
bias_term: true
}
}
layer {
name: "relu2_blk"
type: "ReLU"
bottom: "conv2_blk/bn"
top: "conv2_blk/bn"
}
layer {
name: "conv2_blk"
type: "Convolution"
bottom: "conv2_blk/bn"
top: "conv2_blk"
convolution_param {
num_output: 128
bias_term: false
kernel_size: 1
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2_blk"
top: "pool2"
pooling_param {
pool: AVE
kernel_size: 2
stride: 2
}
}
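Before copying any weights, a quick shape check can confirm that the efficient prototxt parses with the custom DenseBlock layer and that its first-block output has the expected shape (64 init channels + 6 transitions * growth rate 32 = 256 channels, 56x56 for a 224x224 input). This is just a minimal sketch I would run first; it assumes both prototxt files are in the working directory and that the standard prototxt also names the first block's concat blob 'concat_2_6', as the comparison at the end of the script below assumes:

import sys
sys.path.insert(0, './efficient-caffe/python')

import caffe

# load both definitions without weights; only the blob shapes matter here
net_eff = caffe.Net('./efficient_densenet121.prototxt', caffe.TEST)
net_std = caffe.Net('./DenseNet_121.prototxt', caffe.TEST)

# expect (N, 256, 56, 56) for both nets
print net_eff.blobs['concat_2_6'].data.shape
print net_std.blobs['concat_2_6'].data.shape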
Script to test copying the params:
import sys
sys.path.insert(0, './efficient-caffe/python')

import caffe
import numpy as np
from google.protobuf import text_format
import ipdb  # only used for interactive debugging
##
# @brief
#
# @param i ith denseblock, 1st denseblock prefix: conv2_, 2nd denseblock prefix: conv3_
# @param j jth subblock, prefix: convi_j/
# @param k which param in efficient denseblock
# 0: 3x3 filter, 1/2: scale/bias of scale layer before 1x1 conv
# 3/4: global mean/var of bn layer before 1x1 conv
# 5: 1x1 filter, 6/7: scale/bias of scale layer before 3x3 conv
# 8/9: global mean/var of bn layer before 3x3 conv
# 10: moving average factor of bn layers
#
# @return layer_name in standard source net
def find_src_layer(i, j, k):
    layer_name = 'conv{}_{}/'.format(i, j)
    idx = 0
    assert k in np.arange(11)
    if k == 0:
        layer_name += 'x2'
    elif k == 1:
        layer_name += 'x1/scale'
    elif k == 2:
        layer_name += 'x1/scale'
        idx = 1
    elif k == 3:
        layer_name += 'x1/bn'
    elif k == 4:
        layer_name += 'x1/bn'
        idx = 1
    elif k == 5:
        layer_name += 'x1'
    elif k == 6:
        layer_name += 'x2/scale'
    elif k == 7:
        layer_name += 'x2/scale'
        idx = 1
    elif k == 8:
        layer_name += 'x2/bn'
    elif k == 9:
        layer_name += 'x2/bn'
        idx = 1
    else:
        layer_name += 'x1/bn'
        idx = 2
    return layer_name, idx
caffe.set_mode_gpu()
net_dst = caffe.Net('./efficient_densenet121.prototxt', './DenseNet_121.caffemodel', caffe.TEST)
net_src = caffe.Net('./DenseNet_121.prototxt', './DenseNet_121.caffemodel', caffe.TEST)

net_dst_proto = caffe.proto.caffe_pb2.NetParameter()
with open('./efficient_densenet121.prototxt', 'rb') as fd:
    text_format.Merge(fd.read(), net_dst_proto)

j = 1
# copy denseblock params
for i, layer in enumerate(net_dst.layers):
    if layer.type == 'DenseBlock':
        # params are saved in layer.blobs
        layer_proto = net_dst_proto.layer[i - 1]
        repeat = layer_proto.denseblock_param.numTransition
        j += 1
        if layer_proto.denseblock_param.use_BC:
            for k, param in enumerate(layer.blobs):
                nth_param = k / repeat       # which kind of param (0..10), see find_src_layer
                nth_repeat = k % repeat + 1  # which subblock inside the dense block
                src_layer, nth = find_src_layer(j, nth_repeat, nth_param)
                # print src_layer, nth, nth_repeat, nth_param
                assert param.data.shape == net_src.params[src_layer][nth].data.shape or \
                    param.data.size == net_src.params[src_layer][nth].data.size
                param.data[:] = net_src.params[src_layer][nth].data.copy().reshape(param.data.shape)
        else:
            pass

        # sanity check
        inp = np.ones((1, 3, 224, 224))
        o_dst = net_dst.forward(data=inp)
        o_src = net_src.forward(data=inp)
        top_blob = net_dst.top_names[net_dst._layer_names[i]][0]  # 'concat_2_6' for the first denseblock
        print top_blob, net_dst.blobs[top_blob].data.mean(), net_src.blobs[top_blob].data.mean(), \
            net_dst.blobs[top_blob].data.std(), net_src.blobs[top_blob].data.std()
However, when I run the above script, the mean and std of the 'concat_2_6' blob do not match between the two nets. The difference is not large, but it clearly indicates a problem in how the parameters are copied. In particular, the output of the efficient version appears to be completely insensitive to the value of the last parameter blob of the DenseBlock layer. According to https://github.com/Tongcheng/caffe/blob/master/src/caffe/layers/DenseBlock_layer.cpp#L159 that blob corresponds to the batch norm layers' moving average factor. Is the reason that in the standard DenseNet all batch norm layers share the same moving average factor, so the efficient implementation keeps only a single copy of it? Even so, I still don't understand why its value doesn't affect the output, or what the correct mapping of parameters from the standard model to the efficient model should be.
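For reference, a plain Caffe BatchNorm layer stores three blobs: blobs[0] is the accumulated mean, blobs[1] the accumulated variance, and blobs[2] the moving-average scale factor; at test time the stored mean and variance are divided by blobs[2] before normalizing. A small sketch (variable names are mine, reusing net_src, net_dst and inp from the script above) to check (a) whether that factor really is identical across all BN layers of the standard model and (b) whether the efficient net's output reacts at all when the last DenseBlock blob is perturbed:

import numpy as np

# (a) gather blobs[2] (the moving-average factor) from every BatchNorm layer of
#     the standard net; BN layer names end with '/bn' in shicai's prototxt
factors = [net_src.params[name][2].data[0]
           for name in net_src.params if name.endswith('/bn')]
print set(np.round(factors, 6))  # a single value would support the "shared factor" guess

# (b) perturb the suspected moving-average blob of the DenseBlock and re-run forward
db_idx = list(net_dst._layer_names).index('denseblock1')
db_blobs = net_dst.layers[db_idx].blobs
net_dst.forward(data=inp)
before = net_dst.blobs['concat_2_6'].data.copy()
db_blobs[len(db_blobs) - 1].data[...] *= 10.0
net_dst.forward(data=inp)
after = net_dst.blobs['concat_2_6'].data.copy()
print np.abs(after - before).max()  # 0.0 means the forward pass ignores that blob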
Hi, have you solved this problem yet? I need the pretrained model of the efficient version too; could you help me? Thanks @ZhengRui
In the end I decided to use the PyTorch pretrained model instead: https://github.com/gpleiss/efficient_densenet_pytorch/issues/13
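For anyone else ending up here, a minimal sketch of that route, just pulling torchvision's pretrained DenseNet-121 and inspecting its state_dict; how to map those tensors onto the efficient implementation is what the linked issue discusses, and the commented load call is an assumption on my side, not a confirmed API of that repo:

from torchvision import models

# standard pretrained DenseNet-121 from torchvision; its state_dict already
# contains every conv/BN weight the efficient implementation also needs
pretrained = models.densenet121(pretrained=True)
state = pretrained.state_dict()
print(len(state))  # number of parameter/buffer tensors

# hypothetical: if the efficient model's module names line up (depends on the
# version of efficient_densenet_pytorch), a non-strict load may be enough
# efficient_model.load_state_dict(state, strict=False)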