MobileNet Example (MaxPool Version) doesn't work
Hello! I have successfully completed the following workflow:
- Extract the Brevitas model from the Brevitas Example in ONNX format, using a slightly modified version of the
`test_brevitas_mobilenet` function at finn/tests/brevitas/test_brevitas_mobilenet.py.
- Compile the extracted ONNX graph to hardware following the example at build/mobilenet-v1.
In this process, I wanted to make slight modifications to the Brevitas model at Brevitas Example by replacing TruncAvgPool2d with QuantMaxPool2d. Then, I attempted to extract the modified model in ONNX format and synthesize it to HW following the example at finn-examples/build/mobilenet-v1.
However, during the partitioning process (I'm not exactly sure what this operation entails), I encountered `AssertionError: cycle-free graph violated: partition depends on itself` (the assertion fires because `self.partitioning(node) != partition_id` evaluates to False).
I'm wondering what this error means and how I can successfully synthesize the modified model to hardware. Please help!
Let me show you some details.
Modified Brevitas Model(only the modified parts are provided compared to Brevitas Example):
class MobileNet_ori(nn.Module):
    """Quantized MobileNet-v1 with the final TruncAvgPool2d replaced by
    QuantMaxPool2d (the modification under discussion in this thread).

    NOTE(review): since the switch to max pooling, `round_average_pool` and
    `last_layer_bit_width`'s pooling role no longer apply; the parameters are
    kept so the constructor signature stays backward compatible.
    """

    def __init__(
            self,
            channels,
            first_stage_stride,
            act_bit_width,
            weight_bit_width,
            round_average_pool=True,  # unused since the max-pool switch
            weight_quant=CommonIntWeightPerChannelQuant,
            first_layer_bit_width=8,
            first_layer_weight_quant=CommonIntWeightPerChannelQuant,
            last_layer_weight_quant=CommonIntWeightPerTensorQuant,
            last_layer_bit_width=8,
            avg_pool_kernel_size=7,
            first_layer_stride=2,
            in_channels=3,
            num_classes=1000):
        super(MobileNet_ori, self).__init__()
        init_block_channels = channels[0][0]
        self.features = Sequential()
        # Stem: full 3x3 convolution, quantized at first_layer_bit_width.
        init_block = ConvBlock(
            in_channels=in_channels,
            out_channels=init_block_channels,
            kernel_size=3,
            stride=first_layer_stride,
            weight_bit_width=first_layer_bit_width,
            weight_quant=first_layer_weight_quant,
            act_bit_width=act_bit_width,
            activation_scaling_per_channel=True)
        self.features.add_module('init_block', init_block)
        in_channels = init_block_channels
        for i, channels_per_stage in enumerate(channels[1:]):
            stage = Sequential()
            # Per-channel activation scaling everywhere except the last stage.
            pw_activation_scaling_per_channel = i < len(channels[1:]) - 1
            for j, out_channels in enumerate(channels_per_stage):
                # Downsample on the first unit of each stage (stage 0 only
                # when first_stage_stride is set).
                stride = 2 if (j == 0) and ((i != 0) or first_stage_stride) else 1
                mod = DwsConvBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    stride=stride,
                    act_bit_width=act_bit_width,
                    weight_bit_width=weight_bit_width,
                    weight_quant=weight_quant,
                    pw_activation_scaling_per_channel=pw_activation_scaling_per_channel)
                stage.add_module('unit{}'.format(j + 1), mod)
                in_channels = out_channels
            self.features.add_module('stage{}'.format(i + 1), stage)
        # Modification under discussion: QuantMaxPool2d instead of the
        # original TruncAvgPool2d(kernel_size=avg_pool_kernel_size, stride=1,
        # bit_width=last_layer_bit_width,
        # float_to_int_impl_type='ROUND' if round_average_pool else 'FLOOR').
        self.final_pool = QuantMaxPool2d(
            kernel_size=avg_pool_kernel_size,
            stride=1)
        self.output = QuantLinear(
            in_channels,
            num_classes,
            bias=True,
            bias_quant=IntBias,
            weight_quant=last_layer_weight_quant,
            weight_bit_width=last_layer_bit_width)

    def forward(self, x):
        """Backbone -> global max pool -> flatten -> quantized classifier."""
        x = self.features(x)
        x = self.final_pool(x)
        x = x.view(x.size(0), -1)
        out = self.output(x)
        return out
def quant_mobilenet_v1_ori():
    """Build the modified (max-pool) quantized MobileNet-v1 with the stock
    channel configuration and uniform 8-bit weights/activations."""
    per_stage_channels = [
        [32], [64], [128, 128], [256, 256],
        [512, 512, 512, 512, 512, 512], [1024, 1024],
    ]
    width_scale = 1.0
    bit_width = 8
    if width_scale != 1.0:
        # Uniformly thin every stage when a non-unit width multiplier is used.
        per_stage_channels = [
            [int(c * width_scale) for c in stage] for stage in per_stage_channels
        ]
    return MobileNet_ori(
        channels=per_stage_channels,
        first_stage_stride=False,
        round_average_pool=False,
        act_bit_width=bit_width,
        weight_bit_width=bit_width,
        last_layer_bit_width=bit_width)
Modified test_brevitas_mobilenet function at finn/tests/brevitas/test_brevitas_mobilenet.py
def test_brevitas_mobilenet_custom_dirty(mobilenet, img_torch, img_size, model_name):
    """Export a Brevitas model and a preprocessing graph to QONNX, convert
    both to FINN-ONNX, streamline, merge them, then execute the merged graph
    and check it reproduces the PyTorch top-5 prediction.

    NOTE(review): the ONNX execution input is the module-level global
    `img_np`, not something derived from `img_torch` — confirm the two stay
    in sync at the call site. The transform order below matters (e.g. `a0`
    must be read before AbsorbScalarMulAddIntoTopK consumes the Mul
    initializer); do not reorder.
    """
    mobilenet = mobilenet.eval()
    # --- export the preprocessing (normalization) graph --------------------
    export_onnx_path = make_build_dir("test_brevitas_"+ model_name +"-custom_")
    print("export_onnx_path:", export_onnx_path)
    preproc_onnx = export_onnx_path + "/quant_"+model_name+"-custom_preproc.onnx"
    print("preproc_onnx:",preproc_onnx)
    lower_bound = 0.406
    upper_bound = 0.485
    ch = img_size[0]
    random_list = np.random.uniform(low=lower_bound, high=upper_bound, size=ch).tolist()
    # Use the standard ImageNet mean only for 3-channel input; correct
    # inference accuracy is not of interest here, so random values are fine
    # for any other channel count.
    mean = random_list if ch!=3 else [0.485, 0.456, 0.406]
    std = 0.226
    preproc = NormalizePreProc(mean, std, ch)
    print("ch is",ch,"H is",img_size[1],"W is",img_size[2])
    export_qonnx(preproc, torch.randn(1, ch, img_size[1], img_size[2]), preproc_onnx)
    qonnx_cleanup(preproc_onnx, out_file=preproc_onnx)
    preproc_model = ModelWrapper(preproc_onnx)
    preproc_model = preproc_model.transform(ConvertQONNXtoFINN())
    # set input finn datatype to UINT8 (raw 8-bit image data)
    preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType["UINT8"])
    preproc_model = preproc_model.transform(InferShapes())
    preproc_model = preproc_model.transform(GiveUniqueNodeNames())
    preproc_model = preproc_model.transform(GiveUniqueParameterTensors())
    preproc_model = preproc_model.transform(GiveReadableTensorNames())
    # --- export the model itself -------------------------------------------
    finn_onnx = export_onnx_path + "/quant_"+model_name+"-custom_exported.onnx"
    # mobilenet = get_test_model_trained("mobilenet", 4, 4)
    export_qonnx(mobilenet, torch.randn(1, ch, img_size[1], img_size[2]), finn_onnx)
    qonnx_cleanup(finn_onnx, out_file=finn_onnx)
    # --- reference forward pass in PyTorch/Brevitas ------------------------
    input_tensor = preproc.forward(img_torch)
    expected = mobilenet.forward(input_tensor).detach().numpy()
    expected_topk = expected.flatten()
    expected_top5 = np.argsort(expected_topk)[-5:]
    expected_top5 = np.flip(expected_top5)
    expected_top5_prob = []
    for index in expected_top5:
        expected_top5_prob.append(expected_topk[index])
    # --- convert, streamline, and merge ------------------------------------
    model = ModelWrapper(finn_onnx)
    model = model.transform(ConvertQONNXtoFINN())
    model = model.transform(InferShapes())
    model = model.transform(FoldConstants())
    model = model.transform(InsertTopK())
    # get initializer from Mul that will be absorbed into topk; must be read
    # before AbsorbScalarMulAddIntoTopK removes it from the graph
    a0 = model.get_initializer(model.graph.node[-2].input[1])
    model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())
    model = model.transform(InferDataLayouts())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveUniqueParameterTensors())
    model = model.transform(GiveReadableTensorNames())
    model.save(export_onnx_path + "/quant_"+model_name+"-custom_wo_preproc.onnx")
    model = model.transform(MergeONNXModels(preproc_model))
    model.save(export_onnx_path + "/quant_"+model_name+"-custom.onnx")
    # --- execute merged graph and compare against the PyTorch reference ----
    idict = {model.graph.input[0].name: img_np}
    odict = oxe.execute_onnx(model, idict, True)
    produced = odict[model.graph.output[0].name]
    produced_prob = odict["TopK_0_out0"] * a0
    print(produced.flatten(), expected_top5)
    assert (produced.flatten() == expected_top5).all()
    assert np.isclose(produced_prob.flatten(), expected_top5_prob, atol=2.2 * 1e-1).all()
    return model
# Random uint8-range RGB image. NOTE(review): `img_np` is also consumed as a
# module-level global inside test_brevitas_mobilenet_custom_dirty (for the
# ONNX execution input), so it must match `img_torch`.
img_np = np.random.randint(0, 256, size=(1, 3, 224, 224)).astype(np.float32)
img_torch = torch.from_numpy(img_np).float()
model = test_brevitas_mobilenet_custom_dirty(quant_mobilenet_v1_ori(), img_torch = img_torch, img_size = (3,224,224), model_name = "mobilenet_v1_max")
The error message:
Running step: step_mobilenet_streamline [1/14]
Running step: step_mobilenet_lower_convs [2/14]
Running step: step_mobilenet_convert_to_hls_layers_separate_th [3/14]
Running step: step_create_dataflow_partition [4/14]
Traceback (most recent call last):
File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/src/finn/builder/build_dataflow.py", line 177, in build_dataflow_cfg
model = transform_step(model, cfg)
File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/src/finn/builder/build_dataflow_steps.py", line 379, in step_create_dataflow_partition
parent_model = model.transform(
File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/deps/qonnx/src/qonnx/core/modelwrapper.py", line 146, in transform
(transformed_model, model_was_changed) = transformation.apply(transformed_model)
File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/src/finn/transformation/fpgadataflow/create_dataflow_partition.py", line 80, in apply
parent_model = model.transform(
File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/deps/qonnx/src/qonnx/core/modelwrapper.py", line 146, in transform
(transformed_model, model_was_changed) = transformation.apply(transformed_model)
File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/deps/qonnx/src/qonnx/transformation/create_generic_partitions.py", line 120, in apply
assert (
AssertionError: cycle-free graph violated: partition depends on itself
Hi,
this error usually indicates that your graph contains incompatible nodes that FINN could not convert to HLS backend nodes. If any such node remains in the graph, dataflow partitioning fails.
Can you share an image of your onnx graph after the "convert_to_hls" step(s)?
Thanks for your reply.
Here is my graph image. The onnx file name is step_mobilenet_convert_to_hls_layers_separate_th.onnx.
I see 3 problems:
- The first conv layer (Im2Col->MatMul) is not converted to HLS, most likely because its input doesn't have a quantized datatype annotation and FINN doesn't support floating point. Did you already remove parts of the input quantization from this graph? If the input is already quantized (e.g. UINT8 for RGB image data), you may need to annotate this manually. Also, the first Mul will need to be removed or streamlined into a MultiThreshold, otherwise it will result in a floating point datatype at its output.
- The Transpose->Transpose in the middle should cancel each other out, so you can streamline them with AbsorbConsecutiveTransposes
- The Transpose->Reshape near the end looks like the transition between the conv part and the dense classification layer. You should be able to streamline it with RemoveCNVtoFCFlatten
Hi.
I work with 2 models MobileNetV1v4_noinitConv, MobileNetV1v4_initConv.
Only difference of these is that MobileNetV1v4_initConv has init Conv layer but MobileNetV1v4_noinitConv has not.
class CommonIntWeightPerTensorQuant(Int8WeightPerTensorFloat):
    """
    Common per-tensor weight quantizer with bit-width set to None so that it's forced to be
    specified by each layer.
    """
    # Lower bound on the learned scale to avoid degenerate/zero scales.
    scaling_min_val = 2e-16
    bit_width = None
class CommonIntWeightPerChannelQuant(CommonIntWeightPerTensorQuant):
    """
    Common per-channel weight quantizer with bit-width set to None so that it's forced to be
    specified by each layer.
    """
    # Same as the per-tensor base, but with one scale per output channel.
    scaling_per_output_channel = True
class CommonIntActQuant(Int8ActPerTensorFloat):
    """
    Common signed act quantizer with bit-width set to None so that it's forced to be specified by
    each layer.
    """
    # Lower bound on the learned scale to avoid degenerate/zero scales.
    scaling_min_val = 2e-16
    bit_width = None
    # Scale parameterized in the log domain (LOG_FP restriction).
    restrict_scaling_type = RestrictValueType.LOG_FP
class CommonUintActQuant(Uint8ActPerTensorFloat):
    """
    Common unsigned act quantizer with bit-width set to None so that it's forced to be specified by
    each layer.
    """
    # Lower bound on the learned scale to avoid degenerate/zero scales.
    scaling_min_val = 2e-16
    bit_width = None
    # Scale parameterized in the log domain (LOG_FP restriction).
    restrict_scaling_type = RestrictValueType.LOG_FP
class DwsConvBlock(nn.Module):
    """Depthwise-separable convolution: a 3x3 depthwise ConvBlock followed by
    a 1x1 pointwise ConvBlock, each with its own BN + quantized ReLU."""

    def __init__(
            self,
            in_channels,
            out_channels,
            stride,
            act_bit_width,
            weight_bit_width,
            weight_quant=CommonIntWeightPerChannelQuant,
            pw_activation_scaling_per_channel=False):
        super(DwsConvBlock, self).__init__()
        # Depthwise stage: groups == in_channels, 3x3 spatial filtering,
        # carries the (optional) stride.
        self.dw_conv = ConvBlock(
            in_channels=in_channels,
            out_channels=in_channels,
            groups=in_channels,
            kernel_size=3,
            padding=1,
            stride=stride,
            weight_bit_width=weight_bit_width,
            weight_quant=weight_quant,
            act_bit_width=act_bit_width)
        # Pointwise stage: 1x1 projection to out_channels; per-channel
        # activation scaling is configurable here.
        self.pw_conv = ConvBlock(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            padding=0,
            weight_bit_width=weight_bit_width,
            weight_quant=weight_quant,
            act_bit_width=act_bit_width,
            activation_scaling_per_channel=pw_activation_scaling_per_channel)

    def forward(self, x):
        """Apply the depthwise stage, then the pointwise stage."""
        return self.pw_conv(self.dw_conv(x))
class ConvBlock(nn.Module):
    """Quantized Conv2d -> BatchNorm2d -> quantized ReLU building block."""

    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            weight_bit_width,
            act_bit_width,
            stride=1,
            padding=0,
            groups=1,
            bn_eps=1e-5,
            weight_quant=CommonIntWeightPerChannelQuant,
            activation_scaling_per_channel=False):
        super(ConvBlock, self).__init__()
        # Bias-free quantized convolution; BN below supplies the affine shift.
        self.conv = QuantConv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=False,
            weight_quant=weight_quant,
            weight_bit_width=weight_bit_width)
        self.bn = nn.BatchNorm2d(num_features=out_channels, eps=bn_eps)
        # Unsigned activation quantizer; optionally one scale per channel.
        self.activation = QuantReLU(
            act_quant=CommonUintActQuant,
            bit_width=act_bit_width,
            per_channel_broadcastable_shape=(1, out_channels, 1, 1),
            scaling_stats_permute_dims=(1, 0, 2, 3),
            scaling_per_output_channel=activation_scaling_per_channel,
            return_quant_tensor=True)

    def forward(self, x):
        """Conv -> BN -> quantized ReLU."""
        return self.activation(self.bn(self.conv(x)))
class MobileNetV1v4_noinitConv(nn.Module):
    """Reduced MobileNet-v1 variant WITHOUT the initial full convolution:
    three depthwise-separable stages, a 16x16 quantized max pool, and a
    4-bit quantized linear classifier with 8 outputs."""

    def __init__(
            self,
            weight_quant=CommonIntWeightPerChannelQuant,
            first_layer_weight_quant=CommonIntWeightPerChannelQuant,
            last_layer_weight_quant=CommonIntWeightPerTensorQuant):
        super(MobileNetV1v4_noinitConv, self).__init__()
        # No init_block here — the network starts directly with a depthwise
        # separable stage (the only difference vs. MobileNetV1v4_initConv).
        self.DWConv1 = DwsConvBlock(
            in_channels=64, out_channels=64, stride=2,
            act_bit_width=4, weight_bit_width=4,
            weight_quant=weight_quant,
            pw_activation_scaling_per_channel=True)
        self.DWConv2 = DwsConvBlock(
            in_channels=64, out_channels=128, stride=2,
            act_bit_width=4, weight_bit_width=4,
            weight_quant=weight_quant,
            pw_activation_scaling_per_channel=True)
        self.DWConv3 = DwsConvBlock(
            in_channels=128, out_channels=256, stride=2,
            act_bit_width=4, weight_bit_width=4,
            weight_quant=weight_quant,
            pw_activation_scaling_per_channel=False)
        self.final_pool = QuantMaxPool2d(kernel_size=16, stride=1)
        self.output = QuantLinear(
            256, 8, bias=True, bias_quant=IntBias,
            weight_quant=last_layer_weight_quant, weight_bit_width=4)

    def forward(self, x):
        """Three DWS stages -> global max pool -> flatten -> classifier."""
        x = self.DWConv1(x)
        x = self.DWConv2(x)
        x = self.DWConv3(x)
        pooled = self.final_pool(x)
        flat = pooled.view(pooled.size(0), -1)
        return self.output(flat)
class MobileNetV1v4_initConv(nn.Module):
    """Reduced MobileNet-v1 variant WITH an initial full convolution stem,
    followed by three depthwise-separable stages, an 8x8 quantized max pool,
    and a 4-bit quantized linear classifier with 8 outputs."""

    def __init__(
            self,
            weight_quant=CommonIntWeightPerChannelQuant,
            first_layer_weight_quant=CommonIntWeightPerChannelQuant,
            last_layer_weight_quant=CommonIntWeightPerTensorQuant):
        super(MobileNetV1v4_initConv, self).__init__()
        # Stem: 8-bit-weight full 3x3 convolution (the only difference vs.
        # MobileNetV1v4_noinitConv).
        self.init_block = ConvBlock(
            in_channels=64, out_channels=64, kernel_size=3, stride=2,
            weight_bit_width=8, weight_quant=first_layer_weight_quant,
            act_bit_width=4, activation_scaling_per_channel=True)
        self.DWConv1 = DwsConvBlock(
            in_channels=64, out_channels=64, stride=2,
            act_bit_width=4, weight_bit_width=4,
            weight_quant=weight_quant,
            pw_activation_scaling_per_channel=True)
        self.DWConv2 = DwsConvBlock(
            in_channels=64, out_channels=128, stride=2,
            act_bit_width=4, weight_bit_width=4,
            weight_quant=weight_quant,
            pw_activation_scaling_per_channel=True)
        self.DWConv3 = DwsConvBlock(
            in_channels=128, out_channels=256, stride=2,
            act_bit_width=4, weight_bit_width=4,
            weight_quant=weight_quant,
            pw_activation_scaling_per_channel=False)
        self.final_pool = QuantMaxPool2d(kernel_size=8, stride=1)
        self.output = QuantLinear(
            256, 8, bias=True, bias_quant=IntBias,
            weight_quant=last_layer_weight_quant, weight_bit_width=4)

    def forward(self, x):
        """Stem -> three DWS stages -> global max pool -> flatten -> classifier."""
        x = self.init_block(x)
        x = self.DWConv1(x)
        x = self.DWConv2(x)
        x = self.DWConv3(x)
        pooled = self.final_pool(x)
        flat = pooled.view(pooled.size(0), -1)
        return self.output(flat)
And onnx graph after "convert_to_hls" step of MobileNetV1v4_noinitConv, MobileNetV1v4_initConv.
left: MobileNetV1v4_noinitConv right: MobileNetV1v4_initConv
As you can see, first layer of MobileNetV1v4_noinitConv is not converted to hls.
But MobileNetV1v4_initConv is successfully converted to hls, and synthesized to HW very well.
And I export these model with same function test_brevitas_mobilenet_custom_dirty shown in my first question block.
I think there is no meaningful difference between them — the only change is whether the first conv layer is a depthwise conv or a regular conv. Shouldn't this be trivial for the HLS conversion?
Thank you.