finn-examples MobileNet example (MaxPool version) doesn't work

Hello! I have successfully completed the following workflow:

Extract Brevitas model at Brevitas Example in ONNX format using a slightly modified version of the test_brevitas_mobilenet function at finn/tests/brevitas/test_brevitas_mobilenet.py.
Compile the extracted ONNX Graph to HW following the example at build/mobilenet-v1.

Next, I wanted to make a slight modification to the Brevitas model at Brevitas Example by replacing TruncAvgPool2d with QuantMaxPool2d. I then attempted to export the modified model in ONNX format and synthesize it to HW following the example at finn-examples/build/mobilenet-v1.

However, during the partitioning step (I'm not exactly sure what this operation entails), I encountered the error "AssertionError: cycle-free graph violated: partition depends on itself" (the assertion fires because self.partitioning(node) != partition_id evaluates to False). I would like to understand what this error means and how I can successfully synthesize the modified model to HW. Please help!

Let me show you some details.

Modified Brevitas Model(only the modified parts are provided compared to Brevitas Example):

class MobileNet_ori(nn.Module):
    """Quantized MobileNet-V1 variant whose final pooling layer is a QuantMaxPool2d.

    The network consists of an initial ``ConvBlock``, a sequence of stages
    built from ``DwsConvBlock`` units, a ``QuantMaxPool2d`` and a
    ``QuantLinear`` classifier. Compared to the version it was derived from,
    the final ``TruncAvgPool2d(kernel_size=avg_pool_kernel_size, stride=1,
    bit_width=last_layer_bit_width, float_to_int_impl_type=...)`` has been
    replaced by ``QuantMaxPool2d(kernel_size=avg_pool_kernel_size, stride=1)``.

    Args:
        channels: Nested list of channel counts. ``channels[0][0]`` is the
            output width of the initial block; each following sub-list
            describes one stage, with one entry per ``DwsConvBlock``.
        first_stage_stride: If truthy, the first unit of the first stage uses
            stride 2. The first unit of every later stage always uses stride 2.
        act_bit_width: Activation bit width passed to every block.
        weight_bit_width: Weight bit width of the depthwise-separable blocks.
        round_average_pool: Unused. It only selected the rounding mode of the
            replaced ``TruncAvgPool2d``; it is kept so existing callers that
            pass it keep working.
        weight_quant: Weight quantizer of the depthwise-separable blocks.
        first_layer_bit_width: Weight bit width of the initial block.
        first_layer_weight_quant: Weight quantizer of the initial block.
        last_layer_weight_quant: Weight quantizer of the classifier.
        last_layer_bit_width: Weight bit width of the classifier.
        avg_pool_kernel_size: Kernel size of the final pooling layer (the
            name is kept from the average-pool version).
        first_layer_stride: Stride of the initial block.
        in_channels: Number of channels of the network input.
        num_classes: Number of outputs of the classifier.
    """

    def __init__(
            self,
            channels,
            first_stage_stride,
            act_bit_width,
            weight_bit_width,
            round_average_pool=True,
            weight_quant=CommonIntWeightPerChannelQuant,
            first_layer_bit_width=8,
            first_layer_weight_quant=CommonIntWeightPerChannelQuant,
            last_layer_weight_quant=CommonIntWeightPerTensorQuant,
            last_layer_bit_width=8,
            avg_pool_kernel_size=7,
            first_layer_stride=2,
            in_channels=3,
            num_classes=1000):
        super().__init__()
        init_block_channels = channels[0][0]

        self.features = Sequential()
        init_block = ConvBlock(
            in_channels=in_channels,
            out_channels=init_block_channels,
            kernel_size=3,
            stride=first_layer_stride,
            weight_bit_width=first_layer_bit_width,
            weight_quant=first_layer_weight_quant,
            act_bit_width=act_bit_width,
            activation_scaling_per_channel=True)
        self.features.add_module('init_block', init_block)
        in_channels = init_block_channels

        stage_channels = channels[1:]
        last_stage_index = len(stage_channels) - 1
        for i, channels_per_stage in enumerate(stage_channels):
            stage = Sequential()
            # Every stage except the last one uses per-channel activation
            # scaling on its pointwise convolutions.
            pw_activation_scaling_per_channel = i < last_stage_index
            for j, out_channels in enumerate(channels_per_stage):
                # Only the first unit of a stage downsamples; for the very
                # first stage this is controlled by first_stage_stride.
                stride = 2 if (j == 0) and ((i != 0) or first_stage_stride) else 1
                mod = DwsConvBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    stride=stride,
                    act_bit_width=act_bit_width,
                    weight_bit_width=weight_bit_width,
                    weight_quant=weight_quant,
                    pw_activation_scaling_per_channel=pw_activation_scaling_per_channel)
                stage.add_module('unit{}'.format(j + 1), mod)
                in_channels = out_channels
            self.features.add_module('stage{}'.format(i + 1), stage)

        self.final_pool = QuantMaxPool2d(
            kernel_size=avg_pool_kernel_size,
            stride=1)
        self.output = QuantLinear(
            in_channels,
            num_classes,
            bias=True,
            bias_quant=IntBias,
            weight_quant=last_layer_weight_quant,
            weight_bit_width=last_layer_bit_width)

    def forward(self, x):
        """Run features, final pooling and the classifier on ``x``."""
        x = self.features(x)
        x = self.final_pool(x)
        # Flatten everything but the batch dimension for the linear layer.
        x = x.view(x.size(0), -1)
        out = self.output(x)
        return out


def quant_mobilenet_v1_ori():
    """Build the 8-bit, width-scale-1.0 ``MobileNet_ori`` configuration.

    Returns:
        A freshly constructed ``MobileNet_ori`` instance.
    """
    scale = 1.0
    n_bits = 8
    stage_widths = [[32], [64], [128, 128], [256, 256], [512] * 6, [1024, 1024]]

    # Only rescale the channel counts for a non-default width multiplier.
    if scale != 1.0:
        stage_widths = [[int(width * scale) for width in stage] for stage in stage_widths]

    return MobileNet_ori(
        channels=stage_widths,
        first_stage_stride=False,
        act_bit_width=n_bits,
        weight_bit_width=n_bits,
        round_average_pool=False,
        last_layer_bit_width=n_bits)

Modified test_brevitas_mobilenet function at finn/tests/brevitas/test_brevitas_mobilenet.py

def test_brevitas_mobilenet_custom_dirty(mobilenet, img_torch, img_size, model_name):
    """Export a Brevitas model to FINN-ONNX and compare its top-5 output with PyTorch.

    The function exports a ``NormalizePreProc`` pre-processing graph and the
    given network via ``export_qonnx``, converts both with
    ``ConvertQONNXtoFINN``, appends a TopK node, merges the pre-processing
    graph in front of the network and executes the merged graph with
    ``oxe.execute_onnx``. The result is compared against a forward pass of
    ``preproc`` + ``mobilenet`` in PyTorch.

    Args:
        mobilenet: The Brevitas module to export; it is switched to eval mode.
        img_torch: Input tensor for the PyTorch forward pass. The same data,
            converted to a NumPy array, is fed to the merged ONNX model.
        img_size: Sequence ``(channels, height, width)`` used for the dummy
            export inputs.
        model_name: Name fragment used for the build directory and file names.

    Returns:
        The merged (pre-processing + network) ``ModelWrapper``.

    Raises:
        AssertionError: If the top-5 indices differ, or the top-5 values are
            not within ``atol=0.22`` of the PyTorch result.
    """
    mobilenet = mobilenet.eval()
    ch = img_size[0]

    # Derive the ONNX input from the argument. The previous version read a
    # module-level global named ``img_np`` here, which made the function
    # depend on state defined outside of it.
    img_np = img_torch.detach().cpu().numpy()

    # export preprocess
    export_onnx_path = make_build_dir(f"test_brevitas_{model_name}-custom_")
    print("export_onnx_path:", export_onnx_path)
    file_prefix = f"{export_onnx_path}/quant_{model_name}-custom"
    preproc_onnx = f"{file_prefix}_preproc.onnx"
    print("preproc_onnx:", preproc_onnx)

    if ch != 3:
        # Accuracy is not of interest for non-RGB inputs, so random
        # per-channel means are used.
        lower_bound = 0.406
        upper_bound = 0.485
        mean = np.random.uniform(low=lower_bound, high=upper_bound, size=ch).tolist()
    else:
        mean = [0.485, 0.456, 0.406]
    std = 0.226
    preproc = NormalizePreProc(mean, std, ch)
    print("ch is", ch, "H is", img_size[1], "W is", img_size[2])
    export_qonnx(preproc, torch.randn(1, ch, img_size[1], img_size[2]), preproc_onnx)
    qonnx_cleanup(preproc_onnx, out_file=preproc_onnx)
    preproc_model = ModelWrapper(preproc_onnx)
    preproc_model = preproc_model.transform(ConvertQONNXtoFINN())
    # set input finn datatype to UINT8
    preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType["UINT8"])
    preproc_model = preproc_model.transform(InferShapes())
    preproc_model = preproc_model.transform(GiveUniqueNodeNames())
    preproc_model = preproc_model.transform(GiveUniqueParameterTensors())
    preproc_model = preproc_model.transform(GiveReadableTensorNames())

    finn_onnx = f"{file_prefix}_exported.onnx"
    export_qonnx(mobilenet, torch.randn(1, ch, img_size[1], img_size[2]), finn_onnx)
    qonnx_cleanup(finn_onnx, out_file=finn_onnx)

    # do forward pass in PyTorch/Brevitas
    input_tensor = preproc.forward(img_torch)
    expected = mobilenet.forward(input_tensor).detach().numpy()
    expected_topk = expected.flatten()
    # argsort is ascending, so take the last five entries and reverse them
    # to get the top-5 indices in descending order of score.
    expected_top5 = np.flip(np.argsort(expected_topk)[-5:])
    expected_top5_prob = [expected_topk[index] for index in expected_top5]

    model = ModelWrapper(finn_onnx)
    model = model.transform(ConvertQONNXtoFINN())
    model = model.transform(InferShapes())
    model = model.transform(FoldConstants())
    model = model.transform(InsertTopK())
    # get initializer from Mul that will be absorbed into topk
    a0 = model.get_initializer(model.graph.node[-2].input[1])
    model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())
    model = model.transform(InferDataLayouts())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveUniqueParameterTensors())
    model = model.transform(GiveReadableTensorNames())
    model.save(f"{file_prefix}_wo_preproc.onnx")
    model = model.transform(MergeONNXModels(preproc_model))
    model.save(f"{file_prefix}.onnx")

    idict = {model.graph.input[0].name: img_np}
    odict = oxe.execute_onnx(model, idict, True)
    produced = odict[model.graph.output[0].name]
    # The scalar Mul was absorbed into TopK, so re-apply its factor to make
    # the TopK values comparable with the PyTorch outputs.
    produced_prob = odict["TopK_0_out0"] * a0
    print(produced.flatten(), expected_top5)
    assert (produced.flatten() == expected_top5).all()
    assert np.isclose(produced_prob.flatten(), expected_top5_prob, atol=2.2 * 1e-1).all()

    return model

# Random 1x3x224x224 input holding integer values in [0, 255] stored as float32.
img_np = np.random.randint(low=0, high=256, size=(1, 3, 224, 224)).astype(np.float32)
img_torch = torch.from_numpy(img_np).float()
model = test_brevitas_mobilenet_custom_dirty(
    quant_mobilenet_v1_ori(),
    img_torch=img_torch,
    img_size=(3, 224, 224),
    model_name="mobilenet_v1_max",
)

The error message:

Running step: step_mobilenet_streamline [1/14]
Running step: step_mobilenet_lower_convs [2/14]
Running step: step_mobilenet_convert_to_hls_layers_separate_th [3/14]
Running step: step_create_dataflow_partition [4/14]
Traceback (most recent call last):
  File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/src/finn/builder/build_dataflow.py", line 177, in build_dataflow_cfg
    model = transform_step(model, cfg)
  File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/src/finn/builder/build_dataflow_steps.py", line 379, in step_create_dataflow_partition
    parent_model = model.transform(
  File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/deps/qonnx/src/qonnx/core/modelwrapper.py", line 146, in transform
    (transformed_model, model_was_changed) = transformation.apply(transformed_model)
  File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/src/finn/transformation/fpgadataflow/create_dataflow_partition.py", line 80, in apply
    parent_model = model.transform(
  File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/deps/qonnx/src/qonnx/core/modelwrapper.py", line 146, in transform
    (transformed_model, model_was_changed) = transformation.apply(transformed_model)
  File "/home/vboxuser/Desktop/FINN/finn-examples-main/build/finn/deps/qonnx/src/qonnx/transformation/create_generic_partitions.py", line 120, in apply
    assert (
AssertionError: cycle-free graph violated: partition depends on itself

Mar 28 '24 09:03 ppccww0201

Hi,

this error usually indicates that your graph contains incompatible nodes that FINN could not convert to HLS backend nodes. If any such node remains in the graph, dataflow partitioning fails.

Can you share an image of your onnx graph after the "convert_to_hls" step(s)?

Mar 31 '24 17:03 fpjentzsch

Thanks for your reply.

Here is my graph image. The onnx file name is step_mobilenet_convert_to_hls_layers_separate_th.onnx.

Apr 01 '24 04:04 ppccww0201

I see 3 problems:

The first conv layer (Im2Col->MatMul) is not converted to HLS, most likely because its input doesn't have a quantized datatype annotation and FINN doesn't support floating point. Did you already remove parts of the input quantization from this graph? If the input is already quantized (e.g. UINT8 for RGB image data), you may need to annotate this manually. Also, the first Mul will need to be removed or streamlined into a MultiThreshold, otherwise it will result in a floating point datatype at its output.
The Transpose->Transpose in the middle should cancel each other out, so you can streamline them with AbsorbConsecutiveTransposes
The Transpose->Reshape near the end looks like the transition between the conv part and the dense classification layer. You should be able to streamline it with RemoveCNVtoFCFlatten

Apr 02 '24 07:04 fpjentzsch

I added AbsorbConsecutiveTransposes and RemoveCNVtoFCFlatten, and now it works!!

Thank you so much!

Apr 02 '24 14:04 ppccww0201

Hi.

I am working with two models, MobileNetV1v4_noinitConv and MobileNetV1v4_initConv. The only difference between them is that MobileNetV1v4_initConv has an initial Conv layer, whereas MobileNetV1v4_noinitConv does not.

class CommonIntWeightPerTensorQuant(Int8WeightPerTensorFloat):
    """
    Shared per-tensor weight quantizer.

    ``bit_width`` is deliberately reset to None so that each layer using this
    quantizer is forced to specify its own bit width.
    """
    bit_width = None
    scaling_min_val = 2e-16


class CommonIntWeightPerChannelQuant(CommonIntWeightPerTensorQuant):
    """
    Common per-channel weight quantizer with bit-width set to None so that it's forced to be
    specified by each layer.

    Identical to ``CommonIntWeightPerTensorQuant`` except that
    ``scaling_per_output_channel`` is enabled.
    """
    scaling_per_output_channel = True


class CommonIntActQuant(Int8ActPerTensorFloat):
    """
    Shared signed activation quantizer.

    ``bit_width`` is deliberately reset to None so that each layer using this
    quantizer is forced to specify its own bit width.
    """
    bit_width = None
    restrict_scaling_type = RestrictValueType.LOG_FP
    scaling_min_val = 2e-16


class CommonUintActQuant(Uint8ActPerTensorFloat):
    """
    Shared unsigned activation quantizer.

    ``bit_width`` is deliberately reset to None so that each layer using this
    quantizer is forced to specify its own bit width.
    """
    bit_width = None
    restrict_scaling_type = RestrictValueType.LOG_FP
    scaling_min_val = 2e-16


class DwsConvBlock(nn.Module):
    """Two chained ``ConvBlock`` modules: a 3x3 grouped one and a 1x1 one.

    ``dw_conv`` keeps the channel count and uses ``groups=in_channels`` with
    the given ``stride``; ``pw_conv`` is a 1x1 block mapping ``in_channels``
    to ``out_channels``. Both share the same weight/activation bit widths and
    weight quantizer; ``pw_activation_scaling_per_channel`` only affects
    ``pw_conv``.
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            stride,
            act_bit_width,
            weight_bit_width,
            weight_quant=CommonIntWeightPerChannelQuant,
            pw_activation_scaling_per_channel=False):
        super().__init__()
        # Quantization settings common to both convolution blocks.
        quant_cfg = dict(
            weight_bit_width=weight_bit_width,
            weight_quant=weight_quant,
            act_bit_width=act_bit_width)
        self.dw_conv = ConvBlock(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=in_channels,
            **quant_cfg)
        self.pw_conv = ConvBlock(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            padding=0,
            activation_scaling_per_channel=pw_activation_scaling_per_channel,
            **quant_cfg)

    def forward(self, x):
        """Apply ``dw_conv`` and then ``pw_conv`` to ``x``."""
        return self.pw_conv(self.dw_conv(x))


class ConvBlock(nn.Module):
    """``QuantConv2d`` (no bias) followed by ``BatchNorm2d`` and ``QuantReLU``.

    The activation uses ``CommonUintActQuant`` with ``act_bit_width`` bits
    and returns a quant tensor; ``activation_scaling_per_channel`` is passed
    on as its ``scaling_per_output_channel`` setting.
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            weight_bit_width,
            act_bit_width,
            stride=1,
            padding=0,
            groups=1,
            bn_eps=1e-5,
            weight_quant=CommonIntWeightPerChannelQuant,
            activation_scaling_per_channel=False):
        super().__init__()
        conv_kwargs = dict(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=False,
            weight_quant=weight_quant,
            weight_bit_width=weight_bit_width)
        self.conv = QuantConv2d(**conv_kwargs)

        self.bn = nn.BatchNorm2d(num_features=out_channels, eps=bn_eps)

        # Shape that broadcasts one scale per channel over (N, C, H, W).
        per_channel_shape = (1, out_channels, 1, 1)
        self.activation = QuantReLU(
            act_quant=CommonUintActQuant,
            bit_width=act_bit_width,
            per_channel_broadcastable_shape=per_channel_shape,
            scaling_stats_permute_dims=(1, 0, 2, 3),
            scaling_per_output_channel=activation_scaling_per_channel,
            return_quant_tensor=True)

    def forward(self, x):
        """Apply convolution, batch norm and the quantized ReLU in order."""
        for layer in (self.conv, self.bn, self.activation):
            x = layer(x)
        return x

class MobileNetV1v4_noinitConv(nn.Module):
    """Three stride-2 ``DwsConvBlock`` stages, a 16x16 max pool and an 8-way classifier.

    This variant has no initial ``ConvBlock``: the input goes straight into
    ``DWConv1``, which expects 64 input channels. ``first_layer_weight_quant``
    is accepted but not used.
    """

    def __init__(
            self,
            weight_quant=CommonIntWeightPerChannelQuant,
            first_layer_weight_quant=CommonIntWeightPerChannelQuant,
            last_layer_weight_quant=CommonIntWeightPerTensorQuant):
        super().__init__()

        # Settings shared by all three depthwise-separable stages.
        dws_cfg = dict(
            stride=2,
            act_bit_width=4,
            weight_bit_width=4,
            weight_quant=weight_quant)
        self.DWConv1 = DwsConvBlock(
            in_channels=64, out_channels=64,
            pw_activation_scaling_per_channel=True, **dws_cfg)
        self.DWConv2 = DwsConvBlock(
            in_channels=64, out_channels=128,
            pw_activation_scaling_per_channel=True, **dws_cfg)
        self.DWConv3 = DwsConvBlock(
            in_channels=128, out_channels=256,
            pw_activation_scaling_per_channel=False, **dws_cfg)
        self.final_pool = QuantMaxPool2d(kernel_size=16, stride=1)
        self.output = QuantLinear(
            256,
            8,
            bias=True,
            bias_quant=IntBias,
            weight_quant=last_layer_weight_quant,
            weight_bit_width=4)

    def forward(self, x):
        """Run the three stages, pool, flatten and classify ``x``."""
        for layer in (self.DWConv1, self.DWConv2, self.DWConv3, self.final_pool):
            x = layer(x)
        # Flatten everything but the batch dimension for the linear layer.
        flat = x.view(x.size(0), -1)
        return self.output(flat)

class MobileNetV1v4_initConv(nn.Module):
    """Initial ``ConvBlock``, three stride-2 ``DwsConvBlock`` stages, an 8x8 max pool and an 8-way classifier.

    The initial block is a 3x3, stride-2 ``ConvBlock`` from 64 to 64 channels
    with 8-bit weights (quantized with ``first_layer_weight_quant``) and
    4-bit activations.
    """

    def __init__(
            self,
            weight_quant=CommonIntWeightPerChannelQuant,
            first_layer_weight_quant=CommonIntWeightPerChannelQuant,
            last_layer_weight_quant=CommonIntWeightPerTensorQuant):
        super().__init__()

        self.init_block = ConvBlock(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=2,
            weight_bit_width=8,
            weight_quant=first_layer_weight_quant,
            act_bit_width=4,
            activation_scaling_per_channel=True)

        # Settings shared by all three depthwise-separable stages.
        dws_cfg = dict(
            stride=2,
            act_bit_width=4,
            weight_bit_width=4,
            weight_quant=weight_quant)
        self.DWConv1 = DwsConvBlock(
            in_channels=64, out_channels=64,
            pw_activation_scaling_per_channel=True, **dws_cfg)
        self.DWConv2 = DwsConvBlock(
            in_channels=64, out_channels=128,
            pw_activation_scaling_per_channel=True, **dws_cfg)
        self.DWConv3 = DwsConvBlock(
            in_channels=128, out_channels=256,
            pw_activation_scaling_per_channel=False, **dws_cfg)
        self.final_pool = QuantMaxPool2d(kernel_size=8, stride=1)
        self.output = QuantLinear(
            256,
            8,
            bias=True,
            bias_quant=IntBias,
            weight_quant=last_layer_weight_quant,
            weight_bit_width=4)

    def forward(self, x):
        """Run the initial block and the three stages, then pool, flatten and classify ``x``."""
        for layer in (self.init_block, self.DWConv1, self.DWConv2, self.DWConv3, self.final_pool):
            x = layer(x)
        # Flatten everything but the batch dimension for the linear layer.
        flat = x.view(x.size(0), -1)
        return self.output(flat)

Here are the ONNX graphs after the "convert_to_hls" step for MobileNetV1v4_noinitConv and MobileNetV1v4_initConv. Left: MobileNetV1v4_noinitConv; right: MobileNetV1v4_initConv.

As you can see, the first layer of MobileNetV1v4_noinitConv is not converted to HLS, whereas MobileNetV1v4_initConv is converted to HLS successfully and synthesizes to HW without problems. I exported both models with the same function, test_brevitas_mobilenet_custom_dirty, shown in my first post.

I don't see any meaningful difference between the two: the only change is whether the first conv layer is a depth-wise conv or a regular conv. Shouldn't that be trivial for the HLS conversion?

Thank you.

Apr 04 '24 10:04 ppccww0201