sentis-samples Can someone please help me with my object detection attempt?

I tried to implement object detection but I am coming across an error saying "KeyNotFoundException: The given key 'nms' was not present in the dictionary."

Here is my current code:

using UnityEngine;
using Unity.Sentis;
using System.Collections;
using System.Collections.Generic;

public class StairDetection : MonoBehaviour
{
    public ModelAsset detectionModel;
    IWorker m_engineDetection;
    WebCamTexture webcamTexture;
    TensorFloat inputTensor;

    int modelLayerCount = 0;
    public int framesToExectute = 2;

    void Start()
    {
        Application.targetFrameRate = 60;
        var model = ModelLoader.Load(detectionModel);

        // Reshape to [8400, 5]
        model.layers.Add(new Unity.Sentis.Layers.Reshape("reshaped", "output", "8400,5"));

        // Slice for bounding boxes [8400, 4]
        model.layers.Add(new Unity.Sentis.Layers.Slice("sliceBoxes", "reshaped", "0,0", "8400,4"));

        // Slice for confidence scores [8400]
        model.layers.Add(new Unity.Sentis.Layers.Slice("sliceScores", "reshaped", "0,4", "8400,5"));

        model.layers.Add(new Unity.Sentis.Layers.NonMaxSuppression(
                    name: "nms",
                    boxes: "sliceBoxes",
                    scores: "sliceScores",
                    maxOutputBoxesPerClass: "10",  // Adjust as needed
                    iouThreshold: "0.5",           // Common value, adjust as needed
                    scoreThreshold: "0.3",         // Adjust as needed
                    centerPointBox: Unity.Sentis.Layers.CenterPointBox.Corners
                ));

        modelLayerCount = model.layers.Count;
        model.outputs = new List<string> { "nms" };

        m_engineDetection = WorkerFactory.CreateWorker(BackendType.GPUCompute, model);

        WebCamDevice[] devices = WebCamTexture.devices;
        webcamTexture = new WebCamTexture(Screen.width, Screen.height)
        {
            deviceName = devices[0].name
        };
        webcamTexture.Play();

        inputTensor = TensorFloat.Zeros(new TensorShape(1, 3, 640, 640));
    }
    bool executionStarted = false;
    IEnumerator executionSchedule;

    private void Update()
    {
        if (!executionStarted)
        {
            TextureConverter.ToTensor(webcamTexture, inputTensor, new TextureTransform());
            executionSchedule = m_engineDetection.StartManualSchedule(inputTensor);
            executionStarted = true;
        }

        bool hasMoreWork = false;
        int layersToRun = (modelLayerCount + framesToExectute - 1) / framesToExectute; // round up
        for (int i = 0; i < layersToRun; i++)
        {
            hasMoreWork = executionSchedule.MoveNext();
            if (!hasMoreWork)
                break;
        }

        if (hasMoreWork)
            return;

        var output = m_engineDetection.PeekOutput() as TensorFloat;
        if (output != null)
        {
            Debug.Log("Output shape: " + output.shape);
            Debug.Log(output.ToReadOnlyArray());
        }
        executionStarted = false;
    }
}

The object detection model I am using is YOLOv8n. Here is the input and output structure: INPUTS images name: images tensor: float32[1, 3, 640, 640]

OUTPUTS output0 name: output0 tensor: float32 [1,5,8400]

I mostly just followed the depth detection sample code. This script is intended to be used for an AR app. Any help would be greatly appreciated. Thanks!

Jan 20 '24 18:01 kenhuang1964

Hi you have to define all the constants such as:

model.AddConstant(new Unity.Sentis.Layers.Constant("8400,5", new int[] { 8400, 5 }));
model.AddConstant(new Unity.Sentis.Layers.Constant("0.5", new float[] { 0.5f }));

There is a good example of a model using the NMS layer here.. Although I don't think you'll need the offsets as YOLO8 doesn't use them I think.

Jan 25 '24 16:01 elephantpanda

Thank you so much @pauldog!

Jan 25 '24 18:01 kenhuang1964