models Resnet npu

这个分支固定了数据集，去掉了随机。

需要准备一个可以加载的初始化模型到 .../models/Vision/classification/image/resnet50/examples/checkpoints/init

比如在910b上可以

cd .../models/Vision/classification/image/resnet50/examples
cp -r /data1/home/xiexuan/git-repos/models/Vision/classification/image/resnet50/examples/checkpoints .

然后就可以运行 ./npu_eager.sh 或 ./npu_graph.sh

目前npu eager和cuda eager/graph都对齐了，但npu graph还没有对齐，输出的pred都是 0.001，需要深入调查

loss
tensor(6.9073, placement=oneflow.placement(type="npu", ranks=[0]), sbp=(oneflow.sbp.partial_sum,),
       dtype=oneflow.float32)
pred
tensor([[0.0010, 0.0010, 0.0010,  ..., 0.0010, 0.0010, 0.0010],
        [0.0010, 0.0010, 0.0010,  ..., 0.0010, 0.0010, 0.0010],
        [0.0010, 0.0010, 0.0010,  ..., 0.0010, 0.0010, 0.0010],
        ...,
        [0.0010, 0.0010, 0.0010,  ..., 0.0010, 0.0010, 0.0010],
        [0.0010, 0.0010, 0.0010,  ..., 0.0010, 0.0010, 0.0010],
        [0.0010, 0.0010, 0.0010,  ..., 0.0010, 0.0010, 0.0010]],
       placement=oneflow.placement(type="npu", ranks=[0]), sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.float32)
label
tensor([582, 209, 272, 331, 768, 626, 838, 202, 333, 754, 435, 955, 853, 943,  40, 723,   3, 104,  51,  60, 118,
        762, 603, 353, 898,  69, 552, 824, 999, 217, 713, 334, 758, 818, 115,   1, 609, 238, 147, 446, 240, 455,
        442, 257, 206, 200, 911, 355, 684, 419], placement=oneflow.placement(type="npu", ranks=[0]),
       sbp=(oneflow.sbp.split(dim=0),), dtype=oneflow.int32)

Sep 24 '24 08:09 ShawnXuan

进展：查到了 Vision/classification/image/resnet50/models/resnet50.py 里面的 _make_layer 最后返回的是 nn.Sequential(*layers)，就没法深入对比了。

    def _forward_impl(self, x: Tensor) -> Tensor:
        if self.pad_input:
            if self.channel_last:
                # NHWC
                paddings = (0, 1)
            else:
                # NCHW
                paddings = (0, 0, 0, 0, 0, 1)
            x = flow._C.pad(x, pad=paddings, mode="constant", value=0)
        x = self.conv1(x)
        if self.fuse_bn_relu:
            x = self.bn1(x, None)
        else:
            x = self.bn1(x)
            x = self.relu(x)
        x = self.maxpool(x)
        # graph模式时前面都是对齐的，能够正常输出，说明算子没有问题
        x = self.layer1(x)
        r = x # 到这里返回的值都是 0，深入 layer1发现，最后是nn.Sequential，不好对比了，可能需要构造一个nn.Sequential的graph 单测
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = flow.flatten(x, 1)
        x = self.fc(x)

Sep 25 '24 02:09 ShawnXuan

09.27 进展 nn.Sequential 测试通过，测试脚本如下：

import unittest
import numpy as np
from collections import OrderedDict

import oneflow as flow
import oneflow_npu
from oneflow.test_utils.test_util import GenArgList


def create_model(shape, device, dtype):
    model = flow.nn.Sequential(
        flow.nn.Linear(shape[-1], shape[-1]),
        flow.nn.ReLU(),
        flow.nn.Linear(shape[-1], shape[-1]),
        flow.nn.Softmax(dim=-1)
    ).to(device=device, dtype=dtype)
    return model


class EvalGraph(flow.nn.Graph):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def build(self, x):
        logits = self.model(x)
        return logits


class TrainGraph(flow.nn.Graph):
    def __init__(self, model):
        super().__init__()
        self.model = model
        param_group = {"params": [p for p in model.parameters() if p is not None]}
        optimizer = flow.optim.SGD([param_group], lr=0.1, momentum=0.9)
        self.add_optimizer(optimizer)

    def build(self, x):
        logits = self.model(x)
        loss = logits.sum()
        loss.backward()
        return loss, logits


def _test_sequential_forward(test_case, shape, device, dtype):
    model_cpu = create_model(shape, "cpu", flow.float)
    model_npu = create_model(shape, device, dtype)
    model_npu.load_state_dict(model_cpu.state_dict())

    arr = flow.rand(*shape)
    arr_npu = arr.to(device=device, dtype=dtype)

    out_cpu = model_cpu(arr.to(dtype=flow.float))
    out_npu = model_npu(arr_npu)

    tol = 1e-3 if dtype == flow.float16 else 1e-5

    test_case.assertTrue(np.allclose(out_cpu.numpy(), out_npu.cpu().numpy(), tol, tol))

    graph = EvalGraph(model_npu)
    graph_out = graph(arr_npu)

    test_case.assertTrue(np.allclose(graph_out.cpu().numpy(), out_cpu.numpy(), tol, tol))


def _test_sequential_grad(test_case, shape, device, dtype):
    model_cpu = create_model(shape, "cpu", flow.float)
    model_npu = create_model(shape, device, dtype)
    model_npu.load_state_dict(model_cpu.state_dict())
    graph = TrainGraph(model_npu)

    np_arr = np.random.rand(*shape).astype(np.float32)
    arr = flow.tensor(np_arr, dtype=flow.float, requires_grad=True)
    arr_npu = flow.tensor(np_arr, dtype=dtype, device=device, requires_grad=True)
    print("*"*100)
    print("arr", arr)
    print("arr_npu", arr_npu)

    out_cpu = model_cpu(arr.to(dtype=flow.float))
    out_cpu.sum().backward()

    loss, out_npu = graph(arr_npu)

    print("out_cpu", out_cpu)
    print("out_npu", out_npu)

    for param_cpu, param_npu in zip(model_cpu.parameters(), model_npu.parameters()):
        print("cpu", param_cpu)
        print("npu", param_npu)
    #tol = 1e-3 if dtype == flow.float16 else 1e-5
    #test_case.assertTrue(np.allclose(arr.grad.numpy(), arr_npu.grad.cpu().numpy(), tol, tol))


@flow.unittest.skip_unless_1n1d()
class Test_Sequential_Graph(flow.unittest.TestCase):
    def test_sequential_forward(test_case):
        arg_dict = OrderedDict()
        arg_dict["test_fun"] = [
            _test_sequential_forward,
        ]
        arg_dict["shape"] = [(2, 4), (2, 3, 4)]
        arg_dict["device"] = ["npu"]
        arg_dict["dtype"] = [flow.float32, flow.float16]
        for arg in GenArgList(arg_dict):
            arg[0](test_case, *arg[1:])

    def test_sequential_grad(test_case):
        arg_dict = OrderedDict()
        arg_dict["test_fun"] = [
            _test_sequential_grad,
        ]
        arg_dict["shape"] = [(2, 4), (2, 3, 4)]
        arg_dict["device"] = ["npu"]
        arg_dict["dtype"] = [flow.float32]
        for arg in GenArgList(arg_dict):
            arg[0](test_case, *arg[1:])


if __name__ == "__main__":
    unittest.main()

Sep 27 '24 10:09 ShawnXuan