Inlined function with update stage won't vectorize when parent function is scheduled with compute_with

Open vksnk opened this issue 5 years ago • 0 comments

In the following example:

    const int f_size = 128;
    const int g_size = 256;
    Buffer<int> f_im(f_size, f_size, 5), g_im(g_size, g_size);

    Var x("x"), y("y"), c("c"), xi("xi"), yi("yi"), yii("yii"), yo("yo");
    Func f("f"), g("g"), h("h"), input("input");

    input(x, y) = x;
    f(x, y, c) = c * input(x, y);
    h(x, y, c) = f(x, y, c);

    Func inl("inl");
    inl(x, y) = f(x / 2, y / 2, 0);
    inl(x, y) += f(x / 2, y / 2, 2);
    g(x, y) = inl(x, y);

    g
        .split(y, yo, y, 32 * 2, TailStrategy::RoundUp)
        .split(y, y, yi, 2, TailStrategy::RoundUp)
        .vectorize(x, 4, TailStrategy::GuardWithIf)
        .compute_with(h, y, LoopAlignStrategy::AlignEnd)
        //.compute_root()
        ;

    h
        .reorder(x, c, y)
        .split(y, yo, y, 32, TailStrategy::RoundUp)
        .split(y, y, yi, 1, TailStrategy::RoundUp)
        .vectorize(x, 4, TailStrategy::GuardWithIf)
        .compute_root();

    g.bound(y, 0, g_size);
    h.bound(y, 0, f_size).bound(c, 0, 5);

    Pipeline p({h, g});
    p.compile_jit();

Here the function inl is inlined into g, which is scheduled to be computed with h (via compute_with). g is vectorized across x, but in the final IR inl is serialized: it is computed in a scalar loop instead of being vectorized:

      for (g.s0.x.x, 0, t266) {
       allocate inl[int32 * 4]
       let inl.s0.x.loop_extent.s = (g.extent.0 - (g.s0.x.x*4))
       produce inl {
        let t279 = max(min(inl.s0.x.loop_extent.s, 4), 0)
        let t280 = (g.s0.x.x*2)
        for (g.s0.x.v0, 0, t279) {
         inl[g.s0.x.v0] = 0
         inl[g.s0.x.v0] = (inl[g.s0.x.v0] + ((((g.min.0 + g.s0.x.v0)/2) + t280)*2))
        }
       }
       consume inl {
        g[ramp(((g.s0.x.x*4) + ((g.stride.1*t278) + t264)), 1, 4)] = inl[ramp(0, 1, 4)]
       }
       free inl
      }

However, if we schedule g as compute_root, inl will be properly vectorized:

    for (g.s0.x.x, 0, t50) {
      allocate inl[int32 * 4]
      produce inl {
       inl[ramp(0, 1, 4)] = x4(0)
       inl[ramp(0, 1, 4)] = interleave_vectors((ramp((((g.s0.x.x*2) + t46)*2), 2, 2) + inl[ramp(0, 2, 2)]), (ramp((((g.s0.x.x*2) + t47)*2), 2, 2) + inl[ramp(1, 2, 2)]))
      }
      consume inl {
       g[ramp(((g.s0.x.x*4) + ((g.stride.1*t53) + t48)), 1, 4)] = inl[ramp(0, 1, 4)]
      }
      free inl
     }

Mar 12 '20 00:03 vksnk