cutlass icon indicating copy to clipboard operation
cutlass copied to clipboard

[QST] Question about "global_load" code in "cutlass/arch/memory.h"

Open bmmcq opened this issue 2 years ago • 4 comments

The "global_load" function is used to copy data in global memory to a local array(maybe in registers). For examples:

template <typename AccessType>
struct global_load<AccessType,
                   16,
                   CacheOperation::Always
                  > {
  CUTLASS_DEVICE
  global_load(AccessType &D, void const *ptr, bool pred_guard) {
  uint4 &data = reinterpret_cast<uint4 &>(D);
    asm volatile(
        "{\n"
        "  .reg .pred p;\n"
        "  setp.ne.b32 p, %5, 0;\n"
        "  mov.b32 %0, %6;\n"
        "  mov.b32 %1, %7;\n"
        "  mov.b32 %2, %8;\n"
        "  mov.b32 %3, %9;\n"
        "  @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
        "}\n"
        : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
        : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
  }
};

The code above in "memory.h" seems to load the target data to registers(%6,%7,%8,%9) first, and then move the value of registers(%6,%7,%8,%9) to registers(%0, %1, %2, %3). Then load the source data(ptr) in global memory to registers(%0, %1, %2, %3), and write these values to the target data at last.

My question is what are the registers(%6,%7,%8,%9) and these "mov" instructions used for? It seems that I can load source data to registers(%0,%1,%2, %3) directly like this:

template <typename AccessType>
struct global_load<AccessType, 16, CacheOperation::Always> {
  __device__ global_load(AccessType &D, void const *ptr, bool pred_guard) {
    uint4 &data = reinterpret_cast<uint4 &>(D);
    asm volatile(
        "{\n"
        "  .reg .pred p;\n"
        "  setp.ne.b32 p, %5, 0;\n"
        "  @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n"
        "}\n"
        : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
        : "l"(ptr), "r"((int)pred_guard));
  }
};

What are the considerations behind this?

bmmcq avatar Nov 27 '23 08:11 bmmcq

I believe the CUTLASS version is for something like initialising.

Check the following global loading code snippet:

__device__
void global_load(uint4 &D, void const *ptr, bool pred_guard) {
    uint4 &data = reinterpret_cast<uint4 &>(D);
    asm volatile(
            "{\n"
            "  .reg .pred p;\n"
            "  setp.ne.b32 p, %5, 0;\n"
            "  mov.b32 %0, %6;\n"
            "  mov.b32 %1, %7;\n"
            "  mov.b32 %2, %8;\n"
            "  mov.b32 %3, %9;\n"
            "  @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
            "}\n"
            : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
            : "l"(ptr), "r"((int) pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
}

__device__
void global_load_simple(uint4 &D, void const *ptr, bool pred_guard) {
    uint4 &data = reinterpret_cast<uint4 &>(D);
    asm volatile(
            "{\n"
            "  .reg .pred p;\n"
            "  setp.ne.b32 p, %5, 0;\n"
            "  @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
            "}\n"
            : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
            : "l"(ptr), "r"((int) pred_guard));
}

global_load is similar to CUTLASS's, and global_load_simple is yours.

Then check the following kernels:

__global__ void
kernel(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
       const int *__restrict__ inZ1,
       const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
       const int *__restrict__ inZ2,
       const int *__restrict__ inW2, const bool *pred_guard,
       uint4 *out) {
    auto tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint4 D;
    D.x = inX1[tid];
    D.y = inY1[tid];
    D.z = inZ1[tid];
    D.w = inW1[tid];
    global_load(D, in + tid, pred_guard[tid]);
    D.x *= inX2[tid];
    D.y *= inY2[tid];
    D.z *= inZ2[tid];
    D.w *= inW2[tid];
    out[tid] = D;
}

__global__ void
kernel2(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
       const int *__restrict__ inZ1,
       const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
       const int *__restrict__ inZ2,
       const int *__restrict__ inW2, const bool *pred_guard,
       uint4 *out) {
    auto tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint4 D;
    global_load(D, in + tid, pred_guard[tid]);
    D.x *= inX2[tid];
    D.y *= inY2[tid];
    D.z *= inZ2[tid];
    D.w *= inW2[tid];
    out[tid] = D;
}

__global__ void
kernel_simple(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
              const int *__restrict__ inZ1,
              const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
              const int *__restrict__ inZ2,
              const int *__restrict__ inW2, const bool *pred_guard,
              uint4 *out) {
    auto tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint4 D;
    D.x = inX1[tid];
    D.y = inY1[tid];
    D.z = inZ1[tid];
    D.w = inW1[tid];
    global_load_simple(D, in + tid, pred_guard[tid]);
    D.x *= inX2[tid];
    D.y *= inY2[tid];
    D.z *= inZ2[tid];
    D.w *= inW2[tid];
    out[tid] = D;
}

__global__ void
kernel_simple2(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
               const int *__restrict__ inZ1,
               const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
               const int *__restrict__ inZ2,
               const int *__restrict__ inW2, const bool *pred_guard,
               uint4 *out) {
    auto tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint4 D;
    global_load_simple(D, in + tid, pred_guard[tid]);
    D.x *= inX2[tid];
    D.y *= inY2[tid];
    D.z *= inZ2[tid];
    D.w *= inW2[tid];
    out[tid] = D;
}

kernel/kernel2 and kernel_simple/kernel_simple2 are the "same" but use different loading functions. kernel2 and kernel_simple2 do not do the initialization.

Check the assembly code on SM86:

Fatbin elf code:
================
arch = sm_86
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_86
		Function : kernel_simple2
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                                   /* 0x00000a0000017a02 */
                                                                                            /* 0x000fc40000000f00 */
        /*0010*/                   S2R R0, SR_TID.X ;                                       /* 0x0000000000007919 */
                                                                                            /* 0x000e220000002100 */
        /*0020*/                   ULDC.64 UR4, c[0x0][0x118] ;                             /* 0x0000460000047ab9 */
                                                                                            /* 0x000fc60000000a00 */
        /*0030*/                   S2R R3, SR_CTAID.X ;                                     /* 0x0000000000037919 */
                                                                                            /* 0x000e240000002500 */
        /*0040*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                           /* 0x0000000003007a24 */
                                                                                            /* 0x001fca00078e0200 */
        /*0050*/                   IADD3 R16, P0, R0, c[0x0][0x1a8], RZ ;                   /* 0x00006a0000107a10 */
                                                                                            /* 0x000fc80007f1e0ff */
        /*0060*/                   IADD3.X R17, RZ, c[0x0][0x1ac], RZ, P0, !PT ;            /* 0x00006b00ff117a10 */
                                                                                            /* 0x000fca00007fe4ff */
        /*0070*/                   LDG.E.S8 R16, [R16.64] ;                                 /* 0x0000000410107981 */
                                                                                            /* 0x000ea2000c1e1300 */
        /*0080*/                   MOV R15, 0x4 ;                                           /* 0x00000004000f7802 */
                                                                                            /* 0x000fe40000000f00 */
        /*0090*/                   MOV R21, 0x10 ;                                          /* 0x0000001000157802 */
                                                                                            /* 0x000fc60000000f00 */
        /*00a0*/                   IMAD.WIDE.U32 R8, R0, R15, c[0x0][0x188] ;               /* 0x0000620000087625 */
                                                                                            /* 0x000fc800078e000f */
        /*00b0*/                   IMAD.WIDE.U32 R10, R0.reuse, R15.reuse, c[0x0][0x190] ;  /* 0x00006400000a7625 */
                                                                                            /* 0x0c0fe400078e000f */
        /*00c0*/                   LDG.E.CONSTANT R9, [R8.64] ;                             /* 0x0000000408097981 */
                                                                                            /* 0x000ee4000c1e9900 */
        /*00d0*/                   IMAD.WIDE.U32 R12, R0.reuse, R15, c[0x0][0x198] ;        /* 0x00006600000c7625 */
                                                                                            /* 0x040fe400078e000f */
        /*00e0*/                   LDG.E.CONSTANT R10, [R10.64] ;                           /* 0x000000040a0a7981 */
                                                                                            /* 0x000f24000c1e9900 */
        /*00f0*/                   IMAD.WIDE.U32 R2, R0.reuse, R21, c[0x0][0x160] ;         /* 0x0000580000027625 */
                                                                                            /* 0x040fe400078e0015 */
        /*0100*/                   LDG.E.CONSTANT R13, [R12.64] ;                           /* 0x000000040c0d7981 */
                                                                                            /* 0x000f64000c1e9900 */
        /*0110*/                   IMAD.WIDE.U32 R14, R0, R15, c[0x0][0x1a0] ;              /* 0x00006800000e7625 */
                                                                                            /* 0x000fcc00078e000f */
        /*0120*/                   LDG.E.CONSTANT R14, [R14.64] ;                           /* 0x000000040e0e7981 */
                                                                                            /* 0x000ee2000c1e9900 */
        /*0130*/                   ISETP.NE.U32.AND P0, PT, R16, RZ, PT ;                   /* 0x000000ff1000720c */
                                                                                            /* 0x004fda0003f05070 */
        /*0140*/               @P0 LDG.E.LU.128 R4, [R2.64] ;                               /* 0x0000000402040981 */
                                                                                            /* 0x000ee4000c3e1d00 */
        /*0150*/                   IMAD R16, R4, R9, RZ ;                                   /* 0x0000000904107224 */
                                                                                            /* 0x008fe400078e02ff */
        /*0160*/                   IMAD R17, R5, R10, RZ ;                                  /* 0x0000000a05117224 */
                                                                                            /* 0x010fe400078e02ff */
        /*0170*/                   IMAD R18, R6, R13, RZ ;                                  /* 0x0000000d06127224 */
                                                                                            /* 0x020fe400078e02ff */
        /*0180*/                   IMAD R19, R7, R14, RZ ;                                  /* 0x0000000e07137224 */
                                                                                            /* 0x000fe400078e02ff */
        /*0190*/                   IMAD.WIDE.U32 R4, R0, R21, c[0x0][0x1b0] ;               /* 0x00006c0000047625 */
                                                                                            /* 0x000fca00078e0015 */
        /*01a0*/                   STG.E.128 [R4.64], R16 ;                                 /* 0x0000001004007986 */
                                                                                            /* 0x000fe2000c101d04 */
        /*01b0*/                   EXIT ;                                                   /* 0x000000000000794d */
                                                                                            /* 0x000fea0003800000 */
        /*01c0*/                   BRA 0x1c0;                                               /* 0xfffffff000007947 */
                                                                                            /* 0x000fc0000383ffff */
        /*01d0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0200*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0210*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0220*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0230*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0240*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0250*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0260*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0270*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
		..........


		Function : kernel_simple
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                                   /* 0x00000a0000017a02 */
                                                                                            /* 0x000fc40000000f00 */
        /*0010*/                   S2R R0, SR_TID.X ;                                       /* 0x0000000000007919 */
                                                                                            /* 0x000e220000002100 */
        /*0020*/                   ULDC.64 UR4, c[0x0][0x118] ;                             /* 0x0000460000047ab9 */
                                                                                            /* 0x000fc60000000a00 */
        /*0030*/                   S2R R3, SR_CTAID.X ;                                     /* 0x0000000000037919 */
                                                                                            /* 0x000e240000002500 */
        /*0040*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                           /* 0x0000000003007a24 */
                                                                                            /* 0x001fca00078e0200 */
        /*0050*/                   IADD3 R16, P0, R0, c[0x0][0x1a8], RZ ;                   /* 0x00006a0000107a10 */
                                                                                            /* 0x000fc80007f1e0ff */
        /*0060*/                   IADD3.X R17, RZ, c[0x0][0x1ac], RZ, P0, !PT ;            /* 0x00006b00ff117a10 */
                                                                                            /* 0x000fca00007fe4ff */
        /*0070*/                   LDG.E.S8 R16, [R16.64] ;                                 /* 0x0000000410107981 */
                                                                                            /* 0x000ea2000c1e1300 */
        /*0080*/                   MOV R15, 0x4 ;                                           /* 0x00000004000f7802 */
                                                                                            /* 0x000fe40000000f00 */
        /*0090*/                   MOV R21, 0x10 ;                                          /* 0x0000001000157802 */
                                                                                            /* 0x000fc60000000f00 */
        /*00a0*/                   IMAD.WIDE.U32 R8, R0, R15, c[0x0][0x188] ;               /* 0x0000620000087625 */
                                                                                            /* 0x000fc800078e000f */
        /*00b0*/                   IMAD.WIDE.U32 R10, R0.reuse, R15.reuse, c[0x0][0x190] ;  /* 0x00006400000a7625 */
                                                                                            /* 0x0c0fe400078e000f */
        /*00c0*/                   LDG.E.CONSTANT R9, [R8.64] ;                             /* 0x0000000408097981 */
                                                                                            /* 0x000ee4000c1e9900 */
        /*00d0*/                   IMAD.WIDE.U32 R12, R0.reuse, R15, c[0x0][0x198] ;        /* 0x00006600000c7625 */
                                                                                            /* 0x040fe400078e000f */
        /*00e0*/                   LDG.E.CONSTANT R10, [R10.64] ;                           /* 0x000000040a0a7981 */
                                                                                            /* 0x000f24000c1e9900 */
        /*00f0*/                   IMAD.WIDE.U32 R2, R0.reuse, R21, c[0x0][0x160] ;         /* 0x0000580000027625 */
                                                                                            /* 0x040fe400078e0015 */
        /*0100*/                   LDG.E.CONSTANT R13, [R12.64] ;                           /* 0x000000040c0d7981 */
                                                                                            /* 0x000f64000c1e9900 */
        /*0110*/                   IMAD.WIDE.U32 R14, R0, R15, c[0x0][0x1a0] ;              /* 0x00006800000e7625 */
                                                                                            /* 0x000fcc00078e000f */
        /*0120*/                   LDG.E.CONSTANT R14, [R14.64] ;                           /* 0x000000040e0e7981 */
                                                                                            /* 0x000ee2000c1e9900 */
        /*0130*/                   ISETP.NE.U32.AND P0, PT, R16, RZ, PT ;                   /* 0x000000ff1000720c */
                                                                                            /* 0x004fda0003f05070 */
        /*0140*/               @P0 LDG.E.LU.128 R4, [R2.64] ;                               /* 0x0000000402040981 */
                                                                                            /* 0x000ee4000c3e1d00 */
        /*0150*/                   IMAD R16, R4, R9, RZ ;                                   /* 0x0000000904107224 */
                                                                                            /* 0x008fe400078e02ff */
        /*0160*/                   IMAD R17, R5, R10, RZ ;                                  /* 0x0000000a05117224 */
                                                                                            /* 0x010fe400078e02ff */
        /*0170*/                   IMAD R18, R6, R13, RZ ;                                  /* 0x0000000d06127224 */
                                                                                            /* 0x020fe400078e02ff */
        /*0180*/                   IMAD R19, R7, R14, RZ ;                                  /* 0x0000000e07137224 */
                                                                                            /* 0x000fe400078e02ff */
        /*0190*/                   IMAD.WIDE.U32 R4, R0, R21, c[0x0][0x1b0] ;               /* 0x00006c0000047625 */
                                                                                            /* 0x000fca00078e0015 */
        /*01a0*/                   STG.E.128 [R4.64], R16 ;                                 /* 0x0000001004007986 */
                                                                                            /* 0x000fe2000c101d04 */
        /*01b0*/                   EXIT ;                                                   /* 0x000000000000794d */
                                                                                            /* 0x000fea0003800000 */
        /*01c0*/                   BRA 0x1c0;                                               /* 0xfffffff000007947 */
                                                                                            /* 0x000fc0000383ffff */
        /*01d0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0200*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0210*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0220*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0230*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0240*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0250*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0260*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0270*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
		..........


		Function : kernel2
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                                   /* 0x00000a0000017a02 */
                                                                                            /* 0x000fc40000000f00 */
        /*0010*/                   S2R R0, SR_TID.X ;                                       /* 0x0000000000007919 */
                                                                                            /* 0x000e220000002100 */
        /*0020*/                   ULDC.64 UR4, c[0x0][0x118] ;                             /* 0x0000460000047ab9 */
                                                                                            /* 0x000fc60000000a00 */
        /*0030*/                   S2R R3, SR_CTAID.X ;                                     /* 0x0000000000037919 */
                                                                                            /* 0x000e240000002500 */
        /*0040*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                           /* 0x0000000003007a24 */
                                                                                            /* 0x001fca00078e0200 */
        /*0050*/                   IADD3 R8, P0, R0, c[0x0][0x1a8], RZ ;                    /* 0x00006a0000087a10 */
                                                                                            /* 0x000fc80007f1e0ff */
        /*0060*/                   IADD3.X R9, RZ, c[0x0][0x1ac], RZ, P0, !PT ;             /* 0x00006b00ff097a10 */
                                                                                            /* 0x000fca00007fe4ff */
        /*0070*/                   LDG.E.S8 R8, [R8.64] ;                                   /* 0x0000000408087981 */
                                                                                            /* 0x000ea2000c1e1300 */
        /*0080*/                   MOV R7, 0x4 ;                                            /* 0x0000000400077802 */
                                                                                            /* 0x000fe40000000f00 */
        /*0090*/                   MOV R21, 0x10 ;                                          /* 0x0000001000157802 */
                                                                                            /* 0x000fc60000000f00 */
        /*00a0*/                   IMAD.WIDE.U32 R10, R0, R7, c[0x0][0x190] ;               /* 0x00006400000a7625 */
                                                                                            /* 0x000fc800078e0007 */
        /*00b0*/                   IMAD.WIDE.U32 R14, R0.reuse, R7, c[0x0][0x1a0] ;         /* 0x00006800000e7625 */
                                                                                            /* 0x040fe400078e0007 */
        /*00c0*/                   LDG.E.CONSTANT R10, [R10.64] ;                           /* 0x000000040a0a7981 */
                                                                                            /* 0x000ee4000c1e9900 */
        /*00d0*/                   IMAD.WIDE.U32 R2, R0.reuse, R21, c[0x0][0x160] ;         /* 0x0000580000027625 */
                                                                                            /* 0x040fe400078e0015 */
        /*00e0*/                   LDG.E.CONSTANT R14, [R14.64] ;                           /* 0x000000040e0e7981 */
                                                                                            /* 0x000f24000c1e9900 */
        /*00f0*/                   IMAD.WIDE.U32 R12, R0, R7, c[0x0][0x198] ;               /* 0x00006600000c7625 */
                                                                                            /* 0x000fcc00078e0007 */
        /*0100*/                   LDG.E.CONSTANT R13, [R12.64] ;                           /* 0x000000040c0d7981 */
                                                                                            /* 0x000f62000c1e9900 */
        /*0110*/                   ISETP.NE.U32.AND P0, PT, R8, RZ, PT ;                    /* 0x000000ff0800720c */
                                                                                            /* 0x004fe20003f05070 */
        /*0120*/                   IMAD.WIDE.U32 R8, R0, R7, c[0x0][0x188] ;                /* 0x0000620000087625 */
                                                                                            /* 0x000fcc00078e0007 */
        /*0130*/                   LDG.E.CONSTANT R9, [R8.64] ;                             /* 0x0000000408097981 */
                                                                                            /* 0x000eac000c1e9900 */
        /*0140*/               @P0 LDG.E.LU.128 R4, [R2.64] ;                               /* 0x0000000402040981 */
                                                                                            /* 0x000ee4000c3e1d00 */
        /*0150*/                   IMAD R17, R10, R5, RZ ;                                  /* 0x000000050a117224 */
                                                                                            /* 0x008fe400078e02ff */
        /*0160*/                   IMAD R16, R9, R4, RZ ;                                   /* 0x0000000409107224 */
                                                                                            /* 0x004fc400078e02ff */
        /*0170*/                   IMAD R19, R14, R7, RZ ;                                  /* 0x000000070e137224 */
                                                                                            /* 0x010fe400078e02ff */
        /*0180*/                   IMAD R18, R13, R6, RZ ;                                  /* 0x000000060d127224 */
                                                                                            /* 0x020fe400078e02ff */
        /*0190*/                   IMAD.WIDE.U32 R4, R0, R21, c[0x0][0x1b0] ;               /* 0x00006c0000047625 */
                                                                                            /* 0x000fca00078e0015 */
        /*01a0*/                   STG.E.128 [R4.64], R16 ;                                 /* 0x0000001004007986 */
                                                                                            /* 0x000fe2000c101d04 */
        /*01b0*/                   EXIT ;                                                   /* 0x000000000000794d */
                                                                                            /* 0x000fea0003800000 */
        /*01c0*/                   BRA 0x1c0;                                               /* 0xfffffff000007947 */
                                                                                            /* 0x000fc0000383ffff */
        /*01d0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0200*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0210*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0220*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0230*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0240*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0250*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0260*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0270*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
		..........


		Function : kernel
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                                   /* 0x00000a0000017a02 */
                                                                                            /* 0x000fc40000000f00 */
        /*0010*/                   S2R R0, SR_TID.X ;                                       /* 0x0000000000007919 */
                                                                                            /* 0x000e220000002100 */
        /*0020*/                   ULDC.64 UR4, c[0x0][0x118] ;                             /* 0x0000460000047ab9 */
                                                                                            /* 0x000fe20000000a00 */
        /*0030*/                   MOV R23, 0x4 ;                                           /* 0x0000000400177802 */
                                                                                            /* 0x000fe40000000f00 */
        /*0040*/                   S2R R3, SR_CTAID.X ;                                     /* 0x0000000000037919 */
                                                                                            /* 0x000e240000002500 */
        /*0050*/                   IMAD R0, R3, c[0x0][0x0], R0 ;                           /* 0x0000000003007a24 */
                                                                                            /* 0x001fca00078e0200 */
        /*0060*/                   IADD3 R14, P0, R0, c[0x0][0x1a8], RZ ;                   /* 0x00006a00000e7a10 */
                                                                                            /* 0x000fc80007f1e0ff */
        /*0070*/                   IADD3.X R15, RZ, c[0x0][0x1ac], RZ, P0, !PT ;            /* 0x00006b00ff0f7a10 */
                                                                                            /* 0x000fca00007fe4ff */
        /*0080*/                   LDG.E.S8 R14, [R14.64] ;                                 /* 0x000000040e0e7981 */
                                                                                            /* 0x000ea2000c1e1300 */
        /*0090*/                   IMAD.WIDE.U32 R2, R0, R23, c[0x0][0x168] ;               /* 0x00005a0000027625 */
                                                                                            /* 0x000fc800078e0017 */
        /*00a0*/                   IMAD.WIDE.U32 R8, R0.reuse, R23.reuse, c[0x0][0x170] ;   /* 0x00005c0000087625 */
                                                                                            /* 0x0c0fe200078e0017 */
        /*00b0*/                   LDG.E.CONSTANT R4, [R2.64] ;                             /* 0x0000000402047981 */
                                                                                            /* 0x0000e6000c1e9900 */
        /*00c0*/                   IMAD.WIDE.U32 R10, R0.reuse, R23.reuse, c[0x0][0x178] ;  /* 0x00005e00000a7625 */
                                                                                            /* 0x0c0fe200078e0017 */
        /*00d0*/                   LDG.E.CONSTANT R5, [R8.64] ;                             /* 0x0000000408057981 */
                                                                                            /* 0x000ee6000c1e9900 */
        /*00e0*/                   IMAD.WIDE.U32 R12, R0.reuse, R23.reuse, c[0x0][0x180] ;  /* 0x00006000000c7625 */
                                                                                            /* 0x0c0fe200078e0017 */
        /*00f0*/                   LDG.E.CONSTANT R6, [R10.64] ;                            /* 0x000000040a067981 */
                                                                                            /* 0x000ee8000c1e9900 */
        /*0100*/                   LDG.E.CONSTANT R7, [R12.64] ;                            /* 0x000000040c077981 */
                                                                                            /* 0x000ee2000c1e9900 */
        /*0110*/                   MOV R25, 0x10 ;                                          /* 0x0000001000197802 */
                                                                                            /* 0x000fe20000000f00 */
        /*0120*/                   IMAD.WIDE.U32 R16, R0, R23, c[0x0][0x188] ;              /* 0x0000620000107625 */
                                                                                            /* 0x000fc800078e0017 */
        /*0130*/                   IMAD.WIDE.U32 R18, R0.reuse, R23.reuse, c[0x0][0x190] ;  /* 0x0000640000127625 */
                                                                                            /* 0x0c0fe400078e0017 */
        /*0140*/                   LDG.E.CONSTANT R17, [R16.64] ;                           /* 0x0000000410117981 */
                                                                                            /* 0x000f24000c1e9900 */
        /*0150*/                   IMAD.WIDE.U32 R20, R0.reuse, R23.reuse, c[0x0][0x198] ;  /* 0x0000660000147625 */
                                                                                            /* 0x0c0fe400078e0017 */
        /*0160*/                   LDG.E.CONSTANT R18, [R18.64] ;                           /* 0x0000000412127981 */
                                                                                            /* 0x000f64000c1e9900 */
        /*0170*/                   IMAD.WIDE.U32 R22, R0, R23, c[0x0][0x1a0] ;              /* 0x0000680000167625 */
                                                                                            /* 0x000fe400078e0017 */
        /*0180*/                   LDG.E.CONSTANT R21, [R20.64] ;                           /* 0x0000000414157981 */
                                                                                            /* 0x000f28000c1e9900 */
        /*0190*/                   LDG.E.CONSTANT R22, [R22.64] ;                           /* 0x0000000416167981 */
                                                                                            /* 0x000f22000c1e9900 */
        /*01a0*/                   ISETP.NE.U32.AND P0, PT, R14, RZ, PT ;                   /* 0x000000ff0e00720c */
                                                                                            /* 0x004fe20003f05070 */
        /*01b0*/                   IMAD.WIDE.U32 R14, R0, R25, c[0x0][0x160] ;              /* 0x00005800000e7625 */
                                                                                            /* 0x000fd800078e0019 */
        /*01c0*/               @P0 LDG.E.LU.128 R4, [R14.64] ;                              /* 0x000000040e040981 */
                                                                                            /* 0x008f22000c3e1d00 */
        /*01d0*/                   IMAD.WIDE.U32 R2, R0, R25, c[0x0][0x1b0] ;               /* 0x00006c0000027625 */
                                                                                            /* 0x001fc800078e0019 */
        /*01e0*/                   IMAD R4, R17, R4, RZ ;                                   /* 0x0000000411047224 */
                                                                                            /* 0x010fe400078e02ff */
        /*01f0*/                   IMAD R5, R18, R5, RZ ;                                   /* 0x0000000512057224 */
                                                                                            /* 0x020fe400078e02ff */
        /*0200*/                   IMAD R6, R21, R6, RZ ;                                   /* 0x0000000615067224 */
                                                                                            /* 0x000fe400078e02ff */
        /*0210*/                   IMAD R7, R22, R7, RZ ;                                   /* 0x0000000716077224 */
                                                                                            /* 0x000fca00078e02ff */
        /*0220*/                   STG.E.128 [R2.64], R4 ;                                  /* 0x0000000402007986 */
                                                                                            /* 0x000fe2000c101d04 */
        /*0230*/                   EXIT ;                                                   /* 0x000000000000794d */
                                                                                            /* 0x000fea0003800000 */
        /*0240*/                   BRA 0x240;                                               /* 0xfffffff000007947 */
                                                                                            /* 0x000fc0000383ffff */
        /*0250*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0260*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0270*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0280*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*0290*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*02a0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*02b0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*02c0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*02d0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*02e0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
        /*02f0*/                   NOP;                                                     /* 0x0000000000007918 */
                                                                                            /* 0x000fc00000000000 */
		..........

You will find that kernel_simple and kernel_simple2 have the same assembly code and their code is almost the same as the non-initialised version kernel2, which hints your loading function does not preserve the existing value of the register.

JieRen98 avatar Nov 28 '23 08:11 JieRen98

Also check this: CUDA PTX ISA As it said:

The global (.global) state space is memory that is accessible by all threads in a context. It is the mechanism by which threads in different CTAs, clusters, and grids can communicate. Use ld.global, st.global, and atom.global to access global variables.

Global variables have an optional variable initializer; global variables with no explicit initializer are initialized to zero by default.

I am not sure what is the exact meaning of "initialized to zero", we can write more testing code for this. But anyway, the assembly code told us that kernel_simple does not initialise R4, R5, R6, R7 in a normal way.

JieRen98 avatar Nov 28 '23 08:11 JieRen98

@bmmcq has your issue been resolved?

mnicely avatar Jan 02 '24 15:01 mnicely

This issue has been labeled inactive-30d due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. This issue will be labeled inactive-90d if there is no activity in the next 60 days.

github-actions[bot] avatar Feb 01 '24 16:02 github-actions[bot]

Closing due to inactivity

mnicely avatar Feb 22 '24 14:02 mnicely