[QST] Question about "global_load" code in "cutlass/arch/memory.h"
The "global_load" function is used to copy data in global memory to a local array(maybe in registers). For examples:
template <typename AccessType>
struct global_load<AccessType,
16,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint4 &data = reinterpret_cast<uint4 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" mov.b32 %0, %6;\n"
" mov.b32 %1, %7;\n"
" mov.b32 %2, %8;\n"
" mov.b32 %3, %9;\n"
" @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
"}\n"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
}
};
The code above in "memory.h" seems to load the target data to registers(%6,%7,%8,%9) first, and then move the value of registers(%6,%7,%8,%9) to registers(%0, %1, %2, %3). Then load the source data(ptr) in global memory to registers(%0, %1, %2, %3), and write these values to the target data at last.
My question is what are the registers(%6,%7,%8,%9) and these "mov" instructions used for? It seems that I can load source data to registers(%0,%1,%2, %3) directly like this:
template <typename AccessType>
struct global_load<AccessType, 16, CacheOperation::Always> {
__device__ global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint4 &data = reinterpret_cast<uint4 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n"
"}\n"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"(ptr), "r"((int)pred_guard));
}
};
What are the considerations behind this?
I believe the CUTLASS version is for something like initialising.
Check the following global loading code snippet:
__device__
void global_load(uint4 &D, void const *ptr, bool pred_guard) {
uint4 &data = reinterpret_cast<uint4 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" mov.b32 %0, %6;\n"
" mov.b32 %1, %7;\n"
" mov.b32 %2, %8;\n"
" mov.b32 %3, %9;\n"
" @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
"}\n"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"(ptr), "r"((int) pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
}
__device__
void global_load_simple(uint4 &D, void const *ptr, bool pred_guard) {
uint4 &data = reinterpret_cast<uint4 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
"}\n"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"(ptr), "r"((int) pred_guard));
}
global_load is similar to CUTLASS's, and global_load_simple is yours.
Then check the following kernels:
__global__ void
kernel(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
const int *__restrict__ inZ1,
const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
const int *__restrict__ inZ2,
const int *__restrict__ inW2, const bool *pred_guard,
uint4 *out) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
uint4 D;
D.x = inX1[tid];
D.y = inY1[tid];
D.z = inZ1[tid];
D.w = inW1[tid];
global_load(D, in + tid, pred_guard[tid]);
D.x *= inX2[tid];
D.y *= inY2[tid];
D.z *= inZ2[tid];
D.w *= inW2[tid];
out[tid] = D;
}
__global__ void
kernel2(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
const int *__restrict__ inZ1,
const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
const int *__restrict__ inZ2,
const int *__restrict__ inW2, const bool *pred_guard,
uint4 *out) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
uint4 D;
global_load(D, in + tid, pred_guard[tid]);
D.x *= inX2[tid];
D.y *= inY2[tid];
D.z *= inZ2[tid];
D.w *= inW2[tid];
out[tid] = D;
}
__global__ void
kernel_simple(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
const int *__restrict__ inZ1,
const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
const int *__restrict__ inZ2,
const int *__restrict__ inW2, const bool *pred_guard,
uint4 *out) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
uint4 D;
D.x = inX1[tid];
D.y = inY1[tid];
D.z = inZ1[tid];
D.w = inW1[tid];
global_load_simple(D, in + tid, pred_guard[tid]);
D.x *= inX2[tid];
D.y *= inY2[tid];
D.z *= inZ2[tid];
D.w *= inW2[tid];
out[tid] = D;
}
__global__ void
kernel_simple2(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
const int *__restrict__ inZ1,
const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
const int *__restrict__ inZ2,
const int *__restrict__ inW2, const bool *pred_guard,
uint4 *out) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
uint4 D;
global_load_simple(D, in + tid, pred_guard[tid]);
D.x *= inX2[tid];
D.y *= inY2[tid];
D.z *= inZ2[tid];
D.w *= inW2[tid];
out[tid] = D;
}
kernel/kernel2 and kernel_simple/kernel_simple2 are the "same" but use different loading functions. kernel2 and kernel_simple2 do not do the initialization.
Check the assembly code on SM86:
Fatbin elf code:
================
arch = sm_86
code version = [1,7]
host = linux
compile_size = 64bit
code for sm_86
Function : kernel_simple2
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fc60000000a00 */
/*0030*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e240000002500 */
/*0040*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */
/* 0x001fca00078e0200 */
/*0050*/ IADD3 R16, P0, R0, c[0x0][0x1a8], RZ ; /* 0x00006a0000107a10 */
/* 0x000fc80007f1e0ff */
/*0060*/ IADD3.X R17, RZ, c[0x0][0x1ac], RZ, P0, !PT ; /* 0x00006b00ff117a10 */
/* 0x000fca00007fe4ff */
/*0070*/ LDG.E.S8 R16, [R16.64] ; /* 0x0000000410107981 */
/* 0x000ea2000c1e1300 */
/*0080*/ MOV R15, 0x4 ; /* 0x00000004000f7802 */
/* 0x000fe40000000f00 */
/*0090*/ MOV R21, 0x10 ; /* 0x0000001000157802 */
/* 0x000fc60000000f00 */
/*00a0*/ IMAD.WIDE.U32 R8, R0, R15, c[0x0][0x188] ; /* 0x0000620000087625 */
/* 0x000fc800078e000f */
/*00b0*/ IMAD.WIDE.U32 R10, R0.reuse, R15.reuse, c[0x0][0x190] ; /* 0x00006400000a7625 */
/* 0x0c0fe400078e000f */
/*00c0*/ LDG.E.CONSTANT R9, [R8.64] ; /* 0x0000000408097981 */
/* 0x000ee4000c1e9900 */
/*00d0*/ IMAD.WIDE.U32 R12, R0.reuse, R15, c[0x0][0x198] ; /* 0x00006600000c7625 */
/* 0x040fe400078e000f */
/*00e0*/ LDG.E.CONSTANT R10, [R10.64] ; /* 0x000000040a0a7981 */
/* 0x000f24000c1e9900 */
/*00f0*/ IMAD.WIDE.U32 R2, R0.reuse, R21, c[0x0][0x160] ; /* 0x0000580000027625 */
/* 0x040fe400078e0015 */
/*0100*/ LDG.E.CONSTANT R13, [R12.64] ; /* 0x000000040c0d7981 */
/* 0x000f64000c1e9900 */
/*0110*/ IMAD.WIDE.U32 R14, R0, R15, c[0x0][0x1a0] ; /* 0x00006800000e7625 */
/* 0x000fcc00078e000f */
/*0120*/ LDG.E.CONSTANT R14, [R14.64] ; /* 0x000000040e0e7981 */
/* 0x000ee2000c1e9900 */
/*0130*/ ISETP.NE.U32.AND P0, PT, R16, RZ, PT ; /* 0x000000ff1000720c */
/* 0x004fda0003f05070 */
/*0140*/ @P0 LDG.E.LU.128 R4, [R2.64] ; /* 0x0000000402040981 */
/* 0x000ee4000c3e1d00 */
/*0150*/ IMAD R16, R4, R9, RZ ; /* 0x0000000904107224 */
/* 0x008fe400078e02ff */
/*0160*/ IMAD R17, R5, R10, RZ ; /* 0x0000000a05117224 */
/* 0x010fe400078e02ff */
/*0170*/ IMAD R18, R6, R13, RZ ; /* 0x0000000d06127224 */
/* 0x020fe400078e02ff */
/*0180*/ IMAD R19, R7, R14, RZ ; /* 0x0000000e07137224 */
/* 0x000fe400078e02ff */
/*0190*/ IMAD.WIDE.U32 R4, R0, R21, c[0x0][0x1b0] ; /* 0x00006c0000047625 */
/* 0x000fca00078e0015 */
/*01a0*/ STG.E.128 [R4.64], R16 ; /* 0x0000001004007986 */
/* 0x000fe2000c101d04 */
/*01b0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*01c0*/ BRA 0x1c0; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*01d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0200*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0210*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0220*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0230*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0240*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0250*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0260*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0270*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Function : kernel_simple
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fc60000000a00 */
/*0030*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e240000002500 */
/*0040*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */
/* 0x001fca00078e0200 */
/*0050*/ IADD3 R16, P0, R0, c[0x0][0x1a8], RZ ; /* 0x00006a0000107a10 */
/* 0x000fc80007f1e0ff */
/*0060*/ IADD3.X R17, RZ, c[0x0][0x1ac], RZ, P0, !PT ; /* 0x00006b00ff117a10 */
/* 0x000fca00007fe4ff */
/*0070*/ LDG.E.S8 R16, [R16.64] ; /* 0x0000000410107981 */
/* 0x000ea2000c1e1300 */
/*0080*/ MOV R15, 0x4 ; /* 0x00000004000f7802 */
/* 0x000fe40000000f00 */
/*0090*/ MOV R21, 0x10 ; /* 0x0000001000157802 */
/* 0x000fc60000000f00 */
/*00a0*/ IMAD.WIDE.U32 R8, R0, R15, c[0x0][0x188] ; /* 0x0000620000087625 */
/* 0x000fc800078e000f */
/*00b0*/ IMAD.WIDE.U32 R10, R0.reuse, R15.reuse, c[0x0][0x190] ; /* 0x00006400000a7625 */
/* 0x0c0fe400078e000f */
/*00c0*/ LDG.E.CONSTANT R9, [R8.64] ; /* 0x0000000408097981 */
/* 0x000ee4000c1e9900 */
/*00d0*/ IMAD.WIDE.U32 R12, R0.reuse, R15, c[0x0][0x198] ; /* 0x00006600000c7625 */
/* 0x040fe400078e000f */
/*00e0*/ LDG.E.CONSTANT R10, [R10.64] ; /* 0x000000040a0a7981 */
/* 0x000f24000c1e9900 */
/*00f0*/ IMAD.WIDE.U32 R2, R0.reuse, R21, c[0x0][0x160] ; /* 0x0000580000027625 */
/* 0x040fe400078e0015 */
/*0100*/ LDG.E.CONSTANT R13, [R12.64] ; /* 0x000000040c0d7981 */
/* 0x000f64000c1e9900 */
/*0110*/ IMAD.WIDE.U32 R14, R0, R15, c[0x0][0x1a0] ; /* 0x00006800000e7625 */
/* 0x000fcc00078e000f */
/*0120*/ LDG.E.CONSTANT R14, [R14.64] ; /* 0x000000040e0e7981 */
/* 0x000ee2000c1e9900 */
/*0130*/ ISETP.NE.U32.AND P0, PT, R16, RZ, PT ; /* 0x000000ff1000720c */
/* 0x004fda0003f05070 */
/*0140*/ @P0 LDG.E.LU.128 R4, [R2.64] ; /* 0x0000000402040981 */
/* 0x000ee4000c3e1d00 */
/*0150*/ IMAD R16, R4, R9, RZ ; /* 0x0000000904107224 */
/* 0x008fe400078e02ff */
/*0160*/ IMAD R17, R5, R10, RZ ; /* 0x0000000a05117224 */
/* 0x010fe400078e02ff */
/*0170*/ IMAD R18, R6, R13, RZ ; /* 0x0000000d06127224 */
/* 0x020fe400078e02ff */
/*0180*/ IMAD R19, R7, R14, RZ ; /* 0x0000000e07137224 */
/* 0x000fe400078e02ff */
/*0190*/ IMAD.WIDE.U32 R4, R0, R21, c[0x0][0x1b0] ; /* 0x00006c0000047625 */
/* 0x000fca00078e0015 */
/*01a0*/ STG.E.128 [R4.64], R16 ; /* 0x0000001004007986 */
/* 0x000fe2000c101d04 */
/*01b0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*01c0*/ BRA 0x1c0; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*01d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0200*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0210*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0220*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0230*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0240*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0250*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0260*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0270*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Function : kernel2
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fc60000000a00 */
/*0030*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e240000002500 */
/*0040*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */
/* 0x001fca00078e0200 */
/*0050*/ IADD3 R8, P0, R0, c[0x0][0x1a8], RZ ; /* 0x00006a0000087a10 */
/* 0x000fc80007f1e0ff */
/*0060*/ IADD3.X R9, RZ, c[0x0][0x1ac], RZ, P0, !PT ; /* 0x00006b00ff097a10 */
/* 0x000fca00007fe4ff */
/*0070*/ LDG.E.S8 R8, [R8.64] ; /* 0x0000000408087981 */
/* 0x000ea2000c1e1300 */
/*0080*/ MOV R7, 0x4 ; /* 0x0000000400077802 */
/* 0x000fe40000000f00 */
/*0090*/ MOV R21, 0x10 ; /* 0x0000001000157802 */
/* 0x000fc60000000f00 */
/*00a0*/ IMAD.WIDE.U32 R10, R0, R7, c[0x0][0x190] ; /* 0x00006400000a7625 */
/* 0x000fc800078e0007 */
/*00b0*/ IMAD.WIDE.U32 R14, R0.reuse, R7, c[0x0][0x1a0] ; /* 0x00006800000e7625 */
/* 0x040fe400078e0007 */
/*00c0*/ LDG.E.CONSTANT R10, [R10.64] ; /* 0x000000040a0a7981 */
/* 0x000ee4000c1e9900 */
/*00d0*/ IMAD.WIDE.U32 R2, R0.reuse, R21, c[0x0][0x160] ; /* 0x0000580000027625 */
/* 0x040fe400078e0015 */
/*00e0*/ LDG.E.CONSTANT R14, [R14.64] ; /* 0x000000040e0e7981 */
/* 0x000f24000c1e9900 */
/*00f0*/ IMAD.WIDE.U32 R12, R0, R7, c[0x0][0x198] ; /* 0x00006600000c7625 */
/* 0x000fcc00078e0007 */
/*0100*/ LDG.E.CONSTANT R13, [R12.64] ; /* 0x000000040c0d7981 */
/* 0x000f62000c1e9900 */
/*0110*/ ISETP.NE.U32.AND P0, PT, R8, RZ, PT ; /* 0x000000ff0800720c */
/* 0x004fe20003f05070 */
/*0120*/ IMAD.WIDE.U32 R8, R0, R7, c[0x0][0x188] ; /* 0x0000620000087625 */
/* 0x000fcc00078e0007 */
/*0130*/ LDG.E.CONSTANT R9, [R8.64] ; /* 0x0000000408097981 */
/* 0x000eac000c1e9900 */
/*0140*/ @P0 LDG.E.LU.128 R4, [R2.64] ; /* 0x0000000402040981 */
/* 0x000ee4000c3e1d00 */
/*0150*/ IMAD R17, R10, R5, RZ ; /* 0x000000050a117224 */
/* 0x008fe400078e02ff */
/*0160*/ IMAD R16, R9, R4, RZ ; /* 0x0000000409107224 */
/* 0x004fc400078e02ff */
/*0170*/ IMAD R19, R14, R7, RZ ; /* 0x000000070e137224 */
/* 0x010fe400078e02ff */
/*0180*/ IMAD R18, R13, R6, RZ ; /* 0x000000060d127224 */
/* 0x020fe400078e02ff */
/*0190*/ IMAD.WIDE.U32 R4, R0, R21, c[0x0][0x1b0] ; /* 0x00006c0000047625 */
/* 0x000fca00078e0015 */
/*01a0*/ STG.E.128 [R4.64], R16 ; /* 0x0000001004007986 */
/* 0x000fe2000c101d04 */
/*01b0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*01c0*/ BRA 0x1c0; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*01d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0200*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0210*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0220*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0230*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0240*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0250*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0260*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0270*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Function : kernel
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fe20000000a00 */
/*0030*/ MOV R23, 0x4 ; /* 0x0000000400177802 */
/* 0x000fe40000000f00 */
/*0040*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e240000002500 */
/*0050*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */
/* 0x001fca00078e0200 */
/*0060*/ IADD3 R14, P0, R0, c[0x0][0x1a8], RZ ; /* 0x00006a00000e7a10 */
/* 0x000fc80007f1e0ff */
/*0070*/ IADD3.X R15, RZ, c[0x0][0x1ac], RZ, P0, !PT ; /* 0x00006b00ff0f7a10 */
/* 0x000fca00007fe4ff */
/*0080*/ LDG.E.S8 R14, [R14.64] ; /* 0x000000040e0e7981 */
/* 0x000ea2000c1e1300 */
/*0090*/ IMAD.WIDE.U32 R2, R0, R23, c[0x0][0x168] ; /* 0x00005a0000027625 */
/* 0x000fc800078e0017 */
/*00a0*/ IMAD.WIDE.U32 R8, R0.reuse, R23.reuse, c[0x0][0x170] ; /* 0x00005c0000087625 */
/* 0x0c0fe200078e0017 */
/*00b0*/ LDG.E.CONSTANT R4, [R2.64] ; /* 0x0000000402047981 */
/* 0x0000e6000c1e9900 */
/*00c0*/ IMAD.WIDE.U32 R10, R0.reuse, R23.reuse, c[0x0][0x178] ; /* 0x00005e00000a7625 */
/* 0x0c0fe200078e0017 */
/*00d0*/ LDG.E.CONSTANT R5, [R8.64] ; /* 0x0000000408057981 */
/* 0x000ee6000c1e9900 */
/*00e0*/ IMAD.WIDE.U32 R12, R0.reuse, R23.reuse, c[0x0][0x180] ; /* 0x00006000000c7625 */
/* 0x0c0fe200078e0017 */
/*00f0*/ LDG.E.CONSTANT R6, [R10.64] ; /* 0x000000040a067981 */
/* 0x000ee8000c1e9900 */
/*0100*/ LDG.E.CONSTANT R7, [R12.64] ; /* 0x000000040c077981 */
/* 0x000ee2000c1e9900 */
/*0110*/ MOV R25, 0x10 ; /* 0x0000001000197802 */
/* 0x000fe20000000f00 */
/*0120*/ IMAD.WIDE.U32 R16, R0, R23, c[0x0][0x188] ; /* 0x0000620000107625 */
/* 0x000fc800078e0017 */
/*0130*/ IMAD.WIDE.U32 R18, R0.reuse, R23.reuse, c[0x0][0x190] ; /* 0x0000640000127625 */
/* 0x0c0fe400078e0017 */
/*0140*/ LDG.E.CONSTANT R17, [R16.64] ; /* 0x0000000410117981 */
/* 0x000f24000c1e9900 */
/*0150*/ IMAD.WIDE.U32 R20, R0.reuse, R23.reuse, c[0x0][0x198] ; /* 0x0000660000147625 */
/* 0x0c0fe400078e0017 */
/*0160*/ LDG.E.CONSTANT R18, [R18.64] ; /* 0x0000000412127981 */
/* 0x000f64000c1e9900 */
/*0170*/ IMAD.WIDE.U32 R22, R0, R23, c[0x0][0x1a0] ; /* 0x0000680000167625 */
/* 0x000fe400078e0017 */
/*0180*/ LDG.E.CONSTANT R21, [R20.64] ; /* 0x0000000414157981 */
/* 0x000f28000c1e9900 */
/*0190*/ LDG.E.CONSTANT R22, [R22.64] ; /* 0x0000000416167981 */
/* 0x000f22000c1e9900 */
/*01a0*/ ISETP.NE.U32.AND P0, PT, R14, RZ, PT ; /* 0x000000ff0e00720c */
/* 0x004fe20003f05070 */
/*01b0*/ IMAD.WIDE.U32 R14, R0, R25, c[0x0][0x160] ; /* 0x00005800000e7625 */
/* 0x000fd800078e0019 */
/*01c0*/ @P0 LDG.E.LU.128 R4, [R14.64] ; /* 0x000000040e040981 */
/* 0x008f22000c3e1d00 */
/*01d0*/ IMAD.WIDE.U32 R2, R0, R25, c[0x0][0x1b0] ; /* 0x00006c0000027625 */
/* 0x001fc800078e0019 */
/*01e0*/ IMAD R4, R17, R4, RZ ; /* 0x0000000411047224 */
/* 0x010fe400078e02ff */
/*01f0*/ IMAD R5, R18, R5, RZ ; /* 0x0000000512057224 */
/* 0x020fe400078e02ff */
/*0200*/ IMAD R6, R21, R6, RZ ; /* 0x0000000615067224 */
/* 0x000fe400078e02ff */
/*0210*/ IMAD R7, R22, R7, RZ ; /* 0x0000000716077224 */
/* 0x000fca00078e02ff */
/*0220*/ STG.E.128 [R2.64], R4 ; /* 0x0000000402007986 */
/* 0x000fe2000c101d04 */
/*0230*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0240*/ BRA 0x240; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*0250*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0260*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0270*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0280*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0290*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02a0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02b0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02c0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
You will find that kernel_simple and kernel_simple2 have the same assembly code and their code is almost the same as the non-initialised version kernel2, which hints your loading function does not preserve the existing value of the register.
Also check this: CUDA PTX ISA As it said:
The global (
.global) state space is memory that is accessible by all threads in a context. It is the mechanism by which threads in different CTAs, clusters, and grids can communicate. Useld.global,st.global, andatom.globalto access global variables.Global variables have an optional variable initializer; global variables with no explicit initializer are initialized to zero by default.
I am not sure what is the exact meaning of "initialized to zero", we can write more testing code for this. But anyway, the assembly code told us that kernel_simple does not initialise R4, R5, R6, R7 in a normal way.
@bmmcq has your issue been resolved?
This issue has been labeled inactive-30d due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. This issue will be labeled inactive-90d if there is no activity in the next 60 days.
Closing due to inactivity