// Wrapper for kernmulRSymm2Dyz CUDA kernel, asynchronous.
func k_kernmulRSymm2Dyz_async(fftMy unsafe.Pointer, fftMz unsafe.Pointer, fftKyy unsafe.Pointer, fftKzz unsafe.Pointer, fftKyz unsafe.Pointer, N1 int, N2 int, cfg *config, str cu.Stream) {
	// Lazily load the kernel from the embedded fatbin on first use.
	if kernmulRSymm2Dyz_code == 0 {
		kernmulRSymm2Dyz_code = fatbinLoad(kernmulRSymm2Dyz_map, "kernmulRSymm2Dyz")
	}

	// Copy each argument into the args struct and record the address of the
	// field: cu.LaunchKernel expects a slice of pointers to argument values.
	var a kernmulRSymm2Dyz_args
	a.arg_fftMy = fftMy
	a.argptr[0] = unsafe.Pointer(&a.arg_fftMy)
	a.arg_fftMz = fftMz
	a.argptr[1] = unsafe.Pointer(&a.arg_fftMz)
	a.arg_fftKyy = fftKyy
	a.argptr[2] = unsafe.Pointer(&a.arg_fftKyy)
	a.arg_fftKzz = fftKzz
	a.argptr[3] = unsafe.Pointer(&a.arg_fftKzz)
	a.arg_fftKyz = fftKyz
	a.argptr[4] = unsafe.Pointer(&a.arg_fftKyz)
	a.arg_N1 = N1
	a.argptr[5] = unsafe.Pointer(&a.arg_N1)
	a.arg_N2 = N2
	a.argptr[6] = unsafe.Pointer(&a.arg_N2)

	// Enqueue the kernel on stream str with the given launch configuration;
	// this returns immediately, before the kernel has completed.
	args := a.argptr[:]
	cu.LaunchKernel(kernmulRSymm2Dyz_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for kernmulRSymm3D CUDA kernel, asynchronous.
func k_kernmulRSymm3D_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftMz unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKzz unsafe.Pointer, fftKyz unsafe.Pointer, fftKxz unsafe.Pointer, fftKxy unsafe.Pointer, N0 int, N1 int, N2 int, cfg *config, str cu.Stream) {
	if kernmulRSymm3D_code == 0 {
		kernmulRSymm3D_code = fatbinLoad(kernmulRSymm3D_map, "kernmulRSymm3D")
	}
	var a kernmulRSymm3D_args
	a.arg_fftMx = fftMx
	a.argptr[0] = unsafe.Pointer(&a.arg_fftMx)
	a.arg_fftMy = fftMy
	a.argptr[1] = unsafe.Pointer(&a.arg_fftMy)
	a.arg_fftMz = fftMz
	a.argptr[2] = unsafe.Pointer(&a.arg_fftMz)
	a.arg_fftKxx = fftKxx
	a.argptr[3] = unsafe.Pointer(&a.arg_fftKxx)
	a.arg_fftKyy = fftKyy
	a.argptr[4] = unsafe.Pointer(&a.arg_fftKyy)
	a.arg_fftKzz = fftKzz
	a.argptr[5] = unsafe.Pointer(&a.arg_fftKzz)
	a.arg_fftKyz = fftKyz
	a.argptr[6] = unsafe.Pointer(&a.arg_fftKyz)
	a.arg_fftKxz = fftKxz
	a.argptr[7] = unsafe.Pointer(&a.arg_fftKxz)
	a.arg_fftKxy = fftKxy
	a.argptr[8] = unsafe.Pointer(&a.arg_fftKxy)
	a.arg_N0 = N0
	a.argptr[9] = unsafe.Pointer(&a.arg_N0)
	a.arg_N1 = N1
	a.argptr[10] = unsafe.Pointer(&a.arg_N1)
	a.arg_N2 = N2
	a.argptr[11] = unsafe.Pointer(&a.arg_N2)
	args := a.argptr[:]
	cu.LaunchKernel(kernmulRSymm3D_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for adddmi CUDA kernel, asynchronous.
func k_adddmi_async(Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Dx float32, Dy float32, Dz float32, N0 int, N1 int, N2 int, cfg *config, str cu.Stream) {
	if adddmi_code == 0 {
		adddmi_code = fatbinLoad(adddmi_map, "adddmi")
	}
	var a adddmi_args
	a.arg_Hx = Hx
	a.argptr[0] = unsafe.Pointer(&a.arg_Hx)
	a.arg_Hy = Hy
	a.argptr[1] = unsafe.Pointer(&a.arg_Hy)
	a.arg_Hz = Hz
	a.argptr[2] = unsafe.Pointer(&a.arg_Hz)
	a.arg_mx = mx
	a.argptr[3] = unsafe.Pointer(&a.arg_mx)
	a.arg_my = my
	a.argptr[4] = unsafe.Pointer(&a.arg_my)
	a.arg_mz = mz
	a.argptr[5] = unsafe.Pointer(&a.arg_mz)
	a.arg_Dx = Dx
	a.argptr[6] = unsafe.Pointer(&a.arg_Dx)
	a.arg_Dy = Dy
	a.argptr[7] = unsafe.Pointer(&a.arg_Dy)
	a.arg_Dz = Dz
	a.argptr[8] = unsafe.Pointer(&a.arg_Dz)
	a.arg_N0 = N0
	a.argptr[9] = unsafe.Pointer(&a.arg_N0)
	a.arg_N1 = N1
	a.argptr[10] = unsafe.Pointer(&a.arg_N1)
	a.arg_N2 = N2
	a.argptr[11] = unsafe.Pointer(&a.arg_N2)
	args := a.argptr[:]
	cu.LaunchKernel(adddmi_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for copypadmul CUDA kernel, asynchronous.
func k_copypadmul_async(dst unsafe.Pointer, D0 int, D1 int, D2 int, src unsafe.Pointer, S0 int, S1 int, S2 int, volmask unsafe.Pointer, Bsat float32, cfg *config, str cu.Stream) {
	if copypadmul_code == 0 {
		copypadmul_code = fatbinLoad(copypadmul_map, "copypadmul")
	}
	var a copypadmul_args
	a.arg_dst = dst
	a.argptr[0] = unsafe.Pointer(&a.arg_dst)
	a.arg_D0 = D0
	a.argptr[1] = unsafe.Pointer(&a.arg_D0)
	a.arg_D1 = D1
	a.argptr[2] = unsafe.Pointer(&a.arg_D1)
	a.arg_D2 = D2
	a.argptr[3] = unsafe.Pointer(&a.arg_D2)
	a.arg_src = src
	a.argptr[4] = unsafe.Pointer(&a.arg_src)
	a.arg_S0 = S0
	a.argptr[5] = unsafe.Pointer(&a.arg_S0)
	a.arg_S1 = S1
	a.argptr[6] = unsafe.Pointer(&a.arg_S1)
	a.arg_S2 = S2
	a.argptr[7] = unsafe.Pointer(&a.arg_S2)
	a.arg_volmask = volmask
	a.argptr[8] = unsafe.Pointer(&a.arg_volmask)
	a.arg_Bsat = Bsat
	a.argptr[9] = unsafe.Pointer(&a.arg_Bsat)
	args := a.argptr[:]
	cu.LaunchKernel(copypadmul_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for reducemaxvecdiff2 CUDA kernel, asynchronous.
func k_reducemaxvecdiff2_async(x1 unsafe.Pointer, y1 unsafe.Pointer, z1 unsafe.Pointer, x2 unsafe.Pointer, y2 unsafe.Pointer, z2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config, str cu.Stream) {
	if reducemaxvecdiff2_code == 0 {
		reducemaxvecdiff2_code = fatbinLoad(reducemaxvecdiff2_map, "reducemaxvecdiff2")
	}
	var a reducemaxvecdiff2_args
	a.arg_x1 = x1
	a.argptr[0] = unsafe.Pointer(&a.arg_x1)
	a.arg_y1 = y1
	a.argptr[1] = unsafe.Pointer(&a.arg_y1)
	a.arg_z1 = z1
	a.argptr[2] = unsafe.Pointer(&a.arg_z1)
	a.arg_x2 = x2
	a.argptr[3] = unsafe.Pointer(&a.arg_x2)
	a.arg_y2 = y2
	a.argptr[4] = unsafe.Pointer(&a.arg_y2)
	a.arg_z2 = z2
	a.argptr[5] = unsafe.Pointer(&a.arg_z2)
	a.arg_dst = dst
	a.argptr[6] = unsafe.Pointer(&a.arg_dst)
	a.arg_initVal = initVal
	a.argptr[7] = unsafe.Pointer(&a.arg_initVal)
	a.arg_n = n
	a.argptr[8] = unsafe.Pointer(&a.arg_n)
	args := a.argptr[:]
	cu.LaunchKernel(reducemaxvecdiff2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for dampingtorque CUDA kernel, asynchronous.
func k_dampingtorque_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, N int, cfg *config, str cu.Stream) {
	if dampingtorque_code == 0 {
		dampingtorque_code = fatbinLoad(dampingtorque_map, "dampingtorque")
	}
	var a dampingtorque_args
	a.arg_tx = tx
	a.argptr[0] = unsafe.Pointer(&a.arg_tx)
	a.arg_ty = ty
	a.argptr[1] = unsafe.Pointer(&a.arg_ty)
	a.arg_tz = tz
	a.argptr[2] = unsafe.Pointer(&a.arg_tz)
	a.arg_mx = mx
	a.argptr[3] = unsafe.Pointer(&a.arg_mx)
	a.arg_my = my
	a.argptr[4] = unsafe.Pointer(&a.arg_my)
	a.arg_mz = mz
	a.argptr[5] = unsafe.Pointer(&a.arg_mz)
	a.arg_hx = hx
	a.argptr[6] = unsafe.Pointer(&a.arg_hx)
	a.arg_hy = hy
	a.argptr[7] = unsafe.Pointer(&a.arg_hy)
	a.arg_hz = hz
	a.argptr[8] = unsafe.Pointer(&a.arg_hz)
	a.arg_N = N
	a.argptr[9] = unsafe.Pointer(&a.arg_N)
	args := a.argptr[:]
	cu.LaunchKernel(dampingtorque_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for addexchange1comp CUDA kernel, asynchronous.
func k_addexchange1comp_async(Beff unsafe.Pointer, m unsafe.Pointer, wx float32, wy float32, wz float32, N0 int, N1 int, N2 int, cfg *config, str cu.Stream) {
	if addexchange1comp_code == 0 {
		addexchange1comp_code = fatbinLoad(addexchange1comp_map, "addexchange1comp")
	}
	var a addexchange1comp_args
	a.arg_Beff = Beff
	a.argptr[0] = unsafe.Pointer(&a.arg_Beff)
	a.arg_m = m
	a.argptr[1] = unsafe.Pointer(&a.arg_m)
	a.arg_wx = wx
	a.argptr[2] = unsafe.Pointer(&a.arg_wx)
	a.arg_wy = wy
	a.argptr[3] = unsafe.Pointer(&a.arg_wy)
	a.arg_wz = wz
	a.argptr[4] = unsafe.Pointer(&a.arg_wz)
	a.arg_N0 = N0
	a.argptr[5] = unsafe.Pointer(&a.arg_N0)
	a.arg_N1 = N1
	a.argptr[6] = unsafe.Pointer(&a.arg_N1)
	a.arg_N2 = N2
	a.argptr[7] = unsafe.Pointer(&a.arg_N2)
	args := a.argptr[:]
	cu.LaunchKernel(addexchange1comp_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for madd3 CUDA kernel, asynchronous.
func k_madd3_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, N int, cfg *config, str cu.Stream) {
	if madd3_code == 0 {
		madd3_code = fatbinLoad(madd3_map, "madd3")
	}
	var a madd3_args
	a.arg_dst = dst
	a.argptr[0] = unsafe.Pointer(&a.arg_dst)
	a.arg_src1 = src1
	a.argptr[1] = unsafe.Pointer(&a.arg_src1)
	a.arg_fac1 = fac1
	a.argptr[2] = unsafe.Pointer(&a.arg_fac1)
	a.arg_src2 = src2
	a.argptr[3] = unsafe.Pointer(&a.arg_src2)
	a.arg_fac2 = fac2
	a.argptr[4] = unsafe.Pointer(&a.arg_fac2)
	a.arg_src3 = src3
	a.argptr[5] = unsafe.Pointer(&a.arg_src3)
	a.arg_fac3 = fac3
	a.argptr[6] = unsafe.Pointer(&a.arg_fac3)
	a.arg_N = N
	a.argptr[7] = unsafe.Pointer(&a.arg_N)
	args := a.argptr[:]
	cu.LaunchKernel(madd3_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for adduniaxialanisotropy CUDA kernel, asynchronous.
func k_adduniaxialanisotropy_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ux float32, Uy float32, Uz float32, N int, cfg *config, str cu.Stream) {
	if adduniaxialanisotropy_code == 0 {
		adduniaxialanisotropy_code = fatbinLoad(adduniaxialanisotropy_map, "adduniaxialanisotropy")
	}
	var a adduniaxialanisotropy_args
	a.arg_Bx = Bx
	a.argptr[0] = unsafe.Pointer(&a.arg_Bx)
	a.arg_By = By
	a.argptr[1] = unsafe.Pointer(&a.arg_By)
	a.arg_Bz = Bz
	a.argptr[2] = unsafe.Pointer(&a.arg_Bz)
	a.arg_mx = mx
	a.argptr[3] = unsafe.Pointer(&a.arg_mx)
	a.arg_my = my
	a.argptr[4] = unsafe.Pointer(&a.arg_my)
	a.arg_mz = mz
	a.argptr[5] = unsafe.Pointer(&a.arg_mz)
	a.arg_Ux = Ux
	a.argptr[6] = unsafe.Pointer(&a.arg_Ux)
	a.arg_Uy = Uy
	a.argptr[7] = unsafe.Pointer(&a.arg_Uy)
	a.arg_Uz = Uz
	a.argptr[8] = unsafe.Pointer(&a.arg_Uz)
	a.arg_N = N
	a.argptr[9] = unsafe.Pointer(&a.arg_N)
	args := a.argptr[:]
	cu.LaunchKernel(adduniaxialanisotropy_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for addzhanglitorque CUDA kernel, asynchronous.
func k_addzhanglitorque_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, ux float32, uy float32, uz float32, jmapx unsafe.Pointer, jmapy unsafe.Pointer, jmapz unsafe.Pointer, alpha float32, xi float32, N0 int, N1 int, N2 int, cfg *config, str cu.Stream) {
	if addzhanglitorque_code == 0 {
		addzhanglitorque_code = fatbinLoad(addzhanglitorque_map, "addzhanglitorque")
	}
	var a addzhanglitorque_args
	a.arg_tx = tx
	a.argptr[0] = unsafe.Pointer(&a.arg_tx)
	a.arg_ty = ty
	a.argptr[1] = unsafe.Pointer(&a.arg_ty)
	a.arg_tz = tz
	a.argptr[2] = unsafe.Pointer(&a.arg_tz)
	a.arg_mx = mx
	a.argptr[3] = unsafe.Pointer(&a.arg_mx)
	a.arg_my = my
	a.argptr[4] = unsafe.Pointer(&a.arg_my)
	a.arg_mz = mz
	a.argptr[5] = unsafe.Pointer(&a.arg_mz)
	a.arg_ux = ux
	a.argptr[6] = unsafe.Pointer(&a.arg_ux)
	a.arg_uy = uy
	a.argptr[7] = unsafe.Pointer(&a.arg_uy)
	a.arg_uz = uz
	a.argptr[8] = unsafe.Pointer(&a.arg_uz)
	a.arg_jmapx = jmapx
	a.argptr[9] = unsafe.Pointer(&a.arg_jmapx)
	a.arg_jmapy = jmapy
	a.argptr[10] = unsafe.Pointer(&a.arg_jmapy)
	a.arg_jmapz = jmapz
	a.argptr[11] = unsafe.Pointer(&a.arg_jmapz)
	a.arg_alpha = alpha
	a.argptr[12] = unsafe.Pointer(&a.arg_alpha)
	a.arg_xi = xi
	a.argptr[13] = unsafe.Pointer(&a.arg_xi)
	a.arg_N0 = N0
	a.argptr[14] = unsafe.Pointer(&a.arg_N0)
	a.arg_N1 = N1
	a.argptr[15] = unsafe.Pointer(&a.arg_N1)
	a.arg_N2 = N2
	a.argptr[16] = unsafe.Pointer(&a.arg_N2)
	args := a.argptr[:]
	cu.LaunchKernel(addzhanglitorque_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for kernmulRSymm2Dx CUDA kernel, asynchronous.
func k_kernmulRSymm2Dx_async(fftMx unsafe.Pointer, fftKxx unsafe.Pointer, N1 int, N2 int, cfg *config, str cu.Stream) {
	if kernmulRSymm2Dx_code == 0 {
		kernmulRSymm2Dx_code = fatbinLoad(kernmulRSymm2Dx_map, "kernmulRSymm2Dx")
	}
	var a kernmulRSymm2Dx_args
	a.arg_fftMx = fftMx
	a.argptr[0] = unsafe.Pointer(&a.arg_fftMx)
	a.arg_fftKxx = fftKxx
	a.argptr[1] = unsafe.Pointer(&a.arg_fftKxx)
	a.arg_N1 = N1
	a.argptr[2] = unsafe.Pointer(&a.arg_N1)
	a.arg_N2 = N2
	a.argptr[3] = unsafe.Pointer(&a.arg_N2)
	args := a.argptr[:]
	cu.LaunchKernel(kernmulRSymm2Dx_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for normalize CUDA kernel, asynchronous.
func k_normalize_async(vx unsafe.Pointer, vy unsafe.Pointer, vz unsafe.Pointer, N int, cfg *config, str cu.Stream) {
	if normalize_code == 0 {
		normalize_code = fatbinLoad(normalize_map, "normalize")
	}
	var a normalize_args
	a.arg_vx = vx
	a.argptr[0] = unsafe.Pointer(&a.arg_vx)
	a.arg_vy = vy
	a.argptr[1] = unsafe.Pointer(&a.arg_vy)
	a.arg_vz = vz
	a.argptr[2] = unsafe.Pointer(&a.arg_vz)
	a.arg_N = N
	a.argptr[3] = unsafe.Pointer(&a.arg_N)
	args := a.argptr[:]
	cu.LaunchKernel(normalize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

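// exampleNormalize is an illustrative sketch, not part of the generated
// wrappers: it shows how a caller typically drives one of these asynchronous
// functions. It assumes make1DConf builds a 1D launch configuration covering
// N elements (as done elsewhere in this package) and that vx, vy, vz already
// point to N float32 components each on the device; both are assumptions for
// illustration, not guaranteed API.
func exampleNormalize(vx, vy, vz unsafe.Pointer, N int, str cu.Stream) {
	cfg := make1DConf(N)                       // launch geometry spanning all N vectors (assumed helper)
	k_normalize_async(vx, vy, vz, N, cfg, str) // enqueue on str; returns before the kernel completes
	str.Synchronize()                          // block until normalization has finished
}
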
// Wrapper for reducesum CUDA kernel, asynchronous.
func k_reducesum_async(src unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config, str cu.Stream) {
	if reducesum_code == 0 {
		reducesum_code = fatbinLoad(reducesum_map, "reducesum")
	}
	var a reducesum_args
	a.arg_src = src
	a.argptr[0] = unsafe.Pointer(&a.arg_src)
	a.arg_dst = dst
	a.argptr[1] = unsafe.Pointer(&a.arg_dst)
	a.arg_initVal = initVal
	a.argptr[2] = unsafe.Pointer(&a.arg_initVal)
	a.arg_n = n
	a.argptr[3] = unsafe.Pointer(&a.arg_n)
	args := a.argptr[:]
	cu.LaunchKernel(reducesum_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

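// exampleReduceSum is an illustrative sketch, not part of the generated
// wrappers: one way a caller might sum n device floats via k_reducesum_async.
// The contract assumed here is that dst is a single-float device accumulator
// that the kernel folds src[0:n] into, starting from initVal; reduceCfg
// stands in for however the caller sizes the reduction launch. All of this is
// an assumption for illustration, not the package's definitive reduce path.
func exampleReduceSum(src unsafe.Pointer, n int, reduceCfg *config, str cu.Stream) float32 {
	dst := cu.MemAlloc(cu.SIZEOF_FLOAT32) // 1-float accumulator on the device
	defer cu.MemFree(dst)

	zero := float32(0)
	cu.MemcpyHtoD(dst, unsafe.Pointer(&zero), cu.SIZEOF_FLOAT32) // clear the accumulator first

	k_reducesum_async(src, unsafe.Pointer(uintptr(dst)), 0, n, reduceCfg, str)
	str.Synchronize() // wait until the partial sums have landed in dst

	var result float32
	cu.MemcpyDtoH(unsafe.Pointer(&result), dst, cu.SIZEOF_FLOAT32) // copy the scalar back to the host
	return result
}
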
// Wrapper for reducedot CUDA kernel, asynchronous.
func k_reducedot_async(x1 unsafe.Pointer, x2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config, str cu.Stream) {
	if reducedot_code == 0 {
		reducedot_code = fatbinLoad(reducedot_map, "reducedot")
	}
	var a reducedot_args
	a.arg_x1 = x1
	a.argptr[0] = unsafe.Pointer(&a.arg_x1)
	a.arg_x2 = x2
	a.argptr[1] = unsafe.Pointer(&a.arg_x2)
	a.arg_dst = dst
	a.argptr[2] = unsafe.Pointer(&a.arg_dst)
	a.arg_initVal = initVal
	a.argptr[3] = unsafe.Pointer(&a.arg_initVal)
	a.arg_n = n
	a.argptr[4] = unsafe.Pointer(&a.arg_n)
	args := a.argptr[:]
	cu.LaunchKernel(reducedot_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for reducemaxvecnorm2 CUDA kernel, asynchronous.
func k_reducemaxvecnorm2_async(x unsafe.Pointer, y unsafe.Pointer, z unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config, str cu.Stream) {
	if reducemaxvecnorm2_code == 0 {
		reducemaxvecnorm2_code = fatbinLoad(reducemaxvecnorm2_map, "reducemaxvecnorm2")
	}
	var a reducemaxvecnorm2_args
	a.arg_x = x
	a.argptr[0] = unsafe.Pointer(&a.arg_x)
	a.arg_y = y
	a.argptr[1] = unsafe.Pointer(&a.arg_y)
	a.arg_z = z
	a.argptr[2] = unsafe.Pointer(&a.arg_z)
	a.arg_dst = dst
	a.argptr[3] = unsafe.Pointer(&a.arg_dst)
	a.arg_initVal = initVal
	a.argptr[4] = unsafe.Pointer(&a.arg_initVal)
	a.arg_n = n
	a.argptr[5] = unsafe.Pointer(&a.arg_n)
	args := a.argptr[:]
	cu.LaunchKernel(reducemaxvecnorm2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}

// Wrapper for kernmulC CUDA kernel, asynchronous.
func k_kernmulC_async(Mx unsafe.Pointer, My unsafe.Pointer, Mz unsafe.Pointer, Kxx unsafe.Pointer, Kyy unsafe.Pointer, Kzz unsafe.Pointer, Kyz unsafe.Pointer, Kxz unsafe.Pointer, Kxy unsafe.Pointer, Kzy unsafe.Pointer, Kzx unsafe.Pointer, Kyx unsafe.Pointer, N int, cfg *config, str cu.Stream) {
	if kernmulC_code == 0 {
		kernmulC_code = fatbinLoad(kernmulC_map, "kernmulC")
	}
	var a kernmulC_args
	a.arg_Mx = Mx
	a.argptr[0] = unsafe.Pointer(&a.arg_Mx)
	a.arg_My = My
	a.argptr[1] = unsafe.Pointer(&a.arg_My)
	a.arg_Mz = Mz
	a.argptr[2] = unsafe.Pointer(&a.arg_Mz)
	a.arg_Kxx = Kxx
	a.argptr[3] = unsafe.Pointer(&a.arg_Kxx)
	a.arg_Kyy = Kyy
	a.argptr[4] = unsafe.Pointer(&a.arg_Kyy)
	a.arg_Kzz = Kzz
	a.argptr[5] = unsafe.Pointer(&a.arg_Kzz)
	a.arg_Kyz = Kyz
	a.argptr[6] = unsafe.Pointer(&a.arg_Kyz)
	a.arg_Kxz = Kxz
	a.argptr[7] = unsafe.Pointer(&a.arg_Kxz)
	a.arg_Kxy = Kxy
	a.argptr[8] = unsafe.Pointer(&a.arg_Kxy)
	a.arg_Kzy = Kzy
	a.argptr[9] = unsafe.Pointer(&a.arg_Kzy)
	a.arg_Kzx = Kzx
	a.argptr[10] = unsafe.Pointer(&a.arg_Kzx)
	a.arg_Kyx = Kyx
	a.argptr[11] = unsafe.Pointer(&a.arg_Kyx)
	a.arg_N = N
	a.argptr[12] = unsafe.Pointer(&a.arg_N)
	args := a.argptr[:]
	cu.LaunchKernel(kernmulC_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, str, args)
}