// MemCpy copies bytes from src to dst, both in device memory, and synchronizes.
func MemCpy(dst, src unsafe.Pointer, bytes int64) {
	Sync()
	timer.Start("memcpy")
	cu.MemcpyAsync(cu.DevicePtr(uintptr(dst)), cu.DevicePtr(uintptr(src)), bytes, stream0)
	Sync()
	timer.Stop("memcpy")
}
// MemCpyHtoD copies bytes from host memory (src) to device memory (dst) and synchronizes.
func MemCpyHtoD(dst, src unsafe.Pointer, bytes int64) {
	Sync() // sync previous kernels
	timer.Start("memcpyHtoD")
	cu.MemcpyHtoD(cu.DevicePtr(uintptr(dst)), src, bytes)
	Sync() // sync copy
	timer.Stop("memcpyHtoD")
}
// Wrapper for kernmulRSymm3D CUDA kernel, asynchronous.
func k_kernmulRSymm3D_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftMz unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKzz unsafe.Pointer, fftKyz unsafe.Pointer, fftKxz unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("kernmulRSymm3D")
	}

	kernmulRSymm3D_args.Lock()
	defer kernmulRSymm3D_args.Unlock()

	if kernmulRSymm3D_code == 0 {
		kernmulRSymm3D_code = fatbinLoad(kernmulRSymm3D_map, "kernmulRSymm3D")
	}

	kernmulRSymm3D_args.arg_fftMx = fftMx
	kernmulRSymm3D_args.arg_fftMy = fftMy
	kernmulRSymm3D_args.arg_fftMz = fftMz
	kernmulRSymm3D_args.arg_fftKxx = fftKxx
	kernmulRSymm3D_args.arg_fftKyy = fftKyy
	kernmulRSymm3D_args.arg_fftKzz = fftKzz
	kernmulRSymm3D_args.arg_fftKyz = fftKyz
	kernmulRSymm3D_args.arg_fftKxz = fftKxz
	kernmulRSymm3D_args.arg_fftKxy = fftKxy
	kernmulRSymm3D_args.arg_Nx = Nx
	kernmulRSymm3D_args.arg_Ny = Ny
	kernmulRSymm3D_args.arg_Nz = Nz

	args := kernmulRSymm3D_args.argptr[:]
	cu.LaunchKernel(kernmulRSymm3D_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("kernmulRSymm3D")
	}
}
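// A minimal usage sketch for the wrapper above (illustrative, not from this
// section): fftM would hold the three FFT'd magnetization components and fftK
// the six independent components of the symmetric convolution kernel. The
// DevPtr component ordering and the make3DConf helper are assumptions here.
//
//	cfg := make3DConf([3]int{Nx, Ny, Nz})
//	k_kernmulRSymm3D_async(
//		fftM.DevPtr(0), fftM.DevPtr(1), fftM.DevPtr(2),
//		fftK.DevPtr(0), fftK.DevPtr(1), fftK.DevPtr(2),
//		fftK.DevPtr(3), fftK.DevPtr(4), fftK.DevPtr(5),
//		Nx, Ny, Nz, cfg)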
// Wrapper for reducemaxdiff CUDA kernel, asynchronous.
func k_reducemaxdiff_async(src1 unsafe.Pointer, src2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("reducemaxdiff")
	}

	reducemaxdiff_args.Lock()
	defer reducemaxdiff_args.Unlock()

	if reducemaxdiff_code == 0 {
		reducemaxdiff_code = fatbinLoad(reducemaxdiff_map, "reducemaxdiff")
	}

	reducemaxdiff_args.arg_src1 = src1
	reducemaxdiff_args.arg_src2 = src2
	reducemaxdiff_args.arg_dst = dst
	reducemaxdiff_args.arg_initVal = initVal
	reducemaxdiff_args.arg_n = n

	args := reducemaxdiff_args.argptr[:]
	cu.LaunchKernel(reducemaxdiff_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("reducemaxdiff")
	}
}
// Wrapper for minimize CUDA kernel, asynchronous.
func k_minimize_async(mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, m0x unsafe.Pointer, m0y unsafe.Pointer, m0z unsafe.Pointer, tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, dt float32, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("minimize")
	}

	minimize_args.Lock()
	defer minimize_args.Unlock()

	if minimize_code == 0 {
		minimize_code = fatbinLoad(minimize_map, "minimize")
	}

	minimize_args.arg_mx = mx
	minimize_args.arg_my = my
	minimize_args.arg_mz = mz
	minimize_args.arg_m0x = m0x
	minimize_args.arg_m0y = m0y
	minimize_args.arg_m0z = m0z
	minimize_args.arg_tx = tx
	minimize_args.arg_ty = ty
	minimize_args.arg_tz = tz
	minimize_args.arg_dt = dt
	minimize_args.arg_N = N

	args := minimize_args.argptr[:]
	cu.LaunchKernel(minimize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("minimize")
	}
}
// Wrapper for normalize CUDA kernel, asynchronous.
func k_normalize_async(vx unsafe.Pointer, vy unsafe.Pointer, vz unsafe.Pointer, vol unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("normalize")
	}

	normalize_args.Lock()
	defer normalize_args.Unlock()

	if normalize_code == 0 {
		normalize_code = fatbinLoad(normalize_map, "normalize")
	}

	normalize_args.arg_vx = vx
	normalize_args.arg_vy = vy
	normalize_args.arg_vz = vz
	normalize_args.arg_vol = vol
	normalize_args.arg_N = N

	args := normalize_args.argptr[:]
	cu.LaunchKernel(normalize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("normalize")
	}
}
// Wrapper for reducemaxvecnorm2 CUDA kernel, asynchronous.
func k_reducemaxvecnorm2_async(x unsafe.Pointer, y unsafe.Pointer, z unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("reducemaxvecnorm2")
	}

	reducemaxvecnorm2_args.Lock()
	defer reducemaxvecnorm2_args.Unlock()

	if reducemaxvecnorm2_code == 0 {
		reducemaxvecnorm2_code = fatbinLoad(reducemaxvecnorm2_map, "reducemaxvecnorm2")
	}

	reducemaxvecnorm2_args.arg_x = x
	reducemaxvecnorm2_args.arg_y = y
	reducemaxvecnorm2_args.arg_z = z
	reducemaxvecnorm2_args.arg_dst = dst
	reducemaxvecnorm2_args.arg_initVal = initVal
	reducemaxvecnorm2_args.arg_n = n

	args := reducemaxvecnorm2_args.argptr[:]
	cu.LaunchKernel(reducemaxvecnorm2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("reducemaxvecnorm2")
	}
}
// Wrapper for madd3 CUDA kernel, asynchronous.
func k_madd3_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("madd3")
	}

	madd3_args.Lock()
	defer madd3_args.Unlock()

	if madd3_code == 0 {
		madd3_code = fatbinLoad(madd3_map, "madd3")
	}

	madd3_args.arg_dst = dst
	madd3_args.arg_src1 = src1
	madd3_args.arg_fac1 = fac1
	madd3_args.arg_src2 = src2
	madd3_args.arg_fac2 = fac2
	madd3_args.arg_src3 = src3
	madd3_args.arg_fac3 = fac3
	madd3_args.arg_N = N

	args := madd3_args.argptr[:]
	cu.LaunchKernel(madd3_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("madd3")
	}
}
// Wrapper for regiondecode CUDA kernel, asynchronous.
func k_regiondecode_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("regiondecode")
	}

	regiondecode_args.Lock()
	defer regiondecode_args.Unlock()

	if regiondecode_code == 0 {
		regiondecode_code = fatbinLoad(regiondecode_map, "regiondecode")
	}

	regiondecode_args.arg_dst = dst
	regiondecode_args.arg_LUT = LUT
	regiondecode_args.arg_regions = regions
	regiondecode_args.arg_N = N

	args := regiondecode_args.argptr[:]
	cu.LaunchKernel(regiondecode_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("regiondecode")
	}
}
// Wrapper for exchangedecode CUDA kernel, asynchronous.
func k_exchangedecode_async(dst unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("exchangedecode")
	}

	exchangedecode_args.Lock()
	defer exchangedecode_args.Unlock()

	if exchangedecode_code == 0 {
		exchangedecode_code = fatbinLoad(exchangedecode_map, "exchangedecode")
	}

	exchangedecode_args.arg_dst = dst
	exchangedecode_args.arg_aLUT2d = aLUT2d
	exchangedecode_args.arg_regions = regions
	exchangedecode_args.arg_wx = wx
	exchangedecode_args.arg_wy = wy
	exchangedecode_args.arg_wz = wz
	exchangedecode_args.arg_Nx = Nx
	exchangedecode_args.arg_Ny = Ny
	exchangedecode_args.arg_Nz = Nz
	exchangedecode_args.arg_PBC = PBC

	args := exchangedecode_args.argptr[:]
	cu.LaunchKernel(exchangedecode_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("exchangedecode")
	}
}
// Wrapper for mul CUDA kernel, asynchronous.
func k_mul_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("mul")
	}

	mul_args.Lock()
	defer mul_args.Unlock()

	if mul_code == 0 {
		mul_code = fatbinLoad(mul_map, "mul")
	}

	mul_args.arg_dst = dst
	mul_args.arg_a = a
	mul_args.arg_b = b
	mul_args.arg_N = N

	args := mul_args.argptr[:]
	cu.LaunchKernel(mul_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("mul")
	}
}
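// The generated wrappers are typically called through small typed helpers.
// A minimal sketch for k_mul_async, assuming dst, a and b are *data.Slice
// values of equal size and component count; make1DConf is assumed here to
// build a 1D launch config covering N elements.
func Mul(dst, a, b *data.Slice) {
	N := dst.Len()
	util.Argument(a.Len() == N && b.Len() == N)
	cfg := make1DConf(N)
	for c := 0; c < dst.NComp(); c++ { // one launch per component
		k_mul_async(dst.DevPtr(c), a.DevPtr(c), b.DevPtr(c), N, cfg)
	}
}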
// Wrapper for llnoprecess CUDA kernel, asynchronous.
func k_llnoprecess_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("llnoprecess")
	}

	llnoprecess_args.Lock()
	defer llnoprecess_args.Unlock()

	if llnoprecess_code == 0 {
		llnoprecess_code = fatbinLoad(llnoprecess_map, "llnoprecess")
	}

	llnoprecess_args.arg_tx = tx
	llnoprecess_args.arg_ty = ty
	llnoprecess_args.arg_tz = tz
	llnoprecess_args.arg_mx = mx
	llnoprecess_args.arg_my = my
	llnoprecess_args.arg_mz = mz
	llnoprecess_args.arg_hx = hx
	llnoprecess_args.arg_hy = hy
	llnoprecess_args.arg_hz = hz
	llnoprecess_args.arg_N = N

	args := llnoprecess_args.argptr[:]
	cu.LaunchKernel(llnoprecess_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("llnoprecess")
	}
}
// Wrapper for adduniaxialanisotropy CUDA kernel, asynchronous.
func k_adduniaxialanisotropy_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, K1LUT unsafe.Pointer, K2LUT unsafe.Pointer, uxLUT unsafe.Pointer, uyLUT unsafe.Pointer, uzLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("adduniaxialanisotropy")
	}

	adduniaxialanisotropy_args.Lock()
	defer adduniaxialanisotropy_args.Unlock()

	if adduniaxialanisotropy_code == 0 {
		adduniaxialanisotropy_code = fatbinLoad(adduniaxialanisotropy_map, "adduniaxialanisotropy")
	}

	adduniaxialanisotropy_args.arg_Bx = Bx
	adduniaxialanisotropy_args.arg_By = By
	adduniaxialanisotropy_args.arg_Bz = Bz
	adduniaxialanisotropy_args.arg_mx = mx
	adduniaxialanisotropy_args.arg_my = my
	adduniaxialanisotropy_args.arg_mz = mz
	adduniaxialanisotropy_args.arg_K1LUT = K1LUT
	adduniaxialanisotropy_args.arg_K2LUT = K2LUT
	adduniaxialanisotropy_args.arg_uxLUT = uxLUT
	adduniaxialanisotropy_args.arg_uyLUT = uyLUT
	adduniaxialanisotropy_args.arg_uzLUT = uzLUT
	adduniaxialanisotropy_args.arg_regions = regions
	adduniaxialanisotropy_args.arg_N = N

	args := adduniaxialanisotropy_args.argptr[:]
	cu.LaunchKernel(adduniaxialanisotropy_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("adduniaxialanisotropy")
	}
}
// Wrapper for kernmulC CUDA kernel, asynchronous.
func k_kernmulC_async(fftM unsafe.Pointer, fftK unsafe.Pointer, Nx int, Ny int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("kernmulC")
	}

	kernmulC_args.Lock()
	defer kernmulC_args.Unlock()

	if kernmulC_code == 0 {
		kernmulC_code = fatbinLoad(kernmulC_map, "kernmulC")
	}

	kernmulC_args.arg_fftM = fftM
	kernmulC_args.arg_fftK = fftK
	kernmulC_args.arg_Nx = Nx
	kernmulC_args.arg_Ny = Ny

	args := kernmulC_args.argptr[:]
	cu.LaunchKernel(kernmulC_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("kernmulC")
	}
}
// Wrapper for shiftbytes CUDA kernel, asynchronous.
func k_shiftbytes_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clamp byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("shiftbytes")
	}

	shiftbytes_args.Lock()
	defer shiftbytes_args.Unlock()

	if shiftbytes_code == 0 {
		shiftbytes_code = fatbinLoad(shiftbytes_map, "shiftbytes")
	}

	shiftbytes_args.arg_dst = dst
	shiftbytes_args.arg_src = src
	shiftbytes_args.arg_Nx = Nx
	shiftbytes_args.arg_Ny = Ny
	shiftbytes_args.arg_Nz = Nz
	shiftbytes_args.arg_shx = shx
	shiftbytes_args.arg_clamp = clamp

	args := shiftbytes_args.argptr[:]
	cu.LaunchKernel(shiftbytes_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("shiftbytes")
	}
}
// Wrapper for dotproduct CUDA kernel, asynchronous.
func k_dotproduct_async(dst unsafe.Pointer, prefactor float32, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("dotproduct")
	}

	dotproduct_args.Lock()
	defer dotproduct_args.Unlock()

	if dotproduct_code == 0 {
		dotproduct_code = fatbinLoad(dotproduct_map, "dotproduct")
	}

	dotproduct_args.arg_dst = dst
	dotproduct_args.arg_prefactor = prefactor
	dotproduct_args.arg_ax = ax
	dotproduct_args.arg_ay = ay
	dotproduct_args.arg_az = az
	dotproduct_args.arg_bx = bx
	dotproduct_args.arg_by = by
	dotproduct_args.arg_bz = bz
	dotproduct_args.arg_N = N

	args := dotproduct_args.argptr[:]
	cu.LaunchKernel(dotproduct_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("dotproduct")
	}
}
// Wrapper for crop CUDA kernel, asynchronous.
func k_crop_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, Offx int, Offy int, Offz int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("crop")
	}

	crop_args.Lock()
	defer crop_args.Unlock()

	if crop_code == 0 {
		crop_code = fatbinLoad(crop_map, "crop")
	}

	crop_args.arg_dst = dst
	crop_args.arg_Dx = Dx
	crop_args.arg_Dy = Dy
	crop_args.arg_Dz = Dz
	crop_args.arg_src = src
	crop_args.arg_Sx = Sx
	crop_args.arg_Sy = Sy
	crop_args.arg_Sz = Sz
	crop_args.arg_Offx = Offx
	crop_args.arg_Offy = Offy
	crop_args.arg_Offz = Offz

	args := crop_args.argptr[:]
	cu.LaunchKernel(crop_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("crop")
	}
}
// Wrapper for reducemaxvecdiff2 CUDA kernel, asynchronous.
func k_reducemaxvecdiff2_async(x1 unsafe.Pointer, y1 unsafe.Pointer, z1 unsafe.Pointer, x2 unsafe.Pointer, y2 unsafe.Pointer, z2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("reducemaxvecdiff2")
	}

	reducemaxvecdiff2_args.Lock()
	defer reducemaxvecdiff2_args.Unlock()

	if reducemaxvecdiff2_code == 0 {
		reducemaxvecdiff2_code = fatbinLoad(reducemaxvecdiff2_map, "reducemaxvecdiff2")
	}

	reducemaxvecdiff2_args.arg_x1 = x1
	reducemaxvecdiff2_args.arg_y1 = y1
	reducemaxvecdiff2_args.arg_z1 = z1
	reducemaxvecdiff2_args.arg_x2 = x2
	reducemaxvecdiff2_args.arg_y2 = y2
	reducemaxvecdiff2_args.arg_z2 = z2
	reducemaxvecdiff2_args.arg_dst = dst
	reducemaxvecdiff2_args.arg_initVal = initVal
	reducemaxvecdiff2_args.arg_n = n

	args := reducemaxvecdiff2_args.argptr[:]
	cu.LaunchKernel(reducemaxvecdiff2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("reducemaxvecdiff2")
	}
}
// Wrapper for settemperature CUDA kernel, asynchronous.
func k_settemperature_async(B unsafe.Pointer, noise unsafe.Pointer, kB2_VgammaDt float32, tempRedLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("settemperature")
	}

	settemperature_args.Lock()
	defer settemperature_args.Unlock()

	if settemperature_code == 0 {
		settemperature_code = fatbinLoad(settemperature_map, "settemperature")
	}

	settemperature_args.arg_B = B
	settemperature_args.arg_noise = noise
	settemperature_args.arg_kB2_VgammaDt = kB2_VgammaDt
	settemperature_args.arg_tempRedLUT = tempRedLUT
	settemperature_args.arg_regions = regions
	settemperature_args.arg_N = N

	args := settemperature_args.argptr[:]
	cu.LaunchKernel(settemperature_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("settemperature")
	}
}
// Wrapper for zeromask CUDA kernel, asynchronous.
func k_zeromask_async(dst unsafe.Pointer, maskLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("zeromask")
	}

	zeromask_args.Lock()
	defer zeromask_args.Unlock()

	if zeromask_code == 0 {
		zeromask_code = fatbinLoad(zeromask_map, "zeromask")
	}

	zeromask_args.arg_dst = dst
	zeromask_args.arg_maskLUT = maskLUT
	zeromask_args.arg_regions = regions
	zeromask_args.arg_N = N

	args := zeromask_args.argptr[:]
	cu.LaunchKernel(zeromask_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("zeromask")
	}
}
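// The LUT/regions pattern shared by the region-based wrappers above: regions
// holds one byte per cell, used to index a small per-region lookup table.
// For zeromask, the equivalent host-side logic would be (a sketch, assuming
// float32 data and byte-valued region indices):
//
//	for i := 0; i < N; i++ {
//		if maskLUT[regions[i]] == 0 {
//			dst[i] = 0
//		}
//	}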
// Wrapper for copypadmul CUDA kernel, asynchronous.
func k_copypadmul_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, vol unsafe.Pointer, Sx int, Sy int, Sz int, BsatLUT unsafe.Pointer, regions unsafe.Pointer, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("copypadmul")
	}

	copypadmul_args.Lock()
	defer copypadmul_args.Unlock()

	if copypadmul_code == 0 {
		copypadmul_code = fatbinLoad(copypadmul_map, "copypadmul")
	}

	copypadmul_args.arg_dst = dst
	copypadmul_args.arg_Dx = Dx
	copypadmul_args.arg_Dy = Dy
	copypadmul_args.arg_Dz = Dz
	copypadmul_args.arg_src = src
	copypadmul_args.arg_vol = vol
	copypadmul_args.arg_Sx = Sx
	copypadmul_args.arg_Sy = Sy
	copypadmul_args.arg_Sz = Sz
	copypadmul_args.arg_BsatLUT = BsatLUT
	copypadmul_args.arg_regions = regions

	args := copypadmul_args.argptr[:]
	cu.LaunchKernel(copypadmul_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("copypadmul")
	}
}
// MemCpyDtoH copies bytes from device memory (src) to host memory (dst) and synchronizes.
func MemCpyDtoH(dst, src unsafe.Pointer, bytes int64) {
	Sync() // sync previous kernels
	timer.Start("memcpyDtoH")
	cu.MemcpyDtoH(dst, cu.DevicePtr(uintptr(src)), bytes)
	Sync() // sync copy
	timer.Stop("memcpyDtoH")
}
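// A minimal round-trip sketch for the copy helpers above, assuming dev1 and
// dev2 point to device allocations of n float32 values and host is a host
// buffer of the same length:
//
//	bytes := int64(n) * 4 // 4 = size of float32 in bytes
//	MemCpyHtoD(dev1, unsafe.Pointer(&host[0]), bytes) // upload
//	MemCpy(dev2, dev1, bytes)                         // device-to-device
//	MemCpyDtoH(unsafe.Pointer(&host[0]), dev2, bytes) // download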
// Wrapper for resize CUDA kernel, asynchronous.
func k_resize_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, layer int, scalex int, scaley int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("resize")
	}

	resize_args.Lock()
	defer resize_args.Unlock()

	if resize_code == 0 {
		resize_code = fatbinLoad(resize_map, "resize")
	}

	resize_args.arg_dst = dst
	resize_args.arg_Dx = Dx
	resize_args.arg_Dy = Dy
	resize_args.arg_Dz = Dz
	resize_args.arg_src = src
	resize_args.arg_Sx = Sx
	resize_args.arg_Sy = Sy
	resize_args.arg_Sz = Sz
	resize_args.arg_layer = layer
	resize_args.arg_scalex = scalex
	resize_args.arg_scaley = scaley

	args := resize_args.argptr[:]
	cu.LaunchKernel(resize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("resize")
	}
}
// Wrapper for regionaddv CUDA kernel, asynchronous.
func k_regionaddv_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, LUTx unsafe.Pointer, LUTy unsafe.Pointer, LUTz unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("regionaddv")
	}

	regionaddv_args.Lock()
	defer regionaddv_args.Unlock()

	if regionaddv_code == 0 {
		regionaddv_code = fatbinLoad(regionaddv_map, "regionaddv")
	}

	regionaddv_args.arg_dstx = dstx
	regionaddv_args.arg_dsty = dsty
	regionaddv_args.arg_dstz = dstz
	regionaddv_args.arg_LUTx = LUTx
	regionaddv_args.arg_LUTy = LUTy
	regionaddv_args.arg_LUTz = LUTz
	regionaddv_args.arg_regions = regions
	regionaddv_args.arg_N = N

	args := regionaddv_args.argptr[:]
	cu.LaunchKernel(regionaddv_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("regionaddv")
	}
}
// queOutput queues a function for asynchronous output saving.
func queOutput(f func()) {
	if cuda.Synchronous {
		timer.Start("io")
	}
	queLen.Add(1)
	saveQue <- f
	if cuda.Synchronous {
		timer.Stop("io")
	}
}
// Memset sets the Slice's components to the specified values.
// Use with care on a unified slice (requires synchronization).
func Memset(s *data.Slice, val ...float32) {
	if Synchronous { // debug
		Sync()
		timer.Start("memset")
	}
	util.Argument(len(val) == s.NComp())
	for c, v := range val {
		cu.MemsetD32Async(cu.DevicePtr(uintptr(s.DevPtr(c))), math.Float32bits(v), int64(s.Len()), stream0)
	}
	if Synchronous { // debug
		Sync()
		timer.Stop("memset")
	}
}
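// Usage note for Memset: exactly one value per component is required (checked
// by util.Argument above). For example, zeroing a 3-component vector slice s:
//
//	Memset(s, 0, 0, 0)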
// Flush writes the table's buffered output to its destination, if any.
func (t *DataTable) Flush() error {
	if t.output == nil {
		return nil
	}
	if cuda.Synchronous {
		timer.Start("io")
	}
	err := t.output.Flush()
	if cuda.Synchronous {
		timer.Stop("io")
	}
	util.FatalErr(err)
	return err
}
// Execute the FFT plan, asynchronous.
// src and dst are 3D arrays stored as 1D arrays.
func (p *fft3DC2RPlan) ExecAsync(src, dst *data.Slice) {
	if Synchronous {
		Sync()
		timer.Start("fft")
	}
	oksrclen := p.InputLenFloats()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLenFloats()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecC2R(cu.DevicePtr(uintptr(src.DevPtr(0))), cu.DevicePtr(uintptr(dst.DevPtr(0))))
	if Synchronous {
		Sync()
		timer.Stop("fft")
	}
}
// Execute the FFT plan, asynchronous.
// src and dst are 3D arrays stored as 1D arrays.
func (p *fft3DR2CPlan) ExecAsync(src, dst *data.Slice) {
	if Synchronous {
		Sync()
		timer.Start("fft")
	}
	util.Argument(src.NComp() == 1 && dst.NComp() == 1)
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		log.Panicf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len())
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		log.Panicf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())
	}
	p.handle.ExecR2C(cu.DevicePtr(uintptr(src.DevPtr(0))), cu.DevicePtr(uintptr(dst.DevPtr(0))))
	if Synchronous {
		Sync()
		timer.Stop("fft")
	}
}
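// A round-trip sketch for the two plans above, assuming fwd (fft3DR2CPlan)
// and bwd (fft3DC2RPlan) were built for the same logical grid, and that real
// and cplx are single-component slices matching the plans' input/output
// lengths (the plan constructors are not shown in this section):
//
//	fwd.ExecAsync(real, cplx) // real -> complex
//	// ... pointwise kernel multiplication on cplx goes here ...
//	bwd.ExecAsync(cplx, real) // complex -> real
//
// Note that cuFFT transforms are unnormalized: a forward+inverse round trip
// scales the data by the total number of points.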
// Wrapper for adduniaxialanisotropy2 CUDA kernel, asynchronous.
func k_adduniaxialanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, K1_ unsafe.Pointer, K1_mul float32, K2_ unsafe.Pointer, K2_mul float32, ux_ unsafe.Pointer, ux_mul float32, uy_ unsafe.Pointer, uy_mul float32, uz_ unsafe.Pointer, uz_mul float32, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("adduniaxialanisotropy2")
	}

	adduniaxialanisotropy2_args.Lock()
	defer adduniaxialanisotropy2_args.Unlock()

	if adduniaxialanisotropy2_code == 0 {
		adduniaxialanisotropy2_code = fatbinLoad(adduniaxialanisotropy2_map, "adduniaxialanisotropy2")
	}

	adduniaxialanisotropy2_args.arg_Bx = Bx
	adduniaxialanisotropy2_args.arg_By = By
	adduniaxialanisotropy2_args.arg_Bz = Bz
	adduniaxialanisotropy2_args.arg_mx = mx
	adduniaxialanisotropy2_args.arg_my = my
	adduniaxialanisotropy2_args.arg_mz = mz
	adduniaxialanisotropy2_args.arg_Ms_ = Ms_
	adduniaxialanisotropy2_args.arg_Ms_mul = Ms_mul
	adduniaxialanisotropy2_args.arg_K1_ = K1_
	adduniaxialanisotropy2_args.arg_K1_mul = K1_mul
	adduniaxialanisotropy2_args.arg_K2_ = K2_
	adduniaxialanisotropy2_args.arg_K2_mul = K2_mul
	adduniaxialanisotropy2_args.arg_ux_ = ux_
	adduniaxialanisotropy2_args.arg_ux_mul = ux_mul
	adduniaxialanisotropy2_args.arg_uy_ = uy_
	adduniaxialanisotropy2_args.arg_uy_mul = uy_mul
	adduniaxialanisotropy2_args.arg_uz_ = uz_
	adduniaxialanisotropy2_args.arg_uz_mul = uz_mul
	adduniaxialanisotropy2_args.arg_N = N

	args := adduniaxialanisotropy2_args.argptr[:]
	cu.LaunchKernel(adduniaxialanisotropy2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("adduniaxialanisotropy2")
	}
}
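// For reference: each wrapper above uses a package-level argument record of
// the same shape. The exact definitions are not part of this section; the
// following is a reconstruction from usage (field order must match the CUDA
// kernel's parameter list), shown here for the mul kernel:
//
//	var mul_args struct {
//		arg_dst unsafe.Pointer
//		arg_a   unsafe.Pointer
//		arg_b   unsafe.Pointer
//		arg_N   int
//		argptr  [4]unsafe.Pointer // pre-filled pointers to the fields above
//		sync.Mutex                // serializes reuse of this shared buffer
//	}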