示例#1
0
文件: slice.go 项目: kyeongdong/3
// MemCpy copies bytes bytes from src to dst, treating both pointers as CUDA
// device pointers. The copy is queued with cu.MemcpyAsync on stream0 and is
// bracketed by Sync calls; the bracketed section is timed as "memcpy".
func MemCpy(dst, src unsafe.Pointer, bytes int64) {
	Sync()
	timer.Start("memcpy")

	to := cu.DevicePtr(uintptr(dst))
	from := cu.DevicePtr(uintptr(src))
	cu.MemcpyAsync(to, from, bytes, stream0)

	Sync()
	timer.Stop("memcpy")
}
示例#2
0
文件: slice.go 项目: kyeongdong/3
// MemCpyHtoD copies bytes bytes from src to dst via cu.MemcpyHtoD, with dst
// converted to a CUDA device pointer and src passed through unchanged.
// The copy is bracketed by Sync calls and timed as "memcpyHtoD".
func MemCpyHtoD(dst, src unsafe.Pointer, bytes int64) {
	// Let previously queued kernels finish before timing starts.
	Sync()
	timer.Start("memcpyHtoD")

	devDst := cu.DevicePtr(uintptr(dst))
	cu.MemcpyHtoD(devDst, src, bytes)

	// Wait for the copy itself before stopping the timer.
	Sync()
	timer.Stop("memcpyHtoD")
}
示例#3
0
// k_kernmulRSymm3D_async launches the kernmulRSymm3D CUDA kernel on stream0
// using the grid and block dimensions from cfg. The launch is asynchronous
// unless Synchronous is set, in which case it is surrounded by Sync calls and
// timed as "kernmulRSymm3D".
func k_kernmulRSymm3D_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftMz unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKzz unsafe.Pointer, fftKyz unsafe.Pointer, fftKxz unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("kernmulRSymm3D")
	}

	// The argument holder is locked until this function returns.
	kernmulRSymm3D_args.Lock()
	defer kernmulRSymm3D_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if kernmulRSymm3D_code == 0 {
		kernmulRSymm3D_code = fatbinLoad(kernmulRSymm3D_map, "kernmulRSymm3D")
	}

	// Stage the call arguments.
	kernmulRSymm3D_args.arg_fftMx, kernmulRSymm3D_args.arg_fftMy, kernmulRSymm3D_args.arg_fftMz = fftMx, fftMy, fftMz
	kernmulRSymm3D_args.arg_fftKxx, kernmulRSymm3D_args.arg_fftKyy, kernmulRSymm3D_args.arg_fftKzz = fftKxx, fftKyy, fftKzz
	kernmulRSymm3D_args.arg_fftKyz, kernmulRSymm3D_args.arg_fftKxz, kernmulRSymm3D_args.arg_fftKxy = fftKyz, fftKxz, fftKxy
	kernmulRSymm3D_args.arg_Nx, kernmulRSymm3D_args.arg_Ny, kernmulRSymm3D_args.arg_Nz = Nx, Ny, Nz

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(kernmulRSymm3D_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, kernmulRSymm3D_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("kernmulRSymm3D")
	}
}
示例#4
0
// k_reducemaxdiff_async launches the reducemaxdiff CUDA kernel on stream0
// using the grid and block dimensions from cfg. The launch is asynchronous
// unless Synchronous is set, in which case it is surrounded by Sync calls and
// timed as "reducemaxdiff".
func k_reducemaxdiff_async(src1 unsafe.Pointer, src2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("reducemaxdiff")
	}

	// The argument holder is locked until this function returns.
	reducemaxdiff_args.Lock()
	defer reducemaxdiff_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if reducemaxdiff_code == 0 {
		reducemaxdiff_code = fatbinLoad(reducemaxdiff_map, "reducemaxdiff")
	}

	// Stage the call arguments.
	reducemaxdiff_args.arg_src1, reducemaxdiff_args.arg_src2 = src1, src2
	reducemaxdiff_args.arg_dst = dst
	reducemaxdiff_args.arg_initVal, reducemaxdiff_args.arg_n = initVal, n

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(reducemaxdiff_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, reducemaxdiff_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("reducemaxdiff")
	}
}
示例#5
0
// k_minimize_async launches the minimize CUDA kernel on stream0 using the
// grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "minimize".
func k_minimize_async(mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, m0x unsafe.Pointer, m0y unsafe.Pointer, m0z unsafe.Pointer, tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, dt float32, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("minimize")
	}

	// The argument holder is locked until this function returns.
	minimize_args.Lock()
	defer minimize_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if minimize_code == 0 {
		minimize_code = fatbinLoad(minimize_map, "minimize")
	}

	// Stage the call arguments.
	minimize_args.arg_mx, minimize_args.arg_my, minimize_args.arg_mz = mx, my, mz
	minimize_args.arg_m0x, minimize_args.arg_m0y, minimize_args.arg_m0z = m0x, m0y, m0z
	minimize_args.arg_tx, minimize_args.arg_ty, minimize_args.arg_tz = tx, ty, tz
	minimize_args.arg_dt, minimize_args.arg_N = dt, N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(minimize_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, minimize_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("minimize")
	}
}
示例#6
0
// k_normalize_async launches the normalize CUDA kernel on stream0 using the
// grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "normalize".
func k_normalize_async(vx unsafe.Pointer, vy unsafe.Pointer, vz unsafe.Pointer, vol unsafe.Pointer, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("normalize")
	}

	// The argument holder is locked until this function returns.
	normalize_args.Lock()
	defer normalize_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if normalize_code == 0 {
		normalize_code = fatbinLoad(normalize_map, "normalize")
	}

	// Stage the call arguments.
	normalize_args.arg_vx, normalize_args.arg_vy, normalize_args.arg_vz = vx, vy, vz
	normalize_args.arg_vol, normalize_args.arg_N = vol, N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(normalize_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, normalize_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("normalize")
	}
}
示例#7
0
// k_reducemaxvecnorm2_async launches the reducemaxvecnorm2 CUDA kernel on
// stream0 using the grid and block dimensions from cfg. The launch is
// asynchronous unless Synchronous is set, in which case it is surrounded by
// Sync calls and timed as "reducemaxvecnorm2".
func k_reducemaxvecnorm2_async(x unsafe.Pointer, y unsafe.Pointer, z unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("reducemaxvecnorm2")
	}

	// The argument holder is locked until this function returns.
	reducemaxvecnorm2_args.Lock()
	defer reducemaxvecnorm2_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if reducemaxvecnorm2_code == 0 {
		reducemaxvecnorm2_code = fatbinLoad(reducemaxvecnorm2_map, "reducemaxvecnorm2")
	}

	// Stage the call arguments.
	reducemaxvecnorm2_args.arg_x, reducemaxvecnorm2_args.arg_y, reducemaxvecnorm2_args.arg_z = x, y, z
	reducemaxvecnorm2_args.arg_dst = dst
	reducemaxvecnorm2_args.arg_initVal, reducemaxvecnorm2_args.arg_n = initVal, n

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(reducemaxvecnorm2_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, reducemaxvecnorm2_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("reducemaxvecnorm2")
	}
}
示例#8
0
// k_madd3_async launches the madd3 CUDA kernel on stream0 using the grid and
// block dimensions from cfg. The launch is asynchronous unless Synchronous is
// set, in which case it is surrounded by Sync calls and timed as "madd3".
func k_madd3_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("madd3")
	}

	// The argument holder is locked until this function returns.
	madd3_args.Lock()
	defer madd3_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if madd3_code == 0 {
		madd3_code = fatbinLoad(madd3_map, "madd3")
	}

	// Stage the call arguments; each source is paired with its factor.
	madd3_args.arg_dst = dst
	madd3_args.arg_src1, madd3_args.arg_fac1 = src1, fac1
	madd3_args.arg_src2, madd3_args.arg_fac2 = src2, fac2
	madd3_args.arg_src3, madd3_args.arg_fac3 = src3, fac3
	madd3_args.arg_N = N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(madd3_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, madd3_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("madd3")
	}
}
示例#9
0
// k_regiondecode_async launches the regiondecode CUDA kernel on stream0 using
// the grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "regiondecode".
func k_regiondecode_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("regiondecode")
	}

	// The argument holder is locked until this function returns.
	regiondecode_args.Lock()
	defer regiondecode_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if regiondecode_code == 0 {
		regiondecode_code = fatbinLoad(regiondecode_map, "regiondecode")
	}

	// Stage the call arguments.
	regiondecode_args.arg_dst, regiondecode_args.arg_LUT = dst, LUT
	regiondecode_args.arg_regions, regiondecode_args.arg_N = regions, N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(regiondecode_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, regiondecode_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("regiondecode")
	}
}
示例#10
0
// k_exchangedecode_async launches the exchangedecode CUDA kernel on stream0
// using the grid and block dimensions from cfg. The launch is asynchronous
// unless Synchronous is set, in which case it is surrounded by Sync calls and
// timed as "exchangedecode".
func k_exchangedecode_async(dst unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("exchangedecode")
	}

	// The argument holder is locked until this function returns.
	exchangedecode_args.Lock()
	defer exchangedecode_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if exchangedecode_code == 0 {
		exchangedecode_code = fatbinLoad(exchangedecode_map, "exchangedecode")
	}

	// Stage the call arguments.
	exchangedecode_args.arg_dst, exchangedecode_args.arg_aLUT2d, exchangedecode_args.arg_regions = dst, aLUT2d, regions
	exchangedecode_args.arg_wx, exchangedecode_args.arg_wy, exchangedecode_args.arg_wz = wx, wy, wz
	exchangedecode_args.arg_Nx, exchangedecode_args.arg_Ny, exchangedecode_args.arg_Nz = Nx, Ny, Nz
	exchangedecode_args.arg_PBC = PBC

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(exchangedecode_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, exchangedecode_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("exchangedecode")
	}
}
示例#11
0
// k_mul_async launches the mul CUDA kernel on stream0 using the grid and
// block dimensions from cfg. The launch is asynchronous unless Synchronous is
// set, in which case it is surrounded by Sync calls and timed as "mul".
func k_mul_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("mul")
	}

	// The argument holder is locked until this function returns.
	mul_args.Lock()
	defer mul_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if mul_code == 0 {
		mul_code = fatbinLoad(mul_map, "mul")
	}

	// Stage the call arguments.
	mul_args.arg_dst = dst
	mul_args.arg_a, mul_args.arg_b = a, b
	mul_args.arg_N = N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(mul_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, mul_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("mul")
	}
}
示例#12
0
// k_llnoprecess_async launches the llnoprecess CUDA kernel on stream0 using
// the grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "llnoprecess".
func k_llnoprecess_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("llnoprecess")
	}

	// The argument holder is locked until this function returns.
	llnoprecess_args.Lock()
	defer llnoprecess_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if llnoprecess_code == 0 {
		llnoprecess_code = fatbinLoad(llnoprecess_map, "llnoprecess")
	}

	// Stage the call arguments.
	llnoprecess_args.arg_tx, llnoprecess_args.arg_ty, llnoprecess_args.arg_tz = tx, ty, tz
	llnoprecess_args.arg_mx, llnoprecess_args.arg_my, llnoprecess_args.arg_mz = mx, my, mz
	llnoprecess_args.arg_hx, llnoprecess_args.arg_hy, llnoprecess_args.arg_hz = hx, hy, hz
	llnoprecess_args.arg_N = N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(llnoprecess_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, llnoprecess_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("llnoprecess")
	}
}
示例#13
0
// k_adduniaxialanisotropy_async launches the adduniaxialanisotropy CUDA kernel
// on stream0 using the grid and block dimensions from cfg. The launch is
// asynchronous unless Synchronous is set, in which case it is surrounded by
// Sync calls and timed as "adduniaxialanisotropy".
func k_adduniaxialanisotropy_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, K1LUT unsafe.Pointer, K2LUT unsafe.Pointer, uxLUT unsafe.Pointer, uyLUT unsafe.Pointer, uzLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("adduniaxialanisotropy")
	}

	// The argument holder is locked until this function returns.
	adduniaxialanisotropy_args.Lock()
	defer adduniaxialanisotropy_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if adduniaxialanisotropy_code == 0 {
		adduniaxialanisotropy_code = fatbinLoad(adduniaxialanisotropy_map, "adduniaxialanisotropy")
	}

	// Stage the call arguments.
	adduniaxialanisotropy_args.arg_Bx, adduniaxialanisotropy_args.arg_By, adduniaxialanisotropy_args.arg_Bz = Bx, By, Bz
	adduniaxialanisotropy_args.arg_mx, adduniaxialanisotropy_args.arg_my, adduniaxialanisotropy_args.arg_mz = mx, my, mz
	adduniaxialanisotropy_args.arg_K1LUT, adduniaxialanisotropy_args.arg_K2LUT = K1LUT, K2LUT
	adduniaxialanisotropy_args.arg_uxLUT, adduniaxialanisotropy_args.arg_uyLUT, adduniaxialanisotropy_args.arg_uzLUT = uxLUT, uyLUT, uzLUT
	adduniaxialanisotropy_args.arg_regions, adduniaxialanisotropy_args.arg_N = regions, N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(adduniaxialanisotropy_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, adduniaxialanisotropy_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("adduniaxialanisotropy")
	}
}
示例#14
0
// k_kernmulC_async launches the kernmulC CUDA kernel on stream0 using the
// grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "kernmulC".
func k_kernmulC_async(fftM unsafe.Pointer, fftK unsafe.Pointer, Nx int, Ny int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("kernmulC")
	}

	// The argument holder is locked until this function returns.
	kernmulC_args.Lock()
	defer kernmulC_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if kernmulC_code == 0 {
		kernmulC_code = fatbinLoad(kernmulC_map, "kernmulC")
	}

	// Stage the call arguments.
	kernmulC_args.arg_fftM, kernmulC_args.arg_fftK = fftM, fftK
	kernmulC_args.arg_Nx, kernmulC_args.arg_Ny = Nx, Ny

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(kernmulC_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, kernmulC_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("kernmulC")
	}
}
示例#15
0
// k_shiftbytes_async launches the shiftbytes CUDA kernel on stream0 using the
// grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "shiftbytes".
func k_shiftbytes_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clamp byte, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("shiftbytes")
	}

	// The argument holder is locked until this function returns.
	shiftbytes_args.Lock()
	defer shiftbytes_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if shiftbytes_code == 0 {
		shiftbytes_code = fatbinLoad(shiftbytes_map, "shiftbytes")
	}

	// Stage the call arguments.
	shiftbytes_args.arg_dst, shiftbytes_args.arg_src = dst, src
	shiftbytes_args.arg_Nx, shiftbytes_args.arg_Ny, shiftbytes_args.arg_Nz = Nx, Ny, Nz
	shiftbytes_args.arg_shx, shiftbytes_args.arg_clamp = shx, clamp

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(shiftbytes_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, shiftbytes_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("shiftbytes")
	}
}
示例#16
0
// k_dotproduct_async launches the dotproduct CUDA kernel on stream0 using the
// grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "dotproduct".
func k_dotproduct_async(dst unsafe.Pointer, prefactor float32, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("dotproduct")
	}

	// The argument holder is locked until this function returns.
	dotproduct_args.Lock()
	defer dotproduct_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if dotproduct_code == 0 {
		dotproduct_code = fatbinLoad(dotproduct_map, "dotproduct")
	}

	// Stage the call arguments.
	dotproduct_args.arg_dst, dotproduct_args.arg_prefactor = dst, prefactor
	dotproduct_args.arg_ax, dotproduct_args.arg_ay, dotproduct_args.arg_az = ax, ay, az
	dotproduct_args.arg_bx, dotproduct_args.arg_by, dotproduct_args.arg_bz = bx, by, bz
	dotproduct_args.arg_N = N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(dotproduct_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, dotproduct_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("dotproduct")
	}
}
示例#17
0
// k_crop_async launches the crop CUDA kernel on stream0 using the grid and
// block dimensions from cfg. The launch is asynchronous unless Synchronous is
// set, in which case it is surrounded by Sync calls and timed as "crop".
func k_crop_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, Offx int, Offy int, Offz int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("crop")
	}

	// The argument holder is locked until this function returns.
	crop_args.Lock()
	defer crop_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if crop_code == 0 {
		crop_code = fatbinLoad(crop_map, "crop")
	}

	// Stage the call arguments.
	crop_args.arg_dst = dst
	crop_args.arg_Dx, crop_args.arg_Dy, crop_args.arg_Dz = Dx, Dy, Dz
	crop_args.arg_src = src
	crop_args.arg_Sx, crop_args.arg_Sy, crop_args.arg_Sz = Sx, Sy, Sz
	crop_args.arg_Offx, crop_args.arg_Offy, crop_args.arg_Offz = Offx, Offy, Offz

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(crop_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, crop_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("crop")
	}
}
示例#18
0
// k_reducemaxvecdiff2_async launches the reducemaxvecdiff2 CUDA kernel on
// stream0 using the grid and block dimensions from cfg. The launch is
// asynchronous unless Synchronous is set, in which case it is surrounded by
// Sync calls and timed as "reducemaxvecdiff2".
func k_reducemaxvecdiff2_async(x1 unsafe.Pointer, y1 unsafe.Pointer, z1 unsafe.Pointer, x2 unsafe.Pointer, y2 unsafe.Pointer, z2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("reducemaxvecdiff2")
	}

	// The argument holder is locked until this function returns.
	reducemaxvecdiff2_args.Lock()
	defer reducemaxvecdiff2_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if reducemaxvecdiff2_code == 0 {
		reducemaxvecdiff2_code = fatbinLoad(reducemaxvecdiff2_map, "reducemaxvecdiff2")
	}

	// Stage the call arguments.
	reducemaxvecdiff2_args.arg_x1, reducemaxvecdiff2_args.arg_y1, reducemaxvecdiff2_args.arg_z1 = x1, y1, z1
	reducemaxvecdiff2_args.arg_x2, reducemaxvecdiff2_args.arg_y2, reducemaxvecdiff2_args.arg_z2 = x2, y2, z2
	reducemaxvecdiff2_args.arg_dst = dst
	reducemaxvecdiff2_args.arg_initVal, reducemaxvecdiff2_args.arg_n = initVal, n

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(reducemaxvecdiff2_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, reducemaxvecdiff2_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("reducemaxvecdiff2")
	}
}
示例#19
0
// k_settemperature_async launches the settemperature CUDA kernel on stream0
// using the grid and block dimensions from cfg. The launch is asynchronous
// unless Synchronous is set, in which case it is surrounded by Sync calls and
// timed as "settemperature".
func k_settemperature_async(B unsafe.Pointer, noise unsafe.Pointer, kB2_VgammaDt float32, tempRedLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("settemperature")
	}

	// The argument holder is locked until this function returns.
	settemperature_args.Lock()
	defer settemperature_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if settemperature_code == 0 {
		settemperature_code = fatbinLoad(settemperature_map, "settemperature")
	}

	// Stage the call arguments.
	settemperature_args.arg_B, settemperature_args.arg_noise = B, noise
	settemperature_args.arg_kB2_VgammaDt = kB2_VgammaDt
	settemperature_args.arg_tempRedLUT, settemperature_args.arg_regions = tempRedLUT, regions
	settemperature_args.arg_N = N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(settemperature_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, settemperature_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("settemperature")
	}
}
示例#20
0
// k_zeromask_async launches the zeromask CUDA kernel on stream0 using the
// grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "zeromask".
func k_zeromask_async(dst unsafe.Pointer, maskLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("zeromask")
	}

	// The argument holder is locked until this function returns.
	zeromask_args.Lock()
	defer zeromask_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if zeromask_code == 0 {
		zeromask_code = fatbinLoad(zeromask_map, "zeromask")
	}

	// Stage the call arguments.
	zeromask_args.arg_dst, zeromask_args.arg_maskLUT = dst, maskLUT
	zeromask_args.arg_regions, zeromask_args.arg_N = regions, N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(zeromask_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, zeromask_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("zeromask")
	}
}
示例#21
0
// k_copypadmul_async launches the copypadmul CUDA kernel on stream0 using the
// grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "copypadmul".
func k_copypadmul_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, vol unsafe.Pointer, Sx int, Sy int, Sz int, BsatLUT unsafe.Pointer, regions unsafe.Pointer, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("copypadmul")
	}

	// The argument holder is locked until this function returns.
	copypadmul_args.Lock()
	defer copypadmul_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if copypadmul_code == 0 {
		copypadmul_code = fatbinLoad(copypadmul_map, "copypadmul")
	}

	// Stage the call arguments.
	copypadmul_args.arg_dst = dst
	copypadmul_args.arg_Dx, copypadmul_args.arg_Dy, copypadmul_args.arg_Dz = Dx, Dy, Dz
	copypadmul_args.arg_src, copypadmul_args.arg_vol = src, vol
	copypadmul_args.arg_Sx, copypadmul_args.arg_Sy, copypadmul_args.arg_Sz = Sx, Sy, Sz
	copypadmul_args.arg_BsatLUT, copypadmul_args.arg_regions = BsatLUT, regions

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(copypadmul_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, copypadmul_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("copypadmul")
	}
}
示例#22
0
文件: slice.go 项目: kyeongdong/3
// MemCpyDtoH copies bytes bytes from src to dst via cu.MemcpyDtoH, with src
// converted to a CUDA device pointer and dst passed through unchanged.
// The copy is bracketed by Sync calls and timed as "memcpyDtoH".
func MemCpyDtoH(dst, src unsafe.Pointer, bytes int64) {
	// Let previously queued kernels finish before timing starts.
	Sync()
	timer.Start("memcpyDtoH")

	devSrc := cu.DevicePtr(uintptr(src))
	cu.MemcpyDtoH(dst, devSrc, bytes)

	// Wait for the copy itself before stopping the timer.
	Sync()
	timer.Stop("memcpyDtoH")
}
示例#23
0
// k_resize_async launches the resize CUDA kernel on stream0 using the grid
// and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "resize".
func k_resize_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, layer int, scalex int, scaley int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("resize")
	}

	// The argument holder is locked until this function returns.
	resize_args.Lock()
	defer resize_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if resize_code == 0 {
		resize_code = fatbinLoad(resize_map, "resize")
	}

	// Stage the call arguments.
	resize_args.arg_dst = dst
	resize_args.arg_Dx, resize_args.arg_Dy, resize_args.arg_Dz = Dx, Dy, Dz
	resize_args.arg_src = src
	resize_args.arg_Sx, resize_args.arg_Sy, resize_args.arg_Sz = Sx, Sy, Sz
	resize_args.arg_layer = layer
	resize_args.arg_scalex, resize_args.arg_scaley = scalex, scaley

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(resize_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, resize_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("resize")
	}
}
示例#24
0
// k_regionaddv_async launches the regionaddv CUDA kernel on stream0 using the
// grid and block dimensions from cfg. The launch is asynchronous unless
// Synchronous is set, in which case it is surrounded by Sync calls and timed
// as "regionaddv".
func k_regionaddv_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, LUTx unsafe.Pointer, LUTy unsafe.Pointer, LUTz unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) {
	if Synchronous {
		// Debug mode: sync first, then start timing this kernel.
		Sync()
		timer.Start("regionaddv")
	}

	// The argument holder is locked until this function returns.
	regionaddv_args.Lock()
	defer regionaddv_args.Unlock()

	// Load the kernel via fatbinLoad while its handle is still zero.
	if regionaddv_code == 0 {
		regionaddv_code = fatbinLoad(regionaddv_map, "regionaddv")
	}

	// Stage the call arguments.
	regionaddv_args.arg_dstx, regionaddv_args.arg_dsty, regionaddv_args.arg_dstz = dstx, dsty, dstz
	regionaddv_args.arg_LUTx, regionaddv_args.arg_LUTy, regionaddv_args.arg_LUTz = LUTx, LUTy, LUTz
	regionaddv_args.arg_regions, regionaddv_args.arg_N = regions, N

	grid, block := cfg.Grid, cfg.Block
	cu.LaunchKernel(regionaddv_code,
		grid.X, grid.Y, grid.Z,
		block.X, block.Y, block.Z,
		0, stream0, regionaddv_args.argptr[:])

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("regionaddv")
	}
}
示例#25
0
文件: asyncio.go 项目: callistoaz/3
// queOutput adds one to queLen and then sends f on saveQue.
// When cuda.Synchronous is set, the time spent doing so (including any time
// the channel send blocks) is recorded under the "io" timer key.
func queOutput(f func()) {
	if cuda.Synchronous {
		timer.Start("io")
	}
	// Count the item before handing it over on the channel.
	// NOTE(review): presumably the receiver of saveQue decrements queLen once
	// f has run — confirm against the consumer.
	queLen.Add(1)
	saveQue <- f
	if cuda.Synchronous {
		timer.Stop("io")
	}
}
示例#26
0
文件: slice.go 项目: kyeongdong/3
// Memset fills each component of s with the corresponding value from val,
// issuing one cu.MemsetD32Async per component on stream0. The number of
// values must equal s.NComp(); this is checked with util.Argument.
// Use with care on unified slices: a sync is needed.
// When Synchronous is set (debug mode) the call is surrounded by Sync calls
// and timed as "memset".
func Memset(s *data.Slice, val ...float32) {
	if Synchronous {
		// Debug mode: sync first, then start timing.
		Sync()
		timer.Start("memset")
	}

	util.Argument(len(val) == s.NComp())
	for comp := range val {
		devPtr := cu.DevicePtr(uintptr(s.DevPtr(comp)))
		// The 32-bit memset takes the raw bit pattern of the float.
		bits := math.Float32bits(val[comp])
		cu.MemsetD32Async(devPtr, bits, int64(s.Len()), stream0)
	}

	if Synchronous {
		// Debug mode: sync again, then stop the timer.
		Sync()
		timer.Stop("memset")
	}
}
示例#27
0
文件: table.go 项目: kyeongdong/3
// Flush flushes the table's output, if it has one, and returns the resulting
// error after first passing it to util.FatalErr. A table whose output is nil
// is left alone and nil is returned. When cuda.Synchronous is set, the flush
// is timed under the "io" timer key.
func (t *DataTable) Flush() error {
	out := t.output
	if out == nil {
		return nil
	}

	if cuda.Synchronous {
		timer.Start("io")
	}
	flushErr := out.Flush()
	if cuda.Synchronous {
		timer.Stop("io")
	}

	util.FatalErr(flushErr)
	return flushErr
}
示例#28
0
文件: fft3dc2r.go 项目: callistoaz/3
// ExecAsync runs the complex-to-real FFT plan on src, writing to dst,
// asynchronously. src and dst are 3D arrays stored as 1D arrays.
// It panics if src.Len() differs from p.InputLenFloats() or dst.Len() differs
// from p.OutputLenFloats(). When Synchronous is set, the call is surrounded
// by Sync calls and timed as "fft".
func (p *fft3DC2RPlan) ExecAsync(src, dst *data.Slice) {
	if Synchronous {
		Sync()
		timer.Start("fft")
	}

	// Both buffers must match the sizes the plan reports.
	if want, got := p.InputLenFloats(), src.Len(); got != want {
		panic(fmt.Errorf("fft size mismatch: expecting src len %v, got %v", want, got))
	}
	if want, got := p.OutputLenFloats(), dst.Len(); got != want {
		panic(fmt.Errorf("fft size mismatch: expecting dst len %v, got %v", want, got))
	}

	// Only component 0 of each slice is handed to the transform.
	in := cu.DevicePtr(uintptr(src.DevPtr(0)))
	out := cu.DevicePtr(uintptr(dst.DevPtr(0)))
	p.handle.ExecC2R(in, out)

	if Synchronous {
		Sync()
		timer.Stop("fft")
	}
}
示例#29
0
文件: fft3dr2c.go 项目: callistoaz/3
// ExecAsync runs the real-to-complex FFT plan on src, writing to dst,
// asynchronously. src and dst are 3D arrays stored as 1D arrays and must each
// have exactly one component (checked with util.Argument).
// It panics via log.Panicf if src.Len() differs from p.InputLen() or
// dst.Len() differs from p.OutputLen(). When Synchronous is set, the call is
// surrounded by Sync calls and timed as "fft".
func (p *fft3DR2CPlan) ExecAsync(src, dst *data.Slice) {
	if Synchronous {
		Sync()
		timer.Start("fft")
	}

	util.Argument(src.NComp() == 1 && dst.NComp() == 1)

	// Both buffers must match the sizes the plan reports.
	if want, got := p.InputLen(), src.Len(); got != want {
		log.Panicf("fft size mismatch: expecting src len %v, got %v", want, got)
	}
	if want, got := p.OutputLen(), dst.Len(); got != want {
		log.Panicf("fft size mismatch: expecting dst len %v, got %v", want, got)
	}

	in := cu.DevicePtr(uintptr(src.DevPtr(0)))
	out := cu.DevicePtr(uintptr(dst.DevPtr(0)))
	p.handle.ExecR2C(in, out)

	if Synchronous {
		Sync()
		timer.Stop("fft")
	}
}
示例#30
0
// Wrapper for adduniaxialanisotropy2 CUDA kernel, asynchronous.
func k_adduniaxialanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, K1_ unsafe.Pointer, K1_mul float32, K2_ unsafe.Pointer, K2_mul float32, ux_ unsafe.Pointer, ux_mul float32, uy_ unsafe.Pointer, uy_mul float32, uz_ unsafe.Pointer, uz_mul float32, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("adduniaxialanisotropy2")
	}

	adduniaxialanisotropy2_args.Lock()
	defer adduniaxialanisotropy2_args.Unlock()

	if adduniaxialanisotropy2_code == 0 {
		adduniaxialanisotropy2_code = fatbinLoad(adduniaxialanisotropy2_map, "adduniaxialanisotropy2")
	}

	adduniaxialanisotropy2_args.arg_Bx = Bx
	adduniaxialanisotropy2_args.arg_By = By
	adduniaxialanisotropy2_args.arg_Bz = Bz
	adduniaxialanisotropy2_args.arg_mx = mx
	adduniaxialanisotropy2_args.arg_my = my
	adduniaxialanisotropy2_args.arg_mz = mz
	adduniaxialanisotropy2_args.arg_Ms_ = Ms_
	adduniaxialanisotropy2_args.arg_Ms_mul = Ms_mul
	adduniaxialanisotropy2_args.arg_K1_ = K1_
	adduniaxialanisotropy2_args.arg_K1_mul = K1_mul
	adduniaxialanisotropy2_args.arg_K2_ = K2_
	adduniaxialanisotropy2_args.arg_K2_mul = K2_mul
	adduniaxialanisotropy2_args.arg_ux_ = ux_
	adduniaxialanisotropy2_args.arg_ux_mul = ux_mul
	adduniaxialanisotropy2_args.arg_uy_ = uy_
	adduniaxialanisotropy2_args.arg_uy_mul = uy_mul
	adduniaxialanisotropy2_args.arg_uz_ = uz_
	adduniaxialanisotropy2_args.arg_uz_mul = uz_mul
	adduniaxialanisotropy2_args.arg_N = N

	args := adduniaxialanisotropy2_args.argptr[:]
	cu.LaunchKernel(adduniaxialanisotropy2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("adduniaxialanisotropy2")
	}
}