func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) { // Since Go 1.6, a cgo argument cannot have a Go pointer to Go pointer, // so we copy the argument values go C memory first. argv := C.malloc(C.size_t(len(kernelParams) * pointerSize)) argp := C.malloc(C.size_t(len(kernelParams) * pointerSize)) defer C.free(argv) defer C.free(argp) for i := range kernelParams { *((*unsafe.Pointer)(offset(argp, i))) = offset(argv, i) // argp[i] = &argv[i] *((*uint64)(offset(argv, i))) = *((*uint64)(kernelParams[i])) // argv[i] = *kernelParams[i] } err := Result(C.cuLaunchKernel( C.CUfunction(unsafe.Pointer(uintptr(f))), C.uint(gridDimX), C.uint(gridDimY), C.uint(gridDimZ), C.uint(blockDimX), C.uint(blockDimY), C.uint(blockDimZ), C.uint(sharedMemBytes), C.CUstream(unsafe.Pointer(uintptr(stream))), (*unsafe.Pointer)(argp), (*unsafe.Pointer)(unsafe.Pointer(uintptr(0))))) if err != SUCCESS { panic(err) } }
// Destroys the asynchronous stream func (stream *Stream) Destroy() { str := *stream *stream = Stream(uintptr(0)) err := Result(C.cuStreamDestroy(C.CUstream(unsafe.Pointer(uintptr(str))))) if err != SUCCESS { panic(err) } }
// Asynchronously copies a number of bytes from host to device. // The host memory must be page-locked (see MemRegister) func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) { err := Result(C.cuMemcpyHtoDAsync(C.CUdeviceptr(dst), src, C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } }
// Asynchronously copies from device memory in one context (device) to another. func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) { err := Result(C.cuMemcpyPeerAsync(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } }
// Asynchronously sets the first N 32-bit values of dst array to value. func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) { err := Result(C.cuMemsetD8Async(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } }
// Launches a CUDA kernel on the device. // Example: // mod := ModuleLoad("file.ptx") // f := mod.GetFunction("test") // // var arg1 uintptr // arg1 = uintptr(someArray) // // var arg2 float32 // arg2 = 42 // // var arg3 int // arg3 = 1024 // // args := []uintptr{(uintptr)(unsafe.Pointer(&array)), (uintptr)(unsafe.Pointer(&value)), (uintptr)(unsafe.Pointer(&n))} // // block := 128 // grid := DivUp(N, block) // shmem := 0 // stream := STREAM0 // LaunchKernel(f, grid, 1, 1, block, 1, 1, shmem, stream, args) // // A more easy-to-use wrapper is implemented in closure.go // func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []uintptr) { //debug: print all arguments argvals := make([]int, len(kernelParams)) for i := range kernelParams { argvals[i] = *(*int)(unsafe.Pointer(kernelParams[i])) } //fmt.Println("LaunchKernel: ", "func: ", f, "gridDim: ", gridDimX, gridDimY, gridDimZ, "blockDim: ", blockDimX, blockDimY, blockDimZ, "shmem: ", sharedMemBytes, "stream: ", stream, "argptrs: ", kernelParams, "argvals:", argvals) err := Result(C.cuLaunchKernel(C.CUfunction(unsafe.Pointer(uintptr(f))), C.uint(gridDimX), C.uint(gridDimY), C.uint(gridDimZ), C.uint(blockDimX), C.uint(blockDimY), C.uint(blockDimZ), C.uint(sharedMemBytes), C.CUstream(unsafe.Pointer(uintptr(stream))), (*unsafe.Pointer)(unsafe.Pointer(&kernelParams[0])), (*unsafe.Pointer)(unsafe.Pointer(uintptr(0))))) if err != SUCCESS { panic(err) } }
// Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Query() Result { return Result(C.cuStreamQuery(C.CUstream(unsafe.Pointer(uintptr(stream))))) }
// Blocks until the stream has completed. func (stream Stream) Synchronize() { err := Result(C.cuStreamSynchronize(C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } }
// Asynchronously copies a number of bytes device host to host. // The host memory must be page-locked (see MemRegister) func MemcpyDtoHAsync(dst HostPtr, src DevicePtr, bytes int64, stream Stream) { err := Result(C.cuMemcpyDtoHAsync(unsafe.Pointer(uintptr(dst)), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } }