func MemCpy(dst, src unsafe.Pointer, bytes int64) { Sync() timer.Start("memcpy") cu.MemcpyAsync(cu.DevicePtr(uintptr(dst)), cu.DevicePtr(uintptr(src)), bytes, stream0) Sync() timer.Stop("memcpy") }
// Frees the GPU memory and disables the slice. func (b *Bytes) Free() { if b.Ptr != nil { cu.MemFree(cu.DevicePtr(uintptr(b.Ptr))) } b.Ptr = nil b.Len = 0 }
func MemCpyHtoD(dst, src unsafe.Pointer, bytes int64) { Sync() // sync previous kernels timer.Start("memcpyHtoD") cu.MemcpyHtoD(cu.DevicePtr(uintptr(dst)), src, bytes) Sync() // sync copy timer.Stop("memcpyHtoD") }
func MemCpyDtoH(dst, src unsafe.Pointer, bytes int64) { Sync() // sync previous kernels timer.Start("memcpyDtoH") cu.MemcpyDtoH(dst, cu.DevicePtr(uintptr(src)), bytes) Sync() // sync copy timer.Stop("memcpyDtoH") }
// return a 1-float CUDA reduction buffer from a pool // initialized to initVal func reduceBuf(initVal float32) unsafe.Pointer { if reduceBuffers == nil { initReduceBuf() } buf := <-reduceBuffers cu.MemsetD32Async(cu.DevicePtr(uintptr(buf)), math.Float32bits(initVal), 1, stream0) return buf }
// Execute the FFT plan, asynchronous. // src and dst are 3D arrays stored 1D arrays. func (p *fft3DC2RPlan) ExecAsync(src, dst *data.Slice) { if Synchronous { Sync() timer.Start("fft") } oksrclen := p.InputLenFloats() if src.Len() != oksrclen { panic(fmt.Errorf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) } okdstlen := p.OutputLenFloats() if dst.Len() != okdstlen { panic(fmt.Errorf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) } p.handle.ExecC2R(cu.DevicePtr(uintptr(src.DevPtr(0))), cu.DevicePtr(uintptr(dst.DevPtr(0)))) if Synchronous { Sync() timer.Stop("fft") } }
// Execute the FFT plan, asynchronous. // src and dst are 3D arrays stored 1D arrays. func (p *fft3DR2CPlan) ExecAsync(src, dst *data.Slice) { if Synchronous { Sync() timer.Start("fft") } util.Argument(src.NComp() == 1 && dst.NComp() == 1) oksrclen := p.InputLen() if src.Len() != oksrclen { log.Panicf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len()) } okdstlen := p.OutputLen() if dst.Len() != okdstlen { log.Panicf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()) } p.handle.ExecR2C(cu.DevicePtr(uintptr(src.DevPtr(0))), cu.DevicePtr(uintptr(dst.DevPtr(0)))) if Synchronous { Sync() timer.Stop("fft") } }
// Frees all buffers. Called after mesh resize. func FreeBuffers() { Sync() for _, size := range buf_pool { for i := range size { cu.DevicePtr(uintptr(size[i])).Free() size[i] = nil } } buf_pool = make(map[int][]unsafe.Pointer) buf_check = make(map[unsafe.Pointer]struct{}) }
func newSlice(nComp int, size [3]int, alloc func(int64) unsafe.Pointer, memType int8) *data.Slice { data.EnableGPU(memFree, cu.MemFreeHost, MemCpy, MemCpyDtoH, MemCpyHtoD) length := prod(size) bytes := int64(length) * cu.SIZEOF_FLOAT32 ptrs := make([]unsafe.Pointer, nComp) for c := range ptrs { ptrs[c] = unsafe.Pointer(alloc(bytes)) cu.MemsetD32(cu.DevicePtr(uintptr(ptrs[c])), 0, int64(length)) } return data.SliceFromPtrs(size, memType, ptrs) }
// Memset sets the Slice's components to the specified values. // To be carefully used on unified slice (need sync) func Memset(s *data.Slice, val ...float32) { if Synchronous { // debug Sync() timer.Start("memset") } util.Argument(len(val) == s.NComp()) for c, v := range val { cu.MemsetD32Async(cu.DevicePtr(uintptr(s.DevPtr(c))), math.Float32bits(v), int64(s.Len()), stream0) } if Synchronous { //debug Sync() timer.Stop("memset") } }
// zero 1-component slice func zero1_async(dst *data.Slice) { cu.MemsetD32Async(cu.DevicePtr(uintptr(dst.DevPtr(0))), 0, int64(dst.Len()), stream0) }
// Construct new byte slice with given length, // initialised to zeros. func NewBytes(Len int) *Bytes { ptr := cu.MemAlloc(int64(Len)) cu.MemsetD8(cu.DevicePtr(ptr), 0, int64(Len)) return &Bytes{unsafe.Pointer(uintptr(ptr)), Len} }
// memFree releases the memory at ptr with cu.MemFree, treating ptr as a
// device pointer.
func memFree(ptr unsafe.Pointer) {
	devPtr := cu.DevicePtr(uintptr(ptr))
	cu.MemFree(devPtr)
}