// Execute the FFT plan, asynchronous. // src and dst are 3D arrays stored 1D arrays. func (p *fft3DC2RPlan) ExecAsync(src, dst *data.Slice) { oksrclen := p.InputLenFloats() if src.Len() != oksrclen { panic(fmt.Errorf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) } okdstlen := p.OutputLenFloats() if dst.Len() != okdstlen { panic(fmt.Errorf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) } p.handle.ExecC2R(cu.DevicePtr(src.DevPtr(0)), cu.DevicePtr(dst.DevPtr(0))) }
// Execute the FFT plan, asynchronous. // src and dst are 3D arrays stored 1D arrays. func (p *fft3DR2CPlan) ExecAsync(src, dst *data.Slice) { util.Argument(src.NComp() == 1 && dst.NComp() == 1) oksrclen := p.InputLen() if src.Len() != oksrclen { log.Panicf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len()) } okdstlen := p.OutputLen() if dst.Len() != okdstlen { log.Panicf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()) } p.handle.ExecR2C(cu.DevicePtr(src.DevPtr(0)), cu.DevicePtr(dst.DevPtr(0))) }
// copy back single float result from GPU and recycle buffer func copyback(buf unsafe.Pointer) float32 { var result_ [1]float32 result := result_[:] cu.MemcpyDtoH(unsafe.Pointer(&result[0]), cu.DevicePtr(buf), 1*cu.SIZEOF_FLOAT32) reduceBuffers <- buf return result_[0] }
// Memset sets the Slice's components to the specified values. func Memset(s *data.Slice, val ...float32) { util.Argument(len(val) == s.NComp()) str := stream() for c, v := range val { cu.MemsetD32Async(cu.DevicePtr(s.DevPtr(c)), math.Float32bits(v), int64(s.Len()), str) } syncAndRecycle(str) }
// internal base func for all slice() functions func (s *slice) slice(start, stop int, elemsize uintptr) slice { if start >= s.cap_ || start < 0 || stop > s.cap_ || stop < 0 { panic("cuda4/safe: slice index out of bounds") } if start > stop { panic("cuda4/safe: inverted slice range") } return slice{cu.DevicePtr(uintptr(s.ptr_) + uintptr(start)*elemsize), stop - start, s.cap_ - start} }
// return a 1-float CUDA reduction buffer from a pool // initialized to initVal func reduceBuf(initVal float32) unsafe.Pointer { if reduceBuffers == nil { initReduceBuf() } buf := <-reduceBuffers str := stream() cu.MemsetD32Async(cu.DevicePtr(buf), math.Float32bits(initVal), 1, str) syncAndRecycle(str) return buf }
func newSlice(nComp int, m *data.Mesh, alloc func(int64) unsafe.Pointer, memType int8) *data.Slice { data.EnableGPU(memFree, cu.MemFreeHost, memCpy, memCpyDtoH, memCpyHtoD) length := m.NCell() bytes := int64(length) * cu.SIZEOF_FLOAT32 ptrs := make([]unsafe.Pointer, nComp) for c := range ptrs { ptrs[c] = unsafe.Pointer(alloc(bytes)) cu.MemsetD32(cu.DevicePtr(ptrs[c]), 0, int64(length)) } return data.SliceFromPtrs(m, memType, ptrs) }
// Manually set the pointer, length and capacity. // Side-steps the security mechanisms, use with caution. func (s *slice) UnsafeSet(pointer unsafe.Pointer, length, capacity int) { s.ptr_ = cu.DevicePtr(uintptr(pointer)) s.len_ = length s.cap_ = capacity }
func memCpy(dst, src unsafe.Pointer, bytes int64) { str := stream() cu.MemcpyAsync(cu.DevicePtr(dst), cu.DevicePtr(src), bytes, str) syncAndRecycle(str) }
// memCpyHtoD copies bytes bytes from host memory at src to the device
// address dst using cu.MemcpyHtoD.
func memCpyHtoD(dst, src unsafe.Pointer, bytes int64) {
	devDst := cu.DevicePtr(dst)
	cu.MemcpyHtoD(devDst, src, bytes)
}
// memCpyDtoH copies bytes bytes from the device address src to host
// memory at dst using cu.MemcpyDtoH.
func memCpyDtoH(dst, src unsafe.Pointer, bytes int64) {
	devSrc := cu.DevicePtr(src)
	cu.MemcpyDtoH(dst, devSrc, bytes)
}
// memFree releases the device memory at ptr using cu.MemFree.
func memFree(ptr unsafe.Pointer) {
	devPtr := cu.DevicePtr(ptr)
	cu.MemFree(devPtr)
}
// zero 1-component slice func zero1(dst *data.Slice, str cu.Stream) { cu.MemsetD32Async(cu.DevicePtr(dst.DevPtr(0)), 0, int64(dst.Len()), str) }