// Returns a GPU slice for temporary use. To be returned to the pool with Recycle.
func Buffer(nComp int, size [3]int) *data.Slice {
	if Synchronous {
		Sync()
	}

	ptrs := make([]unsafe.Pointer, nComp)

	// re-use as many buffers as possible from our stack
	N := prod(size)
	pool := buf_pool[N]
	nFromPool := iMin(nComp, len(pool))
	for i := 0; i < nFromPool; i++ {
		ptrs[i] = pool[len(pool)-i-1]
	}
	buf_pool[N] = pool[:len(pool)-nFromPool]

	// allocate as much new memory as needed
	for i := nFromPool; i < nComp; i++ {
		if len(buf_check) >= buf_max {
			log.Panic("too many buffers in use, possible memory leak")
		}
		ptrs[i] = MemAlloc(int64(cu.SIZEOF_FLOAT32 * N))
		buf_check[ptrs[i]] = struct{}{} // mark this pointer as mine
	}

	return data.SliceFromPtrs(size, data.GPUMemory, ptrs)
}
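// Usage sketch (hypothetical caller): every Buffer call must be paired with
// Recycle, per the doc comment above; otherwise buf_check keeps growing until
// Buffer panics with "too many buffers in use".
func exampleBufferUsage(m *data.Slice) {
	tmp := Buffer(3, m.Size()) // borrow a temporary 3-component field
	defer Recycle(tmp)         // hand the pointers back to buf_pool
	// ... compute into tmp while m is the source ...
}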
// sliceFromList wraps pre-allocated host arrays in a CPU slice without
// copying. Each array must hold exactly prod(size) elements.
func sliceFromList(arr [][]float32, size [3]int) *data.Slice {
	ptrs := make([]unsafe.Pointer, len(arr))
	for i := range ptrs {
		util.Argument(len(arr[i]) == prod(size))
		ptrs[i] = unsafe.Pointer(&arr[i][0])
	}
	return data.SliceFromPtrs(size, data.CPUMemory, ptrs)
}
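// Usage sketch (hypothetical, in-package): view two existing host arrays as
// one 2-component CPU slice. Since no copy is made, the Go slices must stay
// reachable for as long as the returned *data.Slice is in use.
func exampleSliceFromList() *data.Slice {
	size := [3]int{4, 8, 16}
	bx := make([]float32, prod(size))
	by := make([]float32, prod(size))
	return sliceFromList([][]float32{bx, by}, size)
}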
func newSlice(nComp int, size [3]int, alloc func(int64) unsafe.Pointer, memType int8) *data.Slice {
	data.EnableGPU(memFree, cu.MemFreeHost, MemCpy, MemCpyDtoH, MemCpyHtoD)
	length := prod(size)
	bytes := int64(length) * cu.SIZEOF_FLOAT32
	ptrs := make([]unsafe.Pointer, nComp)
	for c := range ptrs {
		ptrs[c] = unsafe.Pointer(alloc(bytes))
		cu.MemsetD32(cu.DevicePtr(uintptr(ptrs[c])), 0, int64(length))
	}
	return data.SliceFromPtrs(size, memType, ptrs)
}
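// NewSlice, used by the convolution code below, plausibly forwards to
// newSlice with the device allocator; a sketch, not the verified
// implementation:
//
//	func NewSlice(nComp int, size [3]int) *data.Slice {
//		return newSlice(nComp, size, MemAlloc, data.GPUMemory)
//	}
//
// Passing the allocator as a parameter lets the same code back slices with
// device memory or page-locked host memory.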
func (c *MFMConvolution) init() {
	// init FFT plans
	padded := c.kernSize
	c.fwPlan = newFFT3DR2C(padded[X], padded[Y], padded[Z])
	c.bwPlan = newFFT3DC2R(padded[X], padded[Y], padded[Z])

	// init device buffers
	nc := fftR2COutputSizeFloats(c.kernSize)
	c.fftCBuf = NewSlice(1, nc)
	// real buffer shares storage with the complex buffer (in-place FFT)
	c.fftRBuf = data.SliceFromPtrs(c.kernSize, data.GPUMemory,
		[]unsafe.Pointer{c.fftCBuf.DevPtr(0)})

	c.gpuFFTKern[X] = NewSlice(1, nc)
	c.gpuFFTKern[Y] = NewSlice(1, nc)
	c.gpuFFTKern[Z] = NewSlice(1, nc)

	c.initFFTKern3D()
}
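// fftRBuf can alias fftCBuf because an in-place R2C FFT requires the real
// input to be padded to the complex output size, so the complex buffer is
// always at least as large. A sketch of what fftR2COutputSizeFloats
// presumably computes (assumption: CUFFT-style layout, Nx/2+1 complex values
// along X stored as interleaved float32 pairs):
func fftR2COutputSizeFloatsSketch(logicSize [3]int) [3]int {
	return [3]int{2 * (logicSize[X]/2 + 1), logicSize[Y], logicSize[Z]}
}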
func (c *DemagConvolution) init(realKern [3][3]*data.Slice) {
	// init device buffers
	// 2D re-uses fftBuf[X] as fftBuf[Z], 3D needs all 3 fftBufs.
	nc := fftR2COutputSizeFloats(c.realKernSize)
	c.fftCBuf[X] = NewSlice(1, nc)
	c.fftCBuf[Y] = NewSlice(1, nc)
	if c.is2D() {
		c.fftCBuf[Z] = c.fftCBuf[X]
	} else {
		c.fftCBuf[Z] = NewSlice(1, nc)
	}
	// Real buffer shares storage with Complex buffer
	for i := 0; i < 3; i++ {
		c.fftRBuf[i] = data.SliceFromPtrs(c.realKernSize, data.GPUMemory,
			[]unsafe.Pointer{c.fftCBuf[i].DevPtr(0)})
	}

	// init FFT plans
	c.fwPlan = newFFT3DR2C(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z])
	c.bwPlan = newFFT3DC2R(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z])

	// init FFT kernel

	// logical size of FFT(kernel): store real parts only
	c.fftKernLogicSize = fftR2COutputSizeFloats(c.realKernSize)
	util.Assert(c.fftKernLogicSize[X]%2 == 0)
	c.fftKernLogicSize[X] /= 2

	// physical size of FFT(kernel): store only the non-redundant part,
	// exploiting Y, Z mirror symmetry.
	// X mirror symmetry is already exploited: FFT(kernel) is purely real.
	physKSize := [3]int{c.fftKernLogicSize[X], c.fftKernLogicSize[Y]/2 + 1, c.fftKernLogicSize[Z]/2 + 1}

	output := c.fftCBuf[0]
	input := c.fftRBuf[0]
	fftKern := data.NewSlice(1, physKSize)
	kfull := data.NewSlice(1, output.Size()) // not yet exploiting symmetry
	kfulls := kfull.Scalars()
	// size of kernel after removing Y,Z redundant parts, but still complex
	kCSize := physKSize
	kCSize[X] *= 2
	kCmplx := data.NewSlice(1, kCSize) // not yet exploiting X symmetry
	kc := kCmplx.Scalars()

	for i := 0; i < 3; i++ {
		for j := i; j < 3; j++ { // upper triangular part
			if realKern[i][j] != nil { // ignore 0's
				// FW FFT
				data.Copy(input, realKern[i][j])
				c.fwPlan.ExecAsync(input, output)
				data.Copy(kfull, output)

				// extract non-redundant part (Y,Z symmetry)
				for iz := 0; iz < kCSize[Z]; iz++ {
					for iy := 0; iy < kCSize[Y]; iy++ {
						for ix := 0; ix < kCSize[X]; ix++ {
							kc[iz][iy][ix] = kfulls[iz][iy][ix]
						}
					}
				}

				// extract real parts (X symmetry)
				scaleRealParts(fftKern, kCmplx, 1/float32(c.fwPlan.InputLen()))
				c.kern[i][j] = GPUCopy(fftKern)
			}
		}
	}
}
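// scaleRealParts is not shown above; a sketch of its presumable behavior:
// keep only the real parts of the interleaved (re, im) kernel values, scaled
// by 1/InputLen() to undo the unnormalized forward FFT. By mirror symmetry
// the imaginary parts are ~0 and may be dropped.
func scaleRealPartsSketch(dst, src *data.Slice, scale float32) {
	util.Argument(2*dst.Len() == src.Len()) // src is complex, dst real
	d, s := dst.Host()[0], src.Host()[0]
	for i := 0; i < dst.Len(); i++ {
		d[i] = s[2*i] * scale // interleaved (re, im): keep re, drop im
	}
}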