func (c *DemagConvolution) exec3D(outp, inp, vol *data.Slice, Bsat float64) { padded := c.kernSize // FW FFT for i := 0; i < 3; i++ { zero1(c.fftRBuf[i], c.stream) in := inp.Comp(i) copyPadMul(c.fftRBuf[i], in, padded, c.size, vol, Bsat, c.stream) c.fwPlan.ExecAsync(c.fftRBuf[i], c.fftCBuf[i]) } // kern mul N0, N1, N2 := c.fftKernSize[0], c.fftKernSize[1], c.fftKernSize[2] // TODO: rm these kernMulRSymm3D(c.fftCBuf, c.gpuFFTKern[0][0], c.gpuFFTKern[1][1], c.gpuFFTKern[2][2], c.gpuFFTKern[1][2], c.gpuFFTKern[0][2], c.gpuFFTKern[0][1], N0, N1, N2, c.stream) // BW FFT for i := 0; i < 3; i++ { c.bwPlan.ExecAsync(c.fftCBuf[i], c.fftRBuf[i]) out := outp.Comp(i) copyPad(out, c.fftRBuf[i], c.size, padded, c.stream) } c.stream.Synchronize() }
func (c *DemagConvolution) exec2D(outp, inp, vol *data.Slice, Bsat float64) { // Convolution is separated into // a 1D convolution for x and a 2D convolution for yz. // So only 2 FFT buffers are needed at the same time. // FFT x zero1(c.fftRBuf[0], c.stream) in := inp.Comp(0) padded := c.kernSize copyPadMul(c.fftRBuf[0], in, padded, c.size, vol, Bsat, c.stream) c.fwPlan.ExecAsync(c.fftRBuf[0], c.fftCBuf[0]) // kern mul X N1, N2 := c.fftKernSize[1], c.fftKernSize[2] // TODO: rm these kernMulRSymm2Dx(c.fftCBuf[0], c.gpuFFTKern[0][0], N1, N2, c.stream) // bw FFT x c.bwPlan.ExecAsync(c.fftCBuf[0], c.fftRBuf[0]) out := outp.Comp(0) copyPad(out, c.fftRBuf[0], c.size, padded, c.stream) // FW FFT yz for i := 1; i < 3; i++ { zero1(c.fftRBuf[i], c.stream) in := inp.Comp(i) copyPadMul(c.fftRBuf[i], in, padded, c.size, vol, Bsat, c.stream) c.fwPlan.ExecAsync(c.fftRBuf[i], c.fftCBuf[i]) } // kern mul yz kernMulRSymm2Dyz(c.fftCBuf[1], c.fftCBuf[2], c.gpuFFTKern[1][1], c.gpuFFTKern[2][2], c.gpuFFTKern[1][2], N1, N2, c.stream) // BW FFT yz for i := 1; i < 3; i++ { c.bwPlan.ExecAsync(c.fftCBuf[i], c.fftRBuf[i]) out := outp.Comp(i) copyPad(out, c.fftRBuf[i], c.size, padded, c.stream) } c.stream.Synchronize() }