// init allocates the data buffers and FFT plan of the MaxwellPlan.
// It is idempotent: only the first call has an effect.
func (plan *MaxwellPlan) init() {
	if plan.initialized {
		return
	}
	plan.initialized = true

	e := GetEngine()
	dataSize := e.GridSize()
	logicSize := e.PaddedSize()
	Assert(len(dataSize) == 3)
	Assert(len(logicSize) == 3)

	// init size
	copy(plan.dataSize[:], dataSize)
	copy(plan.logicSize[:], logicSize)

	// init fft
	fftOutputSize := gpu.FFTOutputSize(logicSize)
	plan.fftBuf = gpu.NewArray(3, fftOutputSize)
	plan.fftPlan = gpu.NewDefaultFFT(dataSize, logicSize)

	// init M
	plan.M = gpu.NewArray(3, dataSize)

	// init fftKern
	copy(plan.fftKernSize[:], gpu.FFTOutputSize(logicSize))
	plan.fftKernSize[2] = plan.fftKernSize[2] / 2 // store only non-redundant parts
}
// newDerivativeUpdater creates an updater that stores in diff the time derivative of orig.
func newDerivativeUpdater(orig, diff *Quant) *derivativeUpdater {
	u := new(derivativeUpdater)
	u.val = orig
	u.diff = diff
	u.lastVal = gpu.NewArray(orig.NComp(), orig.Size3D())  // TODO: alloc only if needed?
	u.lastDiff = gpu.NewArray(orig.NComp(), orig.Size3D()) // TODO: alloc only if needed?
	u.lastT = math.Inf(-1)                                 // so the first time the derivative is taken it will be 0
	u.lastStep = 0                                         //?
	return u
}
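// For reference, a minimal sketch of the backward-difference rule the fields above
// (lastVal, lastT) are set up for, written against plain slices rather than gpu.Array
// so it stays self-contained. backwardDiff is a hypothetical helper, not mumax2 API;
// it assumes "math" is imported.
func backwardDiff(val, lastVal []float64, t, lastT float64) []float64 {
	diff := make([]float64, len(val))
	if math.IsInf(lastT, -1) || t == lastT {
		return diff // first evaluation (lastT == -Inf): derivative taken to be 0
	}
	dt := t - lastT
	for i := range val {
		diff[i] = (val[i] - lastVal[i]) / dt // diff ≈ Δval / Δt
	}
	return diff
}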
// Mumax2 self-test function.
// Benchmarks CUDA memcpyDtoD.
func testMain() {
	size := []int{10, 1024, 1024}
	a := gpu.NewArray(1, size)
	defer a.Free()
	b := gpu.NewArray(1, size)
	defer b.Free()

	Log("Testing CUDA")
	N := 1000
	start := time.Now()
	for i := 0; i < N; i++ {
		a.CopyFromDevice(b)
	}
	t := float64(time.Now().Sub(start)) / 1e9 // elapsed time in seconds
	bw := float64(int64(Prod(size))*int64(N)*SIZEOF_FLOAT) / t
	bw /= 1e9
	Log("Multi-GPU bandwidth:", float64(bw), "GB/s")
}
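// Worked example of the numbers above, assuming SIZEOF_FLOAT is 4 (float32):
// Prod(size) = 10*1024*1024 = 10,485,760 floats per copy, so the loop moves
// 10,485,760 * 4 * 1000 ≈ 41.9 GB in total; dividing by the elapsed time in
// seconds and by 1e9 yields the reported bandwidth in GB/s.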
// NewFFTUpdater creates an updater that keeps qout up to date as the FFT of qin.
func NewFFTUpdater(qin, qout *Quant) *FFTUpdater {
	u := new(FFTUpdater)
	u.in = qin
	u.out = qout
	meshSize := engine.GridSize()

	// window function, copied once to the GPU
	u.win = gpu.NewArray(1, meshSize)
	u.win.CopyFromHost(genWindow(meshSize))

	// work buffer
	u.q = gpu.NewArray(qin.NComp(), meshSize)

	u.norm = 1.0 / float64(gpu.FFTNormLogic(meshSize))
	u.plan = gpu.NewDefaultFFT(meshSize, meshSize)

	engine.Depends(qout.Name(), qin.Name())
	return u
}
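// genWindow is defined elsewhere in mumax2; the sketch below only illustrates the
// kind of data a window generator returns: one weight per cell, flattened in mesh
// order. The separable Hann window used here is an assumption, not necessarily the
// window mumax2 uses, and the real genWindow presumably returns a host array rather
// than a plain slice. Assumes "math" is imported. hannWindow3D is a hypothetical helper.
func hannWindow3D(size []int) []float32 {
	hann := func(i, n int) float64 {
		if n == 1 {
			return 1 // a singleton dimension is left unweighted
		}
		return 0.5 * (1 - math.Cos(2*math.Pi*float64(i)/float64(n-1)))
	}
	win := make([]float32, size[0]*size[1]*size[2])
	idx := 0
	for x := 0; x < size[0]; x++ {
		for y := 0; y < size[1]; y++ {
			for z := 0; z < size[2]; z++ {
				win[idx] = float32(hann(x, size[0]) * hann(y, size[1]) * hann(z, size[2]))
				idx++
			}
		}
	}
	return win
}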
// LoadKernel loads a sub-kernel into the 3x3 global kernel matrix.
// The symmetry and real/imaginary/complex properties are taken into account to reduce storage.
func (plan *MaxwellPlan) LoadKernel(kernel *host.Array, matsymm int, realness int) {

	// for i := range kernel.Array {
	// 	Debug("kernel", TensorIndexStr[i], ":", kernel.Array[i], "\n\n\n")
	// }

	//Assert(kernel.NComp() == 9) // full tensor
	if kernel.NComp() > 3 {
		testedsymm := MatrixSymmetry(kernel)
		Debug("matsymm", testedsymm)
		// TODO: re-enable!
		//Assert(matsymm == testedsymm)
	}
	Assert(matsymm == SYMMETRIC || matsymm == ANTISYMMETRIC || matsymm == NOSYMMETRY || matsymm == DIAGONAL)

	// If the FFT'd kernel is purely real or imaginary,
	// store only the relevant part and multiply by the scaling factor later.
	scaling := [3]complex128{complex(1, 0), complex(0, 1), complex(0, 0)}[realness]
	Debug("scaling=", scaling)

	// FFT input on GPU
	logic := plan.logicSize[:]
	devIn := gpu.NewArray(1, logic)
	defer devIn.Free()

	// FFT output on GPU
	devOut := gpu.NewArray(1, gpu.FFTOutputSize(logic))
	defer devOut.Free()

	fullFFTPlan := gpu.NewDefaultFFT(logic, logic)
	defer fullFFTPlan.Free()

	// Maximum of all elements gives an idea of the scale.
	max := maxAbs(kernel.List)

	// FFT all components
	for k := 0; k < 9; k++ {
		i, j := IdxToIJ(k) // fills diagonal first, then upper, then lower

		// ignore off-diagonals of a vector kernel (would go out of bounds)
		if k > ZZ && matsymm == DIAGONAL {
			Debug("break", TensorIndexStr[k], "(off-diagonal)")
			break
		}

		// elements of a diagonal kernel are stored in one column
		if matsymm == DIAGONAL {
			i = 0
		}

		// each element may be loaded only once
		AssertMsg(plan.fftKern[i][j] == nil, "I'm afraid I can't let you overwrite that")
		AssertMsg(plan.fftMul[i][j] == 0, "Likewise")

		// auto-fill lower triangle if possible
		if k > XY {
			if matsymm == SYMMETRIC {
				plan.fftKern[i][j] = plan.fftKern[j][i]
				plan.fftMul[i][j] = plan.fftMul[j][i]
				continue
			}
			if matsymm == ANTISYMMETRIC {
				plan.fftKern[i][j] = plan.fftKern[j][i]
				plan.fftMul[i][j] = -plan.fftMul[j][i]
				continue
			}
		}

		// ignore zero components
		if k < kernel.NComp() && IsZero(kernel.Comp[k], max) {
			Debug("kernel", TensorIndexStr[k], " == 0")
			plan.fftKern[i][j] = gpu.NilArray(1, []int{plan.fftKernSize[X], plan.fftKernSize[Y], plan.fftKernSize[Z]})
			continue
		}

		// calculate the FFT of the kernel element
		Debug("use", TensorIndexStr[k])
		devIn.CopyFromHost(kernel.Component(k))
		fullFFTPlan.Forward(devIn, devOut)
		hostOut := devOut.LocalCopy()

		// Extract the real part of the kernel from the first quadrant
		// (the other parts are redundant due to the symmetry properties).
		hostFFTKern := extract(hostOut)
		rescale(hostFFTKern, 1/float64(gpu.FFTNormLogic(logic)))
		plan.fftKern[i][j] = gpu.NewArray(1, hostFFTKern.Size3D)
		plan.fftKern[i][j].CopyFromHost(hostFFTKern)
		plan.fftMul[i][j] = scaling
	}
}
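// maxAbs, IsZero, extract and rescale are defined elsewhere in mumax2. The sketch
// below only illustrates the kind of relative-tolerance test the "ignore zeros"
// branch above relies on; the names maxAbsSketch/isNegligible and the 1e-7 threshold
// are assumptions, not the values mumax2 uses. Assumes "math" is imported.
func maxAbsSketch(list []float32) float64 {
	max := 0.0
	for _, v := range list {
		if a := math.Abs(float64(v)); a > max {
			max = a
		}
	}
	return max
}

// isNegligible reports whether a kernel component is effectively zero relative to
// the largest entry of the full kernel, so its FFT need not be stored.
func isNegligible(comp []float32, max float64) bool {
	return maxAbsSketch(comp) <= 1e-7*max
}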
func (p *ArrayPool) Get(nComp int, size []int) *gpu.Array {
	// TODO: actual recycling
	return gpu.NewArray(nComp, size)
}
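// A minimal sketch of the recycling the TODO above hints at, kept on a separate
// hypothetical type so it stays self-contained: recyclingPool, its key scheme and
// the Put method are assumptions, not existing mumax2 code. Assumes "fmt" is imported.
type recyclingPool struct {
	free map[string][]*gpu.Array // released arrays, bucketed per shape
}

func poolKey(nComp int, size []int) string {
	return fmt.Sprint(nComp, "x", size) // one bucket per (nComp, size) combination
}

func (p *recyclingPool) Get(nComp int, size []int) *gpu.Array {
	key := poolKey(nComp, size)
	if list := p.free[key]; len(list) > 0 {
		a := list[len(list)-1]
		p.free[key] = list[:len(list)-1]
		return a // reuse a previously released array of the same shape
	}
	return gpu.NewArray(nComp, size) // nothing to recycle: allocate fresh
}

func (p *recyclingPool) Put(nComp int, size []int, a *gpu.Array) {
	if p.free == nil {
		p.free = make(map[string][]*gpu.Array)
	}
	key := poolKey(nComp, size)
	p.free[key] = append(p.free[key], a)
}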