func TestCpy(t *testing.T) { N0, N1, N2 := 2, 4, 32 N := N0 * N1 * N2 mesh := data.NewMesh(N0, N1, N2, 1, 1, 1) h1 := make([]float32, N) for i := range h1 { h1[i] = float32(i) } hs := data.SliceFromList([][]float32{h1}, mesh) d := NewSlice(1, mesh) data.Copy(d, hs) d2 := NewSlice(1, mesh) data.Copy(d2, d) h2 := data.NewSlice(1, mesh) data.Copy(h2, d2) res := h2.Host()[0] for i := range res { if res[i] != h1[i] { t.Fail() } } }
// Initialize GPU FFT kernel for 2D. // Only the non-redundant parts are stored on the GPU. func (c *DemagConvolution) initFFTKern2D() { padded := c.kernSize ffted := fftR2COutputSizeFloats(padded) realsize := ffted realsize[2] /= 2 c.fftKernSize = realsize halfkern := realsize halfkern[1] = halfkern[1]/2 + 1 fwPlan := c.fwPlan output := c.fftCBuf[0] input := c.fftRBuf[0] // upper triangular part fftKern := data.NewSlice(1, data.NewMesh(halfkern[0], halfkern[1], halfkern[2], 1, 1, 1)) for i := 0; i < 3; i++ { for j := i; j < 3; j++ { if c.kern[i][j] != nil { // ignore 0's data.Copy(input, c.kern[i][j]) fwPlan.Exec(input, output) scaleRealParts(fftKern, output.Slice(0, prod(halfkern)*2), 1/float32(fwPlan.InputLen())) c.gpuFFTKern[i][j] = GPUCopy(fftKern) } } } }
func testConvolution(c *DemagConvolution, mesh *data.Mesh) { inhost := data.NewSlice(3, mesh) initConvTestInput(inhost.Vectors()) gpu := NewSlice(3, mesh) defer gpu.Free() data.Copy(gpu, inhost) c.Exec(gpu, gpu, data.NilSlice(1, mesh), 1) output := gpu.HostCopy() //data.MustWriteFile("gpu.dump", output, 0) // rm! brute := data.NewSlice(3, mesh) bruteConv(inhost.Vectors(), brute.Vectors(), c.kern) //data.MustWriteFile("brute.dump", brute, 0) // rm! a, b := output.Host(), brute.Host() err := float32(0) for c := range a { for i := range a[c] { if abs(a[c][i]-b[c][i]) > err { err = abs(a[c][i] - b[c][i]) } } } if err > CONV_TOLERANCE { log.Fatal("convolution self-test error: ", err) } else { log.Println("convolution self-test error:", err) } }
func toGPU(list []float32) *data.Slice { mesh := data.NewMesh(1, 1, len(list), 1, 1, 1) h := data.SliceFromList([][]float32{list}, mesh) d := NewSlice(1, mesh) data.Copy(d, h) return d }
// continuously takes download tasks and queues corresponding save tasks. // the downloader queue is not buffered and we want to use at most one GPU // output buffer. Only one PCIe download at a time can proceed anyway. func runDownloader() { cuda.LockThread() for t := range dlQue { h := hostbuf() data.Copy(h, t.output) // output is already locked t.unlockOutput() saveQue <- saveTask{t.fname, h, t.time, func() { hBuf <- h }} } close(saveQue) }
// Step takes one time step.
//
// It proceeds in two stages. Stage 1 evaluates the torque, advances y by
// dt times that torque and keeps a copy of the torque in e.dy0. Stage 2
// advances the time by e.Dt_si, evaluates the torque again and, unless
// e.Fixdt is set, estimates the error as MaxVecDiff(dy0, dy) * dt.
//
// The step is accepted when the error is below e.MaxErr or when e.Dt_si is
// already at or below e.Mindt. An accepted step applies the stage-2
// correction to y, calls e.postStep, increments e.NSteps and records the
// error in e.LastErr. A rejected step restores the time and y and increments
// e.NUndone. In both cases e.adaptDt is called with a factor derived from
// e.MaxErr/err.
func (e *Heun) Step() {
	dy0 := e.dy0
	dt := float32(e.Dt_si * e.dt_mul) // could check here if it is in float32 ranges
	util.Assert(dt > 0)

	// stage 1
	{
		Dy := e.torqueFn(true) // <- hook here for output, always good step output
		dy := Dy.Read()
		y := e.y.Write()
		Madd2(y, y, dy, 1, dt) // y = y + dt * dy
		e.y.WriteDone()
		// keep the stage-1 torque: stage 2 needs it for the error estimate,
		// the correction and a possible undo
		data.Copy(dy0, dy)
		Dy.ReadDone()
	}

	// stage 2
	{
		// the time is advanced before the second torque evaluation
		*e.time += e.Dt_si
		Dy := e.torqueFn(false)
		dy := Dy.Read()

		// with a fixed time step no error is estimated and err stays 0
		err := 0.0
		if !e.Fixdt {
			err = MaxVecDiff(dy0, dy) * float64(dt)
			solverCheckErr(err)
		}

		y := e.y.Write()
		if err < e.MaxErr || e.Dt_si <= e.Mindt { // mindt check to avoid infinite loop
			// step OK
			// presumably y = y + 0.5*dt*dy - 0.5*dt*dy0, by analogy with the
			// Madd2 call in stage 1 -- TODO confirm against Madd3
			Madd3(y, y, dy, dy0, 1, 0.5*dt, -0.5*dt)
			e.postStep(y)
			e.NSteps++
			// NOTE(review): err is 0 here when e.Fixdt is set, so MaxErr/err
			// is not finite; adaptDt is assumed to cope with that -- confirm
			e.adaptDt(math.Pow(e.MaxErr/err, 1./2.))
			e.LastErr = err
		} else {
			// undo bad step
			util.Assert(!e.Fixdt)
			*e.time -= e.Dt_si       // take back the time advance made above
			Madd2(y, y, dy0, 1, -dt) // subtract the stage-1 update: y = y - dt * dy0
			e.NUndone++
			e.adaptDt(math.Pow(e.MaxErr/err, 1./3.))
		}
		e.y.WriteDone()
		Dy.ReadDone()
	}
}
func main() { cuda.Init() N0, N1, N2 := 16, 16, 16 c := 1. mesh := data.NewMesh(N0, N1, N2, c, c, c) m := cuda.NewSlice(3, mesh) conv := cuda.NewDemag(mesh) mhost := m.HostCopy() m_ := mhost.Vectors() r := float64(N2) / 2 for i := 0; i < N0; i++ { x := c * (float64(i) + 0.5 - float64(N0)/2) for j := 0; j < N1; j++ { y := c * (float64(j) + 0.5 - float64(N1)/2) for k := 0; k < N2; k++ { z := c * (float64(k) + 0.5 - float64(N2)/2) if x*x+y*y+z*z < r*r { m_[0][i][j][k] = 1 m_[1][i][j][k] = 2 m_[2][i][j][k] = 3 } } } } data.Copy(m, mhost) B := cuda.NewSlice(3, mesh) conv.Exec(B, m, data.NilSlice(1, mesh), 1) out := B.HostCopy() bx := out.Vectors()[0][N0/2][N1/2][N2/2] by := out.Vectors()[1][N0/2][N1/2][N2/2] bz := out.Vectors()[2][N0/2][N1/2][N2/2] fmt.Println("demag tensor:", bx, by/2, bz/3) check(bx, -1./3.) check(by, -2./3.) check(bz, -3./3.) fmt.Println("OK") }
func (m *buffered) Upload(src *data.Slice) { m_ := m.Write() data.Copy(m_, src) m.WriteDone() }
// Returns a copy of in, allocated on GPU. func GPUCopy(in *data.Slice) *data.Slice { s := NewSlice(in.NComp(), in.Mesh()) data.Copy(s, in) return s }