// TestCpy checks a host -> GPU -> GPU -> host copy round trip: the data must arrive back unchanged.
func TestCpy(t *testing.T) {
	Init(0)
	N0, N1, N2 := 2, 4, 32
	N := N0 * N1 * N2
	mesh := [3]int{N0, N1, N2}

	h1 := make([]float32, N)
	for i := range h1 {
		h1[i] = float32(i)
	}
	hs := sliceFromList([][]float32{h1}, mesh)

	d := NewSlice(1, mesh)
	data.Copy(d, hs)

	d2 := NewSlice(1, mesh)
	data.Copy(d2, d)

	h2 := data.NewSlice(1, mesh)
	data.Copy(h2, d2)

	res := h2.Host()[0]
	for i := range res {
		if res[i] != h1[i] {
			t.Fail()
		}
	}
}
// Step performs one steepest-descent minimization step with an adaptive step size (see the note below).
func (mini *Minimizer) Step() {
	m := M.Buffer()
	size := m.Size()
	k := mini.k
	h := mini.h

	// save original magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	// make descent
	cuda.Minimize(m, m0, k, h)

	// calculate new torque for next step
	k0 := cuda.Buffer(3, size)
	defer cuda.Recycle(k0)
	data.Copy(k0, k)
	torqueFn(k)
	setMaxTorque(k) // report to user

	// just to make the following readable
	dm := m0
	dk := k0

	// calculate step difference of m and k
	cuda.Madd2(dm, m, m0, 1., -1.)
	cuda.Madd2(dk, k, k0, -1., 1.) // reversed due to LLNoPrecess sign

	// get maxdiff and add to list
	max_dm := cuda.MaxVecNorm(dm)
	mini.lastDm.Add(max_dm)
	setLastErr(mini.lastDm.Max()) // report maxDm to user as LastErr

	// adjust next time step
	var nom, div float32
	if NSteps%2 == 0 {
		nom = cuda.Dot(dm, dm)
		div = cuda.Dot(dm, dk)
	} else {
		nom = cuda.Dot(dm, dk)
		div = cuda.Dot(dk, dk)
	}
	if div != 0. {
		mini.h = nom / div
	} else { // in case of division by zero
		mini.h = 1e-4
	}

	M.normalize()

	// as a convention, time does not advance during relax
	NSteps++
}
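// Note on the step-size update in Minimizer.Step (an interpretation, not part
// of the original source): with dm = m - m0 and dk = -(k - k0), the two
// alternating quotients
//
//	even steps: h = <dm, dm> / <dm, dk>
//	odd steps:  h = <dm, dk> / <dk, dk>
//
// are the two classical Barzilai-Borwein step sizes for steepest descent;
// alternating between them is a common stabilization of that scheme. The
// fallback h = 1e-4 simply restarts with a tiny step when the denominator
// vanishes.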
func (g *geom) shift(dx int) {
	// empty mask, nothing to do
	if g == nil || g.buffer.IsNil() {
		return
	}

	// allocated mask: shift
	s := g.buffer
	s2 := cuda.Buffer(1, g.Mesh().Size())
	defer cuda.Recycle(s2)
	newv := float32(1) // initially fill edges with 1's
	cuda.ShiftX(s2, s, dx, newv, newv)
	data.Copy(s, s2)

	n := Mesh().Size()
	x1, x2 := shiftDirtyRange(dx)

	for iz := 0; iz < n[Z]; iz++ {
		for iy := 0; iy < n[Y]; iy++ {
			for ix := x1; ix < x2; ix++ {
				r := Index2Coord(ix, iy, iz) // includes shift
				if !g.shape(r[X], r[Y], r[Z]) {
					cuda.SetCell(g.buffer, 0, ix, iy, iz, 0) // a bit slowish, but hardly reached
				}
			}
		}
	}
}
// SetArray sets the magnetization from src, resampling it to the mesh size if needed.
func (b *magnetization) SetArray(src *data.Slice) {
	if src.Size() != b.Mesh().Size() {
		src = data.Resample(src, b.Mesh().Size())
	}
	data.Copy(b.Buffer(), src)
	M.normalize()
}
// toGPU uploads a list of floats to a newly allocated 1x1xN GPU slice.
func toGPU(list []float32) *data.Slice {
	mesh := [3]int{1, 1, len(list)}
	h := sliceFromList([][]float32{list}, mesh)
	d := NewSlice(1, mesh)
	data.Copy(d, h)
	return d
}
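// A minimal usage sketch for toGPU (hypothetical, for illustration only):
//
//	d := toGPU([]float32{0, 1, 2, 3})
//	defer d.Free()
//	back := d.HostCopy().Host()[0] // back[i] == float32(i)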
// resize adapts the magnetization buffer to the current mesh, resampling the existing data.
func (m *magnetization) resize() {
	backup := m.Buffer().HostCopy()
	s2 := Mesh().Size()
	resized := data.Resample(backup, s2)
	m.buffer_.Free()
	m.buffer_ = cuda.NewSlice(VECTOR, s2)
	data.Copy(m.buffer_, resized)
}
// Backward Euler method (implicit midpoint variant), can be used as solver.Step.
func (s *BackwardEuler) Step() {
	util.AssertMsg(MaxErr > 0, "Backward euler solver requires MaxErr > 0")

	t0 := Time

	y := M.Buffer()

	y0 := cuda.Buffer(VECTOR, y.Size())
	defer cuda.Recycle(y0)
	data.Copy(y0, y)

	dy0 := cuda.Buffer(VECTOR, y.Size())
	defer cuda.Recycle(dy0)

	if s.dy1 == nil {
		s.dy1 = cuda.Buffer(VECTOR, y.Size())
	}
	dy1 := s.dy1

	Dt_si = FixDt
	dt := float32(Dt_si * GammaLL)
	util.AssertMsg(dt > 0, "Backward Euler solver requires fixed time step > 0")

	// First guess
	Time = t0 + 0.5*Dt_si // 0.5 dt makes it implicit midpoint method

	// with temperature, previous torque cannot be used as predictor
	if Temp.isZero() {
		cuda.Madd2(y, y0, dy1, 1, dt) // predictor euler step with previous torque
		M.normalize()
	}

	torqueFn(dy0)
	cuda.Madd2(y, y0, dy0, 1, dt) // y = y0 + dt * dy
	M.normalize()

	// One iteration
	torqueFn(dy1)
	cuda.Madd2(y, y0, dy1, 1, dt) // y = y0 + dt * dy1
	M.normalize()

	Time = t0 + Dt_si

	err := cuda.MaxVecDiff(dy0, dy1) * float64(dt)

	// adjust next time step
	//if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
	// step OK
	NSteps++
	setLastErr(err)
	setMaxTorque(dy1)
	//} else {
	// undo bad step
	//	util.Assert(FixDt == 0)
	//	Time = t0
	//	data.Copy(y, y0)
	//	NUndone++
	//}
}
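// Reading of the scheme above (an interpretation, not from the source):
// evaluating the torque at the midpoint time t0 + dt/2 and repeating
//
//	y <- y0 + dt * torque(y)
//
// is a fixed-point sweep towards the implicit-midpoint solution; the code
// performs exactly one extra iteration (dy0 is the first evaluation, dy1 the
// second) and reports the change between the two torques, max|dy1-dy0| * dt,
// as the error.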
func shiftMag(m *data.Slice, dx int) {
	m2 := cuda.Buffer(1, m.Size())
	defer cuda.Recycle(m2)
	for c := 0; c < m.NComp(); c++ {
		comp := m.Comp(c)
		cuda.ShiftX(m2, comp, dx, float32(ShiftMagL[c]), float32(ShiftMagR[c]))
		data.Copy(comp, m2) // str0 ?
	}
}
func (c *MFMConvolution) initFFTKern3D() {
	c.fftKernSize = fftR2COutputSizeFloats(c.kernSize)

	for i := 0; i < 3; i++ {
		zero1_async(c.fftRBuf)
		data.Copy(c.fftRBuf, c.kern[i])
		c.fwPlan.ExecAsync(c.fftRBuf, c.fftCBuf)
		scale := 2 / float32(c.fwPlan.InputLen()) // ??
		zero1_async(c.gpuFFTKern[i])
		Madd2(c.gpuFFTKern[i], c.gpuFFTKern[i], c.fftCBuf, 0, scale)
	}
}
// Compares FFT-accelerated convolution against brute-force on sparse data.
// This is not really needed but very quickly uncovers newly introduced bugs.
func testConvolution(c *DemagConvolution, PBC [3]int, realKern [3][3]*data.Slice) {
	if PBC != [3]int{0, 0, 0} || prod(c.inputSize) > 512*512 {
		// the brute-force method does not work for pbc,
		// and for large simulations it gets just too slow.
		util.Log("skipping convolution self-test")
		return
	}
	//fmt.Print("convolution test ")
	inhost := data.NewSlice(3, c.inputSize)
	initConvTestInput(inhost.Vectors())
	gpu := NewSlice(3, c.inputSize)
	defer gpu.Free()
	data.Copy(gpu, inhost)

	regions := NewBytes(prod(c.inputSize))
	defer regions.Free()
	Bsat := NewSlice(1, [3]int{1, 1, 256})
	defer Bsat.Free()
	Memset(Bsat, 1)
	BsatLUT := LUTPtr(Bsat.DevPtr(0))

	vol := data.NilSlice(1, c.inputSize)
	c.Exec(gpu, gpu, vol, BsatLUT, regions)

	output := gpu.HostCopy()

	brute := data.NewSlice(3, c.inputSize)
	bruteConv(inhost.Vectors(), brute.Vectors(), realKern)

	a, b := output.Host(), brute.Host()
	err := float32(0)
	for c := range a {
		for i := range a[c] {
			if fabs(a[c][i]-b[c][i]) > err {
				err = fabs(a[c][i] - b[c][i])
			}
		}
	}
	if err > CONV_TOLERANCE {
		util.Fatal("convolution self-test tolerance: ", err, " FAIL")
	}
}
// Compares FFT-accelerated convolution against brute-force on sparse data.
// This is not really needed but very quickly uncovers newly introduced bugs.
func testConvolution(c *DemagConvolution, PBC [3]int, realKern [3][3]*data.Slice) {
	if PBC != [3]int{0, 0, 0} {
		// the brute-force method does not work for pbc.
		util.Log("skipping convolution self-test for PBC")
		return
	}
	util.Log("//convolution self-test...")
	inhost := data.NewSlice(3, c.inputSize)
	initConvTestInput(inhost.Vectors())
	gpu := NewSlice(3, c.inputSize)
	defer gpu.Free()
	data.Copy(gpu, inhost)

	Msat := NewSlice(1, [3]int{1, 1, 256})
	defer Msat.Free()
	Memset(Msat, 1)

	vol := data.NilSlice(1, c.inputSize)
	c.Exec(gpu, gpu, vol, ToMSlice(Msat))

	output := gpu.HostCopy()

	brute := data.NewSlice(3, c.inputSize)
	bruteConv(inhost.Vectors(), brute.Vectors(), realKern)

	a, b := output.Host(), brute.Host()
	err := float32(0)
	for c := range a {
		for i := range a[c] {
			if fabs(a[c][i]-b[c][i]) > err {
				err = fabs(a[c][i] - b[c][i])
			}
		}
	}
	if err > CONV_TOLERANCE {
		util.Fatal("convolution self-test tolerance: ", err, " FAIL")
	}
}
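// What both self-test variants check, in formula form (assumed from
// bruteConv's name and arguments; its body is not shown here): the FFT path
// must reproduce the direct tensor convolution
//
//	B_i(r) = Σ_{r'} Σ_j K_ij(r - r') m_j(r')
//
// for every cell r, up to CONV_TOLERANCE in each component.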
// Sets dst to the demag field, but cells where NoDemagSpins != 0 neither generate nor receive field.
func setMaskedDemagField(dst *data.Slice) {
	// No-demag spins: mask-out geometry with zeros where NoDemagSpins is set,
	// so these spins do not generate a field.
	buf := cuda.Buffer(SCALAR, geometry.Gpu().Size()) // masked-out geometry
	defer cuda.Recycle(buf)

	// obtain a copy of the geometry mask, which we can overwrite
	geom, r := geometry.Slice()
	if r {
		defer cuda.Recycle(geom)
	}
	data.Copy(buf, geom)

	// mask-out
	cuda.ZeroMask(buf, NoDemagSpins.gpuLUT1(), regions.Gpu())

	// convolution with masked-out cells
	demagConv().Exec(dst, M.Buffer(), buf, Bsat.gpuLUT1(), regions.Gpu())

	// After convolution, mask-out the field in the NoDemagSpins cells
	// so they don't feel the field generated by others.
	cuda.ZeroMask(dst, NoDemagSpins.gpuLUT1(), regions.Gpu())
}
// Returns a copy of in, allocated on GPU.
func GPUCopy(in *data.Slice) *data.Slice {
	s := NewSlice(in.NComp(), in.Size())
	data.Copy(s, in)
	return s
}
// Step makes one adaptive Bogacki-Shampine RK3(2) step (see the reference note below).
func (rk *RK23) Step() {
	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	// upon resize: remove wrongly sized k1
	if rk.k1.Size() != m.Size() {
		rk.Free()
	}

	// first step ever: one-time k1 init and eval
	if rk.k1 == nil {
		rk.k1 = cuda.NewSlice(3, size)
		torqueFn(rk.k1)
	}

	// FSAL cannot be used with temperature
	if !Temp.isZero() {
		torqueFn(rk.k1)
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	k2, k3, k4 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// there is no explicit stage 1: k1 from previous step

	// stage 2
	Time = t0 + (1./2.)*Dt_si
	cuda.Madd2(m, m, rk.k1, 1, (1./2.)*h) // m = m*1 + k1*h/2
	M.normalize()
	torqueFn(k2)

	// stage 3
	Time = t0 + (3./4.)*Dt_si
	cuda.Madd2(m, m0, k2, 1, (3./4.)*h) // m = m0*1 + k2*3/4
	M.normalize()
	torqueFn(k3)

	// 3rd order solution
	madd4(m, m0, rk.k1, k2, k3, 1, (2./9.)*h, (1./3.)*h, (4./9.)*h)
	M.normalize()

	// error estimate
	Time = t0 + Dt_si
	torqueFn(k4)
	Err := k2 // re-use k2 as error
	// difference of 3rd and 2nd order torque without explicitly storing them first
	madd4(Err, rk.k1, k2, k3, k4, (7./24.)-(2./9.), (1./4.)-(1./3.), (1./3.)-(4./9.), (1. / 8.))

	// determine error
	err := cuda.MaxVecNorm(Err) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		setLastErr(err)
		setMaxTorque(k4)
		NSteps++
		Time = t0 + Dt_si
		adaptDt(math.Pow(MaxErr/err, 1./3.))
		data.Copy(rk.k1, k4) // FSAL
	} else {
		// undo bad step
		//util.Println("Bad step at t=", t0, ", err=", err)
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./4.))
	}
}
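// For reference, the standard Bogacki-Shampine RK3(2) coefficients matching
// the stages above (FSAL: the last evaluation k4 is saved as the next step's
// k1):
//
//	nodes:             c  = [0, 1/2, 3/4, 1]
//	3rd-order weights: b  = [2/9, 1/3, 4/9, 0]
//	2nd-order weights: b* = [7/24, 1/4, 1/3, 1/8]
//
// The error is max|Σ_i (b_i - b*_i) k_i| * h, and a good step rescales dt by
// (MaxErr/err)^(1/3).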
// rescale and download the quantity, save the result in imgBuf
func (ren *render) download() {
	InjectAndWait(func() {
		if ren.quant == nil { // not yet set, default = m
			ren.quant = &M
		}
		quant := ren.quant
		size := quant.Mesh().Size()

		// don't slice out of bounds
		renderLayer := ren.layer
		if renderLayer >= size[Z] {
			renderLayer = size[Z] - 1
		}
		if renderLayer < 0 {
			renderLayer = 0
		}

		// scaling sanity check
		if ren.scale < 1 {
			ren.scale = 1
		}
		if ren.scale > maxScale {
			ren.scale = maxScale
		}
		// Don't render too large images or we choke
		for size[X]/ren.scale > maxImgSize {
			ren.scale++
		}
		for size[Y]/ren.scale > maxImgSize {
			ren.scale++
		}

		for i := range size {
			size[i] /= ren.scale
			if size[i] == 0 {
				size[i] = 1
			}
		}
		size[Z] = 1 // selects one layer

		// make sure buffers are there
		if ren.imgBuf.Size() != size {
			ren.imgBuf = data.NewSlice(3, size) // always 3-comp, may be re-used
		}
		buf, r := quant.Slice()
		if r {
			defer cuda.Recycle(buf)
		}
		if !buf.GPUAccess() {
			ren.imgBuf = Download(quant) // fallback (no zoom)
			return
		}

		// make sure buffers are there (in CUDA context)
		if ren.rescaleBuf.Size() != size {
			ren.rescaleBuf.Free()
			ren.rescaleBuf = cuda.NewSlice(1, size)
		}

		for c := 0; c < quant.NComp(); c++ {
			cuda.Resize(ren.rescaleBuf, buf.Comp(c), renderLayer)
			data.Copy(ren.imgBuf.Comp(c), ren.rescaleBuf)
		}
	})
}
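// Summary of the flow above: the renderer downsamples on the GPU. ren.scale
// is grown until the image fits maxImgSize, cuda.Resize selects one Z layer
// and shrinks each component by that integer factor, and only the small image
// is copied to the host buffer. Quantities without GPU storage fall back to a
// full, non-zoomable Download.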
// Step makes one adaptive Dormand-Prince RK5(4) step (see the reference note below).
func (rk *RK45DP) Step() {
	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	// upon resize: remove wrongly sized k1
	if rk.k1.Size() != m.Size() {
		rk.Free()
	}

	// first step ever: one-time k1 init and eval
	if rk.k1 == nil {
		rk.k1 = cuda.NewSlice(3, size)
		torqueFn(rk.k1)
	}

	// FSAL cannot be used with finite temperature
	if !Temp.isZero() {
		torqueFn(rk.k1)
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	k2, k3, k4, k5, k6 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)
	defer cuda.Recycle(k5)
	defer cuda.Recycle(k6)
	// k2 will be re-used as k7

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// there is no explicit stage 1: k1 from previous step

	// stage 2
	Time = t0 + (1./5.)*Dt_si
	cuda.Madd2(m, m, rk.k1, 1, (1./5.)*h) // m = m*1 + k1*h/5
	M.normalize()
	torqueFn(k2)

	// stage 3
	Time = t0 + (3./10.)*Dt_si
	cuda.Madd3(m, m0, rk.k1, k2, 1, (3./40.)*h, (9./40.)*h)
	M.normalize()
	torqueFn(k3)

	// stage 4
	Time = t0 + (4./5.)*Dt_si
	madd4(m, m0, rk.k1, k2, k3, 1, (44./45.)*h, (-56./15.)*h, (32./9.)*h)
	M.normalize()
	torqueFn(k4)

	// stage 5
	Time = t0 + (8./9.)*Dt_si
	madd5(m, m0, rk.k1, k2, k3, k4, 1, (19372./6561.)*h, (-25360./2187.)*h, (64448./6561.)*h, (-212./729.)*h)
	M.normalize()
	torqueFn(k5)

	// stage 6
	Time = t0 + (1.)*Dt_si
	madd6(m, m0, rk.k1, k2, k3, k4, k5, 1, (9017./3168.)*h, (-355./33.)*h, (46732./5247.)*h, (49./176.)*h, (-5103./18656.)*h)
	M.normalize()
	torqueFn(k6)

	// stage 7: 5th order solution
	Time = t0 + (1.)*Dt_si
	// no k2
	madd6(m, m0, rk.k1, k3, k4, k5, k6, 1, (35./384.)*h, (500./1113.)*h, (125./192.)*h, (-2187./6784.)*h, (11./84.)*h) // 5th
	M.normalize()
	k7 := k2      // re-use k2
	torqueFn(k7)  // next torque if OK

	// error estimate
	Err := cuda.Buffer(3, size) //k3 // re-use k3 as error estimate
	defer cuda.Recycle(Err)
	madd6(Err, rk.k1, k3, k4, k5, k6, k7,
		(35./384.)-(5179./57600.),
		(500./1113.)-(7571./16695.),
		(125./192.)-(393./640.),
		(-2187./6784.)-(-92097./339200.),
		(11./84.)-(187./2100.),
		(0.)-(1./40.))

	// determine error
	err := cuda.MaxVecNorm(Err) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		setLastErr(err)
		setMaxTorque(k7)
		NSteps++
		Time = t0 + Dt_si
		adaptDt(math.Pow(MaxErr/err, 1./5.))
		data.Copy(rk.k1, k7) // FSAL
	} else {
		// undo bad step
		//util.Println("Bad step at t=", t0, ", err=", err)
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./6.))
	}
}
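// For reference, these are the standard Dormand-Prince RK5(4) ("DOPRI5")
// coefficients. The 5th-order weights
//
//	b = [35/384, 0, 500/1113, 125/192, -2187/6784, 11/84, 0]
//
// contain no k2 term, which is why stage 7 skips it, and the accepted
// solution's torque equals the last stage (FSAL), so k7 becomes the next
// step's k1. The error weights are b - b*, with the embedded 4th-order
//
//	b* = [5179/57600, 0, 7571/16695, 393/640, -92097/339200, 187/2100, 1/40]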
func (m *magnetization) EvalTo(dst *data.Slice) {
	data.Copy(dst, m.buffer_)
}
// Step makes one classical RK4 step with a heuristic error estimate (see the note below).
func (rk *RK4) Step() {
	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	k1, k2, k3, k4 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)
	defer cuda.Recycle(k1)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// stage 1
	torqueFn(k1)

	// stage 2
	Time = t0 + (1./2.)*Dt_si
	cuda.Madd2(m, m, k1, 1, (1./2.)*h) // m = m*1 + k1*h/2
	M.normalize()
	torqueFn(k2)

	// stage 3
	cuda.Madd2(m, m0, k2, 1, (1./2.)*h) // m = m0*1 + k2*1/2
	M.normalize()
	torqueFn(k3)

	// stage 4
	Time = t0 + Dt_si
	cuda.Madd2(m, m0, k3, 1, 1.*h) // m = m0*1 + k3*1
	M.normalize()
	torqueFn(k4)

	err := cuda.MaxVecDiff(k1, k4) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		// 4th order solution
		madd5(m, m0, k1, k2, k3, k4, 1, (1./6.)*h, (1./3.)*h, (1./3.)*h, (1./6.)*h)
		M.normalize()
		NSteps++
		adaptDt(math.Pow(MaxErr/err, 1./4.))
		setLastErr(err)
		setMaxTorque(k4)
	} else {
		// undo bad step
		//util.Println("Bad step at t=", t0, ", err=", err)
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./5.))
	}
}
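// Classical RK4 (weights [1/6, 1/3, 1/3, 1/6] at nodes [0, 1/2, 1/2, 1]) has
// no embedded lower-order solution, so max|k4-k1| * h is a heuristic error
// indicator rather than a true truncation-error estimate. A self-contained
// sketch of the accept/reject logic shared by the adaptive steppers above
// (illustrative only; nextDt is hypothetical, not the actual adaptDt):
func nextDt(dt, err, maxErr, order float64) (newDt float64, accept bool) {
	accept = err < maxErr
	// rescale dt by (maxErr/err)^(1/order); the solvers use the method order
	// on success and order+1 on a rejected step
	return dt * math.Pow(maxErr/err, 1/order), accept
}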
// init initializes device buffers, FFT plans, and the transformed demag kernel.
func (c *DemagConvolution) init(realKern [3][3]*data.Slice) {
	// init device buffers
	// 2D re-uses fftBuf[X] as fftBuf[Z], 3D needs all 3 fftBufs.
	nc := fftR2COutputSizeFloats(c.realKernSize)
	c.fftCBuf[X] = NewSlice(1, nc)
	c.fftCBuf[Y] = NewSlice(1, nc)
	if c.is2D() {
		c.fftCBuf[Z] = c.fftCBuf[X]
	} else {
		c.fftCBuf[Z] = NewSlice(1, nc)
	}
	// Real buffer shares storage with Complex buffer
	for i := 0; i < 3; i++ {
		c.fftRBuf[i] = data.SliceFromPtrs(c.realKernSize, data.GPUMemory,
			[]unsafe.Pointer{c.fftCBuf[i].DevPtr(0)})
	}

	// init FFT plans
	c.fwPlan = newFFT3DR2C(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z])
	c.bwPlan = newFFT3DC2R(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z])

	// init FFT kernel

	// logic size of FFT(kernel): store real parts only
	c.fftKernLogicSize = fftR2COutputSizeFloats(c.realKernSize)
	util.Assert(c.fftKernLogicSize[X]%2 == 0)
	c.fftKernLogicSize[X] /= 2

	// physical size of FFT(kernel): store only non-redundant part exploiting Y, Z mirror symmetry
	// X mirror symmetry already exploited: FFT(kernel) is purely real.
	physKSize := [3]int{c.fftKernLogicSize[X], c.fftKernLogicSize[Y]/2 + 1, c.fftKernLogicSize[Z]/2 + 1}

	output := c.fftCBuf[0]
	input := c.fftRBuf[0]
	fftKern := data.NewSlice(1, physKSize)
	kfull := data.NewSlice(1, output.Size()) // not yet exploiting symmetry
	kfulls := kfull.Scalars()
	kCSize := physKSize
	kCSize[X] *= 2                     // size of kernel after removing Y,Z redundant parts, but still complex
	kCmplx := data.NewSlice(1, kCSize) // not yet exploiting X symmetry
	kc := kCmplx.Scalars()

	for i := 0; i < 3; i++ {
		for j := i; j < 3; j++ { // upper triangular part
			if realKern[i][j] != nil { // ignore 0's
				// FW FFT
				data.Copy(input, realKern[i][j])
				c.fwPlan.ExecAsync(input, output)
				data.Copy(kfull, output)

				// extract non-redundant part (Y,Z symmetry)
				for iz := 0; iz < kCSize[Z]; iz++ {
					for iy := 0; iy < kCSize[Y]; iy++ {
						for ix := 0; ix < kCSize[X]; ix++ {
							kc[iz][iy][ix] = kfulls[iz][iy][ix]
						}
					}
				}

				// extract real parts (X symmetry)
				scaleRealParts(fftKern, kCmplx, 1/float32(c.fwPlan.InputLen()))
				c.kern[i][j] = GPUCopy(fftKern)
			}
		}
	}
}
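// Background on the symmetry exploitation above (an interpretation, not
// stated in the source): each demag kernel component is real and even along
// every axis, and the DFT of a real, even sequence is purely real, so only
// real parts need storing (the X halving). Evenness in Y and Z likewise makes
// those halves of the spectrum mirror images, leaving the size/2+1
// non-redundant planes kept in physKSize. The tensor itself is symmetric
// (K_ij = K_ji), hence only the upper triangle j >= i is transformed.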
// setGeom sets the geometry shape, rasterizes it to cell volume fractions, and re-initializes m where needed.
func (geometry *geom) setGeom(s Shape) {
	SetBusy(true)
	defer SetBusy(false)

	if s == nil {
		// TODO: would be nice not to save volume if entirely filled
		s = universe
	}

	geometry.shape = s
	if geometry.Gpu().IsNil() {
		geometry.buffer = cuda.NewSlice(1, geometry.Mesh().Size())
	}

	host := data.NewSlice(1, geometry.Gpu().Size())
	array := host.Scalars()
	V := host
	v := array
	n := geometry.Mesh().Size()
	c := geometry.Mesh().CellSize()
	cx, cy, cz := c[X], c[Y], c[Z]

	progress, progmax := 0, n[Y]*n[Z]

	var ok bool
	for iz := 0; iz < n[Z]; iz++ {
		for iy := 0; iy < n[Y]; iy++ {
			progress++
			util.Progress(progress, progmax, "Initializing geometry")
			for ix := 0; ix < n[X]; ix++ {
				r := Index2Coord(ix, iy, iz)
				x0, y0, z0 := r[X], r[Y], r[Z]

				// check if center and all vertices lie inside or all outside
				allIn, allOut := true, true
				if s(x0, y0, z0) {
					allOut = false
				} else {
					allIn = false
				}

				if edgeSmooth != 0 { // center is sufficient if we're not really smoothing
					for _, Δx := range []float64{-cx / 2, cx / 2} {
						for _, Δy := range []float64{-cy / 2, cy / 2} {
							for _, Δz := range []float64{-cz / 2, cz / 2} {
								if s(x0+Δx, y0+Δy, z0+Δz) { // inside
									allOut = false
								} else {
									allIn = false
								}
							}
						}
					}
				}

				switch {
				case allIn:
					v[iz][iy][ix] = 1
					ok = true
				case allOut:
					v[iz][iy][ix] = 0
				default:
					v[iz][iy][ix] = geometry.cellVolume(ix, iy, iz)
					ok = ok || (v[iz][iy][ix] != 0)
				}
			}
		}
	}

	if !ok {
		util.Fatal("SetGeom: geometry completely empty")
	}

	data.Copy(geometry.buffer, V)

	// M inside geom but previously outside needs to be re-inited
	needupload := false
	geomlist := host.Host()[0]
	mhost := M.Buffer().HostCopy()
	m := mhost.Host()
	rng := rand.New(rand.NewSource(0))
	for i := range m[0] {
		if geomlist[i] != 0 {
			mx, my, mz := m[X][i], m[Y][i], m[Z][i]
			if mx == 0 && my == 0 && mz == 0 {
				needupload = true
				rnd := randomDir(rng)
				m[X][i], m[Y][i], m[Z][i] = float32(rnd[X]), float32(rnd[Y]), float32(rnd[Z])
			}
		}
	}
	if needupload {
		data.Copy(M.Buffer(), mhost)
	}

	M.normalize() // removes m outside vol
}
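// Note on the volume fractions (an interpretation): the center-plus-8-corner
// test only classifies a cell as fully inside (1), fully outside (0), or
// boundary; for boundary cells cellVolume (not shown here) is assumed to
// estimate the filled fraction, which then smooths the sample edge when
// edgeSmooth != 0.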