// mul1N pointwise multiplies a scalar (1-component) slice a with an
// N-component vector slice b, yielding an N-component vector stored in dst.
func mul1N(dst, a, b *data.Slice) {
	util.Assert(a.NComp() == 1)
	util.Assert(dst.NComp() == b.NComp())
	for c := 0; c < dst.NComp(); c++ {
		cuda.Mul(dst.Comp(c), a, b.Comp(c))
	}
}
// divN1 pointwise divides an N-component vector slice a by a scalar
// (1-component) slice b, yielding an N-component vector stored in dst.
func divN1(dst, a, b *data.Slice) {
	util.Assert(dst.NComp() == a.NComp())
	util.Assert(b.NComp() == 1)
	for c := 0; c < dst.NComp(); c++ {
		cuda.Div(dst.Comp(c), a.Comp(c), b)
	}
}
// Madd3 performs a multiply-add:
//	dst[i] = src1[i]*factor1 + src2[i]*factor2 + src3[i]*factor3
func Madd3(dst, src1, src2, src3 *data.Slice, factor1, factor2, factor3 float32) {
	N := dst.Len()
	nComp := dst.NComp()
	util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N)
	util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp)
	cfg := make1DConf(N)
	for c := 0; c < nComp; c++ {
		k_madd3_async(dst.DevPtr(c), src1.DevPtr(c), factor1,
			src2.DevPtr(c), factor2, src3.DevPtr(c), factor3, N, cfg)
	}
}
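// exampleMadd3 is a usage sketch, not part of the original source: it shows
// how a solver corrector folds three terms into a single Madd3 launch instead
// of three separate kernels. The size and dt values are arbitrary, and the
// slices are assumed to be 3-component GPU buffers in this package.
func exampleMadd3() {
	size := [3]int{2, 32, 32} // hypothetical grid
	y, dy0, dy := NewSlice(3, size), NewSlice(3, size), NewSlice(3, size)
	defer y.Free()
	defer dy0.Free()
	defer dy.Free()
	dt := float32(1e-3)
	// replace a predictor increment dt*dy0 by the trapezoidal average:
	// y = y + 0.5*dt*dy - 0.5*dt*dy0
	Madd3(y, y, dy, dy0, 1, 0.5*dt, -0.5*dt)
}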
// Resize selects one layer of src and resizes it to dst's size, for
// interactive output. Both slices must be scalar (1-component), and dst must
// be a single layer whose in-plane size divides src's.
func Resize(dst, src *data.Slice, layer int) {
	dstsize := dst.Size()
	srcsize := src.Size()
	util.Assert(dstsize[Z] == 1)
	util.Assert(dst.NComp() == 1 && src.NComp() == 1)

	scalex := srcsize[X] / dstsize[X]
	scaley := srcsize[Y] / dstsize[Y]
	util.Assert(scalex > 0 && scaley > 0)

	cfg := make3DConf(dstsize)
	k_resize_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z],
		src.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z],
		layer, scalex, scaley, cfg)
}
// ShiftX shifts src by shiftX cells (positive or negative) along the X-axis
// into dst. New cells entering at an edge get the value clampL at the left
// edge or clampR at the right edge.
func ShiftX(dst, src *data.Slice, shiftX int, clampL, clampR float32) {
	util.Argument(dst.NComp() == 1 && src.NComp() == 1)
	util.Assert(dst.Len() == src.Len())
	N := dst.Size()
	cfg := make3DConf(N)
	k_shiftx_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftX, clampL, clampR, cfg)
}
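// exampleShiftX is a usage sketch, not part of the original source. It shifts
// a scalar field by one cell along X; the direction convention assumed here
// (values moving toward higher x, left edge filled with clampL) follows the
// comment above but is not verified against the kernel. Size is arbitrary.
func exampleShiftX() {
	size := [3]int{16, 16, 1}
	src := NewSlice(1, size)
	dst := NewSlice(1, size)
	defer src.Free()
	defer dst.Free()
	ShiftX(dst, src, 1, 0 /*clampL*/, 0 /*clampR*/)
}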
// Mul performs a pointwise multiplication:
//	dst[i] = a[i] * b[i]
// a and b must have the same length and number of components as dst.
func Mul(dst, a, b *data.Slice) {
	N := dst.Len()
	nComp := dst.NComp()
	util.Assert(a.Len() == N && a.NComp() == nComp && b.Len() == N && b.NComp() == nComp)
	cfg := make1DConf(N)
	for c := 0; c < nComp; c++ {
		k_mul_async(dst.DevPtr(c), a.DevPtr(c), b.DevPtr(c), N, cfg)
	}
}
// copyPadMul copies src into dst, which is larger, and multiplies by vol*Bsat.
// The remainder of dst is not filled with zeros.
// Used to zero-pad the magnetization before convolution while, on the fly,
// multiplying the unit magnetization m by the saturation magnetization.
func copyPadMul(dst, src, vol *data.Slice, dstsize, srcsize [3]int, Msat MSlice) {
	util.Argument(dst.NComp() == 1 && src.NComp() == 1)
	util.Assert(dst.Len() == prod(dstsize) && src.Len() == prod(srcsize))
	cfg := make3DConf(srcsize)
	k_copypadmul2_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z],
		src.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z],
		Msat.DevPtr(0), Msat.Mul(0), vol.DevPtr(0), cfg)
}
// copyPadMul, older region/LUT-based variant of the function above: copies
// src into dst, which is larger, and multiplies by vol*Bsat looked up per
// region. The remainder of dst is not filled with zeros.
func copyPadMul(dst, src, vol *data.Slice, dstsize, srcsize [3]int, Bsat LUTPtr, regions *Bytes) {
	util.Argument(dst.NComp() == 1 && src.NComp() == 1)
	util.Assert(dst.Len() == prod(dstsize) && src.Len() == prod(srcsize))
	cfg := make3DConf(srcsize)
	k_copypadmul_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z],
		src.DevPtr(0), vol.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z],
		unsafe.Pointer(Bsat), regions.Ptr, cfg)
}
// Step implements the adaptive Heun (predictor-corrector) method; it can be
// used as solver.Step.
func (_ *Heun) Step() {
	y := M.Buffer()
	dy0 := cuda.Buffer(VECTOR, y.Size())
	defer cuda.Recycle(dy0)

	if FixDt != 0 {
		Dt_si = FixDt
	}
	dt := float32(Dt_si * GammaLL)
	util.Assert(dt > 0)

	// stage 1: Euler predictor
	torqueFn(dy0)
	cuda.Madd2(y, y, dy0, 1, dt) // y = y + dt * dy0

	// stage 2: torque at the predicted point
	dy := cuda.Buffer(VECTOR, y.Size())
	defer cuda.Recycle(dy)
	Time += Dt_si
	torqueFn(dy)

	err := cuda.MaxVecDiff(dy0, dy) * float64(dt)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK: apply the trapezoidal corrector
		cuda.Madd3(y, y, dy, dy0, 1, 0.5*dt, -0.5*dt) // y = y0 + 0.5*dt*(dy0+dy)
		M.normalize()
		NSteps++
		adaptDt(math.Pow(MaxErr/err, 1./2.))
		setLastErr(err)
		setMaxTorque(dy)
	} else {
		// undo bad step
		util.Assert(FixDt == 0)
		Time -= Dt_si
		cuda.Madd2(y, y, dy0, 1, -dt)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./3.))
	}
}
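// heunSketch is a self-contained toy version, not from the original source,
// of the scheme above for a scalar ODE dy/dt = f(y): Euler predictor,
// trapezoidal corrector, and the same |f1-f0|*dt local error estimate that
// drives the adaptive step control (cf. MaxVecDiff above).
func heunSketch(f func(float64) float64, y, dt float64) (ynew, err float64) {
	f0 := f(y)                 // stage 1: torque at current point
	ypred := y + dt*f0         // Euler predictor
	f1 := f(ypred)             // stage 2: torque at predicted point
	ynew = y + 0.5*dt*(f0+f1)  // trapezoidal corrector (2nd order)
	err = math.Abs(f1-f0) * dt // local error estimate
	return ynew, err
}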
// Downsample returns a slice of size N, smaller than the size of In, using
// averaging interpolation over the input cells.
// In is returned untouched if the sizes are already equal.
func Downsample(In [][][][]float32, N [3]int) [][][][]float32 {
	if SizeOf(In[0]) == N {
		return In // nothing to do
	}

	nComp := len(In)
	out := NewSlice(nComp, N)
	Out := out.Tensors()

	srcsize := SizeOf(In[0])
	dstsize := SizeOf(Out[0])

	Dx, Dy, Dz := dstsize[X], dstsize[Y], dstsize[Z]
	Sx, Sy, Sz := srcsize[X], srcsize[Y], srcsize[Z]
	scalex := Sx / Dx
	scaley := Sy / Dy
	scalez := Sz / Dz
	util.Assert(scalex > 0 && scaley > 0 && scalez > 0)

	for c := range Out {
		for iz := 0; iz < Dz; iz++ {
			for iy := 0; iy < Dy; iy++ {
				for ix := 0; ix < Dx; ix++ {
					// average the scalex*scaley*scalez input cells
					// that map onto this output cell
					sum, n := 0.0, 0.0
					for I := 0; I < scalez; I++ {
						i2 := iz*scalez + I
						for J := 0; J < scaley; J++ {
							j2 := iy*scaley + J
							for K := 0; K < scalex; K++ {
								k2 := ix*scalex + K
								if i2 < Sz && j2 < Sy && k2 < Sx {
									sum += float64(In[c][i2][j2][k2])
									n++
								}
							}
						}
					}
					Out[c][iz][iy][ix] = float32(sum / n)
				}
			}
		}
	}
	return Out
}
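// exampleDownsample is a minimal usage sketch, not in the original source:
// it averages a single-component 4x4x1 field down to 2x2x1, so each output
// cell is the mean of a 2x2 block of input cells. It assumes the surrounding
// package helpers NewSlice and Tensors.
func exampleDownsample() {
	in := NewSlice(1, [3]int{4, 4, 1}).Tensors()
	in[0][0][0][0] = 1 // set one input cell; the rest stay 0
	out := Downsample(in, [3]int{2, 2, 1})
	_ = out[0][0][0][0] // = (1+0+0+0)/4 = 0.25
}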
// CalcDemagKernel calculates the magnetostatic kernel by brute-force
// integration of magnetic charges over the faces, averaged over cell volumes.
func CalcDemagKernel(inputSize, pbc [3]int, cellsize [3]float64, accuracy float64) (kernel [3][3]*data.Slice) {

	// Add zero-padding in non-PBC directions
	size := padSize(inputSize, pbc)

	// Sanity check
	{
		util.Assert(size[Z] > 0 && size[Y] > 0 && size[X] > 0)
		util.Assert(cellsize[X] > 0 && cellsize[Y] > 0 && cellsize[Z] > 0)
		util.Assert(pbc[X] >= 0 && pbc[Y] >= 0 && pbc[Z] >= 0)
		util.Assert(accuracy > 0)
	}

	// Allocate only upper diagonal part. The rest is symmetric due to reciprocity.
	var array [3][3][][][]float32
	for i := 0; i < 3; i++ {
		for j := i; j < 3; j++ {
			kernel[i][j] = data.NewSlice(1, size)
			array[i][j] = kernel[i][j].Scalars()
		}
	}

	// Field (destination) loop ranges
	r1, r2 := kernelRanges(size, pbc)

	// smallest cell dimension is our typical length scale
	L := cellsize[X]
	{
		if cellsize[Y] < L {
			L = cellsize[Y]
		}
		if cellsize[Z] < L {
			L = cellsize[Z]
		}
	}

	progress, progmax := 0, (1+(r2[Y]-r1[Y]))*(1+(r2[Z]-r1[Z])) // progress bar
	done := make(chan struct{}, 3)                              // signals that one component is done

	// Start brute integration
	// 9 nested loops, does that stress you out?
	// Fortunately, the 5 inner ones usually loop over just one element.
	for s := 0; s < 3; s++ { // source index Ksdxyz (parallelized over)
		go func(s int) {
			u, v, w := s, (s+1)%3, (s+2)%3 // u = direction of source (s), v & w are the orthogonal directions
			var (
				R, R2  [3]float64 // field and source cell center positions
				pole   [3]float64 // position of point charge on the surface
				points int        // counts used integration points
			)
			for z := r1[Z]; z <= r2[Z]; z++ {
				zw := wrap(z, size[Z])
				// skip one half, reconstruct from symmetry later
				// check on wrapped index instead of loop range so it also works for PBC
				if zw > size[Z]/2 {
					if s == 0 {
						progress += (1 + (r2[Y] - r1[Y]))
					}
					continue
				}
				R[Z] = float64(z) * cellsize[Z]

				for y := r1[Y]; y <= r2[Y]; y++ {
					if s == 0 { // show progress of only one component
						progress++
						util.Progress(progress, progmax, "Calculating demag kernel")
					}
					yw := wrap(y, size[Y])
					if yw > size[Y]/2 {
						continue
					}
					R[Y] = float64(y) * cellsize[Y]

					for x := r1[X]; x <= r2[X]; x++ {
						xw := wrap(x, size[X])
						if xw > size[X]/2 {
							continue
						}
						R[X] = float64(x) * cellsize[X]

						// choose number of integration points depending on how far we are from the source
						dx, dy, dz := delta(x)*cellsize[X], delta(y)*cellsize[Y], delta(z)*cellsize[Z]
						d := math.Sqrt(dx*dx + dy*dy + dz*dz)
						if d == 0 {
							d = L
						}
						maxSize := d / accuracy // maximum acceptable integration size

						nv := int(math.Max(cellsize[v]/maxSize, 1) + 0.5)
						nw := int(math.Max(cellsize[w]/maxSize, 1) + 0.5)
						nx := int(math.Max(cellsize[X]/maxSize, 1) + 0.5)
						ny := int(math.Max(cellsize[Y]/maxSize, 1) + 0.5)
						nz := int(math.Max(cellsize[Z]/maxSize, 1) + 0.5)
						// Stagger source and destination grids.
						// Massively improves accuracy, see note.
						nv *= 2
						nw *= 2
						util.Assert(nv > 0 && nw > 0 && nx > 0 && ny > 0 && nz > 0)

						scale := 1 / float64(nv*nw*nx*ny*nz)
						surface := cellsize[v] * cellsize[w] // the two directions perpendicular to direction s
						charge := surface * scale
						pu1 := cellsize[u] / 2. // positive pole center
						pu2 := -pu1             // negative pole center

						// Do surface integral over source cell, accumulate in B
						var B [3]float64
						for i := 0; i < nv; i++ {
							pv := -(cellsize[v] / 2.) + cellsize[v]/float64(2*nv) + float64(i)*(cellsize[v]/float64(nv))
							pole[v] = pv
							for j := 0; j < nw; j++ {
								pw := -(cellsize[w] / 2.) + cellsize[w]/float64(2*nw) + float64(j)*(cellsize[w]/float64(nw))
								pole[w] = pw

								// Do volume integral over destination cell
								for α := 0; α < nx; α++ {
									rx := R[X] - cellsize[X]/2 + cellsize[X]/float64(2*nx) + (cellsize[X]/float64(nx))*float64(α)
									for β := 0; β < ny; β++ {
										ry := R[Y] - cellsize[Y]/2 + cellsize[Y]/float64(2*ny) + (cellsize[Y]/float64(ny))*float64(β)
										for γ := 0; γ < nz; γ++ {
											rz := R[Z] - cellsize[Z]/2 + cellsize[Z]/float64(2*nz) + (cellsize[Z]/float64(nz))*float64(γ)
											points++

											pole[u] = pu1
											R2[X], R2[Y], R2[Z] = rx-pole[X], ry-pole[Y], rz-pole[Z]
											r := math.Sqrt(R2[X]*R2[X] + R2[Y]*R2[Y] + R2[Z]*R2[Z])
											qr := charge / (4 * math.Pi * r * r * r)
											bx := R2[X] * qr
											by := R2[Y] * qr
											bz := R2[Z] * qr

											pole[u] = pu2
											R2[X], R2[Y], R2[Z] = rx-pole[X], ry-pole[Y], rz-pole[Z]
											r = math.Sqrt(R2[X]*R2[X] + R2[Y]*R2[Y] + R2[Z]*R2[Z])
											qr = -charge / (4 * math.Pi * r * r * r)
											B[X] += (bx + R2[X]*qr) // addition ordered for accuracy
											B[Y] += (by + R2[Y]*qr)
											B[Z] += (bz + R2[Z]*qr)
										}
									}
								}
							}
						}
						for d := s; d < 3; d++ { // destination index Ksdxyz
							array[s][d][zw][yw][xw] += float32(B[d]) // += needed in case of PBC
						}
					}
				}
			}
			done <- struct{}{} // notify that this component is done
		}(s)
	}
	// wait for all 3 components to finish
	<-done
	<-done
	<-done

	// Reconstruct skipped parts from symmetry (X)
	for z := 0; z < size[Z]; z++ {
		for y := 0; y < size[Y]; y++ {
			for x := size[X]/2 + 1; x < size[X]; x++ {
				x2 := size[X] - x
				array[X][X][z][y][x] = array[X][X][z][y][x2]
				array[X][Y][z][y][x] = -array[X][Y][z][y][x2]
				array[X][Z][z][y][x] = -array[X][Z][z][y][x2]
				array[Y][Y][z][y][x] = array[Y][Y][z][y][x2]
				array[Y][Z][z][y][x] = array[Y][Z][z][y][x2]
				array[Z][Z][z][y][x] = array[Z][Z][z][y][x2]
			}
		}
	}

	// Reconstruct skipped parts from symmetry (Y)
	for z := 0; z < size[Z]; z++ {
		for y := size[Y]/2 + 1; y < size[Y]; y++ {
			y2 := size[Y] - y
			for x := 0; x < size[X]; x++ {
				array[X][X][z][y][x] = array[X][X][z][y2][x]
				array[X][Y][z][y][x] = -array[X][Y][z][y2][x]
				array[X][Z][z][y][x] = array[X][Z][z][y2][x]
				array[Y][Y][z][y][x] = array[Y][Y][z][y2][x]
				array[Y][Z][z][y][x] = -array[Y][Z][z][y2][x]
				array[Z][Z][z][y][x] = array[Z][Z][z][y2][x]
			}
		}
	}

	// Reconstruct skipped parts from symmetry (Z)
	for z := size[Z]/2 + 1; z < size[Z]; z++ {
		z2 := size[Z] - z
		for y := 0; y < size[Y]; y++ {
			for x := 0; x < size[X]; x++ {
				array[X][X][z][y][x] = array[X][X][z2][y][x]
				array[X][Y][z][y][x] = array[X][Y][z2][y][x]
				array[X][Z][z][y][x] = -array[X][Z][z2][y][x]
				array[Y][Y][z][y][x] = array[Y][Y][z2][y][x]
				array[Y][Z][z][y][x] = -array[Y][Z][z2][y][x]
				array[Z][Z][z][y][x] = array[Z][Z][z2][y][x]
			}
		}
	}

	// for 2D these elements are zero:
	if size[Z] == 1 {
		kernel[X][Z] = nil
		kernel[Y][Z] = nil
	}
	// make result symmetric for tools that expect it so.
	kernel[Y][X] = kernel[X][Y]
	kernel[Z][X] = kernel[X][Z]
	kernel[Z][Y] = kernel[Y][Z]
	return kernel
}
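// midpoints is an explanatory sketch, not in the original source, of the
// sampling rule used repeatedly above: n integration points placed at the
// midpoints of n equal sub-intervals of a cell of size c, centered on the
// cell, i.e. p_i = -c/2 + c/(2n) + i*c/n for i = 0..n-1.
func midpoints(c float64, n int) []float64 {
	p := make([]float64, n)
	for i := range p {
		p[i] = -c/2 + c/float64(2*n) + float64(i)*c/float64(n)
	}
	return p
}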
func (c *DemagConvolution) init(realKern [3][3]*data.Slice) {
	// init device buffers
	// 2D re-uses fftBuf[X] as fftBuf[Z], 3D needs all 3 fftBufs.
	nc := fftR2COutputSizeFloats(c.realKernSize)
	c.fftCBuf[X] = NewSlice(1, nc)
	c.fftCBuf[Y] = NewSlice(1, nc)
	if c.is2D() {
		c.fftCBuf[Z] = c.fftCBuf[X]
	} else {
		c.fftCBuf[Z] = NewSlice(1, nc)
	}
	// The real buffer shares storage with the complex buffer
	for i := 0; i < 3; i++ {
		c.fftRBuf[i] = data.SliceFromPtrs(c.realKernSize, data.GPUMemory,
			[]unsafe.Pointer{c.fftCBuf[i].DevPtr(0)})
	}

	// init FFT plans
	c.fwPlan = newFFT3DR2C(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z])
	c.bwPlan = newFFT3DC2R(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z])

	// init FFT kernel

	// logical size of FFT(kernel): store real parts only
	c.fftKernLogicSize = fftR2COutputSizeFloats(c.realKernSize)
	util.Assert(c.fftKernLogicSize[X]%2 == 0)
	c.fftKernLogicSize[X] /= 2

	// physical size of FFT(kernel): store only the non-redundant part,
	// exploiting Y and Z mirror symmetry.
	// X mirror symmetry is already exploited: FFT(kernel) is purely real.
	physKSize := [3]int{c.fftKernLogicSize[X], c.fftKernLogicSize[Y]/2 + 1, c.fftKernLogicSize[Z]/2 + 1}

	output := c.fftCBuf[0]
	input := c.fftRBuf[0]
	fftKern := data.NewSlice(1, physKSize)
	kfull := data.NewSlice(1, output.Size()) // not yet exploiting symmetry
	kfulls := kfull.Scalars()
	kCSize := physKSize
	kCSize[X] *= 2                     // size of kernel after removing Y,Z redundant parts, but still complex
	kCmplx := data.NewSlice(1, kCSize) // not yet exploiting X symmetry
	kc := kCmplx.Scalars()

	for i := 0; i < 3; i++ {
		for j := i; j < 3; j++ { // upper triangular part
			if realKern[i][j] != nil { // ignore 0's
				// FW FFT
				data.Copy(input, realKern[i][j])
				c.fwPlan.ExecAsync(input, output)
				data.Copy(kfull, output)

				// extract non-redundant part (Y,Z symmetry)
				for iz := 0; iz < kCSize[Z]; iz++ {
					for iy := 0; iy < kCSize[Y]; iy++ {
						for ix := 0; ix < kCSize[X]; ix++ {
							kc[iz][iy][ix] = kfulls[iz][iy][ix]
						}
					}
				}

				// extract real parts (X symmetry)
				scaleRealParts(fftKern, kCmplx, 1/float32(c.fwPlan.InputLen()))
				c.kern[i][j] = GPUCopy(fftKern)
			}
		}
	}
}
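// r2cOutputSizeFloatsSketch is an explanatory sketch of the bookkeeping
// behind fftR2COutputSizeFloats, not the library's actual implementation:
// a real-to-complex FFT of an Nx*Ny*Nz real array yields (Nx/2+1)*Ny*Nz
// complex numbers, i.e. 2*(Nx/2+1) floats along X. This is why the logical
// kernel size above is obtained by halving the X float count.
func r2cOutputSizeFloatsSketch(realSize [3]int) [3]int {
	return [3]int{2 * (realSize[0]/2 + 1), realSize[1], realSize[2]}
}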
// Step implements the classical 4th-order Runge-Kutta method, with an
// adaptive time step driven by the difference between the first and last
// stage torques.
func (rk *RK4) Step() {
	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	k1, k2, k3, k4 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)
	defer cuda.Recycle(k1)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// stage 1
	torqueFn(k1)

	// stage 2
	Time = t0 + (1./2.)*Dt_si
	cuda.Madd2(m, m, k1, 1, (1./2.)*h) // m = m0 + k1*h/2
	M.normalize()
	torqueFn(k2)

	// stage 3
	cuda.Madd2(m, m0, k2, 1, (1./2.)*h) // m = m0 + k2*h/2
	M.normalize()
	torqueFn(k3)

	// stage 4
	Time = t0 + Dt_si
	cuda.Madd2(m, m0, k3, 1, 1.*h) // m = m0 + k3*h
	M.normalize()
	torqueFn(k4)

	err := cuda.MaxVecDiff(k1, k4) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK: 4th-order solution
		madd5(m, m0, k1, k2, k3, k4, 1, (1./6.)*h, (1./3.)*h, (1./3.)*h, (1./6.)*h)
		M.normalize()
		NSteps++
		adaptDt(math.Pow(MaxErr/err, 1./4.))
		setLastErr(err)
		setMaxTorque(k4)
	} else {
		// undo bad step
		//util.Println("Bad step at t=", t0, ", err=", err)
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./5.))
	}
}
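// Explanatory note, not in the original source: the stages above follow the
// classical RK4 Butcher tableau
//	0   |
//	1/2 | 1/2
//	1/2 | 0    1/2
//	1   | 0    0    1
//	----+---------------------
//	    | 1/6  1/3  1/3  1/6
// RK4 has no embedded lower-order solution, so |k4-k1|*h is used as a cheap
// heuristic error estimate, serving only to drive the adaptive step control.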
// Step implements the Dormand-Prince RK45 method with adaptive time step and
// FSAL (first-same-as-last) optimization.
func (rk *RK45DP) Step() {
	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	// upon resize: remove wrongly sized k1
	if rk.k1.Size() != m.Size() {
		rk.Free()
	}

	// first step ever: one-time k1 init and eval
	if rk.k1 == nil {
		rk.k1 = cuda.NewSlice(3, size)
		torqueFn(rk.k1)
	}

	// FSAL cannot be used with finite temperature
	if !Temp.isZero() {
		torqueFn(rk.k1)
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	k2, k3, k4, k5, k6 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)
	defer cuda.Recycle(k5)
	defer cuda.Recycle(k6)
	// k2 will be re-used as k7

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// there is no explicit stage 1: k1 is taken from the previous step

	// stage 2
	Time = t0 + (1./5.)*Dt_si
	cuda.Madd2(m, m, rk.k1, 1, (1./5.)*h) // m = m + k1*h/5
	M.normalize()
	torqueFn(k2)

	// stage 3
	Time = t0 + (3./10.)*Dt_si
	cuda.Madd3(m, m0, rk.k1, k2, 1, (3./40.)*h, (9./40.)*h)
	M.normalize()
	torqueFn(k3)

	// stage 4
	Time = t0 + (4./5.)*Dt_si
	madd4(m, m0, rk.k1, k2, k3, 1, (44./45.)*h, (-56./15.)*h, (32./9.)*h)
	M.normalize()
	torqueFn(k4)

	// stage 5
	Time = t0 + (8./9.)*Dt_si
	madd5(m, m0, rk.k1, k2, k3, k4, 1, (19372./6561.)*h, (-25360./2187.)*h, (64448./6561.)*h, (-212./729.)*h)
	M.normalize()
	torqueFn(k5)

	// stage 6
	Time = t0 + (1.)*Dt_si
	madd6(m, m0, rk.k1, k2, k3, k4, k5, 1, (9017./3168.)*h, (-355./33.)*h, (46732./5247.)*h, (49./176.)*h, (-5103./18656.)*h)
	M.normalize()
	torqueFn(k6)

	// stage 7: 5th-order solution (k2 has weight 0 and is omitted)
	Time = t0 + (1.)*Dt_si
	madd6(m, m0, rk.k1, k3, k4, k5, k6, 1, (35./384.)*h, (500./1113.)*h, (125./192.)*h, (-2187./6784.)*h, (11./84.)*h)
	M.normalize()
	k7 := k2     // re-use k2
	torqueFn(k7) // next step's torque, if this step is accepted

	// error estimate: difference between the 5th-order and the embedded
	// 4th-order solution, formed directly from the stage weights
	Err := cuda.Buffer(3, size)
	defer cuda.Recycle(Err)
	madd6(Err, rk.k1, k3, k4, k5, k6, k7,
		(35./384.)-(5179./57600.),
		(500./1113.)-(7571./16695.),
		(125./192.)-(393./640.),
		(-2187./6784.)-(-92097./339200.),
		(11./84.)-(187./2100.),
		(0.)-(1./40.))

	// determine error
	err := cuda.MaxVecNorm(Err) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		setLastErr(err)
		setMaxTorque(k7)
		NSteps++
		Time = t0 + Dt_si
		adaptDt(math.Pow(MaxErr/err, 1./5.))
		data.Copy(rk.k1, k7) // FSAL
	} else {
		// undo bad step
		//util.Println("Bad step at t=", t0, ", err=", err)
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./6.))
	}
}
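// Explanatory note, not in the original source: FSAL ("first same as last")
// works because the 7th Dormand-Prince stage is evaluated exactly at the
// accepted 5th-order solution. On success, its torque k7 is copied into
// rk.k1 and serves as stage 1 of the next step, saving one torque evaluation
// per step. With finite temperature the thermal field decorrelates between
// steps, so k1 must be re-evaluated (see the Temp.isZero() check above).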
// unslice converts a 3-element slice to a [3]float64 array.
func unslice(v []float64) [3]float64 {
	util.Assert(len(v) == 3)
	return [3]float64{v[0], v[1], v[2]}
}
// paramDiv sets dst = a/b pointwise; safediv guards against division by zero
// where b == 0.
func paramDiv(dst, a, b [][NREGION]float32) {
	util.Assert(len(dst) == 1 && len(a) == 1 && len(b) == 1)
	for i := 0; i < NREGION; i++ { // not regions.maxreg
		dst[0][i] = safediv(a[0][i], b[0][i])
	}
}
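// safedivSketch shows the presumable behavior of the safediv helper used
// above; the actual implementation lives elsewhere in the package and may
// differ. The idea: an ordinary division that returns 0 instead of Inf/NaN
// when the denominator is zero.
func safedivSketch(a, b float32) float32 {
	if b == 0 {
		return 0
	}
	return a / b
}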
// Step implements the Bogacki-Shampine RK23 method with adaptive time step
// and FSAL optimization.
func (rk *RK23) Step() {
	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	// upon resize: remove wrongly sized k1
	if rk.k1.Size() != m.Size() {
		rk.Free()
	}

	// first step ever: one-time k1 init and eval
	if rk.k1 == nil {
		rk.k1 = cuda.NewSlice(3, size)
		torqueFn(rk.k1)
	}

	// FSAL cannot be used with finite temperature
	if !Temp.isZero() {
		torqueFn(rk.k1)
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	k2, k3, k4 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// there is no explicit stage 1: k1 is taken from the previous step

	// stage 2
	Time = t0 + (1./2.)*Dt_si
	cuda.Madd2(m, m, rk.k1, 1, (1./2.)*h) // m = m + k1*h/2
	M.normalize()
	torqueFn(k2)

	// stage 3
	Time = t0 + (3./4.)*Dt_si
	cuda.Madd2(m, m0, k2, 1, (3./4.)*h) // m = m0 + k2*3h/4
	M.normalize()
	torqueFn(k3)

	// 3rd-order solution
	madd4(m, m0, rk.k1, k2, k3, 1, (2./9.)*h, (1./3.)*h, (4./9.)*h)
	M.normalize()

	// error estimate
	Time = t0 + Dt_si
	torqueFn(k4)
	Err := k2 // re-use k2 as error buffer
	// difference of the embedded 2nd-order and the 3rd-order solution,
	// formed directly without storing either solution explicitly
	madd4(Err, rk.k1, k2, k3, k4, (7./24.)-(2./9.), (1./4.)-(1./3.), (1./3.)-(4./9.), (1./8.)-(0.))

	// determine error
	err := cuda.MaxVecNorm(Err) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		setLastErr(err)
		setMaxTorque(k4)
		NSteps++
		Time = t0 + Dt_si
		adaptDt(math.Pow(MaxErr/err, 1./3.))
		data.Copy(rk.k1, k4) // FSAL
	} else {
		// undo bad step
		//util.Println("Bad step at t=", t0, ", err=", err)
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./4.))
	}
}
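// Explanatory note, not in the original source: the coefficients above are
// the Bogacki-Shampine 3(2) pair. The 3rd-order solution weights (k1..k4)
// are (2/9, 1/3, 4/9, 0); the embedded 2nd-order weights are
// (7/24, 1/4, 1/3, 1/8). The madd4 call forms their difference directly,
// which is why its last coefficient is (1/8 - 0) = 1/8.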
// MSlice returns the excitation, evaluated on the GPU, as a cuda.MSlice.
func (p *Excitation) MSlice() cuda.MSlice {
	buf, r := p.Slice()
	util.Assert(r)
	return cuda.ToMSlice(buf)
}
// MFMKernel computes the kernel for the vertical derivative of the force on
// an MFM tip due to mx, my, mz: the 2nd derivative of the energy w.r.t. z.
func MFMKernel(mesh *d.Mesh, lift, tipsize float64) (kernel [3]*d.Slice) {

	const TipCharge = 1 / Mu0 // tip charge
	const Δ = 1e-9            // tip oscillation amplitude; take the 2nd derivative over this distance

	util.AssertMsg(lift > 0, "MFM tip crashed into sample, please lift the new one higher")

	{ // Kernel mesh is 2x larger than input, except in PBC directions, where it keeps the input size
		pbc := mesh.PBC()
		sz := padSize(mesh.Size(), pbc)
		cs := mesh.CellSize()
		mesh = d.NewMesh(sz[X], sz[Y], sz[Z], cs[X], cs[Y], cs[Z], pbc[:]...)
	}

	// Shorthand
	size := mesh.Size()
	pbc := mesh.PBC()
	cellsize := mesh.CellSize()
	volume := cellsize[X] * cellsize[Y] * cellsize[Z]
	fmt.Println("calculating MFM kernel")

	// Sanity check
	{
		util.Assert(size[Z] >= 1 && size[Y] >= 2 && size[X] >= 2)
		util.Assert(cellsize[X] > 0 && cellsize[Y] > 0 && cellsize[Z] > 0)
		util.AssertMsg(size[X]%2 == 0 && size[Y]%2 == 0, "Even kernel size needed")
		if size[Z] > 1 {
			util.AssertMsg(size[Z]%2 == 0, "Even kernel size needed")
		}
	}

	// Allocate kernel components, one per source magnetization direction
	var K [3][][][]float32
	for i := 0; i < 3; i++ {
		kernel[i] = d.NewSlice(1, mesh.Size())
		K[i] = kernel[i].Scalars()
	}

	r1, r2 := kernelRanges(size, pbc)
	progress, progmax := 0, (1+r2[Y]-r1[Y])*(1+r2[Z]-r1[Z])

	for iz := r1[Z]; iz <= r2[Z]; iz++ {
		zw := wrap(iz, size[Z])
		z := float64(iz) * cellsize[Z]

		for iy := r1[Y]; iy <= r2[Y]; iy++ {
			yw := wrap(iy, size[Y])
			y := float64(iy) * cellsize[Y]
			progress++
			util.Progress(progress, progmax, "Calculating MFM kernel")

			for ix := r1[X]; ix <= r2[X]; ix++ {
				x := float64(ix) * cellsize[X]
				xw := wrap(ix, size[X])

				for s := 0; s < 3; s++ { // source index Ksxyz
					m := d.Vector{0, 0, 0}
					m[s] = 1 // unit magnetization along direction s

					var E [3]float64 // 3 energies for the 2nd derivative
					for i := -1; i <= 1; i++ {
						I := float64(i)
						R := d.Vector{-x, -y, z - (lift + (I * Δ))}
						r := R.Len()
						B := R.Mul(TipCharge / (4 * math.Pi * r * r * r))

						R = d.Vector{-x, -y, z - (lift + tipsize + (I * Δ))}
						r = R.Len()
						B = B.Add(R.Mul(-TipCharge / (4 * math.Pi * r * r * r)))

						E[i+1] = B.Dot(m) * volume // i=-1 stored in E[0]
					}

					dFdz_tip := ((E[0] - E[1]) + (E[2] - E[1])) / (Δ * Δ) // dFz/dz = d2E/dz2
					K[s][zw][yw][xw] += float32(dFdz_tip)                 // += needed in case of PBC
				}
			}
		}
	}
	return kernel
}
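// Explanatory note, not in the original source: the dFdz_tip expression above
// is the central second difference
//	d2E/dz2 ≈ (E(z-Δ) - 2E(z) + E(z+Δ)) / Δ²
// written as ((E[0]-E[1]) + (E[2]-E[1]))/Δ², so the two small differences are
// formed first and only then added, which loses less floating-point precision
// than subtracting 2*E[1] from the large sum E[0]+E[2].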
// gpuLUT1 is a utility for LUTs of single-component data.
func (p *lut) gpuLUT1() cuda.LUTPtr {
	util.Assert(len(p.gpu_buf) == 1)
	return cuda.LUTPtr(p.gpuLUT()[0])
}