func TestSliceSlice(t *testing.T) { LockThread() N0, N1, N2 := 1, 10, 10 c := 1e-6 m := data.NewMesh(N0, N1, N2, c, c, c) a := NewUnifiedSlice(3, m) h := a.Host() h[1][21] = 42 b := a.Slice(20, 30) if b.Len() != 30-20 { t.Fail() } if b.NComp() != a.NComp() { t.Fail() } if b.Host()[1][1] != 42 { t.Fail() } if *a.Mesh() != *b.Mesh() { t.Fail() } if a.MemType() != b.MemType() { t.Fail() } }
func TestCpy(t *testing.T) { N0, N1, N2 := 2, 4, 32 N := N0 * N1 * N2 mesh := data.NewMesh(N0, N1, N2, 1, 1, 1) h1 := make([]float32, N) for i := range h1 { h1[i] = float32(i) } hs := data.SliceFromList([][]float32{h1}, mesh) d := NewSlice(1, mesh) data.Copy(d, hs) d2 := NewSlice(1, mesh) data.Copy(d2, d) h2 := data.NewSlice(1, mesh) data.Copy(h2, d2) res := h2.Host()[0] for i := range res { if res[i] != h1[i] { t.Fail() } } }
// Initialize GPU FFT kernel for 2D. // Only the non-redundant parts are stored on the GPU. func (c *DemagConvolution) initFFTKern2D() { padded := c.kernSize ffted := fftR2COutputSizeFloats(padded) realsize := ffted realsize[2] /= 2 c.fftKernSize = realsize halfkern := realsize halfkern[1] = halfkern[1]/2 + 1 fwPlan := c.fwPlan output := c.fftCBuf[0] input := c.fftRBuf[0] // upper triangular part fftKern := data.NewSlice(1, data.NewMesh(halfkern[0], halfkern[1], halfkern[2], 1, 1, 1)) for i := 0; i < 3; i++ { for j := i; j < 3; j++ { if c.kern[i][j] != nil { // ignore 0's data.Copy(input, c.kern[i][j]) fwPlan.Exec(input, output) scaleRealParts(fftKern, output.Slice(0, prod(halfkern)*2), 1/float32(fwPlan.InputLen())) c.gpuFFTKern[i][j] = GPUCopy(fftKern) } } } }
func TestSlice(t *testing.T) { N0, N1, N2 := 2, 4, 8 c := 1e-6 m := data.NewMesh(N0, N1, N2, c, c, c) N := N0 * N1 * N2 a := NewSlice(3, m) defer a.Free() Memset(a, 1, 2, 3) if a.GPUAccess() == false { t.Fail() } if a.Len() != N { t.Fail() } if a.NComp() != 3 { t.Fail() } b := a.Comp(1) if b.GPUAccess() == false { t.Error("b.GPUAccess", b.GPUAccess()) } if b.Len() != N { t.Error("b.Len", b.Len()) } if b.NComp() != 1 { t.Error("b.NComp", b.NComp()) } if *b.Mesh() != *a.Mesh() { t.Fail() } }
func toGPU(list []float32) *data.Slice { mesh := data.NewMesh(1, 1, len(list), 1, 1, 1) h := data.SliceFromList([][]float32{list}, mesh) d := NewSlice(1, mesh) data.Copy(d, h) return d }
func main() { cuda.Init() N0, N1, N2 := 1, 64, 128 c := 1. mesh := data.NewMesh(N0, N1, N2, c/2, c*2, c) m := cuda.NewSlice(3, mesh) conv := cuda.NewDemag(mesh) cuda.Memset(m, 1, 1, 1) B := cuda.NewSlice(3, mesh) Bsat := 1. vol := data.NilSlice(1, mesh) conv.Exec(B, m, vol, Bsat) out := B.HostCopy() bx := out.Vectors()[0][N0/2][N1/2][N2/2] by := out.Vectors()[1][N0/2][N1/2][N2/2] bz := out.Vectors()[2][N0/2][N1/2][N2/2] fmt.Println("demag tensor:", bx, by, bz) check(bx, -1) check(by, 0) check(bz, 0) fmt.Println("OK") }
// Set the simulation mesh to Nx x Ny x Nz cells of given size. // Can be set only once at the beginning of the simulation. func SetMesh(Nx, Ny, Nz int, cellSizeX, cellSizeY, cellSizeZ float64) { if mesh != nil { log.Fatal("mesh already set") } if Nx <= 1 { log.Fatal("mesh size X should be > 1, have: ", Nx) } mesh = data.NewMesh(Nz, Ny, Nx, cellSizeZ, cellSizeY, cellSizeX) log.Println("set mesh:", mesh.UserString()) initialize() }
func TestSliceFree(t *testing.T) { LockThread() N0, N1, N2 := 128, 1024, 1024 c := 1e-6 m := data.NewMesh(N0, N1, N2, c, c, c) N := 17 // not freeing would attempt to allocate 17GB. for i := 0; i < N; i++ { a := NewSlice(2, m) a.Free() } a := NewSlice(2, m) a.Free() a.Free() // test double-free }
func TestSliceHost(t *testing.T) { LockThread() N0, N1, N2 := 1, 10, 10 c := 1e-6 m := data.NewMesh(N0, N1, N2, c, c, c) a := NewUnifiedSlice(3, m) defer a.Free() b := a.Host() if b[0][0] != 0 || b[1][42] != 0 || b[2][99] != 0 { t.Fail() } Memset(a, 1, 2, 3) b = a.Host() if b[0][0] != 1 || b[1][42] != 2 || b[2][99] != 3 { t.Fail() } }
func main() { cuda.Init() N0, N1, N2 := 16, 16, 16 c := 1. mesh := data.NewMesh(N0, N1, N2, c, c, c) m := cuda.NewSlice(3, mesh) conv := cuda.NewDemag(mesh) mhost := m.HostCopy() m_ := mhost.Vectors() r := float64(N2) / 2 for i := 0; i < N0; i++ { x := c * (float64(i) + 0.5 - float64(N0)/2) for j := 0; j < N1; j++ { y := c * (float64(j) + 0.5 - float64(N1)/2) for k := 0; k < N2; k++ { z := c * (float64(k) + 0.5 - float64(N2)/2) if x*x+y*y+z*z < r*r { m_[0][i][j][k] = 1 m_[1][i][j][k] = 2 m_[2][i][j][k] = 3 } } } } data.Copy(m, mhost) B := cuda.NewSlice(3, mesh) conv.Exec(B, m, data.NilSlice(1, mesh), 1) out := B.HostCopy() bx := out.Vectors()[0][N0/2][N1/2][N2/2] by := out.Vectors()[1][N0/2][N1/2][N2/2] bz := out.Vectors()[2][N0/2][N1/2][N2/2] fmt.Println("demag tensor:", bx, by/2, bz/3) check(bx, -1./3.) check(by, -2./3.) check(bz, -3./3.) fmt.Println("OK") }
// BruteKernel calculates the magnetostatic kernel by brute-force integration
// of magnetic charges over the faces and averages over cell volumes.
// Mesh should NOT yet be zero-padded: the function derives the kernel mesh
// itself from padSize.
//
// accuracy must be > 0; a larger value yields a smaller maximum integration
// element (d / accuracy) and therefore more integration points.
//
// The returned kernel[s][d] is indexed by source component s and destination
// component d. Only the upper triangle (d >= s) is computed; the lower
// triangle entries are set to the very same slices. For a mesh whose first
// dimension is 1, kernel[0][1] and kernel[0][2] (and their mirrored entries)
// are nil.
//
// NOTE(review): the pole/R2 indexing assumes X, Y, Z are the indices 0, 1, 2
// of the size/cellsize arrays in some order; their definitions are not
// visible here -- confirm.
func BruteKernel(mesh *data.Mesh, accuracy float64) (kernel [3][3]*data.Slice) {

	{ // Replace mesh by the kernel mesh. Per the original note the kernel mesh
		// is 2x larger than the input, unless PBC is used; the actual size is
		// whatever padSize returns.
		pbc := mesh.PBC()
		util.Argument(pbc == [3]int{0, 0, 0}) // PBC not supported yet
		sz := padSize(mesh.Size(), pbc)
		cs := mesh.CellSize()
		mesh = data.NewMesh(sz[0], sz[1], sz[2], cs[0], cs[1], cs[2], pbc[:]...)
	}

	// Shorthand
	size := mesh.Size()
	cellsize := mesh.CellSize()
	periodic := mesh.PBC()
	log.Println("calculating demag kernel:", "accuracy:", accuracy, ", size:", size[0], "x", size[1], "x", size[2])

	// Sanity check
	{
		util.Assert(size[0] > 0 && size[1] > 1 && size[2] > 1)
		util.Assert(cellsize[0] > 0 && cellsize[1] > 0 && cellsize[2] > 0)
		util.Assert(periodic[0] >= 0 && periodic[1] >= 0 && periodic[2] >= 0)
		util.Assert(accuracy > 0)
		// TODO: in case of PBC, this will not be met:
		util.Assert(size[1]%2 == 0 && size[2]%2 == 0)
		if size[0] > 1 {
			util.Assert(size[0]%2 == 0)
		}
	}

	// Allocate only upper diagonal part. The rest is symmetric due to reciprocity.
	// array[i][j] is the Scalars() view of kernel[i][j], used for element writes below.
	var array [3][3][][][]float32
	for i := 0; i < 3; i++ {
		for j := i; j < 3; j++ {
			kernel[i][j] = data.NewSlice(1, mesh)
			array[i][j] = kernel[i][j].Scalars()
		}
	}

	// Field (destination) loop ranges: -(size-1)/2 ... size/2-1 in each dimension.
	x1, x2 := -(size[X]-1)/2, size[X]/2-1
	y1, y2 := -(size[Y]-1)/2, size[Y]/2-1
	z1, z2 := -(size[Z]-1)/2, size[Z]/2-1
	// support for 2D simulations (thickness 1):
	// size/2-1 would be -1 there, which would leave the x loop empty.
	if size[X] == 1 && periodic[X] == 0 {
		x2 = 0
	}

	{ // Repeat for PBC: scale each range by (number of periodic repetitions + 1).
		x1 *= (periodic[X] + 1)
		x2 *= (periodic[X] + 1)
		y1 *= (periodic[Y] + 1)
		y2 *= (periodic[Y] + 1)
		z1 *= (periodic[Z] + 1)
		z2 *= (periodic[Z] + 1)
	}

	// smallest cell dimension is our typical length scale
	L := cellsize[X]
	if cellsize[Y] < L {
		L = cellsize[Y]
	}
	if cellsize[Z] < L {
		L = cellsize[Z]
	}

	// Start brute integration
	// 9 nested loops, does that stress you out?
	// Fortunately, the 5 inner ones usually loop over just one element.
	// It might be nice to get rid of that branching though.
	var (
		R, R2  [3]float64 // R: destination cell center; R2: vector from the point charge to the integration point
		pole   [3]float64 // position of point charge on the surface
		points int        // counts used integration points
	)

	for s := 0; s < 3; s++ { // source index Ksdxyz
		u, v, w := s, (s+1)%3, (s+2)%3 // u = direction of source (s), v & w are the orthogonal directions

		for x := x1; x <= x2; x++ { // in each dimension, go from -(size-1)/2 to size/2 -1, wrapped.
			xw := wrap(x, size[X])
			R[X] = float64(x) * cellsize[X]

			for y := y1; y <= y2; y++ {
				yw := wrap(y, size[Y])
				R[Y] = float64(y) * cellsize[Y]

				for z := z1; z <= z2; z++ {
					zw := wrap(z, size[Z])
					R[Z] = float64(z) * cellsize[Z]

					// choose number of integration points depending on how far we are from source.
					dx, dy, dz := delta(x)*cellsize[X], delta(y)*cellsize[Y], delta(z)*cellsize[Z]
					d := math.Sqrt(dx*dx + dy*dy + dz*dz)
					if d == 0 {
						d = L // fall back to the typical length scale so maxSize stays nonzero
					}
					maxSize := d / accuracy // maximum acceptable integration size

					// Subdivisions per direction: cell size over maxSize, at least 1, rounded to nearest.
					nv := int(math.Max(cellsize[v]/maxSize, 1) + 0.5)
					nw := int(math.Max(cellsize[w]/maxSize, 1) + 0.5)
					nx := int(math.Max(cellsize[X]/maxSize, 1) + 0.5)
					ny := int(math.Max(cellsize[Y]/maxSize, 1) + 0.5)
					nz := int(math.Max(cellsize[Z]/maxSize, 1) + 0.5)
					// Stagger source and destination grids.
					// Massively improves accuracy. Could play with variations.
					// See note.
					nv *= 2
					nw *= 2

					util.Assert(nv > 0 && nw > 0 && nx > 0 && ny > 0 && nz > 0)

					// Each integration point carries an equal share of the face charge.
					scale := 1 / float64(nv*nw*nx*ny*nz)
					surface := cellsize[v] * cellsize[w] // the two directions perpendicular to direction s
					charge := surface * scale
					pu1 := cellsize[u] / 2. // positive pole center
					pu2 := -pu1             // negative pole center

					// Do surface integral over source cell, accumulate in B
					var B [3]float64
					for i := 0; i < nv; i++ {
						// midpoint of the i-th of nv equal intervals across the cell along v
						pv := -(cellsize[v] / 2.) + cellsize[v]/float64(2*nv) + float64(i)*(cellsize[v]/float64(nv))
						pole[v] = pv
						for j := 0; j < nw; j++ {
							// midpoint of the j-th of nw equal intervals across the cell along w
							pw := -(cellsize[w] / 2.) + cellsize[w]/float64(2*nw) + float64(j)*(cellsize[w]/float64(nw))
							pole[w] = pw

							// Do volume integral over destination cell
							for α := 0; α < nx; α++ {
								rx := R[X] - cellsize[X]/2 + cellsize[X]/float64(2*nx) + (cellsize[X]/float64(nx))*float64(α)

								for β := 0; β < ny; β++ {
									ry := R[Y] - cellsize[Y]/2 + cellsize[Y]/float64(2*ny) + (cellsize[Y]/float64(ny))*float64(β)

									for γ := 0; γ < nz; γ++ {
										rz := R[Z] - cellsize[Z]/2 + cellsize[Z]/float64(2*nz) + (cellsize[Z]/float64(nz))*float64(γ)
										points++

										// Contribution of the positive pole: charge * R2 / (4 pi r^3).
										pole[u] = pu1
										R2[X], R2[Y], R2[Z] = rx-pole[X], ry-pole[Y], rz-pole[Z]
										r := math.Sqrt(R2[X]*R2[X] + R2[Y]*R2[Y] + R2[Z]*R2[Z])
										qr := charge / (4 * math.Pi * r * r * r)
										bx := R2[X] * qr
										by := R2[Y] * qr
										bz := R2[Z] * qr

										// Contribution of the negative pole, same formula with opposite charge.
										pole[u] = pu2
										R2[X], R2[Y], R2[Z] = rx-pole[X], ry-pole[Y], rz-pole[Z]
										r = math.Sqrt(R2[X]*R2[X] + R2[Y]*R2[Y] + R2[Z]*R2[Z])
										qr = -charge / (4 * math.Pi * r * r * r)
										B[X] += (bx + R2[X]*qr) // addition ordered for accuracy
										B[Y] += (by + R2[Y]*qr)
										B[Z] += (bz + R2[Z]*qr)

									}
								}
							}
						}
					}
					// Store the upper-triangle components at the wrapped position.
					for d := s; d < 3; d++ { // destination index Ksdxyz
						// TODO: for PBC, need to add here
						array[s][d][xw][yw][zw] = float32(B[d])
					}
				}
			}
		}
	}
	log.Println("kernel used", points, "integration points")
	// for 2D these elements are zero:
	if size[0] == 1 {
		kernel[0][1] = nil
		kernel[0][2] = nil
	}
	// make result symmetric for tools that expect it so.
	// The lower triangle aliases the upper one (same slices, no copy).
	kernel[1][0] = kernel[0][1]
	kernel[2][0] = kernel[0][2]
	kernel[2][1] = kernel[1][2]
	return kernel
}
// TODO: spill if does not fit GPU? func makeFloats(size [3]int) *data.Slice { m := data.NewMesh(size[0], size[1], size[2], 1, 1, 1) return NewSlice(1, m) }