func main() { cuda.Init() N0, N1, N2 := 1, 64, 128 c := 1. mesh := data.NewMesh(N0, N1, N2, c/2, c*2, c) m := cuda.NewSlice(3, mesh) conv := cuda.NewDemag(mesh) cuda.Memset(m, 1, 1, 1) B := cuda.NewSlice(3, mesh) Bsat := 1. vol := data.NilSlice(1, mesh) conv.Exec(B, m, vol, Bsat) out := B.HostCopy() bx := out.Vectors()[0][N0/2][N1/2][N2/2] by := out.Vectors()[1][N0/2][N1/2][N2/2] bz := out.Vectors()[2][N0/2][N1/2][N2/2] fmt.Println("demag tensor:", bx, by, bz) check(bx, -1) check(by, 0) check(bz, 0) fmt.Println("OK") }
// returns host buffer for storing output before being flushed to disk. // takes one from the pool or allocates a new one when the pool is empty // and less than maxOutputQueLen buffers already are in use. func hostbuf() *data.Slice { select { case b := <-hBuf: cuda.Memset(b, 0, 0, 0) // not strictly needed return b default: if nOutBuf < maxOutputQueLen { nOutBuf++ return cuda.NewUnifiedSlice(3, mesh) } } panic("unreachable") }
// Memset with synchronization. func (b *buffered) memset(val ...float32) { s := b.Write() cuda.Memset(s, val...) b.WriteDone() }