func Update2Sym(Cc, A, B *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } ok := true cr, cc := Cc.Size() ar, ac := A.Size() br, bc := B.Size() if cr*cc == 0 { return nil } P := ac E := cr if bits&gomas.TRANS != 0 && bits&gomas.TRANSA == 0 { bits |= gomas.TRANSA } switch { case bits&gomas.TRANSA != 0: ok = cr == cc && cr == ac && bc == ac && br == ar P = ar default: ok = cr == cc && cr == ar && br == ar && bc == ac } if !ok { return gomas.NewError(gomas.ESIZE, "Update2Sym") } if conf.NProc == 1 || conf.WB <= 0 || E <= conf.WB { syr2k(Cc, A, B, alpha, beta, bits, P, 0, E, conf) return nil } // parallelized wait := make(chan int, 4) _, nN := blocking(0, E, conf.WB) nT := 0 for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB, E) jE := blockIndex(j+1, nN, conf.WB, E) task := func(q chan int) { syr2k(Cc, A, B, alpha, beta, bits, P, jS, jE, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } for nT > 0 { <-wait nT -= 1 } return nil }
/* * Triangular matrix multiplication. */ func MultTrm(B, A *cmat.FloatMatrix, alpha float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } if B.Len() == 0 || A.Len() == 0 { return nil } ok := true ar, ac := A.Size() br, bc := B.Size() P := ac E := bc switch { case bits&gomas.RIGHT != 0: ok = bc == ar && ar == ac E = br case bits&gomas.LEFT != 0: fallthrough default: ok = ac == br && ar == ac } if !ok { return gomas.NewError(gomas.ESIZE, "MultTrm") } // single threaded if conf.NProc == 1 || conf.WB <= 0 || E < conf.WB/2 { trmm(B, A, alpha, bits, P, 0, E, conf) return nil } // parallelized wait := make(chan int, 4) _, nN := blocking(0, E, conf.WB/2) nT := 0 for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB/2, E) jL := blockIndex(j+1, nN, conf.WB/2, E) task := func(q chan int) { trmm(B, A, alpha, bits, P, jS, jL, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } for nT > 0 { <-wait nT -= 1 } return nil }
/* * General matrix-matrix multiplication. * * Computes C = beta*C + alpha*op(A)*op(B), where op is optional transpose operation * encoded in bits argument. Operand A is transposed if gomas.TRANSA bit is set in * bits. And operand B is transposed if gomas.TRANSB bit is set. * * Optional Config block defines blocking parameters for computation. */ func Mult(Cc, A, B *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } if A.Len() == 0 || B.Len() == 0 { return nil } ok := true cr, cc := Cc.Size() ar, ac := A.Size() br, bc := B.Size() P := ac L := cc E := cr switch bits & (gomas.TRANSA | gomas.TRANSB) { case gomas.TRANSA | gomas.TRANSB: ok = cr == ac && cc == br && ar == bc P = ar case gomas.TRANSA: ok = cr == ac && cc == bc && ar == br P = ar case gomas.TRANSB: ok = cr == ar && cc == br && ac == bc P = ac default: ok = cr == ar && cc == bc && ac == br } if !ok { return gomas.NewError(gomas.ESIZE, "Mult") } // single threaded if conf.NProc == 1 || conf.WB <= 0 || Cc.Len() < conf.WB*conf.WB { gemm(Cc, A, B, alpha, beta, bits, P, 0, L, 0, E, conf) return nil } // parallelized wait := make(chan int, 4) nM, nN := blocking(cr, cc, conf.WB) nT := int64(0) for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB, cc) jL := blockIndex(j+1, nN, conf.WB, cc) for i := 0; i < nM; i++ { iR := blockIndex(i, nM, conf.WB, cr) iE := blockIndex(i+1, nM, conf.WB, cr) task := func(q chan int) { gemm(Cc, A, B, alpha, beta, bits, P, jS, jL, iR, iE, conf) q <- 1 } nT += 1 conf.Sched.Schedule(gomas.NewTask(task, wait)) } } // wait for subtask to complete for nT > 0 { <-wait nT -= 1 } return nil }
func UpdateTrm(Cc, A, B *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } if A.Len() == 0 || B.Len() == 0 { return nil } ok := true cr, cc := Cc.Size() ar, ac := A.Size() br, bc := B.Size() P := ac L := cc E := cr switch bits & (gomas.TRANSA | gomas.TRANSB) { case gomas.TRANSA | gomas.TRANSB: ok = cr == ac && cc == br && ar == bc P = ar case gomas.TRANSA: ok = cr == ac && cc == bc && ar == br P = ar case gomas.TRANSB: ok = cr == ar && cc == br && ac == bc default: ok = cr == ar && cc == bc && ac == br } if !ok { return gomas.NewError(gomas.ESIZE, "UpdateTrm") } // single threaded if conf.NProc == 1 || conf.WB <= 0 || Cc.Len() < conf.WB*conf.WB { updtrm(Cc, A, B, alpha, beta, bits, P, 0, L, 0, E, conf) return nil } // parallelized wait := make(chan int, 4) nM, nN := blocking(cr, cc, conf.WB) nT := 0 if bits&gomas.UPPER != 0 { // by rows; upper trapezoidial for j := 0; j < nM; j++ { iR := blockIndex(j, nM, conf.WB, cr) iE := blockIndex(j+1, nM, conf.WB, cr) task := func(q chan int) { updtrm(Cc, A, B, alpha, beta, bits, P, iR, L, iR, iE, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } } else { // by columns; lower trapezoidial for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB, cc) jL := blockIndex(j+1, nN, conf.WB, cc) task := func(q chan int) { updtrm(Cc, A, B, alpha, beta, bits, P, jS, jL, jS, E, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } } // wait for subtasks to complete for nT > 0 { <-wait nT -= 1 } return nil }
/* * UpdateSym performs symmetric rank-k update C = beta*C + alpha*A*A.T or * C = beta*C + alpha*A.T*A if gomas.TRANS bit is set. */ func UpdateSym(c, a *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } ok := true cr, cc := c.Size() ar, ac := a.Size() if cr*cc == 0 { return nil } P := ac E := cr if bits&gomas.TRANS != 0 && bits&gomas.TRANSA == 0 { bits |= gomas.TRANSA } switch { case bits&gomas.TRANSA != 0: ok = cr == cc && cr == ac P = ar default: ok = cr == cc && cr == ar } if !ok { return gomas.NewError(gomas.ESIZE, "UpdateSym") } if conf.NProc == 1 || conf.WB <= 0 || E <= conf.WB { syrk(c, a, alpha, beta, bits, P, 0, E, conf) return nil } // parallelized var sbits int = 0 wait := make(chan int, 4) nM, nN := blocking(E, E, conf.WB) nT := 0 if bits&gomas.TRANS != 0 { sbits |= gomas.TRANSA } else { sbits |= gomas.TRANSB } if bits&gomas.LOWER != 0 { sbits |= gomas.LOWER for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB, E) jL := blockIndex(j+1, nN, conf.WB, E) // update lower trapezoidal/triangular blocks task := func(q chan int) { updtrm(c, a, a, alpha, beta, sbits, P, jS, jL, jS, E, conf) //syrk(c, a, alpha, beta, bits, P, jS, jL, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } } else { sbits |= gomas.UPPER for j := 0; j < nM; j++ { jS := blockIndex(j, nM, conf.WB, E) jL := blockIndex(j+1, nM, conf.WB, E) // update upper trapezoidal/triangular blocks task := func(q chan int) { updtrm(c, a, a, alpha, beta, sbits, P, jS, E, jS, jL, conf) //syrk(c, a, alpha, beta, bits, P, jS, jL, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } } for nT > 0 { <-wait nT -= 1 } return nil }