/* * Solve a system of linear equations A*X = B or A.T*X = B with general N-by-N * matrix A using the LU factorization computed by LUFactor(). * * Arguments: * B On entry, the right hand side matrix B. On exit, the solution matrix X. * * A The factor L and U from the factorization A = P*L*U as computed by * LUFactor() * * pivots The pivot indices from LUFactor(). * * flags The indicator of the form of the system of equations. * If flags&TRANSA then system is transposed. All other values * indicate non transposed system. * * Compatible with lapack.DGETRS. */ func LUSolve(B, A *cmat.FloatMatrix, pivots Pivots, flags int, confs ...*gomas.Config) *gomas.Error { var err *gomas.Error = nil conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } ar, ac := A.Size() br, _ := B.Size() if ar != ac { return gomas.NewError(gomas.ENOTSQUARE, "SolveLU") } if br != ac { return gomas.NewError(gomas.ESIZE, "SolveLU") } if pivots != nil { applyPivots(B, pivots) } if flags&gomas.TRANSA != 0 { // transposed X = A.-1*B == (L.T*U.T).-1*B == U.-T*(L.-T*B) blasd.SolveTrm(B, A, 1.0, gomas.LOWER|gomas.UNIT|gomas.TRANSA, conf) blasd.SolveTrm(B, A, 1.0, gomas.UPPER|gomas.TRANSA, conf) } else { // non-transposed X = A.-1*B == (L*U).-1*B == U.-1*(L.-1*B) blasd.SolveTrm(B, A, 1.0, gomas.LOWER|gomas.UNIT, conf) blasd.SolveTrm(B, A, 1.0, gomas.UPPER, conf) } return err }
func Update2Sym(Cc, A, B *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } ok := true cr, cc := Cc.Size() ar, ac := A.Size() br, bc := B.Size() if cr*cc == 0 { return nil } P := ac E := cr if bits&gomas.TRANS != 0 && bits&gomas.TRANSA == 0 { bits |= gomas.TRANSA } switch { case bits&gomas.TRANSA != 0: ok = cr == cc && cr == ac && bc == ac && br == ar P = ar default: ok = cr == cc && cr == ar && br == ar && bc == ac } if !ok { return gomas.NewError(gomas.ESIZE, "Update2Sym") } if conf.NProc == 1 || conf.WB <= 0 || E <= conf.WB { syr2k(Cc, A, B, alpha, beta, bits, P, 0, E, conf) return nil } // parallelized wait := make(chan int, 4) _, nN := blocking(0, E, conf.WB) nT := 0 for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB, E) jE := blockIndex(j+1, nN, conf.WB, E) task := func(q chan int) { syr2k(Cc, A, B, alpha, beta, bits, P, jS, jE, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } for nT > 0 { <-wait nT -= 1 } return nil }
/* * Triangular matrix multiplication. */ func MultTrm(B, A *cmat.FloatMatrix, alpha float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } if B.Len() == 0 || A.Len() == 0 { return nil } ok := true ar, ac := A.Size() br, bc := B.Size() P := ac E := bc switch { case bits&gomas.RIGHT != 0: ok = bc == ar && ar == ac E = br case bits&gomas.LEFT != 0: fallthrough default: ok = ac == br && ar == ac } if !ok { return gomas.NewError(gomas.ESIZE, "MultTrm") } // single threaded if conf.NProc == 1 || conf.WB <= 0 || E < conf.WB/2 { trmm(B, A, alpha, bits, P, 0, E, conf) return nil } // parallelized wait := make(chan int, 4) _, nN := blocking(0, E, conf.WB/2) nT := 0 for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB/2, E) jL := blockIndex(j+1, nN, conf.WB/2, E) task := func(q chan int) { trmm(B, A, alpha, bits, P, jS, jL, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } for nT > 0 { <-wait nT -= 1 } return nil }
/* * Compute an LU factorization of a general M-by-N matrix without pivoting. * * Arguments: * A On entry, the M-by-N matrix to be factored. On exit the factors * L and U from factorization A = P*L*U, the unit diagonal elements * of L are not stored. * * nb Blocking factor for blocked invocations. If bn == 0 or * min(M,N) < nb unblocked algorithm is used. * * Returns: * LU factorization and error indicator. * * Compatible with lapack.DGETRF */ func luFactorNoPiv(A *cmat.FloatMatrix, confs ...*gomas.Config) *gomas.Error { var err *gomas.Error = nil conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } mlen := imin(m(A), n(A)) if mlen <= conf.LB || conf.LB == 0 { err = unblockedLUnoPiv(A, conf) } else { err = blockedLUnoPiv(A, conf.LB, conf) } return err }
func TestLU(t *testing.T) { N := 119 K := 41 nb := 0 A := cmat.NewMatrix(N, N) A0 := cmat.NewMatrix(N, N) B := cmat.NewMatrix(N, K) X := cmat.NewMatrix(N, K) unitrand := cmat.NewFloatUniformSource() A.SetFrom(unitrand) A0.Copy(A) B.SetFrom(unitrand) X.Copy(B) piv := lapackd.NewPivots(N) conf := gomas.DefaultConf() conf.LB = nb // R = lu(A) = P*L*U lapackd.LUFactor(A, piv, conf) // X = A.-1*B = U.-1*(L.-1*B) lapackd.LUSolve(X, A, piv, gomas.NONE) // B = B - A*X blasd.Mult(B, A0, X, -1.0, 1.0, gomas.NONE) nrm := lapackd.NormP(B, lapackd.NORM_ONE) t.Logf("Unblocked decomposition: nb=%d\n", conf.LB) t.Logf("N=%d ||B - A*X||_1: %e\n", N, nrm) // blocked conf.LB = 16 A.Copy(A0) B.SetFrom(unitrand) X.Copy(B) // lu(A) = P*L*U lapackd.LUFactor(A, piv, conf) // X = A.-1*B = U.-1*(L.-1*B) lapackd.LUSolve(X, A, piv, gomas.NONE) // B = B - A*X blasd.Mult(B, A0, X, -1.0, 1.0, gomas.NONE) nrm = lapackd.NormP(B, lapackd.NORM_ONE) t.Logf("Blocked decomposition: nb=%d\n", conf.LB) t.Logf("N=%d ||B - A*X||_1: %e\n", N, nrm) }
/* * Compute the Cholesky factorization of a symmetric positive definite * N-by-N matrix A. * * Arguments: * A On entry, the symmetric matrix A. If flags&UPPER the upper triangular part * of A contains the upper triangular part of the matrix A, and strictly * lower part A is not referenced. If flags&LOWER the lower triangular part * of a contains the lower triangular part of the matrix A. Likewise, the * strictly upper part of A is not referenced. On exit, factor U or L from the * Cholesky factorization A = U.T*U or A = L*L.T * * flags The matrix structure indicator, UPPER for upper tridiagonal and LOWER for * lower tridiagonal matrix. * * confs Optional blocking configuration. If not provided default blocking configuration * will be used. * * Compatible with lapack.DPOTRF */ func CHOLFactor(A *cmat.FloatMatrix, flags int, confs ...*gomas.Config) *gomas.Error { var err *gomas.Error = nil conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } ar, ac := A.Size() if ac != ar { return gomas.NewError(gomas.ENOTSQUARE, "DecomposeCHOL") } if ac < conf.LB || conf.LB == 0 { if flags&gomas.UPPER != 0 { err = unblockedUpperCHOL(A, flags, 0) } else { err = unblockedLowerCHOL(A, flags, 0) } } else { err = blockedCHOL(A, flags, conf) } return err }
/* * Solves a system system of linear equations A*X = B with symmetric positive * definite matrix A using the Cholesky factorization A = U.T*U or A = L*L.T * computed by DecomposeCHOL(). * * Arguments: * B On entry, the right hand side matrix B. On exit, the solution * matrix X. * * A The triangular factor U or L from Cholesky factorization as computed by * DecomposeCHOL(). * * flags Indicator of which factor is stored in A. If flags&UPPER then upper * triangle of A is stored. If flags&LOWER then lower triangle of A is * stored. * * Compatible with lapack.DPOTRS. */ func CHOLSolve(B, A *cmat.FloatMatrix, flags int, confs ...*gomas.Config) *gomas.Error { // A*X = B; X = A.-1*B == (LU).-1*B == U.-1*L.-1*B == U.-1*(L.-1*B) conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } ar, ac := A.Size() br, _ := B.Size() if ac != br || ar != ac { return gomas.NewError(gomas.ESIZE, "SolveCHOL") } if flags&gomas.UPPER != 0 { // X = (U.T*U).-1*B => U.-1*(U.-T*B) blasd.SolveTrm(B, A, 1.0, gomas.UPPER|gomas.TRANSA, conf) blasd.SolveTrm(B, A, 1.0, gomas.UPPER, conf) } else if flags&gomas.LOWER != 0 { // X = (L*L.T).-1*B = L.-T*(L.1*B) blasd.SolveTrm(B, A, 1.0, gomas.LOWER, conf) blasd.SolveTrm(B, A, 1.0, gomas.LOWER|gomas.TRANSA, conf) } return nil }
/* * General matrix-matrix multiplication. * * Computes C = beta*C + alpha*op(A)*op(B), where op is optional transpose operation * encoded in bits argument. Operand A is transposed if gomas.TRANSA bit is set in * bits. And operand B is transposed if gomas.TRANSB bit is set. * * Optional Config block defines blocking parameters for computation. */ func Mult(Cc, A, B *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } if A.Len() == 0 || B.Len() == 0 { return nil } ok := true cr, cc := Cc.Size() ar, ac := A.Size() br, bc := B.Size() P := ac L := cc E := cr switch bits & (gomas.TRANSA | gomas.TRANSB) { case gomas.TRANSA | gomas.TRANSB: ok = cr == ac && cc == br && ar == bc P = ar case gomas.TRANSA: ok = cr == ac && cc == bc && ar == br P = ar case gomas.TRANSB: ok = cr == ar && cc == br && ac == bc P = ac default: ok = cr == ar && cc == bc && ac == br } if !ok { return gomas.NewError(gomas.ESIZE, "Mult") } // single threaded if conf.NProc == 1 || conf.WB <= 0 || Cc.Len() < conf.WB*conf.WB { gemm(Cc, A, B, alpha, beta, bits, P, 0, L, 0, E, conf) return nil } // parallelized wait := make(chan int, 4) nM, nN := blocking(cr, cc, conf.WB) nT := int64(0) for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB, cc) jL := blockIndex(j+1, nN, conf.WB, cc) for i := 0; i < nM; i++ { iR := blockIndex(i, nM, conf.WB, cr) iE := blockIndex(i+1, nM, conf.WB, cr) task := func(q chan int) { gemm(Cc, A, B, alpha, beta, bits, P, jS, jL, iR, iE, conf) q <- 1 } nT += 1 conf.Sched.Schedule(gomas.NewTask(task, wait)) } } // wait for subtask to complete for nT > 0 { <-wait nT -= 1 } return nil }
func UpdateTrm(Cc, A, B *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } if A.Len() == 0 || B.Len() == 0 { return nil } ok := true cr, cc := Cc.Size() ar, ac := A.Size() br, bc := B.Size() P := ac L := cc E := cr switch bits & (gomas.TRANSA | gomas.TRANSB) { case gomas.TRANSA | gomas.TRANSB: ok = cr == ac && cc == br && ar == bc P = ar case gomas.TRANSA: ok = cr == ac && cc == bc && ar == br P = ar case gomas.TRANSB: ok = cr == ar && cc == br && ac == bc default: ok = cr == ar && cc == bc && ac == br } if !ok { return gomas.NewError(gomas.ESIZE, "UpdateTrm") } // single threaded if conf.NProc == 1 || conf.WB <= 0 || Cc.Len() < conf.WB*conf.WB { updtrm(Cc, A, B, alpha, beta, bits, P, 0, L, 0, E, conf) return nil } // parallelized wait := make(chan int, 4) nM, nN := blocking(cr, cc, conf.WB) nT := 0 if bits&gomas.UPPER != 0 { // by rows; upper trapezoidial for j := 0; j < nM; j++ { iR := blockIndex(j, nM, conf.WB, cr) iE := blockIndex(j+1, nM, conf.WB, cr) task := func(q chan int) { updtrm(Cc, A, B, alpha, beta, bits, P, iR, L, iR, iE, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } } else { // by columns; lower trapezoidial for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB, cc) jL := blockIndex(j+1, nN, conf.WB, cc) task := func(q chan int) { updtrm(Cc, A, B, alpha, beta, bits, P, jS, jL, jS, E, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } } // wait for subtasks to complete for nT > 0 { <-wait nT -= 1 } return nil }
/* * UpdateSym performs symmetric rank-k update C = beta*C + alpha*A*A.T or * C = beta*C + alpha*A.T*A if gomas.TRANS bit is set. */ func UpdateSym(c, a *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error { conf := gomas.DefaultConf() if len(confs) > 0 { conf = confs[0] } ok := true cr, cc := c.Size() ar, ac := a.Size() if cr*cc == 0 { return nil } P := ac E := cr if bits&gomas.TRANS != 0 && bits&gomas.TRANSA == 0 { bits |= gomas.TRANSA } switch { case bits&gomas.TRANSA != 0: ok = cr == cc && cr == ac P = ar default: ok = cr == cc && cr == ar } if !ok { return gomas.NewError(gomas.ESIZE, "UpdateSym") } if conf.NProc == 1 || conf.WB <= 0 || E <= conf.WB { syrk(c, a, alpha, beta, bits, P, 0, E, conf) return nil } // parallelized var sbits int = 0 wait := make(chan int, 4) nM, nN := blocking(E, E, conf.WB) nT := 0 if bits&gomas.TRANS != 0 { sbits |= gomas.TRANSA } else { sbits |= gomas.TRANSB } if bits&gomas.LOWER != 0 { sbits |= gomas.LOWER for j := 0; j < nN; j++ { jS := blockIndex(j, nN, conf.WB, E) jL := blockIndex(j+1, nN, conf.WB, E) // update lower trapezoidal/triangular blocks task := func(q chan int) { updtrm(c, a, a, alpha, beta, sbits, P, jS, jL, jS, E, conf) //syrk(c, a, alpha, beta, bits, P, jS, jL, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } } else { sbits |= gomas.UPPER for j := 0; j < nM; j++ { jS := blockIndex(j, nM, conf.WB, E) jL := blockIndex(j+1, nM, conf.WB, E) // update upper trapezoidal/triangular blocks task := func(q chan int) { updtrm(c, a, a, alpha, beta, sbits, P, jS, E, jS, jL, conf) //syrk(c, a, alpha, beta, bits, P, jS, jL, conf) q <- 1 } conf.Sched.Schedule(gomas.NewTask(task, wait)) nT += 1 } } for nT > 0 { <-wait nT -= 1 } return nil }