Пример #1
0
/*
 * LUSolve solves the linear system A*X = B, or A.T*X = B, for a square N-by-N
 * matrix A whose LU factors were produced by LUFactor().
 *
 * Arguments:
 *  B      Right hand side matrix on entry; overwritten with the solution X
 *         on exit.
 *
 *  A      The factors L and U of A = P*L*U as stored by LUFactor().
 *
 *  pivots Pivot indices from LUFactor(). When nil, no row interchanges are
 *         applied to B.
 *
 *  flags  Form of the system. The transposed system is selected when the
 *         gomas.TRANSA bit is set; every other value selects the
 *         non-transposed system.
 *
 *  confs  Optional configuration; only the first entry is used. The default
 *         configuration applies when none is given.
 *
 * Returns an ENOTSQUARE error when A is not square, an ESIZE error when the
 * row count of B differs from the order of A, and nil otherwise.
 *
 * Compatible with lapack.DGETRS.
 */
func LUSolve(B, A *cmat.FloatMatrix, pivots Pivots, flags int, confs ...*gomas.Config) *gomas.Error {
	cfg := gomas.DefaultConf()
	if len(confs) > 0 {
		cfg = confs[0]
	}

	rowsA, colsA := A.Size()
	rowsB, _ := B.Size()
	switch {
	case rowsA != colsA:
		return gomas.NewError(gomas.ENOTSQUARE, "SolveLU")
	case rowsB != colsA:
		return gomas.NewError(gomas.ESIZE, "SolveLU")
	}

	if pivots != nil {
		applyPivots(B, pivots)
	}

	// The unit lower triangular solve runs first, then the upper triangular
	// solve; both pick up the transpose bit when the transposed system is
	// requested.
	// NOTE(review): with A = P*L*U the transposed system is usually solved as
	// U.T first, then L.T, with the pivots applied last in reverse order (as
	// in DGETRS) — confirm that this ordering is intended for TRANSA.
	lowerBits := gomas.LOWER | gomas.UNIT
	upperBits := gomas.UPPER
	if flags&gomas.TRANSA != 0 {
		lowerBits |= gomas.TRANSA
		upperBits |= gomas.TRANSA
	}
	blasd.SolveTrm(B, A, 1.0, lowerBits, cfg)
	blasd.SolveTrm(B, A, 1.0, upperBits, cfg)
	return nil
}
Пример #2
0
/*
 * Update2Sym validates the operands and dispatches the syr2k kernel on the
 * square matrix Cc with operands A and B, either in one call or split into
 * blocks that run on the configured scheduler.
 * NOTE(review): presumably the symmetric rank-2k update
 * C = beta*C + alpha*A*B.T + alpha*B*A.T (or its transposed form); the
 * arithmetic lives in syr2k — confirm there.
 *
 * Arguments:
 *  Cc     The matrix to update; must be square. Nothing is done when it has
 *         no elements.
 *
 *  A, B   Operands of identical shape. Their row count must equal the order
 *         of Cc, or their column count when the transposed form is selected.
 *
 *  alpha, beta  Scalars handed to syr2k unchanged.
 *
 *  bits   Operation flags. gomas.TRANS implies gomas.TRANSA; gomas.TRANSA
 *         selects the transposed form. The (normalized) flags are passed on
 *         to syr2k.
 *
 *  confs  Optional configuration; only the first entry is used.
 *
 * Returns an ESIZE error on non-conforming sizes and nil otherwise.
 */
func Update2Sym(Cc, A, B *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error {
	cfg := gomas.DefaultConf()
	if len(confs) > 0 {
		cfg = confs[0]
	}

	rowsC, colsC := Cc.Size()
	rowsA, colsA := A.Size()
	rowsB, colsB := B.Size()
	if rowsC*colsC == 0 {
		return nil
	}

	// TRANS is accepted as a request for TRANSA.
	if bits&gomas.TRANS != 0 {
		bits |= gomas.TRANSA
	}

	// inner is the summation length, order the required order of Cc.
	inner, order := colsA, rowsA
	if bits&gomas.TRANSA != 0 {
		inner, order = rowsA, colsA
	}
	sameShape := rowsB == rowsA && colsB == colsA
	if rowsC != colsC || rowsC != order || !sameShape {
		return gomas.NewError(gomas.ESIZE, "Update2Sym")
	}

	extent := rowsC
	if cfg.NProc == 1 || cfg.WB <= 0 || extent <= cfg.WB {
		// single call over the whole range
		syr2k(Cc, A, B, alpha, beta, bits, inner, 0, extent, cfg)
		return nil
	}

	// One task per block of the range [0, extent); each task signals
	// completion on the done channel.
	done := make(chan int, 4)
	_, nblk := blocking(0, extent, cfg.WB)
	for k := 0; k < nblk; k++ {
		first := blockIndex(k, nblk, cfg.WB, extent)
		last := blockIndex(k+1, nblk, cfg.WB, extent)
		cfg.Sched.Schedule(gomas.NewTask(func(q chan int) {
			syr2k(Cc, A, B, alpha, beta, bits, inner, first, last, cfg)
			q <- 1
		}, done))
	}
	// wait until every scheduled task has reported
	for pending := nblk; pending > 0; pending-- {
		<-done
	}
	return nil
}
Пример #3
0
/*
 * Triangular matrix multiplication.
 *
 * MultTrm validates the operands and dispatches the trmm kernel, which
 * updates B in place using the square matrix A and the scalar alpha.
 *
 * Arguments:
 *  B      The matrix to update. Nothing is done when A or B has no elements.
 *
 *  A      Square matrix. With the gomas.RIGHT bit set its order must equal
 *         the column count of B; otherwise (gomas.LEFT or neither bit) its
 *         order must equal the row count of B.
 *
 *  alpha  Scalar handed to trmm unchanged.
 *
 *  bits   Operation flags, passed on to trmm.
 *
 *  confs  Optional configuration; only the first entry is used.
 *
 * Returns an ESIZE error on non-conforming sizes and nil otherwise.
 */
func MultTrm(B, A *cmat.FloatMatrix, alpha float64, bits int, confs ...*gomas.Config) *gomas.Error {
	conf := gomas.DefaultConf()
	if len(confs) > 0 {
		conf = confs[0]
	}

	if B.Len() == 0 || A.Len() == 0 {
		return nil
	}

	ar, ac := A.Size()
	br, bc := B.Size()
	P := ac
	// E is the extent that gets partitioned: columns of B when A is on the
	// left, rows of B when A is on the right.
	E := bc
	ok := ar == ac
	if bits&gomas.RIGHT != 0 {
		ok = ok && bc == ar
		E = br
	} else {
		ok = ok && ac == br
	}
	if !ok {
		return gomas.NewError(gomas.ESIZE, "MultTrm")
	}

	// Tasks work on half-sized blocks. Guard on the halved size rather than
	// on WB itself: WB == 1 halves to 0, which is not a usable block size
	// for the partitioning below.
	wb := conf.WB / 2

	// single threaded
	if conf.NProc == 1 || wb <= 0 || E < wb {
		trmm(B, A, alpha, bits, P, 0, E, conf)
		return nil
	}

	// parallelized
	wait := make(chan int, 4)
	_, nN := blocking(0, E, wb)
	nT := 0
	for j := 0; j < nN; j++ {
		jS := blockIndex(j, nN, wb, E)
		jL := blockIndex(j+1, nN, wb, E)
		task := func(q chan int) {
			trmm(B, A, alpha, bits, P, jS, jL, conf)
			q <- 1
		}
		conf.Sched.Schedule(gomas.NewTask(task, wait))
		nT += 1
	}
	// wait for subtasks to complete
	for nT > 0 {
		<-wait
		nT -= 1
	}
	return nil
}
Пример #4
0
/*
 * Compute an LU factorization of a general M-by-N matrix without pivoting.
 *
 * Arguments:
 *   A      On entry, the M-by-N matrix to be factored. On exit the factors
 *          L and U of the factorization; the unit diagonal elements of L are
 *          not stored.
 *
 *   confs  Optional configuration; only the first entry is used. Its LB
 *          field is the blocking factor: the unblocked algorithm is used
 *          when LB == 0 or min(M,N) <= LB, the blocked one otherwise.
 *
 * Returns:
 *  The error indicator of the selected algorithm.
 *
 * NOTE(review): the original header claimed compatibility with
 * lapack.DGETRF, which performs row pivoting — confirm the claim for this
 * non-pivoting variant.
 */
func luFactorNoPiv(A *cmat.FloatMatrix, confs ...*gomas.Config) *gomas.Error {
	cfg := gomas.DefaultConf()
	if len(confs) > 0 {
		cfg = confs[0]
	}

	// the smaller dimension decides whether blocking is worthwhile
	short := imin(m(A), n(A))
	if cfg.LB != 0 && short > cfg.LB {
		return blockedLUnoPiv(A, cfg.LB, cfg)
	}
	return unblockedLUnoPiv(A, cfg)
}
Пример #5
0
/*
 * TestLU factors a random N-by-N matrix with LUFactor, once with blocking
 * factor 0 and once with blocking factor 16, solves for K right hand sides
 * with LUSolve and logs the one-norm of B - A*X. The norms are only logged;
 * the test asserts nothing.
 */
func TestLU(t *testing.T) {
	const (
		N = 119
		K = 41
	)
	nb := 0

	A := cmat.NewMatrix(N, N)
	A0 := cmat.NewMatrix(N, N)
	B := cmat.NewMatrix(N, K)
	X := cmat.NewMatrix(N, K)

	src := cmat.NewFloatUniformSource()
	A.SetFrom(src)
	A0.Copy(A)
	B.SetFrom(src)
	X.Copy(B)
	piv := lapackd.NewPivots(N)

	conf := gomas.DefaultConf()

	// solveAndSubtract runs the sequence shared by both passes:
	//   lu(A) = P*L*U
	//   X = A.-1*B = U.-1*(L.-1*B)
	//   B = B - A0*X
	solveAndSubtract := func() {
		lapackd.LUFactor(A, piv, conf)
		lapackd.LUSolve(X, A, piv, gomas.NONE)
		blasd.Mult(B, A0, X, -1.0, 1.0, gomas.NONE)
	}

	// first pass: blocking factor nb
	conf.LB = nb
	solveAndSubtract()
	nrm := lapackd.NormP(B, lapackd.NORM_ONE)
	t.Logf("Unblocked decomposition: nb=%d\n", conf.LB)
	t.Logf("N=%d  ||B - A*X||_1: %e\n", N, nrm)

	// second pass: blocking factor 16 on the restored matrix and a fresh
	// right hand side
	conf.LB = 16
	A.Copy(A0)
	B.SetFrom(src)
	X.Copy(B)
	solveAndSubtract()
	nrm = lapackd.NormP(B, lapackd.NORM_ONE)
	t.Logf("Blocked decomposition: nb=%d\n", conf.LB)
	t.Logf("N=%d  ||B - A*X||_1: %e\n", N, nrm)
}
Пример #6
0
/*
 * Compute the Cholesky factorization of a symmetric positive definite
 * N-by-N matrix A.
 *
 * Arguments:
 *  A     On entry, the symmetric matrix A. If flags&UPPER the upper triangular
 *        part of A holds the upper triangular part of the matrix and the
 *        strictly lower part is not referenced. If flags&LOWER the lower
 *        triangular part of A holds the lower triangular part of the matrix
 *        and the strictly upper part is not referenced. On exit, the factor
 *        U or L of the Cholesky factorization A = U.T*U or A = L*L.T
 *
 *  flags The matrix structure indicator, UPPER for upper triangular and LOWER
 *        for lower triangular storage. In the unblocked path any value
 *        without the UPPER bit is handled as lower triangular.
 *
 *  confs Optional blocking configuration; only the first entry is used. If
 *        not provided the default blocking configuration is used. The
 *        unblocked algorithm runs when LB == 0 or N < LB, the blocked one
 *        otherwise.
 *
 * Returns an ENOTSQUARE error when A is not square, otherwise the error
 * indicator of the selected algorithm.
 *
 * Compatible with lapack.DPOTRF
 */
func CHOLFactor(A *cmat.FloatMatrix, flags int, confs ...*gomas.Config) *gomas.Error {
	cfg := gomas.DefaultConf()
	if len(confs) > 0 {
		cfg = confs[0]
	}

	rows, cols := A.Size()
	if rows != cols {
		return gomas.NewError(gomas.ENOTSQUARE, "DecomposeCHOL")
	}

	switch {
	case cfg.LB != 0 && cols >= cfg.LB:
		// large enough for the blocked algorithm
		return blockedCHOL(A, flags, cfg)
	case flags&gomas.UPPER != 0:
		return unblockedUpperCHOL(A, flags, 0)
	default:
		return unblockedLowerCHOL(A, flags, 0)
	}
}
Пример #7
0
/*
 * Solves a system of linear equations A*X = B with symmetric positive
 * definite matrix A using the Cholesky factorization A = U.T*U or A = L*L.T
 * computed by CHOLFactor().
 *
 * Arguments:
 *  B   On entry, the right hand side matrix B. On exit, the solution
 *      matrix X.
 *
 *  A   The triangular factor U or L from Cholesky factorization as computed by
 *      CHOLFactor(). Must be square with order equal to the row count of B.
 *
 *  flags Indicator of which factor is stored in A. If flags&UPPER then upper
 *        triangle of A is stored. Otherwise the lower triangle is assumed,
 *        matching the default of CHOLFactor() for flags without the UPPER
 *        bit.
 *
 *  confs Optional configuration; only the first entry is used.
 *
 * Returns an ESIZE error on non-conforming sizes and nil otherwise.
 *
 * Compatible with lapack.DPOTRS.
 */
func CHOLSolve(B, A *cmat.FloatMatrix, flags int, confs ...*gomas.Config) *gomas.Error {
	conf := gomas.DefaultConf()
	if len(confs) > 0 {
		conf = confs[0]
	}
	ar, ac := A.Size()
	br, _ := B.Size()
	if ac != br || ar != ac {
		return gomas.NewError(gomas.ESIZE, "SolveCHOL")
	}
	if flags&gomas.UPPER != 0 {
		// X = (U.T*U).-1*B = U.-1*(U.-T*B)
		blasd.SolveTrm(B, A, 1.0, gomas.UPPER|gomas.TRANSA, conf)
		blasd.SolveTrm(B, A, 1.0, gomas.UPPER, conf)
		return nil
	}
	// Lower factor, also the fallback when neither UPPER nor LOWER is set, so
	// that B is never returned unsolved with a nil error.
	// X = (L*L.T).-1*B = L.-T*(L.-1*B)
	blasd.SolveTrm(B, A, 1.0, gomas.LOWER, conf)
	blasd.SolveTrm(B, A, 1.0, gomas.LOWER|gomas.TRANSA, conf)
	return nil
}
Пример #8
0
/*
 * General matrix-matrix multiplication.
 *
 * Computes C = beta*C + alpha*op(A)*op(B), where op is an optional transpose
 * selected through the bits argument: operand A is transposed when the
 * gomas.TRANSA bit is set and operand B when the gomas.TRANSB bit is set.
 *
 * Nothing is done, and nil is returned, when A or B has no elements. An ESIZE
 * error is returned when the shapes of C, op(A) and op(B) do not conform.
 *
 * The optional Config block defines blocking parameters for the computation;
 * only the first entry is used. The work is done by a single gemm call when
 * NProc == 1, WB <= 0 or C has fewer than WB*WB elements; otherwise C is cut
 * into blocks and one gemm task per block is run on the configured scheduler.
 */
func Mult(Cc, A, B *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error {
	cfg := gomas.DefaultConf()
	if len(confs) > 0 {
		cfg = confs[0]
	}

	if A.Len() == 0 || B.Len() == 0 {
		return nil
	}

	rowsC, colsC := Cc.Size()
	rowsA, colsA := A.Size()
	rowsB, colsB := B.Size()

	// Effective dimensions: op(A) is m-by-k and op(B) is kb-by-nn.
	var m, k, kb, nn int
	switch bits & (gomas.TRANSA | gomas.TRANSB) {
	case gomas.TRANSA | gomas.TRANSB:
		m, k, kb, nn = colsA, rowsA, colsB, rowsB
	case gomas.TRANSA:
		m, k, kb, nn = colsA, rowsA, rowsB, colsB
	case gomas.TRANSB:
		m, k, kb, nn = rowsA, colsA, colsB, rowsB
	default:
		m, k, kb, nn = rowsA, colsA, rowsB, colsB
	}
	if rowsC != m || colsC != nn || k != kb {
		return gomas.NewError(gomas.ESIZE, "Mult")
	}

	// small problem or no parallelism configured: one call over all of C
	if cfg.NProc == 1 || cfg.WB <= 0 || Cc.Len() < cfg.WB*cfg.WB {
		gemm(Cc, A, B, alpha, beta, bits, k, 0, colsC, 0, rowsC, cfg)
		return nil
	}

	// One task per (row block, column block) pair of C; every task signals
	// completion on the done channel.
	done := make(chan int, 4)
	rowBlocks, colBlocks := blocking(rowsC, colsC, cfg.WB)
	pending := 0
	for jb := 0; jb < colBlocks; jb++ {
		colFirst := blockIndex(jb, colBlocks, cfg.WB, colsC)
		colLast := blockIndex(jb+1, colBlocks, cfg.WB, colsC)
		for ib := 0; ib < rowBlocks; ib++ {
			rowFirst := blockIndex(ib, rowBlocks, cfg.WB, rowsC)
			rowLast := blockIndex(ib+1, rowBlocks, cfg.WB, rowsC)
			pending++
			cfg.Sched.Schedule(gomas.NewTask(func(q chan int) {
				gemm(Cc, A, B, alpha, beta, bits, k, colFirst, colLast, rowFirst, rowLast, cfg)
				q <- 1
			}, done))
		}
	}
	// collect one completion signal per scheduled task
	for ; pending > 0; pending-- {
		<-done
	}
	return nil
}
Пример #9
0
/*
 * UpdateTrm validates the operands and dispatches the updtrm kernel on Cc with
 * operands A and B, either in one call or split into row or column blocks
 * that run on the configured scheduler.
 * NOTE(review): presumably updates only the upper or lower
 * triangular/trapezoidal part of C with beta*C + alpha*op(A)*op(B); the
 * arithmetic lives in updtrm — confirm there.
 *
 * Arguments:
 *  Cc     The matrix to update.
 *
 *  A, B   Operands. Nothing is done when either has no elements. With
 *         op(A) transposed for gomas.TRANSA and op(B) transposed for
 *         gomas.TRANSB, Cc must be rows(op(A))-by-cols(op(B)) and
 *         cols(op(A)) must equal rows(op(B)).
 *
 *  alpha, beta  Scalars handed to updtrm unchanged.
 *
 *  bits   Operation flags, passed on to updtrm. In the parallel path the
 *         gomas.UPPER bit selects partitioning by row blocks; otherwise the
 *         partitioning is by column blocks.
 *
 *  confs  Optional configuration; only the first entry is used.
 *
 * Returns an ESIZE error on non-conforming sizes and nil otherwise.
 */
func UpdateTrm(Cc, A, B *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error {
	cfg := gomas.DefaultConf()
	if len(confs) > 0 {
		cfg = confs[0]
	}
	if A.Len() == 0 || B.Len() == 0 {
		return nil
	}

	rowsC, colsC := Cc.Size()
	rowsA, colsA := A.Size()
	rowsB, colsB := B.Size()

	// Effective dimensions: op(A) is m-by-k and op(B) is kb-by-nn.
	var m, k, kb, nn int
	switch bits & (gomas.TRANSA | gomas.TRANSB) {
	case gomas.TRANSA | gomas.TRANSB:
		m, k, kb, nn = colsA, rowsA, colsB, rowsB
	case gomas.TRANSA:
		m, k, kb, nn = colsA, rowsA, rowsB, colsB
	case gomas.TRANSB:
		m, k, kb, nn = rowsA, colsA, colsB, rowsB
	default:
		m, k, kb, nn = rowsA, colsA, rowsB, colsB
	}
	if rowsC != m || colsC != nn || k != kb {
		return gomas.NewError(gomas.ESIZE, "UpdateTrm")
	}

	// small problem or no parallelism configured: one call over all of C
	if cfg.NProc == 1 || cfg.WB <= 0 || Cc.Len() < cfg.WB*cfg.WB {
		updtrm(Cc, A, B, alpha, beta, bits, k, 0, colsC, 0, rowsC, cfg)
		return nil
	}

	done := make(chan int, 4)
	rowBlocks, colBlocks := blocking(rowsC, colsC, cfg.WB)
	pending := 0
	// submit schedules one unit of work that signals done when finished.
	submit := func(work func()) {
		cfg.Sched.Schedule(gomas.NewTask(func(q chan int) {
			work()
			q <- 1
		}, done))
		pending++
	}

	if bits&gomas.UPPER != 0 {
		// by rows; each task covers one row block, from its first row index
		// as starting column through the last column
		for b := 0; b < rowBlocks; b++ {
			top := blockIndex(b, rowBlocks, cfg.WB, rowsC)
			bottom := blockIndex(b+1, rowBlocks, cfg.WB, rowsC)
			submit(func() {
				updtrm(Cc, A, B, alpha, beta, bits, k, top, colsC, top, bottom, cfg)
			})
		}
	} else {
		// by columns; each task covers one column block, from its first
		// column index as starting row through the last row
		for b := 0; b < colBlocks; b++ {
			left := blockIndex(b, colBlocks, cfg.WB, colsC)
			right := blockIndex(b+1, colBlocks, cfg.WB, colsC)
			submit(func() {
				updtrm(Cc, A, B, alpha, beta, bits, k, left, right, left, rowsC, cfg)
			})
		}
	}

	// collect one completion signal per scheduled task
	for ; pending > 0; pending-- {
		<-done
	}
	return nil
}
Пример #10
0
/*
 * UpdateSym performs symmetric rank-k update C = beta*C + alpha*A*A.T or
 * C = beta*C + alpha*A.T*A if the gomas.TRANS or gomas.TRANSA bit is set.
 *
 * Arguments:
 *  c      The square matrix to update. Nothing is done when it has no
 *         elements.
 *
 *  a      The operand. Its row count must equal the order of c, or its
 *         column count when the transposed form is selected.
 *
 *  alpha, beta  Scalars handed to the kernels unchanged.
 *
 *  bits   Operation flags. gomas.TRANS implies gomas.TRANSA. In the parallel
 *         path the gomas.LOWER bit selects column-block tasks on the lower
 *         part; otherwise row-block tasks on the upper part are used.
 *
 *  confs  Optional configuration; only the first entry is used. A single
 *         syrk call is made when NProc == 1, WB <= 0 or the order of c is at
 *         most WB; otherwise block tasks calling updtrm are run on the
 *         configured scheduler.
 *
 * Returns an ESIZE error on non-conforming sizes and nil otherwise.
 */
func UpdateSym(c, a *cmat.FloatMatrix, alpha, beta float64, bits int, confs ...*gomas.Config) *gomas.Error {

	conf := gomas.DefaultConf()
	if len(confs) > 0 {
		conf = confs[0]
	}

	cr, cc := c.Size()
	ar, ac := a.Size()
	if cr*cc == 0 {
		return nil
	}
	P := ac
	E := cr
	if bits&gomas.TRANS != 0 && bits&gomas.TRANSA == 0 {
		bits |= gomas.TRANSA
	}
	// From here on TRANSA alone decides the form of the update; it is set
	// whenever the caller passed TRANS or TRANSA.
	transposed := bits&gomas.TRANSA != 0

	ok := cr == cc && cr == ar
	if transposed {
		ok = cr == cc && cr == ac
		P = ar
	}
	if !ok {
		return gomas.NewError(gomas.ESIZE, "UpdateSym")
	}
	if conf.NProc == 1 || conf.WB <= 0 || E <= conf.WB {
		syrk(c, a, alpha, beta, bits, P, 0, E, conf)
		return nil
	}

	// parallelized
	// The block kernel gets a as both operands: TRANSA yields A.T*A, TRANSB
	// yields A*A.T. The choice must follow the same flag as the size check
	// and P above, otherwise a caller passing only TRANSA would get A*A.T
	// blocks computed with the transposed inner dimension.
	var sbits int
	if transposed {
		sbits |= gomas.TRANSA
	} else {
		sbits |= gomas.TRANSB
	}

	wait := make(chan int, 4)
	nM, nN := blocking(E, E, conf.WB)
	nT := 0
	if bits&gomas.LOWER != 0 {
		sbits |= gomas.LOWER
		for j := 0; j < nN; j++ {
			jS := blockIndex(j, nN, conf.WB, E)
			jL := blockIndex(j+1, nN, conf.WB, E)
			// update lower trapezoidal/triangular blocks
			task := func(q chan int) {
				updtrm(c, a, a, alpha, beta, sbits, P, jS, jL, jS, E, conf)
				q <- 1
			}
			conf.Sched.Schedule(gomas.NewTask(task, wait))
			nT += 1
		}
	} else {
		sbits |= gomas.UPPER
		for j := 0; j < nM; j++ {
			jS := blockIndex(j, nM, conf.WB, E)
			jL := blockIndex(j+1, nM, conf.WB, E)
			// update upper trapezoidal/triangular blocks
			task := func(q chan int) {
				updtrm(c, a, a, alpha, beta, sbits, P, jS, E, jS, jL, conf)
				q <- 1
			}
			conf.Sched.Schedule(gomas.NewTask(task, wait))
			nT += 1
		}
	}
	// wait for subtasks to complete
	for nT > 0 {
		<-wait
		nT -= 1
	}
	return nil
}