Esempio n. 1
0
// Drscl multiplies the vector x by 1/a being careful to avoid overflow or
// underflow where possible.
func (impl Implementation) Drscl(n int, a float64, x []float64, incX int) {
	checkVector(n, x, incX)
	bi := blas64.Implementation()
	cden := a
	cnum := 1.0
	smlnum := dlamchS
	bignum := 1 / smlnum
	for {
		cden1 := cden * smlnum
		cnum1 := cnum / bignum
		var mul float64
		var done bool
		switch {
		case cnum != 0 && math.Abs(cden1) > math.Abs(cnum):
			mul = smlnum
			done = false
			cden = cden1
		case math.Abs(cnum1) > math.Abs(cden):
			mul = bignum
			done = false
			cnum = cnum1
		default:
			mul = cnum / cden
			done = true
		}
		bi.Dscal(n, mul, x, incX)
		if done {
			break
		}
	}
}
Esempio n. 2
0
// Dgetrs solves a system of equations using an LU factorization.
// The system of equations solved is
//  A * X = B if trans == blas.Trans
//  A^T * X = B if trans == blas.NoTrans
// A is a general n×n matrix with stride lda. B is a general matrix of size n×nrhs.
//
// On entry b contains the elements of the matrix B. On exit, b contains the
// elements of X, the solution to the system of equations.
//
// a and ipiv contain the LU factorization of A and the permutation indices as
// computed by Dgetrf. ipiv is zero-indexed.
func (impl Implementation) Dgetrs(trans blas.Transpose, n, nrhs int, a []float64, lda int, ipiv []int, b []float64, ldb int) {
	checkMatrix(n, n, a, lda)
	checkMatrix(n, nrhs, b, ldb)
	if len(ipiv) < n {
		panic(badIpiv)
	}
	if n == 0 || nrhs == 0 {
		return
	}
	if trans != blas.Trans && trans != blas.NoTrans {
		panic(badTrans)
	}
	bi := blas64.Implementation()
	if trans == blas.NoTrans {
		// Solve A * X = B.
		impl.Dlaswp(nrhs, b, ldb, 0, n-1, ipiv, 1)
		// Solve L * X = B, updating b.
		bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.Unit,
			n, nrhs, 1, a, lda, b, ldb)
		// Solve U * X = B, updating b.
		bi.Dtrsm(blas.Left, blas.Upper, blas.NoTrans, blas.NonUnit,
			n, nrhs, 1, a, lda, b, ldb)
		return
	}
	// Solve A^T * X = B.
	// Solve U^T * X = B, updating b.
	bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit,
		n, nrhs, 1, a, lda, b, ldb)
	// Solve L^T * X = B, updating b.
	bi.Dtrsm(blas.Left, blas.Lower, blas.Trans, blas.Unit,
		n, nrhs, 1, a, lda, b, ldb)
	impl.Dlaswp(nrhs, b, ldb, 0, n-1, ipiv, -1)
}
Esempio n. 3
0
// Dlarf applies an elementary reflector to a general rectangular matrix c.
// This computes
//  c = h * c if side == Left
//  c = c * h if side == right
// where
//  h = 1 - tau * v * v^T
// and c is an m * n matrix.
//
// work is temporary storage of length at least m if side == Left and at least
// n if side == Right. This function will panic if this length requirement is not met.
//
// Dlarf is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dlarf(side blas.Side, m, n int, v []float64, incv int, tau float64, c []float64, ldc int, work []float64) {
	applyleft := side == blas.Left
	if (applyleft && len(work) < n) || (!applyleft && len(work) < m) {
		panic(badWork)
	}
	checkMatrix(m, n, c, ldc)

	// v has length m if applyleft and n otherwise.
	lenV := n
	if applyleft {
		lenV = m
	}

	checkVector(lenV, v, incv)

	lastv := 0 // last non-zero element of v
	lastc := 0 // last non-zero row/column of c
	if tau != 0 {
		var i int
		if applyleft {
			lastv = m - 1
		} else {
			lastv = n - 1
		}
		if incv > 0 {
			i = lastv * incv
		}

		// Look for the last non-zero row in v.
		for lastv >= 0 && v[i] == 0 {
			lastv--
			i -= incv
		}
		if applyleft {
			// Scan for the last non-zero column in C[0:lastv, :]
			lastc = impl.Iladlc(lastv+1, n, c, ldc)
		} else {
			// Scan for the last non-zero row in C[:, 0:lastv]
			lastc = impl.Iladlr(m, lastv+1, c, ldc)
		}
	}
	if lastv == -1 || lastc == -1 {
		return
	}
	// Sometimes 1-indexing is nicer ...
	bi := blas64.Implementation()
	if applyleft {
		// Form H * C
		// w[0:lastc+1] = c[1:lastv+1, 1:lastc+1]^T * v[1:lastv+1,1]
		bi.Dgemv(blas.Trans, lastv+1, lastc+1, 1, c, ldc, v, incv, 0, work, 1)
		// c[0: lastv, 0: lastc] = c[...] - w[0:lastv, 1] * v[1:lastc, 1]^T
		bi.Dger(lastv+1, lastc+1, -tau, v, incv, work, 1, c, ldc)
		return
	}
	// Form C*H
	// w[0:lastc+1,1] := c[0:lastc+1,0:lastv+1] * v[0:lastv+1,1]
	bi.Dgemv(blas.NoTrans, lastc+1, lastv+1, 1, c, ldc, v, incv, 0, work, 1)
	// c[0:lastc+1,0:lastv+1] = c[...] - w[0:lastc+1,0] * v[0:lastv+1,0]^T
	bi.Dger(lastc+1, lastv+1, -tau, work, 1, v, incv, c, ldc)
}
Esempio n. 4
0
// Dpotrf computes the cholesky decomposition of the symmetric positive definite
// matrix a. If ul == blas.Upper, then a is stored as an upper-triangular matrix,
// and a = U U^T is stored in place into a. If ul == blas.Lower, then a = L L^T
// is computed and stored in-place into a. If a is not positive definite, false
// is returned. This is the blocked version of the algorithm.
func (impl Implementation) Dpotrf(ul blas.Uplo, n int, a []float64, lda int) (ok bool) {
	bi := blas64.Implementation()
	if ul != blas.Upper && ul != blas.Lower {
		panic(badUplo)
	}
	if n < 0 {
		panic(nLT0)
	}
	if lda < n {
		panic(badLdA)
	}
	if n == 0 {
		return true
	}
	nb := impl.Ilaenv(1, "DPOTRF", string(ul), n, -1, -1, -1)
	if n <= nb {
		return impl.Dpotf2(ul, n, a, lda)
	}
	if ul == blas.Upper {
		for j := 0; j < n; j += nb {
			jb := min(nb, n-j)
			bi.Dsyrk(blas.Upper, blas.Trans, jb, j,
				-1, a[j:], lda,
				1, a[j*lda+j:], lda)
			ok = impl.Dpotf2(blas.Upper, jb, a[j*lda+j:], lda)
			if !ok {
				return ok
			}
			if j+jb < n {
				bi.Dgemm(blas.Trans, blas.NoTrans, jb, n-j-jb, j,
					-1, a[j:], lda, a[j+jb:], lda,
					1, a[j*lda+j+jb:], lda)
				bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, jb, n-j-jb,
					1, a[j*lda+j:], lda,
					a[j*lda+j+jb:], lda)
			}
		}
		return true
	}
	for j := 0; j < n; j += nb {
		jb := min(nb, n-j)
		bi.Dsyrk(blas.Lower, blas.NoTrans, jb, j,
			-1, a[j*lda:], lda,
			1, a[j*lda+j:], lda)
		ok := impl.Dpotf2(blas.Lower, jb, a[j*lda+j:], lda)
		if !ok {
			return ok
		}
		if j+jb < n {
			bi.Dgemm(blas.NoTrans, blas.Trans, n-j-jb, jb, j,
				-1, a[(j+jb)*lda:], lda, a[j*lda:], lda,
				1, a[(j+jb)*lda+j:], lda)
			bi.Dtrsm(blas.Right, blas.Lower, blas.Trans, blas.NonUnit, n-j-jb, jb,
				1, a[j*lda+j:], lda,
				a[(j+jb)*lda+j:], lda)
		}
	}
	return true
}
Esempio n. 5
0
// Dgecon estimates the reciprocal of the condition number of the n×n matrix A
// given the LU decomposition of the matrix. The condition number computed may
// be based on the 1-norm or the ∞-norm.
//
// The slice a contains the result of the LU decomposition of A as computed by Dgetrf.
//
// anorm is the corresponding 1-norm or ∞-norm of the original matrix A.
//
// work is a temporary data slice of length at least 4*n and Dgecon will panic otherwise.
//
// iwork is a temporary data slice of length at least n and Dgecon will panic otherwise.
func (impl Implementation) Dgecon(norm lapack.MatrixNorm, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 {
	checkMatrix(n, n, a, lda)
	if norm != lapack.MaxColumnSum && norm != lapack.MaxRowSum {
		panic(badNorm)
	}
	if len(work) < 4*n {
		panic(badWork)
	}
	if len(iwork) < n {
		panic(badWork)
	}

	if n == 0 {
		return 1
	} else if anorm == 0 {
		return 0
	}

	bi := blas64.Implementation()
	var rcond, ainvnm float64
	var kase int
	var normin bool
	isave := new([3]int)
	onenrm := norm == lapack.MaxColumnSum
	smlnum := dlamchS
	kase1 := 2
	if onenrm {
		kase1 = 1
	}
	for {
		ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave)
		if kase == 0 {
			if ainvnm != 0 {
				rcond = (1 / ainvnm) / anorm
			}
			return rcond
		}
		var sl, su float64
		if kase == kase1 {
			sl = impl.Dlatrs(blas.Lower, blas.NoTrans, blas.Unit, normin, n, a, lda, work, work[2*n:])
			su = impl.Dlatrs(blas.Upper, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[3*n:])
		} else {
			su = impl.Dlatrs(blas.Upper, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[3*n:])
			sl = impl.Dlatrs(blas.Lower, blas.Trans, blas.Unit, normin, n, a, lda, work, work[2*n:])
		}
		scale := sl * su
		normin = true
		if scale != 1 {
			ix := bi.Idamax(n, work, 1)
			if scale == 0 || scale < math.Abs(work[ix])*smlnum {
				return rcond
			}
			impl.Drscl(n, scale, work, 1)
		}
	}
}
Esempio n. 6
0
// Dgebak updates an n×m matrix V as
//  V = P D V,        if side == blas.Right,
//  V = P D^{-1} V,   if side == blas.Left,
// where P and D are n×n permutation and scaling matrices, respectively,
// implicitly represented by job, scale, ilo and ihi as returned by Dgebal.
//
// Typically, columns of the matrix V contain the right or left (determined by
// side) eigenvectors of the balanced matrix output by Dgebal, and Dgebak forms
// the eigenvectors of the original matrix.
//
// Dgebak is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dgebak(job lapack.Job, side blas.Side, n, ilo, ihi int, scale []float64, m int, v []float64, ldv int) {
	switch job {
	default:
		panic(badJob)
	case lapack.None, lapack.Permute, lapack.Scale, lapack.PermuteScale:
	}
	switch side {
	default:
		panic(badSide)
	case blas.Left, blas.Right:
	}
	checkMatrix(n, m, v, ldv)
	switch {
	case ilo < 0 || max(0, n-1) < ilo:
		panic(badIlo)
	case ihi < min(ilo, n-1) || n <= ihi:
		panic(badIhi)
	}

	// Quick return if possible.
	if n == 0 || m == 0 || job == lapack.None {
		return
	}

	bi := blas64.Implementation()
	if ilo != ihi && job != lapack.Permute {
		// Backward balance.
		if side == blas.Right {
			for i := ilo; i <= ihi; i++ {
				bi.Dscal(m, scale[i], v[i*ldv:], 1)
			}
		} else {
			for i := ilo; i <= ihi; i++ {
				bi.Dscal(m, 1/scale[i], v[i*ldv:], 1)
			}
		}
	}
	if job == lapack.Scale {
		return
	}
	// Backward permutation.
	for i := ilo - 1; i >= 0; i-- {
		k := int(scale[i])
		if k == i {
			continue
		}
		bi.Dswap(m, v[i*ldv:], 1, v[k*ldv:], 1)
	}
	for i := ihi + 1; i < n; i++ {
		k := int(scale[i])
		if k == i {
			continue
		}
		bi.Dswap(m, v[i*ldv:], 1, v[k*ldv:], 1)
	}
}
Esempio n. 7
0
// Dpotf2 computes the cholesky decomposition of the symmetric positive definite
// matrix a. If ul == blas.Upper, then a is stored as an upper-triangular matrix,
// and a = U^T U is stored in place into a. If ul == blas.Lower, then a = L L^T
// is computed and stored in-place into a. If a is not positive definite, false
// is returned. This is the unblocked version of the algorithm.
func (Implementation) Dpotf2(ul blas.Uplo, n int, a []float64, lda int) (ok bool) {
	if ul != blas.Upper && ul != blas.Lower {
		panic(badUplo)
	}
	if n < 0 {
		panic(nLT0)
	}
	if lda < n {
		panic(badLdA)
	}
	if n == 0 {
		return true
	}
	bi := blas64.Implementation()
	if ul == blas.Upper {
		for j := 0; j < n; j++ {
			ajj := a[j*lda+j]
			if j != 0 {
				ajj -= bi.Ddot(j, a[j:], lda, a[j:], lda)
			}
			if ajj <= 0 || math.IsNaN(ajj) {
				a[j*lda+j] = ajj
				return false
			}
			ajj = math.Sqrt(ajj)
			a[j*lda+j] = ajj
			if j < n-1 {
				bi.Dgemv(blas.Trans, j, n-j-1,
					-1, a[j+1:], lda, a[j:], lda,
					1, a[j*lda+j+1:], 1)
				bi.Dscal(n-j-1, 1/ajj, a[j*lda+j+1:], 1)
			}
		}
		return true
	}
	for j := 0; j < n; j++ {
		ajj := a[j*lda+j]
		if j != 0 {
			ajj -= bi.Ddot(j, a[j*lda:], 1, a[j*lda:], 1)
		}
		if ajj <= 0 || math.IsNaN(ajj) {
			a[j*lda+j] = ajj
			return false
		}
		ajj = math.Sqrt(ajj)
		a[j*lda+j] = ajj
		if j < n-1 {
			bi.Dgemv(blas.NoTrans, n-j-1, j,
				-1, a[(j+1)*lda:], lda, a[j*lda:], 1,
				1, a[(j+1)*lda+j:], lda)
			bi.Dscal(n-j-1, 1/ajj, a[(j+1)*lda+j:], lda)
		}
	}
	return true
}
Esempio n. 8
0
// Dpocon estimates the reciprocal of the condition number of a positive-definite
// matrix A given the Cholesky decmposition of A. The condition number computed
// is based on the 1-norm and the ∞-norm.
//
// anorm is the 1-norm and the ∞-norm of the original matrix A.
//
// work is a temporary data slice of length at least 3*n and Dpocon will panic otherwise.
//
// iwork is a temporary data slice of length at least n and Dpocon will panic otherwise.
func (impl Implementation) Dpocon(uplo blas.Uplo, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 {
	checkMatrix(n, n, a, lda)
	if uplo != blas.Upper && uplo != blas.Lower {
		panic(badUplo)
	}
	if len(work) < 3*n {
		panic(badWork)
	}
	if len(iwork) < n {
		panic(badWork)
	}
	var rcond float64
	if n == 0 {
		return 1
	}
	if anorm == 0 {
		return rcond
	}

	bi := blas64.Implementation()
	var ainvnm float64
	smlnum := dlamchS
	upper := uplo == blas.Upper
	var kase int
	var normin bool
	isave := new([3]int)
	var sl, su float64
	for {
		ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave)
		if kase == 0 {
			if ainvnm != 0 {
				rcond = (1 / ainvnm) / anorm
			}
			return rcond
		}
		if upper {
			sl = impl.Dlatrs(blas.Upper, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[2*n:])
			normin = true
			su = impl.Dlatrs(blas.Upper, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[2*n:])
		} else {
			sl = impl.Dlatrs(blas.Lower, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[2*n:])
			normin = true
			su = impl.Dlatrs(blas.Lower, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[2*n:])
		}
		scale := sl * su
		if scale != 1 {
			ix := bi.Idamax(n, work, 1)
			if scale == 0 || scale < math.Abs(work[ix])*smlnum {
				return rcond
			}
			impl.Drscl(n, scale, work, 1)
		}
	}
}
Esempio n. 9
0
func DpotrfTest(t *testing.T, impl Dpotrfer) {
	rnd := rand.New(rand.NewSource(1))
	bi := blas64.Implementation()
	for i, test := range []struct {
		n int
	}{
		{n: 10},
		{n: 30},
		{n: 63},
		{n: 65},
		{n: 128},
		{n: 1000},
	} {
		n := test.n
		// Construct a positive-definite symmetric matrix
		base := make([]float64, n*n)
		for i := range base {
			base[i] = rnd.Float64()
		}
		a := make([]float64, len(base))
		bi.Dgemm(blas.Trans, blas.NoTrans, n, n, n, 1, base, n, base, n, 0, a, n)

		aCopy := make([]float64, len(a))
		copy(aCopy, a)

		// Test with Upper
		impl.Dpotrf(blas.Upper, n, a, n)

		// zero all the other elements
		for i := 0; i < n; i++ {
			for j := 0; j < i; j++ {
				a[i*n+j] = 0
			}
		}
		// Multiply u^T * u
		ans := make([]float64, len(a))
		bi.Dsyrk(blas.Upper, blas.Trans, n, n, 1, a, n, 0, ans, n)

		match := true
		for i := 0; i < n; i++ {
			for j := i; j < n; j++ {
				if !floats.EqualWithinAbsOrRel(ans[i*n+j], aCopy[i*n+j], 1e-14, 1e-14) {
					match = false
				}
			}
		}
		if !match {
			//fmt.Println(aCopy)
			//fmt.Println(ans)
			t.Errorf("Case %v: Mismatch for upper", i)
		}
	}
}
Esempio n. 10
0
// Dlaswp swaps the rows k1 to k2 of a according to the indices in ipiv.
// a is a matrix with n columns and stride lda. incX is the increment for ipiv.
// k1 and k2 are zero-indexed. If incX is negative, then loops from k2 to k1
//
// Dlaswp is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dlaswp(n int, a []float64, lda, k1, k2 int, ipiv []int, incX int) {
	if incX != 1 && incX != -1 {
		panic(absIncNotOne)
	}
	bi := blas64.Implementation()
	if incX == 1 {
		for k := k1; k <= k2; k++ {
			bi.Dswap(n, a[k*lda:], 1, a[ipiv[k]*lda:], 1)
		}
		return
	}
	for k := k2; k >= k1; k-- {
		bi.Dswap(n, a[k*lda:], 1, a[ipiv[k]*lda:], 1)
	}
}
Esempio n. 11
0
// Dtrtri computes the inverse of a triangular matrix, storing the result in place
// into a. This is the BLAS level 3 version of the algorithm which builds upon
// Dtrti2 to operate on matrix blocks instead of only individual columns.
//
// Dtrtri will not perform the inversion if the matrix is singular, and returns
// a boolean indicating whether the inversion was successful.
func (impl Implementation) Dtrtri(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) (ok bool) {
	checkMatrix(n, n, a, lda)
	if uplo != blas.Upper && uplo != blas.Lower {
		panic(badUplo)
	}
	if diag != blas.NonUnit && diag != blas.Unit {
		panic(badDiag)
	}
	if n == 0 {
		return false
	}
	nonUnit := diag == blas.NonUnit
	if nonUnit {
		for i := 0; i < n; i++ {
			if a[i*lda+i] == 0 {
				return false
			}
		}
	}

	bi := blas64.Implementation()

	nb := impl.Ilaenv(1, "DTRTRI", "UD", n, -1, -1, -1)
	if nb <= 1 || nb > n {
		impl.Dtrti2(uplo, diag, n, a, lda)
		return true
	}
	if uplo == blas.Upper {
		for j := 0; j < n; j += nb {
			jb := min(nb, n-j)
			bi.Dtrmm(blas.Left, blas.Upper, blas.NoTrans, diag, j, jb, 1, a, lda, a[j:], lda)
			bi.Dtrsm(blas.Right, blas.Upper, blas.NoTrans, diag, j, jb, -1, a[j*lda+j:], lda, a[j:], lda)
			impl.Dtrti2(blas.Upper, diag, jb, a[j*lda+j:], lda)
		}
		return true
	}
	nn := ((n - 1) / nb) * nb
	for j := nn; j >= 0; j -= nb {
		jb := min(nb, n-j)
		if j+jb <= n-1 {
			bi.Dtrmm(blas.Left, blas.Lower, blas.NoTrans, diag, n-j-jb, jb, 1, a[(j+jb)*lda+j+jb:], lda, a[(j+jb)*lda+j:], lda)
			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, diag, n-j-jb, jb, -1, a[j*lda+j:], lda, a[(j+jb)*lda+j:], lda)
		}
		impl.Dtrti2(blas.Lower, diag, jb, a[j*lda+j:], lda)
	}
	return true
}
Esempio n. 12
0
// Dorg2r generates an m×n matrix Q with orthonormal columns defined by the
// product of elementary reflectors as computed by Dgeqrf.
//  Q = H_0 * H_1 * ... * H_{k-1}
// len(tau) >= k, 0 <= k <= n, 0 <= n <= m, len(work) >= n.
// Dorg2r will panic if these conditions are not met.
//
// Dorg2r is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dorg2r(m, n, k int, a []float64, lda int, tau []float64, work []float64) {
	checkMatrix(m, n, a, lda)
	if len(tau) < k {
		panic(badTau)
	}
	if len(work) < n {
		panic(badWork)
	}
	if k > n {
		panic(kGTN)
	}
	if n > m {
		panic(mLTN)
	}
	if len(work) < n {
		panic(badWork)
	}
	if n == 0 {
		return
	}
	bi := blas64.Implementation()
	// Initialize columns k+1:n to columns of the unit matrix.
	for l := 0; l < m; l++ {
		for j := k; j < n; j++ {
			a[l*lda+j] = 0
		}
	}
	for j := k; j < n; j++ {
		a[j*lda+j] = 1
	}
	for i := k - 1; i >= 0; i-- {
		for i := range work {
			work[i] = 0
		}
		if i < n-1 {
			a[i*lda+i] = 1
			impl.Dlarf(blas.Left, m-i, n-i-1, a[i*lda+i:], lda, tau[i], a[i*lda+i+1:], lda, work)
		}
		if i < m-1 {
			bi.Dscal(m-i-1, -tau[i], a[(i+1)*lda+i:], lda)
		}
		a[i*lda+i] = 1 - tau[i]
		for l := 0; l < i; l++ {
			a[l*lda+i] = 0
		}
	}
}
Esempio n. 13
0
// Dtrtrs solves a triangular system of the form A * X = B or A^T * X = B. Dtrtrs
// returns whether the solve completed successfully. If A is singular, no solve is performed.
func (impl Implementation) Dtrtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool) {
	nounit := diag == blas.NonUnit
	if n == 0 {
		return false
	}
	// Check for singularity.
	if nounit {
		for i := 0; i < n; i++ {
			if a[i*lda+i] == 0 {
				return false
			}
		}
	}
	bi := blas64.Implementation()
	bi.Dtrsm(blas.Left, uplo, trans, diag, n, nrhs, 1, a, lda, b, ldb)
	return true
}
Esempio n. 14
0
// Dorgl2 generates an m×n matrix Q with orthonormal rows defined by the
// first m rows product of elementary reflectors as computed by Dgelqf.
//  Q = H_0 * H_1 * ... * H_{k-1}
// len(tau) >= k, 0 <= k <= m, 0 <= m <= n, len(work) >= m.
// Dorgl2 will panic if these conditions are not met.
//
// Dorgl2 is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dorgl2(m, n, k int, a []float64, lda int, tau, work []float64) {
	checkMatrix(m, n, a, lda)
	if len(tau) < k {
		panic(badTau)
	}
	if k > m {
		panic(kGTM)
	}
	if k > m {
		panic(kGTM)
	}
	if m > n {
		panic(nLTM)
	}
	if len(work) < m {
		panic(badWork)
	}
	if m == 0 {
		return
	}
	bi := blas64.Implementation()
	if k < m {
		for i := k; i < m; i++ {
			for j := 0; j < n; j++ {
				a[i*lda+j] = 0
			}
		}
		for j := k; j < m; j++ {
			a[j*lda+j] = 1
		}
	}
	for i := k - 1; i >= 0; i-- {
		if i < n-1 {
			if i < m-1 {
				a[i*lda+i] = 1
				impl.Dlarf(blas.Right, m-i-1, n-i, a[i*lda+i:], 1, tau[i], a[(i+1)*lda+i:], lda, work)
			}
			bi.Dscal(n-i-1, -tau[i], a[i*lda+i+1:], 1)
		}
		a[i*lda+i] = 1 - tau[i]
		for l := 0; l < i; l++ {
			a[i*lda+l] = 0
		}
	}
}
Esempio n. 15
0
// Dtrti2 computes the inverse of a triangular matrix, storing the result in place
// into a. This is the BLAS level 2 version of the algorithm.
func (impl Implementation) Dtrti2(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) {
	checkMatrix(n, n, a, lda)
	if uplo != blas.Upper && uplo != blas.Lower {
		panic(badUplo)
	}
	if diag != blas.NonUnit && diag != blas.Unit {
		panic(badDiag)
	}
	bi := blas64.Implementation()

	nonUnit := diag == blas.NonUnit
	// TODO(btracey): Replace this with a row-major ordering.
	if uplo == blas.Upper {
		for j := 0; j < n; j++ {
			var ajj float64
			if nonUnit {
				ajj = 1 / a[j*lda+j]
				a[j*lda+j] = ajj
				ajj *= -1
			} else {
				ajj = -1
			}
			bi.Dtrmv(blas.Upper, blas.NoTrans, diag, j, a, lda, a[j:], lda)
			bi.Dscal(j, ajj, a[j:], lda)
		}
		return
	}
	for j := n - 1; j >= 0; j-- {
		var ajj float64
		if nonUnit {
			ajj = 1 / a[j*lda+j]
			a[j*lda+j] = ajj
			ajj *= -1
		} else {
			ajj = -1
		}
		if j < n-1 {
			bi.Dtrmv(blas.Lower, blas.NoTrans, diag, n-j-1, a[(j+1)*lda+j+1:], lda, a[(j+1)*lda+j:], lda)
			bi.Dscal(n-j-1, ajj, a[(j+1)*lda+j:], lda)
		}
	}
}
Esempio n. 16
0
// Dgetrf computes the LU decomposition of the m×n matrix A.
// The LU decomposition is a factorization of A into
//  A = P * L * U
// where P is a permutation matrix, L is a unit lower triangular matrix, and
// U is a (usually) non-unit upper triangular matrix. On exit, L and U are stored
// in place into a.
//
// ipiv is a permutation vector. It indicates that row i of the matrix was
// changed with ipiv[i]. ipiv must have length at least min(m,n), and will panic
// otherwise. ipiv is zero-indexed.
//
// Dgetrf is the blocked version of the algorithm.
//
// Dgetrf returns whether the matrix A is singular. The LU decomposition will
// be computed regardless of the singularity of A, but division by zero
// will occur if the false is returned and the result is used to solve a
// system of equations.
func (impl Implementation) Dgetrf(m, n int, a []float64, lda int, ipiv []int) (ok bool) {
	mn := min(m, n)
	checkMatrix(m, n, a, lda)
	if len(ipiv) < mn {
		panic(badIpiv)
	}
	if m == 0 || n == 0 {
		return
	}
	bi := blas64.Implementation()
	nb := impl.Ilaenv(1, "DGETRF", " ", m, n, -1, -1)
	if nb <= 1 || nb >= min(m, n) {
		// Use the unblocked algorithm.
		return impl.Dgetf2(m, n, a, lda, ipiv)
	}
	ok = true
	for j := 0; j < mn; j += nb {
		jb := min(mn-j, nb)
		blockOk := impl.Dgetf2(m-j, jb, a[j*lda+j:], lda, ipiv[j:])
		if !blockOk {
			ok = false
		}
		for i := j; i <= min(m-1, j+jb-1); i++ {
			ipiv[i] = j + ipiv[i]
		}
		impl.Dlaswp(j, a, lda, j, j+jb-1, ipiv, 1)
		if j+jb < n {
			impl.Dlaswp(n-j-jb, a[j+jb:], lda, j, j+jb-1, ipiv, 1)
			bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.Unit,
				jb, n-j-jb, 1,
				a[j*lda+j:], lda,
				a[j*lda+j+jb:], lda)
			if j+jb < m {
				bi.Dgemm(blas.NoTrans, blas.NoTrans, m-j-jb, n-j-jb, jb, -1,
					a[(j+jb)*lda+j:], lda,
					a[j*lda+j+jb:], lda,
					1, a[(j+jb)*lda+j+jb:], lda)
			}
		}
	}
	return ok
}
Esempio n. 17
0
// Dorg2l generates an m×n matrix Q with orthonormal columns which is defined
// as the last n columns of a product of k elementary reflectors of order m.
//  Q = H_{k-1} * ... * H_1 * H_0
// See Dgelqf for more information. It must be that m >= n >= k.
//
// tau contains the scalar reflectors computed by Dgeqlf. tau must have length
// at least k, and Dorg2l will panic otherwise.
//
// work contains temporary memory, and must have length at least n. Dorg2l will
// panic otherwise.
//
// Dorg2l is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dorg2l(m, n, k int, a []float64, lda int, tau, work []float64) {
	checkMatrix(m, n, a, lda)
	if len(tau) < k {
		panic(badTau)
	}
	if len(work) < n {
		panic(badWork)
	}
	if m < n {
		panic(mLTN)
	}
	if k > n {
		panic(kGTN)
	}
	if n == 0 {
		return
	}

	// Initialize columns 0:n-k to columns of the unit matrix.
	for j := 0; j < n-k; j++ {
		for l := 0; l < m; l++ {
			a[l*lda+j] = 0
		}
		a[(m-n+j)*lda+j] = 1
	}

	bi := blas64.Implementation()
	for i := 0; i < k; i++ {
		ii := n - k + i

		// Apply H_i to A[0:m-k+i, 0:n-k+i] from the left.
		a[(m-n+ii)*lda+ii] = 1
		impl.Dlarf(blas.Left, m-n+ii+1, ii, a[ii:], lda, tau[i], a, lda, work)
		bi.Dscal(m-n+ii, -tau[i], a[ii:], lda)
		a[(m-n+ii)*lda+ii] = 1 - tau[i]

		// Set A[m-k+i:m, n-k+i+1] to zero.
		for l := m - n + ii + 1; l < m; l++ {
			a[l*lda+ii] = 0
		}
	}
}
Esempio n. 18
0
// Dgetf2 computes the LU decomposition of the m×n matrix A.
// The LU decomposition is a factorization of a into
//  A = P * L * U
// where P is a permutation matrix, L is a unit lower triangular matrix, and
// U is a (usually) non-unit upper triangular matrix. On exit, L and U are stored
// in place into a.
//
// ipiv is a permutation vector. It indicates that row i of the matrix was
// changed with ipiv[i]. ipiv must have length at least min(m,n), and will panic
// otherwise. ipiv is zero-indexed.
//
// Dgetf2 returns whether the matrix A is singular. The LU decomposition will
// be computed regardless of the singularity of A, but division by zero
// will occur if the false is returned and the result is used to solve a
// system of equations.
//
// Dgetf2 is an internal routine. It is exported for testing purposes.
func (Implementation) Dgetf2(m, n int, a []float64, lda int, ipiv []int) (ok bool) {
	mn := min(m, n)
	checkMatrix(m, n, a, lda)
	if len(ipiv) < mn {
		panic(badIpiv)
	}
	if m == 0 || n == 0 {
		return true
	}
	bi := blas64.Implementation()
	sfmin := dlamchS
	ok = true
	for j := 0; j < mn; j++ {
		// Find a pivot and test for singularity.
		jp := j + bi.Idamax(m-j, a[j*lda+j:], lda)
		ipiv[j] = jp
		if a[jp*lda+j] == 0 {
			ok = false
		} else {
			// Swap the rows if necessary.
			if jp != j {
				bi.Dswap(n, a[j*lda:], 1, a[jp*lda:], 1)
			}
			if j < m-1 {
				aj := a[j*lda+j]
				if math.Abs(aj) >= sfmin {
					bi.Dscal(m-j-1, 1/aj, a[(j+1)*lda+j:], lda)
				} else {
					for i := 0; i < m-j-1; i++ {
						a[(j+1)*lda+j] = a[(j+1)*lda+j] / a[lda*j+j]
					}
				}
			}
		}
		if j < mn-1 {
			bi.Dger(m-j-1, n-j-1, -1, a[(j+1)*lda+j:], lda, a[j*lda+j+1:], 1, a[(j+1)*lda+j+1:], lda)
		}
	}
	return ok
}
Esempio n. 19
0
// Dlarfg generates an elementary reflector for a Householder matrix. It creates
// a real elementary reflector of order n such that
//  H * (alpha) = (beta)
//      (    x)   (   0)
//  H^T * H = I
// H is represented in the form
//  H = 1 - tau * (1; v) * (1 v^T)
// where tau is a real scalar.
//
// On entry, x contains the vector x, on exit it contains v.
func (impl Implementation) Dlarfg(n int, alpha float64, x []float64, incX int) (beta, tau float64) {
	if n < 0 {
		panic(nLT0)
	}
	if n <= 1 {
		return alpha, 0
	}
	checkVector(n-1, x, incX)
	bi := blas64.Implementation()
	xnorm := bi.Dnrm2(n-1, x, incX)
	if xnorm == 0 {
		return alpha, 0
	}
	beta = -math.Copysign(impl.Dlapy2(alpha, xnorm), alpha)
	safmin := dlamchS / dlamchE
	knt := 0
	if math.Abs(beta) < safmin {
		// xnorm and beta may be innacurate, scale x and recompute.
		rsafmn := 1 / safmin
		for {
			knt++
			bi.Dscal(n-1, rsafmn, x, incX)
			beta *= rsafmn
			alpha *= rsafmn
			if math.Abs(beta) >= safmin {
				break
			}
		}
		xnorm = bi.Dnrm2(n-1, x, incX)
		beta = -math.Copysign(impl.Dlapy2(alpha, xnorm), alpha)
	}
	tau = (beta - alpha) / beta
	bi.Dscal(n-1, 1/(alpha-beta), x, incX)
	for j := 0; j < knt; j++ {
		beta *= safmin
	}
	return beta, tau
}
Esempio n. 20
0
func DtrtriTest(t *testing.T, impl Dtrtrier) {
	bi := blas64.Implementation()
	for _, uplo := range []blas.Uplo{blas.Upper} {
		for _, diag := range []blas.Diag{blas.NonUnit, blas.Unit} {
			for _, test := range []struct {
				n, lda int
			}{
				{3, 0},
				{70, 0},
				{200, 0},
				{3, 5},
				{70, 92},
				{200, 205},
			} {
				n := test.n
				lda := test.lda
				if lda == 0 {
					lda = n
				}
				a := make([]float64, n*lda)
				for i := range a {
					a[i] = rand.Float64() + 1 // This keeps the matrices well conditioned.
				}
				aCopy := make([]float64, len(a))
				copy(aCopy, a)
				impl.Dtrtri(uplo, diag, n, a, lda)
				if uplo == blas.Upper {
					for i := 1; i < n; i++ {
						for j := 0; j < i; j++ {
							aCopy[i*lda+j] = 0
							a[i*lda+j] = 0
						}
					}
				} else {
					for i := 1; i < n; i++ {
						for j := i + 1; j < n; j++ {
							aCopy[i*lda+j] = 0
							a[i*lda+j] = 0
						}
					}
				}
				if diag == blas.Unit {
					for i := 0; i < n; i++ {
						a[i*lda+i] = 1
						aCopy[i*lda+i] = 1
					}
				}
				ans := make([]float64, len(a))
				bi.Dgemm(blas.NoTrans, blas.NoTrans, n, n, n, 1, a, lda, aCopy, lda, 0, ans, lda)
				iseye := true
				for i := 0; i < n; i++ {
					for j := 0; j < n; j++ {
						if i == j {
							if math.Abs(ans[i*lda+i]-1) > 1e-4 {
								iseye = false
								break
							}
						} else {
							if math.Abs(ans[i*lda+j]) > 1e-4 {
								iseye = false
								break
							}
						}
					}
				}
				if !iseye {
					t.Errorf("inv(A) * A != I. Upper = %v, unit = %v, n = %v, lda = %v",
						uplo == blas.Upper, diag == blas.Unit, n, lda)
				}
			}
		}
	}
}
Esempio n. 21
0
func DormbrTest(t *testing.T, impl Dormbrer) {
	rnd := rand.New(rand.NewSource(1))
	bi := blas64.Implementation()
	for _, vect := range []lapack.DecompUpdate{lapack.ApplyQ, lapack.ApplyP} {
		for _, side := range []blas.Side{blas.Left, blas.Right} {
			for _, trans := range []blas.Transpose{blas.NoTrans, blas.Trans} {
				for _, test := range []struct {
					m, n, k, lda, ldc int
				}{
					{3, 4, 5, 0, 0},
					{3, 5, 4, 0, 0},
					{4, 3, 5, 0, 0},
					{4, 5, 3, 0, 0},
					{5, 3, 4, 0, 0},
					{5, 4, 3, 0, 0},

					{3, 4, 5, 10, 12},
					{3, 5, 4, 10, 12},
					{4, 3, 5, 10, 12},
					{4, 5, 3, 10, 12},
					{5, 3, 4, 10, 12},
					{5, 4, 3, 10, 12},
				} {
					m := test.m
					n := test.n
					k := test.k
					ldc := test.ldc
					if ldc == 0 {
						ldc = n
					}
					nq := n
					if side == blas.Left {
						nq = m
					}

					// Compute a decomposition.
					var ma, na int
					var a []float64
					if vect == lapack.ApplyQ {
						ma = nq
						na = k
					} else {
						ma = k
						na = nq
					}
					lda := test.lda
					if lda == 0 {
						lda = na
					}
					a = make([]float64, ma*lda)
					for i := range a {
						a[i] = rnd.NormFloat64()
					}
					nTau := min(nq, k)
					tauP := make([]float64, nTau)
					tauQ := make([]float64, nTau)
					d := make([]float64, nTau)
					e := make([]float64, nTau)
					lwork := -1
					work := make([]float64, 1)
					impl.Dgebrd(ma, na, a, lda, d, e, tauQ, tauP, work, lwork)
					work = make([]float64, int(work[0]))
					lwork = len(work)

					impl.Dgebrd(ma, na, a, lda, d, e, tauQ, tauP, work, lwork)

					// Apply and compare update.
					c := make([]float64, m*ldc)
					for i := range c {
						c[i] = rnd.NormFloat64()
					}

					cCopy := make([]float64, len(c))
					copy(cCopy, c)

					if vect == lapack.ApplyQ {
						impl.Dormbr(vect, side, trans, m, n, k, a, lda, tauQ, c, ldc, work, lwork)
					} else {
						impl.Dormbr(vect, side, trans, m, n, k, a, lda, tauP, c, ldc, work, lwork)
					}

					// Check that the multiplication was correct.
					cOrig := blas64.General{
						Rows:   m,
						Cols:   n,
						Stride: ldc,
						Data:   make([]float64, len(cCopy)),
					}
					copy(cOrig.Data, cCopy)
					cAns := blas64.General{
						Rows:   m,
						Cols:   n,
						Stride: ldc,
						Data:   make([]float64, len(cCopy)),
					}
					copy(cAns.Data, cCopy)
					nb := min(ma, na)
					var mulMat blas64.General
					if vect == lapack.ApplyQ {
						mulMat = constructQPBidiagonal(lapack.ApplyQ, ma, na, nb, a, lda, tauQ)
					} else {
						mulMat = constructQPBidiagonal(lapack.ApplyP, ma, na, nb, a, lda, tauP)
					}

					mulTrans := trans

					if side == blas.Left {
						bi.Dgemm(mulTrans, blas.NoTrans, m, n, m, 1, mulMat.Data, mulMat.Stride, cOrig.Data, cOrig.Stride, 0, cAns.Data, cAns.Stride)
					} else {
						bi.Dgemm(blas.NoTrans, mulTrans, m, n, n, 1, cOrig.Data, cOrig.Stride, mulMat.Data, mulMat.Stride, 0, cAns.Data, cAns.Stride)
					}

					if !floats.EqualApprox(cAns.Data, c, 1e-8) {
						isApplyQ := vect == lapack.ApplyQ
						isLeft := side == blas.Left
						isTrans := trans == blas.Trans

						t.Errorf("C mismatch. isApplyQ: %v, isLeft: %v, isTrans: %v, m = %v, n = %v, k = %v, lda = %v, ldc = %v",
							isApplyQ, isLeft, isTrans, m, n, k, lda, ldc)
					}
				}
			}
		}
	}
}
Esempio n. 22
0
// Dlarfb applies a block reflector to a matrix.
//
// In the call to Dlarfb, the mxn c is multiplied by the implicitly defined matrix h as follows:
//  c = h * c if side == Left and trans == NoTrans
//  c = c * h if side == Right and trans == NoTrans
//  c = h^T * c if side == Left and trans == Trans
//  c = c * h^t if side == Right and trans == Trans
// h is a product of elementary reflectors. direct sets the direction of multiplication
//  h = h_1 * h_2 * ... * h_k if direct == Forward
//  h = h_k * h_k-1 * ... * h_1 if direct == Backward
// The combination of direct and store defines the orientation of the elementary
// reflectors. In all cases the ones on the diagonal are implicitly represented.
//
// If direct == lapack.Forward and store == lapack.ColumnWise
//  V = (  1       )
//      ( v1  1    )
//      ( v1 v2  1 )
//      ( v1 v2 v3 )
//      ( v1 v2 v3 )
// If direct == lapack.Forward and store == lapack.RowWise
//  V = (  1 v1 v1 v1 v1 )
//      (     1 v2 v2 v2 )
//      (        1 v3 v3 )
// If direct == lapack.Backward and store == lapack.ColumnWise
//  V = ( v1 v2 v3 )
//      ( v1 v2 v3 )
//      (  1 v2 v3 )
//      (     1 v3 )
//      (        1 )
// If direct == lapack.Backward and store == lapack.RowWise
//  V = ( v1 v1  1       )
//      ( v2 v2 v2  1    )
//      ( v3 v3 v3 v3  1 )
// An elementary reflector can be explicitly constructed by extracting the
// corresponding elements of v, placing a 1 where the diagonal would be, and
// placing zeros in the remaining elements.
//
// t is a k×k matrix containing the block reflector, and this function will panic
// if t is not of sufficient size. See Dlarft for more information.
//
// Work is a temporary storage matrix with stride ldwork.
// Work must be of size at least n×k side == Left and m×k if side == Right, and
// this function will panic if this size is not met.
func (Implementation) Dlarfb(side blas.Side, trans blas.Transpose, direct lapack.Direct,
	store lapack.StoreV, m, n, k int, v []float64, ldv int, t []float64, ldt int,
	c []float64, ldc int, work []float64, ldwork int) {

	checkMatrix(m, n, c, ldc)
	if m == 0 || n == 0 {
		return
	}
	if k < 0 {
		panic("lapack: negative number of transforms")
	}
	if side != blas.Left && side != blas.Right {
		panic(badSide)
	}
	if trans != blas.Trans && trans != blas.NoTrans {
		panic(badTrans)
	}
	if direct != lapack.Forward && direct != lapack.Backward {
		panic(badDirect)
	}
	if store != lapack.ColumnWise && store != lapack.RowWise {
		panic(badStore)
	}

	rowsWork := n
	if side == blas.Right {
		rowsWork = m
	}
	checkMatrix(rowsWork, k, work, ldwork)

	bi := blas64.Implementation()

	transt := blas.Trans
	if trans == blas.Trans {
		transt = blas.NoTrans
	}
	// TODO(btracey): This follows the original Lapack code where the
	// elements are copied into the columns of the working array. The
	// loops should go in the other direction so the data is written
	// into the rows of work so the copy is not strided. A bigger change
	// would be to replace work with work^T, but benchmarks would be
	// needed to see if the change is merited.
	if store == lapack.ColumnWise {
		if direct == lapack.Forward {
			// V1 is the first k rows of C. V2 is the remaining rows.
			if side == blas.Left {
				// W = C^T V = C1^T V1 + C2^T V2 (stored in work).

				// W = C1.
				for j := 0; j < k; j++ {
					bi.Dcopy(n, c[j*ldc:], 1, work[j:], ldwork)
				}
				// W = W * V1.
				bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit,
					n, k, 1,
					v, ldv,
					work, ldwork)
				if m > k {
					// W = W + C2^T V2.
					bi.Dgemm(blas.Trans, blas.NoTrans, n, k, m-k,
						1, c[k*ldc:], ldc, v[k*ldv:], ldv,
						1, work, ldwork)
				}
				// W = W * T^T or W * T.
				bi.Dtrmm(blas.Right, blas.Upper, transt, blas.NonUnit, n, k,
					1, t, ldt,
					work, ldwork)
				// C -= V * W^T.
				if m > k {
					// C2 -= V2 * W^T.
					bi.Dgemm(blas.NoTrans, blas.Trans, m-k, n, k,
						-1, v[k*ldv:], ldv, work, ldwork,
						1, c[k*ldc:], ldc)
				}
				// W *= V1^T.
				bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, n, k,
					1, v, ldv,
					work, ldwork)
				// C1 -= W^T.
				// TODO(btracey): This should use blas.Axpy.
				for i := 0; i < n; i++ {
					for j := 0; j < k; j++ {
						c[j*ldc+i] -= work[i*ldwork+j]
					}
				}
				return
			}
			// Form C = C * H or C * H^T, where C = (C1 C2).

			// W = C1.
			for i := 0; i < k; i++ {
				bi.Dcopy(m, c[i:], ldc, work[i:], ldwork)
			}
			// W *= V1.
			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, m, k,
				1, v, ldv,
				work, ldwork)
			if n > k {
				bi.Dgemm(blas.NoTrans, blas.NoTrans, m, k, n-k,
					1, c[k:], ldc, v[k*ldv:], ldv,
					1, work, ldwork)
			}
			// W *= T or T^T.
			bi.Dtrmm(blas.Right, blas.Upper, trans, blas.NonUnit, m, k,
				1, t, ldt,
				work, ldwork)
			if n > k {
				bi.Dgemm(blas.NoTrans, blas.Trans, m, n-k, k,
					-1, work, ldwork, v[k*ldv:], ldv,
					1, c[k:], ldc)
			}
			// C -= W * V^T.
			bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, m, k,
				1, v, ldv,
				work, ldwork)
			// C -= W.
			// TODO(btracey): This should use blas.Axpy.
			for i := 0; i < m; i++ {
				for j := 0; j < k; j++ {
					c[i*ldc+j] -= work[i*ldwork+j]
				}
			}
			return
		}
		// V = (V1)
		//   = (V2) (last k rows)
		// Where V2 is unit upper triangular.
		if side == blas.Left {
			// Form H * C or
			// W = C^T V.

			// W = C2^T.
			for j := 0; j < k; j++ {
				bi.Dcopy(n, c[(m-k+j)*ldc:], 1, work[j:], ldwork)
			}
			// W *= V2.
			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, n, k,
				1, v[(m-k)*ldv:], ldv,
				work, ldwork)
			if m > k {
				// W += C1^T * V1.
				bi.Dgemm(blas.Trans, blas.NoTrans, n, k, m-k,
					1, c, ldc, v, ldv,
					1, work, ldwork)
			}
			// W *= T or T^T.
			bi.Dtrmm(blas.Right, blas.Lower, transt, blas.NonUnit, n, k,
				1, t, ldt,
				work, ldwork)
			// C -= V * W^T.
			if m > k {
				bi.Dgemm(blas.NoTrans, blas.Trans, m-k, n, k,
					-1, v, ldv, work, ldwork,
					1, c, ldc)
			}
			// W *= V2^T.
			bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, n, k,
				1, v[(m-k)*ldv:], ldv,
				work, ldwork)
			// C2 -= W^T.
			// TODO(btracey): This should use blas.Axpy.
			for i := 0; i < n; i++ {
				for j := 0; j < k; j++ {
					c[(m-k+j)*ldc+i] -= work[i*ldwork+j]
				}
			}
			return
		}
		// Form C * H or C * H^T where C = (C1 C2).
		// W = C * V.

		// W = C2.
		for j := 0; j < k; j++ {
			bi.Dcopy(m, c[n-k+j:], ldc, work[j:], ldwork)
		}

		// W = W * V2.
		bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, m, k,
			1, v[(n-k)*ldv:], ldv,
			work, ldwork)
		if n > k {
			bi.Dgemm(blas.NoTrans, blas.NoTrans, m, k, n-k,
				1, c, ldc, v, ldv,
				1, work, ldwork)
		}
		// W *= T or T^T.
		bi.Dtrmm(blas.Right, blas.Lower, trans, blas.NonUnit, m, k,
			1, t, ldt,
			work, ldwork)
		// C -= W * V^T.
		if n > k {
			// C1 -= W * V1^T.
			bi.Dgemm(blas.NoTrans, blas.Trans, m, n-k, k,
				-1, work, ldwork, v, ldv,
				1, c, ldc)
		}
		// W *= V2^T.
		bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, m, k,
			1, v[(n-k)*ldv:], ldv,
			work, ldwork)
		// C2 -= W.
		// TODO(btracey): This should use blas.Axpy.
		for i := 0; i < m; i++ {
			for j := 0; j < k; j++ {
				c[i*ldc+n-k+j] -= work[i*ldwork+j]
			}
		}
		return
	}
	// Store = Rowwise.
	if direct == lapack.Forward {
		// V = (V1 V2) where v1 is unit upper triangular.
		if side == blas.Left {
			// Form H * C or H^T * C where C = (C1; C2).
			// W = C^T * V^T.

			// W = C1^T.
			for j := 0; j < k; j++ {
				bi.Dcopy(n, c[j*ldc:], 1, work[j:], ldwork)
			}
			// W *= V1^T.
			bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, n, k,
				1, v, ldv,
				work, ldwork)
			if m > k {
				bi.Dgemm(blas.Trans, blas.Trans, n, k, m-k,
					1, c[k*ldc:], ldc, v[k:], ldv,
					1, work, ldwork)
			}
			// W *= T or T^T.
			bi.Dtrmm(blas.Right, blas.Upper, transt, blas.NonUnit, n, k,
				1, t, ldt,
				work, ldwork)
			// C -= V^T * W^T.
			if m > k {
				bi.Dgemm(blas.Trans, blas.Trans, m-k, n, k,
					-1, v[k:], ldv, work, ldwork,
					1, c[k*ldc:], ldc)
			}
			// W *= V1.
			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, n, k,
				1, v, ldv,
				work, ldwork)
			// C1 -= W^T.
			// TODO(btracey): This should use blas.Axpy.
			for i := 0; i < n; i++ {
				for j := 0; j < k; j++ {
					c[j*ldc+i] -= work[i*ldwork+j]
				}
			}
			return
		}
		// Form C * H or C * H^T where C = (C1 C2).
		// W = C * V^T.

		// W = C1.
		for j := 0; j < k; j++ {
			bi.Dcopy(m, c[j:], ldc, work[j:], ldwork)
		}
		// W *= V1^T.
		bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, m, k,
			1, v, ldv,
			work, ldwork)
		if n > k {
			bi.Dgemm(blas.NoTrans, blas.Trans, m, k, n-k,
				1, c[k:], ldc, v[k:], ldv,
				1, work, ldwork)
		}
		// W *= T or T^T.
		bi.Dtrmm(blas.Right, blas.Upper, trans, blas.NonUnit, m, k,
			1, t, ldt,
			work, ldwork)
		// C -= W * V.
		if n > k {
			bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n-k, k,
				-1, work, ldwork, v[k:], ldv,
				1, c[k:], ldc)
		}
		// W *= V1.
		bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, m, k,
			1, v, ldv,
			work, ldwork)
		// C1 -= W.
		// TODO(btracey): This should use blas.Axpy.
		for i := 0; i < m; i++ {
			for j := 0; j < k; j++ {
				c[i*ldc+j] -= work[i*ldwork+j]
			}
		}
		return
	}
	// V = (V1 V2) where V2 is the last k columns and is lower unit triangular.
	if side == blas.Left {
		// Form H * C or H^T C where C = (C1 ; C2).
		// W = C^T * V^T.

		// W = C2^T.
		for j := 0; j < k; j++ {
			bi.Dcopy(n, c[(m-k+j)*ldc:], 1, work[j:], ldwork)
		}
		// W *= V2^T.
		bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, n, k,
			1, v[m-k:], ldv,
			work, ldwork)
		if m > k {
			bi.Dgemm(blas.Trans, blas.Trans, n, k, m-k,
				1, c, ldc, v, ldv,
				1, work, ldwork)
		}
		// W *= T or T^T.
		bi.Dtrmm(blas.Right, blas.Lower, transt, blas.NonUnit, n, k,
			1, t, ldt,
			work, ldwork)
		// C -= V^T * W^T.
		if m > k {
			bi.Dgemm(blas.Trans, blas.Trans, m-k, n, k,
				-1, v, ldv, work, ldwork,
				1, c, ldc)
		}
		// W *= V2.
		bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, n, k,
			1, v[m-k:], ldv,
			work, ldwork)
		// C2 -= W^T.
		// TODO(btracey): This should use blas.Axpy.
		for i := 0; i < n; i++ {
			for j := 0; j < k; j++ {
				c[(m-k+j)*ldc+i] -= work[i*ldwork+j]
			}
		}
		return
	}
	// Form C * H or C * H^T where C = (C1 C2).
	// W = C * V^T.
	// W = C2.
	for j := 0; j < k; j++ {
		bi.Dcopy(m, c[n-k+j:], ldc, work[j:], ldwork)
	}
	// W *= V2^T.
	bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, m, k,
		1, v[n-k:], ldv,
		work, ldwork)
	if n > k {
		bi.Dgemm(blas.NoTrans, blas.Trans, m, k, n-k,
			1, c, ldc, v, ldv,
			1, work, ldwork)
	}
	// W *= T or T^T.
	bi.Dtrmm(blas.Right, blas.Lower, trans, blas.NonUnit, m, k,
		1, t, ldt,
		work, ldwork)
	// C -= W * V.
	if n > k {
		bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n-k, k,
			-1, work, ldwork, v, ldv,
			1, c, ldc)
	}
	// W *= V2.
	bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, m, k,
		1, v[n-k:], ldv,
		work, ldwork)
	// C1 -= W.
	// TODO(btracey): This should use blas.Axpy.
	for i := 0; i < m; i++ {
		for j := 0; j < k; j++ {
			c[i*ldc+n-k+j] -= work[i*ldwork+j]
		}
	}
}
Esempio n. 23
0
// Dgetri computes the inverse of the matrix A using the LU factorization computed
// by Dgetrf. On entry, a contains the PLU decomposition of A as computed by
// Dgetrf and on exit contains the reciprocal of the original matrix.
//
// Dgetri will not perform the inversion if the matrix is singular, and returns
// a boolean indicating whether the inversion was successful.
//
// Work is temporary storage, and lwork specifies the usable memory length.
// At minimum, lwork >= n and this function will panic otherwise.
// Dgetri is a blocked inversion, but the block size is limited
// by the temporary space available. If lwork == -1, instead of performing Dgetri,
// the optimal work length will be stored into work[0].
func (impl Implementation) Dgetri(n int, a []float64, lda int, ipiv []int, work []float64, lwork int) (ok bool) {
	checkMatrix(n, n, a, lda)
	if len(ipiv) < n {
		panic(badIpiv)
	}
	nb := impl.Ilaenv(1, "DGETRI", " ", n, -1, -1, -1)
	if lwork == -1 {
		work[0] = float64(n * nb)
		return true
	}
	if lwork < n {
		panic(badWork)
	}
	if len(work) < lwork {
		panic(badWork)
	}
	if n == 0 {
		return true
	}
	ok = impl.Dtrtri(blas.Upper, blas.NonUnit, n, a, lda)
	if !ok {
		return false
	}
	nbmin := 2
	ldwork := nb
	if nb > 1 && nb < n {
		iws := max(ldwork*n, 1)
		if lwork < iws {
			nb = lwork / ldwork
			nbmin = max(2, impl.Ilaenv(2, "DGETRI", " ", n, -1, -1, -1))
		}
	}
	bi := blas64.Implementation()
	// TODO(btracey): Replace this with a more row-major oriented algorithm.
	if nb < nbmin || nb >= n {
		// Unblocked code.
		for j := n - 1; j >= 0; j-- {
			for i := j + 1; i < n; i++ {
				work[i*ldwork] = a[i*lda+j]
				a[i*lda+j] = 0
			}
			if j < n {
				bi.Dgemv(blas.NoTrans, n, n-j-1, -1, a[(j+1):], lda, work[(j+1)*ldwork:], ldwork, 1, a[j:], lda)
			}
		}
	} else {
		nn := ((n - 1) / nb) * nb
		for j := nn; j >= 0; j -= nb {
			jb := min(nb, n-j)
			for jj := j; jj < j+jb-1; jj++ {
				for i := jj + 1; i < n; i++ {
					work[i*ldwork+(jj-j)] = a[i*lda+jj]
					a[i*lda+jj] = 0
				}
			}
			if j+jb < n {
				bi.Dgemm(blas.NoTrans, blas.NoTrans, n, jb, n-j-jb, -1, a[(j+jb):], lda, work[(j+jb)*ldwork:], ldwork, 1, a[j:], lda)
				bi.Dtrsm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, n, jb, 1, work[j*ldwork:], ldwork, a[j:], lda)
			}
		}
	}
	for j := n - 2; j >= 0; j-- {
		jp := ipiv[j]
		if jp != j {
			bi.Dswap(n, a[j:], lda, a[jp:], lda)
		}
	}
	return true
}
Esempio n. 24
0
// Dgesvd computes the singular value decomposition of the input matrix A.
//
// The singular value decomposition is
//  A = U * Sigma * V^T
// where Sigma is an m×n diagonal matrix containing the singular values of A,
// U is an m×m orthogonal matrix and V is an n×n orthogonal matrix. The first
// min(m,n) columns of U and V are the left and right singular vectors of A
// respectively.
//
// jobU and jobVT are options for computing the singular vectors. The behavior
// is as follows
//  jobU == lapack.SVDAll       All m columns of U are returned in u
//  jobU == lapack.SVDInPlace   The first min(m,n) columns are returned in u
//  jobU == lapack.SVDOverwrite The first min(m,n) columns of U are written into a
//  jobU == lapack.SVDNone      The columns of U are not computed.
// The behavior is the same for jobVT and the rows of V^T. At most one of jobU
// and jobVT can equal lapack.SVDOverwrite, and Dgesvd will panic otherwise.
//
// On entry, a contains the data for the m×n matrix A. During the call to Dgesvd
// the data is overwritten. On exit, A contains the appropriate singular vectors
// if either job is lapack.SVDOverwrite.
//
// s is a slice of length at least min(m,n) and on exit contains the singular
// values in decreasing order.
//
// u contains the left singular vectors on exit, stored column-wise. If
// jobU == lapack.SVDAll, u is of size m×m. If jobU == lapack.SVDInPlace u is
// of size m×min(m,n). If jobU == lapack.SVDOverwrite or lapack.SVDNone, u is
// not used.
//
// vt contains the left singular vectors on exit, stored row-wise. If
// jobV == lapack.SVDAll, vt is of size n×m. If jobVT == lapack.SVDInPlace vt is
// of size min(m,n)×n. If jobVT == lapack.SVDOverwrite or lapack.SVDNone, vt is
// not used.
//
// work is a slice for storing temporary memory, and lwork is the usable size of
// the slice. lwork must be at least max(5*min(m,n), 3*min(m,n)+max(m,n)).
// If lwork == -1, instead of performing Dgesvd, the optimal work length will be
// stored into work[0]. Dgesvd will panic if the working memory has insufficient
// storage.
//
// Dgesvd returns whether the decomposition successfully completed.
func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, lwork int) (ok bool) {
	minmn := min(m, n)
	checkMatrix(m, n, a, lda)
	if jobU == lapack.SVDAll {
		checkMatrix(m, m, u, ldu)
	} else if jobU == lapack.SVDInPlace {
		checkMatrix(m, minmn, u, ldu)
	}
	if jobVT == lapack.SVDAll {
		checkMatrix(n, n, vt, ldvt)
	} else if jobVT == lapack.SVDInPlace {
		checkMatrix(minmn, n, vt, ldvt)
	}
	if jobU == lapack.SVDOverwrite && jobVT == lapack.SVDOverwrite {
		panic("lapack: both jobU and jobVT are lapack.SVDOverwrite")
	}
	if len(s) < minmn {
		panic(badS)
	}
	if jobU == lapack.SVDOverwrite || jobVT == lapack.SVDOverwrite {
		panic(noSVDO)
	}
	if m == 0 || n == 0 {
		return true
	}

	wantua := jobU == lapack.SVDAll
	wantus := jobU == lapack.SVDInPlace
	wantuas := wantua || wantus
	wantuo := jobU == lapack.SVDOverwrite
	wantun := jobU == lapack.None

	wantva := jobVT == lapack.SVDAll
	wantvs := jobVT == lapack.SVDInPlace
	wantvas := wantva || wantvs
	wantvo := jobVT == lapack.SVDOverwrite
	wantvn := jobVT == lapack.None

	bi := blas64.Implementation()
	var mnthr int

	// TODO(btracey): The netlib implementation checks have this at only length 1.
	// Our implementation checks all input sizes before examining the l == -1 case.
	// Fix the failing cases to reduce the needed memory here.
	dum := make([]float64, m*n)

	// Compute optimal space for subroutines.
	maxwrk := 1
	opts := string(jobU) + string(jobVT)
	var wrkbl, bdspac int
	if m >= n {
		mnthr = impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0)
		bdspac = 5 * n
		impl.Dgeqrf(m, n, a, lda, dum, dum, -1)
		lwork_dgeqrf := int(dum[0])
		impl.Dorgqr(m, n, n, a, lda, dum, dum, -1)
		lwork_dorgqr_n := int(dum[0])
		impl.Dorgqr(m, m, n, a, lda, dum, dum, -1)
		lwork_dorgqr_m := int(dum[0])
		impl.Dgebrd(n, n, a, lda, s, dum, dum, dum, dum, -1)
		lwork_dgebrd := int(dum[0])
		impl.Dorgbr(lapack.ApplyP, n, n, n, a, lda, dum, dum, -1)
		lwork_dorgbr_p := int(dum[0])
		impl.Dorgbr(lapack.ApplyQ, n, n, n, a, lda, dum, dum, -1)
		lwork_dorgbr_q := int(dum[0])

		if m >= mnthr {
			// m >> n
			if wantun {
				// Path 1
				maxwrk = n + lwork_dgeqrf
				maxwrk = max(maxwrk, 3*n+lwork_dgebrd)
				if wantvo || wantvas {
					maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p)
				}
				maxwrk = max(maxwrk, bdspac)
			} else if wantuo && wantvn {
				// Path 2
				wrkbl = n + lwork_dgeqrf
				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = max(n*n+wrkbl, n*n+m*n+n)
			} else if wantuo && wantvs {
				// Path 3
				wrkbl = n + lwork_dgeqrf
				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = max(n*n+wrkbl, n*n+m*n+n)
			} else if wantus && wantvn {
				// Path 4
				wrkbl = n + lwork_dgeqrf
				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = n*n + wrkbl
			} else if wantus && wantvo {
				// Path 5
				wrkbl = n + lwork_dgeqrf
				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = 2*n*n + wrkbl
			} else if wantus && wantvas {
				// Path 6
				wrkbl = n + lwork_dgeqrf
				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = n*n + wrkbl
			} else if wantua && wantvn {
				// Path 7
				wrkbl = n + lwork_dgeqrf
				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = n*n + wrkbl
			} else if wantua && wantvo {
				// Path 8
				wrkbl = n + lwork_dgeqrf
				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = 2*n*n + wrkbl
			} else if wantua && wantvas {
				// Path 9
				wrkbl = n + lwork_dgeqrf
				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = n*n + wrkbl
			}
		} else {
			// Path 10: m > n
			impl.Dgebrd(m, n, a, lda, s, dum, dum, dum, dum, -1)
			lwork_dgebrd := int(dum[0])
			maxwrk = 3*n + lwork_dgebrd
			if wantus || wantuo {
				impl.Dorgbr(lapack.ApplyQ, m, n, n, a, lda, dum, dum, -1)
				lwork_dorgbr_q = int(dum[0])
				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q)
			}
			if wantua {
				impl.Dorgbr(lapack.ApplyQ, m, m, n, a, lda, dum, dum, -1)
				lwork_dorgbr_q := int(dum[0])
				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q)
			}
			if !wantvn {
				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p)
			}
			maxwrk = max(maxwrk, bdspac)
		}
	} else {
		mnthr = impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0)

		bdspac = 5 * m
		impl.Dgelqf(m, n, a, lda, dum, dum, -1)
		lwork_dgelqf := int(dum[0])
		impl.Dorglq(n, n, m, dum, n, dum, dum, -1)
		lwork_dorglq_n := int(dum[0])
		impl.Dorglq(m, n, m, a, lda, dum, dum, -1)
		lwork_dorglq_m := int(dum[0])
		impl.Dgebrd(m, m, a, lda, s, dum, dum, dum, dum, -1)
		lwork_dgebrd := int(dum[0])
		impl.Dorgbr(lapack.ApplyP, m, m, m, a, n, dum, dum, -1)
		lwork_dorgbr_p := int(dum[0])
		impl.Dorgbr(lapack.ApplyQ, m, m, m, a, n, dum, dum, -1)
		lwork_dorgbr_q := int(dum[0])
		if n >= mnthr {
			// n >> m
			if wantvn {
				// Path 1t
				maxwrk = m + lwork_dgelqf
				maxwrk = max(maxwrk, 3*m+lwork_dgebrd)
				if wantuo || wantuas {
					maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q)
				}
				maxwrk = max(maxwrk, bdspac)
			} else if wantvo && wantun {
				// Path 2t
				wrkbl = m + lwork_dgelqf
				wrkbl = max(wrkbl, m+lwork_dorglq_m)
				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = max(m*m+wrkbl, m*m+m*n+m)
			} else if wantvo && wantuas {
				// Path 3t
				wrkbl = m + lwork_dgelqf
				wrkbl = max(wrkbl, m+lwork_dorglq_m)
				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = max(m*m+wrkbl, m*m+m*n+m)
			} else if wantvs && wantun {
				// Path 4t
				wrkbl = m + lwork_dgelqf
				wrkbl = max(wrkbl, m+lwork_dorglq_m)
				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = m*m + wrkbl
			} else if wantvs && wantuo {
				// Path 5t
				wrkbl = m + lwork_dgelqf
				wrkbl = max(wrkbl, m+lwork_dorglq_m)
				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = 2*m*m + wrkbl
			} else if wantvs && wantuas {
				// Path 6t
				wrkbl = m + lwork_dgelqf
				wrkbl = max(wrkbl, m+lwork_dorglq_m)
				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = m*m + wrkbl
			} else if wantva && wantun {
				// Path 7t
				wrkbl = m + lwork_dgelqf
				wrkbl = max(wrkbl, m+lwork_dorglq_n)
				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = m*m + wrkbl
			} else if wantva && wantuo {
				// Path 8t
				wrkbl = m + lwork_dgelqf
				wrkbl = max(wrkbl, m+lwork_dorglq_n)
				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = 2*m*m + wrkbl
			} else if wantva && wantuas {
				// Path 9t
				wrkbl = m + lwork_dgelqf
				wrkbl = max(wrkbl, m+lwork_dorglq_n)
				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
				wrkbl = max(wrkbl, bdspac)
				maxwrk = m*m + wrkbl
			}
		} else {
			// Path 10t, n > m
			impl.Dgebrd(m, n, a, lda, s, dum, dum, dum, dum, -1)
			lwork_dgebrd = int(dum[0])
			maxwrk := 3*m + lwork_dgebrd
			if wantvs || wantvo {
				impl.Dorgbr(lapack.ApplyP, m, n, m, a, n, dum, dum, -1)
				lwork_dorgbr_p = int(dum[0])
				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p)
			}
			if wantva {
				impl.Dorgbr(lapack.ApplyP, n, n, m, a, n, dum, dum, -1)
				lwork_dorgbr_p = int(dum[0])
				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p)
			}
			if !wantun {
				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q)
			}
			maxwrk = max(maxwrk, bdspac)
		}
	}

	minWork := max(1, 5*minmn)
	if !((wantun && m >= mnthr) || (wantvn && n >= mnthr)) {
		minWork = max(minWork, 3*minmn+max(m, n))
	}

	if lwork != -1 {
		if len(work) < lwork {
			panic(badWork)
		}
		if lwork < minWork {
			panic(badWork)
		}
	}
	if m == 0 || n == 0 {
		return true
	}

	maxwrk = max(maxwrk, minWork)
	work[0] = float64(maxwrk)
	if lwork == -1 {
		return true
	}

	// Perform decomposition.
	eps := dlamchE
	smlnum := math.Sqrt(dlamchS) / eps
	bignum := 1 / smlnum

	// Scale A if max element outside range [smlnum, bignum].
	anrm := impl.Dlange(lapack.MaxAbs, m, n, a, lda, dum)
	var iscl bool
	if anrm > 0 && anrm < smlnum {
		iscl = true
		impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, m, n, a, lda)
	} else if anrm > bignum {
		iscl = true
		impl.Dlascl(lapack.General, 0, 0, anrm, bignum, m, n, a, lda)
	}

	var ie int
	if m >= n {
		// If A has sufficiently more rows than columns, use the QR decomposition.
		if m >= mnthr {
			// m >> n
			if wantun {
				// Path 1.
				itau := 0
				iwork := itau + n

				// Compute A = Q * R.
				impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)

				// Zero out below R.
				impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda)
				ie = 0
				itauq := ie + n
				itaup := itauq + n
				iwork = itaup + n
				// Bidiagonalize R in A.
				impl.Dgebrd(n, n, a, lda, s, work[ie:], work[itauq:],
					work[itaup:], work[iwork:], lwork-iwork)
				ncvt := 0
				if wantvo || wantvas {
					// Generate P^T.
					impl.Dorgbr(lapack.ApplyP, n, n, n, a, lda, work[itaup:],
						work[iwork:], lwork-iwork)
					ncvt = n
				}
				iwork = ie + n

				// Perform bidiagonal QR iteration computing right singular vectors
				// of A in A if desired.
				ok = impl.Dbdsqr(blas.Upper, n, ncvt, 0, 0, s, work[ie:],
					a, lda, dum, 1, dum, 1, work[iwork:])

				// If right singular vectors desired in VT, copy them there.
				if wantvas {
					impl.Dlacpy(blas.All, n, n, a, lda, vt, ldvt)
				}
			} else if wantuo && wantvn {
				// Path 2
				panic(noSVDO)
			} else if wantuo && wantvas {
				// Path 3
				panic(noSVDO)
			} else if wantus {
				if wantvn {
					// Path 4
					if lwork >= n*n+max(4*n, bdspac) {
						// Sufficient workspace for a fast algorithm.
						ir := 0
						var ldworkr int
						if lwork >= wrkbl+lda*n {
							ldworkr = lda
						} else {
							ldworkr = n
						}
						itau := ir + ldworkr*n
						iwork := itau + n
						// Compute A = Q * R.
						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)

						// Copy R to work[ir:], zeroing out below it.
						impl.Dlacpy(blas.Upper, n, n, a, lda, work[ir:], ldworkr)
						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[ir+ldworkr:], ldworkr)

						// Generate Q in A.
						impl.Dorgqr(m, n, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						ie := itau
						itauq := ie + n
						itaup := itauq + n
						iwork = itaup + n

						// Bidiagonalize R in work[ir:].
						impl.Dgebrd(n, n, work[ir:], ldworkr, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Generate left vectors bidiagonalizing R in work[ir:].
						impl.Dorgbr(lapack.ApplyQ, n, n, n, work[ir:], ldworkr,
							work[itauq:], work[iwork:], lwork-iwork)
						iwork = ie + n

						// Perform bidiagonal QR iteration, compuing left singular
						// vectors of R in work[ir:].
						ok = impl.Dbdsqr(blas.Upper, n, 0, n, 0, s, work[ie:], dum, 1,
							work[ir:], ldworkr, dum, 1, work[iwork:])

						// Multiply Q in A by left singular vectors of R in
						// work[ir:], storing result in U.
						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, a, lda,
							work[ir:], ldworkr, 0, u, ldu)
					} else {
						// Insufficient workspace for a fast algorithm.
						itau := 0
						iwork := itau + n

						// Compute A = Q*R, copying result to U.
						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)

						// Generate Q in U.
						impl.Dorgqr(m, n, n, u, ldu, work[itau:], work[iwork:], lwork-iwork)
						ie := itau
						itauq := ie + n
						itaup := itauq + n
						iwork = itaup + n

						// Zero out below R in A.
						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda)

						// Bidiagonalize R in A.
						impl.Dgebrd(n, n, a, lda, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Multiply Q in U by left vectors bidiagonalizing R.
						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n,
							a, lda, work[itauq:], u, ldu, work[iwork:], lwork-iwork)
						iwork = ie + n

						// Perform bidiagonal QR iteration, computing left
						// singular vectors of A in U.
						ok = impl.Dbdsqr(blas.Upper, n, 0, m, 0, s, work[ie:], dum, 1,
							u, ldu, dum, 1, work[iwork:])
					}
				} else if wantvo {
					// Path 5
					panic(noSVDO)
				} else if wantvas {
					// Path 6
					if lwork >= n*n+max(4*n, bdspac) {
						// Sufficient workspace for a fast algorithm.
						iu := 0
						var ldworku int
						if lwork >= wrkbl+lda*n {
							ldworku = lda
						} else {
							ldworku = n
						}
						itau := iu + ldworku*n
						iwork := itau + n

						// Compute A = Q * R.
						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						// Copy R to work[iu:], zeroing out below it.
						impl.Dlacpy(blas.Upper, n, n, a, lda, work[iu:], ldworku)
						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[iu+ldworku:], ldworku)

						// Generate Q in A.
						impl.Dorgqr(m, n, n, a, lda, work[itau:], work[iwork:], lwork-iwork)

						ie := itau
						itauq := ie + n
						itaup := itauq + n
						iwork = itaup + n

						// Bidiagonalize R in work[iu:], copying result to VT.
						impl.Dgebrd(n, n, work[iu:], ldworku, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Upper, n, n, work[iu:], ldworku, vt, ldvt)

						// Generate left bidiagonalizing vectors in work[iu:].
						impl.Dorgbr(lapack.ApplyQ, n, n, n, work[iu:], ldworku,
							work[itauq:], work[iwork:], lwork-iwork)

						// Generate right bidiagonalizing vectors in VT.
						impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt,
							work[itaup:], work[iwork:], lwork-iwork)
						iwork = ie + n

						// Perform bidiagonal QR iteration, computing left singular
						// vectors of R in work[iu:], and computing right singular
						// vectors of R in VT.
						ok = impl.Dbdsqr(blas.Upper, n, n, n, 0, s, work[ie:],
							vt, ldvt, work[iu:], ldworku, dum, 1, work[iwork:])

						// Multiply Q in A by left singular vectors of R in
						// work[iu:], storing result in U.
						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, a, lda,
							work[iu:], ldworku, 0, u, ldu)
					} else {
						// Insufficient workspace for a fast algorithm.
						itau := 0
						iwork := itau + n

						// Compute A = Q * R, copying result to U.
						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)

						// Generate Q in U.
						impl.Dorgqr(m, n, n, u, ldu, work[itau:], work[iwork:], lwork-iwork)

						// Copy R to VT, zeroing out below it.
						impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt)

						ie := itau
						itauq := ie + n
						itaup := itauq + n
						iwork = itaup + n

						// Bidiagonalize R in VT.
						impl.Dgebrd(n, n, vt, ldvt, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Multiply Q in U by left bidiagonalizing vectors in VT.
						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n,
							vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork)

						// Generate right bidiagonalizing vectors in VT.
						impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt,
							work[itaup:], work[iwork:], lwork-iwork)
						iwork = ie + n

						// Perform bidiagonal QR iteration, computing left singular
						// vectors of A in U and computing right singular vectors
						// of A in VT.
						ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:],
							vt, ldvt, u, ldu, dum, 1, work[iwork:])
					}
				}
			} else if wantua {
				if wantvn {
					// Path 7
					if lwork >= n*n+max(max(n+m, 4*n), bdspac) {
						// Sufficient workspace for a fast algorithm.
						ir := 0
						var ldworkr int
						if lwork >= wrkbl+lda*n {
							ldworkr = lda
						} else {
							ldworkr = n
						}
						itau := ir + ldworkr*n
						iwork := itau + n

						// Compute A = Q*R, copying result to U.
						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)

						// Copy R to work[ir:], zeroing out below it.
						impl.Dlacpy(blas.Upper, n, n, a, lda, work[ir:], ldworkr)
						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[ir+ldworkr:], ldworkr)

						// Generate Q in U.
						impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork)
						ie := itau
						itauq := ie + n
						itaup := itauq + n
						iwork = itaup + n

						// Bidiagonalize R in work[ir:].
						impl.Dgebrd(n, n, work[ir:], ldworkr, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Generate left bidiagonalizing vectors in work[ir:].
						impl.Dorgbr(lapack.ApplyQ, n, n, n, work[ir:], ldworkr,
							work[itauq:], work[iwork:], lwork-iwork)
						iwork = ie + n

						// Perform bidiagonal QR iteration, computing left singular
						// vectors of R in work[ir:].
						ok = impl.Dbdsqr(blas.Upper, n, 0, n, 0, s, work[ie:], dum, 1,
							work[ir:], ldworkr, dum, 1, work[iwork:])

						// Multiply Q in U by left singular vectors of R in
						// work[ir:], storing result in A.
						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, u, ldu,
							work[ir:], ldworkr, 0, a, lda)

						// Copy left singular vectors of A from A to U.
						impl.Dlacpy(blas.All, m, n, a, lda, u, ldu)
					} else {
						// Insufficient workspace for a fast algorithm.
						itau := 0
						iwork := itau + n

						// Compute A = Q*R, copying result to U.
						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)

						// Generate Q in U.
						impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork)
						ie := itau
						itauq := ie + n
						itaup := itauq + n
						iwork = itaup + n

						// Zero out below R in A.
						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda)

						// Bidiagonalize R in A.
						impl.Dgebrd(n, n, a, lda, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Multiply Q in U by left bidiagonalizing vectors in A.
						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n,
							a, lda, work[itauq:], u, ldu, work[iwork:], lwork-iwork)
						iwork = ie + n

						// Perform bidiagonal QR iteration, computing left
						// singular vectors of A in U.
						ok = impl.Dbdsqr(blas.Upper, n, 0, m, 0, s, work[ie:],
							dum, 1, u, ldu, dum, 1, work[iwork:])
					}
				} else if wantvo {
					// Path 8.
					panic(noSVDO)
				} else if wantvas {
					// Path 9.
					if lwork >= n*n+max(max(n+m, 4*n), bdspac) {
						// Sufficient workspace for a fast algorithm.
						iu := 0
						var ldworku int
						if lwork >= wrkbl+lda*n {
							ldworku = lda
						} else {
							ldworku = n
						}
						itau := iu + ldworku*n
						iwork := itau + n

						// Compute A = Q * R, copying result to U.
						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)

						// Generate Q in U.
						impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork)

						// Copy R to work[iu:], zeroing out below it.
						impl.Dlacpy(blas.Upper, n, n, a, lda, work[iu:], ldworku)
						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[iu+ldworku:], ldworku)

						ie = itau
						itauq := ie + n
						itaup := itauq + n
						iwork = itaup + n

						// Bidiagonalize R in work[iu:], copying result to VT.
						impl.Dgebrd(n, n, work[iu:], ldworku, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Upper, n, n, work[iu:], ldworku, vt, ldvt)

						// Generate left bidiagonalizing vectors in work[iu:].
						impl.Dorgbr(lapack.ApplyQ, n, n, n, work[iu:], ldworku,
							work[itauq:], work[iwork:], lwork-iwork)

						// Generate right bidiagonalizing vectors in VT.
						impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt,
							work[itaup:], work[iwork:], lwork-iwork)
						iwork = ie + n

						// Perform bidiagonal QR iteration, computing left singular
						// vectors of R in work[iu:] and computing right
						// singular vectors of R in VT.
						ok = impl.Dbdsqr(blas.Upper, n, n, n, 0, s, work[ie:],
							vt, ldvt, work[iu:], ldworku, dum, 1, work[iwork:])

						// Multiply Q in U by left singular vectors of R in
						// work[iu:], storing result in A.
						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1,
							u, ldu, work[iu:], ldworku, 0, a, lda)

						// Copy left singular vectors of A from A to U.
						impl.Dlacpy(blas.All, m, n, a, lda, u, ldu)

						/*
							// Bidiagonalize R in VT.
							impl.Dgebrd(n, n, vt, ldvt, s, work[ie:],
								work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

							// Multiply Q in U by left bidiagonalizing vectors in VT.
							impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans,
								m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork)

							// Generate right bidiagonalizing vectors in VT.
							impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt,
								work[itaup:], work[iwork:], lwork-iwork)
							iwork = ie + n

							// Perform bidiagonal QR iteration, computing left singular
							// vectors of A in U and computing right singular vectors
							// of A in VT.
							ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:],
								vt, ldvt, u, ldu, dum, 1, work[iwork:])
						*/
					} else {
						// Insufficient workspace for a fast algorithm.
						itau := 0
						iwork := itau + n

						// Compute A = Q*R, copying result to U.
						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)

						// Generate Q in U.
						impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork)

						// Copy R from A to VT, zeroing out below it.
						impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt)

						ie := itau
						itauq := ie + n
						itaup := itauq + n
						iwork = itaup + n

						// Bidiagonalize R in VT.
						impl.Dgebrd(n, n, vt, ldvt, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Multiply Q in U by left bidiagonalizing vectors in VT.
						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans,
							m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork)

						// Generate right bidiagonizing vectors in VT.
						impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt,
							work[itaup:], work[iwork:], lwork-iwork)
						iwork = ie + n

						// Perform bidiagonal QR iteration, computing left singular
						// vectors of A in U and computing right singular vectors
						// of A in VT.
						impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:],
							vt, ldvt, u, ldu, dum, 1, work[iwork:])
					}
				}
			}
		} else {
			// Path 10.
			// M at least N, but not much larger.
			ie = 0
			itauq := ie + n
			itaup := itauq + n
			iwork := itaup + n

			// Bidiagonalize A.
			impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:],
				work[itaup:], work[iwork:], lwork-iwork)
			if wantuas {
				// Left singular vectors are desired in U. Copy result to U and
				// generate left biadiagonalizing vectors in U.
				impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
				var ncu int
				if wantus {
					ncu = n
				}
				if wantua {
					ncu = m
				}
				impl.Dorgbr(lapack.ApplyQ, m, ncu, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
			}
			if wantvas {
				// Right singular vectors are desired in VT. Copy result to VT and
				// generate left biadiagonalizing vectors in VT.
				impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
				impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
			}
			if wantuo {
				panic(noSVDO)
			}
			if wantvo {
				panic(noSVDO)
			}
			iwork = ie + n
			var nru, ncvt int
			if wantuas || wantuo {
				nru = m
			}
			if wantun {
				nru = 0
			}
			if wantvas || wantvo {
				ncvt = n
			}
			if wantvn {
				ncvt = 0
			}
			if !wantuo && !wantvo {
				// Perform bidiagonal QR iteration, if desired, computing left
				// singular vectors in U and right singular vectors in VT.
				ok = impl.Dbdsqr(blas.Upper, n, ncvt, nru, 0, s, work[ie:],
					vt, ldvt, u, ldu, dum, 1, work[iwork:])
			} else {
				// There will be two branches when the implementation is complete.
				panic(noSVDO)
			}
		}
	} else {
		// A has more columns than rows. If A has sufficiently more columns than
		// rows, first reduce using the LQ decomposition.
		if n >= mnthr {
			// n >> m.
			if wantvn {
				// Path 1t.
				itau := 0
				iwork := itau + m

				// Compute A = L*Q.
				impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)

				// Zero out above L.
				impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda)
				ie := 0
				itauq := ie + m
				itaup := itauq + m
				iwork = itaup + m

				// Bidiagonalize L in A.
				impl.Dgebrd(m, m, a, lda, s, work[ie:itauq],
					work[itauq:itaup], work[itaup:iwork], work[iwork:], lwork-iwork)
				if wantuo || wantuas {
					impl.Dorgbr(lapack.ApplyQ, m, m, m, a, lda,
						work[itauq:], work[iwork:], lwork-iwork)
				}
				iwork = ie + m
				nru := 0
				if wantuo || wantuas {
					nru = m
				}

				// Perform bidiagonal QR iteration, computing left singular vectors
				// of A in A if desired.
				ok = impl.Dbdsqr(blas.Upper, m, 0, nru, 0, s, work[ie:],
					dum, 1, a, lda, dum, 1, work[iwork:])

				// If left singular vectors desired in U, copy them there.
				if wantuas {
					impl.Dlacpy(blas.All, m, m, a, lda, u, ldu)
				}
			} else if wantvo && wantun {
				// Path 2t.
				panic(noSVDO)
			} else if wantvo && wantuas {
				// Path 3t.
				panic(noSVDO)
			} else if wantvs {
				if wantun {
					// Path 4t.
					if lwork >= m*m+max(4*m, bdspac) {
						// Sufficient workspace for a fast algorithm.
						ir := 0
						var ldworkr int
						if lwork >= wrkbl+lda*m {
							ldworkr = lda
						} else {
							ldworkr = m
						}
						itau := ir + ldworkr*m
						iwork := itau + m

						// Compute A = L*Q.
						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)

						// Copy L to work[ir:], zeroing out above it.
						impl.Dlacpy(blas.Lower, m, m, a, lda, work[ir:], ldworkr)
						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[ir+1:], ldworkr)

						// Generate Q in A.
						impl.Dorglq(m, n, m, a, lda, work[itau:], work[iwork:], lwork-iwork)
						ie := itau
						itauq := ie + m
						itaup := itauq + m
						iwork = itaup + m

						// Bidiagonalize L in work[ir:].
						impl.Dgebrd(m, m, work[ir:], ldworkr, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Generate right vectors bidiagonalizing L in work[ir:].
						impl.Dorgbr(lapack.ApplyP, m, m, m, work[ir:], ldworkr,
							work[itaup:], work[iwork:], lwork-iwork)
						iwork = ie + m

						// Perform bidiagonal QR iteration, computing right singular
						// vectors of L in work[ir:].
						ok = impl.Dbdsqr(blas.Upper, m, m, 0, 0, s, work[ie:],
							work[ir:], ldworkr, dum, 1, dum, 1, work[iwork:])

						// Multiply right singular vectors of L in work[ir:] by
						// Q in A, storing result in VT.
						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
							work[ir:], ldworkr, a, lda, 0, vt, ldvt)
					} else {
						// Insufficient workspace for a fast algorithm.
						itau := 0
						iwork := itau + m

						// Compute A = L*Q.
						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)

						// Copy result to VT.
						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)

						// Generate Q in VT.
						impl.Dorglq(m, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
						ie := itau
						itauq := ie + m
						itaup := itauq + m
						iwork = itaup + m

						// Zero out above L in A.
						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda)

						// Bidiagonalize L in A.
						impl.Dgebrd(m, m, a, lda, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Multiply right vectors bidiagonalizing L by Q in VT.
						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
							a, lda, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
						iwork = ie + m

						// Perform bidiagonal QR iteration, computing right
						// singular vectors of A in VT.
						ok = impl.Dbdsqr(blas.Upper, m, n, 0, 0, s, work[ie:],
							vt, ldvt, dum, 1, dum, 1, work[iwork:])
					}
				} else if wantuo {
					// Path 5t.
					panic(noSVDO)
				} else if wantuas {
					// Path 6t.
					if lwork >= m*m+max(4*m, bdspac) {
						// Sufficient workspace for a fast algorithm.
						iu := 0
						var ldworku int
						if lwork >= wrkbl+lda*m {
							ldworku = lda
						} else {
							ldworku = m
						}
						itau := iu + ldworku*m
						iwork := itau + m

						// Compute A = L*Q.
						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)

						// Copy L to work[iu:], zeroing out above it.
						impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku)
						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+1:], ldworku)

						// Generate Q in A.
						impl.Dorglq(m, n, m, a, lda, work[itau:], work[iwork:], lwork-iwork)
						ie := itau
						itauq := ie + m
						itaup := itauq + m
						iwork = itaup + m

						// Bidiagonalize L in work[iu:], copying result to U.
						impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu)

						// Generate right bidiagionalizing vectors in work[iu:].
						impl.Dorgbr(lapack.ApplyP, m, m, m, work[iu:], ldworku,
							work[itaup:], work[iwork:], lwork-iwork)

						// Generate left bidiagonalizing vectors in U.
						impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
						iwork = ie + m

						// Perform bidiagonal QR iteration, computing left singular
						// vectors of L in U and computing right singular vectors of
						// L in work[iu:].
						ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:],
							work[iu:], ldworku, u, ldu, dum, 1, work[iwork:])

						// Multiply right singular vectors of L in work[iu:] by
						// Q in A, storing result in VT.
						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
							work[iu:], ldworku, a, lda, 0, vt, ldvt)
					} else {
						// Insufficient workspace for a fast algorithm.
						itau := 0
						iwork := itau + m

						// Compute A = L*Q, copying result to VT.
						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)

						// Generate Q in VT.
						impl.Dorglq(m, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)

						// Copy L to U, zeroing out above it.
						impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu)

						ie := itau
						itauq := ie + m
						itaup := itauq + m
						iwork = itaup + m

						// Bidiagonalize L in U.
						impl.Dgebrd(m, m, u, ldu, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Multiply right bidiagonalizing vectors in U by Q in VT.
						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
							u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)

						// Generate left bidiagonalizing vectors in U.
						impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
						iwork = ie + m

						// Perform bidiagonal QR iteration, computing left singular
						// vectors of A in U and computing right singular vectors
						// of A in VT.
						impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:], vt, ldvt,
							u, ldu, dum, 1, work[iwork:])
					}
				}
			} else if wantva {
				if wantun {
					// Path 7t.
					if lwork >= m*m+max(max(n+m, 4*m), bdspac) {
						// Sufficient workspace for a fast algorithm.
						ir := 0
						var ldworkr int
						if lwork >= wrkbl+lda*m {
							ldworkr = lda
						} else {
							ldworkr = m
						}
						itau := ir + ldworkr*m
						iwork := itau + m

						// Compute A = L*Q, copying result to VT.
						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)

						// Copy L to work[ir:], zeroing out above it.
						impl.Dlacpy(blas.Lower, m, m, a, lda, work[ir:], ldworkr)
						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[ir+1:], ldworkr)

						// Generate Q in VT.
						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)

						ie := itau
						itauq := ie + m
						itaup := itauq + m
						iwork = itaup + m

						// Bidiagonalize L in work[ir:].
						impl.Dgebrd(m, m, work[ir:], ldworkr, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)

						// Generate right bidiagonalizing vectors in work[ir:].
						impl.Dorgbr(lapack.ApplyP, m, m, m, work[ir:], ldworkr,
							work[itaup:], work[iwork:], lwork-iwork)
						iwork = ie + m

						// Perform bidiagonal QR iteration, computing right
						// singular vectors of L in work[ir:].
						ok = impl.Dbdsqr(blas.Upper, m, m, 0, 0, s, work[ie:],
							work[ir:], ldworkr, dum, 1, dum, 1, work[iwork:])

						// Multiply right singular vectors of L in work[ir:] by
						// Q in VT, storing result in A.
						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
							work[ir:], ldworkr, vt, ldvt, 0, a, lda)

						// Copy right singular vectors of A from A to VT.
						impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt)
					} else {
						// Insufficient workspace for a fast algorithm.
						itau := 0
						iwork := itau + m
						// Compute A = L * Q, copying result to VT.
						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)

						// Generate Q in VT.
						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)

						ie := itau
						itauq := ie + m
						itaup := itauq + m
						iwork = itaup + m

						// Zero out above L in A.
						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda)

						// Bidiagonalize L in A.
						impl.Dgebrd(m, m, a, lda, s, work[ie:], work[itauq:],
							work[itaup:], work[iwork:], lwork-iwork)

						// Multiply right bidiagonalizing vectors in A by Q in VT.
						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
							a, lda, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
						iwork = ie + m

						// Perform bidiagonal QR iteration, computing right singular
						// vectors of A in VT.
						ok = impl.Dbdsqr(blas.Upper, m, n, 0, 0, s, work[ie:],
							vt, ldvt, dum, 1, dum, 1, work[iwork:])
					}
				} else if wantuo {
					panic(noSVDO)
				} else if wantuas {
					// Path 9t.
					if lwork >= m*m+max(max(m+n, 4*m), bdspac) {
						// Sufficient workspace for a fast algorithm.
						iu := 0

						var ldworku int
						if lwork >= wrkbl+lda*m {
							ldworku = lda
						} else {
							ldworku = m
						}
						itau := iu + ldworku*m
						iwork := itau + m

						// Generate A = L * Q copying result to VT.
						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)

						// Generate Q in VT.
						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)

						// Copy L to work[iu:], zeroing out above it.
						impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku)
						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+1:], ldworku)
						ie = itau
						itauq := ie + m
						itaup := itauq + m
						iwork = itaup + m

						// Bidiagonalize L in work[iu:], copying result to U.
						impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:],
							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu)

						// Generate right bidiagonalizing vectors in work[iu:].
						impl.Dorgbr(lapack.ApplyP, m, m, m, work[iu:], ldworku,
							work[itaup:], work[iwork:], lwork-iwork)

						// Generate left bidiagonalizing vectors in U.
						impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
						iwork = ie + m

						// Perform bidiagonal QR iteration, computing left singular
						// vectors of L in U and computing right singular vectors
						// of L in work[iu:].
						ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:],
							work[iu:], ldworku, u, ldu, dum, 1, work[iwork:])

						// Multiply right singular vectors of L in work[iu:]
						// Q in VT, storing result in A.
						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
							work[iu:], ldworku, vt, ldvt, 0, a, lda)

						// Copy right singular vectors of A from A to VT.
						impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt)
					} else {
						// Insufficient workspace for a fast algorithm.
						itau := 0
						iwork := itau + m

						// Compute A = L * Q, copying result to VT.
						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)

						// Generate Q in VT.
						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)

						// Copy L to U, zeroing out above it.
						impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu)

						ie = itau
						itauq := ie + m
						itaup := itauq + m
						iwork = itaup + m

						// Bidiagonalize L in U.
						impl.Dgebrd(m, m, u, ldu, s, work[ie:], work[itauq:],
							work[itaup:], work[iwork:], lwork-iwork)

						// Multiply right bidiagonalizing vectors in U by Q in VT.
						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
							u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)

						// Generate left bidiagonalizing vectors in U.
						impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
						iwork = ie + m

						// Perform bidiagonal QR iteration, computing left singular
						// vectors of A in U and computing right singular vectors
						// of A in VT.
						ok = impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:],
							vt, ldvt, u, ldu, dum, 1, work[iwork:])
					}
				}
			}
		} else {
			// Path 10t.
			// N at least M, but not much larger.
			ie = 0
			itauq := ie + m
			itaup := itauq + m
			iwork := itaup + m

			// Bidiagonalize A.
			impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
			if wantuas {
				// If left singular vectors desired in U, copy result to U and
				// generate left bidiagonalizing vectors in U.
				impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
				impl.Dorgbr(lapack.ApplyQ, m, m, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
			}
			if wantvas {
				// If right singular vectors desired in VT, copy result to VT
				// and generate right bidiagonalizing vectors in VT.
				impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
				var nrvt int
				if wantva {
					nrvt = n
				} else {
					nrvt = m
				}
				impl.Dorgbr(lapack.ApplyP, nrvt, n, m, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
			}
			if wantuo {
				panic(noSVDO)
			}
			if wantvo {
				panic(noSVDO)
			}
			iwork = ie + m
			var nru, ncvt int
			if wantuas || wantuo {
				nru = m
			}
			if wantvas || wantvo {
				ncvt = n
			}
			if !wantuo && !wantvo {
				// Perform bidiagonal QR iteration, if desired, computing left
				// singular vectors in U and computing right singular vectors in
				// VT.
				ok = impl.Dbdsqr(blas.Lower, m, ncvt, nru, 0, s, work[ie:],
					vt, ldvt, u, ldu, dum, 1, work[iwork:])
			} else {
				// There will be two branches when the implementation is complete.
				panic(noSVDO)
			}
		}
	}
	if !ok {
		if ie > 1 {
			for i := 0; i < minmn-1; i++ {
				work[i+1] = work[i+ie]
			}
		}
		if ie < 1 {
			for i := minmn - 2; i >= 0; i-- {
				work[i+1] = work[i+ie]
			}
		}
	}
	// Undo scaling if necessary.
	if iscl {
		if anrm > bignum {
			impl.Dlascl(lapack.General, 0, 0, bignum, anrm, minmn, 1, s, minmn)
		}
		if !ok && anrm > bignum {
			impl.Dlascl(lapack.General, 0, 0, bignum, anrm, minmn-1, 1, work[minmn:], minmn)
		}
		if anrm < smlnum {
			impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, minmn, 1, s, minmn)
		}
		if !ok && anrm < smlnum {
			impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, minmn-1, 1, work[minmn:], minmn)
		}
	}
	work[0] = float64(maxwrk)
	return ok
}
Esempio n. 25
0
// Dgebal balances an n×n matrix A. Balancing consists of two stages, permuting
// and scaling. Both steps are optional and depend on the value of job.
//
// Permuting consists of applying a permutation matrix P such that the matrix
// that results from P^T*A*P takes the upper block triangular form
//            [ T1  X  Y  ]
//  P^T A P = [  0  B  Z  ],
//            [  0  0  T2 ]
// where T1 and T2 are upper triangular matrices and B contains at least one
// nonzero off-diagonal element in each row and column. The indices ilo and ihi
// mark the starting and ending columns of the submatrix B. The eigenvalues of A
// isolated in the first 0 to ilo-1 and last ihi+1 to n-1 elements on the
// diagonal can be read off without any roundoff error.
//
// Scaling consists of applying a diagonal similarity transformation D such that
// D^{-1}*B*D has the 1-norm of each row and its corresponding column nearly
// equal. The output matrix is
//  [ T1     X*D          Y    ]
//  [  0  inv(D)*B*D  inv(D)*Z ].
//  [  0      0           T2   ]
// Scaling may reduce the 1-norm of the matrix, and improve the accuracy of
// the computed eigenvalues and/or eigenvectors.
//
// job specifies the operations that will be performed on A.
// If job is lapack.None, Dgebal sets scale[i] = 1 for all i and returns ilo=0, ihi=n-1.
// If job is lapack.Permute, only permuting will be done.
// If job is lapack.Scale, only scaling will be done.
// If job is lapack.PermuteScale, both permuting and scaling will be done.
//
// On return, if job is lapack.Permute or lapack.PermuteScale, it will hold that
//  A[i,j] == 0,   for i > j and j ∈ {0, ..., ilo-1, ihi+1, ..., n-1}.
// If job is lapack.None or lapack.Scale, or if n == 0, it will hold that
//  ilo == 0 and ihi == n-1.
//
// On return, scale will contain information about the permutations and scaling
// factors applied to A. If π(j) denotes the index of the column interchanged
// with column j, and D[j,j] denotes the scaling factor applied to column j,
// then
//  scale[j] == π(j),     for j ∈ {0, ..., ilo-1, ihi+1, ..., n-1},
//           == D[j,j],   for j ∈ {ilo, ..., ihi}.
// scale must have length equal to n, otherwise Dgebal will panic.
//
// Dgebal is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dgebal(job lapack.Job, n int, a []float64, lda int, scale []float64) (ilo, ihi int) {
	switch job {
	default:
		panic(badJob)
	case lapack.None, lapack.Permute, lapack.Scale, lapack.PermuteScale:
	}
	checkMatrix(n, n, a, lda)
	if len(scale) != n {
		panic("lapack: bad length of scale")
	}

	ilo = 0
	ihi = n - 1

	if n == 0 || job == lapack.None {
		for i := range scale {
			scale[i] = 1
		}
		return ilo, ihi
	}

	bi := blas64.Implementation()
	swapped := true

	if job == lapack.Scale {
		goto scaling
	}

	// Permutation to isolate eigenvalues if possible.
	//
	// Search for rows isolating an eigenvalue and push them down.
	for swapped {
		swapped = false
	rows:
		for i := ihi; i >= 0; i-- {
			for j := 0; j <= ihi; j++ {
				if i == j {
					continue
				}
				if a[i*lda+j] != 0 {
					continue rows
				}
			}
			// Row i has only zero off-diagonal elements in the
			// block A[ilo:ihi+1,ilo:ihi+1].
			scale[ihi] = float64(i)
			if i != ihi {
				bi.Dswap(ihi+1, a[i:], lda, a[ihi:], lda)
				bi.Dswap(n, a[i*lda:], 1, a[ihi*lda:], 1)
			}
			if ihi == 0 {
				scale[0] = 1
				return ilo, ihi
			}
			ihi--
			swapped = true
			break
		}
	}
	// Search for columns isolating an eigenvalue and push them left.
	swapped = true
	for swapped {
		swapped = false
	columns:
		for j := ilo; j <= ihi; j++ {
			for i := ilo; i <= ihi; i++ {
				if i == j {
					continue
				}
				if a[i*lda+j] != 0 {
					continue columns
				}
			}
			// Column j has only zero off-diagonal elements in the
			// block A[ilo:ihi+1,ilo:ihi+1].
			scale[ilo] = float64(j)
			if j != ilo {
				bi.Dswap(ihi+1, a[j:], lda, a[ilo:], lda)
				bi.Dswap(n-ilo, a[j*lda+ilo:], 1, a[ilo*lda+ilo:], 1)
			}
			swapped = true
			ilo++
			break
		}
	}

scaling:
	for i := ilo; i <= ihi; i++ {
		scale[i] = 1
	}

	if job == lapack.Permute {
		return ilo, ihi
	}

	// Balance the submatrix in rows ilo to ihi.

	const (
		// sclfac should be a power of 2 to avoid roundoff errors.
		// Elements of scale are restricted to powers of sclfac,
		// therefore the matrix will be only nearly balanced.
		sclfac = 2
		// factor determines the minimum reduction of the row and column
		// norms that is considered non-negligible. It must be less than 1.
		factor = 0.95
	)
	sfmin1 := dlamchS / dlamchP
	sfmax1 := 1 / sfmin1
	sfmin2 := sfmin1 * sclfac
	sfmax2 := 1 / sfmin2

	// Iterative loop for norm reduction.
	var conv bool
	for !conv {
		conv = true
		for i := ilo; i <= ihi; i++ {
			c := bi.Dnrm2(ihi-ilo+1, a[ilo*lda+i:], lda)
			r := bi.Dnrm2(ihi-ilo+1, a[i*lda+ilo:], 1)
			ica := bi.Idamax(ihi+1, a[i:], lda)
			ca := math.Abs(a[ica*lda+i])
			ira := bi.Idamax(n-ilo, a[i*lda+ilo:], 1)
			ra := math.Abs(a[i*lda+ilo+ira])

			// Guard against zero c or r due to underflow.
			if c == 0 || r == 0 {
				continue
			}
			g := r / sclfac
			f := 1.0
			s := c + r
			for c < g && math.Max(f, math.Max(c, ca)) < sfmax2 && math.Min(r, math.Min(g, ra)) > sfmin2 {
				if math.IsNaN(c + f + ca + r + g + ra) {
					// Panic if NaN to avoid infinite loop.
					panic("lapack: NaN")
				}
				f *= sclfac
				c *= sclfac
				ca *= sclfac
				g /= sclfac
				r /= sclfac
				ra /= sclfac
			}
			g = c / sclfac
			for r <= g && math.Max(r, ra) < sfmax2 && math.Min(math.Min(f, c), math.Min(g, ca)) > sfmin2 {
				f /= sclfac
				c /= sclfac
				ca /= sclfac
				g /= sclfac
				r *= sclfac
				ra *= sclfac
			}

			if c+r >= factor*s {
				// Reduction would be negligible.
				continue
			}
			if f < 1 && scale[i] < 1 && f*scale[i] <= sfmin1 {
				continue
			}
			if f > 1 && scale[i] > 1 && scale[i] >= sfmax1/f {
				continue
			}

			// Now balance.
			scale[i] *= f
			bi.Dscal(n-ilo, 1/f, a[i*lda+ilo:], 1)
			bi.Dscal(ihi+1, f, a[i:], lda)
			conv = false
		}
	}
	return ilo, ihi
}
Esempio n. 26
0
// Dsteqr computes the eigenvalues and optionally the eigenvectors of a symmetric
// tridiagonal matrix using the implicit QL or QR method. The eigenvectors of a
// full or band symmetric matrix can also be found if Dsytrd, Dsptrd, or Dsbtrd
// have been used to reduce this matrix to tridiagonal form.
//
// d, on entry, contains the diagonal elements of the tridiagonal matrix. On exit,
// d contains the eigenvalues in ascending order. d must have length n and
// Dsteqr will panic otherwise.
//
// e, on entry, contains the off-diagonal elements of the tridiagonal matrix on
// entry, and is overwritten during the call to Dsteqr. e must have length n-1 and
// Dsteqr will panic otherwise.
//
// z, on entry, contains the n×n orthogonal matrix used in the reduction to
// tridiagonal form if compz == lapack.EigDecomp. On exit, if
// compz == lapack.EigBoth, z contains the orthonormal eigenvectors of the
// original symmetric matrix, and if compz == lapack.EigDecomp, z contains the
// orthonormal eigenvectors of the symmetric tridiagonal matrix. z is not used
// if compz == lapack.EigValueOnly.
//
// work must have length at least max(1, 2*n-2) if the eigenvectors are computed,
// and Dsteqr will panic otherwise.
//
// Dsteqr is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dsteqr(compz lapack.EigComp, n int, d, e, z []float64, ldz int, work []float64) (ok bool) {
	if len(d) < n {
		panic(badD)
	}
	if len(e) < n-1 {
		panic(badE)
	}
	if compz != lapack.EigValueOnly && compz != lapack.EigBoth && compz != lapack.EigDecomp {
		panic(badEigComp)
	}
	if compz != lapack.EigValueOnly {
		if len(work) < max(1, 2*n-2) {
			panic(badWork)
		}
		checkMatrix(n, n, z, ldz)
	}

	var icompz int
	if compz == lapack.EigDecomp {
		icompz = 1
	} else if compz == lapack.EigBoth {
		icompz = 2
	}

	if n == 0 {
		return true
	}
	if n == 1 {
		if icompz == 2 {
			z[0] = 1
		}
		return true
	}

	bi := blas64.Implementation()

	eps := dlamchE
	eps2 := eps * eps
	safmin := dlamchS
	safmax := 1 / safmin
	ssfmax := math.Sqrt(safmax) / 3
	ssfmin := math.Sqrt(safmin) / eps2

	// Compute the eigenvalues and eigenvectors of the tridiagonal matrix.
	if icompz == 2 {
		impl.Dlaset(blas.All, n, n, 0, 1, z, ldz)
	}
	const maxit = 30
	nmaxit := n * maxit

	jtot := 0

	// Determine where the matrix splits and choose QL or QR iteration for each
	// block, according to whether top or bottom diagonal element is smaller.
	l1 := 0
	nm1 := n - 1

	type scaletype int
	const (
		none scaletype = iota
		down
		up
	)
	var iscale scaletype

	for {
		if l1 > n-1 {
			// Order eigenvalues and eigenvectors.
			if icompz == 0 {
				impl.Dlasrt(lapack.SortIncreasing, n, d)
			} else {
				// TODO(btracey): Consider replacing this sort with a call to sort.Sort.
				for ii := 1; ii < n; ii++ {
					i := ii - 1
					k := i
					p := d[i]
					for j := ii; j < n; j++ {
						if d[j] < p {
							k = j
							p = d[j]
						}
					}
					if k != i {
						d[k] = d[i]
						d[i] = p
						bi.Dswap(n, z[i:], ldz, z[k:], ldz)
					}
				}
			}
			return true
		}
		if l1 > 0 {
			e[l1-1] = 0
		}
		var m int
		if l1 <= nm1 {
			for m = l1; m < nm1; m++ {
				test := math.Abs(e[m])
				if test == 0 {
					break
				}
				if test <= (math.Sqrt(math.Abs(d[m]))*math.Sqrt(math.Abs(d[m+1])))*eps {
					e[m] = 0
					break
				}
			}
		}
		l := l1
		lsv := l
		lend := m
		lendsv := lend
		l1 = m + 1
		if lend == l {
			continue
		}

		// Scale submatrix in rows and columns L to Lend
		anorm := impl.Dlanst(lapack.MaxAbs, lend-l+1, d[l:], e[l:])
		switch {
		case anorm == 0:
			continue
		case anorm > ssfmax:
			iscale = down
			// TODO(btracey): Why is lda n?
			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l+1, 1, d[l:], n)
			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l, 1, e[l:], n)
		case anorm < ssfmin:
			iscale = up
			// TODO(btracey): Why is lda n?
			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l+1, 1, d[l:], n)
			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l, 1, e[l:], n)
		}

		// Choose between QL and QR.
		if math.Abs(d[lend]) < math.Abs(d[l]) {
			lend = lsv
			l = lendsv
		}
		if lend > l {
			// QL Iteration. Look for small subdiagonal element.
			for {
				if l != lend {
					for m = l; m < lend; m++ {
						v := math.Abs(e[m])
						if v*v <= (eps2*math.Abs(d[m]))*math.Abs(d[m+1])+safmin {
							break
						}
					}
				} else {
					m = lend
				}
				if m < lend {
					e[m] = 0
				}
				p := d[l]
				if m == l {
					// Eigenvalue found.
					l++
					if l > lend {
						break
					}
					continue
				}

				// If remaining matrix is 2×2, use Dlae2 to compute its eigensystem.
				if m == l+1 {
					if icompz > 0 {
						d[l], d[l+1], work[l], work[n-1+l] = impl.Dlaev2(d[l], e[l], d[l+1])
						impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward,
							n, 2, work[l:], work[n-1+l:], z[l:], ldz)
					} else {
						d[l], d[l+1] = impl.Dlae2(d[l], e[l], d[l+1])
					}
					e[l] = 0
					l += 2
					if l > lend {
						break
					}
					continue
				}

				if jtot == nmaxit {
					break
				}
				jtot++

				// Form shift
				g := (d[l+1] - p) / (2 * e[l])
				r := impl.Dlapy2(g, 1)
				g = d[m] - p + e[l]/(g+math.Copysign(r, g))
				s := 1.0
				c := 1.0
				p = 0.0

				// Inner loop
				for i := m - 1; i >= l; i-- {
					f := s * e[i]
					b := c * e[i]
					c, s, r = impl.Dlartg(g, f)
					if i != m-1 {
						e[i+1] = r
					}
					g = d[i+1] - p
					r = (d[i]-g)*s + 2*c*b
					p = s * r
					d[i+1] = g + p
					g = c*r - b

					// If eigenvectors are desired, then save rotations.
					if icompz > 0 {
						work[i] = c
						work[n-1+i] = -s
					}
				}
				// If eigenvectors are desired, then apply saved rotations.
				if icompz > 0 {
					mm := m - l + 1
					impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward,
						n, mm, work[l:], work[n-1+l:], z[l:], ldz)
				}
				d[l] -= p
				e[l] = g
			}
		} else {
			// QR Iteration.
			// Look for small superdiagonal element.
			for {
				if l != lend {
					for m = l; m > lend; m-- {
						v := math.Abs(e[m-1])
						if v*v <= (eps2*math.Abs(d[m])*math.Abs(d[m-1]) + safmin) {
							break
						}
					}
				} else {
					m = lend
				}
				if m > lend {
					e[m-1] = 0
				}
				p := d[l]
				if m == l {
					// Eigenvalue found
					l--
					if l < lend {
						break
					}
					continue
				}

				// If remaining matrix is 2×2, use Dlae2 to compute its eigenvalues.
				if m == l-1 {
					if icompz > 0 {
						d[l-1], d[l], work[m], work[n-1+m] = impl.Dlaev2(d[l-1], e[l-1], d[l])
						impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward,
							n, 2, work[m:], work[n-1+m:], z[l-1:], ldz)
					} else {
						d[l-1], d[l] = impl.Dlae2(d[l-1], e[l-1], d[l])
					}
					e[l-1] = 0
					l -= 2
					if l < lend {
						break
					}
					continue
				}
				if jtot == nmaxit {
					break
				}
				jtot++

				// Form shift.
				g := (d[l-1] - p) / (2 * e[l-1])
				r := impl.Dlapy2(g, 1)
				g = d[m] - p + (e[l-1])/(g+math.Copysign(r, g))
				s := 1.0
				c := 1.0
				p = 0.0

				// Inner loop.
				for i := m; i < l; i++ {
					f := s * e[i]
					b := c * e[i]
					c, s, r = impl.Dlartg(g, f)
					if i != m {
						e[i-1] = r
					}
					g = d[i] - p
					r = (d[i+1]-g)*s + 2*c*b
					p = s * r
					d[i] = g + p
					g = c*r - b

					// If eigenvectors are desired, then save rotations.
					if icompz > 0 {
						work[i] = c
						work[n-1+i] = s
					}
				}

				// If eigenvectors are desired, then apply saved rotations.
				if icompz > 0 {
					mm := l - m + 1
					impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward,
						n, mm, work[m:], work[n-1+m:], z[m:], ldz)
				}
				d[l] -= p
				e[l-1] = g
			}
		}

		// Undo scaling if necessary.
		switch iscale {
		case down:
			impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv+1, 1, d[lsv:], n)
			impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv, 1, e[lsv:], n)
		case up:
			impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv+1, 1, d[lsv:], n)
			impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv, 1, e[lsv:], n)
		}

		// Check for no convergence to an eigenvalue after a total of n*maxit iterations.
		if jtot >= nmaxit {
			break
		}
	}
	for i := 0; i < n-1; i++ {
		if e[i] != 0 {
			return false
		}
	}
	return true
}
Esempio n. 27
0
// Dsytd2 reduces a symmetric n×n matrix A to symmetric tridiagonal form T by an
// orthogonal similarity transformation
//  Q^T * A * Q = T
// On entry, the matrix is contained in the specified triangle of a. On exit,
// if uplo == blas.Upper, the diagonal and first super-diagonal of a are
// overwritten with the elements of T. The elements above the first super-diagonal
// are overwritten with the the elementary reflectors that are used with the
// elements written to tau in order to construct Q. If uplo == blas.Lower, the
// elements are written in the lower triangular region.
//
// d must have length at least n. e and tau must have length at least n-1. Dsytd2
// will panic if these sizes are not met.
//
// Q is represented as a product of elementary reflectors.
// If uplo == blas.Upper
//  Q = H_{n-2} * ... * H_1 * H_0
// and if uplo == blas.Lower
//  Q = H_0 * H_1 * ... * H_{n-2}
// where
//  H_i = I - tau * v * v^T
// where tau is stored in tau[i], and v is stored in a.
//
// If uplo == blas.Upper, v[0:i-1] is stored in A[0:i-1,i+1], v[i] = 1, and
// v[i+1:] = 0. The elements of a are
//  [ d   e  v2  v3  v4]
//  [     d   e  v3  v4]
//  [         d   e  v4]
//  [             d   e]
//  [                 d]
// If uplo == blas.Lower, v[0:i+1] = 0, v[i+1] = 1, and v[i+2:] is stored in
// A[i+2:n,i].
// The elements of a are
//  [ d                ]
//  [ e   d            ]
//  [v1   e   d        ]
//  [v1  v2   e   d    ]
//  [v1  v2  v3   e   d]
//
// Dsytd2 is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dsytd2(uplo blas.Uplo, n int, a []float64, lda int, d, e, tau []float64) {
	checkMatrix(n, n, a, lda)
	if len(d) < n {
		panic(badD)
	}
	if len(e) < n-1 {
		panic(badE)
	}
	if len(tau) < n-1 {
		panic(badTau)
	}
	if n <= 0 {
		return
	}
	bi := blas64.Implementation()
	if uplo == blas.Upper {
		// Reduce the upper triangle of A.
		for i := n - 2; i >= 0; i-- {
			// Generate elementary reflector H_i = I - tau * v * v^T to
			// annihilate A[i:i-1, i+1].
			var taui float64
			a[i*lda+i+1], taui = impl.Dlarfg(i+1, a[i*lda+i+1], a[i+1:], lda)
			e[i] = a[i*lda+i+1]
			if taui != 0 {
				// Apply H_i from both sides to A[0:i,0:i].
				a[i*lda+i+1] = 1

				// Compute x := tau * A * v storing x in tau[0:i].
				bi.Dsymv(uplo, i+1, taui, a, lda, a[i+1:], lda, 0, tau, 1)

				// Compute w := x - 1/2 * tau * (x^T * v) * v.
				alpha := -0.5 * taui * bi.Ddot(i+1, tau, 1, a[i+1:], lda)
				bi.Daxpy(i+1, alpha, a[i+1:], lda, tau, 1)

				// Apply the transformation as a rank-2 update
				// A = A - v * w^T - w * v^T.
				bi.Dsyr2(uplo, i+1, -1, a[i+1:], lda, tau, 1, a, lda)
				a[i*lda+i+1] = e[i]
			}
			d[i+1] = a[(i+1)*lda+i+1]
			tau[i] = taui
		}
		d[0] = a[0]
		return
	}
	// Reduce the lower triangle of A.
	for i := 0; i < n-1; i++ {
		// Generate elementary reflector H_i = I - tau * v * v^T to
		// annihilate A[i+2:n, i].
		var taui float64
		a[(i+1)*lda+i], taui = impl.Dlarfg(n-i-1, a[(i+1)*lda+i], a[min(i+2, n-1)*lda+i:], lda)
		e[i] = a[(i+1)*lda+i]
		if taui != 0 {
			// Apply H_i from both sides to A[i+1:n, i+1:n].
			a[(i+1)*lda+i] = 1

			// Compute x := tau * A * v, storing y in tau[i:n-1].
			bi.Dsymv(uplo, n-i-1, taui, a[(i+1)*lda+i+1:], lda, a[(i+1)*lda+i:], lda, 0, tau[i:], 1)

			// Compute w := x - 1/2 * tau * (x^T * v) * v.
			alpha := -0.5 * taui * bi.Ddot(n-i-1, tau[i:], 1, a[(i+1)*lda+i:], lda)
			bi.Daxpy(n-i-1, alpha, a[(i+1)*lda+i:], lda, tau[i:], 1)

			// Apply the transformation as a rank-2 update
			// A = A - v * w^T - w * v^T.
			bi.Dsyr2(uplo, n-i-1, -1, a[(i+1)*lda+i:], lda, tau[i:], 1, a[(i+1)*lda+i+1:], lda)
			a[(i+1)*lda+i] = e[i]
		}
		d[i] = a[i*lda+i]
		tau[i] = taui
	}
	d[n-1] = a[(n-1)*lda+n-1]
}
Esempio n. 28
0
// Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated
// block of a Hessenberg matrix.
//
// wantt and wantz determine whether the quasi-triangular Schur factor and the
// orthogonal Schur factor, respectively, will be computed.
//
// kacc22 specifies the computation mode of far-from-diagonal orthogonal
// updates. Permitted values are:
//  0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix
//     multiply to update far-from-diagonal matrix entries.
//  1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to
//     update far-from-diagonal matrix entries.
//  2: Dlaqr5 will accumulate reflections, use matrix-matrix multiply to update
//     far-from-diagonal matrix entries, and take advantage of 2×2 block
//     structure during matrix multiplies.
// For other values of kacc2 Dlaqr5 will panic.
//
// n is the order of the Hessenberg matrix H.
//
// ktop and kbot are indices of the first and last row and column of an isolated
// diagonal block upon which the QR sweep will be applied. It must hold that
//  ktop == 0,   or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and
//  kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0,
// otherwise Dlaqr5 will panic.
//
// nshfts is the number of simultaneous shifts. It must be positive and even,
// otherwise Dlaqr5 will panic.
//
// sr and si contain the real and imaginary parts, respectively, of the shifts
// of origin that define the multi-shift QR sweep. On return both slices may be
// reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5
// will panic.
//
// h and ldh represent the Hessenberg matrix H of size n×n. On return
// multi-shift QR sweep with shifts sr+i*si has been applied to the isolated
// diagonal block in rows and columns ktop through kbot, inclusive.
//
// iloz and ihiz specify the rows of Z to which transformations will be applied
// if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5
// will panic.
//
// z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep
// orthogonal similarity transformation is accumulated into
// z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced.
//
// v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V
// is transposed with respect to the reference netlib implementation.
//
// u and ldu represent an auxiliary matrix of size (3*nshfts-3)×(3*nshfts-3).
//
// wh and ldwh represent an auxiliary matrix of size (3*nshfts-3)×nh.
//
// wv and ldwv represent an auxiliary matrix of size nv×(3*nshfts-3).
//
// Dlaqr5 is an internal routine. It is exported for testing purposes.
func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) {
	checkMatrix(n, n, h, ldh)
	if ktop < 0 || n <= ktop {
		panic("lapack: invalid value of ktop")
	}
	if ktop > 0 && h[ktop*ldh+ktop-1] != 0 {
		panic("lapack: diagonal block is not isolated")
	}
	if kbot < 0 || n <= kbot {
		panic("lapack: invalid value of kbot")
	}
	if kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0 {
		panic("lapack: diagonal block is not isolated")
	}
	if nshfts < 0 || nshfts&0x1 != 0 {
		panic("lapack: invalid number of shifts")
	}
	if len(sr) != nshfts || len(si) != nshfts {
		panic(badSlice) // TODO(vladimir-ch) Another message?
	}
	if wantz {
		if ihiz >= n {
			panic("lapack: invalid value of ihiz")
		}
		if iloz < 0 || ihiz < iloz {
			panic("lapack: invalid value of iloz")
		}
		checkMatrix(n, n, z, ldz)
	}
	checkMatrix(nshfts/2, 3, v, ldv) // Transposed w.r.t. lapack.
	checkMatrix(3*nshfts-3, 3*nshfts-3, u, ldu)
	checkMatrix(nv, 3*nshfts-3, wv, ldwv)
	checkMatrix(3*nshfts-3, nh, wh, ldwh)
	if kacc22 != 0 && kacc22 != 1 && kacc22 != 2 {
		panic("lapack: invalid value of kacc22")
	}

	// If there are no shifts, then there is nothing to do.
	if nshfts < 2 {
		return
	}
	// If the active block is empty or 1×1, then there is nothing to do.
	if ktop >= kbot {
		return
	}

	// Shuffle shifts into pairs of real shifts and pairs of complex
	// conjugate shifts assuming complex conjugate shifts are already
	// adjacent to one another.
	for i := 0; i < nshfts-2; i += 2 {
		if si[i] == -si[i+1] {
			continue
		}
		sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i]
		si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i]
	}

	// Note: lapack says that nshfts must be even but allows it to be odd
	// anyway. At the moment, we panic above if nshfts is not even, so
	// reducing it by one is unnecessary (it seems that Dlaqr0 and Dlaqr4
	// indeed use only even nshfts).
	//
	// The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341:
	// *     ==== NSHFTS is supposed to be even, but if it is odd,
	// *     .    then simply reduce it by one.  The shuffle above
	// *     .    ensures that the dropped shift is real and that
	// *     .    the remaining shifts are paired. ====
	// *
	//      NS = NSHFTS - MOD( NSHFTS, 2 )
	ns := nshfts

	safmin := dlamchS
	ulp := dlamchP
	smlnum := safmin * float64(n) / ulp

	// Use accumulated reflections to update far-from-diagonal entries?
	accum := kacc22 == 1 || kacc22 == 2
	// If so, exploit the 2×2 block structure?
	blk22 := ns > 2 && kacc22 == 2

	// Clear trash.
	if ktop+2 <= kbot {
		h[(ktop+2)*ldh+ktop] = 0
	}

	// nbmps = number of 2-shift bulges in the chain.
	nbmps := ns / 2

	// kdu = width of slab.
	kdu := 6*nbmps - 3

	// Create and chase chains of nbmps bulges.
	for incol := 3*(1-nbmps) + ktop - 1; incol <= kbot-2; incol += 3*nbmps - 2 {
		ndcol := incol + kdu
		if accum {
			impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu)
		}

		// Near-the-diagonal bulge chase. The following loop performs
		// the near-the-diagonal part of a small bulge multi-shift QR
		// sweep. Each 6*nbmps-2 column diagonal chunk extends from
		// column incol to column ndcol (including both column incol and
		// column ndcol). The following loop chases a 3*nbmps column
		// long chain of nbmps bulges 3*nbmps-2 columns to the right.
		// (incol may be less than ktop and ndcol may be greater than
		// kbot indicating phantom columns from which to chase bulges
		// before they are actually introduced or to which to chase
		// bulges beyond column kbot.)
		for krcol := incol; krcol <= min(incol+3*nbmps-3, kbot-2); krcol++ {
			// Bulges number mtop to mbot are active double implicit
			// shift bulges. There may or may not also be small 2×2
			// bulge, if there is room. The inactive bulges (if any)
			// must wait until the active bulges have moved down the
			// diagonal to make room. The phantom matrix paradigm
			// described above helps keep track.

			mtop := max(0, ((ktop-1)-krcol+2)/3)
			mbot := min(nbmps, (kbot-krcol)/3) - 1
			m22 := mbot + 1
			bmp22 := (mbot < nbmps-1) && (krcol+3*m22 == kbot-2)

			// Generate reflections to chase the chain right one
			// column. (The minimum value of k is ktop-1.)
			for m := mtop; m <= mbot; m++ {
				k := krcol + 3*m
				if k == ktop-1 {
					impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh,
						sr[2*m], si[2*m], sr[2*m+1], si[2*m+1],
						v[m*ldv:m*ldv+3])
					alpha := v[m*ldv]
					_, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1)
					continue
				}
				beta := h[(k+1)*ldh+k]
				v[m*ldv+1] = h[(k+2)*ldh+k]
				v[m*ldv+2] = h[(k+3)*ldh+k]
				beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1)

				// A bulge may collapse because of vigilant deflation or
				// destructive underflow. In the underflow case, try the
				// two-small-subdiagonals trick to try to reinflate the
				// bulge.
				if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 {
					// Typical case: not collapsed (yet).
					h[(k+1)*ldh+k] = beta
					h[(k+2)*ldh+k] = 0
					h[(k+3)*ldh+k] = 0
					continue
				}

				// Atypical case: collapsed. Attempt to reintroduce
				// ignoring H[k+1,k] and H[k+2,k]. If the fill
				// resulting from the new reflector is too large,
				// then abandon it. Otherwise, use the new one.
				var vt [3]float64
				impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, sr[2*m],
					si[2*m], sr[2*m+1], si[2*m+1], vt[:])
				alpha := vt[0]
				_, vt[0] = impl.Dlarfg(3, alpha, vt[1:3], 1)
				refsum := vt[0] * (h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k])

				dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2])
				if math.Abs(h[(k+2)*ldh+k]-refsum*vt[1])+math.Abs(refsum*vt[2]) > ulp*dsum {
					// Starting a new bulge here would create
					// non-negligible fill. Use the old one with
					// trepidation.
					h[(k+1)*ldh+k] = beta
					h[(k+2)*ldh+k] = 0
					h[(k+3)*ldh+k] = 0
					continue
				} else {
					// Starting a new bulge here would create
					// only negligible fill. Replace the old
					// reflector with the new one.
					h[(k+1)*ldh+k] -= refsum
					h[(k+2)*ldh+k] = 0
					h[(k+3)*ldh+k] = 0
					v[m*ldv] = vt[0]
					v[m*ldv+1] = vt[1]
					v[m*ldv+2] = vt[2]
				}
			}

			// Generate a 2×2 reflection, if needed.
			if bmp22 {
				k := krcol + 3*m22
				if k == ktop-1 {
					impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh,
						sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1],
						v[m22*ldv:m22*ldv+2])
					beta := v[m22*ldv]
					_, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
				} else {
					beta := h[(k+1)*ldh+k]
					v[m22*ldv+1] = h[(k+2)*ldh+k]
					beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
					h[(k+1)*ldh+k] = beta
					h[(k+2)*ldh+k] = 0
				}
			}

			// Multiply H by reflections from the left.
			var jbot int
			switch {
			case accum:
				jbot = min(ndcol, kbot)
			case wantt:
				jbot = n - 1
			default:
				jbot = kbot
			}
			for j := max(ktop, krcol); j <= jbot; j++ {
				mend := min(mbot+1, (j-krcol+2)/3) - 1
				for m := mtop; m <= mend; m++ {
					k := krcol + 3*m
					refsum := v[m*ldv] * (h[(k+1)*ldh+j] +
						v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j])
					h[(k+1)*ldh+j] -= refsum
					h[(k+2)*ldh+j] -= refsum * v[m*ldv+1]
					h[(k+3)*ldh+j] -= refsum * v[m*ldv+2]
				}
			}
			if bmp22 {
				k := krcol + 3*m22
				for j := max(k+1, ktop); j <= jbot; j++ {
					refsum := v[m22*ldv] * (h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j])
					h[(k+1)*ldh+j] -= refsum
					h[(k+2)*ldh+j] -= refsum * v[m22*ldv+1]
				}
			}

			// Multiply H by reflections from the right. Delay filling in the last row
			// until the vigilant deflation check is complete.
			var jtop int
			switch {
			case accum:
				jtop = max(ktop, incol)
			case wantt:
				jtop = 0
			default:
				jtop = ktop
			}
			for m := mtop; m <= mbot; m++ {
				if v[m*ldv] == 0 {
					continue
				}
				k := krcol + 3*m
				for j := jtop; j <= min(kbot, k+3); j++ {
					refsum := v[m*ldv] * (h[j*ldh+k+1] +
						v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3])
					h[j*ldh+k+1] -= refsum
					h[j*ldh+k+2] -= refsum * v[m*ldv+1]
					h[j*ldh+k+3] -= refsum * v[m*ldv+2]
				}
				if accum {
					// Accumulate U. (If necessary, update Z later with with an
					// efficient matrix-matrix multiply.)
					kms := k - incol
					for j := max(0, ktop-incol-1); j < kdu; j++ {
						refsum := v[m*ldv] * (u[j*ldu+kms] +
							v[m*ldv+1]*u[j*ldu+kms+1] + v[m*ldv+2]*u[j*ldu+kms+2])
						u[j*ldu+kms] -= refsum
						u[j*ldu+kms+1] -= refsum * v[m*ldv+1]
						u[j*ldu+kms+2] -= refsum * v[m*ldv+2]
					}
				} else if wantz {
					// U is not accumulated, so update Z now by multiplying by
					// reflections from the right.
					for j := iloz; j <= ihiz; j++ {
						refsum := v[m*ldv] * (z[j*ldz+k+1] +
							v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3])
						z[j*ldz+k+1] -= refsum
						z[j*ldz+k+2] -= refsum * v[m*ldv+1]
						z[j*ldz+k+3] -= refsum * v[m*ldv+2]
					}
				}
			}

			// Special case: 2×2 reflection (if needed).
			if bmp22 && v[m22*ldv] != 0 {
				k := krcol + 3*m22
				for j := jtop; j <= min(kbot, k+3); j++ {
					refsum := v[m22*ldv] * (h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2])
					h[j*ldh+k+1] -= refsum
					h[j*ldh+k+2] -= refsum * v[m22*ldv+1]
				}
				if accum {
					kms := k - incol
					for j := max(0, ktop-incol-1); j < kdu; j++ {
						refsum := v[m22*ldv] * (u[j*ldu+kms] + v[m22*ldv+1]*u[j*ldu+kms+1])
						u[j*ldu+kms] -= refsum
						u[j*ldu+kms+1] -= refsum * v[m22*ldv+1]
					}
				} else if wantz {
					for j := iloz; j <= ihiz; j++ {
						refsum := v[m22*ldv] * (z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2])
						z[j*ldz+k+1] -= refsum
						z[j*ldz+k+2] -= refsum * v[m22*ldv+1]
					}
				}
			}

			// Vigilant deflation check.
			mstart := mtop
			if krcol+3*mstart < ktop {
				mstart++
			}
			mend := mbot
			if bmp22 {
				mend++
			}
			if krcol == kbot-2 {
				mend++
			}
			for m := mstart; m <= mend; m++ {
				k := min(kbot-1, krcol+3*m)

				// The following convergence test requires that the tradition
				// small-compared-to-nearby-diagonals criterion and the Ahues &
				// Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter
				// improves accuracy in some examples. Falling back on an alternate
				// convergence criterion when tst1 or tst2 is zero (as done here) is
				// traditional but probably unnecessary.

				if h[(k+1)*ldh+k] == 0 {
					continue
				}
				tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1])
				if tst1 == 0 {
					if k >= ktop+1 {
						tst1 += math.Abs(h[k*ldh+k-1])
					}
					if k >= ktop+2 {
						tst1 += math.Abs(h[k*ldh+k-2])
					}
					if k >= ktop+3 {
						tst1 += math.Abs(h[k*ldh+k-3])
					}
					if k <= kbot-2 {
						tst1 += math.Abs(h[(k+2)*ldh+k+1])
					}
					if k <= kbot-3 {
						tst1 += math.Abs(h[(k+3)*ldh+k+1])
					}
					if k <= kbot-4 {
						tst1 += math.Abs(h[(k+4)*ldh+k+1])
					}
				}
				if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) {
					h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
					h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
					h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
					h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
					scl := h11 + h12
					tst2 := h22 * (h11 / scl)
					if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) {
						h[(k+1)*ldh+k] = 0
					}
				}
			}

			// Fill in the last row of each bulge.
			mend = min(nbmps, (kbot-krcol-1)/3) - 1
			for m := mtop; m <= mend; m++ {
				k := krcol + 3*m
				refsum := v[m*ldv] * v[m*ldv+2] * h[(k+4)*ldh+k+3]
				h[(k+4)*ldh+k+1] = -refsum
				h[(k+4)*ldh+k+2] = -refsum * v[m*ldv+1]
				h[(k+4)*ldh+k+3] -= refsum * v[m*ldv+2]
			}
		}

		// Use U (if accumulated) to update far-from-diagonal entries in H.
		// If required, use U to update Z as well.
		if !accum {
			continue
		}
		var jtop, jbot int
		if wantt {
			jtop = 0
			jbot = n - 1
		} else {
			jtop = ktop
			jbot = kbot
		}
		bi := blas64.Implementation()
		if !blk22 || incol < ktop || kbot < ndcol || ns <= 2 {
			// Updates not exploiting the 2×2 block structure of U. k0 and nu keep track
			// of the location and size of U in the special cases of introducing bulges
			// and chasing bulges off the bottom. In these special cases and in case the
			// number of shifts is ns = 2, there is no 2×2 block structure to exploit.

			k0 := max(0, ktop-incol-1)
			nu := kdu - max(0, ndcol-kbot) - k0

			// Horizontal multiply.
			for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
				jlen := min(nh, jbot-jcol+1)
				bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu,
					1, u[k0*ldu+k0:], ldu,
					h[(incol+k0+1)*ldh+jcol:], ldh,
					0, wh, ldwh)
				impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k0+1)*ldh+jcol:], ldh)
			}

			// Vertical multiply.
			for jrow := jtop; jrow <= max(ktop, incol)-1; jrow += nv {
				jlen := min(nv, max(ktop, incol)-jrow)
				bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
					1, h[jrow*ldh+incol+k0+1:], ldh,
					u[k0*ldu+k0:], ldu,
					0, wv, ldwv)
				impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k0+1:], ldh)
			}

			// Z multiply (also vertical).
			if wantz {
				for jrow := iloz; jrow <= ihiz; jrow += nv {
					jlen := min(nv, ihiz-jrow+1)
					bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
						1, z[jrow*ldz+incol+k0+1:], ldz,
						u[k0*ldu+k0:], ldu,
						0, wv, ldwv)
					impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k0+1:], ldz)
				}
			}

			continue
		}

		// Updates exploiting U's 2×2 block structure.

		// i2, i4, j2, j4 are the last rows and columns of the blocks.
		i2 := (kdu + 1) / 2
		i4 := kdu
		j2 := i4 - i2
		j4 := kdu

		// kzs and knz deal with the band of zeros along the diagonal of one of the
		// triangular blocks.
		kzs := (j4 - j2) - (ns + 1)
		knz := ns + 1

		// Horizontal multiply.
		for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
			jlen := min(nh, jbot-jcol+1)

			// Copy bottom of H to top+kzs of scratch (the first kzs
			// rows get multiplied by zero).
			impl.Dlacpy(blas.All, knz, jlen, h[(incol+1+j2)*ldh+jcol:], ldh, wh[kzs*ldwh:], ldwh)

			// Multiply by U21^T.
			impl.Dlaset(blas.All, kzs, jlen, 0, 0, wh, ldwh)
			bi.Dtrmm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, knz, jlen,
				1, u[j2*ldu+kzs:], ldu, wh[kzs*ldwh:], ldwh)

			// Multiply top of H by U11^T.
			bi.Dgemm(blas.Trans, blas.NoTrans, i2, jlen, j2,
				1, u, ldu, h[(incol+1)*ldh+jcol:], ldh,
				1, wh, ldwh)

			// Copy top of H to bottom of WH.
			impl.Dlacpy(blas.All, j2, jlen, h[(incol+1)*ldh+jcol:], ldh, wh[i2*ldwh:], ldwh)

			// Multiply by U21^T.
			bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, j2, jlen,
				1, u[i2:], ldu, wh[i2*ldwh:], ldwh)

			// Multiply by U22.
			bi.Dgemm(blas.Trans, blas.NoTrans, i4-i2, jlen, j4-j2,
				1, u[j2*ldu+i2:], ldu, h[(incol+1+j2)*ldh+jcol:], ldh,
				1, wh[i2*ldwh:], ldwh)

			// Copy it back.
			impl.Dlacpy(blas.All, kdu, jlen, wh, ldwh, h[(incol+1)*ldh+jcol:], ldh)
		}

		// Vertical multiply.
		for jrow := jtop; jrow <= max(incol, ktop)-1; jrow += nv {
			jlen := min(nv, max(incol, ktop)-jrow)

			// Copy right of H to scratch (the first kzs columns get multiplied
			// by zero).
			impl.Dlacpy(blas.All, jlen, knz, h[jrow*ldh+incol+1+j2:], ldh, wv[kzs:], ldwv)

			// Multiply by U21.
			impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
				1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)

			// Multiply by U11.
			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
				1, h[jrow*ldh+incol+1:], ldh, u, ldu,
				1, wv, ldwv)

			// Copy left of H to right of scratch.
			impl.Dlacpy(blas.All, jlen, j2, h[jrow*ldh+incol+1:], ldh, wv[i2:], ldwv)

			// Multiply by U21.
			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
				1, u[i2:], ldu, wv[i2:], ldwv)

			// Multiply by U22.
			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
				1, h[jrow*ldh+incol+1+j2:], ldh, u[j2*ldu+i2:], ldu,
				1, wv[i2:], ldwv)

			// Copy it back.
			impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, h[jrow*ldh+incol+1:], ldh)
		}

		if !wantz {
			continue
		}
		// Multiply Z (also vertical).
		for jrow := iloz; jrow <= ihiz; jrow += nv {
			jlen := min(nv, ihiz-jrow+1)

			// Copy right of Z to left of scratch (first kzs columns get
			// multiplied by zero).
			impl.Dlacpy(blas.All, jlen, knz, z[jrow*ldz+incol+1+j2:], ldz, wv[kzs:], ldwv)

			// Multiply by U12.
			impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
				1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)

			// Multiply by U11.
			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
				1, z[jrow*ldz+incol+1:], ldz, u, ldu,
				1, wv, ldwv)

			// Copy left of Z to right of scratch.
			impl.Dlacpy(blas.All, jlen, j2, z[jrow*ldz+incol+1:], ldz, wv[i2:], ldwv)

			// Multiply by U21.
			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
				1, u[i2:], ldu, wv[i2:], ldwv)

			// Multiply by U22.
			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
				1, z[jrow*ldz+incol+1+j2:], ldz, u[j2*ldu+i2:], ldu,
				1, wv[i2:], ldwv)

			// Copy the result back to Z.
			impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, z[jrow*ldz+incol+1:], ldz)
		}
	}
}
Esempio n. 29
0
// Dlatrs solves a triangular system of equations scaled to prevent overflow. It
// solves
//  A * x = scale * b if trans == blas.NoTrans
//  A^T * x = scale * b if trans == blas.Trans
// where the scale s is set for numeric stability.
//
// A is an n×n triangular matrix. On entry, the slice x contains the values of
// of b, and on exit it contains the solution vector x.
//
// If normin == true, cnorm is an input and cnorm[j] contains the norm of the off-diagonal
// part of the j^th column of A. If trans == blas.NoTrans, cnorm[j] must be greater
// than or equal to the infinity norm, and greater than or equal to the one-norm
// otherwise. If normin == false, then cnorm is treated as an output, and is set
// to contain the 1-norm of the off-diagonal part of the j^th column of A.
func (impl Implementation) Dlatrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, normin bool, n int, a []float64, lda int, x []float64, cnorm []float64) (scale float64) {
	if uplo != blas.Upper && uplo != blas.Lower {
		panic(badUplo)
	}
	if trans != blas.Trans && trans != blas.NoTrans {
		panic(badTrans)
	}
	if diag != blas.Unit && diag != blas.NonUnit {
		panic(badDiag)
	}
	upper := uplo == blas.Upper
	noTrans := trans == blas.NoTrans
	nonUnit := diag == blas.NonUnit

	if n < 0 {
		panic(nLT0)
	}
	checkMatrix(n, n, a, lda)
	checkVector(n, x, 1)
	checkVector(n, cnorm, 1)

	if n == 0 {
		return
	}
	scale = 1
	bi := blas64.Implementation()
	if !normin {
		if upper {
			for j := 0; j < n; j++ {
				cnorm[j] = bi.Dasum(j, a[j:], lda)
			}
		} else {
			for j := 0; j < n-1; j++ {
				cnorm[j] = bi.Dasum(n-j-1, a[(j+1)*lda+j:], lda)
			}
			cnorm[n-1] = 0
		}
	}
	// Scale the column norms by tscal if the maximum element in cnorm is greater than bignum.
	imax := bi.Idamax(n, cnorm, 1)
	tmax := cnorm[imax]
	var tscal float64
	if tmax <= bignum {
		tscal = 1
	} else {
		tscal = 1 / (smlnum * tmax)
		bi.Dscal(n, tscal, cnorm, 1)
	}

	// Compute a bound on the computed solution vector to see if bi.Dtrsv can be used.
	j := bi.Idamax(n, x, 1)
	xmax := math.Abs(x[j])
	xbnd := xmax
	var grow float64
	var jfirst, jlast, jinc int
	if noTrans {
		if upper {
			jfirst = n - 1
			jlast = 0
			jinc = -1
		} else {
			jfirst = 0
			jlast = n - 1
			jinc = 1
		}
		// Compute the growth in A * x = b.
		if tscal != 1 {
			grow = 0
			goto Finish
		}
		if nonUnit {
			grow = 1 / math.Max(xbnd, smlnum)
			xbnd = grow
			for j := jfirst; j != jlast; j += jinc {
				if grow <= smlnum {
					goto Finish
				}
				tjj := math.Abs(a[j*lda+j])
				xbnd = math.Min(xbnd, math.Min(1, tjj)*grow)
				if tjj+cnorm[j] >= smlnum {
					grow *= tjj / (tjj + cnorm[j])
				} else {
					grow = 0
				}
			}
			grow = xbnd
		} else {
			grow = math.Min(1, 1/math.Max(xbnd, smlnum))
			for j := jfirst; j != jlast; j += jinc {
				if grow <= smlnum {
					goto Finish
				}
				grow *= 1 / (1 + cnorm[j])
			}
		}
	} else {
		if upper {
			jfirst = 0
			jlast = n - 1
			jinc = 1
		} else {
			jfirst = n - 1
			jlast = 0
			jinc = -1
		}
		if tscal != 1 {
			grow = 0
			goto Finish
		}
		if nonUnit {
			grow = 1 / (math.Max(xbnd, smlnum))
			xbnd = grow
			for j := jfirst; j != jlast; j += jinc {
				if grow <= smlnum {
					goto Finish
				}
				xj := 1 + cnorm[j]
				grow = math.Min(grow, xbnd/xj)
				tjj := math.Abs(a[j*lda+j])
				if xj > tjj {
					xbnd *= tjj / xj
				}
			}
			grow = math.Min(grow, xbnd)
		} else {
			grow = math.Min(1, 1/math.Max(xbnd, smlnum))
			for j := jfirst; j != jlast; j += jinc {
				if grow <= smlnum {
					goto Finish
				}
				xj := 1 + cnorm[j]
				grow /= xj
			}
		}
	}

Finish:
	if grow*tscal > smlnum {
		bi.Dtrsv(uplo, trans, diag, n, a, lda, x, 1)
		// TODO(btracey): check if this else is everything
	} else {
		if xmax > bignum {
			scale = bignum / xmax
			bi.Dscal(n, scale, x, 1)
			xmax = bignum
		}
		if noTrans {
			for j := jfirst; j != jlast; j += jinc {
				xj := math.Abs(x[j])
				var tjjs float64
				if nonUnit {
					tjjs = a[j*lda+j] * tscal
				} else {
					tjjs = tscal
					if tscal == 1 {
						break
					}
				}
				tjj := math.Abs(tjjs)
				if tjj > smlnum {
					if tjj < 1 {
						if xj > tjj*bignum {
							rec := 1 / xj
							bi.Dscal(n, rec, x, 1)
							scale *= rec
							xmax *= rec
						}
					}
					x[j] /= tjjs
					xj = math.Abs(x[j])
				} else if tjj > 0 {
					if xj > tjj*bignum {
						rec := (tjj * bignum) / xj
						if cnorm[j] > 1 {
							rec /= cnorm[j]
						}
						bi.Dscal(n, rec, x, 1)
						scale *= rec
						xmax *= rec
					}
					x[j] /= tjjs
					xj = math.Abs(x[j])
				} else {
					for i := 0; i < n; i++ {
						x[i] = 0
					}
					x[j] = 1
					xj = 1
					scale = 0
					xmax = 0
				}
				if xj > 1 {
					rec := 1 / xj
					if cnorm[j] > (bignum-xmax)*rec {
						rec *= 0.5
						bi.Dscal(n, rec, x, 1)
						scale *= rec
					}
				} else if xj*cnorm[j] > bignum-xmax {
					bi.Dscal(n, 0.5, x, 1)
					scale *= 0.5
				}
				if upper {
					if j > 0 {
						bi.Daxpy(j, -x[j]*tscal, a[j:], lda, x, 1)
						i := bi.Idamax(j, x, 1)
						xmax = math.Abs(x[i])
					}
				} else {
					if j < n-1 {
						bi.Daxpy(n-j-1, -x[j]*tscal, a[(j+1)*lda+j:], lda, x[j+1:], 1)
						i := j + bi.Idamax(n-j-1, x[j+1:], 1)
						xmax = math.Abs(x[i])
					}
				}
			}
		} else {
			for j := jfirst; j != jlast; j += jinc {
				xj := math.Abs(x[j])
				uscal := tscal
				rec := 1 / math.Max(xmax, 1)
				var tjjs float64
				if cnorm[j] > (bignum-xj)*rec {
					rec *= 0.5
					if nonUnit {
						tjjs = a[j*lda+j] * tscal
					} else {
						tjjs = tscal
					}
					tjj := math.Abs(tjjs)
					if tjj > 1 {
						rec = math.Min(1, rec*tjj)
						uscal /= tjjs
					}
					if rec < 1 {
						bi.Dscal(n, rec, x, 1)
						scale *= rec
						xmax *= rec
					}
				}
				var sumj float64
				if uscal == 1 {
					if upper {
						sumj = bi.Ddot(j, a[j:], lda, x, 1)
					} else if j < n-1 {
						sumj = bi.Ddot(n-j-1, a[(j+1)*lda+j:], lda, x[j+1:], 1)
					}
				} else {
					if upper {
						for i := 0; i < j; i++ {
							sumj += (a[i*lda+j] * uscal) * x[i]
						}
					} else if j < n {
						for i := j + 1; i < n; i++ {
							sumj += (a[i*lda+j] * uscal) * x[i]
						}
					}
				}
				if uscal == tscal {
					x[j] -= sumj
					xj := math.Abs(x[j])
					var tjjs float64
					if nonUnit {
						tjjs = a[j*lda+j] * tscal
					} else {
						tjjs = tscal
						if tscal == 1 {
							goto Out2
						}
					}
					tjj := math.Abs(tjjs)
					if tjj > smlnum {
						if tjj < 1 {
							if xj > tjj*bignum {
								rec = 1 / xj
								bi.Dscal(n, rec, x, 1)
								scale *= rec
								xmax *= rec
							}
						}
						x[j] /= tjjs
					} else if tjj > 0 {
						if xj > tjj*bignum {
							rec = (tjj * bignum) / xj
							bi.Dscal(n, rec, x, 1)
							scale *= rec
							xmax *= rec
						}
						x[j] /= tjjs
					} else {
						for i := 0; i < n; i++ {
							x[i] = 0
						}
						x[j] = 1
						scale = 0
						xmax = 0
					}
				} else {
					x[j] = x[j]/tjjs - sumj
				}
			Out2:
				xmax = math.Max(xmax, math.Abs(x[j]))
			}
		}
		scale /= tscal
	}
	if tscal != 1 {
		bi.Dscal(n, 1/tscal, cnorm, 1)
	}
	return scale
}
Esempio n. 30
0
// Dtrcon estimates the reciprocal of the condition number of a triangular matrix A.
// The condition number computed may be based on the 1-norm or the ∞-norm.
//
// work is a temporary data slice of length at least 3*n and Dtrcon will panic otherwise.
//
// iwork is a temporary data slice of length at least n and Dtrcon will panic otherwise.
func (impl Implementation) Dtrcon(norm lapack.MatrixNorm, uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int, work []float64, iwork []int) float64 {
	if norm != lapack.MaxColumnSum && norm != lapack.MaxRowSum {
		panic(badNorm)
	}
	if uplo != blas.Upper && uplo != blas.Lower {
		panic(badUplo)
	}
	if diag != blas.NonUnit && diag != blas.Unit {
		panic(badDiag)
	}
	if len(work) < 3*n {
		panic(badWork)
	}
	if len(iwork) < n {
		panic(badWork)
	}
	if n == 0 {
		return 1
	}
	bi := blas64.Implementation()

	var rcond float64
	smlnum := dlamchS * float64(n)

	anorm := impl.Dlantr(norm, uplo, diag, n, n, a, lda, work)

	if anorm <= 0 {
		return rcond
	}
	var ainvnm float64
	var normin bool
	kase1 := 2
	if norm == lapack.MaxColumnSum {
		kase1 = 1
	}
	var kase int
	isave := new([3]int)
	var scale float64
	for {
		ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave)
		if kase == 0 {
			if ainvnm != 0 {
				rcond = (1 / anorm) / ainvnm
			}
			return rcond
		}
		if kase == kase1 {
			scale = impl.Dlatrs(uplo, blas.NoTrans, diag, normin, n, a, lda, work, work[2*n:])
		} else {
			scale = impl.Dlatrs(uplo, blas.Trans, diag, normin, n, a, lda, work, work[2*n:])
		}
		normin = true
		if scale != 1 {
			ix := bi.Idamax(n, work, 1)
			xnorm := math.Abs(work[ix])
			if scale == 0 || scale < xnorm*smlnum {
				return rcond
			}
			impl.Drscl(n, scale, work, 1)
		}
	}
}