// Drscl multiplies the vector x by 1/a being careful to avoid overflow or // underflow where possible. func (impl Implementation) Drscl(n int, a float64, x []float64, incX int) { checkVector(n, x, incX) bi := blas64.Implementation() cden := a cnum := 1.0 smlnum := dlamchS bignum := 1 / smlnum for { cden1 := cden * smlnum cnum1 := cnum / bignum var mul float64 var done bool switch { case cnum != 0 && math.Abs(cden1) > math.Abs(cnum): mul = smlnum done = false cden = cden1 case math.Abs(cnum1) > math.Abs(cden): mul = bignum done = false cnum = cnum1 default: mul = cnum / cden done = true } bi.Dscal(n, mul, x, incX) if done { break } } }
// Dgetrs solves a system of equations using an LU factorization. // The system of equations solved is // A * X = B if trans == blas.Trans // A^T * X = B if trans == blas.NoTrans // A is a general n×n matrix with stride lda. B is a general matrix of size n×nrhs. // // On entry b contains the elements of the matrix B. On exit, b contains the // elements of X, the solution to the system of equations. // // a and ipiv contain the LU factorization of A and the permutation indices as // computed by Dgetrf. ipiv is zero-indexed. func (impl Implementation) Dgetrs(trans blas.Transpose, n, nrhs int, a []float64, lda int, ipiv []int, b []float64, ldb int) { checkMatrix(n, n, a, lda) checkMatrix(n, nrhs, b, ldb) if len(ipiv) < n { panic(badIpiv) } if n == 0 || nrhs == 0 { return } if trans != blas.Trans && trans != blas.NoTrans { panic(badTrans) } bi := blas64.Implementation() if trans == blas.NoTrans { // Solve A * X = B. impl.Dlaswp(nrhs, b, ldb, 0, n-1, ipiv, 1) // Solve L * X = B, updating b. bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.Unit, n, nrhs, 1, a, lda, b, ldb) // Solve U * X = B, updating b. bi.Dtrsm(blas.Left, blas.Upper, blas.NoTrans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb) return } // Solve A^T * X = B. // Solve U^T * X = B, updating b. bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb) // Solve L^T * X = B, updating b. bi.Dtrsm(blas.Left, blas.Lower, blas.Trans, blas.Unit, n, nrhs, 1, a, lda, b, ldb) impl.Dlaswp(nrhs, b, ldb, 0, n-1, ipiv, -1) }
// Dlarf applies an elementary reflector to a general rectangular matrix c. // This computes // c = h * c if side == Left // c = c * h if side == right // where // h = 1 - tau * v * v^T // and c is an m * n matrix. // // work is temporary storage of length at least m if side == Left and at least // n if side == Right. This function will panic if this length requirement is not met. // // Dlarf is an internal routine. It is exported for testing purposes. func (impl Implementation) Dlarf(side blas.Side, m, n int, v []float64, incv int, tau float64, c []float64, ldc int, work []float64) { applyleft := side == blas.Left if (applyleft && len(work) < n) || (!applyleft && len(work) < m) { panic(badWork) } checkMatrix(m, n, c, ldc) // v has length m if applyleft and n otherwise. lenV := n if applyleft { lenV = m } checkVector(lenV, v, incv) lastv := 0 // last non-zero element of v lastc := 0 // last non-zero row/column of c if tau != 0 { var i int if applyleft { lastv = m - 1 } else { lastv = n - 1 } if incv > 0 { i = lastv * incv } // Look for the last non-zero row in v. for lastv >= 0 && v[i] == 0 { lastv-- i -= incv } if applyleft { // Scan for the last non-zero column in C[0:lastv, :] lastc = impl.Iladlc(lastv+1, n, c, ldc) } else { // Scan for the last non-zero row in C[:, 0:lastv] lastc = impl.Iladlr(m, lastv+1, c, ldc) } } if lastv == -1 || lastc == -1 { return } // Sometimes 1-indexing is nicer ... bi := blas64.Implementation() if applyleft { // Form H * C // w[0:lastc+1] = c[1:lastv+1, 1:lastc+1]^T * v[1:lastv+1,1] bi.Dgemv(blas.Trans, lastv+1, lastc+1, 1, c, ldc, v, incv, 0, work, 1) // c[0: lastv, 0: lastc] = c[...] - w[0:lastv, 1] * v[1:lastc, 1]^T bi.Dger(lastv+1, lastc+1, -tau, v, incv, work, 1, c, ldc) return } // Form C*H // w[0:lastc+1,1] := c[0:lastc+1,0:lastv+1] * v[0:lastv+1,1] bi.Dgemv(blas.NoTrans, lastc+1, lastv+1, 1, c, ldc, v, incv, 0, work, 1) // c[0:lastc+1,0:lastv+1] = c[...] - w[0:lastc+1,0] * v[0:lastv+1,0]^T bi.Dger(lastc+1, lastv+1, -tau, work, 1, v, incv, c, ldc) }
// Dpotrf computes the cholesky decomposition of the symmetric positive definite // matrix a. If ul == blas.Upper, then a is stored as an upper-triangular matrix, // and a = U U^T is stored in place into a. If ul == blas.Lower, then a = L L^T // is computed and stored in-place into a. If a is not positive definite, false // is returned. This is the blocked version of the algorithm. func (impl Implementation) Dpotrf(ul blas.Uplo, n int, a []float64, lda int) (ok bool) { bi := blas64.Implementation() if ul != blas.Upper && ul != blas.Lower { panic(badUplo) } if n < 0 { panic(nLT0) } if lda < n { panic(badLdA) } if n == 0 { return true } nb := impl.Ilaenv(1, "DPOTRF", string(ul), n, -1, -1, -1) if n <= nb { return impl.Dpotf2(ul, n, a, lda) } if ul == blas.Upper { for j := 0; j < n; j += nb { jb := min(nb, n-j) bi.Dsyrk(blas.Upper, blas.Trans, jb, j, -1, a[j:], lda, 1, a[j*lda+j:], lda) ok = impl.Dpotf2(blas.Upper, jb, a[j*lda+j:], lda) if !ok { return ok } if j+jb < n { bi.Dgemm(blas.Trans, blas.NoTrans, jb, n-j-jb, j, -1, a[j:], lda, a[j+jb:], lda, 1, a[j*lda+j+jb:], lda) bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, jb, n-j-jb, 1, a[j*lda+j:], lda, a[j*lda+j+jb:], lda) } } return true } for j := 0; j < n; j += nb { jb := min(nb, n-j) bi.Dsyrk(blas.Lower, blas.NoTrans, jb, j, -1, a[j*lda:], lda, 1, a[j*lda+j:], lda) ok := impl.Dpotf2(blas.Lower, jb, a[j*lda+j:], lda) if !ok { return ok } if j+jb < n { bi.Dgemm(blas.NoTrans, blas.Trans, n-j-jb, jb, j, -1, a[(j+jb)*lda:], lda, a[j*lda:], lda, 1, a[(j+jb)*lda+j:], lda) bi.Dtrsm(blas.Right, blas.Lower, blas.Trans, blas.NonUnit, n-j-jb, jb, 1, a[j*lda+j:], lda, a[(j+jb)*lda+j:], lda) } } return true }
// Dgecon estimates the reciprocal of the condition number of the n×n matrix A // given the LU decomposition of the matrix. The condition number computed may // be based on the 1-norm or the ∞-norm. // // The slice a contains the result of the LU decomposition of A as computed by Dgetrf. // // anorm is the corresponding 1-norm or ∞-norm of the original matrix A. // // work is a temporary data slice of length at least 4*n and Dgecon will panic otherwise. // // iwork is a temporary data slice of length at least n and Dgecon will panic otherwise. func (impl Implementation) Dgecon(norm lapack.MatrixNorm, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 { checkMatrix(n, n, a, lda) if norm != lapack.MaxColumnSum && norm != lapack.MaxRowSum { panic(badNorm) } if len(work) < 4*n { panic(badWork) } if len(iwork) < n { panic(badWork) } if n == 0 { return 1 } else if anorm == 0 { return 0 } bi := blas64.Implementation() var rcond, ainvnm float64 var kase int var normin bool isave := new([3]int) onenrm := norm == lapack.MaxColumnSum smlnum := dlamchS kase1 := 2 if onenrm { kase1 = 1 } for { ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave) if kase == 0 { if ainvnm != 0 { rcond = (1 / ainvnm) / anorm } return rcond } var sl, su float64 if kase == kase1 { sl = impl.Dlatrs(blas.Lower, blas.NoTrans, blas.Unit, normin, n, a, lda, work, work[2*n:]) su = impl.Dlatrs(blas.Upper, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[3*n:]) } else { su = impl.Dlatrs(blas.Upper, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[3*n:]) sl = impl.Dlatrs(blas.Lower, blas.Trans, blas.Unit, normin, n, a, lda, work, work[2*n:]) } scale := sl * su normin = true if scale != 1 { ix := bi.Idamax(n, work, 1) if scale == 0 || scale < math.Abs(work[ix])*smlnum { return rcond } impl.Drscl(n, scale, work, 1) } } }
// Dgebak updates an n×m matrix V as // V = P D V, if side == blas.Right, // V = P D^{-1} V, if side == blas.Left, // where P and D are n×n permutation and scaling matrices, respectively, // implicitly represented by job, scale, ilo and ihi as returned by Dgebal. // // Typically, columns of the matrix V contain the right or left (determined by // side) eigenvectors of the balanced matrix output by Dgebal, and Dgebak forms // the eigenvectors of the original matrix. // // Dgebak is an internal routine. It is exported for testing purposes. func (impl Implementation) Dgebak(job lapack.Job, side blas.Side, n, ilo, ihi int, scale []float64, m int, v []float64, ldv int) { switch job { default: panic(badJob) case lapack.None, lapack.Permute, lapack.Scale, lapack.PermuteScale: } switch side { default: panic(badSide) case blas.Left, blas.Right: } checkMatrix(n, m, v, ldv) switch { case ilo < 0 || max(0, n-1) < ilo: panic(badIlo) case ihi < min(ilo, n-1) || n <= ihi: panic(badIhi) } // Quick return if possible. if n == 0 || m == 0 || job == lapack.None { return } bi := blas64.Implementation() if ilo != ihi && job != lapack.Permute { // Backward balance. if side == blas.Right { for i := ilo; i <= ihi; i++ { bi.Dscal(m, scale[i], v[i*ldv:], 1) } } else { for i := ilo; i <= ihi; i++ { bi.Dscal(m, 1/scale[i], v[i*ldv:], 1) } } } if job == lapack.Scale { return } // Backward permutation. for i := ilo - 1; i >= 0; i-- { k := int(scale[i]) if k == i { continue } bi.Dswap(m, v[i*ldv:], 1, v[k*ldv:], 1) } for i := ihi + 1; i < n; i++ { k := int(scale[i]) if k == i { continue } bi.Dswap(m, v[i*ldv:], 1, v[k*ldv:], 1) } }
// Dpotf2 computes the cholesky decomposition of the symmetric positive definite // matrix a. If ul == blas.Upper, then a is stored as an upper-triangular matrix, // and a = U^T U is stored in place into a. If ul == blas.Lower, then a = L L^T // is computed and stored in-place into a. If a is not positive definite, false // is returned. This is the unblocked version of the algorithm. func (Implementation) Dpotf2(ul blas.Uplo, n int, a []float64, lda int) (ok bool) { if ul != blas.Upper && ul != blas.Lower { panic(badUplo) } if n < 0 { panic(nLT0) } if lda < n { panic(badLdA) } if n == 0 { return true } bi := blas64.Implementation() if ul == blas.Upper { for j := 0; j < n; j++ { ajj := a[j*lda+j] if j != 0 { ajj -= bi.Ddot(j, a[j:], lda, a[j:], lda) } if ajj <= 0 || math.IsNaN(ajj) { a[j*lda+j] = ajj return false } ajj = math.Sqrt(ajj) a[j*lda+j] = ajj if j < n-1 { bi.Dgemv(blas.Trans, j, n-j-1, -1, a[j+1:], lda, a[j:], lda, 1, a[j*lda+j+1:], 1) bi.Dscal(n-j-1, 1/ajj, a[j*lda+j+1:], 1) } } return true } for j := 0; j < n; j++ { ajj := a[j*lda+j] if j != 0 { ajj -= bi.Ddot(j, a[j*lda:], 1, a[j*lda:], 1) } if ajj <= 0 || math.IsNaN(ajj) { a[j*lda+j] = ajj return false } ajj = math.Sqrt(ajj) a[j*lda+j] = ajj if j < n-1 { bi.Dgemv(blas.NoTrans, n-j-1, j, -1, a[(j+1)*lda:], lda, a[j*lda:], 1, 1, a[(j+1)*lda+j:], lda) bi.Dscal(n-j-1, 1/ajj, a[(j+1)*lda+j:], lda) } } return true }
// Dpocon estimates the reciprocal of the condition number of a positive-definite // matrix A given the Cholesky decmposition of A. The condition number computed // is based on the 1-norm and the ∞-norm. // // anorm is the 1-norm and the ∞-norm of the original matrix A. // // work is a temporary data slice of length at least 3*n and Dpocon will panic otherwise. // // iwork is a temporary data slice of length at least n and Dpocon will panic otherwise. func (impl Implementation) Dpocon(uplo blas.Uplo, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 { checkMatrix(n, n, a, lda) if uplo != blas.Upper && uplo != blas.Lower { panic(badUplo) } if len(work) < 3*n { panic(badWork) } if len(iwork) < n { panic(badWork) } var rcond float64 if n == 0 { return 1 } if anorm == 0 { return rcond } bi := blas64.Implementation() var ainvnm float64 smlnum := dlamchS upper := uplo == blas.Upper var kase int var normin bool isave := new([3]int) var sl, su float64 for { ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave) if kase == 0 { if ainvnm != 0 { rcond = (1 / ainvnm) / anorm } return rcond } if upper { sl = impl.Dlatrs(blas.Upper, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[2*n:]) normin = true su = impl.Dlatrs(blas.Upper, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[2*n:]) } else { sl = impl.Dlatrs(blas.Lower, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[2*n:]) normin = true su = impl.Dlatrs(blas.Lower, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[2*n:]) } scale := sl * su if scale != 1 { ix := bi.Idamax(n, work, 1) if scale == 0 || scale < math.Abs(work[ix])*smlnum { return rcond } impl.Drscl(n, scale, work, 1) } } }
func DpotrfTest(t *testing.T, impl Dpotrfer) { rnd := rand.New(rand.NewSource(1)) bi := blas64.Implementation() for i, test := range []struct { n int }{ {n: 10}, {n: 30}, {n: 63}, {n: 65}, {n: 128}, {n: 1000}, } { n := test.n // Construct a positive-definite symmetric matrix base := make([]float64, n*n) for i := range base { base[i] = rnd.Float64() } a := make([]float64, len(base)) bi.Dgemm(blas.Trans, blas.NoTrans, n, n, n, 1, base, n, base, n, 0, a, n) aCopy := make([]float64, len(a)) copy(aCopy, a) // Test with Upper impl.Dpotrf(blas.Upper, n, a, n) // zero all the other elements for i := 0; i < n; i++ { for j := 0; j < i; j++ { a[i*n+j] = 0 } } // Multiply u^T * u ans := make([]float64, len(a)) bi.Dsyrk(blas.Upper, blas.Trans, n, n, 1, a, n, 0, ans, n) match := true for i := 0; i < n; i++ { for j := i; j < n; j++ { if !floats.EqualWithinAbsOrRel(ans[i*n+j], aCopy[i*n+j], 1e-14, 1e-14) { match = false } } } if !match { //fmt.Println(aCopy) //fmt.Println(ans) t.Errorf("Case %v: Mismatch for upper", i) } } }
// Dlaswp swaps the rows k1 to k2 of a according to the indices in ipiv. // a is a matrix with n columns and stride lda. incX is the increment for ipiv. // k1 and k2 are zero-indexed. If incX is negative, then loops from k2 to k1 // // Dlaswp is an internal routine. It is exported for testing purposes. func (impl Implementation) Dlaswp(n int, a []float64, lda, k1, k2 int, ipiv []int, incX int) { if incX != 1 && incX != -1 { panic(absIncNotOne) } bi := blas64.Implementation() if incX == 1 { for k := k1; k <= k2; k++ { bi.Dswap(n, a[k*lda:], 1, a[ipiv[k]*lda:], 1) } return } for k := k2; k >= k1; k-- { bi.Dswap(n, a[k*lda:], 1, a[ipiv[k]*lda:], 1) } }
// Dtrtri computes the inverse of a triangular matrix, storing the result in place // into a. This is the BLAS level 3 version of the algorithm which builds upon // Dtrti2 to operate on matrix blocks instead of only individual columns. // // Dtrtri will not perform the inversion if the matrix is singular, and returns // a boolean indicating whether the inversion was successful. func (impl Implementation) Dtrtri(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) (ok bool) { checkMatrix(n, n, a, lda) if uplo != blas.Upper && uplo != blas.Lower { panic(badUplo) } if diag != blas.NonUnit && diag != blas.Unit { panic(badDiag) } if n == 0 { return false } nonUnit := diag == blas.NonUnit if nonUnit { for i := 0; i < n; i++ { if a[i*lda+i] == 0 { return false } } } bi := blas64.Implementation() nb := impl.Ilaenv(1, "DTRTRI", "UD", n, -1, -1, -1) if nb <= 1 || nb > n { impl.Dtrti2(uplo, diag, n, a, lda) return true } if uplo == blas.Upper { for j := 0; j < n; j += nb { jb := min(nb, n-j) bi.Dtrmm(blas.Left, blas.Upper, blas.NoTrans, diag, j, jb, 1, a, lda, a[j:], lda) bi.Dtrsm(blas.Right, blas.Upper, blas.NoTrans, diag, j, jb, -1, a[j*lda+j:], lda, a[j:], lda) impl.Dtrti2(blas.Upper, diag, jb, a[j*lda+j:], lda) } return true } nn := ((n - 1) / nb) * nb for j := nn; j >= 0; j -= nb { jb := min(nb, n-j) if j+jb <= n-1 { bi.Dtrmm(blas.Left, blas.Lower, blas.NoTrans, diag, n-j-jb, jb, 1, a[(j+jb)*lda+j+jb:], lda, a[(j+jb)*lda+j:], lda) bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, diag, n-j-jb, jb, -1, a[j*lda+j:], lda, a[(j+jb)*lda+j:], lda) } impl.Dtrti2(blas.Lower, diag, jb, a[j*lda+j:], lda) } return true }
// Dorg2r generates an m×n matrix Q with orthonormal columns defined by the // product of elementary reflectors as computed by Dgeqrf. // Q = H_0 * H_1 * ... * H_{k-1} // len(tau) >= k, 0 <= k <= n, 0 <= n <= m, len(work) >= n. // Dorg2r will panic if these conditions are not met. // // Dorg2r is an internal routine. It is exported for testing purposes. func (impl Implementation) Dorg2r(m, n, k int, a []float64, lda int, tau []float64, work []float64) { checkMatrix(m, n, a, lda) if len(tau) < k { panic(badTau) } if len(work) < n { panic(badWork) } if k > n { panic(kGTN) } if n > m { panic(mLTN) } if len(work) < n { panic(badWork) } if n == 0 { return } bi := blas64.Implementation() // Initialize columns k+1:n to columns of the unit matrix. for l := 0; l < m; l++ { for j := k; j < n; j++ { a[l*lda+j] = 0 } } for j := k; j < n; j++ { a[j*lda+j] = 1 } for i := k - 1; i >= 0; i-- { for i := range work { work[i] = 0 } if i < n-1 { a[i*lda+i] = 1 impl.Dlarf(blas.Left, m-i, n-i-1, a[i*lda+i:], lda, tau[i], a[i*lda+i+1:], lda, work) } if i < m-1 { bi.Dscal(m-i-1, -tau[i], a[(i+1)*lda+i:], lda) } a[i*lda+i] = 1 - tau[i] for l := 0; l < i; l++ { a[l*lda+i] = 0 } } }
// Dtrtrs solves a triangular system of the form A * X = B or A^T * X = B. Dtrtrs // returns whether the solve completed successfully. If A is singular, no solve is performed. func (impl Implementation) Dtrtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool) { nounit := diag == blas.NonUnit if n == 0 { return false } // Check for singularity. if nounit { for i := 0; i < n; i++ { if a[i*lda+i] == 0 { return false } } } bi := blas64.Implementation() bi.Dtrsm(blas.Left, uplo, trans, diag, n, nrhs, 1, a, lda, b, ldb) return true }
// Dorgl2 generates an m×n matrix Q with orthonormal rows defined by the // first m rows product of elementary reflectors as computed by Dgelqf. // Q = H_0 * H_1 * ... * H_{k-1} // len(tau) >= k, 0 <= k <= m, 0 <= m <= n, len(work) >= m. // Dorgl2 will panic if these conditions are not met. // // Dorgl2 is an internal routine. It is exported for testing purposes. func (impl Implementation) Dorgl2(m, n, k int, a []float64, lda int, tau, work []float64) { checkMatrix(m, n, a, lda) if len(tau) < k { panic(badTau) } if k > m { panic(kGTM) } if k > m { panic(kGTM) } if m > n { panic(nLTM) } if len(work) < m { panic(badWork) } if m == 0 { return } bi := blas64.Implementation() if k < m { for i := k; i < m; i++ { for j := 0; j < n; j++ { a[i*lda+j] = 0 } } for j := k; j < m; j++ { a[j*lda+j] = 1 } } for i := k - 1; i >= 0; i-- { if i < n-1 { if i < m-1 { a[i*lda+i] = 1 impl.Dlarf(blas.Right, m-i-1, n-i, a[i*lda+i:], 1, tau[i], a[(i+1)*lda+i:], lda, work) } bi.Dscal(n-i-1, -tau[i], a[i*lda+i+1:], 1) } a[i*lda+i] = 1 - tau[i] for l := 0; l < i; l++ { a[i*lda+l] = 0 } } }
// Dtrti2 computes the inverse of a triangular matrix, storing the result in place // into a. This is the BLAS level 2 version of the algorithm. func (impl Implementation) Dtrti2(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) { checkMatrix(n, n, a, lda) if uplo != blas.Upper && uplo != blas.Lower { panic(badUplo) } if diag != blas.NonUnit && diag != blas.Unit { panic(badDiag) } bi := blas64.Implementation() nonUnit := diag == blas.NonUnit // TODO(btracey): Replace this with a row-major ordering. if uplo == blas.Upper { for j := 0; j < n; j++ { var ajj float64 if nonUnit { ajj = 1 / a[j*lda+j] a[j*lda+j] = ajj ajj *= -1 } else { ajj = -1 } bi.Dtrmv(blas.Upper, blas.NoTrans, diag, j, a, lda, a[j:], lda) bi.Dscal(j, ajj, a[j:], lda) } return } for j := n - 1; j >= 0; j-- { var ajj float64 if nonUnit { ajj = 1 / a[j*lda+j] a[j*lda+j] = ajj ajj *= -1 } else { ajj = -1 } if j < n-1 { bi.Dtrmv(blas.Lower, blas.NoTrans, diag, n-j-1, a[(j+1)*lda+j+1:], lda, a[(j+1)*lda+j:], lda) bi.Dscal(n-j-1, ajj, a[(j+1)*lda+j:], lda) } } }
// Dgetrf computes the LU decomposition of the m×n matrix A. // The LU decomposition is a factorization of A into // A = P * L * U // where P is a permutation matrix, L is a unit lower triangular matrix, and // U is a (usually) non-unit upper triangular matrix. On exit, L and U are stored // in place into a. // // ipiv is a permutation vector. It indicates that row i of the matrix was // changed with ipiv[i]. ipiv must have length at least min(m,n), and will panic // otherwise. ipiv is zero-indexed. // // Dgetrf is the blocked version of the algorithm. // // Dgetrf returns whether the matrix A is singular. The LU decomposition will // be computed regardless of the singularity of A, but division by zero // will occur if the false is returned and the result is used to solve a // system of equations. func (impl Implementation) Dgetrf(m, n int, a []float64, lda int, ipiv []int) (ok bool) { mn := min(m, n) checkMatrix(m, n, a, lda) if len(ipiv) < mn { panic(badIpiv) } if m == 0 || n == 0 { return } bi := blas64.Implementation() nb := impl.Ilaenv(1, "DGETRF", " ", m, n, -1, -1) if nb <= 1 || nb >= min(m, n) { // Use the unblocked algorithm. return impl.Dgetf2(m, n, a, lda, ipiv) } ok = true for j := 0; j < mn; j += nb { jb := min(mn-j, nb) blockOk := impl.Dgetf2(m-j, jb, a[j*lda+j:], lda, ipiv[j:]) if !blockOk { ok = false } for i := j; i <= min(m-1, j+jb-1); i++ { ipiv[i] = j + ipiv[i] } impl.Dlaswp(j, a, lda, j, j+jb-1, ipiv, 1) if j+jb < n { impl.Dlaswp(n-j-jb, a[j+jb:], lda, j, j+jb-1, ipiv, 1) bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.Unit, jb, n-j-jb, 1, a[j*lda+j:], lda, a[j*lda+j+jb:], lda) if j+jb < m { bi.Dgemm(blas.NoTrans, blas.NoTrans, m-j-jb, n-j-jb, jb, -1, a[(j+jb)*lda+j:], lda, a[j*lda+j+jb:], lda, 1, a[(j+jb)*lda+j+jb:], lda) } } } return ok }
// Dorg2l generates an m×n matrix Q with orthonormal columns which is defined // as the last n columns of a product of k elementary reflectors of order m. // Q = H_{k-1} * ... * H_1 * H_0 // See Dgelqf for more information. It must be that m >= n >= k. // // tau contains the scalar reflectors computed by Dgeqlf. tau must have length // at least k, and Dorg2l will panic otherwise. // // work contains temporary memory, and must have length at least n. Dorg2l will // panic otherwise. // // Dorg2l is an internal routine. It is exported for testing purposes. func (impl Implementation) Dorg2l(m, n, k int, a []float64, lda int, tau, work []float64) { checkMatrix(m, n, a, lda) if len(tau) < k { panic(badTau) } if len(work) < n { panic(badWork) } if m < n { panic(mLTN) } if k > n { panic(kGTN) } if n == 0 { return } // Initialize columns 0:n-k to columns of the unit matrix. for j := 0; j < n-k; j++ { for l := 0; l < m; l++ { a[l*lda+j] = 0 } a[(m-n+j)*lda+j] = 1 } bi := blas64.Implementation() for i := 0; i < k; i++ { ii := n - k + i // Apply H_i to A[0:m-k+i, 0:n-k+i] from the left. a[(m-n+ii)*lda+ii] = 1 impl.Dlarf(blas.Left, m-n+ii+1, ii, a[ii:], lda, tau[i], a, lda, work) bi.Dscal(m-n+ii, -tau[i], a[ii:], lda) a[(m-n+ii)*lda+ii] = 1 - tau[i] // Set A[m-k+i:m, n-k+i+1] to zero. for l := m - n + ii + 1; l < m; l++ { a[l*lda+ii] = 0 } } }
// Dgetf2 computes the LU decomposition of the m×n matrix A. // The LU decomposition is a factorization of a into // A = P * L * U // where P is a permutation matrix, L is a unit lower triangular matrix, and // U is a (usually) non-unit upper triangular matrix. On exit, L and U are stored // in place into a. // // ipiv is a permutation vector. It indicates that row i of the matrix was // changed with ipiv[i]. ipiv must have length at least min(m,n), and will panic // otherwise. ipiv is zero-indexed. // // Dgetf2 returns whether the matrix A is singular. The LU decomposition will // be computed regardless of the singularity of A, but division by zero // will occur if the false is returned and the result is used to solve a // system of equations. // // Dgetf2 is an internal routine. It is exported for testing purposes. func (Implementation) Dgetf2(m, n int, a []float64, lda int, ipiv []int) (ok bool) { mn := min(m, n) checkMatrix(m, n, a, lda) if len(ipiv) < mn { panic(badIpiv) } if m == 0 || n == 0 { return true } bi := blas64.Implementation() sfmin := dlamchS ok = true for j := 0; j < mn; j++ { // Find a pivot and test for singularity. jp := j + bi.Idamax(m-j, a[j*lda+j:], lda) ipiv[j] = jp if a[jp*lda+j] == 0 { ok = false } else { // Swap the rows if necessary. if jp != j { bi.Dswap(n, a[j*lda:], 1, a[jp*lda:], 1) } if j < m-1 { aj := a[j*lda+j] if math.Abs(aj) >= sfmin { bi.Dscal(m-j-1, 1/aj, a[(j+1)*lda+j:], lda) } else { for i := 0; i < m-j-1; i++ { a[(j+1)*lda+j] = a[(j+1)*lda+j] / a[lda*j+j] } } } } if j < mn-1 { bi.Dger(m-j-1, n-j-1, -1, a[(j+1)*lda+j:], lda, a[j*lda+j+1:], 1, a[(j+1)*lda+j+1:], lda) } } return ok }
// Dlarfg generates an elementary reflector for a Householder matrix. It creates // a real elementary reflector of order n such that // H * (alpha) = (beta) // ( x) ( 0) // H^T * H = I // H is represented in the form // H = 1 - tau * (1; v) * (1 v^T) // where tau is a real scalar. // // On entry, x contains the vector x, on exit it contains v. func (impl Implementation) Dlarfg(n int, alpha float64, x []float64, incX int) (beta, tau float64) { if n < 0 { panic(nLT0) } if n <= 1 { return alpha, 0 } checkVector(n-1, x, incX) bi := blas64.Implementation() xnorm := bi.Dnrm2(n-1, x, incX) if xnorm == 0 { return alpha, 0 } beta = -math.Copysign(impl.Dlapy2(alpha, xnorm), alpha) safmin := dlamchS / dlamchE knt := 0 if math.Abs(beta) < safmin { // xnorm and beta may be innacurate, scale x and recompute. rsafmn := 1 / safmin for { knt++ bi.Dscal(n-1, rsafmn, x, incX) beta *= rsafmn alpha *= rsafmn if math.Abs(beta) >= safmin { break } } xnorm = bi.Dnrm2(n-1, x, incX) beta = -math.Copysign(impl.Dlapy2(alpha, xnorm), alpha) } tau = (beta - alpha) / beta bi.Dscal(n-1, 1/(alpha-beta), x, incX) for j := 0; j < knt; j++ { beta *= safmin } return beta, tau }
func DtrtriTest(t *testing.T, impl Dtrtrier) { bi := blas64.Implementation() for _, uplo := range []blas.Uplo{blas.Upper} { for _, diag := range []blas.Diag{blas.NonUnit, blas.Unit} { for _, test := range []struct { n, lda int }{ {3, 0}, {70, 0}, {200, 0}, {3, 5}, {70, 92}, {200, 205}, } { n := test.n lda := test.lda if lda == 0 { lda = n } a := make([]float64, n*lda) for i := range a { a[i] = rand.Float64() + 1 // This keeps the matrices well conditioned. } aCopy := make([]float64, len(a)) copy(aCopy, a) impl.Dtrtri(uplo, diag, n, a, lda) if uplo == blas.Upper { for i := 1; i < n; i++ { for j := 0; j < i; j++ { aCopy[i*lda+j] = 0 a[i*lda+j] = 0 } } } else { for i := 1; i < n; i++ { for j := i + 1; j < n; j++ { aCopy[i*lda+j] = 0 a[i*lda+j] = 0 } } } if diag == blas.Unit { for i := 0; i < n; i++ { a[i*lda+i] = 1 aCopy[i*lda+i] = 1 } } ans := make([]float64, len(a)) bi.Dgemm(blas.NoTrans, blas.NoTrans, n, n, n, 1, a, lda, aCopy, lda, 0, ans, lda) iseye := true for i := 0; i < n; i++ { for j := 0; j < n; j++ { if i == j { if math.Abs(ans[i*lda+i]-1) > 1e-4 { iseye = false break } } else { if math.Abs(ans[i*lda+j]) > 1e-4 { iseye = false break } } } } if !iseye { t.Errorf("inv(A) * A != I. Upper = %v, unit = %v, n = %v, lda = %v", uplo == blas.Upper, diag == blas.Unit, n, lda) } } } } }
func DormbrTest(t *testing.T, impl Dormbrer) { rnd := rand.New(rand.NewSource(1)) bi := blas64.Implementation() for _, vect := range []lapack.DecompUpdate{lapack.ApplyQ, lapack.ApplyP} { for _, side := range []blas.Side{blas.Left, blas.Right} { for _, trans := range []blas.Transpose{blas.NoTrans, blas.Trans} { for _, test := range []struct { m, n, k, lda, ldc int }{ {3, 4, 5, 0, 0}, {3, 5, 4, 0, 0}, {4, 3, 5, 0, 0}, {4, 5, 3, 0, 0}, {5, 3, 4, 0, 0}, {5, 4, 3, 0, 0}, {3, 4, 5, 10, 12}, {3, 5, 4, 10, 12}, {4, 3, 5, 10, 12}, {4, 5, 3, 10, 12}, {5, 3, 4, 10, 12}, {5, 4, 3, 10, 12}, } { m := test.m n := test.n k := test.k ldc := test.ldc if ldc == 0 { ldc = n } nq := n if side == blas.Left { nq = m } // Compute a decomposition. var ma, na int var a []float64 if vect == lapack.ApplyQ { ma = nq na = k } else { ma = k na = nq } lda := test.lda if lda == 0 { lda = na } a = make([]float64, ma*lda) for i := range a { a[i] = rnd.NormFloat64() } nTau := min(nq, k) tauP := make([]float64, nTau) tauQ := make([]float64, nTau) d := make([]float64, nTau) e := make([]float64, nTau) lwork := -1 work := make([]float64, 1) impl.Dgebrd(ma, na, a, lda, d, e, tauQ, tauP, work, lwork) work = make([]float64, int(work[0])) lwork = len(work) impl.Dgebrd(ma, na, a, lda, d, e, tauQ, tauP, work, lwork) // Apply and compare update. c := make([]float64, m*ldc) for i := range c { c[i] = rnd.NormFloat64() } cCopy := make([]float64, len(c)) copy(cCopy, c) if vect == lapack.ApplyQ { impl.Dormbr(vect, side, trans, m, n, k, a, lda, tauQ, c, ldc, work, lwork) } else { impl.Dormbr(vect, side, trans, m, n, k, a, lda, tauP, c, ldc, work, lwork) } // Check that the multiplication was correct. cOrig := blas64.General{ Rows: m, Cols: n, Stride: ldc, Data: make([]float64, len(cCopy)), } copy(cOrig.Data, cCopy) cAns := blas64.General{ Rows: m, Cols: n, Stride: ldc, Data: make([]float64, len(cCopy)), } copy(cAns.Data, cCopy) nb := min(ma, na) var mulMat blas64.General if vect == lapack.ApplyQ { mulMat = constructQPBidiagonal(lapack.ApplyQ, ma, na, nb, a, lda, tauQ) } else { mulMat = constructQPBidiagonal(lapack.ApplyP, ma, na, nb, a, lda, tauP) } mulTrans := trans if side == blas.Left { bi.Dgemm(mulTrans, blas.NoTrans, m, n, m, 1, mulMat.Data, mulMat.Stride, cOrig.Data, cOrig.Stride, 0, cAns.Data, cAns.Stride) } else { bi.Dgemm(blas.NoTrans, mulTrans, m, n, n, 1, cOrig.Data, cOrig.Stride, mulMat.Data, mulMat.Stride, 0, cAns.Data, cAns.Stride) } if !floats.EqualApprox(cAns.Data, c, 1e-8) { isApplyQ := vect == lapack.ApplyQ isLeft := side == blas.Left isTrans := trans == blas.Trans t.Errorf("C mismatch. isApplyQ: %v, isLeft: %v, isTrans: %v, m = %v, n = %v, k = %v, lda = %v, ldc = %v", isApplyQ, isLeft, isTrans, m, n, k, lda, ldc) } } } } } }
// Dlarfb applies a block reflector to a matrix. // // In the call to Dlarfb, the mxn c is multiplied by the implicitly defined matrix h as follows: // c = h * c if side == Left and trans == NoTrans // c = c * h if side == Right and trans == NoTrans // c = h^T * c if side == Left and trans == Trans // c = c * h^t if side == Right and trans == Trans // h is a product of elementary reflectors. direct sets the direction of multiplication // h = h_1 * h_2 * ... * h_k if direct == Forward // h = h_k * h_k-1 * ... * h_1 if direct == Backward // The combination of direct and store defines the orientation of the elementary // reflectors. In all cases the ones on the diagonal are implicitly represented. // // If direct == lapack.Forward and store == lapack.ColumnWise // V = ( 1 ) // ( v1 1 ) // ( v1 v2 1 ) // ( v1 v2 v3 ) // ( v1 v2 v3 ) // If direct == lapack.Forward and store == lapack.RowWise // V = ( 1 v1 v1 v1 v1 ) // ( 1 v2 v2 v2 ) // ( 1 v3 v3 ) // If direct == lapack.Backward and store == lapack.ColumnWise // V = ( v1 v2 v3 ) // ( v1 v2 v3 ) // ( 1 v2 v3 ) // ( 1 v3 ) // ( 1 ) // If direct == lapack.Backward and store == lapack.RowWise // V = ( v1 v1 1 ) // ( v2 v2 v2 1 ) // ( v3 v3 v3 v3 1 ) // An elementary reflector can be explicitly constructed by extracting the // corresponding elements of v, placing a 1 where the diagonal would be, and // placing zeros in the remaining elements. // // t is a k×k matrix containing the block reflector, and this function will panic // if t is not of sufficient size. See Dlarft for more information. // // Work is a temporary storage matrix with stride ldwork. // Work must be of size at least n×k side == Left and m×k if side == Right, and // this function will panic if this size is not met. func (Implementation) Dlarfb(side blas.Side, trans blas.Transpose, direct lapack.Direct, store lapack.StoreV, m, n, k int, v []float64, ldv int, t []float64, ldt int, c []float64, ldc int, work []float64, ldwork int) { checkMatrix(m, n, c, ldc) if m == 0 || n == 0 { return } if k < 0 { panic("lapack: negative number of transforms") } if side != blas.Left && side != blas.Right { panic(badSide) } if trans != blas.Trans && trans != blas.NoTrans { panic(badTrans) } if direct != lapack.Forward && direct != lapack.Backward { panic(badDirect) } if store != lapack.ColumnWise && store != lapack.RowWise { panic(badStore) } rowsWork := n if side == blas.Right { rowsWork = m } checkMatrix(rowsWork, k, work, ldwork) bi := blas64.Implementation() transt := blas.Trans if trans == blas.Trans { transt = blas.NoTrans } // TODO(btracey): This follows the original Lapack code where the // elements are copied into the columns of the working array. The // loops should go in the other direction so the data is written // into the rows of work so the copy is not strided. A bigger change // would be to replace work with work^T, but benchmarks would be // needed to see if the change is merited. if store == lapack.ColumnWise { if direct == lapack.Forward { // V1 is the first k rows of C. V2 is the remaining rows. if side == blas.Left { // W = C^T V = C1^T V1 + C2^T V2 (stored in work). // W = C1. for j := 0; j < k; j++ { bi.Dcopy(n, c[j*ldc:], 1, work[j:], ldwork) } // W = W * V1. bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, n, k, 1, v, ldv, work, ldwork) if m > k { // W = W + C2^T V2. bi.Dgemm(blas.Trans, blas.NoTrans, n, k, m-k, 1, c[k*ldc:], ldc, v[k*ldv:], ldv, 1, work, ldwork) } // W = W * T^T or W * T. bi.Dtrmm(blas.Right, blas.Upper, transt, blas.NonUnit, n, k, 1, t, ldt, work, ldwork) // C -= V * W^T. if m > k { // C2 -= V2 * W^T. bi.Dgemm(blas.NoTrans, blas.Trans, m-k, n, k, -1, v[k*ldv:], ldv, work, ldwork, 1, c[k*ldc:], ldc) } // W *= V1^T. bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, n, k, 1, v, ldv, work, ldwork) // C1 -= W^T. // TODO(btracey): This should use blas.Axpy. for i := 0; i < n; i++ { for j := 0; j < k; j++ { c[j*ldc+i] -= work[i*ldwork+j] } } return } // Form C = C * H or C * H^T, where C = (C1 C2). // W = C1. for i := 0; i < k; i++ { bi.Dcopy(m, c[i:], ldc, work[i:], ldwork) } // W *= V1. bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, m, k, 1, v, ldv, work, ldwork) if n > k { bi.Dgemm(blas.NoTrans, blas.NoTrans, m, k, n-k, 1, c[k:], ldc, v[k*ldv:], ldv, 1, work, ldwork) } // W *= T or T^T. bi.Dtrmm(blas.Right, blas.Upper, trans, blas.NonUnit, m, k, 1, t, ldt, work, ldwork) if n > k { bi.Dgemm(blas.NoTrans, blas.Trans, m, n-k, k, -1, work, ldwork, v[k*ldv:], ldv, 1, c[k:], ldc) } // C -= W * V^T. bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, m, k, 1, v, ldv, work, ldwork) // C -= W. // TODO(btracey): This should use blas.Axpy. for i := 0; i < m; i++ { for j := 0; j < k; j++ { c[i*ldc+j] -= work[i*ldwork+j] } } return } // V = (V1) // = (V2) (last k rows) // Where V2 is unit upper triangular. if side == blas.Left { // Form H * C or // W = C^T V. // W = C2^T. for j := 0; j < k; j++ { bi.Dcopy(n, c[(m-k+j)*ldc:], 1, work[j:], ldwork) } // W *= V2. bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, n, k, 1, v[(m-k)*ldv:], ldv, work, ldwork) if m > k { // W += C1^T * V1. bi.Dgemm(blas.Trans, blas.NoTrans, n, k, m-k, 1, c, ldc, v, ldv, 1, work, ldwork) } // W *= T or T^T. bi.Dtrmm(blas.Right, blas.Lower, transt, blas.NonUnit, n, k, 1, t, ldt, work, ldwork) // C -= V * W^T. if m > k { bi.Dgemm(blas.NoTrans, blas.Trans, m-k, n, k, -1, v, ldv, work, ldwork, 1, c, ldc) } // W *= V2^T. bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, n, k, 1, v[(m-k)*ldv:], ldv, work, ldwork) // C2 -= W^T. // TODO(btracey): This should use blas.Axpy. for i := 0; i < n; i++ { for j := 0; j < k; j++ { c[(m-k+j)*ldc+i] -= work[i*ldwork+j] } } return } // Form C * H or C * H^T where C = (C1 C2). // W = C * V. // W = C2. for j := 0; j < k; j++ { bi.Dcopy(m, c[n-k+j:], ldc, work[j:], ldwork) } // W = W * V2. bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, m, k, 1, v[(n-k)*ldv:], ldv, work, ldwork) if n > k { bi.Dgemm(blas.NoTrans, blas.NoTrans, m, k, n-k, 1, c, ldc, v, ldv, 1, work, ldwork) } // W *= T or T^T. bi.Dtrmm(blas.Right, blas.Lower, trans, blas.NonUnit, m, k, 1, t, ldt, work, ldwork) // C -= W * V^T. if n > k { // C1 -= W * V1^T. bi.Dgemm(blas.NoTrans, blas.Trans, m, n-k, k, -1, work, ldwork, v, ldv, 1, c, ldc) } // W *= V2^T. bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, m, k, 1, v[(n-k)*ldv:], ldv, work, ldwork) // C2 -= W. // TODO(btracey): This should use blas.Axpy. for i := 0; i < m; i++ { for j := 0; j < k; j++ { c[i*ldc+n-k+j] -= work[i*ldwork+j] } } return } // Store = Rowwise. if direct == lapack.Forward { // V = (V1 V2) where v1 is unit upper triangular. if side == blas.Left { // Form H * C or H^T * C where C = (C1; C2). // W = C^T * V^T. // W = C1^T. for j := 0; j < k; j++ { bi.Dcopy(n, c[j*ldc:], 1, work[j:], ldwork) } // W *= V1^T. bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, n, k, 1, v, ldv, work, ldwork) if m > k { bi.Dgemm(blas.Trans, blas.Trans, n, k, m-k, 1, c[k*ldc:], ldc, v[k:], ldv, 1, work, ldwork) } // W *= T or T^T. bi.Dtrmm(blas.Right, blas.Upper, transt, blas.NonUnit, n, k, 1, t, ldt, work, ldwork) // C -= V^T * W^T. if m > k { bi.Dgemm(blas.Trans, blas.Trans, m-k, n, k, -1, v[k:], ldv, work, ldwork, 1, c[k*ldc:], ldc) } // W *= V1. bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, n, k, 1, v, ldv, work, ldwork) // C1 -= W^T. // TODO(btracey): This should use blas.Axpy. for i := 0; i < n; i++ { for j := 0; j < k; j++ { c[j*ldc+i] -= work[i*ldwork+j] } } return } // Form C * H or C * H^T where C = (C1 C2). // W = C * V^T. // W = C1. for j := 0; j < k; j++ { bi.Dcopy(m, c[j:], ldc, work[j:], ldwork) } // W *= V1^T. bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, m, k, 1, v, ldv, work, ldwork) if n > k { bi.Dgemm(blas.NoTrans, blas.Trans, m, k, n-k, 1, c[k:], ldc, v[k:], ldv, 1, work, ldwork) } // W *= T or T^T. bi.Dtrmm(blas.Right, blas.Upper, trans, blas.NonUnit, m, k, 1, t, ldt, work, ldwork) // C -= W * V. if n > k { bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n-k, k, -1, work, ldwork, v[k:], ldv, 1, c[k:], ldc) } // W *= V1. bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, m, k, 1, v, ldv, work, ldwork) // C1 -= W. // TODO(btracey): This should use blas.Axpy. for i := 0; i < m; i++ { for j := 0; j < k; j++ { c[i*ldc+j] -= work[i*ldwork+j] } } return } // V = (V1 V2) where V2 is the last k columns and is lower unit triangular. if side == blas.Left { // Form H * C or H^T C where C = (C1 ; C2). // W = C^T * V^T. // W = C2^T. for j := 0; j < k; j++ { bi.Dcopy(n, c[(m-k+j)*ldc:], 1, work[j:], ldwork) } // W *= V2^T. bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, n, k, 1, v[m-k:], ldv, work, ldwork) if m > k { bi.Dgemm(blas.Trans, blas.Trans, n, k, m-k, 1, c, ldc, v, ldv, 1, work, ldwork) } // W *= T or T^T. bi.Dtrmm(blas.Right, blas.Lower, transt, blas.NonUnit, n, k, 1, t, ldt, work, ldwork) // C -= V^T * W^T. if m > k { bi.Dgemm(blas.Trans, blas.Trans, m-k, n, k, -1, v, ldv, work, ldwork, 1, c, ldc) } // W *= V2. bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, n, k, 1, v[m-k:], ldv, work, ldwork) // C2 -= W^T. // TODO(btracey): This should use blas.Axpy. for i := 0; i < n; i++ { for j := 0; j < k; j++ { c[(m-k+j)*ldc+i] -= work[i*ldwork+j] } } return } // Form C * H or C * H^T where C = (C1 C2). // W = C * V^T. // W = C2. for j := 0; j < k; j++ { bi.Dcopy(m, c[n-k+j:], ldc, work[j:], ldwork) } // W *= V2^T. bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, m, k, 1, v[n-k:], ldv, work, ldwork) if n > k { bi.Dgemm(blas.NoTrans, blas.Trans, m, k, n-k, 1, c, ldc, v, ldv, 1, work, ldwork) } // W *= T or T^T. bi.Dtrmm(blas.Right, blas.Lower, trans, blas.NonUnit, m, k, 1, t, ldt, work, ldwork) // C -= W * V. if n > k { bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n-k, k, -1, work, ldwork, v, ldv, 1, c, ldc) } // W *= V2. bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, m, k, 1, v[n-k:], ldv, work, ldwork) // C1 -= W. // TODO(btracey): This should use blas.Axpy. for i := 0; i < m; i++ { for j := 0; j < k; j++ { c[i*ldc+n-k+j] -= work[i*ldwork+j] } } }
// Dgetri computes the inverse of the matrix A using the LU factorization computed // by Dgetrf. On entry, a contains the PLU decomposition of A as computed by // Dgetrf and on exit contains the reciprocal of the original matrix. // // Dgetri will not perform the inversion if the matrix is singular, and returns // a boolean indicating whether the inversion was successful. // // Work is temporary storage, and lwork specifies the usable memory length. // At minimum, lwork >= n and this function will panic otherwise. // Dgetri is a blocked inversion, but the block size is limited // by the temporary space available. If lwork == -1, instead of performing Dgetri, // the optimal work length will be stored into work[0]. func (impl Implementation) Dgetri(n int, a []float64, lda int, ipiv []int, work []float64, lwork int) (ok bool) { checkMatrix(n, n, a, lda) if len(ipiv) < n { panic(badIpiv) } nb := impl.Ilaenv(1, "DGETRI", " ", n, -1, -1, -1) if lwork == -1 { work[0] = float64(n * nb) return true } if lwork < n { panic(badWork) } if len(work) < lwork { panic(badWork) } if n == 0 { return true } ok = impl.Dtrtri(blas.Upper, blas.NonUnit, n, a, lda) if !ok { return false } nbmin := 2 ldwork := nb if nb > 1 && nb < n { iws := max(ldwork*n, 1) if lwork < iws { nb = lwork / ldwork nbmin = max(2, impl.Ilaenv(2, "DGETRI", " ", n, -1, -1, -1)) } } bi := blas64.Implementation() // TODO(btracey): Replace this with a more row-major oriented algorithm. if nb < nbmin || nb >= n { // Unblocked code. for j := n - 1; j >= 0; j-- { for i := j + 1; i < n; i++ { work[i*ldwork] = a[i*lda+j] a[i*lda+j] = 0 } if j < n { bi.Dgemv(blas.NoTrans, n, n-j-1, -1, a[(j+1):], lda, work[(j+1)*ldwork:], ldwork, 1, a[j:], lda) } } } else { nn := ((n - 1) / nb) * nb for j := nn; j >= 0; j -= nb { jb := min(nb, n-j) for jj := j; jj < j+jb-1; jj++ { for i := jj + 1; i < n; i++ { work[i*ldwork+(jj-j)] = a[i*lda+jj] a[i*lda+jj] = 0 } } if j+jb < n { bi.Dgemm(blas.NoTrans, blas.NoTrans, n, jb, n-j-jb, -1, a[(j+jb):], lda, work[(j+jb)*ldwork:], ldwork, 1, a[j:], lda) bi.Dtrsm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, n, jb, 1, work[j*ldwork:], ldwork, a[j:], lda) } } } for j := n - 2; j >= 0; j-- { jp := ipiv[j] if jp != j { bi.Dswap(n, a[j:], lda, a[jp:], lda) } } return true }
// Dgesvd computes the singular value decomposition of the input matrix A. // // The singular value decomposition is // A = U * Sigma * V^T // where Sigma is an m×n diagonal matrix containing the singular values of A, // U is an m×m orthogonal matrix and V is an n×n orthogonal matrix. The first // min(m,n) columns of U and V are the left and right singular vectors of A // respectively. // // jobU and jobVT are options for computing the singular vectors. The behavior // is as follows // jobU == lapack.SVDAll All m columns of U are returned in u // jobU == lapack.SVDInPlace The first min(m,n) columns are returned in u // jobU == lapack.SVDOverwrite The first min(m,n) columns of U are written into a // jobU == lapack.SVDNone The columns of U are not computed. // The behavior is the same for jobVT and the rows of V^T. At most one of jobU // and jobVT can equal lapack.SVDOverwrite, and Dgesvd will panic otherwise. // // On entry, a contains the data for the m×n matrix A. During the call to Dgesvd // the data is overwritten. On exit, A contains the appropriate singular vectors // if either job is lapack.SVDOverwrite. // // s is a slice of length at least min(m,n) and on exit contains the singular // values in decreasing order. // // u contains the left singular vectors on exit, stored column-wise. If // jobU == lapack.SVDAll, u is of size m×m. If jobU == lapack.SVDInPlace u is // of size m×min(m,n). If jobU == lapack.SVDOverwrite or lapack.SVDNone, u is // not used. // // vt contains the left singular vectors on exit, stored row-wise. If // jobV == lapack.SVDAll, vt is of size n×m. If jobVT == lapack.SVDInPlace vt is // of size min(m,n)×n. If jobVT == lapack.SVDOverwrite or lapack.SVDNone, vt is // not used. // // work is a slice for storing temporary memory, and lwork is the usable size of // the slice. lwork must be at least max(5*min(m,n), 3*min(m,n)+max(m,n)). // If lwork == -1, instead of performing Dgesvd, the optimal work length will be // stored into work[0]. Dgesvd will panic if the working memory has insufficient // storage. // // Dgesvd returns whether the decomposition successfully completed. func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, lwork int) (ok bool) { minmn := min(m, n) checkMatrix(m, n, a, lda) if jobU == lapack.SVDAll { checkMatrix(m, m, u, ldu) } else if jobU == lapack.SVDInPlace { checkMatrix(m, minmn, u, ldu) } if jobVT == lapack.SVDAll { checkMatrix(n, n, vt, ldvt) } else if jobVT == lapack.SVDInPlace { checkMatrix(minmn, n, vt, ldvt) } if jobU == lapack.SVDOverwrite && jobVT == lapack.SVDOverwrite { panic("lapack: both jobU and jobVT are lapack.SVDOverwrite") } if len(s) < minmn { panic(badS) } if jobU == lapack.SVDOverwrite || jobVT == lapack.SVDOverwrite { panic(noSVDO) } if m == 0 || n == 0 { return true } wantua := jobU == lapack.SVDAll wantus := jobU == lapack.SVDInPlace wantuas := wantua || wantus wantuo := jobU == lapack.SVDOverwrite wantun := jobU == lapack.None wantva := jobVT == lapack.SVDAll wantvs := jobVT == lapack.SVDInPlace wantvas := wantva || wantvs wantvo := jobVT == lapack.SVDOverwrite wantvn := jobVT == lapack.None bi := blas64.Implementation() var mnthr int // TODO(btracey): The netlib implementation checks have this at only length 1. // Our implementation checks all input sizes before examining the l == -1 case. // Fix the failing cases to reduce the needed memory here. dum := make([]float64, m*n) // Compute optimal space for subroutines. maxwrk := 1 opts := string(jobU) + string(jobVT) var wrkbl, bdspac int if m >= n { mnthr = impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0) bdspac = 5 * n impl.Dgeqrf(m, n, a, lda, dum, dum, -1) lwork_dgeqrf := int(dum[0]) impl.Dorgqr(m, n, n, a, lda, dum, dum, -1) lwork_dorgqr_n := int(dum[0]) impl.Dorgqr(m, m, n, a, lda, dum, dum, -1) lwork_dorgqr_m := int(dum[0]) impl.Dgebrd(n, n, a, lda, s, dum, dum, dum, dum, -1) lwork_dgebrd := int(dum[0]) impl.Dorgbr(lapack.ApplyP, n, n, n, a, lda, dum, dum, -1) lwork_dorgbr_p := int(dum[0]) impl.Dorgbr(lapack.ApplyQ, n, n, n, a, lda, dum, dum, -1) lwork_dorgbr_q := int(dum[0]) if m >= mnthr { // m >> n if wantun { // Path 1 maxwrk = n + lwork_dgeqrf maxwrk = max(maxwrk, 3*n+lwork_dgebrd) if wantvo || wantvas { maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p) } maxwrk = max(maxwrk, bdspac) } else if wantuo && wantvn { // Path 2 wrkbl = n + lwork_dgeqrf wrkbl = max(wrkbl, n+lwork_dorgqr_n) wrkbl = max(wrkbl, 3*n+lwork_dgebrd) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) wrkbl = max(wrkbl, bdspac) maxwrk = max(n*n+wrkbl, n*n+m*n+n) } else if wantuo && wantvs { // Path 3 wrkbl = n + lwork_dgeqrf wrkbl = max(wrkbl, n+lwork_dorgqr_n) wrkbl = max(wrkbl, 3*n+lwork_dgebrd) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) wrkbl = max(wrkbl, bdspac) maxwrk = max(n*n+wrkbl, n*n+m*n+n) } else if wantus && wantvn { // Path 4 wrkbl = n + lwork_dgeqrf wrkbl = max(wrkbl, n+lwork_dorgqr_n) wrkbl = max(wrkbl, 3*n+lwork_dgebrd) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) wrkbl = max(wrkbl, bdspac) maxwrk = n*n + wrkbl } else if wantus && wantvo { // Path 5 wrkbl = n + lwork_dgeqrf wrkbl = max(wrkbl, n+lwork_dorgqr_n) wrkbl = max(wrkbl, 3*n+lwork_dgebrd) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) wrkbl = max(wrkbl, bdspac) maxwrk = 2*n*n + wrkbl } else if wantus && wantvas { // Path 6 wrkbl = n + lwork_dgeqrf wrkbl = max(wrkbl, n+lwork_dorgqr_n) wrkbl = max(wrkbl, 3*n+lwork_dgebrd) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) wrkbl = max(wrkbl, bdspac) maxwrk = n*n + wrkbl } else if wantua && wantvn { // Path 7 wrkbl = n + lwork_dgeqrf wrkbl = max(wrkbl, n+lwork_dorgqr_m) wrkbl = max(wrkbl, 3*n+lwork_dgebrd) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) wrkbl = max(wrkbl, bdspac) maxwrk = n*n + wrkbl } else if wantua && wantvo { // Path 8 wrkbl = n + lwork_dgeqrf wrkbl = max(wrkbl, n+lwork_dorgqr_m) wrkbl = max(wrkbl, 3*n+lwork_dgebrd) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) wrkbl = max(wrkbl, bdspac) maxwrk = 2*n*n + wrkbl } else if wantua && wantvas { // Path 9 wrkbl = n + lwork_dgeqrf wrkbl = max(wrkbl, n+lwork_dorgqr_m) wrkbl = max(wrkbl, 3*n+lwork_dgebrd) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) wrkbl = max(wrkbl, bdspac) maxwrk = n*n + wrkbl } } else { // Path 10: m > n impl.Dgebrd(m, n, a, lda, s, dum, dum, dum, dum, -1) lwork_dgebrd := int(dum[0]) maxwrk = 3*n + lwork_dgebrd if wantus || wantuo { impl.Dorgbr(lapack.ApplyQ, m, n, n, a, lda, dum, dum, -1) lwork_dorgbr_q = int(dum[0]) maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q) } if wantua { impl.Dorgbr(lapack.ApplyQ, m, m, n, a, lda, dum, dum, -1) lwork_dorgbr_q := int(dum[0]) maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q) } if !wantvn { maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p) } maxwrk = max(maxwrk, bdspac) } } else { mnthr = impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0) bdspac = 5 * m impl.Dgelqf(m, n, a, lda, dum, dum, -1) lwork_dgelqf := int(dum[0]) impl.Dorglq(n, n, m, dum, n, dum, dum, -1) lwork_dorglq_n := int(dum[0]) impl.Dorglq(m, n, m, a, lda, dum, dum, -1) lwork_dorglq_m := int(dum[0]) impl.Dgebrd(m, m, a, lda, s, dum, dum, dum, dum, -1) lwork_dgebrd := int(dum[0]) impl.Dorgbr(lapack.ApplyP, m, m, m, a, n, dum, dum, -1) lwork_dorgbr_p := int(dum[0]) impl.Dorgbr(lapack.ApplyQ, m, m, m, a, n, dum, dum, -1) lwork_dorgbr_q := int(dum[0]) if n >= mnthr { // n >> m if wantvn { // Path 1t maxwrk = m + lwork_dgelqf maxwrk = max(maxwrk, 3*m+lwork_dgebrd) if wantuo || wantuas { maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q) } maxwrk = max(maxwrk, bdspac) } else if wantvo && wantun { // Path 2t wrkbl = m + lwork_dgelqf wrkbl = max(wrkbl, m+lwork_dorglq_m) wrkbl = max(wrkbl, 3*m+lwork_dgebrd) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) wrkbl = max(wrkbl, bdspac) maxwrk = max(m*m+wrkbl, m*m+m*n+m) } else if wantvo && wantuas { // Path 3t wrkbl = m + lwork_dgelqf wrkbl = max(wrkbl, m+lwork_dorglq_m) wrkbl = max(wrkbl, 3*m+lwork_dgebrd) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) wrkbl = max(wrkbl, bdspac) maxwrk = max(m*m+wrkbl, m*m+m*n+m) } else if wantvs && wantun { // Path 4t wrkbl = m + lwork_dgelqf wrkbl = max(wrkbl, m+lwork_dorglq_m) wrkbl = max(wrkbl, 3*m+lwork_dgebrd) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) wrkbl = max(wrkbl, bdspac) maxwrk = m*m + wrkbl } else if wantvs && wantuo { // Path 5t wrkbl = m + lwork_dgelqf wrkbl = max(wrkbl, m+lwork_dorglq_m) wrkbl = max(wrkbl, 3*m+lwork_dgebrd) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) wrkbl = max(wrkbl, bdspac) maxwrk = 2*m*m + wrkbl } else if wantvs && wantuas { // Path 6t wrkbl = m + lwork_dgelqf wrkbl = max(wrkbl, m+lwork_dorglq_m) wrkbl = max(wrkbl, 3*m+lwork_dgebrd) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) wrkbl = max(wrkbl, bdspac) maxwrk = m*m + wrkbl } else if wantva && wantun { // Path 7t wrkbl = m + lwork_dgelqf wrkbl = max(wrkbl, m+lwork_dorglq_n) wrkbl = max(wrkbl, 3*m+lwork_dgebrd) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) wrkbl = max(wrkbl, bdspac) maxwrk = m*m + wrkbl } else if wantva && wantuo { // Path 8t wrkbl = m + lwork_dgelqf wrkbl = max(wrkbl, m+lwork_dorglq_n) wrkbl = max(wrkbl, 3*m+lwork_dgebrd) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) wrkbl = max(wrkbl, bdspac) maxwrk = 2*m*m + wrkbl } else if wantva && wantuas { // Path 9t wrkbl = m + lwork_dgelqf wrkbl = max(wrkbl, m+lwork_dorglq_n) wrkbl = max(wrkbl, 3*m+lwork_dgebrd) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) wrkbl = max(wrkbl, bdspac) maxwrk = m*m + wrkbl } } else { // Path 10t, n > m impl.Dgebrd(m, n, a, lda, s, dum, dum, dum, dum, -1) lwork_dgebrd = int(dum[0]) maxwrk := 3*m + lwork_dgebrd if wantvs || wantvo { impl.Dorgbr(lapack.ApplyP, m, n, m, a, n, dum, dum, -1) lwork_dorgbr_p = int(dum[0]) maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p) } if wantva { impl.Dorgbr(lapack.ApplyP, n, n, m, a, n, dum, dum, -1) lwork_dorgbr_p = int(dum[0]) maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p) } if !wantun { maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q) } maxwrk = max(maxwrk, bdspac) } } minWork := max(1, 5*minmn) if !((wantun && m >= mnthr) || (wantvn && n >= mnthr)) { minWork = max(minWork, 3*minmn+max(m, n)) } if lwork != -1 { if len(work) < lwork { panic(badWork) } if lwork < minWork { panic(badWork) } } if m == 0 || n == 0 { return true } maxwrk = max(maxwrk, minWork) work[0] = float64(maxwrk) if lwork == -1 { return true } // Perform decomposition. eps := dlamchE smlnum := math.Sqrt(dlamchS) / eps bignum := 1 / smlnum // Scale A if max element outside range [smlnum, bignum]. anrm := impl.Dlange(lapack.MaxAbs, m, n, a, lda, dum) var iscl bool if anrm > 0 && anrm < smlnum { iscl = true impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, m, n, a, lda) } else if anrm > bignum { iscl = true impl.Dlascl(lapack.General, 0, 0, anrm, bignum, m, n, a, lda) } var ie int if m >= n { // If A has sufficiently more rows than columns, use the QR decomposition. if m >= mnthr { // m >> n if wantun { // Path 1. itau := 0 iwork := itau + n // Compute A = Q * R. impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) // Zero out below R. impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda) ie = 0 itauq := ie + n itaup := itauq + n iwork = itaup + n // Bidiagonalize R in A. impl.Dgebrd(n, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) ncvt := 0 if wantvo || wantvas { // Generate P^T. impl.Dorgbr(lapack.ApplyP, n, n, n, a, lda, work[itaup:], work[iwork:], lwork-iwork) ncvt = n } iwork = ie + n // Perform bidiagonal QR iteration computing right singular vectors // of A in A if desired. ok = impl.Dbdsqr(blas.Upper, n, ncvt, 0, 0, s, work[ie:], a, lda, dum, 1, dum, 1, work[iwork:]) // If right singular vectors desired in VT, copy them there. if wantvas { impl.Dlacpy(blas.All, n, n, a, lda, vt, ldvt) } } else if wantuo && wantvn { // Path 2 panic(noSVDO) } else if wantuo && wantvas { // Path 3 panic(noSVDO) } else if wantus { if wantvn { // Path 4 if lwork >= n*n+max(4*n, bdspac) { // Sufficient workspace for a fast algorithm. ir := 0 var ldworkr int if lwork >= wrkbl+lda*n { ldworkr = lda } else { ldworkr = n } itau := ir + ldworkr*n iwork := itau + n // Compute A = Q * R. impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) // Copy R to work[ir:], zeroing out below it. impl.Dlacpy(blas.Upper, n, n, a, lda, work[ir:], ldworkr) impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[ir+ldworkr:], ldworkr) // Generate Q in A. impl.Dorgqr(m, n, n, a, lda, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + n itaup := itauq + n iwork = itaup + n // Bidiagonalize R in work[ir:]. impl.Dgebrd(n, n, work[ir:], ldworkr, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Generate left vectors bidiagonalizing R in work[ir:]. impl.Dorgbr(lapack.ApplyQ, n, n, n, work[ir:], ldworkr, work[itauq:], work[iwork:], lwork-iwork) iwork = ie + n // Perform bidiagonal QR iteration, compuing left singular // vectors of R in work[ir:]. ok = impl.Dbdsqr(blas.Upper, n, 0, n, 0, s, work[ie:], dum, 1, work[ir:], ldworkr, dum, 1, work[iwork:]) // Multiply Q in A by left singular vectors of R in // work[ir:], storing result in U. bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, a, lda, work[ir:], ldworkr, 0, u, ldu) } else { // Insufficient workspace for a fast algorithm. itau := 0 iwork := itau + n // Compute A = Q*R, copying result to U. impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) // Generate Q in U. impl.Dorgqr(m, n, n, u, ldu, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + n itaup := itauq + n iwork = itaup + n // Zero out below R in A. impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda) // Bidiagonalize R in A. impl.Dgebrd(n, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Multiply Q in U by left vectors bidiagonalizing R. impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, a, lda, work[itauq:], u, ldu, work[iwork:], lwork-iwork) iwork = ie + n // Perform bidiagonal QR iteration, computing left // singular vectors of A in U. ok = impl.Dbdsqr(blas.Upper, n, 0, m, 0, s, work[ie:], dum, 1, u, ldu, dum, 1, work[iwork:]) } } else if wantvo { // Path 5 panic(noSVDO) } else if wantvas { // Path 6 if lwork >= n*n+max(4*n, bdspac) { // Sufficient workspace for a fast algorithm. iu := 0 var ldworku int if lwork >= wrkbl+lda*n { ldworku = lda } else { ldworku = n } itau := iu + ldworku*n iwork := itau + n // Compute A = Q * R. impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) // Copy R to work[iu:], zeroing out below it. impl.Dlacpy(blas.Upper, n, n, a, lda, work[iu:], ldworku) impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[iu+ldworku:], ldworku) // Generate Q in A. impl.Dorgqr(m, n, n, a, lda, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + n itaup := itauq + n iwork = itaup + n // Bidiagonalize R in work[iu:], copying result to VT. impl.Dgebrd(n, n, work[iu:], ldworku, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Upper, n, n, work[iu:], ldworku, vt, ldvt) // Generate left bidiagonalizing vectors in work[iu:]. impl.Dorgbr(lapack.ApplyQ, n, n, n, work[iu:], ldworku, work[itauq:], work[iwork:], lwork-iwork) // Generate right bidiagonalizing vectors in VT. impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork) iwork = ie + n // Perform bidiagonal QR iteration, computing left singular // vectors of R in work[iu:], and computing right singular // vectors of R in VT. ok = impl.Dbdsqr(blas.Upper, n, n, n, 0, s, work[ie:], vt, ldvt, work[iu:], ldworku, dum, 1, work[iwork:]) // Multiply Q in A by left singular vectors of R in // work[iu:], storing result in U. bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, a, lda, work[iu:], ldworku, 0, u, ldu) } else { // Insufficient workspace for a fast algorithm. itau := 0 iwork := itau + n // Compute A = Q * R, copying result to U. impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) // Generate Q in U. impl.Dorgqr(m, n, n, u, ldu, work[itau:], work[iwork:], lwork-iwork) // Copy R to VT, zeroing out below it. impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt) impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt) ie := itau itauq := ie + n itaup := itauq + n iwork = itaup + n // Bidiagonalize R in VT. impl.Dgebrd(n, n, vt, ldvt, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Multiply Q in U by left bidiagonalizing vectors in VT. impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork) // Generate right bidiagonalizing vectors in VT. impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork) iwork = ie + n // Perform bidiagonal QR iteration, computing left singular // vectors of A in U and computing right singular vectors // of A in VT. ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:]) } } } else if wantua { if wantvn { // Path 7 if lwork >= n*n+max(max(n+m, 4*n), bdspac) { // Sufficient workspace for a fast algorithm. ir := 0 var ldworkr int if lwork >= wrkbl+lda*n { ldworkr = lda } else { ldworkr = n } itau := ir + ldworkr*n iwork := itau + n // Compute A = Q*R, copying result to U. impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) // Copy R to work[ir:], zeroing out below it. impl.Dlacpy(blas.Upper, n, n, a, lda, work[ir:], ldworkr) impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[ir+ldworkr:], ldworkr) // Generate Q in U. impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + n itaup := itauq + n iwork = itaup + n // Bidiagonalize R in work[ir:]. impl.Dgebrd(n, n, work[ir:], ldworkr, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Generate left bidiagonalizing vectors in work[ir:]. impl.Dorgbr(lapack.ApplyQ, n, n, n, work[ir:], ldworkr, work[itauq:], work[iwork:], lwork-iwork) iwork = ie + n // Perform bidiagonal QR iteration, computing left singular // vectors of R in work[ir:]. ok = impl.Dbdsqr(blas.Upper, n, 0, n, 0, s, work[ie:], dum, 1, work[ir:], ldworkr, dum, 1, work[iwork:]) // Multiply Q in U by left singular vectors of R in // work[ir:], storing result in A. bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, u, ldu, work[ir:], ldworkr, 0, a, lda) // Copy left singular vectors of A from A to U. impl.Dlacpy(blas.All, m, n, a, lda, u, ldu) } else { // Insufficient workspace for a fast algorithm. itau := 0 iwork := itau + n // Compute A = Q*R, copying result to U. impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) // Generate Q in U. impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + n itaup := itauq + n iwork = itaup + n // Zero out below R in A. impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda) // Bidiagonalize R in A. impl.Dgebrd(n, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Multiply Q in U by left bidiagonalizing vectors in A. impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, a, lda, work[itauq:], u, ldu, work[iwork:], lwork-iwork) iwork = ie + n // Perform bidiagonal QR iteration, computing left // singular vectors of A in U. ok = impl.Dbdsqr(blas.Upper, n, 0, m, 0, s, work[ie:], dum, 1, u, ldu, dum, 1, work[iwork:]) } } else if wantvo { // Path 8. panic(noSVDO) } else if wantvas { // Path 9. if lwork >= n*n+max(max(n+m, 4*n), bdspac) { // Sufficient workspace for a fast algorithm. iu := 0 var ldworku int if lwork >= wrkbl+lda*n { ldworku = lda } else { ldworku = n } itau := iu + ldworku*n iwork := itau + n // Compute A = Q * R, copying result to U. impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) // Generate Q in U. impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork) // Copy R to work[iu:], zeroing out below it. impl.Dlacpy(blas.Upper, n, n, a, lda, work[iu:], ldworku) impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[iu+ldworku:], ldworku) ie = itau itauq := ie + n itaup := itauq + n iwork = itaup + n // Bidiagonalize R in work[iu:], copying result to VT. impl.Dgebrd(n, n, work[iu:], ldworku, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Upper, n, n, work[iu:], ldworku, vt, ldvt) // Generate left bidiagonalizing vectors in work[iu:]. impl.Dorgbr(lapack.ApplyQ, n, n, n, work[iu:], ldworku, work[itauq:], work[iwork:], lwork-iwork) // Generate right bidiagonalizing vectors in VT. impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork) iwork = ie + n // Perform bidiagonal QR iteration, computing left singular // vectors of R in work[iu:] and computing right // singular vectors of R in VT. ok = impl.Dbdsqr(blas.Upper, n, n, n, 0, s, work[ie:], vt, ldvt, work[iu:], ldworku, dum, 1, work[iwork:]) // Multiply Q in U by left singular vectors of R in // work[iu:], storing result in A. bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, u, ldu, work[iu:], ldworku, 0, a, lda) // Copy left singular vectors of A from A to U. impl.Dlacpy(blas.All, m, n, a, lda, u, ldu) /* // Bidiagonalize R in VT. impl.Dgebrd(n, n, vt, ldvt, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Multiply Q in U by left bidiagonalizing vectors in VT. impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork) // Generate right bidiagonalizing vectors in VT. impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork) iwork = ie + n // Perform bidiagonal QR iteration, computing left singular // vectors of A in U and computing right singular vectors // of A in VT. ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:]) */ } else { // Insufficient workspace for a fast algorithm. itau := 0 iwork := itau + n // Compute A = Q*R, copying result to U. impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) // Generate Q in U. impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork) // Copy R from A to VT, zeroing out below it. impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt) impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt) ie := itau itauq := ie + n itaup := itauq + n iwork = itaup + n // Bidiagonalize R in VT. impl.Dgebrd(n, n, vt, ldvt, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Multiply Q in U by left bidiagonalizing vectors in VT. impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork) // Generate right bidiagonizing vectors in VT. impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork) iwork = ie + n // Perform bidiagonal QR iteration, computing left singular // vectors of A in U and computing right singular vectors // of A in VT. impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:]) } } } } else { // Path 10. // M at least N, but not much larger. ie = 0 itauq := ie + n itaup := itauq + n iwork := itaup + n // Bidiagonalize A. impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) if wantuas { // Left singular vectors are desired in U. Copy result to U and // generate left biadiagonalizing vectors in U. impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) var ncu int if wantus { ncu = n } if wantua { ncu = m } impl.Dorgbr(lapack.ApplyQ, m, ncu, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork) } if wantvas { // Right singular vectors are desired in VT. Copy result to VT and // generate left biadiagonalizing vectors in VT. impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt) impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork) } if wantuo { panic(noSVDO) } if wantvo { panic(noSVDO) } iwork = ie + n var nru, ncvt int if wantuas || wantuo { nru = m } if wantun { nru = 0 } if wantvas || wantvo { ncvt = n } if wantvn { ncvt = 0 } if !wantuo && !wantvo { // Perform bidiagonal QR iteration, if desired, computing left // singular vectors in U and right singular vectors in VT. ok = impl.Dbdsqr(blas.Upper, n, ncvt, nru, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:]) } else { // There will be two branches when the implementation is complete. panic(noSVDO) } } } else { // A has more columns than rows. If A has sufficiently more columns than // rows, first reduce using the LQ decomposition. if n >= mnthr { // n >> m. if wantvn { // Path 1t. itau := 0 iwork := itau + m // Compute A = L*Q. impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) // Zero out above L. impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda) ie := 0 itauq := ie + m itaup := itauq + m iwork = itaup + m // Bidiagonalize L in A. impl.Dgebrd(m, m, a, lda, s, work[ie:itauq], work[itauq:itaup], work[itaup:iwork], work[iwork:], lwork-iwork) if wantuo || wantuas { impl.Dorgbr(lapack.ApplyQ, m, m, m, a, lda, work[itauq:], work[iwork:], lwork-iwork) } iwork = ie + m nru := 0 if wantuo || wantuas { nru = m } // Perform bidiagonal QR iteration, computing left singular vectors // of A in A if desired. ok = impl.Dbdsqr(blas.Upper, m, 0, nru, 0, s, work[ie:], dum, 1, a, lda, dum, 1, work[iwork:]) // If left singular vectors desired in U, copy them there. if wantuas { impl.Dlacpy(blas.All, m, m, a, lda, u, ldu) } } else if wantvo && wantun { // Path 2t. panic(noSVDO) } else if wantvo && wantuas { // Path 3t. panic(noSVDO) } else if wantvs { if wantun { // Path 4t. if lwork >= m*m+max(4*m, bdspac) { // Sufficient workspace for a fast algorithm. ir := 0 var ldworkr int if lwork >= wrkbl+lda*m { ldworkr = lda } else { ldworkr = m } itau := ir + ldworkr*m iwork := itau + m // Compute A = L*Q. impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) // Copy L to work[ir:], zeroing out above it. impl.Dlacpy(blas.Lower, m, m, a, lda, work[ir:], ldworkr) impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[ir+1:], ldworkr) // Generate Q in A. impl.Dorglq(m, n, m, a, lda, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + m itaup := itauq + m iwork = itaup + m // Bidiagonalize L in work[ir:]. impl.Dgebrd(m, m, work[ir:], ldworkr, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Generate right vectors bidiagonalizing L in work[ir:]. impl.Dorgbr(lapack.ApplyP, m, m, m, work[ir:], ldworkr, work[itaup:], work[iwork:], lwork-iwork) iwork = ie + m // Perform bidiagonal QR iteration, computing right singular // vectors of L in work[ir:]. ok = impl.Dbdsqr(blas.Upper, m, m, 0, 0, s, work[ie:], work[ir:], ldworkr, dum, 1, dum, 1, work[iwork:]) // Multiply right singular vectors of L in work[ir:] by // Q in A, storing result in VT. bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1, work[ir:], ldworkr, a, lda, 0, vt, ldvt) } else { // Insufficient workspace for a fast algorithm. itau := 0 iwork := itau + m // Compute A = L*Q. impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) // Copy result to VT. impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) // Generate Q in VT. impl.Dorglq(m, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + m itaup := itauq + m iwork = itaup + m // Zero out above L in A. impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda) // Bidiagonalize L in A. impl.Dgebrd(m, m, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Multiply right vectors bidiagonalizing L by Q in VT. impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m, a, lda, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork) iwork = ie + m // Perform bidiagonal QR iteration, computing right // singular vectors of A in VT. ok = impl.Dbdsqr(blas.Upper, m, n, 0, 0, s, work[ie:], vt, ldvt, dum, 1, dum, 1, work[iwork:]) } } else if wantuo { // Path 5t. panic(noSVDO) } else if wantuas { // Path 6t. if lwork >= m*m+max(4*m, bdspac) { // Sufficient workspace for a fast algorithm. iu := 0 var ldworku int if lwork >= wrkbl+lda*m { ldworku = lda } else { ldworku = m } itau := iu + ldworku*m iwork := itau + m // Compute A = L*Q. impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) // Copy L to work[iu:], zeroing out above it. impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku) impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+1:], ldworku) // Generate Q in A. impl.Dorglq(m, n, m, a, lda, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + m itaup := itauq + m iwork = itaup + m // Bidiagonalize L in work[iu:], copying result to U. impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu) // Generate right bidiagionalizing vectors in work[iu:]. impl.Dorgbr(lapack.ApplyP, m, m, m, work[iu:], ldworku, work[itaup:], work[iwork:], lwork-iwork) // Generate left bidiagonalizing vectors in U. impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork) iwork = ie + m // Perform bidiagonal QR iteration, computing left singular // vectors of L in U and computing right singular vectors of // L in work[iu:]. ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:], work[iu:], ldworku, u, ldu, dum, 1, work[iwork:]) // Multiply right singular vectors of L in work[iu:] by // Q in A, storing result in VT. bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1, work[iu:], ldworku, a, lda, 0, vt, ldvt) } else { // Insufficient workspace for a fast algorithm. itau := 0 iwork := itau + m // Compute A = L*Q, copying result to VT. impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) // Generate Q in VT. impl.Dorglq(m, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) // Copy L to U, zeroing out above it. impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu) impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu) ie := itau itauq := ie + m itaup := itauq + m iwork = itaup + m // Bidiagonalize L in U. impl.Dgebrd(m, m, u, ldu, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Multiply right bidiagonalizing vectors in U by Q in VT. impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m, u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork) // Generate left bidiagonalizing vectors in U. impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork) iwork = ie + m // Perform bidiagonal QR iteration, computing left singular // vectors of A in U and computing right singular vectors // of A in VT. impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:]) } } } else if wantva { if wantun { // Path 7t. if lwork >= m*m+max(max(n+m, 4*m), bdspac) { // Sufficient workspace for a fast algorithm. ir := 0 var ldworkr int if lwork >= wrkbl+lda*m { ldworkr = lda } else { ldworkr = m } itau := ir + ldworkr*m iwork := itau + m // Compute A = L*Q, copying result to VT. impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) // Copy L to work[ir:], zeroing out above it. impl.Dlacpy(blas.Lower, m, m, a, lda, work[ir:], ldworkr) impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[ir+1:], ldworkr) // Generate Q in VT. impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + m itaup := itauq + m iwork = itaup + m // Bidiagonalize L in work[ir:]. impl.Dgebrd(m, m, work[ir:], ldworkr, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Generate right bidiagonalizing vectors in work[ir:]. impl.Dorgbr(lapack.ApplyP, m, m, m, work[ir:], ldworkr, work[itaup:], work[iwork:], lwork-iwork) iwork = ie + m // Perform bidiagonal QR iteration, computing right // singular vectors of L in work[ir:]. ok = impl.Dbdsqr(blas.Upper, m, m, 0, 0, s, work[ie:], work[ir:], ldworkr, dum, 1, dum, 1, work[iwork:]) // Multiply right singular vectors of L in work[ir:] by // Q in VT, storing result in A. bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1, work[ir:], ldworkr, vt, ldvt, 0, a, lda) // Copy right singular vectors of A from A to VT. impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt) } else { // Insufficient workspace for a fast algorithm. itau := 0 iwork := itau + m // Compute A = L * Q, copying result to VT. impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) // Generate Q in VT. impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) ie := itau itauq := ie + m itaup := itauq + m iwork = itaup + m // Zero out above L in A. impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda) // Bidiagonalize L in A. impl.Dgebrd(m, m, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Multiply right bidiagonalizing vectors in A by Q in VT. impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m, a, lda, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork) iwork = ie + m // Perform bidiagonal QR iteration, computing right singular // vectors of A in VT. ok = impl.Dbdsqr(blas.Upper, m, n, 0, 0, s, work[ie:], vt, ldvt, dum, 1, dum, 1, work[iwork:]) } } else if wantuo { panic(noSVDO) } else if wantuas { // Path 9t. if lwork >= m*m+max(max(m+n, 4*m), bdspac) { // Sufficient workspace for a fast algorithm. iu := 0 var ldworku int if lwork >= wrkbl+lda*m { ldworku = lda } else { ldworku = m } itau := iu + ldworku*m iwork := itau + m // Generate A = L * Q copying result to VT. impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) // Generate Q in VT. impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) // Copy L to work[iu:], zeroing out above it. impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku) impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+1:], ldworku) ie = itau itauq := ie + m itaup := itauq + m iwork = itaup + m // Bidiagonalize L in work[iu:], copying result to U. impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu) // Generate right bidiagonalizing vectors in work[iu:]. impl.Dorgbr(lapack.ApplyP, m, m, m, work[iu:], ldworku, work[itaup:], work[iwork:], lwork-iwork) // Generate left bidiagonalizing vectors in U. impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork) iwork = ie + m // Perform bidiagonal QR iteration, computing left singular // vectors of L in U and computing right singular vectors // of L in work[iu:]. ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:], work[iu:], ldworku, u, ldu, dum, 1, work[iwork:]) // Multiply right singular vectors of L in work[iu:] // Q in VT, storing result in A. bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1, work[iu:], ldworku, vt, ldvt, 0, a, lda) // Copy right singular vectors of A from A to VT. impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt) } else { // Insufficient workspace for a fast algorithm. itau := 0 iwork := itau + m // Compute A = L * Q, copying result to VT. impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) // Generate Q in VT. impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) // Copy L to U, zeroing out above it. impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu) impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu) ie = itau itauq := ie + m itaup := itauq + m iwork = itaup + m // Bidiagonalize L in U. impl.Dgebrd(m, m, u, ldu, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) // Multiply right bidiagonalizing vectors in U by Q in VT. impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m, u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork) // Generate left bidiagonalizing vectors in U. impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork) iwork = ie + m // Perform bidiagonal QR iteration, computing left singular // vectors of A in U and computing right singular vectors // of A in VT. ok = impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:]) } } } } else { // Path 10t. // N at least M, but not much larger. ie = 0 itauq := ie + m itaup := itauq + m iwork := itaup + m // Bidiagonalize A. impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork) if wantuas { // If left singular vectors desired in U, copy result to U and // generate left bidiagonalizing vectors in U. impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu) impl.Dorgbr(lapack.ApplyQ, m, m, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork) } if wantvas { // If right singular vectors desired in VT, copy result to VT // and generate right bidiagonalizing vectors in VT. impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) var nrvt int if wantva { nrvt = n } else { nrvt = m } impl.Dorgbr(lapack.ApplyP, nrvt, n, m, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork) } if wantuo { panic(noSVDO) } if wantvo { panic(noSVDO) } iwork = ie + m var nru, ncvt int if wantuas || wantuo { nru = m } if wantvas || wantvo { ncvt = n } if !wantuo && !wantvo { // Perform bidiagonal QR iteration, if desired, computing left // singular vectors in U and computing right singular vectors in // VT. ok = impl.Dbdsqr(blas.Lower, m, ncvt, nru, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:]) } else { // There will be two branches when the implementation is complete. panic(noSVDO) } } } if !ok { if ie > 1 { for i := 0; i < minmn-1; i++ { work[i+1] = work[i+ie] } } if ie < 1 { for i := minmn - 2; i >= 0; i-- { work[i+1] = work[i+ie] } } } // Undo scaling if necessary. if iscl { if anrm > bignum { impl.Dlascl(lapack.General, 0, 0, bignum, anrm, minmn, 1, s, minmn) } if !ok && anrm > bignum { impl.Dlascl(lapack.General, 0, 0, bignum, anrm, minmn-1, 1, work[minmn:], minmn) } if anrm < smlnum { impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, minmn, 1, s, minmn) } if !ok && anrm < smlnum { impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, minmn-1, 1, work[minmn:], minmn) } } work[0] = float64(maxwrk) return ok }
// Dgebal balances an n×n matrix A. Balancing consists of two stages, permuting // and scaling. Both steps are optional and depend on the value of job. // // Permuting consists of applying a permutation matrix P such that the matrix // that results from P^T*A*P takes the upper block triangular form // [ T1 X Y ] // P^T A P = [ 0 B Z ], // [ 0 0 T2 ] // where T1 and T2 are upper triangular matrices and B contains at least one // nonzero off-diagonal element in each row and column. The indices ilo and ihi // mark the starting and ending columns of the submatrix B. The eigenvalues of A // isolated in the first 0 to ilo-1 and last ihi+1 to n-1 elements on the // diagonal can be read off without any roundoff error. // // Scaling consists of applying a diagonal similarity transformation D such that // D^{-1}*B*D has the 1-norm of each row and its corresponding column nearly // equal. The output matrix is // [ T1 X*D Y ] // [ 0 inv(D)*B*D inv(D)*Z ]. // [ 0 0 T2 ] // Scaling may reduce the 1-norm of the matrix, and improve the accuracy of // the computed eigenvalues and/or eigenvectors. // // job specifies the operations that will be performed on A. // If job is lapack.None, Dgebal sets scale[i] = 1 for all i and returns ilo=0, ihi=n-1. // If job is lapack.Permute, only permuting will be done. // If job is lapack.Scale, only scaling will be done. // If job is lapack.PermuteScale, both permuting and scaling will be done. // // On return, if job is lapack.Permute or lapack.PermuteScale, it will hold that // A[i,j] == 0, for i > j and j ∈ {0, ..., ilo-1, ihi+1, ..., n-1}. // If job is lapack.None or lapack.Scale, or if n == 0, it will hold that // ilo == 0 and ihi == n-1. // // On return, scale will contain information about the permutations and scaling // factors applied to A. If π(j) denotes the index of the column interchanged // with column j, and D[j,j] denotes the scaling factor applied to column j, // then // scale[j] == π(j), for j ∈ {0, ..., ilo-1, ihi+1, ..., n-1}, // == D[j,j], for j ∈ {ilo, ..., ihi}. // scale must have length equal to n, otherwise Dgebal will panic. // // Dgebal is an internal routine. It is exported for testing purposes. func (impl Implementation) Dgebal(job lapack.Job, n int, a []float64, lda int, scale []float64) (ilo, ihi int) { switch job { default: panic(badJob) case lapack.None, lapack.Permute, lapack.Scale, lapack.PermuteScale: } checkMatrix(n, n, a, lda) if len(scale) != n { panic("lapack: bad length of scale") } ilo = 0 ihi = n - 1 if n == 0 || job == lapack.None { for i := range scale { scale[i] = 1 } return ilo, ihi } bi := blas64.Implementation() swapped := true if job == lapack.Scale { goto scaling } // Permutation to isolate eigenvalues if possible. // // Search for rows isolating an eigenvalue and push them down. for swapped { swapped = false rows: for i := ihi; i >= 0; i-- { for j := 0; j <= ihi; j++ { if i == j { continue } if a[i*lda+j] != 0 { continue rows } } // Row i has only zero off-diagonal elements in the // block A[ilo:ihi+1,ilo:ihi+1]. scale[ihi] = float64(i) if i != ihi { bi.Dswap(ihi+1, a[i:], lda, a[ihi:], lda) bi.Dswap(n, a[i*lda:], 1, a[ihi*lda:], 1) } if ihi == 0 { scale[0] = 1 return ilo, ihi } ihi-- swapped = true break } } // Search for columns isolating an eigenvalue and push them left. swapped = true for swapped { swapped = false columns: for j := ilo; j <= ihi; j++ { for i := ilo; i <= ihi; i++ { if i == j { continue } if a[i*lda+j] != 0 { continue columns } } // Column j has only zero off-diagonal elements in the // block A[ilo:ihi+1,ilo:ihi+1]. scale[ilo] = float64(j) if j != ilo { bi.Dswap(ihi+1, a[j:], lda, a[ilo:], lda) bi.Dswap(n-ilo, a[j*lda+ilo:], 1, a[ilo*lda+ilo:], 1) } swapped = true ilo++ break } } scaling: for i := ilo; i <= ihi; i++ { scale[i] = 1 } if job == lapack.Permute { return ilo, ihi } // Balance the submatrix in rows ilo to ihi. const ( // sclfac should be a power of 2 to avoid roundoff errors. // Elements of scale are restricted to powers of sclfac, // therefore the matrix will be only nearly balanced. sclfac = 2 // factor determines the minimum reduction of the row and column // norms that is considered non-negligible. It must be less than 1. factor = 0.95 ) sfmin1 := dlamchS / dlamchP sfmax1 := 1 / sfmin1 sfmin2 := sfmin1 * sclfac sfmax2 := 1 / sfmin2 // Iterative loop for norm reduction. var conv bool for !conv { conv = true for i := ilo; i <= ihi; i++ { c := bi.Dnrm2(ihi-ilo+1, a[ilo*lda+i:], lda) r := bi.Dnrm2(ihi-ilo+1, a[i*lda+ilo:], 1) ica := bi.Idamax(ihi+1, a[i:], lda) ca := math.Abs(a[ica*lda+i]) ira := bi.Idamax(n-ilo, a[i*lda+ilo:], 1) ra := math.Abs(a[i*lda+ilo+ira]) // Guard against zero c or r due to underflow. if c == 0 || r == 0 { continue } g := r / sclfac f := 1.0 s := c + r for c < g && math.Max(f, math.Max(c, ca)) < sfmax2 && math.Min(r, math.Min(g, ra)) > sfmin2 { if math.IsNaN(c + f + ca + r + g + ra) { // Panic if NaN to avoid infinite loop. panic("lapack: NaN") } f *= sclfac c *= sclfac ca *= sclfac g /= sclfac r /= sclfac ra /= sclfac } g = c / sclfac for r <= g && math.Max(r, ra) < sfmax2 && math.Min(math.Min(f, c), math.Min(g, ca)) > sfmin2 { f /= sclfac c /= sclfac ca /= sclfac g /= sclfac r *= sclfac ra *= sclfac } if c+r >= factor*s { // Reduction would be negligible. continue } if f < 1 && scale[i] < 1 && f*scale[i] <= sfmin1 { continue } if f > 1 && scale[i] > 1 && scale[i] >= sfmax1/f { continue } // Now balance. scale[i] *= f bi.Dscal(n-ilo, 1/f, a[i*lda+ilo:], 1) bi.Dscal(ihi+1, f, a[i:], lda) conv = false } } return ilo, ihi }
// Dsteqr computes the eigenvalues and optionally the eigenvectors of a symmetric // tridiagonal matrix using the implicit QL or QR method. The eigenvectors of a // full or band symmetric matrix can also be found if Dsytrd, Dsptrd, or Dsbtrd // have been used to reduce this matrix to tridiagonal form. // // d, on entry, contains the diagonal elements of the tridiagonal matrix. On exit, // d contains the eigenvalues in ascending order. d must have length n and // Dsteqr will panic otherwise. // // e, on entry, contains the off-diagonal elements of the tridiagonal matrix on // entry, and is overwritten during the call to Dsteqr. e must have length n-1 and // Dsteqr will panic otherwise. // // z, on entry, contains the n×n orthogonal matrix used in the reduction to // tridiagonal form if compz == lapack.EigDecomp. On exit, if // compz == lapack.EigBoth, z contains the orthonormal eigenvectors of the // original symmetric matrix, and if compz == lapack.EigDecomp, z contains the // orthonormal eigenvectors of the symmetric tridiagonal matrix. z is not used // if compz == lapack.EigValueOnly. // // work must have length at least max(1, 2*n-2) if the eigenvectors are computed, // and Dsteqr will panic otherwise. // // Dsteqr is an internal routine. It is exported for testing purposes. func (impl Implementation) Dsteqr(compz lapack.EigComp, n int, d, e, z []float64, ldz int, work []float64) (ok bool) { if len(d) < n { panic(badD) } if len(e) < n-1 { panic(badE) } if compz != lapack.EigValueOnly && compz != lapack.EigBoth && compz != lapack.EigDecomp { panic(badEigComp) } if compz != lapack.EigValueOnly { if len(work) < max(1, 2*n-2) { panic(badWork) } checkMatrix(n, n, z, ldz) } var icompz int if compz == lapack.EigDecomp { icompz = 1 } else if compz == lapack.EigBoth { icompz = 2 } if n == 0 { return true } if n == 1 { if icompz == 2 { z[0] = 1 } return true } bi := blas64.Implementation() eps := dlamchE eps2 := eps * eps safmin := dlamchS safmax := 1 / safmin ssfmax := math.Sqrt(safmax) / 3 ssfmin := math.Sqrt(safmin) / eps2 // Compute the eigenvalues and eigenvectors of the tridiagonal matrix. if icompz == 2 { impl.Dlaset(blas.All, n, n, 0, 1, z, ldz) } const maxit = 30 nmaxit := n * maxit jtot := 0 // Determine where the matrix splits and choose QL or QR iteration for each // block, according to whether top or bottom diagonal element is smaller. l1 := 0 nm1 := n - 1 type scaletype int const ( none scaletype = iota down up ) var iscale scaletype for { if l1 > n-1 { // Order eigenvalues and eigenvectors. if icompz == 0 { impl.Dlasrt(lapack.SortIncreasing, n, d) } else { // TODO(btracey): Consider replacing this sort with a call to sort.Sort. for ii := 1; ii < n; ii++ { i := ii - 1 k := i p := d[i] for j := ii; j < n; j++ { if d[j] < p { k = j p = d[j] } } if k != i { d[k] = d[i] d[i] = p bi.Dswap(n, z[i:], ldz, z[k:], ldz) } } } return true } if l1 > 0 { e[l1-1] = 0 } var m int if l1 <= nm1 { for m = l1; m < nm1; m++ { test := math.Abs(e[m]) if test == 0 { break } if test <= (math.Sqrt(math.Abs(d[m]))*math.Sqrt(math.Abs(d[m+1])))*eps { e[m] = 0 break } } } l := l1 lsv := l lend := m lendsv := lend l1 = m + 1 if lend == l { continue } // Scale submatrix in rows and columns L to Lend anorm := impl.Dlanst(lapack.MaxAbs, lend-l+1, d[l:], e[l:]) switch { case anorm == 0: continue case anorm > ssfmax: iscale = down // TODO(btracey): Why is lda n? impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l+1, 1, d[l:], n) impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l, 1, e[l:], n) case anorm < ssfmin: iscale = up // TODO(btracey): Why is lda n? impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l+1, 1, d[l:], n) impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l, 1, e[l:], n) } // Choose between QL and QR. if math.Abs(d[lend]) < math.Abs(d[l]) { lend = lsv l = lendsv } if lend > l { // QL Iteration. Look for small subdiagonal element. for { if l != lend { for m = l; m < lend; m++ { v := math.Abs(e[m]) if v*v <= (eps2*math.Abs(d[m]))*math.Abs(d[m+1])+safmin { break } } } else { m = lend } if m < lend { e[m] = 0 } p := d[l] if m == l { // Eigenvalue found. l++ if l > lend { break } continue } // If remaining matrix is 2×2, use Dlae2 to compute its eigensystem. if m == l+1 { if icompz > 0 { d[l], d[l+1], work[l], work[n-1+l] = impl.Dlaev2(d[l], e[l], d[l+1]) impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, n, 2, work[l:], work[n-1+l:], z[l:], ldz) } else { d[l], d[l+1] = impl.Dlae2(d[l], e[l], d[l+1]) } e[l] = 0 l += 2 if l > lend { break } continue } if jtot == nmaxit { break } jtot++ // Form shift g := (d[l+1] - p) / (2 * e[l]) r := impl.Dlapy2(g, 1) g = d[m] - p + e[l]/(g+math.Copysign(r, g)) s := 1.0 c := 1.0 p = 0.0 // Inner loop for i := m - 1; i >= l; i-- { f := s * e[i] b := c * e[i] c, s, r = impl.Dlartg(g, f) if i != m-1 { e[i+1] = r } g = d[i+1] - p r = (d[i]-g)*s + 2*c*b p = s * r d[i+1] = g + p g = c*r - b // If eigenvectors are desired, then save rotations. if icompz > 0 { work[i] = c work[n-1+i] = -s } } // If eigenvectors are desired, then apply saved rotations. if icompz > 0 { mm := m - l + 1 impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, n, mm, work[l:], work[n-1+l:], z[l:], ldz) } d[l] -= p e[l] = g } } else { // QR Iteration. // Look for small superdiagonal element. for { if l != lend { for m = l; m > lend; m-- { v := math.Abs(e[m-1]) if v*v <= (eps2*math.Abs(d[m])*math.Abs(d[m-1]) + safmin) { break } } } else { m = lend } if m > lend { e[m-1] = 0 } p := d[l] if m == l { // Eigenvalue found l-- if l < lend { break } continue } // If remaining matrix is 2×2, use Dlae2 to compute its eigenvalues. if m == l-1 { if icompz > 0 { d[l-1], d[l], work[m], work[n-1+m] = impl.Dlaev2(d[l-1], e[l-1], d[l]) impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, n, 2, work[m:], work[n-1+m:], z[l-1:], ldz) } else { d[l-1], d[l] = impl.Dlae2(d[l-1], e[l-1], d[l]) } e[l-1] = 0 l -= 2 if l < lend { break } continue } if jtot == nmaxit { break } jtot++ // Form shift. g := (d[l-1] - p) / (2 * e[l-1]) r := impl.Dlapy2(g, 1) g = d[m] - p + (e[l-1])/(g+math.Copysign(r, g)) s := 1.0 c := 1.0 p = 0.0 // Inner loop. for i := m; i < l; i++ { f := s * e[i] b := c * e[i] c, s, r = impl.Dlartg(g, f) if i != m { e[i-1] = r } g = d[i] - p r = (d[i+1]-g)*s + 2*c*b p = s * r d[i] = g + p g = c*r - b // If eigenvectors are desired, then save rotations. if icompz > 0 { work[i] = c work[n-1+i] = s } } // If eigenvectors are desired, then apply saved rotations. if icompz > 0 { mm := l - m + 1 impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, n, mm, work[m:], work[n-1+m:], z[m:], ldz) } d[l] -= p e[l-1] = g } } // Undo scaling if necessary. switch iscale { case down: impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv+1, 1, d[lsv:], n) impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv, 1, e[lsv:], n) case up: impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv+1, 1, d[lsv:], n) impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv, 1, e[lsv:], n) } // Check for no convergence to an eigenvalue after a total of n*maxit iterations. if jtot >= nmaxit { break } } for i := 0; i < n-1; i++ { if e[i] != 0 { return false } } return true }
// Dsytd2 reduces a symmetric n×n matrix A to symmetric tridiagonal form T by an // orthogonal similarity transformation // Q^T * A * Q = T // On entry, the matrix is contained in the specified triangle of a. On exit, // if uplo == blas.Upper, the diagonal and first super-diagonal of a are // overwritten with the elements of T. The elements above the first super-diagonal // are overwritten with the the elementary reflectors that are used with the // elements written to tau in order to construct Q. If uplo == blas.Lower, the // elements are written in the lower triangular region. // // d must have length at least n. e and tau must have length at least n-1. Dsytd2 // will panic if these sizes are not met. // // Q is represented as a product of elementary reflectors. // If uplo == blas.Upper // Q = H_{n-2} * ... * H_1 * H_0 // and if uplo == blas.Lower // Q = H_0 * H_1 * ... * H_{n-2} // where // H_i = I - tau * v * v^T // where tau is stored in tau[i], and v is stored in a. // // If uplo == blas.Upper, v[0:i-1] is stored in A[0:i-1,i+1], v[i] = 1, and // v[i+1:] = 0. The elements of a are // [ d e v2 v3 v4] // [ d e v3 v4] // [ d e v4] // [ d e] // [ d] // If uplo == blas.Lower, v[0:i+1] = 0, v[i+1] = 1, and v[i+2:] is stored in // A[i+2:n,i]. // The elements of a are // [ d ] // [ e d ] // [v1 e d ] // [v1 v2 e d ] // [v1 v2 v3 e d] // // Dsytd2 is an internal routine. It is exported for testing purposes. func (impl Implementation) Dsytd2(uplo blas.Uplo, n int, a []float64, lda int, d, e, tau []float64) { checkMatrix(n, n, a, lda) if len(d) < n { panic(badD) } if len(e) < n-1 { panic(badE) } if len(tau) < n-1 { panic(badTau) } if n <= 0 { return } bi := blas64.Implementation() if uplo == blas.Upper { // Reduce the upper triangle of A. for i := n - 2; i >= 0; i-- { // Generate elementary reflector H_i = I - tau * v * v^T to // annihilate A[i:i-1, i+1]. var taui float64 a[i*lda+i+1], taui = impl.Dlarfg(i+1, a[i*lda+i+1], a[i+1:], lda) e[i] = a[i*lda+i+1] if taui != 0 { // Apply H_i from both sides to A[0:i,0:i]. a[i*lda+i+1] = 1 // Compute x := tau * A * v storing x in tau[0:i]. bi.Dsymv(uplo, i+1, taui, a, lda, a[i+1:], lda, 0, tau, 1) // Compute w := x - 1/2 * tau * (x^T * v) * v. alpha := -0.5 * taui * bi.Ddot(i+1, tau, 1, a[i+1:], lda) bi.Daxpy(i+1, alpha, a[i+1:], lda, tau, 1) // Apply the transformation as a rank-2 update // A = A - v * w^T - w * v^T. bi.Dsyr2(uplo, i+1, -1, a[i+1:], lda, tau, 1, a, lda) a[i*lda+i+1] = e[i] } d[i+1] = a[(i+1)*lda+i+1] tau[i] = taui } d[0] = a[0] return } // Reduce the lower triangle of A. for i := 0; i < n-1; i++ { // Generate elementary reflector H_i = I - tau * v * v^T to // annihilate A[i+2:n, i]. var taui float64 a[(i+1)*lda+i], taui = impl.Dlarfg(n-i-1, a[(i+1)*lda+i], a[min(i+2, n-1)*lda+i:], lda) e[i] = a[(i+1)*lda+i] if taui != 0 { // Apply H_i from both sides to A[i+1:n, i+1:n]. a[(i+1)*lda+i] = 1 // Compute x := tau * A * v, storing y in tau[i:n-1]. bi.Dsymv(uplo, n-i-1, taui, a[(i+1)*lda+i+1:], lda, a[(i+1)*lda+i:], lda, 0, tau[i:], 1) // Compute w := x - 1/2 * tau * (x^T * v) * v. alpha := -0.5 * taui * bi.Ddot(n-i-1, tau[i:], 1, a[(i+1)*lda+i:], lda) bi.Daxpy(n-i-1, alpha, a[(i+1)*lda+i:], lda, tau[i:], 1) // Apply the transformation as a rank-2 update // A = A - v * w^T - w * v^T. bi.Dsyr2(uplo, n-i-1, -1, a[(i+1)*lda+i:], lda, tau[i:], 1, a[(i+1)*lda+i+1:], lda) a[(i+1)*lda+i] = e[i] } d[i] = a[i*lda+i] tau[i] = taui } d[n-1] = a[(n-1)*lda+n-1] }
// Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated // block of a Hessenberg matrix. // // wantt and wantz determine whether the quasi-triangular Schur factor and the // orthogonal Schur factor, respectively, will be computed. // // kacc22 specifies the computation mode of far-from-diagonal orthogonal // updates. Permitted values are: // 0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix // multiply to update far-from-diagonal matrix entries. // 1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to // update far-from-diagonal matrix entries. // 2: Dlaqr5 will accumulate reflections, use matrix-matrix multiply to update // far-from-diagonal matrix entries, and take advantage of 2×2 block // structure during matrix multiplies. // For other values of kacc2 Dlaqr5 will panic. // // n is the order of the Hessenberg matrix H. // // ktop and kbot are indices of the first and last row and column of an isolated // diagonal block upon which the QR sweep will be applied. It must hold that // ktop == 0, or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and // kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0, // otherwise Dlaqr5 will panic. // // nshfts is the number of simultaneous shifts. It must be positive and even, // otherwise Dlaqr5 will panic. // // sr and si contain the real and imaginary parts, respectively, of the shifts // of origin that define the multi-shift QR sweep. On return both slices may be // reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5 // will panic. // // h and ldh represent the Hessenberg matrix H of size n×n. On return // multi-shift QR sweep with shifts sr+i*si has been applied to the isolated // diagonal block in rows and columns ktop through kbot, inclusive. // // iloz and ihiz specify the rows of Z to which transformations will be applied // if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5 // will panic. // // z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep // orthogonal similarity transformation is accumulated into // z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced. // // v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V // is transposed with respect to the reference netlib implementation. // // u and ldu represent an auxiliary matrix of size (3*nshfts-3)×(3*nshfts-3). // // wh and ldwh represent an auxiliary matrix of size (3*nshfts-3)×nh. // // wv and ldwv represent an auxiliary matrix of size nv×(3*nshfts-3). // // Dlaqr5 is an internal routine. It is exported for testing purposes. func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) { checkMatrix(n, n, h, ldh) if ktop < 0 || n <= ktop { panic("lapack: invalid value of ktop") } if ktop > 0 && h[ktop*ldh+ktop-1] != 0 { panic("lapack: diagonal block is not isolated") } if kbot < 0 || n <= kbot { panic("lapack: invalid value of kbot") } if kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0 { panic("lapack: diagonal block is not isolated") } if nshfts < 0 || nshfts&0x1 != 0 { panic("lapack: invalid number of shifts") } if len(sr) != nshfts || len(si) != nshfts { panic(badSlice) // TODO(vladimir-ch) Another message? } if wantz { if ihiz >= n { panic("lapack: invalid value of ihiz") } if iloz < 0 || ihiz < iloz { panic("lapack: invalid value of iloz") } checkMatrix(n, n, z, ldz) } checkMatrix(nshfts/2, 3, v, ldv) // Transposed w.r.t. lapack. checkMatrix(3*nshfts-3, 3*nshfts-3, u, ldu) checkMatrix(nv, 3*nshfts-3, wv, ldwv) checkMatrix(3*nshfts-3, nh, wh, ldwh) if kacc22 != 0 && kacc22 != 1 && kacc22 != 2 { panic("lapack: invalid value of kacc22") } // If there are no shifts, then there is nothing to do. if nshfts < 2 { return } // If the active block is empty or 1×1, then there is nothing to do. if ktop >= kbot { return } // Shuffle shifts into pairs of real shifts and pairs of complex // conjugate shifts assuming complex conjugate shifts are already // adjacent to one another. for i := 0; i < nshfts-2; i += 2 { if si[i] == -si[i+1] { continue } sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i] si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i] } // Note: lapack says that nshfts must be even but allows it to be odd // anyway. At the moment, we panic above if nshfts is not even, so // reducing it by one is unnecessary (it seems that Dlaqr0 and Dlaqr4 // indeed use only even nshfts). // // The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341: // * ==== NSHFTS is supposed to be even, but if it is odd, // * . then simply reduce it by one. The shuffle above // * . ensures that the dropped shift is real and that // * . the remaining shifts are paired. ==== // * // NS = NSHFTS - MOD( NSHFTS, 2 ) ns := nshfts safmin := dlamchS ulp := dlamchP smlnum := safmin * float64(n) / ulp // Use accumulated reflections to update far-from-diagonal entries? accum := kacc22 == 1 || kacc22 == 2 // If so, exploit the 2×2 block structure? blk22 := ns > 2 && kacc22 == 2 // Clear trash. if ktop+2 <= kbot { h[(ktop+2)*ldh+ktop] = 0 } // nbmps = number of 2-shift bulges in the chain. nbmps := ns / 2 // kdu = width of slab. kdu := 6*nbmps - 3 // Create and chase chains of nbmps bulges. for incol := 3*(1-nbmps) + ktop - 1; incol <= kbot-2; incol += 3*nbmps - 2 { ndcol := incol + kdu if accum { impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu) } // Near-the-diagonal bulge chase. The following loop performs // the near-the-diagonal part of a small bulge multi-shift QR // sweep. Each 6*nbmps-2 column diagonal chunk extends from // column incol to column ndcol (including both column incol and // column ndcol). The following loop chases a 3*nbmps column // long chain of nbmps bulges 3*nbmps-2 columns to the right. // (incol may be less than ktop and ndcol may be greater than // kbot indicating phantom columns from which to chase bulges // before they are actually introduced or to which to chase // bulges beyond column kbot.) for krcol := incol; krcol <= min(incol+3*nbmps-3, kbot-2); krcol++ { // Bulges number mtop to mbot are active double implicit // shift bulges. There may or may not also be small 2×2 // bulge, if there is room. The inactive bulges (if any) // must wait until the active bulges have moved down the // diagonal to make room. The phantom matrix paradigm // described above helps keep track. mtop := max(0, ((ktop-1)-krcol+2)/3) mbot := min(nbmps, (kbot-krcol)/3) - 1 m22 := mbot + 1 bmp22 := (mbot < nbmps-1) && (krcol+3*m22 == kbot-2) // Generate reflections to chase the chain right one // column. (The minimum value of k is ktop-1.) for m := mtop; m <= mbot; m++ { k := krcol + 3*m if k == ktop-1 { impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh, sr[2*m], si[2*m], sr[2*m+1], si[2*m+1], v[m*ldv:m*ldv+3]) alpha := v[m*ldv] _, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1) continue } beta := h[(k+1)*ldh+k] v[m*ldv+1] = h[(k+2)*ldh+k] v[m*ldv+2] = h[(k+3)*ldh+k] beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1) // A bulge may collapse because of vigilant deflation or // destructive underflow. In the underflow case, try the // two-small-subdiagonals trick to try to reinflate the // bulge. if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 { // Typical case: not collapsed (yet). h[(k+1)*ldh+k] = beta h[(k+2)*ldh+k] = 0 h[(k+3)*ldh+k] = 0 continue } // Atypical case: collapsed. Attempt to reintroduce // ignoring H[k+1,k] and H[k+2,k]. If the fill // resulting from the new reflector is too large, // then abandon it. Otherwise, use the new one. var vt [3]float64 impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, sr[2*m], si[2*m], sr[2*m+1], si[2*m+1], vt[:]) alpha := vt[0] _, vt[0] = impl.Dlarfg(3, alpha, vt[1:3], 1) refsum := vt[0] * (h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k]) dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2]) if math.Abs(h[(k+2)*ldh+k]-refsum*vt[1])+math.Abs(refsum*vt[2]) > ulp*dsum { // Starting a new bulge here would create // non-negligible fill. Use the old one with // trepidation. h[(k+1)*ldh+k] = beta h[(k+2)*ldh+k] = 0 h[(k+3)*ldh+k] = 0 continue } else { // Starting a new bulge here would create // only negligible fill. Replace the old // reflector with the new one. h[(k+1)*ldh+k] -= refsum h[(k+2)*ldh+k] = 0 h[(k+3)*ldh+k] = 0 v[m*ldv] = vt[0] v[m*ldv+1] = vt[1] v[m*ldv+2] = vt[2] } } // Generate a 2×2 reflection, if needed. if bmp22 { k := krcol + 3*m22 if k == ktop-1 { impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh, sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1], v[m22*ldv:m22*ldv+2]) beta := v[m22*ldv] _, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) } else { beta := h[(k+1)*ldh+k] v[m22*ldv+1] = h[(k+2)*ldh+k] beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) h[(k+1)*ldh+k] = beta h[(k+2)*ldh+k] = 0 } } // Multiply H by reflections from the left. var jbot int switch { case accum: jbot = min(ndcol, kbot) case wantt: jbot = n - 1 default: jbot = kbot } for j := max(ktop, krcol); j <= jbot; j++ { mend := min(mbot+1, (j-krcol+2)/3) - 1 for m := mtop; m <= mend; m++ { k := krcol + 3*m refsum := v[m*ldv] * (h[(k+1)*ldh+j] + v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j]) h[(k+1)*ldh+j] -= refsum h[(k+2)*ldh+j] -= refsum * v[m*ldv+1] h[(k+3)*ldh+j] -= refsum * v[m*ldv+2] } } if bmp22 { k := krcol + 3*m22 for j := max(k+1, ktop); j <= jbot; j++ { refsum := v[m22*ldv] * (h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j]) h[(k+1)*ldh+j] -= refsum h[(k+2)*ldh+j] -= refsum * v[m22*ldv+1] } } // Multiply H by reflections from the right. Delay filling in the last row // until the vigilant deflation check is complete. var jtop int switch { case accum: jtop = max(ktop, incol) case wantt: jtop = 0 default: jtop = ktop } for m := mtop; m <= mbot; m++ { if v[m*ldv] == 0 { continue } k := krcol + 3*m for j := jtop; j <= min(kbot, k+3); j++ { refsum := v[m*ldv] * (h[j*ldh+k+1] + v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3]) h[j*ldh+k+1] -= refsum h[j*ldh+k+2] -= refsum * v[m*ldv+1] h[j*ldh+k+3] -= refsum * v[m*ldv+2] } if accum { // Accumulate U. (If necessary, update Z later with with an // efficient matrix-matrix multiply.) kms := k - incol for j := max(0, ktop-incol-1); j < kdu; j++ { refsum := v[m*ldv] * (u[j*ldu+kms] + v[m*ldv+1]*u[j*ldu+kms+1] + v[m*ldv+2]*u[j*ldu+kms+2]) u[j*ldu+kms] -= refsum u[j*ldu+kms+1] -= refsum * v[m*ldv+1] u[j*ldu+kms+2] -= refsum * v[m*ldv+2] } } else if wantz { // U is not accumulated, so update Z now by multiplying by // reflections from the right. for j := iloz; j <= ihiz; j++ { refsum := v[m*ldv] * (z[j*ldz+k+1] + v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3]) z[j*ldz+k+1] -= refsum z[j*ldz+k+2] -= refsum * v[m*ldv+1] z[j*ldz+k+3] -= refsum * v[m*ldv+2] } } } // Special case: 2×2 reflection (if needed). if bmp22 && v[m22*ldv] != 0 { k := krcol + 3*m22 for j := jtop; j <= min(kbot, k+3); j++ { refsum := v[m22*ldv] * (h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2]) h[j*ldh+k+1] -= refsum h[j*ldh+k+2] -= refsum * v[m22*ldv+1] } if accum { kms := k - incol for j := max(0, ktop-incol-1); j < kdu; j++ { refsum := v[m22*ldv] * (u[j*ldu+kms] + v[m22*ldv+1]*u[j*ldu+kms+1]) u[j*ldu+kms] -= refsum u[j*ldu+kms+1] -= refsum * v[m22*ldv+1] } } else if wantz { for j := iloz; j <= ihiz; j++ { refsum := v[m22*ldv] * (z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2]) z[j*ldz+k+1] -= refsum z[j*ldz+k+2] -= refsum * v[m22*ldv+1] } } } // Vigilant deflation check. mstart := mtop if krcol+3*mstart < ktop { mstart++ } mend := mbot if bmp22 { mend++ } if krcol == kbot-2 { mend++ } for m := mstart; m <= mend; m++ { k := min(kbot-1, krcol+3*m) // The following convergence test requires that the tradition // small-compared-to-nearby-diagonals criterion and the Ahues & // Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter // improves accuracy in some examples. Falling back on an alternate // convergence criterion when tst1 or tst2 is zero (as done here) is // traditional but probably unnecessary. if h[(k+1)*ldh+k] == 0 { continue } tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) if tst1 == 0 { if k >= ktop+1 { tst1 += math.Abs(h[k*ldh+k-1]) } if k >= ktop+2 { tst1 += math.Abs(h[k*ldh+k-2]) } if k >= ktop+3 { tst1 += math.Abs(h[k*ldh+k-3]) } if k <= kbot-2 { tst1 += math.Abs(h[(k+2)*ldh+k+1]) } if k <= kbot-3 { tst1 += math.Abs(h[(k+3)*ldh+k+1]) } if k <= kbot-4 { tst1 += math.Abs(h[(k+4)*ldh+k+1]) } } if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) { h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) scl := h11 + h12 tst2 := h22 * (h11 / scl) if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) { h[(k+1)*ldh+k] = 0 } } } // Fill in the last row of each bulge. mend = min(nbmps, (kbot-krcol-1)/3) - 1 for m := mtop; m <= mend; m++ { k := krcol + 3*m refsum := v[m*ldv] * v[m*ldv+2] * h[(k+4)*ldh+k+3] h[(k+4)*ldh+k+1] = -refsum h[(k+4)*ldh+k+2] = -refsum * v[m*ldv+1] h[(k+4)*ldh+k+3] -= refsum * v[m*ldv+2] } } // Use U (if accumulated) to update far-from-diagonal entries in H. // If required, use U to update Z as well. if !accum { continue } var jtop, jbot int if wantt { jtop = 0 jbot = n - 1 } else { jtop = ktop jbot = kbot } bi := blas64.Implementation() if !blk22 || incol < ktop || kbot < ndcol || ns <= 2 { // Updates not exploiting the 2×2 block structure of U. k0 and nu keep track // of the location and size of U in the special cases of introducing bulges // and chasing bulges off the bottom. In these special cases and in case the // number of shifts is ns = 2, there is no 2×2 block structure to exploit. k0 := max(0, ktop-incol-1) nu := kdu - max(0, ndcol-kbot) - k0 // Horizontal multiply. for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh { jlen := min(nh, jbot-jcol+1) bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu, 1, u[k0*ldu+k0:], ldu, h[(incol+k0+1)*ldh+jcol:], ldh, 0, wh, ldwh) impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k0+1)*ldh+jcol:], ldh) } // Vertical multiply. for jrow := jtop; jrow <= max(ktop, incol)-1; jrow += nv { jlen := min(nv, max(ktop, incol)-jrow) bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, 1, h[jrow*ldh+incol+k0+1:], ldh, u[k0*ldu+k0:], ldu, 0, wv, ldwv) impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k0+1:], ldh) } // Z multiply (also vertical). if wantz { for jrow := iloz; jrow <= ihiz; jrow += nv { jlen := min(nv, ihiz-jrow+1) bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, 1, z[jrow*ldz+incol+k0+1:], ldz, u[k0*ldu+k0:], ldu, 0, wv, ldwv) impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k0+1:], ldz) } } continue } // Updates exploiting U's 2×2 block structure. // i2, i4, j2, j4 are the last rows and columns of the blocks. i2 := (kdu + 1) / 2 i4 := kdu j2 := i4 - i2 j4 := kdu // kzs and knz deal with the band of zeros along the diagonal of one of the // triangular blocks. kzs := (j4 - j2) - (ns + 1) knz := ns + 1 // Horizontal multiply. for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh { jlen := min(nh, jbot-jcol+1) // Copy bottom of H to top+kzs of scratch (the first kzs // rows get multiplied by zero). impl.Dlacpy(blas.All, knz, jlen, h[(incol+1+j2)*ldh+jcol:], ldh, wh[kzs*ldwh:], ldwh) // Multiply by U21^T. impl.Dlaset(blas.All, kzs, jlen, 0, 0, wh, ldwh) bi.Dtrmm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, knz, jlen, 1, u[j2*ldu+kzs:], ldu, wh[kzs*ldwh:], ldwh) // Multiply top of H by U11^T. bi.Dgemm(blas.Trans, blas.NoTrans, i2, jlen, j2, 1, u, ldu, h[(incol+1)*ldh+jcol:], ldh, 1, wh, ldwh) // Copy top of H to bottom of WH. impl.Dlacpy(blas.All, j2, jlen, h[(incol+1)*ldh+jcol:], ldh, wh[i2*ldwh:], ldwh) // Multiply by U21^T. bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, j2, jlen, 1, u[i2:], ldu, wh[i2*ldwh:], ldwh) // Multiply by U22. bi.Dgemm(blas.Trans, blas.NoTrans, i4-i2, jlen, j4-j2, 1, u[j2*ldu+i2:], ldu, h[(incol+1+j2)*ldh+jcol:], ldh, 1, wh[i2*ldwh:], ldwh) // Copy it back. impl.Dlacpy(blas.All, kdu, jlen, wh, ldwh, h[(incol+1)*ldh+jcol:], ldh) } // Vertical multiply. for jrow := jtop; jrow <= max(incol, ktop)-1; jrow += nv { jlen := min(nv, max(incol, ktop)-jrow) // Copy right of H to scratch (the first kzs columns get multiplied // by zero). impl.Dlacpy(blas.All, jlen, knz, h[jrow*ldh+incol+1+j2:], ldh, wv[kzs:], ldwv) // Multiply by U21. impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv) bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz, 1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv) // Multiply by U11. bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2, 1, h[jrow*ldh+incol+1:], ldh, u, ldu, 1, wv, ldwv) // Copy left of H to right of scratch. impl.Dlacpy(blas.All, jlen, j2, h[jrow*ldh+incol+1:], ldh, wv[i2:], ldwv) // Multiply by U21. bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2, 1, u[i2:], ldu, wv[i2:], ldwv) // Multiply by U22. bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2, 1, h[jrow*ldh+incol+1+j2:], ldh, u[j2*ldu+i2:], ldu, 1, wv[i2:], ldwv) // Copy it back. impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, h[jrow*ldh+incol+1:], ldh) } if !wantz { continue } // Multiply Z (also vertical). for jrow := iloz; jrow <= ihiz; jrow += nv { jlen := min(nv, ihiz-jrow+1) // Copy right of Z to left of scratch (first kzs columns get // multiplied by zero). impl.Dlacpy(blas.All, jlen, knz, z[jrow*ldz+incol+1+j2:], ldz, wv[kzs:], ldwv) // Multiply by U12. impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv) bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz, 1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv) // Multiply by U11. bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2, 1, z[jrow*ldz+incol+1:], ldz, u, ldu, 1, wv, ldwv) // Copy left of Z to right of scratch. impl.Dlacpy(blas.All, jlen, j2, z[jrow*ldz+incol+1:], ldz, wv[i2:], ldwv) // Multiply by U21. bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2, 1, u[i2:], ldu, wv[i2:], ldwv) // Multiply by U22. bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2, 1, z[jrow*ldz+incol+1+j2:], ldz, u[j2*ldu+i2:], ldu, 1, wv[i2:], ldwv) // Copy the result back to Z. impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, z[jrow*ldz+incol+1:], ldz) } } }
// Dlatrs solves a triangular system of equations scaled to prevent overflow. It // solves // A * x = scale * b if trans == blas.NoTrans // A^T * x = scale * b if trans == blas.Trans // where the scale s is set for numeric stability. // // A is an n×n triangular matrix. On entry, the slice x contains the values of // of b, and on exit it contains the solution vector x. // // If normin == true, cnorm is an input and cnorm[j] contains the norm of the off-diagonal // part of the j^th column of A. If trans == blas.NoTrans, cnorm[j] must be greater // than or equal to the infinity norm, and greater than or equal to the one-norm // otherwise. If normin == false, then cnorm is treated as an output, and is set // to contain the 1-norm of the off-diagonal part of the j^th column of A. func (impl Implementation) Dlatrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, normin bool, n int, a []float64, lda int, x []float64, cnorm []float64) (scale float64) { if uplo != blas.Upper && uplo != blas.Lower { panic(badUplo) } if trans != blas.Trans && trans != blas.NoTrans { panic(badTrans) } if diag != blas.Unit && diag != blas.NonUnit { panic(badDiag) } upper := uplo == blas.Upper noTrans := trans == blas.NoTrans nonUnit := diag == blas.NonUnit if n < 0 { panic(nLT0) } checkMatrix(n, n, a, lda) checkVector(n, x, 1) checkVector(n, cnorm, 1) if n == 0 { return } scale = 1 bi := blas64.Implementation() if !normin { if upper { for j := 0; j < n; j++ { cnorm[j] = bi.Dasum(j, a[j:], lda) } } else { for j := 0; j < n-1; j++ { cnorm[j] = bi.Dasum(n-j-1, a[(j+1)*lda+j:], lda) } cnorm[n-1] = 0 } } // Scale the column norms by tscal if the maximum element in cnorm is greater than bignum. imax := bi.Idamax(n, cnorm, 1) tmax := cnorm[imax] var tscal float64 if tmax <= bignum { tscal = 1 } else { tscal = 1 / (smlnum * tmax) bi.Dscal(n, tscal, cnorm, 1) } // Compute a bound on the computed solution vector to see if bi.Dtrsv can be used. j := bi.Idamax(n, x, 1) xmax := math.Abs(x[j]) xbnd := xmax var grow float64 var jfirst, jlast, jinc int if noTrans { if upper { jfirst = n - 1 jlast = 0 jinc = -1 } else { jfirst = 0 jlast = n - 1 jinc = 1 } // Compute the growth in A * x = b. if tscal != 1 { grow = 0 goto Finish } if nonUnit { grow = 1 / math.Max(xbnd, smlnum) xbnd = grow for j := jfirst; j != jlast; j += jinc { if grow <= smlnum { goto Finish } tjj := math.Abs(a[j*lda+j]) xbnd = math.Min(xbnd, math.Min(1, tjj)*grow) if tjj+cnorm[j] >= smlnum { grow *= tjj / (tjj + cnorm[j]) } else { grow = 0 } } grow = xbnd } else { grow = math.Min(1, 1/math.Max(xbnd, smlnum)) for j := jfirst; j != jlast; j += jinc { if grow <= smlnum { goto Finish } grow *= 1 / (1 + cnorm[j]) } } } else { if upper { jfirst = 0 jlast = n - 1 jinc = 1 } else { jfirst = n - 1 jlast = 0 jinc = -1 } if tscal != 1 { grow = 0 goto Finish } if nonUnit { grow = 1 / (math.Max(xbnd, smlnum)) xbnd = grow for j := jfirst; j != jlast; j += jinc { if grow <= smlnum { goto Finish } xj := 1 + cnorm[j] grow = math.Min(grow, xbnd/xj) tjj := math.Abs(a[j*lda+j]) if xj > tjj { xbnd *= tjj / xj } } grow = math.Min(grow, xbnd) } else { grow = math.Min(1, 1/math.Max(xbnd, smlnum)) for j := jfirst; j != jlast; j += jinc { if grow <= smlnum { goto Finish } xj := 1 + cnorm[j] grow /= xj } } } Finish: if grow*tscal > smlnum { bi.Dtrsv(uplo, trans, diag, n, a, lda, x, 1) // TODO(btracey): check if this else is everything } else { if xmax > bignum { scale = bignum / xmax bi.Dscal(n, scale, x, 1) xmax = bignum } if noTrans { for j := jfirst; j != jlast; j += jinc { xj := math.Abs(x[j]) var tjjs float64 if nonUnit { tjjs = a[j*lda+j] * tscal } else { tjjs = tscal if tscal == 1 { break } } tjj := math.Abs(tjjs) if tjj > smlnum { if tjj < 1 { if xj > tjj*bignum { rec := 1 / xj bi.Dscal(n, rec, x, 1) scale *= rec xmax *= rec } } x[j] /= tjjs xj = math.Abs(x[j]) } else if tjj > 0 { if xj > tjj*bignum { rec := (tjj * bignum) / xj if cnorm[j] > 1 { rec /= cnorm[j] } bi.Dscal(n, rec, x, 1) scale *= rec xmax *= rec } x[j] /= tjjs xj = math.Abs(x[j]) } else { for i := 0; i < n; i++ { x[i] = 0 } x[j] = 1 xj = 1 scale = 0 xmax = 0 } if xj > 1 { rec := 1 / xj if cnorm[j] > (bignum-xmax)*rec { rec *= 0.5 bi.Dscal(n, rec, x, 1) scale *= rec } } else if xj*cnorm[j] > bignum-xmax { bi.Dscal(n, 0.5, x, 1) scale *= 0.5 } if upper { if j > 0 { bi.Daxpy(j, -x[j]*tscal, a[j:], lda, x, 1) i := bi.Idamax(j, x, 1) xmax = math.Abs(x[i]) } } else { if j < n-1 { bi.Daxpy(n-j-1, -x[j]*tscal, a[(j+1)*lda+j:], lda, x[j+1:], 1) i := j + bi.Idamax(n-j-1, x[j+1:], 1) xmax = math.Abs(x[i]) } } } } else { for j := jfirst; j != jlast; j += jinc { xj := math.Abs(x[j]) uscal := tscal rec := 1 / math.Max(xmax, 1) var tjjs float64 if cnorm[j] > (bignum-xj)*rec { rec *= 0.5 if nonUnit { tjjs = a[j*lda+j] * tscal } else { tjjs = tscal } tjj := math.Abs(tjjs) if tjj > 1 { rec = math.Min(1, rec*tjj) uscal /= tjjs } if rec < 1 { bi.Dscal(n, rec, x, 1) scale *= rec xmax *= rec } } var sumj float64 if uscal == 1 { if upper { sumj = bi.Ddot(j, a[j:], lda, x, 1) } else if j < n-1 { sumj = bi.Ddot(n-j-1, a[(j+1)*lda+j:], lda, x[j+1:], 1) } } else { if upper { for i := 0; i < j; i++ { sumj += (a[i*lda+j] * uscal) * x[i] } } else if j < n { for i := j + 1; i < n; i++ { sumj += (a[i*lda+j] * uscal) * x[i] } } } if uscal == tscal { x[j] -= sumj xj := math.Abs(x[j]) var tjjs float64 if nonUnit { tjjs = a[j*lda+j] * tscal } else { tjjs = tscal if tscal == 1 { goto Out2 } } tjj := math.Abs(tjjs) if tjj > smlnum { if tjj < 1 { if xj > tjj*bignum { rec = 1 / xj bi.Dscal(n, rec, x, 1) scale *= rec xmax *= rec } } x[j] /= tjjs } else if tjj > 0 { if xj > tjj*bignum { rec = (tjj * bignum) / xj bi.Dscal(n, rec, x, 1) scale *= rec xmax *= rec } x[j] /= tjjs } else { for i := 0; i < n; i++ { x[i] = 0 } x[j] = 1 scale = 0 xmax = 0 } } else { x[j] = x[j]/tjjs - sumj } Out2: xmax = math.Max(xmax, math.Abs(x[j])) } } scale /= tscal } if tscal != 1 { bi.Dscal(n, 1/tscal, cnorm, 1) } return scale }
// Dtrcon estimates the reciprocal of the condition number of a triangular matrix A. // The condition number computed may be based on the 1-norm or the ∞-norm. // // work is a temporary data slice of length at least 3*n and Dtrcon will panic otherwise. // // iwork is a temporary data slice of length at least n and Dtrcon will panic otherwise. func (impl Implementation) Dtrcon(norm lapack.MatrixNorm, uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int, work []float64, iwork []int) float64 { if norm != lapack.MaxColumnSum && norm != lapack.MaxRowSum { panic(badNorm) } if uplo != blas.Upper && uplo != blas.Lower { panic(badUplo) } if diag != blas.NonUnit && diag != blas.Unit { panic(badDiag) } if len(work) < 3*n { panic(badWork) } if len(iwork) < n { panic(badWork) } if n == 0 { return 1 } bi := blas64.Implementation() var rcond float64 smlnum := dlamchS * float64(n) anorm := impl.Dlantr(norm, uplo, diag, n, n, a, lda, work) if anorm <= 0 { return rcond } var ainvnm float64 var normin bool kase1 := 2 if norm == lapack.MaxColumnSum { kase1 = 1 } var kase int isave := new([3]int) var scale float64 for { ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave) if kase == 0 { if ainvnm != 0 { rcond = (1 / anorm) / ainvnm } return rcond } if kase == kase1 { scale = impl.Dlatrs(uplo, blas.NoTrans, diag, normin, n, a, lda, work, work[2*n:]) } else { scale = impl.Dlatrs(uplo, blas.Trans, diag, normin, n, a, lda, work, work[2*n:]) } normin = true if scale != 1 { ix := bi.Idamax(n, work, 1) xnorm := math.Abs(work[ix]) if scale == 0 || scale < xnorm*smlnum { return rcond } impl.Drscl(n, scale, work, 1) } } }