func getPdbChain(fp string) *pdb.Chain { b := path.Base(fp) if !strings.HasSuffix(b, ".fmap") { util.Fatalf("Expected file named 'something.fmap' but got '%s'.", b) } idAndChain := b[0 : len(b)-5] if len(idAndChain) != 5 { util.Fatalf("Expected 4-letter PDB id concatenated with 1-letter "+ "chain identifier, but got '%s' instead.", idAndChain) } pdbName := idAndChain[0:4] chainId := idAndChain[4] pdbCat := idAndChain[1:3] pdbFile := fmt.Sprintf("pdb%s.ent.gz", pdbName) pdbPath := path.Join(util.FlagPdbDir, pdbCat, pdbFile) entry := util.PDBRead(pdbPath) chain := entry.Chain(chainId) if chain == nil { util.Fatalf("Could not find chain '%c' in PDB entry '%s'.", chainId, pdbPath) } return chain }
func main() { var f io.Reader var err error f = util.OpenFile(flag.Arg(0)) if strings.HasSuffix(flag.Arg(0), ".gz") { f, err = gzip.NewReader(f) util.Assert(err) } cifEntry, err := pdbx.Read(f) util.Assert(err, "Could not read PDBx/mmCIF file") fasEntries := make([]seq.Sequence, 0, 5) for _, ent := range cifEntry.Entities { for _, chain := range ent.Chains { if !isChainUsable(chain) || len(ent.Seq) == 0 { continue } fasEntry := seq.Sequence{ Name: chainHeader(chain), Residues: ent.Seq, } fasEntries = append(fasEntries, fasEntry) } } if len(fasEntries) == 0 { util.Fatalf("Could not find any chains with amino acids.") } var fasOut io.Writer if flag.NArg() == 1 { fasOut = os.Stdout } else { if len(flagSplit) > 0 { util.Fatalf("The '--split' option is incompatible with a single " + "output file.") } fasOut = util.CreateFile(util.Arg(1)) } if len(flagSplit) == 0 { util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries), "Could not write FASTA file '%s'", fasOut) } else { for _, entry := range fasEntries { fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name)) out := util.CreateFile(fp) w := fasta.NewWriter(out) util.Assert(w.Write(entry), "Could not write to '%s'", fp) util.Assert(w.Flush(), "Could not write to '%s'", fp) } } }
func mkPaired(c *command) { c.assertNArg(2) in := util.Library(c.flags.Arg(0)) outPath := c.flags.Arg(1) util.AssertOverwritable(outPath, flagOverwrite) if _, ok := in.(fragbag.WeightedLibrary); ok { util.Fatalf("%s is a weighted library (not allowed)", in.Name()) } name := fmt.Sprintf("paired-%s", in.Name()) if fragbag.IsStructure(in) { var pairs [][]structure.Coords lib := in.(fragbag.StructureLibrary) nfrags := lib.Size() for i := 0; i < nfrags; i++ { for j := 0; j < nfrags; j++ { if i == j { continue } f1, f2 := lib.Atoms(i), lib.Atoms(j) pairs = append(pairs, append(f1, f2...)) } } pairLib, err := fragbag.NewStructureAtoms(name, pairs) util.Assert(err) fragbag.Save(util.CreateFile(outPath), pairLib) } else if strings.Contains(in.Tag(), "hmm") { var pairs []*seq.HMM lib := in.(fragbag.SequenceLibrary) nfrags := lib.Size() for i := 0; i < nfrags; i++ { for j := 0; j < nfrags; j++ { if i == j { continue } f1, f2 := lib.Fragment(i).(*seq.HMM), lib.Fragment(j).(*seq.HMM) pairs = append(pairs, seq.HMMCat(f1, f2)) } } pairLib, err := fragbag.NewSequenceHMM(name, pairs) util.Assert(err) fragbag.Save(util.CreateFile(outPath), pairLib) } else if strings.Contains(in.Tag(), "profile") { util.Fatalf("Sequence profiles not implemented.") } else { util.Fatalf("Unrecognized fragment library: %s", in.Tag()) } }
func main() { entry := util.PDBRead(flag.Arg(0)) if len(flagChain) > 0 { if len(flagChain) != 1 { util.Fatalf("Chain identifiers must be a single character.") } chain := entry.Chain(flagChain[0]) if chain == nil { util.Fatalf("Could not find chain '%c' in PDB entry '%s'.", chain.Ident, entry.Path) } showMapping(chain, chain.SequenceAtoms()) } else { for _, chain := range entry.Chains { showMapping(chain, chain.SequenceAtoms()) } } }
func coords(num int, atomRecords []byte) []structure.Coords { r := bytes.NewReader(atomRecords) name := fmt.Sprintf("fragment %d", num) entry, err := pdb.Read(r, name) util.Assert(err, "Fragment contents could not be read in PDB format") atoms := entry.OneChain().CaAtoms() if len(atoms) == 0 { util.Fatalf("Fragment %d has no ATOM coordinates.", num) } return atoms }
func ioFromFile(fpath, force string) msaIO { var fmt string if len(force) > 0 { fmt = force } else { var ok bool ext := path.Ext(fpath) if len(ext) > 0 { ext = ext[1:] } fmt, ok = extToFmt[ext] if !ok { util.Fatalf("Could not detect format from extension '%s'.", ext) } } io, ok := fmtToIO[fmt] if !ok { util.Fatalf("BUG: Could not find converters for format '%s'.", fmt) } return io }
func main() { lib = util.StructureLibrary(util.Arg(0)) pdbEntry := util.PDBRead(util.Arg(1)) if util.NArg() == 2 { for _, chain := range pdbEntry.Chains { atoms := chain.CaAtoms() bestFragsForRegion(chain, atoms, 0, len(atoms)) } } else { chainId := util.Arg(2) chain := pdbEntry.Chain(chainId[0]) if chain == nil || !chain.IsProtein() { util.Fatalf("Could not find protein chain with id '%c'.", chainId) } atoms := chain.CaAtoms() if util.NArg() == 3 { bestFragsForRegion(chain, atoms, 0, len(atoms)) } else { if util.NArg() != 5 { log.Println("Both a start and end must be provided.") util.Usage() } s, e := util.Arg(3), util.Arg(4) sn, en := util.ParseInt(s)-1, util.ParseInt(e) if en-sn < lib.FragmentSize() { util.Fatalf("The range [%s, %s] specifies %d alpha-carbon "+ "atoms while at least %d alpha-carbon atoms are required "+ "for the given fragment library.", s, e, en-sn, lib.FragmentSize()) } bestFragsForRegion(chain, atoms, sn, en) } } }
func search(c *command) { c.assertLeastNArg(2) // Some search options don't translate directly to command line parameters // specified by the flag package. if flagSearchDesc { flagSearchOpts.Order = bowdb.OrderDesc } switch flagSearchSort { case "cosine": flagSearchOpts.SortBy = bowdb.SortByCosine case "euclid": flagSearchOpts.SortBy = bowdb.SortByEuclid default: util.Fatalf("Unknown sort field '%s'.", flagSearchSort) } db := util.OpenBowDB(c.flags.Arg(0)) bowPaths := c.flags.Args()[1:] _, err := db.ReadAll() util.Assert(err, "Could not read BOW database entries") // always hide the progress bar here. bows := util.ProcessBowers(bowPaths, db.Lib, false, flagCpu, true) out, outDone := outputter() // launch goroutines to search queries in parallel wgSearch := new(sync.WaitGroup) for i := 0; i < flagCpu; i++ { wgSearch.Add(1) go func() { defer wgSearch.Done() for b := range bows { sr := db.Search(flagSearchOpts, b) out <- searchResult{b, sr} } }() } wgSearch.Wait() close(out) <-outDone util.Assert(db.Close()) }
func init() { flag.Float64Var(&flagThreshold, "threshold", flagThreshold, "Set the distance threshold to use when computing AUC.") util.FlagUse("cpu", "cpuprof") util.FlagParse( "cath-domain-labels best-of-all-matrix"+ "(bowdb | matrix-file) out-file "+ "[ (bowdb | matrix-file) out-file ... ]", "Computes the AUC of each aligner matrix (or BOW database) given\n"+ "with respect to the 'best-of-all' matrix given. Each AUC is\n"+ "written to a separate out-file. The sizes of all matrices must\n"+ "be exactly equivalent.\n"+ "Files are interpreted as BOW databases if they have a '.bowdb'\n"+ "file extension.") util.AssertLeastNArg(4) if util.NArg()%2 != 0 { util.Fatalf("There must be an out file for each matrix or bowdb file.") } }
func main() { libPath := util.Arg(0) chain := util.Arg(1) pdbEntryPath := util.Arg(2) bowOut := util.Arg(3) lib := util.StructureLibrary(libPath) entry := util.PDBRead(pdbEntryPath) thechain := entry.Chain(chain[0]) if thechain == nil || !thechain.IsProtein() { util.Fatalf("Could not find chain with identifier '%c'.", chain[0]) } bow := bow.BowerFromChain(thechain).StructureBow(lib) if bowOut == "--" { fmt.Println(bow) } else { util.BowWrite(util.CreateFile(bowOut), bow) } }
func recordToDist(record []string) pair { namePieces := strings.SplitN(record[0], ".ent_", 2) if len(namePieces) != 2 { util.Fatalf("Invalid alignment pair: '%s'.", record[0]) } p1, p2 := namePieces[0], namePieces[1] p2 = p2[0 : len(p2)-5] rf := func(i int) float64 { return readFloat(record[i]) } corelen, rmsd := rf(1), rf(2) l1, l2 := rf(7), rf(8) coreval := (2.0 * corelen) / (l1 + l2) dist := -6.04979701*(rmsd-coreval*corelen*0.155+1.6018) + 1000 dist = 1.0 / dist dist *= 100.0 if p1 < p2 { return pair{[2]string{p1, p2}, dist} } return pair{[2]string{p2, p1}, dist} }
func outputter() (chan searchResult, chan struct{}) { out := make(chan searchResult) done := make(chan struct{}) go func() { if flagSearchOutFmt == "csv" { fmt.Printf("QueryID\tHitID\tCosine\tEuclid\n") } first := true for sr := range out { switch flagSearchOutFmt { case "plain": outputPlain(sr, first) case "csv": outputCsv(sr, first) default: util.Fatalf("Invalid output format '%s'.", flagSearchOutFmt) } first = false } done <- struct{}{} }() return out, done }
func main() { if len(util.FlagCpuProf) > 0 { f := util.CreateFile(util.FlagCpuProf) pprof.StartCPUProfile(f) defer f.Close() defer pprof.StopCPUProfile() } // Read all CATH domains, the best-of-all matrix, and the matrix for // each aligner. domains := readDomains(util.Arg(0)) boa := readMatrix(domains, util.Arg(1)) aligners := make([]aligner, 0) flibs := make([]flib, 0) for i := 2; i < util.NArg(); i += 2 { fpath := util.Arg(i) if path.Ext(fpath) == ".bowdb" { db := util.OpenBowDB(fpath) records, err := db.ReadAll() util.Assert(err) bowed := make([]bow.Bowed, domains.in.Len()) for _, b := range records { if !domains.in.Exists(b.Id) { util.Fatalf("Found ID in bowdb that isn't in the list "+ "of CATH domains provided: %s", b.Id) } bowed[domains.in.Atom(b.Id)] = b } flibs = append(flibs, flib{db, bowed, util.Arg(i + 1)}) } else { aligners = append(aligners, aligner{ readMatrix(domains, fpath), util.Arg(i + 1), }) } } // Now remove CATH domains that don't have a corresponding structure file. // We don't do this initially since the matrix files are indexed with // respect to all CATH domains (includings ones without structure). // This is an artifact of the fact that the matrices were generated with // a very old version of CATH. domains.removeOldDomains() if a := matrixAuc(domains, boa, boa, flagThreshold); a != 1.0 { util.Fatalf("Something is wrong. The AUC of the best-of-all matrix "+ "with respect to itself is %f, but it should be 1.0.", a) } if len(aligners) > 0 { fmt.Println("Computing AUC for aligners...") writeAuc := func(aligner aligner) struct{} { w := util.CreateFile(aligner.outpath) a := matrixAuc(domains, boa, aligner.matrix, flagThreshold) fmt.Fprintf(w, "%f\n", a) return struct{}{} } fun.ParMap(writeAuc, aligners) } if len(flibs) > 0 { fmt.Println("Computing AUC for bowdbs...") writeAuc := func(flib flib) struct{} { w := util.CreateFile(flib.outpath) a := flibAuc(domains, boa, flib, flagThreshold) fmt.Fprintf(w, "%f\n", a) return struct{}{} } fun.ParMap(writeAuc, flibs) } }
func main() { pdbEntry := util.PDBRead(flag.Arg(0)) fasEntries := make([]seq.Sequence, 0, 5) if !flagSeparateChains { var fasEntry seq.Sequence if len(pdbEntry.Chains) == 1 { fasEntry.Name = chainHeader(pdbEntry.OneChain()) } else { fasEntry.Name = fmt.Sprintf("%s", strings.ToLower(pdbEntry.IdCode)) } seq := make([]seq.Residue, 0, 100) for _, chain := range pdbEntry.Chains { if isChainUsable(chain) { seq = append(seq, chain.Sequence...) } } fasEntry.Residues = seq if len(fasEntry.Residues) == 0 { util.Fatalf("Could not find any amino acids.") } fasEntries = append(fasEntries, fasEntry) } else { for _, chain := range pdbEntry.Chains { if !isChainUsable(chain) { continue } fasEntry := seq.Sequence{ Name: chainHeader(chain), Residues: chain.Sequence, } fasEntries = append(fasEntries, fasEntry) } } if len(fasEntries) == 0 { util.Fatalf("Could not find any chains with amino acids.") } var fasOut io.Writer if flag.NArg() == 1 { fasOut = os.Stdout } else { if len(flagSplit) > 0 { util.Fatalf("The '--split' option is incompatible with a single " + "output file.") } fasOut = util.CreateFile(util.Arg(1)) } if len(flagSplit) == 0 { util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries), "Could not write FASTA file '%s'", fasOut) } else { for _, entry := range fasEntries { fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name)) out := util.CreateFile(fp) w := fasta.NewWriter(out) util.Assert(w.Write(entry), "Could not write to '%s'", fp) util.Assert(w.Flush(), "Could not write to '%s'", fp) } } }
func main() { fmapPath := util.Arg(0) fmap := util.FmapRead(fmapPath) qchain := getPdbChain(fmapPath) stats := newSequenceStats(qchain.Sequence) total, trueps := 0, 0 qcorrupt, tcorrupt := 0, 0 for _, frags := range fmap.Segments { for _, frag := range frags.Frags { hit := frag.Hit if frag.IsCorrupt() { tcorrupt += 1 stats.incTCorrupt(hit) continue } qatoms := qchain.SequenceCaAtomSlice(hit.QueryStart-1, hit.QueryEnd) if qatoms == nil { qcorrupt += 1 stats.incQCorrupt(hit) continue } if len(qatoms) != len(frag.CaAtoms) { util.Fatalf("Uncomparable lengths. Query is (%d, %d) while "+ "template is (%d, %d). Length of query CaAtoms: %d, "+ "length of template CaAtoms: %d", hit.QueryStart, hit.QueryEnd, hit.TemplateStart, hit.TemplateEnd, len(qatoms), len(frag.CaAtoms)) } if structure.RMSD(qatoms, frag.CaAtoms) <= flagRmsd { trueps += 1 stats.incTruePs(hit) } total += 1 stats.incTotal(hit) } } coveredResidues := 0 for _, resStats := range stats { if resStats.trueps >= 1 { coveredResidues += 1 } } coverage := float64(coveredResidues) / float64(len(qchain.Sequence)) fmt.Printf("RMSDThreshold: %f\n", flagRmsd) fmt.Printf("TotalFragments: %d\n", total) fmt.Printf("TruePositives: %d\n", trueps) fmt.Printf("Precision: %f\n", float64(trueps)/float64(total)) fmt.Printf("CorruptQuery: %d\n", qcorrupt) fmt.Printf("CorruptTemplate: %d\n", tcorrupt) fmt.Printf("TotalResidues: %d\n", len(qchain.Sequence)) fmt.Printf("CoveredResidues: %d\n", coveredResidues) fmt.Printf("Coverage: %f\n", coverage) }