Exemplo n.º 1
0
func getPdbChain(fp string) *pdb.Chain {
	b := path.Base(fp)
	if !strings.HasSuffix(b, ".fmap") {
		util.Fatalf("Expected file named 'something.fmap' but got '%s'.", b)
	}
	idAndChain := b[0 : len(b)-5]
	if len(idAndChain) != 5 {
		util.Fatalf("Expected 4-letter PDB id concatenated with 1-letter "+
			"chain identifier, but got '%s' instead.", idAndChain)
	}

	pdbName := idAndChain[0:4]
	chainId := idAndChain[4]
	pdbCat := idAndChain[1:3]
	pdbFile := fmt.Sprintf("pdb%s.ent.gz", pdbName)

	pdbPath := path.Join(util.FlagPdbDir, pdbCat, pdbFile)

	entry := util.PDBRead(pdbPath)
	chain := entry.Chain(chainId)
	if chain == nil {
		util.Fatalf("Could not find chain '%c' in PDB entry '%s'.",
			chainId, pdbPath)
	}
	return chain
}
Exemplo n.º 2
0
func main() {
	var f io.Reader
	var err error

	f = util.OpenFile(flag.Arg(0))
	if strings.HasSuffix(flag.Arg(0), ".gz") {
		f, err = gzip.NewReader(f)
		util.Assert(err)
	}
	cifEntry, err := pdbx.Read(f)
	util.Assert(err, "Could not read PDBx/mmCIF file")

	fasEntries := make([]seq.Sequence, 0, 5)
	for _, ent := range cifEntry.Entities {
		for _, chain := range ent.Chains {
			if !isChainUsable(chain) || len(ent.Seq) == 0 {
				continue
			}

			fasEntry := seq.Sequence{
				Name:     chainHeader(chain),
				Residues: ent.Seq,
			}
			fasEntries = append(fasEntries, fasEntry)
		}
	}
	if len(fasEntries) == 0 {
		util.Fatalf("Could not find any chains with amino acids.")
	}

	var fasOut io.Writer
	if flag.NArg() == 1 {
		fasOut = os.Stdout
	} else {
		if len(flagSplit) > 0 {
			util.Fatalf("The '--split' option is incompatible with a single " +
				"output file.")
		}
		fasOut = util.CreateFile(util.Arg(1))
	}

	if len(flagSplit) == 0 {
		util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries),
			"Could not write FASTA file '%s'", fasOut)
	} else {
		for _, entry := range fasEntries {
			fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name))
			out := util.CreateFile(fp)

			w := fasta.NewWriter(out)
			util.Assert(w.Write(entry), "Could not write to '%s'", fp)
			util.Assert(w.Flush(), "Could not write to '%s'", fp)
		}
	}
}
Exemplo n.º 3
0
func mkPaired(c *command) {
	c.assertNArg(2)

	in := util.Library(c.flags.Arg(0))
	outPath := c.flags.Arg(1)
	util.AssertOverwritable(outPath, flagOverwrite)

	if _, ok := in.(fragbag.WeightedLibrary); ok {
		util.Fatalf("%s is a weighted library (not allowed)", in.Name())
	}

	name := fmt.Sprintf("paired-%s", in.Name())
	if fragbag.IsStructure(in) {
		var pairs [][]structure.Coords
		lib := in.(fragbag.StructureLibrary)
		nfrags := lib.Size()
		for i := 0; i < nfrags; i++ {
			for j := 0; j < nfrags; j++ {
				if i == j {
					continue
				}
				f1, f2 := lib.Atoms(i), lib.Atoms(j)
				pairs = append(pairs, append(f1, f2...))
			}
		}
		pairLib, err := fragbag.NewStructureAtoms(name, pairs)
		util.Assert(err)
		fragbag.Save(util.CreateFile(outPath), pairLib)
	} else if strings.Contains(in.Tag(), "hmm") {
		var pairs []*seq.HMM
		lib := in.(fragbag.SequenceLibrary)
		nfrags := lib.Size()
		for i := 0; i < nfrags; i++ {
			for j := 0; j < nfrags; j++ {
				if i == j {
					continue
				}
				f1, f2 := lib.Fragment(i).(*seq.HMM), lib.Fragment(j).(*seq.HMM)
				pairs = append(pairs, seq.HMMCat(f1, f2))
			}
		}
		pairLib, err := fragbag.NewSequenceHMM(name, pairs)
		util.Assert(err)
		fragbag.Save(util.CreateFile(outPath), pairLib)
	} else if strings.Contains(in.Tag(), "profile") {
		util.Fatalf("Sequence profiles not implemented.")
	} else {
		util.Fatalf("Unrecognized fragment library: %s", in.Tag())
	}
}
Exemplo n.º 4
0
func main() {
	entry := util.PDBRead(flag.Arg(0))

	if len(flagChain) > 0 {
		if len(flagChain) != 1 {
			util.Fatalf("Chain identifiers must be a single character.")
		}
		chain := entry.Chain(flagChain[0])
		if chain == nil {
			util.Fatalf("Could not find chain '%c' in PDB entry '%s'.",
				chain.Ident, entry.Path)
		}
		showMapping(chain, chain.SequenceAtoms())
	} else {
		for _, chain := range entry.Chains {
			showMapping(chain, chain.SequenceAtoms())
		}
	}
}
Exemplo n.º 5
0
func coords(num int, atomRecords []byte) []structure.Coords {
	r := bytes.NewReader(atomRecords)
	name := fmt.Sprintf("fragment %d", num)

	entry, err := pdb.Read(r, name)
	util.Assert(err, "Fragment contents could not be read in PDB format")

	atoms := entry.OneChain().CaAtoms()
	if len(atoms) == 0 {
		util.Fatalf("Fragment %d has no ATOM coordinates.", num)
	}
	return atoms
}
Exemplo n.º 6
0
func ioFromFile(fpath, force string) msaIO {
	var fmt string
	if len(force) > 0 {
		fmt = force
	} else {
		var ok bool
		ext := path.Ext(fpath)
		if len(ext) > 0 {
			ext = ext[1:]
		}

		fmt, ok = extToFmt[ext]
		if !ok {
			util.Fatalf("Could not detect format from extension '%s'.", ext)
		}
	}

	io, ok := fmtToIO[fmt]
	if !ok {
		util.Fatalf("BUG: Could not find converters for format '%s'.", fmt)
	}
	return io
}
Exemplo n.º 7
0
func main() {
	lib = util.StructureLibrary(util.Arg(0))
	pdbEntry := util.PDBRead(util.Arg(1))

	if util.NArg() == 2 {
		for _, chain := range pdbEntry.Chains {
			atoms := chain.CaAtoms()
			bestFragsForRegion(chain, atoms, 0, len(atoms))
		}
	} else {
		chainId := util.Arg(2)
		chain := pdbEntry.Chain(chainId[0])
		if chain == nil || !chain.IsProtein() {
			util.Fatalf("Could not find protein chain with id '%c'.", chainId)
		}
		atoms := chain.CaAtoms()

		if util.NArg() == 3 {
			bestFragsForRegion(chain, atoms, 0, len(atoms))
		} else {
			if util.NArg() != 5 {
				log.Println("Both a start and end must be provided.")
				util.Usage()
			}

			s, e := util.Arg(3), util.Arg(4)
			sn, en := util.ParseInt(s)-1, util.ParseInt(e)
			if en-sn < lib.FragmentSize() {
				util.Fatalf("The range [%s, %s] specifies %d alpha-carbon "+
					"atoms while at least %d alpha-carbon atoms are required "+
					"for the given fragment library.",
					s, e, en-sn, lib.FragmentSize())
			}
			bestFragsForRegion(chain, atoms, sn, en)
		}
	}
}
Exemplo n.º 8
0
func search(c *command) {
	c.assertLeastNArg(2)

	// Some search options don't translate directly to command line parameters
	// specified by the flag package.
	if flagSearchDesc {
		flagSearchOpts.Order = bowdb.OrderDesc
	}
	switch flagSearchSort {
	case "cosine":
		flagSearchOpts.SortBy = bowdb.SortByCosine
	case "euclid":
		flagSearchOpts.SortBy = bowdb.SortByEuclid
	default:
		util.Fatalf("Unknown sort field '%s'.", flagSearchSort)
	}

	db := util.OpenBowDB(c.flags.Arg(0))
	bowPaths := c.flags.Args()[1:]

	_, err := db.ReadAll()
	util.Assert(err, "Could not read BOW database entries")

	// always hide the progress bar here.
	bows := util.ProcessBowers(bowPaths, db.Lib, false, flagCpu, true)
	out, outDone := outputter()

	// launch goroutines to search queries in parallel
	wgSearch := new(sync.WaitGroup)
	for i := 0; i < flagCpu; i++ {
		wgSearch.Add(1)
		go func() {
			defer wgSearch.Done()

			for b := range bows {
				sr := db.Search(flagSearchOpts, b)
				out <- searchResult{b, sr}
			}
		}()
	}

	wgSearch.Wait()
	close(out)
	<-outDone
	util.Assert(db.Close())
}
Exemplo n.º 9
0
func init() {
	flag.Float64Var(&flagThreshold, "threshold", flagThreshold,
		"Set the distance threshold to use when computing AUC.")
	util.FlagUse("cpu", "cpuprof")
	util.FlagParse(
		"cath-domain-labels best-of-all-matrix"+
			"(bowdb | matrix-file) out-file "+
			"[ (bowdb | matrix-file) out-file ... ]",
		"Computes the AUC of each aligner matrix (or BOW database) given\n"+
			"with respect to the 'best-of-all' matrix given. Each AUC is\n"+
			"written to a separate out-file. The sizes of all matrices must\n"+
			"be exactly equivalent.\n"+
			"Files are interpreted as BOW databases if they have a '.bowdb'\n"+
			"file extension.")
	util.AssertLeastNArg(4)
	if util.NArg()%2 != 0 {
		util.Fatalf("There must be an out file for each matrix or bowdb file.")
	}
}
Exemplo n.º 10
0
func main() {
	libPath := util.Arg(0)
	chain := util.Arg(1)
	pdbEntryPath := util.Arg(2)
	bowOut := util.Arg(3)

	lib := util.StructureLibrary(libPath)
	entry := util.PDBRead(pdbEntryPath)

	thechain := entry.Chain(chain[0])
	if thechain == nil || !thechain.IsProtein() {
		util.Fatalf("Could not find chain with identifier '%c'.", chain[0])
	}

	bow := bow.BowerFromChain(thechain).StructureBow(lib)
	if bowOut == "--" {
		fmt.Println(bow)
	} else {
		util.BowWrite(util.CreateFile(bowOut), bow)
	}
}
Exemplo n.º 11
0
func recordToDist(record []string) pair {
	namePieces := strings.SplitN(record[0], ".ent_", 2)
	if len(namePieces) != 2 {
		util.Fatalf("Invalid alignment pair: '%s'.", record[0])
	}
	p1, p2 := namePieces[0], namePieces[1]
	p2 = p2[0 : len(p2)-5]

	rf := func(i int) float64 { return readFloat(record[i]) }
	corelen, rmsd := rf(1), rf(2)
	l1, l2 := rf(7), rf(8)
	coreval := (2.0 * corelen) / (l1 + l2)

	dist := -6.04979701*(rmsd-coreval*corelen*0.155+1.6018) + 1000
	dist = 1.0 / dist
	dist *= 100.0
	if p1 < p2 {
		return pair{[2]string{p1, p2}, dist}
	}
	return pair{[2]string{p2, p1}, dist}
}
Exemplo n.º 12
0
func outputter() (chan searchResult, chan struct{}) {
	out := make(chan searchResult)
	done := make(chan struct{})
	go func() {
		if flagSearchOutFmt == "csv" {
			fmt.Printf("QueryID\tHitID\tCosine\tEuclid\n")
		}

		first := true
		for sr := range out {
			switch flagSearchOutFmt {
			case "plain":
				outputPlain(sr, first)
			case "csv":
				outputCsv(sr, first)
			default:
				util.Fatalf("Invalid output format '%s'.", flagSearchOutFmt)
			}
			first = false
		}
		done <- struct{}{}
	}()
	return out, done
}
Exemplo n.º 13
0
func main() {
	if len(util.FlagCpuProf) > 0 {
		f := util.CreateFile(util.FlagCpuProf)
		pprof.StartCPUProfile(f)
		defer f.Close()
		defer pprof.StopCPUProfile()
	}

	// Read all CATH domains, the best-of-all matrix, and the matrix for
	// each aligner.
	domains := readDomains(util.Arg(0))
	boa := readMatrix(domains, util.Arg(1))
	aligners := make([]aligner, 0)
	flibs := make([]flib, 0)
	for i := 2; i < util.NArg(); i += 2 {
		fpath := util.Arg(i)
		if path.Ext(fpath) == ".bowdb" {
			db := util.OpenBowDB(fpath)
			records, err := db.ReadAll()
			util.Assert(err)

			bowed := make([]bow.Bowed, domains.in.Len())
			for _, b := range records {
				if !domains.in.Exists(b.Id) {
					util.Fatalf("Found ID in bowdb that isn't in the list "+
						"of CATH domains provided: %s", b.Id)
				}
				bowed[domains.in.Atom(b.Id)] = b
			}
			flibs = append(flibs, flib{db, bowed, util.Arg(i + 1)})
		} else {
			aligners = append(aligners, aligner{
				readMatrix(domains, fpath),
				util.Arg(i + 1),
			})
		}
	}
	// Now remove CATH domains that don't have a corresponding structure file.
	// We don't do this initially since the matrix files are indexed with
	// respect to all CATH domains (includings ones without structure).
	// This is an artifact of the fact that the matrices were generated with
	// a very old version of CATH.
	domains.removeOldDomains()

	if a := matrixAuc(domains, boa, boa, flagThreshold); a != 1.0 {
		util.Fatalf("Something is wrong. The AUC of the best-of-all matrix "+
			"with respect to itself is %f, but it should be 1.0.", a)
	}

	if len(aligners) > 0 {
		fmt.Println("Computing AUC for aligners...")
		writeAuc := func(aligner aligner) struct{} {
			w := util.CreateFile(aligner.outpath)
			a := matrixAuc(domains, boa, aligner.matrix, flagThreshold)
			fmt.Fprintf(w, "%f\n", a)
			return struct{}{}
		}
		fun.ParMap(writeAuc, aligners)
	}
	if len(flibs) > 0 {
		fmt.Println("Computing AUC for bowdbs...")
		writeAuc := func(flib flib) struct{} {
			w := util.CreateFile(flib.outpath)
			a := flibAuc(domains, boa, flib, flagThreshold)
			fmt.Fprintf(w, "%f\n", a)
			return struct{}{}
		}
		fun.ParMap(writeAuc, flibs)
	}
}
Exemplo n.º 14
0
func main() {
	pdbEntry := util.PDBRead(flag.Arg(0))

	fasEntries := make([]seq.Sequence, 0, 5)
	if !flagSeparateChains {
		var fasEntry seq.Sequence
		if len(pdbEntry.Chains) == 1 {
			fasEntry.Name = chainHeader(pdbEntry.OneChain())
		} else {
			fasEntry.Name = fmt.Sprintf("%s", strings.ToLower(pdbEntry.IdCode))
		}

		seq := make([]seq.Residue, 0, 100)
		for _, chain := range pdbEntry.Chains {
			if isChainUsable(chain) {
				seq = append(seq, chain.Sequence...)
			}
		}
		fasEntry.Residues = seq

		if len(fasEntry.Residues) == 0 {
			util.Fatalf("Could not find any amino acids.")
		}
		fasEntries = append(fasEntries, fasEntry)
	} else {
		for _, chain := range pdbEntry.Chains {
			if !isChainUsable(chain) {
				continue
			}

			fasEntry := seq.Sequence{
				Name:     chainHeader(chain),
				Residues: chain.Sequence,
			}
			fasEntries = append(fasEntries, fasEntry)
		}
	}
	if len(fasEntries) == 0 {
		util.Fatalf("Could not find any chains with amino acids.")
	}

	var fasOut io.Writer
	if flag.NArg() == 1 {
		fasOut = os.Stdout
	} else {
		if len(flagSplit) > 0 {
			util.Fatalf("The '--split' option is incompatible with a single " +
				"output file.")
		}
		fasOut = util.CreateFile(util.Arg(1))
	}

	if len(flagSplit) == 0 {
		util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries),
			"Could not write FASTA file '%s'", fasOut)
	} else {
		for _, entry := range fasEntries {
			fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name))
			out := util.CreateFile(fp)

			w := fasta.NewWriter(out)
			util.Assert(w.Write(entry), "Could not write to '%s'", fp)
			util.Assert(w.Flush(), "Could not write to '%s'", fp)
		}
	}
}
Exemplo n.º 15
0
func main() {
	fmapPath := util.Arg(0)

	fmap := util.FmapRead(fmapPath)
	qchain := getPdbChain(fmapPath)
	stats := newSequenceStats(qchain.Sequence)

	total, trueps := 0, 0
	qcorrupt, tcorrupt := 0, 0
	for _, frags := range fmap.Segments {
		for _, frag := range frags.Frags {
			hit := frag.Hit

			if frag.IsCorrupt() {
				tcorrupt += 1
				stats.incTCorrupt(hit)
				continue
			}

			qatoms := qchain.SequenceCaAtomSlice(hit.QueryStart-1, hit.QueryEnd)
			if qatoms == nil {
				qcorrupt += 1
				stats.incQCorrupt(hit)
				continue
			}

			if len(qatoms) != len(frag.CaAtoms) {
				util.Fatalf("Uncomparable lengths. Query is (%d, %d) while "+
					"template is (%d, %d). Length of query CaAtoms: %d, "+
					"length of template CaAtoms: %d",
					hit.QueryStart, hit.QueryEnd,
					hit.TemplateStart, hit.TemplateEnd,
					len(qatoms), len(frag.CaAtoms))
			}

			if structure.RMSD(qatoms, frag.CaAtoms) <= flagRmsd {
				trueps += 1
				stats.incTruePs(hit)
			}
			total += 1
			stats.incTotal(hit)
		}
	}

	coveredResidues := 0
	for _, resStats := range stats {
		if resStats.trueps >= 1 {
			coveredResidues += 1
		}
	}
	coverage := float64(coveredResidues) / float64(len(qchain.Sequence))

	fmt.Printf("RMSDThreshold: %f\n", flagRmsd)
	fmt.Printf("TotalFragments: %d\n", total)
	fmt.Printf("TruePositives: %d\n", trueps)
	fmt.Printf("Precision: %f\n", float64(trueps)/float64(total))
	fmt.Printf("CorruptQuery: %d\n", qcorrupt)
	fmt.Printf("CorruptTemplate: %d\n", tcorrupt)
	fmt.Printf("TotalResidues: %d\n", len(qchain.Sequence))
	fmt.Printf("CoveredResidues: %d\n", coveredResidues)
	fmt.Printf("Coverage: %f\n", coverage)
}