func TranslateQuerySeqs( query *bytes.Reader, action SearchOperator) (*bytes.Reader, error) { buf := new(bytes.Buffer) f := fasta.NewWriter(buf) reader := fasta.NewReader(query) for i := 0; true; i++ { sequence, err := reader.Read() if err == io.EOF { break } if err != nil { return nil, err } origSeq := sequence.Bytes() n := sequence.Name // generate 6 ORFs transSeqs := Translate(origSeq) for _, s := range transSeqs { result := seq.NewSequenceString(n, string(Reduce(s))) f.Write(result) } } return bytes.NewReader(buf.Bytes()), nil }
// ReadOriginalSeqs reads a FASTA formatted file and returns a channel that // each new sequence is sent to. func ReadOriginalSeqs( fileName string, ignore []byte, ) (chan ReadOriginalSeq, error) { var f io.Reader var err error f, err = os.Open(fileName) if err != nil { return nil, err } if strings.HasSuffix(fileName, ".gz") { f, err = gzip.NewReader(f) if err != nil { return nil, err } } reader := fasta.NewReader(f) seqChan := make(chan ReadOriginalSeq, 200) go func() { for i := 0; true; i++ { sequence, err := reader.Read() if err == io.EOF { close(seqChan) break } if err != nil { seqChan <- ReadOriginalSeq{ Seq: nil, Err: err, } close(seqChan) break } for i, residue := range sequence.Residues { for _, toignore := range ignore { if toignore == byte(residue) { sequence.Residues[i] = 'X' break } } } seqChan <- ReadOriginalSeq{ Seq: NewFastaOriginalSeq(i, sequence), Err: nil, } } }() return seqChan, nil }
func readSeqs(buf *bytes.Buffer) (HHMSecondary, seq.MSA, error) { // Remember, the sequence portion of an HHM file actually has two parts. // The first part is optional and contains secondary structure information. // These SS sequences can be identified by special sequence headers: // "ss_dssp", "sa_dssp", "ss_pred", "ss_conf", and "Consensus". // If a sequence doesn't contain a special header, then that signifies that // we should start reading the MSA, which comes after the SS information. doneSS := false ss := HHMSecondary{} msa := seq.NewMSA() reader := fasta.NewReader(buf) reader.TrustSequences = true seqs, err := reader.ReadAll() if err != nil { return HHMSecondary{}, seq.MSA{}, err } for _, s := range seqs { s := s if !doneSS { switch { case strings.HasPrefix(s.Name, "ss_dssp"): ss.SSdssp = &s case strings.HasPrefix(s.Name, "sa_dssp"): ss.SAdssp = &s case strings.HasPrefix(s.Name, "ss_pred"): ss.SSpred = &s case strings.HasPrefix(s.Name, "ss_conf"): ss.SSconf = &s case strings.HasPrefix(s.Name, "Consensus"): ss.Consensus = &s default: doneSS = true } } if doneSS { msa.Add(s) } } return ss, msa, nil }
func (coarsedb *CoarseDB) readFasta() error { Vprintf("\t\tReading %s...\n", FileCoarseFasta) timer := time.Now() fastaReader := fasta.NewReader(coarsedb.FileFasta) for i := 0; true; i++ { seq, err := fastaReader.Read() if err == io.EOF { break } if err != nil { return err } coarsedb.Seqs = append(coarsedb.Seqs, NewFastaCoarseSeq(i, seq)) } coarsedb.seqsRead = len(coarsedb.Seqs) Vprintf("\t\tDone reading %s (%s).\n", FileCoarseFasta, time.Since(timer)) return nil }
// This file isn't in the repo, so if it isn't there, we just return nil. // The caller must not fail with nil returned. func readDisordered() map[string]seq.Sequence { f, err := os.Open("pdb_disordered.fasta") if err != nil { return nil } r := fasta.NewReader(f) r.TrustSequences = true m := make(map[string]seq.Sequence, 10000) for { s, err := r.Read() if err == io.EOF { break } assert(err) m[strings.ToUpper(s.Name)] = s } return m }
func main() { a3mPath := util.Arg(0) fa3m := util.OpenFile(a3mPath) freader := fasta.NewReader(fa3m) freader.TrustSequences = true seqs, err := freader.ReadAll() util.Assert(err, "Could not read fasta format '%s'", a3mPath) util.Assert(fa3m.Close()) w := util.CreateFile(a3mPath) fwriter := fasta.NewWriter(w) fwriter.Columns = 0 for _, seq := range seqs { if len(seq.Residues) > 0 { util.Assert(fwriter.Write(seq)) } } util.Assert(fwriter.Flush()) util.Assert(w.Close()) }
func getOneFastaSequence(queryFasta string) (s seq.Sequence, err error) { fquery, err := os.Open(queryFasta) if err != nil { return } defer fquery.Close() seqs, err := fasta.NewReader(fquery).ReadAll() if err != nil { return } else if len(seqs) == 0 { err = fmt.Errorf("No sequences found in '%s'.", queryFasta) return } else if len(seqs) > 1 { err = fmt.Errorf("%d sequences found in '%s'. Expected only 1.", len(seqs), queryFasta) return } s = seqs[0] return }
func main() { rfasta := util.OpenFasta(util.Arg(0)) dir := util.Arg(1) util.Assert(os.MkdirAll(dir, 0777)) fr := fasta.NewReader(rfasta) for { s, err := fr.Read() if err != nil { if err == io.EOF { break } util.Assert(err) } s.Name = strings.Fields(s.Name)[0] fw := util.CreateFile(path.Join(dir, s.Name+".fasta")) w := fasta.NewWriter(fw) util.Assert(w.Write(s)) util.Assert(w.Flush()) util.Assert(fw.Close()) } }
func ReduceQuerySeqs( query *bytes.Reader) (*bytes.Reader, error) { buf := new(bytes.Buffer) f := fasta.NewWriter(buf) reader := fasta.NewReader(query) for i := 0; true; i++ { sequence, err := reader.Read() if err == io.EOF { break } if err != nil { return nil, err } rs := Reduce(sequence.Bytes()) n := sequence.Name result := seq.NewSequenceString(n, string(rs)) f.Write(result) } f.Flush() return bytes.NewReader(buf.Bytes()), nil }
// processCompressedQueries runs the query-compression search path: the query
// file is first compressed into its own temporary database, and then each
// coarse query sequence from that database is searched against db in turn.
//
// For every coarse query sequence the steps are:
//
//  1. translate and reduce it into queryBuf,
//  2. run the coarse search against db, with results going through searchBuf,
//  3. expand the coarse hits and, if there are any, rewrite searchBuf as FASTA,
//  4. expand the coarse query itself back into its fine query sequences,
//  5. build a temporary fine database from searchBuf and search the expanded
//     queries against it,
//  6. remove the temporary fine database.
//
// searchBuf is shared with the caller and is reused across iterations.
//
// Failures are passed to handleFatalError / fatalf rather than returned; the
// function itself only ever returns nil.
// NOTE(review): presumably handleFatalError and fatalf terminate the process
// on error — confirm, since execution otherwise continues with invalid state.
func processCompressedQueries(db *cablastp.DB, queryDBConf *cablastp.DBConf,
	inputQueryFilename string, searchBuf *bytes.Buffer) error {

	cablastp.Vprintln("Compressing queries into a database...")
	dbDirLoc := "./tmp_query_database" // TODO this should be a parameter
	qDBDirLoc, err := compressQueries(inputQueryFilename, queryDBConf, dbDirLoc)
	handleFatalError("Error compressing queries", err)

	cablastp.Vprintln("Opening DB for reading")
	qDB, err := cablastp.NewReadDB(qDBDirLoc)
	handleFatalError("Error opening query database", err)

	cablastp.Vprintln("Opening compressed queries for search...")
	compQueryFilename := qDB.CoarseFastaLocation()
	compQueries, err := getInputFasta(compQueryFilename)
	handleFatalError("Error opening compressed query file", err)

	// queryBuf holds the translated form of one coarse query at a time; it is
	// reset at the end of every iteration.
	queryBuf := new(bytes.Buffer)
	f := fasta.NewWriter(queryBuf)
	reader := fasta.NewReader(compQueries)

	// origSeqID counts the coarse query records in read order and is handed
	// to expandCoarseSequence below. The loop ends at io.EOF.
	for origSeqID := 0; true; origSeqID++ {
		sequence, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			fatalf("Could not read input fasta query: %s\n", err)
		}

		origSeq := sequence.Bytes()
		n := sequence.Name
		// generate 6 ORFs
		transSeqs := cablastp.Translate(origSeq)
		for _, s := range transSeqs {
			// reduce each one
			result := seq.NewSequenceString(n, string(cablastp.Reduce(s)))
			// NOTE(review): the error returned by Write is discarded here.
			f.Write(result)
		}
		// Flush so that queryBuf.Bytes() below sees everything just written.
		f.Flush()
		transCoarseQueries := bytes.NewReader(queryBuf.Bytes())

		cablastp.Vprintln("\nBlasting query on coarse database...")
		err = blastCoarse(db, transCoarseQueries, searchBuf)
		handleFatalError("Error blasting coarse database", err)

		cablastp.Vprintln("Decompressing coarse blast hits...")
		expandedSequences, err := expandBlastHits(db, searchBuf)
		handleFatalError("Error decompressing coarse blast hits", err)

		if len(expandedSequences) == 0 {
			cablastp.Vprintln("No results from coarse search")
		} else {
			// Replace the coarse search output in searchBuf with the expanded
			// hits in FASTA form; makeFineBlastDB reads it from there.
			cablastp.Vprintln("Making FASTA from coarse blast hits...")
			searchBuf.Reset()
			err = writeFasta(expandedSequences, searchBuf)
			handleFatalError("Could not create FASTA input from coarse hits", err)

			cablastp.Vprintln("Expanding coarse query...")
			// This err shadows the loop-level err for the rest of the branch.
			expQuery, err := expandCoarseSequence(qDB, origSeqID, &sequence)
			handleFatalError("Could not expand coarse queries", err)

			// Collect the expanded (fine) query sequences as FASTA text.
			fineQueryBuf := new(bytes.Buffer)
			fineWriter := fasta.NewWriter(fineQueryBuf)
			for _, fineQuery := range expQuery {
				fineQueryBytes := fineQuery.FastaSeq().Bytes() // <- Is This the same as fineQuery.Residues()?
				fineName := fineQuery.Name
				writeSeq := seq.NewSequenceString(fineName, string(fineQueryBytes))
				// NOTE(review): the error returned by Write is discarded here.
				fineWriter.Write(writeSeq)
			}
			fineWriter.Flush()
			transFineQueries := bytes.NewReader(fineQueryBuf.Bytes())

			cablastp.Vprintln("Building fine BLAST target database...")
			targetTmpDir, err := makeFineBlastDB(db, searchBuf)
			handleFatalError("Could not create fine database to search on", err)

			cablastp.Vprintln("Blasting original query on fine database...")
			err = blastFine(db, targetTmpDir, transFineQueries)
			handleFatalError("Error blasting fine database", err)

			// The fine database is only needed for this one query.
			err = os.RemoveAll(targetTmpDir)
			handleFatalError("Could not remove fine database", err)
		}
		// Start the next coarse query with an empty translation buffer.
		queryBuf.Reset()
	}

	cablastp.Vprintln("Cleaning up...")
	err = os.RemoveAll(dbDirLoc)
	handleFatalError("Could not remove query database", err)
	return nil
}
// main is the entry point of the search command. It expects two positional
// arguments: the database to open (argument 0) and the FASTA file holding the
// queries (argument 1).
//
// With flagCompressQuery set, all work is delegated to
// processCompressedQueries. Otherwise the queries are translated into
// queryBuf and searched either in chunks of flagQueryChunkSize (when
// flagIterativeQuery is set) or in one batch after the translation loop.
func main() {
	searchBuf := new(bytes.Buffer) // might need more than 1 buffer

	// NOTE(review): execution continues past flag.Usage() unless the
	// installed Usage function exits — confirm.
	if flag.NArg() != 2 {
		flag.Usage()
	}

	// If the quiet flag isn't set, enable verbose output.
	if !flagQuiet {
		cablastp.Verbose = true
	}

	queryDBConf := argDBConf.DeepCopy() // deep copy of the default DBConf
	// updated by the args
	inputFastaQueryName := flag.Arg(1)

	db, err := cablastp.NewReadDB(flag.Arg(0))
	if err != nil {
		fatalf("Could not open '%s' database: %s\n", flag.Arg(0), err)
	}

	// For query-compression mode, we first run compression on the query file
	// then coarse-coarse search, decompress both, fine-fine search.
	// otherwise, just coarse search, decompress results, fine search.
	// iterate over the query sequences in the input fasta
	// initially, only implement standard search.
	if flagCompressQuery {
		// The error result of processCompressedQueries is not inspected.
		processCompressedQueries(db, queryDBConf, inputFastaQueryName, searchBuf)
	} else {
		queryBuf := new(bytes.Buffer) // might need more than 1 buffer
		inputFastaQuery, err := getInputFasta(inputFastaQueryName)
		handleFatalError("Could not read input fasta query", err)
		f := fasta.NewWriter(queryBuf)
		reader := fasta.NewReader(inputFastaQuery)

		// NOTE(review): this loop has a constant-true condition and contains
		// no break or return, so it can only end if translateQueries or
		// processQueries terminates the program. If they do not, the batch
		// branch below and cleanup(db) are never reached — confirm how end of
		// input is meant to be detected.
		for i := 0; true; i++ {
			if flagIterativeQuery {
				// Translate one chunk of queries, search it, then start the
				// next chunk with an empty buffer.
				// NOTE(review): f is not flushed before queryBuf.Bytes() is
				// read here, unlike in the batch branch below — confirm that
				// translateQueries flushes.
				for j := 0; j < flagQueryChunkSize; j++ {
					translateQueries(reader, f)
				}
				transQueries := bytes.NewReader(queryBuf.Bytes())
				processQueries(db, transQueries, searchBuf)
				queryBuf.Reset()
			} else {
				translateQueries(reader, f)
			}
		}

		if !flagIterativeQuery {
			cablastp.Vprintln("\nProcessing Queries in one batch...")
			f.Flush()
			transQueries := bytes.NewReader(queryBuf.Bytes())
			processQueries(db, transQueries, searchBuf)
		}
	}
	cleanup(db)
}
// BowerOpen reads the contents of `fpath` and attempts to interpret it as a // value (or values) that implement the `bow.Bower` interface. The list of // `bow.Bower` values returned is guaranteed to be homogenous: they will // either be all `bow.SequenceBower` values or `bow.StructureBower` values. // // The actual return value of the function is a receive-only channel of BowerErr // values. Each BowerErr value either has the `Bower` member set or has the // `err` field set to an error that prevented the file from being opened. // Errors in this case are reserved for files that appear to be capable of // producing a BOW, but were unable to be read. // // If the fpath given cannot be detected as a bower file, then a closed empty // channel will be returned. A warning is also emitted to stderr. // // `lib` is a fragment library that is used to help interpret what kind of // value must be in `r`. For example, if `lib` is a sequence fragment library, // then `BowerOpen` is guaranteed to return a `Bower` value that implements the // `bow.SequenceBower` interface. // // As of now, `BowerOpen` can read these types of files: // // File extension Format Interpretation // *.{ent.gz,pdb,ent} PDB whatever `lib` is // *.{fasta,fas,fasta.gz,fas.gz} FASTA sequence // everything else error invalid // // Note that special syntax for PDB file names is supported. Namely, chain // identifiers can be appended to the end of the file name, and only that chain // will be included in the `bow.Bower` value. Otherwise, all chains in the PDB // entry will be returned as individual `bow.Bower` values. // // The format is simple and easily demonstrated by examples: // // 1ctf.end.gz Chains A and B // 1ctf.ent.gz:A Only chain A // 1ctf.ent.gz:B Only chain B // 1ctf.ent.gz:A,B Chains A and B // // A secondary format is also accepted. 
The following are equivalent to their // corresponding examples above: // // 1ctf // 1ctfA // 1ctfB // 1ctf:A,B // // Finally, `fpath` may be the name of a PDB identifier and its file path will // be inferred from the value of the `PDB_PATH` environment variable. // Alternatively, `fpath` may be the name of a SCOP domain, and its // corresponding PDB file will be inferred from the value of the // `SCOP_PDB_PATH` environment variable. func BowerOpen(fpath string, lib fragbag.Library, models bool) <-chan BowerErr { if lib == nil { Fatalf("Files can only be converted to Fragbag frequency vectors " + "if a fragment library is specified.") } bowers := make(chan BowerErr, 100) switch { case IsPDB(fpath): go func() { defer close(bowers) entry, chains, err := PDBOpen(fpath) if err != nil { err = fmt.Errorf("Error reading '%s': %s", fpath, err) bowers <- BowerErr{Err: err} return } if fragbag.IsStructure(lib) { for i := range chains { if !chains[i].IsProtein() { continue } if !models { b := bow.BowerFromChain(chains[i]) bowers <- BowerErr{Bower: b} } else { for _, m := range chains[i].Models { b := bow.BowerFromModel(m) bowers <- BowerErr{Bower: b} } } } } else { for i := range chains { if !chains[i].IsProtein() { continue } s := chains[i].AsSequence() if s.Len() == 0 { s = aminoFromStructure(chains[i]) if s.Len() == 0 { Warnf("Chain '%s:%c' has no amino sequence.", entry.IdCode, chains[i].Ident) continue } } bowers <- BowerErr{Bower: bow.BowerFromSequence(s)} } } }() return bowers case IsFasta(fpath) && !fragbag.IsStructure(lib): go func() { defer close(bowers) r, fp, err := fastaOpen(fpath) if err != nil { err = fmt.Errorf("Error reading file: %s", err) bowers <- BowerErr{Err: err} return } defer fp.Close() fr := fasta.NewReader(r) for { s, err := fr.Read() if err != nil { if err == io.EOF { break } err = fmt.Errorf("Error reading file: %s", err) bowers <- BowerErr{Err: err} return } bowers <- BowerErr{Bower: bow.BowerFromSequence(s)} } }() return bowers } Warnf("I don't 
know how to produce a Fragbag frequency vector "+ "from the file '%s'.", fpath) close(bowers) return bowers }
// ReadTrustedFasta will read a single MSA from trusted input, where the input // can be formatted/ in FASTA format. Sequences are read until io.EOF. // // "Trust" in this context means that the input doesn't contain any illegal // characters in the sequence. Trusting the input should be faster. // // If you need to read A2M or A3M aligned formats, use ReadTrusted. func ReadTrustedFasta(reader io.Reader) (seq.MSA, error) { r := fasta.NewReader(reader) r.TrustSequences = true return read(r, true) }
// Read will read a single MSA from the input, where the input can be formatted // in A2M or A3M formats. Sequences are read until io.EOF. // // If you need to read FASTA aligned format, use ReadFasta. func Read(reader io.Reader) (seq.MSA, error) { r := fasta.NewReader(reader) r.TrustSequences = false return read(r, false) }