func init() { var err error m, err = NewMulti("example multi", []seq.Sequence{ linear.NewSeq("example DNA 1", []alphabet.Letter("ACGCTGACTTGGTGCACGT"), alphabet.DNA), linear.NewSeq("example DNA 2", []alphabet.Letter("ACGGTGACCTGGCGCGCAT"), alphabet.DNA), linear.NewSeq("example DNA 3", []alphabet.Letter("ACGATGACGTGGCGCTCAT"), alphabet.DNA), }, seq.DefaultConsensus) if err != nil { panic(err) } }
func (s *S) TestReadFasta(c *check.C) { var ( obtainN []string obtainS [][]alphabet.Letter ) for _, fa := range fas { r := NewReader(bytes.NewBufferString(fa), linear.NewSeq("", nil, alphabet.Protein)) for { if s, err := r.Read(); err != nil { if err == io.EOF { break } else { c.Fatalf("Failed to read %q: %s", fa, err) } } else { t := s.(*linear.Seq) header := t.Name() if desc := t.Description(); len(desc) > 0 { header += " " + desc } obtainN = append(obtainN, header) obtainS = append(obtainS, t.Slice().(alphabet.Letters)) } } c.Check(obtainN, check.DeepEquals, expectN) obtainN = nil for i := range obtainS { c.Check(len(obtainS[i]), check.Equals, len(expectS[i])) c.Check(obtainS[i], check.DeepEquals, expectS[i]) } obtainS = nil } }
func (s *S) TestReadFromFunc(c *check.C) { var ( obtainNfa []string obtainSfa [][]alphabet.Letter ) sc := seqio.NewScannerFromFunc( fasta.NewReader( bytes.NewBufferString(testaln0), linear.NewSeq("", nil, alphabet.Protein), ).Read, ) for sc.Next() { t := sc.Seq().(*linear.Seq) header := t.Name() if desc := t.Description(); len(desc) > 0 { header += " " + desc } obtainNfa = append(obtainNfa, header) obtainSfa = append(obtainSfa, t.Slice().(alphabet.Letters)) } c.Check(sc.Error(), check.Equals, nil) c.Check(obtainNfa, check.DeepEquals, expectNfa) for i := range obtainSfa { c.Check(len(obtainSfa[i]), check.Equals, len(expectSfa[i])) c.Check(obtainSfa[i], check.DeepEquals, expectSfa[i]) } }
func (s *S) TestReadFasta(c *check.C) { r := fasta.NewReader(strings.NewReader(fa), linear.NewSeq("", nil, alphabet.Protein)) m, _ := multi.NewMulti("", nil, seq.DefaultConsensus) a, err := NewReader(r, m).Read() c.Check(err, check.Equals, nil) c.Check(a.Rows(), check.Equals, 11) }
func main() { flag.Parse() if *exclude == "" { flag.Usage() os.Exit(1) } nameSet := make(map[string]struct{}) f, err := os.Open(*exclude) if err != nil { log.Fatalf("failed to open exclude file %q: %v", *exclude, err) } ls := bufio.NewScanner(f) for ls.Scan() { nameSet[ls.Text()] = struct{}{} } err = ls.Err() if err != nil { log.Fatalf("failed to read exclude file: %v", err) } sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA))) for sc.Next() { s := sc.Seq().(*linear.Seq) if _, ok := nameSet[s.ID]; ok { continue } fmt.Printf("%60a\n", s) } if err := sc.Error(); err != nil { log.Fatalf("error during gff read: %v", err) } }
func ExampleSet_AppendEach() { ss := [][]alphabet.Letter{ []alphabet.Letter("ACGCTGACTTGGTGCACGT"), []alphabet.Letter("ACGACTGGGACGT"), []alphabet.Letter("ACGCTGACTGGCCGT"), []alphabet.Letter("GCCTTTGCACGT"), } set = make(Set, 4) for i := range set { set[i] = linear.NewSeq(fmt.Sprintf("example DNA %d", i), ss[i], alphabet.DNA) } as := [][]alphabet.QLetter{ alphabet.QLetter{L: 'A'}.Repeat(2), alphabet.QLetter{L: 'C'}.Repeat(2), alphabet.QLetter{L: 'G'}.Repeat(2), alphabet.QLetter{L: 'T'}.Repeat(2), } set.AppendEach(as) for _, s := range set { fmt.Printf("%-s\n", s) } // Output: // ACGCTGACTTGGTGCACGTAA // ACGACTGGGACGTCC // ACGCTGACTGGCCGTGG // GCCTTTGCACGTTT }
// main reads features from a GFF file (-in), indexes them by the read
// name carried in each feature's "Read" attribute, then for each
// reference fasta file given as an argument prints the attribute's
// [start,end) slice of every matching sequence in fasta format.
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}
	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	// events maps a read name to its features; the "Read" attribute
	// holds exactly "<name> <start> <end>".
	events := make(map[string][]*gff.Feature)
	fsc := featio.NewScanner(gff.NewReader(f))
	for fsc.Next() {
		f := fsc.Feat().(*gff.Feature)
		fields := strings.Fields(f.FeatAttributes.Get("Read"))
		if len(fields) != 3 {
			log.Fatalf("bad record: %+v", f)
		}
		events[fields[0]] = append(events[fields[0]], f)
	}
	if err := fsc.Error(); err != nil {
		log.Fatalf("error during gff read: %v", err)
	}
	f.Close()
	for _, ref := range flag.Args() {
		f, err = os.Open(ref)
		if err != nil {
			log.Fatalf("failed to open reference %q: %v", ref, err)
		}
		ssc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA)))
		for ssc.Next() {
			seq := ssc.Seq().(*linear.Seq)
			for _, f := range events[seq.Name()] {
				fields := strings.Fields(f.FeatAttributes.Get("Read"))
				if len(fields) != 3 {
					log.Fatalf("bad record: %+v", f)
				}
				start, err := strconv.Atoi(fields[1])
				if err != nil {
					log.Fatalf("failed to get start coordinate: %v", err)
				}
				end, err := strconv.Atoi(fields[2])
				if err != nil {
					log.Fatalf("failed to get end coordinate: %v", err)
				}
				// Shallow copy: tmp shares seq's backing array; only
				// the ID string and the slice bounds are modified.
				tmp := *seq
				tmp.ID += fmt.Sprintf("//%d_%d", start, end)
				tmp.Seq = tmp.Seq[start:end]
				fmt.Printf("%60a\n", &tmp)
			}
		}
		if err := ssc.Error(); err != nil {
			log.Fatalf("error during fasta read: %v", err)
		}
		f.Close()
	}
}
func (s *S) SetUpSuite(c *check.C) { MaxKmerLen = 14 s.Seq = linear.NewSeq("", nil, alphabet.DNA) s.Seq.Seq = make(alphabet.Letters, testLen) for i := range s.Seq.Seq { s.Seq.Seq[i] = [...]alphabet.Letter{'A', 'C', 'G', 'T', 'a', 'c', 'g', 't'}[rand.Int()%8] } }
// ExampleNewMulti demonstrates constructing a Multi alignment from
// three linear DNA sequences and printing it with its consensus.
func ExampleNewMulti() {
	m, err := NewMulti("example multi", []seq.Sequence{
		linear.NewSeq("example DNA 1", []alphabet.Letter("ACGCTGACTTGGTGCACGT"), alphabet.DNA),
		linear.NewSeq("example DNA 2", []alphabet.Letter("ACGGTGACCTGGCGCGCAT"), alphabet.DNA),
		linear.NewSeq("example DNA 3", []alphabet.Letter("ACGATGACGTGGCGCTCAT"), alphabet.DNA),
	}, seq.DefaultConsensus)
	if err != nil {
		// Examples have no failure channel; drop the error and
		// produce no output, which would fail the Output check.
		return
	}
	fmt.Printf("%- s\n\n%-s\n", m, m.Consensus(false))
	// Output:
	// ACGCTGACTTGGTGCACGT
	// ACGGTGACCTGGCGCGCAT
	// ACGATGACGTGGCGCTCAT
	//
	// acgntgacntggcgcncat
}
func getFasta(fn string) (seq.Sequence, error) { fasta_file, err := os.Open(fn) if err != nil { fmt.Println("Erro ao ler o arquivo", err) } defer fasta_file.Close() var s []alphabet.Letter t := linear.NewSeq("", s, alphabet.Protein) reader := fasta.NewReader(fasta_file, t) seq, _ := reader.Read() return seq, nil }
// writeFlankSeqs writes fasta files containing the sequence of unmapped flanks // identified in the primary hits provided. cutoff specifies the minimum sequence // length to consider. left and right specify the filenames for the left and right // flank fasta sequence files. func writeFlankSeqs(reads string, hits hitSet, cutoff int, left, right string) error { f, err := os.Open(reads) if err != nil { return err } defer f.Close() lf, err := os.Create(left) if err != nil { return err } rf, err := os.Create(right) if err != nil { return err } r := fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA)) sc := seqio.NewScanner(r) for sc.Next() { seq := sc.Seq().(*linear.Seq) h, ok := hits[seq.Name()] if !ok { continue } all := seq.Seq if h.qStart >= cutoff { seq.Seq = all[:h.qStart] _, err := fmt.Fprintf(lf, "%60a\n", seq) if err != nil { return err } } if h.qLen-h.qEnd >= cutoff { seq.Seq = all[h.qEnd:] _, err := fmt.Fprintf(rf, "%60a\n", seq) if err != nil { return err } } } err = sc.Error() if err != nil { return err } err = lf.Close() if err != nil { return err } return rf.Close() }
func readFasta(fn string) (name string, seq string, err error) { fFasta, err := os.Open(fn) defer fFasta.Close() if err != nil { return "", "", err } t := linear.NewSeq("", nil, alphabet.Protein) reader := fasta.NewReader(fFasta, t) s, err := reader.Read() if err != nil { return "", "", err } sl := s.(*linear.Seq) return sl.Name(), sl.String(), nil }
func readContigs(file string) (map[string]int, error) { f, err := os.Open(file) if err != nil { return nil, err } lengths := make(map[string]int) sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA))) for sc.Next() { s := sc.Seq() lengths[s.Name()] = s.Len() } if err != nil { log.Fatalf("error during fasta read: %v", err) } return lengths, nil }
// Clone returns a copy of the row's sequence as a new linear.Seq.
func (r Row) Clone() seq.Sequence {
	// Collect this row's letter from each column of the alignment.
	b := make([]alphabet.Letter, r.Len())
	for i, c := range r.Align.Seq {
		b[i] = c[r.Row]
	}
	// NOTE(review): these look like leftover debug assertions — an
	// out-of-range r.Row would already have panicked in the indexing
	// loop above before reaching these checks. Confirm whether they
	// can be removed.
	switch {
	case r.Row < 0:
		panic("under")
	case r.Row >= r.Align.Rows():
		panic("bang over Rows()")
	case r.Row >= len(r.Align.SubAnnotations):
		panic(fmt.Sprintf("bang over len(SubAnns): %d %d", r.Row, len(r.Align.SubAnnotations)))
	}
	return linear.NewSeq(r.Name(), b, r.Alphabet())
}
func readContigs(file string) (map[string]*linear.Seq, error) { f, err := os.Open(file) if err != nil { return nil, err } seqs := make(map[string]*linear.Seq) sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped))) for sc.Next() { s := sc.Seq().(*linear.Seq) seqs[s.ID] = s } if err != nil { return nil, err } return seqs, nil }
// main reads a fasta file (-in) whose sequence IDs have the form
// "<name>/<coords>" and writes two text files listing the names that
// occur exactly once and those that occur more than once.
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}
	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	defer f.Close()
	// names maps the ID prefix (before the last '/') to the list of
	// coordinate suffixes seen for it.
	names := make(map[string][]string)
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	for sc.Next() {
		seq := sc.Seq().(*linear.Seq)
		// NOTE(review): if an ID contains no '/', LastIndex returns -1
		// and the slicing below panics — assumes all IDs carry one.
		idx := strings.LastIndex(seq.ID, "/")
		names[seq.ID[:idx]] = append(names[seq.ID[:idx]], seq.ID[idx+1:])
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
	// Explicit close in addition to the defer above; the second
	// close's error is ignored.
	f.Close()
	base := filepath.Base(*in)
	unique, err := os.Create(base + ".unique.text")
	if err != nil {
		log.Fatalf("failed to create %q: %v", base+".unique.text", err)
	}
	defer unique.Close()
	nonUnique, err := os.Create(base + ".non-unique.text")
	if err != nil {
		log.Fatalf("failed to create %q: %v", base+".non-unique.text", err)
	}
	defer nonUnique.Close()
	for name, coords := range names {
		switch len(coords) {
		case 0:
			// Unreachable: entries are only created with >=1 coord.
		case 1:
			fmt.Fprintln(unique, name)
		default:
			fmt.Fprintf(nonUnique, "%s\t%v\n", name, coords)
		}
	}
}
func main() { flag.Parse() if *in == "" { flag.Usage() os.Exit(1) } inFile, err := os.Open(*in) if err != nil { log.Fatalf("failed to open input:%v", err) } defer inFile.Close() *in = filepath.Base(*in) sc := seqio.NewScanner(fasta.NewReader(inFile, linear.NewSeq("", nil, alphabet.DNA))) var i, size int out, err := os.Create(fmt.Sprintf("%s-%d.fa", *in, i)) for sc.Next() { if sc.Seq().Len() < *cut { continue } if size != 0 && size+sc.Seq().Len() > *bundle { err = out.Close() if err != nil { log.Fatalf("failed to close file bundle %d: %v", i, err) } i++ size = 0 out, err = os.Create(fmt.Sprintf("%s-%d.fa", *in, i)) if err != nil { log.Fatalf("failed to open file bundle %d: %v", i, err) } } size += sc.Seq().Len() fmt.Fprintf(out, "%60a\n", sc.Seq()) } if sc.Error() != nil { log.Fatal(sc.Error()) } err = out.Close() if err != nil { log.Fatalf("failed to close file bundle %d: %v", i, err) } }
// metaSeq parses an inline ##DNA/##RNA/##Protein sequence block from a
// GFF header, accumulating residue lines until the matching
// "##end-<moltype>" marker and returning them as a linear sequence.
func (r *Reader) metaSeq(moltype, id []byte) (seq.Sequence, error) {
	var line, body []byte
	var err error
	for {
		line, err = r.r.ReadBytes('\n')
		if err != nil {
			if err == io.EOF {
				return nil, err
			}
			return nil, &csv.ParseError{Line: r.line, Err: err}
		}
		r.line++
		line = bytes.TrimSpace(line)
		if len(line) == 0 {
			continue
		}
		// Every continuation line must carry the "##" meta prefix.
		if len(line) < 2 || !bytes.HasPrefix(line, []byte("##")) {
			return nil, &csv.ParseError{Line: r.line, Err: ErrBadSequence}
		}
		line = bytes.TrimSpace(line[2:])
		if unsafeString(line) == "end-"+unsafeString(moltype) {
			break
		} else {
			// Strip internal whitespace before accumulating residues.
			line = bytes.Join(bytes.Fields(line), nil)
			body = append(body, line...)
		}
	}
	// Map the declared moltype onto the corresponding alphabet.
	var alpha alphabet.Alphabet
	switch feat.ParseMoltype(unsafeString(moltype)) {
	case feat.DNA:
		alpha = alphabet.DNA
	case feat.RNA:
		alpha = alphabet.RNA
	case feat.Protein:
		alpha = alphabet.Protein
	default:
		return nil, ErrBadMoltype
	}
	// err is nil when the loop exits via break.
	s := linear.NewSeq(string(id), alphabet.BytesToLetters(body), alpha)
	return s, err
}
func (s *S) TestWriteFasta(c *check.C) { fa := fas[0] b := &bytes.Buffer{} w := NewWriter(b, 60) seq := linear.NewSeq("", nil, alphabet.Protein) var n int for i := range expectN { seq.ID = expectN[i] seq.Seq = expectS[i] _n, err := w.Write(seq) if err != nil { c.Fatalf("Failed to write to buffer: %s", err) } n += _n } c.Check(n, check.Equals, b.Len()) c.Check(string(b.Bytes()), check.Equals, fa) }
func mangle() { seen := make(map[string]bool) hash := sha1.New() sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA))) for sc.Next() { s := sc.Seq().(*linear.Seq) if s.Desc == "" { s.Desc = s.ID } else { s.Desc = fmt.Sprintf("%s %s", s.ID, s.Desc) } hash.Write([]byte(s.Desc)) s.ID = fmt.Sprintf("%040x", hash.Sum(nil)) if seen[s.ID] { log.Fatalf("duplicate sha1: %s", s.ID) } seen[s.ID] = true hash.Reset() fmt.Printf("%60a\n", s) } }
// main scores every sequence in the input fasta (-in) with the
// selected complexity function (*typ: 0=WF, 1=entropic, 2=Z) and
// either prints the score distribution (-dist) or emits the sequences
// whose complexity meets the threshold.
func main() {
	flag.Parse()
	if *in == "" || *typ < 0 || 2 < *typ {
		flag.Usage()
		os.Exit(1)
	}
	// Select the complexity function by flag index.
	cfn := []func(s seq.Sequence, start, end int) (float64, error){
		0: complexity.WF,
		1: complexity.Entropic,
		2: complexity.Z,
	}[*typ]
	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	defer f.Close()
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	for sc.Next() {
		seq := sc.Seq().(*linear.Seq)
		// err is always nil for a linear.Seq Start() and End().
		c, _ := cfn(seq, seq.Start(), seq.End())
		if *dist {
			fmt.Printf("%s\t%v\t%d\n", seq.Name(), c, seq.Len())
			continue
		}
		if c >= *thresh {
			fmt.Printf("%60a\n", seq)
		}
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
}
func unmangle(mapfile string) { table := make(map[string]string) sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA))) for sc.Next() { s := sc.Seq().(*linear.Seq) id := strings.Fields(s.Desc)[0] if id == "" { log.Fatalf("no id for sequence %s", s.ID) } table[s.ID] = id } f, err := os.Open(mapfile) if err != nil { log.Fatalf("failed to open map file %q: %v", mapfile, err) } s := bufio.NewScanner(f) for s.Scan() { line := s.Text() fields := strings.Fields(line) if len(fields) <= *queryNameField { log.Fatalf("unexpected number of fields in line %q", line) } id := table[fields[*queryNameField]] if id == "" { log.Fatalf("no id for map query %s", fields[*queryNameField]) } fields[*queryNameField] = id for i, f := range fields { if i != 0 { fmt.Print("\t") } fmt.Print(f) } fmt.Println() } }
// Helper func stringToSeq(s string) *linear.Seq { return linear.NewSeq("", alphabet.BytesToLetters([]byte(s)), alphabet.DNA) }
// main reads GFF features from stdin, collecting the read name and
// coordinate range from each feature's "Read" attribute, then scans
// the SAM read files given as arguments and prints the corresponding
// subsequence of each named read in fasta format.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "invalid invocation: must have at least one reads file")
		os.Exit(1)
	}
	// extract maps read name to the coordinate pair to cut out.
	extract := make(map[string][2]int)
	sc := featio.NewScanner(gff.NewReader(os.Stdin))
	for sc.Next() {
		f := sc.Feat().(*gff.Feature)
		read := f.FeatAttributes.Get("Read")
		if read == "" {
			continue
		}
		// NOTE(review): assumes the attribute has at least three
		// fields ("<name> <start> <end>"); fewer would panic here.
		fields := strings.Fields(read)
		name := fields[0]
		start, err := strconv.Atoi(fields[1])
		if err != nil {
			log.Fatalf("failed to parse %q: %v", read, err)
		}
		end, err := strconv.Atoi(fields[2])
		if err != nil {
			log.Fatalf("failed to parse %q: %v", read, err)
		}
		extract[name] = [2]int{start, end}
	}
	err := sc.Error()
	if err != nil {
		log.Fatalf("error during GFF read: %v", err)
	}
	for _, reads := range os.Args[1:] {
		sf, err := os.Open(reads)
		if err != nil {
			log.Fatalf("failed to open %q: %v", reads, err)
		}
		sr, err := sam.NewReader(sf)
		if err != nil {
			log.Fatalf("failed to open SAM input %q: %v", reads, err)
		}
		for {
			r, err := sr.Read()
			if err != nil {
				if err != io.EOF {
					log.Fatalf("unexpected error reading SAM: %v", err)
				}
				break
			}
			v, ok := extract[r.Name]
			if !ok {
				continue
			}
			// Currently reefer only expects a single hit per read,
			// so any multiples are due to duplicate read file input.
			// Update this behaviour if we change reefer to look at
			// remapping soft-clipped segments.
			delete(extract, r.Name)
			reverse := r.Flags&sam.Reverse != 0
			rng := fmt.Sprintf("//%d_%d", v[0], v[1])
			if reverse {
				rng += "(-)"
				// Flip the coordinates onto the stored (forward)
				// orientation of the sequence.
				// NOTE(review): len shadows the builtin here.
				len := r.Seq.Length
				v[0], v[1] = len-v[1], len-v[0]
			}
			// Convert the one-based start to a zero-based index.
			v[0] = feat.OneToZero(v[0])
			s := linear.NewSeq(
				r.Name+rng,
				alphabet.BytesToLetters(r.Seq.Expand())[v[0]:v[1]],
				alphabet.DNA,
			)
			if reverse {
				s.Desc = "(sequence revcomp relative to read)"
			}
			fmt.Printf("%60a\n", s)
		}
		sf.Close()
	}
}
##DNA <seqname> ##acggctcggattggcgctggatgatagatcagacgac ##... ##end-DNA ##RNA <seqname> ##acggcucggauuggcgcuggaugauagaucagacgac ##... ##end-RNA ##Protein <seqname> ##MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF ##... ##end-Protein ##sequence-region <seqname> 1 5 `, feat: []feat.Feature{ linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggctcggattggcgctggatgatagatcagacgac...")), alphabet.DNA), linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggcucggauuggcgcuggaugauagaucagacgac...")), alphabet.RNA), linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF...")), alphabet.Protein), &Region{Sequence: Sequence{SeqName: "<seqname>", Type: feat.DNA}, RegionStart: 0, RegionEnd: 5}, }, write: []interface{}{ 2, "source-version <source> <version-text>", mustTime(time.Parse(Astronomical, "1997-11-08")), Sequence{SeqName: "<seqname>", Type: feat.DNA}, linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggctcggattggcgctggatgatagatcagacgac...")), alphabet.DNA), linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggcucggauuggcgcuggaugauagaucagacgac...")), alphabet.RNA), linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF...")), alphabet.Protein), &Region{Sequence: Sequence{SeqName: "<seqname>"}, RegionStart: 0, RegionEnd: 5}, }, },
// main reads a family of sequences from a fasta file (first argument),
// optionally subsamples it to maxFam members, records the members
// used, builds a consensus via a multiple alignment, and writes the
// consensus, length-ratio, and alignment files into the configured
// output directories.
func main() {
	flag.IntVar(&maxFam, "maxFam", 0, "maxFam indicates maximum family size considered (0 == no limit).")
	flag.BoolVar(&subSample, "subsample", false, "choose maxFam members of a family if the family has more than maxFam members.")
	flag.BoolVar(&consFasta, "fasta", false, "output consensus as fasta with quality case filtering.")
	flag.StringVar(&cDir, "cDir", "", "target directory for consensus output. If not empty Dir is deleted first.")
	flag.StringVar(&aDir, "aDir", "", "target directory for alignment output. If not empty dir is deleted first.")
	flag.StringVar(&sDir, "sDir", "", "target directory for sequence information output. If not empty dir is deleted first.")
	flag.Parse()
	fmt.Printf("Initialising Files: %s\n", flag.Args()[0])
	checks(cDir, aDir, sDir)

	// Opening files.
	// NOTE(review): on open failure this only logs and continues with
	// a nil file; the fasta scanner below will then fail.
	f, fErr := os.Open(flag.Args()[0])
	if fErr != nil {
		log.Printf("error: could not open %s to read %v", flag.Args()[0], fErr)
	}
	defer f.Close()

	// Reading in sequences.
	var v []seq.Sequence
	r := fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA))
	sc := seqio.NewScanner(r)
	for sc.Next() {
		v = append(v, sc.Seq())
	}
	if sc.Error() != nil {
		log.Fatalf("failed to read sequences: %v", sc.Error())
	}

	// Checking that there aren't too many sequences.
	if maxFam != 0 && !subSample && len(v) > maxFam {
		log.Fatalf("too many sequences: %d", len(v))
	}
	var aveLength int
	if subSample {
		// Shuffle first.
		w := make([]seq.Sequence, 0, len(v))
		for _, j := range rand.Perm(len(v)) {
			w = append(w, v[j])
		}
		v = w
		// Calculating lengths and averages for all sequences read.
		var LenAllSeq = make([]int, len(v))
		for num, i := range v {
			LenAllSeq[num] = i.Len()
		}
		// fmt.Printf("The variable: %v\n", LenAllSeq)
		totalLength := 0
		for i := 0; i < len(v); i++ {
			totalLength = totalLength + LenAllSeq[i]
		}
		// NOTE(review): aveLength stays 0 when -subsample is not set,
		// making cutoffSubs below a float division by zero (+Inf).
		aveLength = totalLength / len(v)
		fmt.Printf("Average length: %v\n", aveLength)
	}
	// FIXME(brittany): Make this more scientifically robust.
	var (
		sampled int
		buf     bytes.Buffer
	)
	// Creating a file for the sequence information.
	seqFile := fmt.Sprintf("%s_included_sequences.fq", flag.Args()[0])
	seqOut, sErr := os.Create(filepath.Join(aDir, seqFile))
	if sErr != nil {
		log.Fatalf("failed to create %s: %v", seqFile, sErr)
	}
	defer seqOut.Close()
	// NOTE(review): this wraps the *input* file f, not seqOut, and the
	// writer is never written to — probably meant bufio.NewWriter(seqOut).
	seqBufOut := bufio.NewWriter(f)
	defer seqBufOut.Flush()
	// Printing subsampled sequences to a file.
	var (
		LenSubSeq = make([]int, maxFam) // length of each sequence
	)
	// NOTE(review): when maxFam == 0 the LenSubSeq indexing below
	// panics and the divisions by maxFam divide by zero; the tool
	// appears to assume maxFam > 0 on this path.
	for num, s := range v {
		if sampled++; subSample && sampled > maxFam {
			break
		}
		fmt.Fprintf(&buf, "%60a\n", s)
		fmt.Fprintf(seqOut, "including: %s %s\n", s.Name(), s.Description())
		// Calculating lengths and averages for the subsampled sequences.
		LenSubSeq[num] = s.Len()
	}
	totalSubLength := 0
	for i := 0; i < maxFam; i++ {
		totalSubLength = totalSubLength + LenSubSeq[i]
	}
	fmt.Printf("total length of subbed: %v\n", totalSubLength)
	aveSubLength := totalSubLength / maxFam
	fmt.Printf("Average length of subbed: %v\n", aveSubLength)
	fmt.Printf("The sub seq variable for %v sequences: %v\n", maxFam, LenSubSeq)
	var (
		c   *linear.QSeq
		m   *multi.Multi
		err error
	)
	// Creating the consensus.
	fmt.Println("Creating consensus")
	c, m, err = consensus(&buf)
	if err != nil {
		log.Printf("failed to generate consensus for %s: %v", flag.Args()[0], err)
		return
	}
	c.ID = fmt.Sprintf("%s_consensus", flag.Args()[0])
	// sampled is incremented before the break above, so sampled-1 is
	// the number actually written.
	c.Desc = fmt.Sprintf("%d members total %v members sampled", len(v), sampled-1)
	c.Threshold = 42
	c.QFilter = seq.CaseFilter
	conLength := c.Len() // Find a way to make the consensus the same case.
	// Calculating cutoff of consensus length to mean sequence length.
	fmt.Printf("Length of consensus:%v\n", conLength)
	cutoffSubs := float64(conLength) / float64(aveLength)
	cutoffTotal := float64(conLength) / float64(aveSubLength)
	fmt.Printf("ratio for all seqs: %f, \nratio for just sub-sampled: %f\n", cutoffTotal, cutoffSubs)
	// Creating a file for the consensus length information.
	confile := fmt.Sprintf("%s_consensus-length-%v.txt",
		flag.Args()[0], maxFam)
	conOut, err := os.Create(filepath.Join(cDir, confile))
	if err != nil {
		log.Fatalf("failed to create %s: %v", confile, err)
	}
	fmt.Fprintf(conOut, "number sampled: %v \nratio for all seqs: %f \nratio for just sub-sampled: %f\n\n %f\t%f",
		maxFam, cutoffTotal, cutoffSubs, cutoffTotal, cutoffSubs)
	// Creating a file for the consensus.
	file := fmt.Sprintf("%s_consensus.fq%v", flag.Args()[0], maxFam)
	out, err := os.Create(filepath.Join(cDir, file))
	if err != nil {
		log.Fatalf("failed to create %s: %v", file, err)
	}
	if consFasta {
		fmt.Fprintf(out, "%60a\n", c)
	} else {
		fmt.Fprintf(out, "%q\n", c)
	}
	// Creating a file for the multiple alignment.
	alignFile := fmt.Sprintf("%s_multiple_alignment%v.fq", flag.Args()[0], maxFam)
	AlignOut, err := os.Create(filepath.Join(aDir, alignFile))
	if err != nil {
		log.Fatalf("failed to create %s: %v", alignFile, err)
	}
	if consFasta {
		fmt.Fprintf(AlignOut, "%60a\n", m)
	} else {
		fmt.Fprintf(AlignOut, "%q\n", m)
	}
	out.Close()
	fmt.Printf("Complete\n\n")
}
// adjust performs a deletion ends refinement based on a
// pair of Smith-Waterman alignments.
//
//                  l      s   e      r
// ref:         -----|------+~~~+------|----------
//
// query_left:  ----|-----------+~~~~~~|~~~~~~+---------------
//                  l           s      m      e
// query_right: ----------------+~~~~~~|~~~~~~+-----------|---
//                              s      m      e           r
//
// where ~~ is the region found by CIGAR score walking above in the
// deletions function.
//
// align ref(l..r) with query_left(l..m)  -> ref(s)-query_left(s)
// align ref(l..r) with query_right(m..r) -> ref(e)-query_left(e)
//
// This can give either of two outcomes:
//  1. ref(s) < ref(e)
//  2. ref(e) <= ref(s)
//
// The first case is a standard colinear alignment:
//
//                        s   e
// ref:        -----------+---+-----------------
//                       /     \
//                      /       \
//                     /         \
//                    /           \
// query: ----------------+-------------+---------------
//                        s             e
//
// The second case is a non-colinear alignment:
//
//                        e   s
// ref:        -----------+---+-----------------
//                         \ /
//                          /
//                         / \
//                        /   \
//                       /     \
//                      /       \
//                     /         \
//                    /           \
// query: ----------------+-------------+---------------
//                        s             e
//
// which has a potential target site duplication interpretation:
//
//                        e   s
// ref:        -----------+---+-----------------
//                       / \ / \
//                      /   /   \
//                     /   / \   \
//                    /   /   \   \
//                   /   /     \   \
//                  /   /       \   \
//                 /   /         \   \
//                /   /           \   \
// query: ------------+---+-------------+---+-----------
//                    s                     e
//
// adjustDeletions handles the second case by making ref(s=e) for the
// reference and adding annotation for the length of the duplication
// (d) in ref:
//
//                       s|e s+d
// ref:        -----------+---+-----------------
//                       / \ / \
//                      /   /   \
//                     /   / \   \
//                    /   /   \   \
//                   /   /     \   \
//                  /   /       \   \
//                 /   /         \   \
//                /   /           \   \
// query: ------------+---+-------------+---+-----------
//                  s-d   s             e   e+d
func (r *refiner) adjust(d deletion) (refined deletion, ok bool, err error) {
	// A nil refiner leaves the deletion unrefined.
	if r == nil {
		return d, false, nil
	}
	if d.qend-d.qstart < d.rend-d.rstart {
		// Do not do any work for deletions.
		return d, false, fmt.Errorf("not an insertion: len(q)=%d len(r)=%d", d.qend-d.qstart, d.rend-d.rstart)
	}
	name := d.record.Ref.Name()
	ref, ok := r.ref[name]
	if !ok {
		return d, false, fmt.Errorf("no reference sequence for %q", name)
	}
	// Take a window of the reference centred on the indel site,
	// clamped to the sequence bounds.
	rs := *ref
	rOff := max(0, d.rstart-r.refWindow/2)
	rs.Seq = ref.Seq[rOff:min(d.rend+r.refWindow/2, len(ref.Seq))]
	q := alphabet.BytesToLetters(d.record.Seq.Expand())
	// Align the left junction of the query to
	// the reference around the indel site.
	qsl := linear.NewSeq(d.record.Name, nil, alphabet.DNAgapped)
	qOffLeft := max(0, d.qstart-r.queryWindow)
	qsl.Seq = q[qOffLeft : (d.qstart+d.qend)/2]
	alnl, err := r.sw.Align(&rs, qsl)
	if err != nil {
		return d, false, err
	}
	// Align the right junction of the query to
	// the reference around the indel site.
	qsr := linear.NewSeq(d.record.Name, nil, alphabet.DNAgapped)
	qOffRight := (d.qstart + d.qend) / 2
	qsr.Seq = q[qOffRight:min(d.qend+r.queryWindow, len(q))]
	alnr, err := r.sw.Align(&rs, qsr)
	if err != nil {
		return d, false, err
	}
	// Get left and right ends of insertion in read
	// and the aligned segment of the reference.
	left := alnl[len(alnl)-1].Features()
	right := alnr[0].Features()
	// Bail out if the alignment extends too far.
	// We might have continued alignment.
	if flank := right[0].Start(); flank < r.minRefFlank {
		return d, false, fmt.Errorf("skipping: right ref flank less than %d from left: len(flank)=%v", r.minRefFlank, flank)
	}
	if flank := left[0].End(); len(rs.Seq)-flank < r.minRefFlank {
		return d, false, fmt.Errorf("skipping: left ref flank less than %d from right: len(flank)=%v", r.minRefFlank, len(rs.Seq)-flank)
	}
	// Centre of the candidate insertion in each query window's
	// local coordinate frame.
	centrel := r.queryWindow + (d.qend-d.qstart)/2
	centrer := 0
	// Bail out if the insertion is too short.
	// We might have continued alignment.
	if gap := centrel - left[1].End(); gap < r.minQueryGap {
		return d, false, fmt.Errorf("skipping left: left query gap less than %d from centre: len(gap)=%v", r.minQueryGap, gap)
	}
	if gap := right[1].Start() - centrer; gap < r.minQueryGap {
		return d, false, fmt.Errorf("skipping right: right query gap less than %d from centre: len(gap)=%v", r.minQueryGap, gap)
	}
	// Map the refined ends back into global coordinates.
	d.rstart = rOff + left[0].End()
	d.rend = rOff + right[0].Start()
	if d.rend <= d.rstart {
		// Non-colinear case: record the duplication length and
		// collapse the reference interval to a point (s=e above).
		d.dup = d.rstart - d.rend
		d.rstart = d.rend
	}
	d.qstart = qOffLeft + left[1].End()
	d.qend = qOffRight + alnr[0].Features()[1].Start()
	return d, true, nil
}
// main reads insertion events from a GFF file (-in), indexes them by
// read name from the "Read" attribute, and for each reference fasta
// argument aligns the windows flanking each event against each other
// with Smith-Waterman to detect target site duplications, writing
// TSD-annotated GFF to stdout and, optionally, the inserted sequences
// as fasta to -fastaOut.
func main() {
	flag.Var(&alnmat, "align", "specify the match, mismatch and gap parameters")
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}
	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	// events maps a read name to its features; the "Read" attribute
	// holds exactly "<name> <start> <end>".
	events := make(map[string][]*gff.Feature)
	fsc := featio.NewScanner(gff.NewReader(f))
	for fsc.Next() {
		f := fsc.Feat().(*gff.Feature)
		fields := strings.Fields(f.FeatAttributes.Get("Read"))
		if len(fields) != 3 {
			log.Fatalf("bad record: %+v", f)
		}
		events[fields[0]] = append(events[fields[0]], f)
	}
	if err := fsc.Error(); err != nil {
		log.Fatalf("error during gff read: %v", err)
	}
	f.Close()
	w := gff.NewWriter(os.Stdout, 60, true)
	w.WriteComment("Right coordinates (field 5) and strand (field 7) are hypothetical.")
	var out *os.File
	if *fastaOut != "" {
		out, err = os.Create(*fastaOut)
		if err != nil {
			log.Fatalf("failed to create fasta insertion output file %q: %v", *fastaOut, err)
		}
		defer out.Close()
	}
	// hw is the half-width of the flank windows used when no refined
	// ("Dup") coordinates are available.
	hw := *window / 2
	sw := makeTable(alphabet.DNAgapped, alnmat)
	for _, ref := range flag.Args() {
		f, err = os.Open(ref)
		if err != nil {
			log.Fatalf("failed to open reference %q: %v", ref, err)
		}
		ssc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	loop:
		for ssc.Next() {
			seq := ssc.Seq().(*linear.Seq)
			for _, f := range events[seq.Name()] {
				fields := strings.Fields(f.FeatAttributes.Get("Read"))
				if len(fields) != 3 {
					log.Fatalf("bad record: %+v", f)
				}
				start, err := strconv.Atoi(fields[1])
				if err != nil {
					log.Fatalf("failed to get start coordinate: %v", err)
				}
				end, err := strconv.Atoi(fields[2])
				if err != nil {
					log.Fatalf("failed to get end coordinate: %v", err)
				}
				if out != nil {
					// Emit the inserted sequence itself, annotated
					// with its [start,end) coordinates.
					insert := *seq
					if insert.Desc != "" {
						insert.Desc += " "
					}
					insert.Desc += fmt.Sprintf("[%d,%d)", start, end)
					insert.Seq = insert.Seq[start:end]
					fmt.Fprintf(out, "%60a\n", &insert)
				}
				var lOff, lEnd, rOff, rEnd int
				// If we have refined ends, use them.
				if dup := f.FeatAttributes.Get("Dup"); dup != "" {
					d, err := strconv.Atoi(dup)
					if err != nil {
						log.Fatalf("failed to get duplication length: %v", err)
					}
					lOff = max(0, start-d)
					lEnd = start
					rOff = end
					rEnd = min(len(seq.Seq), end+d)
				} else {
					lOff = max(0, start-hw)
					lEnd = min(len(seq.Seq), start+hw)
					rOff = max(0, end-hw)
					rEnd = min(len(seq.Seq), end+hw)
					// Ensure windows don't overlap.
					if lEnd > rOff {
						lEnd = (lEnd + rOff) / 2
						rOff = lEnd
					}
				}
				if lEnd-lOff < *thresh || rEnd-rOff < *thresh {
					// Don't do fruitless work.
					continue loop
				}
				// Align the prefix (left window) against the postfix
				// (right window) to look for a shared duplication.
				left := *seq
				left.ID = "prefix"
				left.Seq = left.Seq[lOff:lEnd]
				right := *seq
				right.ID = "postfix"
				right.Seq = right.Seq[rOff:rEnd]
				aln, err := sw.Align(&right, &left)
				if err != nil {
					log.Fatal(err)
				}
				fa := align.Format(&right, &left, aln, '-')
				// Require at least thresh non-gap letters in each
				// formatted alignment segment.
				for _, seg := range fa {
					var n int
					for _, l := range seg.(alphabet.Letters) {
						if l != '-' {
							n++
						}
					}
					if n < *thresh {
						continue loop
					}
				}
				// Sum the per-segment alignment scores.
				var sc int
				for _, seg := range aln {
					type scorer interface {
						Score() int
					}
					sc += seg.(scorer).Score()
				}
				f.FeatAttributes = append(f.FeatAttributes, gff.Attribute{
					Tag: "TSD",
					Value: fmt.Sprintf(`%v %d %d %v "%v" %d`,
						fa[0],
						aln[len(aln)-1].Features()[0].End()+lOff,
						aln[0].Features()[1].Start()+rOff,
						fa[1],
						aln, sc),
				})
				w.Write(f)
			}
		}
		if err := ssc.Error(); err != nil {
			log.Fatalf("error during fasta read: %v", err)
		}
		f.Close()
	}
}