func ExampleFitted_Align() { fsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("GTTGACAGACTAGATTCACG"))} fsa.Alpha = alphabet.DNAgapped fsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("GACAGACGA"))} fsb.Alpha = alphabet.DNAgapped // Query letter // - A C G T // - 0 -5 -5 -5 -5 // A -5 10 -3 -1 -4 // C -5 -3 9 -5 0 // G -5 -1 -5 7 -3 // T -5 -4 0 -3 8 fitted := Fitted{ {0, -5, -5, -5, -5}, {-5, 10, -3, -1, -4}, {-5, -3, 9, -5, 0}, {-5, -1, -5, 7, -3}, {-5, -4, 0, -3, 8}, } aln, err := fitted.Align(fsa, fsb) if err == nil { fmt.Printf("%s\n", aln) fa := Format(fsa, fsb, aln, '-') fmt.Printf("%s\n%s\n", fa[0], fa[1]) } // Output: // [[3,10)/[0,7)=62 [10,12)/-=-10 [12,14)/[7,9)=17] // GACAGACTAGA // GACAGAC--GA }
func ExampleSW_Align_2() { swsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("AAAATTTAAAA"))} swsa.Alpha = alphabet.DNAgapped swsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("AAAAGGGAAAA"))} swsb.Alpha = alphabet.DNAgapped // w(gap) = 0 // w(match) = +2 // w(mismatch) = -1 smith := SW{ {0, 0, 0, 0, 0}, {0, 2, -1, -1, -1}, {0, -1, 2, -1, -1}, {0, -1, -1, 2, -1}, {0, -1, -1, -1, 2}, } aln, err := smith.Align(swsa, swsb) if err == nil { fmt.Printf("%v\n", aln) fa := Format(swsa, swsb, aln, '-') fmt.Printf("%s\n%s\n", fa[0], fa[1]) } // Output: // [[0,4)/[0,4)=8 -/[4,7)=0 [4,7)/-=0 [7,11)/[7,11)=8] // AAAA---TTTAAAA // AAAAGGG---AAAA }
func ExampleNW_Align() { nwsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("AGACTAGTTA"))} nwsa.Alpha = alphabet.DNAgapped nwsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("GACAGACG"))} nwsb.Alpha = alphabet.DNAgapped // Query letter // - A C G T // - 0 -5 -5 -5 -5 // A -5 10 -3 -1 -4 // C -5 -3 9 -5 0 // G -5 -1 -5 7 -3 // T -5 -4 0 -3 8 needle := NW{ {0, -5, -5, -5, -5}, {-5, 10, -3, -1, -4}, {-5, -3, 9, -5, 0}, {-5, -1, -5, 7, -3}, {-5, -4, 0, -3, 8}, } aln, err := needle.Align(nwsa, nwsb) if err == nil { fmt.Printf("%s\n", aln) fa := Format(nwsa, nwsb, aln, '-') fmt.Printf("%s\n%s\n", fa[0], fa[1]) } // Output: //[[0,1)/-=-5 [1,4)/[0,3)=26 [4,5)/-=-5 [5,10)/[3,8)=12] // AGACTAGTTA // -GAC-AGACG }
func ExampleSW_Align_1() { swsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ACACACTA"))} swsa.Alpha = alphabet.DNAgapped swsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("AGCACACA"))} swsb.Alpha = alphabet.DNAgapped // w(gap) = -1 // w(match) = +2 // w(mismatch) = -1 smith := SW{ {0, -1, -1, -1, -1}, {-1, 2, -1, -1, -1}, {-1, -1, 2, -1, -1}, {-1, -1, -1, 2, -1}, {-1, -1, -1, -1, 2}, } aln, err := smith.Align(swsa, swsb) if err == nil { fmt.Printf("%v\n", aln) fa := Format(swsa, swsb, aln, '-') fmt.Printf("%s\n%s\n", fa[0], fa[1]) } // Output: // [[0,1)/[0,1)=2 -/[1,2)=-1 [1,6)/[2,7)=10 [6,7)/-=-1 [7,8)/[7,8)=2] // A-CACACTA // AGCACAC-A }
// Read a single sequence and return it and potentially an error. Note that // a non-nil returned error may be associated with a valid sequence, so it is // the responsibility of the caller to examine the error to determine whether // the read was successful. // Note that if the Reader's template type returns different non-nil error // values from calls to SetName and SetDescription, a new error string will be // returned on each call to Read. So to allow direct error comparison these // methods should return the same error. func (r *Reader) Read() (seq.Sequence, error) { var ( buff, line []byte isPrefix bool s seq.Sequence ) defer func() { if r.working == nil { r.err = nil } }() for { var err error if buff, isPrefix, err = r.r.ReadLine(); err != nil { if err != io.EOF || r.working == nil { return nil, err } s, err = r.working, r.err r.working = nil return s, err } line = append(line, buff...) if isPrefix { continue } line = bytes.TrimSpace(line) if len(line) == 0 { continue } if bytes.HasPrefix(line, r.IDPrefix) { if r.working == nil { r.working, r.err = r.header(line) line = nil } else { s, err = r.working, r.err r.working, r.err = r.header(line) return s, err } } else if bytes.HasPrefix(line, r.SeqPrefix) { if r.working == nil { return nil, fmt.Errorf("fasta: badly formed line %q", line) } line = bytes.Join(bytes.Fields(line[len(r.SeqPrefix):]), nil) r.working.AppendLetters(alphabet.BytesToLetters(line)...) line = nil } else { return nil, fmt.Errorf("fasta: badly formed line %q", line) } } }
func ExampleFittedAffine_Align() { fsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ATTGGCAATGA"))} fsa.Alpha = alphabet.DNAgapped fsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ATAGGAA"))} fsb.Alpha = alphabet.DNAgapped // Query letter // - A C G T // - 0 -1 -1 -1 -1 // A -1 1 -1 -1 -1 // C -1 -1 1 -1 -1 // G -1 -1 -1 1 -1 // T -1 -1 -1 -1 1 // // Gap open: -5 fitted := FittedAffine{ Matrix: Linear{ {0, -1, -1, -1, -1}, {-1, 1, -1, -1, -1}, {-1, -1, 1, -1, -1}, {-1, -1, -1, 1, -1}, {-1, -1, -1, -1, 1}, }, GapOpen: -5, } aln, err := fitted.Align(fsa, fsb) if err == nil { fmt.Printf("%s\n", aln) fa := Format(fsa, fsb, aln, '-') fmt.Printf("%s\n%s\n", fa[0], fa[1]) } // Output: // [[0,7)/[0,7)=3] // ATTGGCA // ATAGGAA }
func ExampleNWAffine_Align() { nwsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ATAGGAAG"))} nwsa.Alpha = alphabet.DNAgapped nwsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ATTGGCAATG"))} nwsb.Alpha = alphabet.DNAgapped // Query letter // - A C G T // - 0 -1 -1 -1 -1 // A -1 1 -1 -1 -1 // C -1 -1 1 -1 -1 // G -1 -1 -1 1 -1 // T -1 -1 -1 -1 1 // // Gap open: -5 needle := NWAffine{ Matrix: Linear{ {0, -1, -1, -1, -1}, {-1, 1, -1, -1, -1}, {-1, -1, 1, -1, -1}, {-1, -1, -1, 1, -1}, {-1, -1, -1, -1, 1}, }, GapOpen: -5, } aln, err := needle.Align(nwsa, nwsb) if err == nil { fmt.Printf("%s\n", aln) fa := Format(nwsa, nwsb, aln, '-') fmt.Printf("%s\n%s\n", fa[0], fa[1]) } // Output: // [[0,7)/[0,7)=3 -/[7,9)=-7 [7,8)/[9,10)=1] // ATAGGAA--G // ATTGGCAATG }
func (r *Reader) metaSeq(moltype, id []byte) (seq.Sequence, error) { var line, body []byte var err error for { line, err = r.r.ReadBytes('\n') if err != nil { if err == io.EOF { return nil, err } return nil, &csv.ParseError{Line: r.line, Err: err} } r.line++ line = bytes.TrimSpace(line) if len(line) == 0 { continue } if len(line) < 2 || !bytes.HasPrefix(line, []byte("##")) { return nil, &csv.ParseError{Line: r.line, Err: ErrBadSequence} } line = bytes.TrimSpace(line[2:]) if unsafeString(line) == "end-"+unsafeString(moltype) { break } else { line = bytes.Join(bytes.Fields(line), nil) body = append(body, line...) } } var alpha alphabet.Alphabet switch feat.ParseMoltype(unsafeString(moltype)) { case feat.DNA: alpha = alphabet.DNA case feat.RNA: alpha = alphabet.RNA case feat.Protein: alpha = alphabet.Protein default: return nil, ErrBadMoltype } s := linear.NewSeq(string(id), alphabet.BytesToLetters(body), alpha) return s, err }
// Helper func stringToSeq(s string) *linear.Seq { return linear.NewSeq("", alphabet.BytesToLetters([]byte(s)), alphabet.DNA) }
func main() { if len(os.Args) < 2 { fmt.Fprintln(os.Stderr, "invalid invocation: must have at least one reads file") os.Exit(1) } extract := make(map[string][2]int) sc := featio.NewScanner(gff.NewReader(os.Stdin)) for sc.Next() { f := sc.Feat().(*gff.Feature) read := f.FeatAttributes.Get("Read") if read == "" { continue } fields := strings.Fields(read) name := fields[0] start, err := strconv.Atoi(fields[1]) if err != nil { log.Fatalf("failed to parse %q: %v", read, err) } end, err := strconv.Atoi(fields[2]) if err != nil { log.Fatalf("failed to parse %q: %v", read, err) } extract[name] = [2]int{start, end} } err := sc.Error() if err != nil { log.Fatalf("error during GFF read: %v", err) } for _, reads := range os.Args[1:] { sf, err := os.Open(reads) if err != nil { log.Fatalf("failed to open %q: %v", reads, err) } sr, err := sam.NewReader(sf) if err != nil { log.Fatalf("failed to open SAM input %q: %v", reads, err) } for { r, err := sr.Read() if err != nil { if err != io.EOF { log.Fatalf("unexpected error reading SAM: %v", err) } break } v, ok := extract[r.Name] if !ok { continue } // Currently reefer only expects a single hit per read, // so any multiples are due to duplicate read file input. // Update this behaviour if we change reefer to look at // remapping soft-clipped segments. delete(extract, r.Name) reverse := r.Flags&sam.Reverse != 0 rng := fmt.Sprintf("//%d_%d", v[0], v[1]) if reverse { rng += "(-)" len := r.Seq.Length v[0], v[1] = len-v[1], len-v[0] } v[0] = feat.OneToZero(v[0]) s := linear.NewSeq( r.Name+rng, alphabet.BytesToLetters(r.Seq.Expand())[v[0]:v[1]], alphabet.DNA, ) if reverse { s.Desc = "(sequence revcomp relative to read)" } fmt.Printf("%60a\n", s) } sf.Close() } }
// adjustDeletion performs a deletion ends refinement based on a // pair of Smith-Waterman alignments. // // l s e r // ref: -----|------+~~~+------|---------- // // query_left: ----|-----------+~~~~~~|~~~~~~+--------------- // l s m e // query_right: ----------------+~~~~~~|~~~~~~+-----------|--- // s m e r // // where ~~ is the region found by CIGAR score walking above in the // deletions function. // // align ref(l..r) with query_left(l..m) -> ref(s)-query_left(s) // align ref(l..r) with query_right(m..r) -> ref(e)-query_left(e) // // This can give either of two outcomes: // 1. ref(s) < ref(e) // 2. ref(e) <= ref(s) // // The first case is a standard colinear alignment: // // s e // ref: -----------+---+----------------- // / \ // / \ // / \ // / \ // query: ----------------+-------------+--------------- // s e // // // The second case is a non-colinear alignment: // // e s // ref: -----------+---+----------------- // \ / // / // / \ // / \ // / \ // / \ // / \ // / \ // query: ----------------+-------------+--------------- // s e // // // which has a potential target site duplication interpretation: // // e s // ref: -----------+---+----------------- // / \ / \ // / / \ // / / \ \ // / / \ \ // / / \ \ // / / \ \ // / / \ \ // / / \ \ // query: ------------+---+-------------+---+----------- // s e // // adjustDeletions handles the second case by making ref(s=e) for the // reference and adding annotation for the length of the duplication // (d) in ref: // // s|e s+d // ref: -----------+---+----------------- // / \ / \ // / / \ // / / \ \ // / / \ \ // / / \ \ // / / \ \ // / / \ \ // / / \ \ // query: ------------+---+-------------+---+----------- // s-d s e e+d // func (r *refiner) adjust(d deletion) (refined deletion, ok bool, err error) { if r == nil { return d, false, nil } if d.qend-d.qstart < d.rend-d.rstart { // Do not do any work for deletions. return d, false, fmt.Errorf("not an insertion: len(q)=%d len(r)=%d", d.qend-d.qstart, d.rend-d.rstart) } name := d.record.Ref.Name() ref, ok := r.ref[name] if !ok { return d, false, fmt.Errorf("no reference sequence for %q", name) } rs := *ref rOff := max(0, d.rstart-r.refWindow/2) rs.Seq = ref.Seq[rOff:min(d.rend+r.refWindow/2, len(ref.Seq))] q := alphabet.BytesToLetters(d.record.Seq.Expand()) // Align the left junction of the qeuery to // the reference around the indel site. qsl := linear.NewSeq(d.record.Name, nil, alphabet.DNAgapped) qOffLeft := max(0, d.qstart-r.queryWindow) qsl.Seq = q[qOffLeft : (d.qstart+d.qend)/2] alnl, err := r.sw.Align(&rs, qsl) if err != nil { return d, false, err } // Align the right junction of the qeuery to // the reference around the indel site. qsr := linear.NewSeq(d.record.Name, nil, alphabet.DNAgapped) qOffRight := (d.qstart + d.qend) / 2 qsr.Seq = q[qOffRight:min(d.qend+r.queryWindow, len(q))] alnr, err := r.sw.Align(&rs, qsr) if err != nil { return d, false, err } // Get left and right ends of insertion in read // and the aligned segment of the reference. left := alnl[len(alnl)-1].Features() right := alnr[0].Features() // Bail out if the alignment extends too far. // We might have continued alignment. if flank := right[0].Start(); flank < r.minRefFlank { return d, false, fmt.Errorf("skipping: right ref flank less than %d from left: len(flank)=%v", r.minRefFlank, flank) } if flank := left[0].End(); len(rs.Seq)-flank < r.minRefFlank { return d, false, fmt.Errorf("skipping: left ref flank less than %d from right: len(flank)=%v", r.minRefFlank, len(rs.Seq)-flank) } centrel := r.queryWindow + (d.qend-d.qstart)/2 centrer := 0 // Bail out if the insertion is too short. // We might have continued alignment. if gap := centrel - left[1].End(); gap < r.minQueryGap { return d, false, fmt.Errorf("skipping left: left query gap less than %d from centre: len(gap)=%v", r.minQueryGap, gap) } if gap := right[1].Start() - centrer; gap < r.minQueryGap { return d, false, fmt.Errorf("skipping right: right query gap less than %d from centre: len(gap)=%v", r.minQueryGap, gap) } d.rstart = rOff + left[0].End() d.rend = rOff + right[0].Start() if d.rend <= d.rstart { d.dup = d.rstart - d.rend d.rstart = d.rend } d.qstart = qOffLeft + left[1].End() d.qend = qOffRight + alnr[0].Features()[1].Start() return d, true, nil }
##DNA <seqname> ##acggctcggattggcgctggatgatagatcagacgac ##... ##end-DNA ##RNA <seqname> ##acggcucggauuggcgcuggaugauagaucagacgac ##... ##end-RNA ##Protein <seqname> ##MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF ##... ##end-Protein ##sequence-region <seqname> 1 5 `, feat: []feat.Feature{ linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggctcggattggcgctggatgatagatcagacgac...")), alphabet.DNA), linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggcucggauuggcgcuggaugauagaucagacgac...")), alphabet.RNA), linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF...")), alphabet.Protein), &Region{Sequence: Sequence{SeqName: "<seqname>", Type: feat.DNA}, RegionStart: 0, RegionEnd: 5}, }, write: []interface{}{ 2, "source-version <source> <version-text>", mustTime(time.Parse(Astronomical, "1997-11-08")), Sequence{SeqName: "<seqname>", Type: feat.DNA}, linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggctcggattggcgctggatgatagatcagacgac...")), alphabet.DNA), linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggcucggauuggcgcuggaugauagaucagacgac...")), alphabet.RNA), linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF...")), alphabet.Protein), &Region{Sequence: Sequence{SeqName: "<seqname>"}, RegionStart: 0, RegionEnd: 5}, }, },