Exemple #1
0
// ExampleFitted_Align runs Fitted.Align on two gapped-DNA sequences using
// the scoring matrix shown below and prints the resulting alignment followed
// by the two formatted, gap-padded sequences.
func ExampleFitted_Align() {
	fsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("GTTGACAGACTAGATTCACG"))}
	fsa.Alpha = alphabet.DNAgapped
	fsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("GACAGACGA"))}
	fsb.Alpha = alphabet.DNAgapped

	//		   Query letter
	//  	 -	 A	 C	 G	 T
	// -	 0	-5	-5	-5	-5
	// A	-5	10	-3	-1	-4
	// C	-5	-3	 9	-5	 0
	// G	-5	-1	-5	 7	-3
	// T	-5	-4	 0	-3	 8
	fitted := Fitted{
		{0, -5, -5, -5, -5},
		{-5, 10, -3, -1, -4},
		{-5, -3, 9, -5, 0},
		{-5, -1, -5, 7, -3},
		{-5, -4, 0, -3, 8},
	}

	aln, err := fitted.Align(fsa, fsb)
	if err != nil {
		// Surface the cause instead of failing with an unexplained
		// output mismatch.
		fmt.Println("align failed:", err)
		return
	}
	fmt.Printf("%s\n", aln)
	fa := Format(fsa, fsb, aln, '-')
	fmt.Printf("%s\n%s\n", fa[0], fa[1])
	// Output:
	// [[3,10)/[0,7)=62 [10,12)/-=-10 [12,14)/[7,9)=17]
	// GACAGACTAGA
	// GACAGAC--GA
}
Exemple #2
0
// ExampleSW_Align_2 runs SW.Align on two gapped-DNA sequences that share
// their flanks but differ in the middle, using a matrix in which gaps
// score zero, and prints the alignment and the formatted sequences.
func ExampleSW_Align_2() {
	swsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("AAAATTTAAAA"))}
	swsa.Alpha = alphabet.DNAgapped
	swsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("AAAAGGGAAAA"))}
	swsb.Alpha = alphabet.DNAgapped

	// w(gap) = 0
	// w(match) = +2
	// w(mismatch) = -1
	smith := SW{
		{0, 0, 0, 0, 0},
		{0, 2, -1, -1, -1},
		{0, -1, 2, -1, -1},
		{0, -1, -1, 2, -1},
		{0, -1, -1, -1, 2},
	}

	aln, err := smith.Align(swsa, swsb)
	if err != nil {
		// Surface the cause instead of failing with an unexplained
		// output mismatch.
		fmt.Println("align failed:", err)
		return
	}
	fmt.Printf("%v\n", aln)
	fa := Format(swsa, swsb, aln, '-')
	fmt.Printf("%s\n%s\n", fa[0], fa[1])
	// Output:
	// [[0,4)/[0,4)=8 -/[4,7)=0 [4,7)/-=0 [7,11)/[7,11)=8]
	// AAAA---TTTAAAA
	// AAAAGGG---AAAA
}
Exemple #3
0
// ExampleNW_Align runs NW.Align on two gapped-DNA sequences using the
// scoring matrix shown below and prints the resulting alignment followed
// by the two formatted, gap-padded sequences.
func ExampleNW_Align() {
	nwsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("AGACTAGTTA"))}
	nwsa.Alpha = alphabet.DNAgapped
	nwsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("GACAGACG"))}
	nwsb.Alpha = alphabet.DNAgapped

	//		   Query letter
	//  	 -	 A	 C	 G	 T
	// -	 0	-5	-5	-5	-5
	// A	-5	10	-3	-1	-4
	// C	-5	-3	 9	-5	 0
	// G	-5	-1	-5	 7	-3
	// T	-5	-4	 0	-3	 8
	needle := NW{
		{0, -5, -5, -5, -5},
		{-5, 10, -3, -1, -4},
		{-5, -3, 9, -5, 0},
		{-5, -1, -5, 7, -3},
		{-5, -4, 0, -3, 8},
	}

	aln, err := needle.Align(nwsa, nwsb)
	if err != nil {
		// Surface the cause instead of failing with an unexplained
		// output mismatch.
		fmt.Println("align failed:", err)
		return
	}
	fmt.Printf("%s\n", aln)
	fa := Format(nwsa, nwsb, aln, '-')
	fmt.Printf("%s\n%s\n", fa[0], fa[1])
	// Output:
	// [[0,1)/-=-5 [1,4)/[0,3)=26 [4,5)/-=-5 [5,10)/[3,8)=12]
	// AGACTAGTTA
	// -GAC-AGACG
}
Exemple #4
0
// ExampleSW_Align_1 runs SW.Align on two gapped-DNA sequences using a
// matrix with match +2 and gap/mismatch -1, and prints the alignment and
// the formatted sequences.
func ExampleSW_Align_1() {
	swsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ACACACTA"))}
	swsa.Alpha = alphabet.DNAgapped
	swsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("AGCACACA"))}
	swsb.Alpha = alphabet.DNAgapped

	// w(gap) = -1
	// w(match) = +2
	// w(mismatch) = -1
	smith := SW{
		{0, -1, -1, -1, -1},
		{-1, 2, -1, -1, -1},
		{-1, -1, 2, -1, -1},
		{-1, -1, -1, 2, -1},
		{-1, -1, -1, -1, 2},
	}

	aln, err := smith.Align(swsa, swsb)
	if err != nil {
		// Surface the cause instead of failing with an unexplained
		// output mismatch.
		fmt.Println("align failed:", err)
		return
	}
	fmt.Printf("%v\n", aln)
	fa := Format(swsa, swsb, aln, '-')
	fmt.Printf("%s\n%s\n", fa[0], fa[1])
	// Output:
	// [[0,1)/[0,1)=2 -/[1,2)=-1 [1,6)/[2,7)=10 [6,7)/-=-1 [7,8)/[7,8)=2]
	// A-CACACTA
	// AGCACAC-A
}
Exemple #5
0
// Read returns the next sequence from the input together with any error.
// A non-nil error may accompany a valid sequence, so callers must inspect
// the error themselves to decide whether the read succeeded.
//
// If the Reader's template type returns distinct non-nil errors from
// SetName and SetDescription, each call to Read yields a freshly built
// error string; those methods should return the same error value if
// callers need to compare errors directly.
func (r *Reader) Read() (seq.Sequence, error) {
	// A header error only applies to a pending record; forget it once
	// there is no record in progress.
	defer func() {
		if r.working == nil {
			r.err = nil
		}
	}()

	var line []byte
	for {
		chunk, more, err := r.r.ReadLine()
		if err != nil {
			if err == io.EOF && r.working != nil {
				// End of input: hand back the record being built.
				done, doneErr := r.working, r.err
				r.working = nil
				return done, doneErr
			}
			return nil, err
		}

		// Reassemble lines that ReadLine delivered in pieces.
		line = append(line, chunk...)
		if more {
			continue
		}
		line = bytes.TrimSpace(line)
		if len(line) == 0 {
			continue
		}

		switch {
		case bytes.HasPrefix(line, r.IDPrefix):
			if r.working != nil {
				// A new header terminates the previous record; start
				// the next one before returning the finished one.
				done, doneErr := r.working, r.err
				r.working, r.err = r.header(line)
				return done, doneErr
			}
			r.working, r.err = r.header(line)
		case bytes.HasPrefix(line, r.SeqPrefix) && r.working != nil:
			// Drop the prefix and all internal whitespace before
			// appending the residues to the current record.
			residues := bytes.Join(bytes.Fields(line[len(r.SeqPrefix):]), nil)
			r.working.AppendLetters(alphabet.BytesToLetters(residues)...)
		default:
			// Either an unrecognised line or sequence data before any
			// header has been seen.
			return nil, fmt.Errorf("fasta: badly formed line %q", line)
		}
		line = nil
	}
}
Exemple #6
0
// ExampleFittedAffine_Align runs FittedAffine.Align on two gapped-DNA
// sequences using the scoring matrix and gap-open penalty shown below, and
// prints the alignment and the formatted sequences.
func ExampleFittedAffine_Align() {
	fsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ATTGGCAATGA"))}
	fsa.Alpha = alphabet.DNAgapped
	fsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ATAGGAA"))}
	fsb.Alpha = alphabet.DNAgapped

	//		   Query letter
	//  	 -	 A	 C	 G	 T
	// -	 0	-1	-1	-1	-1
	// A	-1	 1	-1	-1	-1
	// C	-1	-1	 1	-1	-1
	// G	-1	-1	-1	 1	-1
	// T	-1	-1	-1	-1	 1
	//
	// Gap open: -5
	fitted := FittedAffine{
		Matrix: Linear{
			{0, -1, -1, -1, -1},
			{-1, 1, -1, -1, -1},
			{-1, -1, 1, -1, -1},
			{-1, -1, -1, 1, -1},
			{-1, -1, -1, -1, 1},
		},
		GapOpen: -5,
	}

	aln, err := fitted.Align(fsa, fsb)
	if err != nil {
		// Surface the cause instead of failing with an unexplained
		// output mismatch.
		fmt.Println("align failed:", err)
		return
	}
	fmt.Printf("%s\n", aln)
	fa := Format(fsa, fsb, aln, '-')
	fmt.Printf("%s\n%s\n", fa[0], fa[1])
	// Output:
	// [[0,7)/[0,7)=3]
	// ATTGGCA
	// ATAGGAA
}
Exemple #7
0
// ExampleNWAffine_Align runs NWAffine.Align on two gapped-DNA sequences
// using the scoring matrix and gap-open penalty shown below, and prints the
// alignment and the formatted sequences.
func ExampleNWAffine_Align() {
	nwsa := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ATAGGAAG"))}
	nwsa.Alpha = alphabet.DNAgapped
	nwsb := &linear.Seq{Seq: alphabet.BytesToLetters([]byte("ATTGGCAATG"))}
	nwsb.Alpha = alphabet.DNAgapped

	//		   Query letter
	//  	 -	 A	 C	 G	 T
	// -	 0	-1	-1	-1	-1
	// A	-1	 1	-1	-1	-1
	// C	-1	-1	 1	-1	-1
	// G	-1	-1	-1	 1	-1
	// T	-1	-1	-1	-1	 1
	//
	// Gap open: -5
	needle := NWAffine{
		Matrix: Linear{
			{0, -1, -1, -1, -1},
			{-1, 1, -1, -1, -1},
			{-1, -1, 1, -1, -1},
			{-1, -1, -1, 1, -1},
			{-1, -1, -1, -1, 1},
		},
		GapOpen: -5,
	}

	aln, err := needle.Align(nwsa, nwsb)
	if err != nil {
		// Surface the cause instead of failing with an unexplained
		// output mismatch.
		fmt.Println("align failed:", err)
		return
	}
	fmt.Printf("%s\n", aln)
	fa := Format(nwsa, nwsb, aln, '-')
	fmt.Printf("%s\n%s\n", fa[0], fa[1])
	// Output:
	// [[0,7)/[0,7)=3 -/[7,9)=-7 [7,8)/[9,10)=1]
	// ATAGGAA--G
	// ATTGGCAATG
}
Exemple #8
0
// metaSeq reads "##"-prefixed lines from the underlying reader, joining
// their whitespace-stripped contents, until it meets the line
// "##end-<moltype>". It returns a linear sequence named id whose alphabet
// is chosen from moltype.
//
// It returns io.EOF unchanged if the input ends before the end marker, a
// *csv.ParseError for other read errors or for a non-blank line that lacks
// the "##" prefix, and ErrBadMoltype if moltype is not DNA, RNA or Protein.
func (r *Reader) metaSeq(moltype, id []byte) (seq.Sequence, error) {
	var body []byte
	for {
		raw, err := r.r.ReadBytes('\n')
		switch {
		case err == io.EOF:
			return nil, err
		case err != nil:
			return nil, &csv.ParseError{Line: r.line, Err: err}
		}
		r.line++

		text := bytes.TrimSpace(raw)
		if len(text) == 0 {
			// Blank lines inside the block are ignored.
			continue
		}
		if !bytes.HasPrefix(text, []byte("##")) {
			return nil, &csv.ParseError{Line: r.line, Err: ErrBadSequence}
		}
		text = bytes.TrimSpace(text[2:])
		if unsafeString(text) == "end-"+unsafeString(moltype) {
			break
		}
		// Sequence data: drop any embedded whitespace and accumulate.
		body = append(body, bytes.Join(bytes.Fields(text), nil)...)
	}

	// The molecule type is only validated after the block has been consumed.
	var alpha alphabet.Alphabet
	switch feat.ParseMoltype(unsafeString(moltype)) {
	case feat.DNA:
		alpha = alphabet.DNA
	case feat.RNA:
		alpha = alphabet.RNA
	case feat.Protein:
		alpha = alphabet.Protein
	default:
		return nil, ErrBadMoltype
	}

	return linear.NewSeq(string(id), alphabet.BytesToLetters(body), alpha), nil
}
Exemple #9
0
// stringToSeq is a helper that converts s into a linear sequence with an
// empty ID and the DNA alphabet.
func stringToSeq(s string) *linear.Seq {
	letters := alphabet.BytesToLetters([]byte(s))
	return linear.NewSeq("", letters, alphabet.DNA)
}
Exemple #10
0
// main reads GFF features from standard input and records, for each feature
// carrying a "Read" attribute, the read name and its start and end
// coordinates. It then scans every SAM file named on the command line and,
// for each record whose name was recorded, prints the requested segment of
// the read to standard output.
//
// Any parse or I/O failure terminates the program via log.Fatalf.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "invalid invocation: must have at least one reads file")
		os.Exit(1)
	}

	// extract maps a read name to its {start, end} coordinates.
	extract := make(map[string][2]int)
	sc := featio.NewScanner(gff.NewReader(os.Stdin))
	for sc.Next() {
		f := sc.Feat().(*gff.Feature)
		read := f.FeatAttributes.Get("Read")
		if read == "" {
			continue
		}
		// The attribute must hold at least "<name> <start> <end>"; reject
		// anything shorter instead of panicking on the index below.
		fields := strings.Fields(read)
		if len(fields) < 3 {
			log.Fatalf("failed to parse %q: expected name, start and end fields", read)
		}
		name := fields[0]
		start, err := strconv.Atoi(fields[1])
		if err != nil {
			log.Fatalf("failed to parse %q: %v", read, err)
		}
		end, err := strconv.Atoi(fields[2])
		if err != nil {
			log.Fatalf("failed to parse %q: %v", read, err)
		}
		extract[name] = [2]int{start, end}
	}
	err := sc.Error()
	if err != nil {
		log.Fatalf("error during GFF read: %v", err)
	}

	for _, reads := range os.Args[1:] {
		sf, err := os.Open(reads)
		if err != nil {
			log.Fatalf("failed to open %q: %v", reads, err)
		}
		sr, err := sam.NewReader(sf)
		if err != nil {
			log.Fatalf("failed to open SAM input %q: %v", reads, err)
		}
		for {
			r, err := sr.Read()
			if err != nil {
				if err != io.EOF {
					log.Fatalf("unexpected error reading SAM: %v", err)
				}
				break
			}

			v, ok := extract[r.Name]
			if !ok {
				continue
			}
			// Currently reefer only expects a single hit per read,
			// so any multiples are due to duplicate read file input.
			// Update this behaviour if we change reefer to look at
			// remapping soft-clipped segments.
			delete(extract, r.Name)

			reverse := r.Flags&sam.Reverse != 0
			// The name suffix records the coordinates as given in the GFF,
			// before any reverse-strand adjustment.
			rng := fmt.Sprintf("//%d_%d", v[0], v[1])
			if reverse {
				rng += "(-)"
				// Mirror the interval about the read length for
				// reverse-flagged records.
				seqLen := r.Seq.Length
				v[0], v[1] = seqLen-v[1], seqLen-v[0]
			}
			v[0] = feat.OneToZero(v[0])
			s := linear.NewSeq(
				r.Name+rng,
				alphabet.BytesToLetters(r.Seq.Expand())[v[0]:v[1]],
				alphabet.DNA,
			)
			if reverse {
				s.Desc = "(sequence revcomp relative to read)"
			}
			fmt.Printf("%60a\n", s)
		}
		sf.Close()
	}
}
Exemple #11
0
// adjust refines the end points of the event described by d using a pair
// of Smith-Waterman alignments, one for each junction.
//
//                    l      s   e      r
//  ref:         -----|------+~~~+------|----------
//
//  query_left:  ----|-----------+~~~~~~|~~~~~~+---------------
//                   l           s      m      e
//  query_right: ----------------+~~~~~~|~~~~~~+-----------|---
//                               s      m      e           r
//
//  where ~~ is the region found by CIGAR score walking above in the
//  deletions function.
//
//  align ref(l..r) with query_left(l..m) -> ref(s)-query_left(s)
//  align ref(l..r) with query_right(m..r) -> ref(e)-query_right(e)
//
// This can give either of two outcomes:
//  1. ref(s) < ref(e)
//  2. ref(e) <= ref(s)
//
// The first case is a standard colinear alignment:
//
//                              s   e
//  ref:             -----------+---+-----------------
//                             /     \
//                            /       \
//                           /         \
//                          /           \
//  query: ----------------+-------------+---------------
//                         s             e
//
//
// The second case is a non-colinear alignment:
//
//                              e   s
//  ref:             -----------+---+-----------------
//                               \ /
//                                /
//                               / \
//                              /   \
//                             /     \
//                            /       \
//                           /         \
//                          /           \
//  query: ----------------+-------------+---------------
//                         s             e
//
//
// which has a potential target site duplication interpretation:
//
//                              e   s
//  ref:             -----------+---+-----------------
//                             / \ / \
//                            /   /   \
//                           /   / \   \
//                          /   /   \   \
//                         /   /     \   \
//                        /   /       \   \
//                       /   /         \   \
//                      /   /           \   \
//  query: ------------+---+-------------+---+-----------
//                         s             e
//
// adjust handles the second case by making ref(s=e) for the
// reference and adding annotation for the length of the duplication
// (d) in ref:
//
//                             s|e s+d
//  ref:             -----------+---+-----------------
//                             / \ / \
//                            /   /   \
//                           /   / \   \
//                          /   /   \   \
//                         /   /     \   \
//                        /   /       \   \
//                       /   /         \   \
//                      /   /           \   \
//  query: ------------+---+-------------+---+-----------
//                    s-d  s             e  e+d
//
// The returned ok is true only when d was refined. On every other path the
// input d is returned unchanged with ok false; err is nil only when the
// receiver is nil, and otherwise explains why no refinement was made.
func (r *refiner) adjust(d deletion) (refined deletion, ok bool, err error) {
	// A nil refiner is a no-op.
	if r == nil {
		return d, false, nil
	}
	if d.qend-d.qstart < d.rend-d.rstart {
		// The query span is shorter than the reference span, so this is
		// not an insertion in the read; do not do any work for it.
		return d, false, fmt.Errorf("not an insertion: len(q)=%d len(r)=%d", d.qend-d.qstart, d.rend-d.rstart)
	}

	name := d.record.Ref.Name()
	ref, ok := r.ref[name]
	if !ok {
		return d, false, fmt.Errorf("no reference sequence for %q", name)
	}

	// Work on a copy of the reference value so that re-slicing Seq to the
	// window around the event does not alter the stored reference.
	rs := *ref
	rOff := max(0, d.rstart-r.refWindow/2)
	rs.Seq = ref.Seq[rOff:min(d.rend+r.refWindow/2, len(ref.Seq))]

	q := alphabet.BytesToLetters(d.record.Seq.Expand())

	// Align the left junction of the query to
	// the reference around the indel site. The left query
	// segment runs from up to queryWindow before qstart to
	// the midpoint of the event.
	qsl := linear.NewSeq(d.record.Name, nil, alphabet.DNAgapped)
	qOffLeft := max(0, d.qstart-r.queryWindow)
	qsl.Seq = q[qOffLeft : (d.qstart+d.qend)/2]
	alnl, err := r.sw.Align(&rs, qsl)
	if err != nil {
		return d, false, err
	}

	// Align the right junction of the query to
	// the reference around the indel site. The right query
	// segment runs from the midpoint of the event to up to
	// queryWindow past qend.
	qsr := linear.NewSeq(d.record.Name, nil, alphabet.DNAgapped)
	qOffRight := (d.qstart + d.qend) / 2
	qsr.Seq = q[qOffRight:min(d.qend+r.queryWindow, len(q))]
	alnr, err := r.sw.Align(&rs, qsr)
	if err != nil {
		return d, false, err
	}

	// Get left and right ends of insertion in read
	// and the aligned segment of the reference.
	// Index 0 of each pair is used for reference coordinates and
	// index 1 for query coordinates below.
	// NOTE(review): assumes both alignments are non-empty — confirm
	// that r.sw.Align never returns an empty slice with a nil error.
	left := alnl[len(alnl)-1].Features()
	right := alnr[0].Features()

	// Bail out if the alignment extends too far.
	// We might have continued alignment.
	if flank := right[0].Start(); flank < r.minRefFlank {
		return d, false, fmt.Errorf("skipping: right ref flank less than %d from left: len(flank)=%v",
			r.minRefFlank, flank)
	}
	if flank := left[0].End(); len(rs.Seq)-flank < r.minRefFlank {
		return d, false, fmt.Errorf("skipping: left ref flank less than %d from right: len(flank)=%v",
			r.minRefFlank, len(rs.Seq)-flank)
	}

	// Position of the event midpoint in each query segment's own
	// coordinates. The right segment starts at the midpoint, so it is 0.
	// NOTE(review): centrel matches the left segment's midpoint offset only
	// when qOffLeft was not clamped to 0 above — confirm this is intended.
	centrel := r.queryWindow + (d.qend-d.qstart)/2
	centrer := 0

	// Bail out if the insertion is too short.
	// We might have continued alignment.
	if gap := centrel - left[1].End(); gap < r.minQueryGap {
		return d, false, fmt.Errorf("skipping left: left query gap less than %d from centre: len(gap)=%v",
			r.minQueryGap, gap)
	}
	if gap := right[1].Start() - centrer; gap < r.minQueryGap {
		return d, false, fmt.Errorf("skipping right: right query gap less than %d from centre: len(gap)=%v",
			r.minQueryGap, gap)
	}

	// Translate the window-relative reference positions back to
	// full-reference coordinates.
	d.rstart = rOff + left[0].End()
	d.rend = rOff + right[0].Start()
	if d.rend <= d.rstart {
		// Non-colinear case: record the overlap as the duplication
		// length and collapse the reference interval to a point.
		d.dup = d.rstart - d.rend
		d.rstart = d.rend
	}

	// Translate the segment-relative query positions back to read
	// coordinates.
	d.qstart = qOffLeft + left[1].End()
	d.qend = qOffRight + alnr[0].Features()[1].Start()

	return d, true, nil
}
Exemple #12
0
##DNA <seqname>
##acggctcggattggcgctggatgatagatcagacgac
##...
##end-DNA
##RNA <seqname>
##acggcucggauuggcgcuggaugauagaucagacgac
##...
##end-RNA
##Protein <seqname>
##MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF
##...
##end-Protein
##sequence-region <seqname> 1 5
`,
			feat: []feat.Feature{
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggctcggattggcgctggatgatagatcagacgac...")), alphabet.DNA),
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggcucggauuggcgcuggaugauagaucagacgac...")), alphabet.RNA),
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF...")), alphabet.Protein),
				&Region{Sequence: Sequence{SeqName: "<seqname>", Type: feat.DNA}, RegionStart: 0, RegionEnd: 5},
			},
			write: []interface{}{
				2,
				"source-version <source> <version-text>",
				mustTime(time.Parse(Astronomical, "1997-11-08")),
				Sequence{SeqName: "<seqname>", Type: feat.DNA},
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggctcggattggcgctggatgatagatcagacgac...")), alphabet.DNA),
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggcucggauuggcgcuggaugauagaucagacgac...")), alphabet.RNA),
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF...")), alphabet.Protein),
				&Region{Sequence: Sequence{SeqName: "<seqname>"}, RegionStart: 0, RegionEnd: 5},
			},
		},