Esempio n. 1
0
// ReadBamFile reads bam file, and return the header and a channel of sam records.
func readBamFile(fileName string) (h *sam.Header, c chan sam.Record) {
	// Initialize the channel of sam records.
	c = make(chan sam.Record)

	// Create a new go routine to read the records.
	go func() {
		// Close the record channel when finished.
		defer close(c)

		// Open file stream, and close it when finished.
		f, err := os.Open(fileName)
		if err != nil {
			log.Fatalln(err)
		}
		defer f.Close()

		type SamReader interface {
			Header() *sam.Header
			Read() (*sam.Record, error)
		}

		var reader SamReader
		if fileName[len(fileName)-3:] == "bam" {
			bamReader, err := bam.NewReader(f, 0)
			if err != nil {
				log.Fatalln(err)
			}
			defer bamReader.Close()
			reader = bamReader
		} else {
			reader, err = sam.NewReader(f)
			if err != nil {
				log.Fatalln(err)
			}
		}

		// Read and assign header.
		h = reader.Header()

		// Read sam records and send them to the channel,
		// until it hit an error, which raises a panic
		// if it is not a IO EOF.
		for {
			rec, err := reader.Read()
			if err != nil {
				if err != io.EOF {
					log.Fatalln(err)
				}
				break
			}
			c <- *rec
		}
	}()

	return
}
Esempio n. 2
0
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "invalid invocation: must have at least one reads file")
		os.Exit(1)
	}

	extract := make(map[string][2]int)
	sc := featio.NewScanner(gff.NewReader(os.Stdin))
	for sc.Next() {
		f := sc.Feat().(*gff.Feature)
		read := f.FeatAttributes.Get("Read")
		if read == "" {
			continue
		}
		fields := strings.Fields(read)
		name := fields[0]
		start, err := strconv.Atoi(fields[1])
		if err != nil {
			log.Fatalf("failed to parse %q: %v", read, err)
		}
		end, err := strconv.Atoi(fields[2])
		if err != nil {
			log.Fatalf("failed to parse %q: %v", read, err)
		}
		extract[name] = [2]int{start, end}
	}
	err := sc.Error()
	if err != nil {
		log.Fatalf("error during GFF read: %v", err)
	}

	for _, reads := range os.Args[1:] {
		sf, err := os.Open(reads)
		if err != nil {
			log.Fatalf("failed to open %q: %v", reads, err)
		}
		sr, err := sam.NewReader(sf)
		if err != nil {
			log.Fatalf("failed to open SAM input %q: %v", reads, err)
		}
		for {
			r, err := sr.Read()
			if err != nil {
				if err != io.EOF {
					log.Fatalf("unexpected error reading SAM: %v", err)
				}
				break
			}

			v, ok := extract[r.Name]
			if !ok {
				continue
			}
			// Currently reefer only expects a single hit per read,
			// so any multiples are due to duplicate read file input.
			// Update this behaviour if we change reefer to look at
			// remapping soft-clipped segments.
			delete(extract, r.Name)

			reverse := r.Flags&sam.Reverse != 0
			rng := fmt.Sprintf("//%d_%d", v[0], v[1])
			if reverse {
				rng += "(-)"
				len := r.Seq.Length
				v[0], v[1] = len-v[1], len-v[0]
			}
			v[0] = feat.OneToZero(v[0])
			s := linear.NewSeq(
				r.Name+rng,
				alphabet.BytesToLetters(r.Seq.Expand())[v[0]:v[1]],
				alphabet.DNA,
			)
			if reverse {
				s.Desc = "(sequence revcomp relative to read)"
			}
			fmt.Printf("%60a\n", s)
		}
		sf.Close()
	}
}
Esempio n. 3
0
// deletions analyses *sam.Records from mapping reads to the given reference
// using the suffix array file if provided. If run is false, blasr is not
// run and the existing blasr output is used to provide the *sam.Records.
// procs specifies the number of blasr threads to use.
func deletions(reads, ref, suff, ext string, procs int, run bool, window, min int, br *refiner, w *gff.Writer) error {
	base := filepath.Base(reads)
	b := blasr.BLASR{
		Cmd: *blasrPath,

		Reads: reads, Genome: ref, SuffixArray: suff,
		BestN: 1,

		SAM:           true,
		Clipping:      "soft",
		SAMQV:         true,
		CIGARSeqMatch: true,

		Aligned:   base + ".blasr." + ext,
		Unaligned: base + ".blasr.unmapped.fasta",

		Procs: procs,
	}
	if run {
		cmd, err := b.BuildCommand()
		if err != nil {
			return err
		}
		cmd.Stdout = errStream
		cmd.Stderr = errStream
		err = cmd.Run()
		if err != nil {
			return err
		}
	}

	f, err := os.Open(b.Aligned)
	if err != nil {
		return err
	}
	defer f.Close()

	cost := [...]float64{
		sam.CigarInsertion: -2,
		sam.CigarDeletion:  -2,
		sam.CigarEqual:     1,
		sam.CigarMismatch:  -1,

		// Included for explicitness
		sam.CigarSoftClipped: 0,

		// Included to ensure no bounds panic.
		// All CIGAR operations not listed above
		// are given a zero cost.
		sam.CigarBack: 0,
	}

	_, err = w.WriteComment(fmt.Sprintf("smoothing window=%d", window))
	if err != nil {
		return nil
	}
	_, err = w.WriteComment(fmt.Sprintf("minimum feature length=%d", min))
	if err != nil {
		return nil
	}
	gf := &gff.Feature{
		Source:         "reefer",
		Feature:        "discordance",
		FeatFrame:      gff.NoFrame,
		FeatAttributes: gff.Attributes{{Tag: "Read"}, {Tag: "Dup"}},
	}
	var sr interface {
		Read() (*sam.Record, error)
	}
	switch ext {
	case "sam":
		sr, err = sam.NewReader(f)
		if err != nil {
			return err
		}
	case "bam":
		var br *bam.Reader
		br, err = bam.NewReader(f, 0)
		if err != nil {
			return err
		}
		defer br.Close()
		sr = br
	default:
		panic("reefer: invalid extension")
	}
	for {
		r, err := sr.Read()
		if err != nil {
			if err != io.EOF {
				return err
			}
			break
		}

		var (
			scores []costPos
			ref    = r.Start()
			query  int
		)
		for _, co := range r.Cigar {
			for i := 0; i < co.Len(); i++ {
				scores = append(scores, costPos{
					ref:   ref,
					query: query,
					cost:  cost[co.Type()],
				})
				consume := co.Type().Consumes()
				ref += consume.Reference
				query += consume.Query
			}
		}
		if len(scores) <= window {
			continue
		}
		smoothed := make([]costPos, len(scores)-window)
		for i := range scores[:len(scores)-window] {
			smoothed[i] = mean(scores[i : i+window])
		}

		var d deletion
		for i, v := range smoothed[1:] {
			switch {
			case d.record == nil && v.cost < 0 && smoothed[i].cost >= 0:
				d = deletion{record: r, rstart: v.ref + 1, qstart: v.query + 1}
			case d.record != nil && v.cost >= 0 && smoothed[i].cost < 0:
				d.rend = v.ref
				d.qend = v.query
				if d.rend-d.rstart >= min || d.qend-d.qstart >= min {
					gf.SeqName = d.record.Ref.Name()
					gf.FeatStrand = strandFor(d.record)
					if gf.FeatStrand == seq.Minus {
						len := d.record.Seq.Length
						d.qstart, d.qend = len-d.qend, len-d.qstart
					}

					// Adjust ends based on paired SW alignments.
					var refined bool
					d, refined, err = br.adjust(d)
					if err != nil && *verbose {
						log.Printf("failed alignment %s: %v", d.record.Name, err)
					}

					gf.FeatStart = d.rstart
					gf.FeatEnd = d.rend
					if gf.FeatStart == gf.FeatEnd {
						// This is disgusting garbage resulting from
						// GFF not allowing zero length features.
						gf.FeatEnd++
					}

					if refined {
						gf.FeatAttributes = gf.FeatAttributes[:2]
						gf.FeatAttributes[1].Value = strconv.Itoa(d.dup)
					} else {
						gf.FeatAttributes = gf.FeatAttributes[:1]
					}
					gf.FeatAttributes[0].Value = fmt.Sprintf("%s %d %d", d.record.Name, feat.ZeroToOne(d.qstart), d.qend)
					_, err = w.Write(gf)
					if err != nil {
						return err
					}
				}
				d.record = nil
			}
		}
	}
	return nil
}