Beispiel #1
0
// readFlipped() loads the gzip-compressed bit stream stored in flippedFN, in
// which each bit indicates whether a read was flipped. A set bit is returned
// as true and a clear bit as false, in stream order. If the file cannot be
// opened, a message is logged and nil is returned.
func readFlipped(flippedFN string) []bool {
	src, err := os.Open(flippedFN)
	if err != nil {
		// nothing readable at that path: report it and hand back nothing
		log.Printf("No flipped bit file (%s) found; ignoring.", flippedFN)
		return nil
	}
	log.Printf("Reading flipped bits from %s", flippedFN)
	defer src.Close()

	// file -> gunzip -> buffered bit reader
	unzipper, err := gzip.NewReader(src)
	DIE_ON_ERR(err, "Couldn't create unzipper for flipped file")
	defer unzipper.Close()

	bitSrc := bitio.NewReader(bufio.NewReader(unzipper))
	defer bitSrc.Close()

	// pull bits one at a time; the first error of any kind ends the stream
	result := make([]bool, 0, 1000000)
	for bit, readErr := bitSrc.ReadBit(); readErr == nil; bit, readErr = bitSrc.ReadBit() {
		result = append(result, bit > 0)
	}
	log.Printf("Read %d bits indicating whether reads were flipped.", len(result))
	return result
}
Beispiel #2
0
// decodeKmersFromFile() reads the gzipped bittree file `filename`, runs its
// bits through decodeBitTree() with kmer length k, and returns every string
// the decoder emits, in the order they arrive. Failure to open or unzip the
// file is handed to DIE_ON_ERR.
func decodeKmersFromFile(filename string, k int) []string {
	log.Printf("Decoding kmer buckets from %v", filename)

	// file -> gunzip -> buffered bit reader
	fh, err := os.Open(filename)
	DIE_ON_ERR(err, "Couldn't open bitree file %s", filename)
	defer fh.Close()

	gz, err := gzip.NewReader(fh)
	DIE_ON_ERR(err, "Couldn't create gzipper")
	defer gz.Close()

	bitReader := bitio.NewReader(bufio.NewReader(gz))
	defer bitReader.Close()

	// two-stage pipeline: readBits feeds bitCh, decodeBitTree consumes it and
	// publishes decoded kmers on kmerCh
	bitCh := make(chan byte, 1000000)
	kmerCh := make(chan string, 1000000)
	go readBits(bitReader, bitCh)
	go decodeBitTree(bitCh, k, kmerCh)

	// drain the output channel until it is closed
	decoded := make([]string, 0)
	for kmer := range kmerCh {
		decoded = append(decoded, kmer)
	}
	log.Printf("done; found %v kmers", len(decoded))
	return decoded
}
Beispiel #3
0
// main() encodes or decodes a set of reads based on the first command line
// argument: an argument starting with 'e' selects encoding, anything else
// selects decoding. The remaining arguments are parsed with encodeFlags.
//
//	encode -k -ref -reads=FOO.seq -out=OUT   writes OUT.{enc,bittree,counts}
//	decode -k -ref -reads=FOO -out=OUT.seq   reads FOO.{enc,bittree,counts}
//
// The reference kmers are counted in a background goroutine while the other
// inputs are opened; both modes wait for it before processing reads. The
// process exits with a fatal log message if -k, -ref, -reads or -out is
// missing or invalid.
func main() {
	// Println already appends a newline; the bare Println() calls emit the
	// blank separator lines.
	fmt.Println("kpath  Copyright (C) 2014  Carl Kingsford & Rob Patro")
	fmt.Println()

	fmt.Println("This program comes with ABSOLUTELY NO WARRANTY; This is free software, and")
	fmt.Println("you are welcome to redistribute it under certain conditions; see")
	fmt.Println("accompanying LICENSE.txt file.")
	fmt.Println()

	log.Println("Starting kpath version 0.6.1 (6-19-14)")
	startTime := time.Now()

	log.Printf("Maximum threads = %v", maxThreads)
	runtime.GOMAXPROCS(maxThreads)

	// parse the command line
	const (
		ENCODE int = 1
		DECODE int = 2
	)
	// An empty mode argument is treated like a missing one; indexing its
	// first byte below would otherwise panic.
	if len(os.Args) < 2 || len(os.Args[1]) == 0 {
		encodeFlags.PrintDefaults()
		os.Exit(1)
	}
	var mode int
	if os.Args[1][0] == 'e' {
		mode = ENCODE
		log.SetPrefix("kpath (encode): ")
	} else {
		mode = DECODE
		log.SetPrefix("kpath (decode): ")
	}
	encodeFlags.Parse(os.Args[2:])
	if globalK <= 0 || globalK > 16 {
		log.Fatalf("K must be specified as a small positive integer with -k")
	}
	log.Printf("Using kmer size = %d", globalK)
	setShiftKmerMask()

	if refFile == "" {
		log.Fatalf("Must specify gzipped fasta as reference with -ref")
	}

	if readFile == "" {
		log.Println("Must specify input file with -reads")
		log.Fatalln("If decoding, just give basename of encoded files.")
	}

	// A missing output location is fatal, like the other required options;
	// continuing would try to create a file from an empty name.
	if outFile == "" {
		log.Println("Must specify output location with -out")
		log.Fatalln("If encoding, omit extension.")
	}

	if cpuProfile != "" {
		log.Printf("Writing CPU profile to %s", cpuProfile)
		cpuF, err := os.Create(cpuProfile)
		DIE_ON_ERR(err, "Couldn't create CPU profile file %s", cpuProfile)
		// Deferred before StopCPUProfile so that, in LIFO order, the profile
		// is stopped first and the file is closed afterwards.
		defer cpuF.Close()
		DIE_ON_ERR(pprof.StartCPUProfile(cpuF), "Couldn't start CPU profile")
		defer pprof.StopCPUProfile()
	}

	// count the kmers in the reference; hash must not be read until
	// waitForReference has been closed
	var hash KmerHash
	waitForReference := make(chan struct{})
	go func() {
		refStart := time.Now()
		hash = countKmersInReference(globalK, refFile)
		log.Printf("There are %v unique %v-mers in the reference\n",
			len(hash), globalK)
		log.Printf("Time: Took %v seconds to read reference.",
			time.Since(refStart).Seconds())
		close(waitForReference)
	}()

	writeGlobalOptions()

	if mode == ENCODE {
		/* encode -k -ref -reads=FOO.seq -out=OUT
		   will encode into OUT.{enc,bittree,counts} */
		encFN := outFile + ".enc"
		log.Printf("Reading from %s", readFile)
		log.Printf("Writing to %s, %s, %s",
			encFN, outFile+".bittree", outFile+".counts")

		// create the output file
		outF, err := os.Create(encFN)
		DIE_ON_ERR(err, "Couldn't create output file %s", encFN)
		defer outF.Close()

		//outBuf := bufio.NewWriterSize(outF, 200000000)
		//defer outBuf.Flush()

		writer := bitio.NewWriter(outF)
		defer writer.Close()

		// create encoder; the deferred calls run in reverse order, so the
		// encoder is finished before the bit writer and file are closed
		encoder := arithc.NewEncoder(writer)
		defer encoder.Finish()

		// encode reads
		<-waitForReference
		tempReadFile, buckets, counts := preprocessWithBuckets(readFile, outFile, hash)
		n := encodeReadsFromTempFile(tempReadFile, buckets, counts, hash, encoder)
		log.Printf("Reads Flipped: %v", flipped)
		log.Printf("Encoded %v reads (may be < # of input reads due to duplicates).", n)

	} else {
		/* decode -k -ref -reads=FOO -out=OUT.seq
		   will look for FOO.enc, FOO.bittree, FOO.counts and decode into OUT.seq */

		tailsFN := readFile + ".enc"
		headsFN := readFile + ".bittree"
		countsFN := readFile + ".counts"

		log.Printf("Reading from %s, %s, and %s", tailsFN, headsFN, countsFN)

		// Each input below is loaded in its own goroutine; the matching
		// waitFor* channel is closed once the result variable has been set.

		// read the bucket names
		var kmers []string
		waitForBuckets := make(chan struct{})
		go func() {
			kmers = decodeKmersFromFile(headsFN, globalK)
			sort.Strings(kmers)
			close(waitForBuckets)
		}()

		// read the bucket counts
		var counts []int
		var readlen int
		waitForCounts := make(chan struct{})
		go func() {
			counts, readlen = readBucketCounts(countsFN)
			close(waitForCounts)
		}()

		// read the flipped bits --- flipped may be 0-length if no file could
		// be found; this indicates that either nothing was flipped or we
		// don't care about orientation
		var flipped []bool
		waitForFlipped := make(chan struct{})
		go func() {
			flipped = readFlipped(readFile + ".flipped")
			close(waitForFlipped)
		}()

		// read the NLocations, which might be 0-length if no file could be
		// found; this indicates that the Ns were recorded some other way.
		var NLocations [][]byte
		waitForNLocations := make(chan struct{})
		go func() {
			NLocations = readNLocations(readFile + ".ns")
			close(waitForNLocations)
		}()

		// open encoded read file
		encIn, err := os.Open(tailsFN)
		DIE_ON_ERR(err, "Can't open encoded read file %s", tailsFN)
		defer encIn.Close()

		readerBuf := bufio.NewReader(encIn)

		// create a bit reader wrapper around it
		reader := bitio.NewReader(readerBuf)
		defer reader.Close()

		// create a decoder around it
		decoder, err := arithc.NewDecoder(reader)
		DIE_ON_ERR(err, "Couldn't create decoder!")

		// create the output file
		log.Printf("Writing to %s", outFile)
		outF, err := os.Create(outFile)
		DIE_ON_ERR(err, "Couldn't create output file %s", outFile)
		defer outF.Close()

		// block until every background loader has published its result
		<-waitForReference
		<-waitForBuckets
		<-waitForCounts
		<-waitForFlipped
		<-waitForNLocations
		log.Printf("Read length = %d", readlen)
		decodeReads(kmers, counts, flipped, NLocations, hash, readlen, outF, decoder)
	}
	log.Printf("Default interval used %v times and context used %v times",
		defaultIntervalSum, contextExists)

	log.Printf("kpath took %v to run.", time.Since(startTime).Seconds())

	/* UNCOMMENT TO DEBUG GARBAGE COLLECTION WITH GO 1.2
	   var stats debug.GCStats
	   stats.PauseQuantiles = make([]time.Duration, 5)
	   debug.ReadGCStats(&stats)
	   log.Printf("Last GC=%v\nNum GC=%v\nPause for GC=%v\nPauseHistory=%v",
	       stats.LastGC, stats.NumGC, stats.PauseTotal.Seconds(), stats.Pause)
	*/
}