Example #1
// preprocessWithBuckets() reads the reads, creates the buckets, saves the buckets
// and their counts, and then encodes each read.
func preprocessWithBuckets(
	readFile string,
	outBaseName string,
	hash KmerHash,
) (*os.File, []string, []int) {
	// read the reads and flip as needed
	reads := readAndFlipReads(readFile, hash, flipReadsOption)

	readLength := len(reads[0].Seq)

	log.Printf("Estimated 2-bit encoding size: %d",
		uint64(math.Ceil(float64(2*len(reads)*readLength)/8.0)))

	// if the user wants the flipped-orientation bits written out
	waitForFlipped := make(chan struct{})
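	// the channel is closed when the flipped bits (if requested) have been written;
	// the same done-channel pattern is used for each writer goroutine below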
	if writeFlippedOption {
		outFlipped, err := os.Create(outBaseName + ".flipped")
		DIE_ON_ERR(err, "Couldn't create flipped file: %s", outBaseName+".flipped")
		defer outFlipped.Close()

		outFlippedZ, err := gzip.NewWriterLevel(outFlipped, gzip.BestCompression)
		DIE_ON_ERR(err, "Couldn't create gzipper for flipped file.")
		defer outFlippedZ.Close()

		flippedBits := bitio.NewWriter(outFlippedZ)
		defer flippedBits.Close()

		go func() {
			writeFlipped(flippedBits, reads)
			close(waitForFlipped)
		}()
	} else {
		close(waitForFlipped)
	}

	// if the user wants to write out the N positions, write them out
	waitForNs := make(chan struct{})
	if writeNsOption {
		outNs, err := os.Create(outBaseName + ".ns")
		DIE_ON_ERR(err, "Couldn't create N location file: %s", outBaseName+".ns")
		defer outNs.Close()

		outNsZ, err := gzip.NewWriterLevel(outNs, gzip.BestCompression)
		DIE_ON_ERR(err, "Couldn't create gzipper for N location file.")
		defer outNsZ.Close()

		go func() {
			writeNLocations(outNsZ, reads)
			close(waitForNs)
		}()
	} else {
		close(waitForNs)
	}

	// create the buckets and counts
	buckets, counts := listBuckets(reads)

	// write the bittree for the bucket names out to a file
	outBT, err := os.Create(outBaseName + ".bittree")
	DIE_ON_ERR(err, "Couldn't create bucket file: %s", outBaseName+".bittree")
	defer outBT.Close()

	// compress the file with gzip as we are writing it
	outBZ, err := gzip.NewWriterLevel(outBT, gzip.BestCompression)
	DIE_ON_ERR(err, "Couldn't create gzipper for bucket file")
	defer outBZ.Close()

	// create a writer that lets us write bits
	writer := bitio.NewWriter(outBZ)
	defer writer.Close()

	/*** The main work to encode the bucket names ***/
	waitForBuckets := make(chan struct{})
	go func() {
		encodeKmersToFile(buckets, writer)
		close(waitForBuckets)
	}()

	// write out the counts
	countF, err := os.Create(outBaseName + ".counts")
	DIE_ON_ERR(err, "Couldn't create counts file: %s", outBaseName+".counts")
	defer countF.Close()

	// compress it as we are writing it
	countZ, err := gzip.NewWriterLevel(countF, gzip.BestCompression)
	DIE_ON_ERR(err, "Couldn't create gzipper for count file")
	defer countZ.Close()

	/*** The main work to encode the bucket counts ***/
	waitForCounts := make(chan struct{})
	go func() {
		writeCounts(countZ, readLength, counts)
		close(waitForCounts)
	}()

	// create a temp file containing the processed reads
	processedFile, err := ioutil.TempFile("", "kpath-encode-")
	DIE_ON_ERR(err, "Couldn't create temporary file in %s", os.TempDir())
	md5Hash := md5.New()
	waitForTemp := make(chan struct{})
	go func() {
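		// write each processed read to the temp file, one sequence per line, folding it into the checksum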
		for i := range reads {
			md5Hash.Write(reads[i].Seq)
			processedFile.Write(reads[i].Seq)
			processedFile.Write([]byte{'\n'})
		}
		// rewind so the caller can read the processed reads back from the start
		processedFile.Seek(0, 0)
		close(waitForTemp)
	}()

	log.Printf("MD5 hash of reads = %x", md5Hash.Sum(nil))

	// Wait for each of the coders to finish
	<-waitForBuckets
	<-waitForCounts
	<-waitForNs
	<-waitForFlipped
	<-waitForTemp

	log.Printf("Done processing; reads are of length %d ...", readLength)
	return processedFile, buckets, counts
}
Example #2
// main() encodes or decodes a set of reads based on the first command line
// argument (which is either encode or decode).
func main() {
	fmt.Println("kpath  Copyright (C) 2014  Carl Kingsford & Rob Patro\n")

	fmt.Println("This program comes with ABSOLUTELY NO WARRANTY; This is free software, and")
	fmt.Println("you are welcome to redistribute it under certain conditions; see")
	fmt.Println("accompanying LICENSE.txt file.\n")

	log.Println("Starting kpath version 0.6.1 (6-19-14)")
	startTime := time.Now()

	log.Printf("Maximum threads = %v", maxThreads)
	runtime.GOMAXPROCS(maxThreads)

	// parse the command line
	const (
		ENCODE int = 1
		DECODE int = 2
	)
	if len(os.Args) < 2 {
		encodeFlags.PrintDefaults()
		os.Exit(1)
	}
	var mode int
	if os.Args[1][0] == 'e' {
		mode = ENCODE
		log.SetPrefix("kpath (encode): ")
	} else {
		mode = DECODE
		log.SetPrefix("kpath (decode): ")
	}
	encodeFlags.Parse(os.Args[2:])
	if globalK <= 0 || globalK > 16 {
		log.Fatalf("K must be specified as a small positive integer with -k")
	}
	log.Printf("Using kmer size = %d", globalK)
	setShiftKmerMask()

	if refFile == "" {
		log.Fatalf("Must specify gzipped fasta as reference with -ref")
	}

	if readFile == "" {
		log.Println("Must specify input file with -reads")
		log.Fatalln("If decoding, just give basename of encoded files.")
	}

	if outFile == "" {
		log.Println("Must specify output location with -out")
		log.Println("If encoding, omit extension.")
	}

	if cpuProfile != "" {
		log.Printf("Writing CPU profile to %s", cpuProfile)
		cpuF, err := os.Create(cpuProfile)
		DIE_ON_ERR(err, "Couldn't create CPU profile file %s", cpuProfile)
		pprof.StartCPUProfile(cpuF)
		defer pprof.StopCPUProfile()
	}

	// count the kmers in the reference
	var hash KmerHash
	waitForReference := make(chan struct{})
	go func() {
		refStart := time.Now()
		hash = countKmersInReference(globalK, refFile)
		log.Printf("There are %v unique %v-mers in the reference\n",
			len(hash), globalK)
		log.Printf("Time: Took %v seconds to read reference.",
			time.Now().Sub(refStart).Seconds())
		close(waitForReference)
	}()

	writeGlobalOptions()

	if mode == ENCODE {
		/* encode -k -ref -reads=FOO.seq -out=OUT
		   will encode into OUT.{enc,bittree,counts} */
		log.Printf("Reading from %s", readFile)
		log.Printf("Writing to %s, %s, %s",
			outFile+".enc", outFile+".bittree", outFile+".counts")

		// create the output file
		outF, err := os.Create(outFile + ".enc")
		DIE_ON_ERR(err, "Couldn't create output file %s", outFile)
		defer outF.Close()

		//outBuf := bufio.NewWriterSize(outF, 200000000)
		//defer outBuf.Flush()

		writer := bitio.NewWriter(outF)
		defer writer.Close()

		// create encoder
		encoder := arithc.NewEncoder(writer)
		defer encoder.Finish()

		// encode reads
		<-waitForReference
		tempReadFile, buckets, counts := preprocessWithBuckets(readFile, outFile, hash)
		n := encodeReadsFromTempFile(tempReadFile, buckets, counts, hash, encoder)
		log.Printf("Reads Flipped: %v", flipped)
		log.Printf("Encoded %v reads (may be < # of input reads due to duplicates).", n)

	} else {
		/* decode -k -ref -reads=FOO -out=OUT.seq
		   will look for FOO.enc, FOO.bittree, FOO.counts and decode into OUT.seq */

		tailsFN := readFile + ".enc"
		headsFN := readFile + ".bittree"
		countsFN := readFile + ".counts"

		log.Printf("Reading from %s, %s, and %s", tailsFN, headsFN, countsFN)

		// read the bucket names
		var kmers []string
		waitForBuckets := make(chan struct{})
		go func() {
			kmers = decodeKmersFromFile(headsFN, globalK)
			sort.Strings(kmers)
			close(waitForBuckets)
		}()

		// read the bucket counts
		var counts []int
		var readlen int
		waitForCounts := make(chan struct{})
		go func() {
			counts, readlen = readBucketCounts(countsFN)
			close(waitForCounts)
		}()

		// read the flipped bits --- flipped may be 0-length if no file could be
		// found; this indicates that either nothing was flipped or we don't
		// care about orientation
		var flipped []bool
		waitForFlipped := make(chan struct{})
		go func() {
			flipped = readFlipped(readFile + ".flipped")
			close(waitForFlipped)
		}()

		// read the NLocations, which might be 0-length if no file could be
		// found; this indicates that the Ns were recorded some other way.
		var NLocations [][]byte
		waitForNLocations := make(chan struct{})
		go func() {
			NLocations = readNLocations(readFile + ".ns")
			close(waitForNLocations)
		}()

		// open encoded read file
		encIn, err := os.Open(tailsFN)
		DIE_ON_ERR(err, "Can't open encoded read file %s", tailsFN)
		defer encIn.Close()

		readerBuf := bufio.NewReader(encIn)

		// create a bit reader wrapper around it
		reader := bitio.NewReader(readerBuf)
		defer reader.Close()

		// create a decoder around it
		decoder, err := arithc.NewDecoder(reader)
		DIE_ON_ERR(err, "Couldn't create decoder!")

		// create the output file
		log.Printf("Writing to %s", outFile)
		outF, err := os.Create(outFile)
		DIE_ON_ERR(err, "Couldn't create output file %s", outFile)
		defer outF.Close()

		<-waitForReference
		<-waitForBuckets
		<-waitForCounts
		<-waitForFlipped
		<-waitForNLocations
		log.Printf("Read length = %d", readlen)
		decodeReads(kmers, counts, flipped, NLocations, hash, readlen, outF, decoder)
	}
	log.Printf("Default interval used %v times and context used %v times",
		defaultIntervalSum, contextExists)

	endTime := time.Now()
	log.Printf("kpath took %v to run.", endTime.Sub(startTime).Seconds())

	/* UNCOMMENT TO DEBUG GARBAGE COLLECTION WITH GO 1.2
	   var stats debug.GCStats
	   stats.PauseQuantiles = make([]time.Duration, 5)
	   debug.ReadGCStats(&stats)
	   log.Printf("Last GC=%v\nNum GC=%v\nPause for GC=%v\nPauseHistory=%v",
	       stats.LastGC, stats.NumGC, stats.PauseTotal.Seconds(), stats.Pause)
	*/
}
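
The DIE_ON_ERR helper is called throughout both examples but is not defined in them. Below is a minimal sketch of what such a helper could look like, assuming it simply logs the formatted message and the error before exiting; the actual kpath implementation may differ.

// DIE_ON_ERR is a hypothetical sketch of the error helper used above:
// if err is non-nil, it logs the formatted message and the error, then exits.
func DIE_ON_ERR(err error, format string, args ...interface{}) {
	if err != nil {
		log.Printf(format, args...)
		log.Fatalln(err)
	}
}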