Exemplo n.º 1
0
func main() {
	var (
		in           *fasta.Reader
		out, profile *os.File
		e            error
	)

	inName := flag.String("in", "", "Filename for input to be factorised. Defaults to stdin.")
	outName := flag.String("out", "", "Filename for output. Defaults to stdout.")
	k := flag.Int("k", 8, "kmer size to use.")
	cat := flag.Int("cat", 5, "number of categories.")
	iter := flag.Int("i", 1000, "iterations.")
	limit := flag.Int("time", 10, "time limit for NMF.")
	lo := flag.Int("lo", 1, "minimum number of kmer frequency to use in NMF.")
	hi := flag.Float64("hi", 0.5, "maximum proportion of kmer representation to use in NMF.")
	sf := flag.Float64("sf", 0.01, "factor for sparcity of estimating matrices for NMF.")
	tol := flag.Float64("tol", 0.001, "tolerance for NMF.")
	threads := flag.Int("threads", 2, "number of threads to use.")
	seed := flag.Int64("seed", -1, "seed for random number generator (-1 uses system clock).")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to this file.")
	help := flag.Bool("help", false, "print this usage message.")

	flag.Parse()

	if *help {
		flag.Usage()
		os.Exit(1)
	}

	runtime.GOMAXPROCS(*threads)
	sparse.MaxProcs = *threads
	fmt.Fprintf(os.Stderr, "Using %d threads.\n", runtime.GOMAXPROCS(0))
	if *cpuprofile != "" {
		if profile, e = os.Create(*cpuprofile); e != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.", e)
			os.Exit(0)
		}
		fmt.Fprintf(os.Stderr, "Writing CPU profile data to %s\n", *cpuprofile)
		pprof.StartCPUProfile(profile)
		defer pprof.StopCPUProfile()
	}

	if *inName == "" {
		fmt.Fprintln(os.Stderr, "Reading sequences from stdin.")
		in = fasta.NewReader(os.Stdin)
	} else if in, e = fasta.NewReaderName(*inName); e != nil {
		fmt.Fprintf(os.Stderr, "Error: %v.", e)
		os.Exit(0)
	} else {
		fmt.Fprintf(os.Stderr, "Reading sequence from `%s'.\n", *inName)
	}
	defer in.Close()

	if *outName == "" {
		fmt.Fprintln(os.Stderr, "Writing output to stdout.")
		out = os.Stdout
	} else if out, e = os.Create(*outName); e != nil {
		fmt.Fprintf(os.Stderr, "Error: %v.", e)
	} else {
		fmt.Fprintf(os.Stderr, "Writing output to `%s'.\n", *outName)
	}
	defer out.Close()

	totalkmers := make(map[kmerindex.Kmer]float64)
	kmerlists := make([]map[kmerindex.Kmer]float64, 0)
	seqTable := make([]string, 0)

	for {
		if sequence, err := in.Read(); err != nil {
			break
		} else {
			var freqs map[kmerindex.Kmer]float64
			if kindex, e := kmerindex.New(*k, sequence); e != nil {
				fmt.Fprintf(os.Stderr, "Error: %v.\n", e)
				fmt.Fprintln(os.Stderr)
				os.Exit(0)
			} else {
				freqs, _ = kindex.NormalisedKmerFrequencies()
				kmerlists = append(kmerlists, freqs)
				for kmer, freq := range freqs {
					totalkmers[kmer] += freq
				}
			}
			seqTable = append(seqTable, string(sequence.ID))
		}
	}

	kmerArray := make([][]float64, 0)
	kmerTable := make([]kmerindex.Kmer, 0)

	for kmer, _ := range totalkmers {
		var count int
		for _, kmerlist := range kmerlists {
			if kmerlist[kmer] > 0 {
				count++
			}
		}
		if count < *lo || float64(count)/float64(len(kmerlists)) > *hi {
			continue
		}
		row := make([]float64, len(kmerlists))
		for i, kmerlist := range kmerlists {
			row[i] = float64(kmerlist[kmer])
		}
		kmerArray = append(kmerArray, row)
		kmerTable = append(kmerTable, kmer)
	}

	var kmerMatrix *sparse.Sparse
	func() {
		defer func() {
			if r := recover(); r != nil {
				fmt.Fprintf(os.Stderr, "Error: %v.", r)
				os.Exit(0)
			}
		}()
		kmerMatrix = sparse.Matrix(kmerArray)
	}()

	f := func(i, j int, v float64) float64 {
		if kmerMatrix.At(i, j) != 0 {
			return 1
		}
		return 0
	}
	nonZero := kmerMatrix.Apply(f).Sum()

	r, c := kmerMatrix.Dims()
	density := nonZero / float64(r*c)

	if *seed == -1 {
		*seed = time.Now().UnixNano()
	}
	fmt.Fprintf(os.Stderr, "Using %v as random seed.\n", *seed)
	rand.Seed(*seed)

	rows, cols := kmerMatrix.Dims()
	Wo := sparse.Random(rows, *cat, density**sf)
	Ho := sparse.Random(*cat, cols, density**sf)

	fmt.Fprintf(os.Stderr, "Dimensions of Kmer matrix = (%v, %v)\nDensity = %.3f %%\n%v\n", r, c, (density)*100, kmerMatrix)

	W, H, ok := nmf.Factors(kmerMatrix, Wo, Ho, *tol, *iter, time.Duration(*limit)*1e9)

	fmt.Fprintf(os.Stderr, "norm(H) = %v norm(W) = %v\n\nFinished = %v\n\n", H.Norm(matrix.Fro), W.Norm(matrix.Fro), ok)

	printFeature(out, kmerMatrix, W, H, seqTable, kmerTable, *k)
}
Exemplo n.º 2
0
// main reads a delimited numeric table, factorises it with NMF (optionally
// several replicates) and hands each pair of factors to printFeature.
//
// The first line of the table is a header whose first field is discarded and
// whose remaining fields are the column names. Every following line holds a
// row name followed by one float per column. Fields are separated by -sep in
// both the header and the data rows. Input comes from the -in file (stdin
// when empty) and output goes to the -out file (stdout when empty).
// Diagnostics are written to stderr and every failure terminates the process
// with a non-zero exit status; note that os.Exit does not run deferred calls.
func main() {
	inName := flag.String("in", "", "Filename for input to be factorised. Defaults to stdin.")
	outName := flag.String("out", "", "Filename for output. Defaults to stdout.")
	transpose := flag.Bool("t", false, "Transpose columns and rows.")
	sep := flag.String("sep", "\t", "Column delimiter.")
	cat := flag.Int("cat", 5, "number of categories.")
	iter := flag.Int("i", 1000, "iterations.")
	rep := flag.Int("rep", 1, "Resample replicates.")
	limit := flag.Int("time", 10, "time limit for NMF.")
	sf := flag.Float64("sf", 0.01, "factor for sparcity of estimating matrices for NMF.")
	tol := flag.Float64("tol", 0.001, "tolerance for NMF.")
	threads := flag.Int("threads", 2, "number of threads to use.")
	seed := flag.Int64("seed", -1, "seed for random number generator (-1 uses system clock).")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to this file.")
	help := flag.Bool("help", false, "print this usage message.")

	flag.Parse()

	if *help {
		flag.Usage()
		os.Exit(1)
	}

	runtime.GOMAXPROCS(*threads)
	sparse.MaxProcs = *threads
	fmt.Fprintf(os.Stderr, "Using %d threads.\n", runtime.GOMAXPROCS(0))

	if *cpuprofile != "" {
		profile, err := os.Create(*cpuprofile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.", err)
			os.Exit(1)
		}
		// Deferred calls run last-in first-out, so the profile is stopped
		// (and flushed) before its file is closed.
		defer profile.Close()
		fmt.Fprintf(os.Stderr, "Writing CPU profile data to %s\n", *cpuprofile)
		if err := pprof.StartCPUProfile(profile); err != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.", err)
			os.Exit(1)
		}
		defer pprof.StopCPUProfile()
	}

	var (
		in  *bufio.Reader
		out *bufio.Writer
	)

	if *inName == "" {
		fmt.Fprintln(os.Stderr, "Reading table from stdin.")
		in = bufio.NewReader(os.Stdin)
	} else if f, err := os.Open(*inName); err != nil {
		fmt.Fprintf(os.Stderr, "Error: %v.", err)
		os.Exit(1)
	} else {
		defer f.Close()
		in = bufio.NewReader(f)
		fmt.Fprintf(os.Stderr, "Reading table from `%s'.\n", *inName)
	}

	if *outName == "" {
		fmt.Fprintln(os.Stderr, "Writing output to stdout.")
		out = bufio.NewWriter(os.Stdout)
	} else if f, err := os.Create(*outName); err != nil {
		fmt.Fprintf(os.Stderr, "Error: %v.", err)
		os.Exit(1)
	} else {
		defer f.Close()
		out = bufio.NewWriter(f)
		fmt.Fprintf(os.Stderr, "Writing output to `%s'.\n", *outName)
	}
	// Registered after the file's Close, so the buffer is flushed first.
	defer out.Flush()

	var (
		colNames, rowNames []string
		array              [][]float64
	)

	// Header line: split on the same delimiter as the data rows and drop the
	// first field, which sits above the row names.
	// TrimSpace also strips a leading or trailing whitespace delimiter, so an
	// empty first or last field is lost when -sep is whitespace.
	header, err := in.ReadString('\n')
	if err != nil {
		fmt.Fprint(os.Stderr, "No table to read\n\n")
		os.Exit(1)
	}
	if fields := strings.Split(strings.TrimSpace(header), *sep); len(fields) > 0 {
		colNames = fields[1:]
	}

	// count numbers the data rows from 1 for use in error messages.
	for count := 1; ; count++ {
		// ReadString returns whatever was read together with the error, so a
		// final row lacking a trailing newline arrives with a non-nil err and
		// must still be parsed.
		line, err := in.ReadString('\n')
		line = strings.TrimSpace(line)
		if err != nil && line == "" {
			break
		}

		row := strings.Split(line, *sep)
		if len(row) != len(colNames)+1 {
			fmt.Fprintf(os.Stderr, "Table row mismatch at line %d.\n", count)
			os.Exit(1)
		}
		rowData := make([]float64, len(row)-1)
		for i, val := range row[1:] {
			v, perr := strconv.ParseFloat(val, 64)
			if perr != nil {
				fmt.Fprintf(os.Stderr, "Float conversion error %v at line %d element %d.\n", perr, count, i)
				os.Exit(1)
			}
			rowData[i] = v
		}
		rowNames = append(rowNames, row[0])
		array = append(array, rowData)

		if err != nil {
			// That was the last, unterminated, row.
			break
		}
	}

	// With no rows or no columns the density below would be 0/0, so stop
	// with a clear message instead.
	if len(array) == 0 || len(colNames) == 0 {
		fmt.Fprintln(os.Stderr, "Error: table has no data to factorise.")
		os.Exit(1)
	}

	// sparse.Matrix is wrapped so that a panic raised while building the
	// matrix is reported as an ordinary error.
	var dataMatrix *sparse.Sparse
	func() {
		defer func() {
			if r := recover(); r != nil {
				fmt.Fprintf(os.Stderr, "Error: %v.", r)
				os.Exit(1)
			}
		}()
		dataMatrix = sparse.Matrix(array)
	}()

	// Map every non-zero element to 1 and sum to count the non-zero elements.
	// This is done before any transposition because the closure indexes the
	// matrix currently held in dataMatrix.
	f := func(i, j int, v float64) float64 {
		if dataMatrix.At(i, j) != 0 {
			return 1
		}
		return 0
	}
	nonZero := dataMatrix.Apply(f).Sum()

	if *transpose {
		colNames, rowNames = rowNames, colNames
		dataMatrix = dataMatrix.T()
	}
	r, c := dataMatrix.Dims()

	density := nonZero / float64(r*c)

	if *seed == -1 {
		*seed = time.Now().UnixNano()
	}
	fmt.Fprintf(os.Stderr, "Using %v as random seed.\n", *seed)
	rand.Seed(*seed)

	fmt.Fprintf(os.Stderr, "Dimensions of matrix = (%v, %v)\nDensity = %.3f %%\n%v\n", r, c, (density)*100, dataMatrix)

	for run := 0; run < *rep; run++ {
		if *rep > 1 {
			fmt.Fprintf(os.Stderr, "Replicate #%d\n", run+1)
		}
		// Each replicate starts from fresh random factor estimates whose
		// density is the data density scaled by -sf.
		Wo := sparse.Random(r, *cat, density**sf)
		Ho := sparse.Random(*cat, c, density**sf)

		// -time is given in seconds.
		W, H, ok := nmf.Factors(dataMatrix, Wo, Ho, *tol, *iter, time.Duration(*limit)*time.Second)

		fmt.Fprintf(os.Stderr, "norm(H) = %v norm(W) = %v\n\nFinished = %v\n\n", H.Norm(matrix.Fro), W.Norm(matrix.Fro), ok)

		printFeature(out, run, dataMatrix, W, H, rowNames, colNames)
	}
}