Beispiel #1
0
// printFeature writes a textual report of a factorisation (W, H) to out.
//
// For every pattern i (a row of H and the matching column of W) it prints:
//   - a "Feature i:" header,
//   - a bracketed list of "kmer/weight" entries for every kmer whose weight in
//     column i of W is positive, in the order produced by sorting a WeightList
//     (ordering is defined by WeightList elsewhere — presumably heaviest first),
//   - one "sequence/weight" line for every sequence whose weight in row i of H
//     is positive, in the same sorted order, followed by two blank lines.
//
// After all patterns, one line per sequence reports the first entry of that
// sequence's sorted per-pattern weights as "name/weight: patternIndex".
//
// V is not read; it is kept so existing callers keep compiling. seqTable is
// indexed by column of H, kmerTable by row of W, and k is handed to
// kmerindex.Stringify when rendering a kmer.
func printFeature(out *os.File, V, W, H *sparse.Sparse, seqTable []string, kmerTable []kmerindex.Kmer, k int) {
	patternCount, seqCount := H.Dims()
	kmerCount, _ := W.Dims()

	// hipats[j] accumulates the weight of sequence j under every pattern.
	hipats := make([]WeightList, seqCount)

	for i := 0; i < patternCount; i++ {
		fmt.Fprintf(out, "Feature %v:\n", i)

		klist := make(WeightList, 0, kmerCount)
		for j := 0; j < kmerCount; j++ {
			klist = append(klist, Weight{weight: W.At(j, i), index: j})
		}
		sort.Sort(&klist)

		// Build the line in a byte buffer: repeated string += is quadratic in
		// the number of kmers.
		name := []byte("[")
		for _, kw := range klist {
			if kw.weight > 0 {
				name = append(name, fmt.Sprintf(" %s/%.3e ", kmerindex.Stringify(k, kmerTable[kw.index]), kw.weight)...)
			}
		}
		name = append(name, "]\n"...)
		out.Write(name)

		slist := make(WeightList, 0, seqCount)
		for j := 0; j < seqCount; j++ {
			w := H.At(i, j)
			slist = append(slist, Weight{weight: w, index: j})
			hipats[j] = append(hipats[j], Weight{weight: w, index: i})
		}
		sort.Sort(&slist)

		var instances []byte
		for _, sw := range slist {
			if sw.weight > 0 {
				instances = append(instances, fmt.Sprintf("%s/%.3e\n", seqTable[sw.index], sw.weight)...)
			}
		}
		// One newline terminates the instance list, the second is the blank
		// separator line between features.
		instances = append(instances, "\n\n"...)
		out.Write(instances)
	}

	for j, seq := range hipats {
		if len(seq) == 0 {
			// H has no pattern rows, so there is no top pattern to report and
			// seq[0] would be out of range.
			continue
		}
		sort.Sort(&seq)
		fmt.Fprintf(out, "%s/%e: %d\n", seqTable[j], seq[0].weight, seq[0].index)
	}
}
Beispiel #2
0
// printFeature writes one tab-separated line per pattern of the factorisation
// (W, H) to out, and a per-column summary to os.Stderr.
//
// Each line written to out has the form
//
//	run \t [row/weight,row/weight,...] \t (col/weight,col/weight,...)
//
// where the bracketed list holds every row with a positive weight in column i
// of W and the parenthesised list holds every column with a positive weight in
// row i of H. Both lists follow the order produced by sorting a WeightList
// (ordering is defined by WeightList elsewhere — presumably heaviest first).
//
// After all patterns, one line per column is written to os.Stderr giving the
// first entry of that column's sorted per-pattern weights as
// "name/weight: patternIndex".
//
// V is not read; it is kept so existing callers keep compiling. rowNames is
// indexed by row of W and colNames by column of H.
func printFeature(out io.Writer, run int, V, W, H *sparse.Sparse, rowNames, colNames []string) {
	patternCount, colCount := H.Dims()
	rowCount, _ := W.Dims()

	// hipats[j] accumulates the weight of column j under every pattern.
	hipats := make([]WeightList, colCount)

	for i := 0; i < patternCount; i++ {
		rlist := make(WeightList, 0, rowCount)
		for j := 0; j < rowCount; j++ {
			rlist = append(rlist, Weight{weight: W.At(j, i), index: j})
		}
		sort.Sort(&rlist)

		var name []string
		for _, rw := range rlist {
			if rw.weight > 0 {
				name = append(name, fmt.Sprintf("%s/%.3e", rowNames[rw.index], rw.weight))
			}
		}
		nameString := strings.Join(name, ",")

		clist := make(WeightList, 0, colCount)
		for j := 0; j < colCount; j++ {
			w := H.At(i, j)
			clist = append(clist, Weight{weight: w, index: j})
			hipats[j] = append(hipats[j], Weight{weight: w, index: i})
		}
		sort.Sort(&clist)

		var instances []string
		for _, cw := range clist {
			if cw.weight > 0 {
				instances = append(instances, fmt.Sprintf("%s/%.3e", colNames[cw.index], cw.weight))
			}
		}
		instanceString := strings.Join(instances, ",")

		fmt.Fprintf(out, "%d\t[%s]\t(%s)\n", run, nameString, instanceString)
	}

	for j, col := range hipats {
		if len(col) == 0 {
			// H has no pattern rows, so there is no top pattern to report and
			// col[0] would be out of range.
			continue
		}
		sort.Sort(&col)
		fmt.Fprintf(os.Stderr, "%s/%e: %d\n", colNames[j], col[0].weight, col[0].index)
	}
}
Beispiel #3
0
// main reads sequences from a FASTA source, builds a kmer-by-sequence matrix
// of normalised kmer frequencies, factorises it with nmf.Factors and prints
// the resulting features with printFeature.
//
// Processing steps:
//  1. Parse flags, set the thread count and optionally start a CPU profile.
//  2. Open the input (stdin by default) and output (stdout by default).
//  3. For every sequence compute its normalised kmer frequencies.
//  4. Keep only kmers present in at least -lo sequences and in at most a -hi
//     proportion of all sequences; each kept kmer becomes one matrix row.
//  5. Initialise W and H with random sparse matrices whose density is the
//     data density scaled by -sf, then factorise within the -tol, -i and
//     -time (seconds) limits.
//
// Any failure is reported on stderr and terminates the process with exit
// status 1. Note that os.Exit does not run deferred calls.
func main() {
	var (
		in           *fasta.Reader
		out, profile *os.File
		e            error
	)

	inName := flag.String("in", "", "Filename for input to be factorised. Defaults to stdin.")
	outName := flag.String("out", "", "Filename for output. Defaults to stdout.")
	k := flag.Int("k", 8, "kmer size to use.")
	cat := flag.Int("cat", 5, "number of categories.")
	iter := flag.Int("i", 1000, "iterations.")
	limit := flag.Int("time", 10, "time limit for NMF.")
	lo := flag.Int("lo", 1, "minimum number of kmer frequency to use in NMF.")
	hi := flag.Float64("hi", 0.5, "maximum proportion of kmer representation to use in NMF.")
	sf := flag.Float64("sf", 0.01, "factor for sparcity of estimating matrices for NMF.")
	tol := flag.Float64("tol", 0.001, "tolerance for NMF.")
	threads := flag.Int("threads", 2, "number of threads to use.")
	seed := flag.Int64("seed", -1, "seed for random number generator (-1 uses system clock).")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to this file.")
	help := flag.Bool("help", false, "print this usage message.")

	flag.Parse()

	if *help {
		flag.Usage()
		os.Exit(1)
	}

	runtime.GOMAXPROCS(*threads)
	sparse.MaxProcs = *threads
	fmt.Fprintf(os.Stderr, "Using %d threads.\n", runtime.GOMAXPROCS(0))
	if *cpuprofile != "" {
		if profile, e = os.Create(*cpuprofile); e != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.", e)
			os.Exit(1)
		}
		fmt.Fprintf(os.Stderr, "Writing CPU profile data to %s\n", *cpuprofile)
		if e = pprof.StartCPUProfile(profile); e != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.", e)
			os.Exit(1)
		}
		defer pprof.StopCPUProfile()
	}

	if *inName == "" {
		fmt.Fprintln(os.Stderr, "Reading sequences from stdin.")
		in = fasta.NewReader(os.Stdin)
	} else if in, e = fasta.NewReaderName(*inName); e != nil {
		fmt.Fprintf(os.Stderr, "Error: %v.", e)
		os.Exit(1)
	} else {
		fmt.Fprintf(os.Stderr, "Reading sequence from `%s'.\n", *inName)
	}
	defer in.Close()

	if *outName == "" {
		fmt.Fprintln(os.Stderr, "Writing output to stdout.")
		out = os.Stdout
	} else if out, e = os.Create(*outName); e != nil {
		// Without an output file there is nowhere to write results; carrying
		// on would only run the factorisation and then discard it.
		fmt.Fprintf(os.Stderr, "Error: %v.", e)
		os.Exit(1)
	} else {
		fmt.Fprintf(os.Stderr, "Writing output to `%s'.\n", *outName)
	}
	defer out.Close()

	// totalkmers is used as the set of every kmer seen in any sequence;
	// kmerlists[s] holds the per-kmer frequencies of sequence s.
	totalkmers := make(map[kmerindex.Kmer]float64)
	kmerlists := make([]map[kmerindex.Kmer]float64, 0)
	seqTable := make([]string, 0)

	for {
		sequence, readErr := in.Read()
		if readErr != nil {
			// NOTE(review): every read error ends the input, not only end of
			// file — confirm whether other errors should be reported.
			break
		}
		kindex, err := kmerindex.New(*k, sequence)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.\n", err)
			fmt.Fprintln(os.Stderr)
			os.Exit(1)
		}
		freqs, _ := kindex.NormalisedKmerFrequencies()
		kmerlists = append(kmerlists, freqs)
		for kmer, freq := range freqs {
			totalkmers[kmer] += freq
		}
		seqTable = append(seqTable, string(sequence.ID))
	}

	kmerArray := make([][]float64, 0)
	kmerTable := make([]kmerindex.Kmer, 0)

	// NOTE(review): map iteration order is unspecified, so the row order of
	// the matrix (and of kmerTable) can differ between runs even with a fixed
	// seed.
	for kmer := range totalkmers {
		// count is the number of sequences in which this kmer occurs.
		var count int
		for _, kmerlist := range kmerlists {
			if kmerlist[kmer] > 0 {
				count++
			}
		}
		if count < *lo || float64(count)/float64(len(kmerlists)) > *hi {
			continue
		}
		row := make([]float64, len(kmerlists))
		for i, kmerlist := range kmerlists {
			row[i] = kmerlist[kmer]
		}
		kmerArray = append(kmerArray, row)
		kmerTable = append(kmerTable, kmer)
	}

	var kmerMatrix *sparse.Sparse
	func() {
		// A panic raised while building the matrix is turned into a regular
		// error exit.
		defer func() {
			if r := recover(); r != nil {
				fmt.Fprintf(os.Stderr, "Error: %v.", r)
				os.Exit(1)
			}
		}()
		kmerMatrix = sparse.Matrix(kmerArray)
	}()

	// Map every non-zero element to 1 so that the sum counts non-zero cells.
	f := func(i, j int, v float64) float64 {
		if kmerMatrix.At(i, j) != 0 {
			return 1
		}
		return 0
	}
	nonZero := kmerMatrix.Apply(f).Sum()

	rows, cols := kmerMatrix.Dims()
	density := nonZero / float64(rows*cols)

	if *seed == -1 {
		*seed = time.Now().UnixNano()
	}
	fmt.Fprintf(os.Stderr, "Using %v as random seed.\n", *seed)
	rand.Seed(*seed)

	Wo := sparse.Random(rows, *cat, density**sf)
	Ho := sparse.Random(*cat, cols, density**sf)

	fmt.Fprintf(os.Stderr, "Dimensions of Kmer matrix = (%v, %v)\nDensity = %.3f %%\n%v\n", rows, cols, (density)*100, kmerMatrix)

	W, H, ok := nmf.Factors(kmerMatrix, Wo, Ho, *tol, *iter, time.Duration(*limit)*time.Second)

	fmt.Fprintf(os.Stderr, "norm(H) = %v norm(W) = %v\n\nFinished = %v\n\n", H.Norm(matrix.Fro), W.Norm(matrix.Fro), ok)

	printFeature(out, kmerMatrix, W, H, seqTable, kmerTable, *k)
}
Beispiel #4
0
// main reads a delimited numeric table, factorises it with nmf.Factors and
// prints the resulting features with printFeature, once per replicate.
//
// Input format: the first line is a header whose first field is skipped and
// whose remaining fields are the column names; every following line starts
// with a row name followed by one float per column. Fields are separated by
// -sep on every line, including the header. With -t the table is transposed
// before factorisation.
//
// For each of the -rep replicates W and H are initialised with random sparse
// matrices whose density is the data density scaled by -sf, and the
// factorisation runs within the -tol, -i and -time (seconds) limits.
//
// Any failure is reported on stderr and terminates the process with exit
// status 1. Note that os.Exit does not run deferred calls.
func main() {
	var (
		in      *bufio.Reader
		out     *bufio.Writer
		profile *os.File
		e       error
	)

	inName := flag.String("in", "", "Filename for input to be factorised. Defaults to stdin.")
	outName := flag.String("out", "", "Filename for output. Defaults to stdout.")
	transpose := flag.Bool("t", false, "Transpose columns and rows.")
	sep := flag.String("sep", "\t", "Column delimiter.")
	cat := flag.Int("cat", 5, "number of categories.")
	iter := flag.Int("i", 1000, "iterations.")
	rep := flag.Int("rep", 1, "Resample replicates.")
	limit := flag.Int("time", 10, "time limit for NMF.")
	sf := flag.Float64("sf", 0.01, "factor for sparcity of estimating matrices for NMF.")
	tol := flag.Float64("tol", 0.001, "tolerance for NMF.")
	threads := flag.Int("threads", 2, "number of threads to use.")
	seed := flag.Int64("seed", -1, "seed for random number generator (-1 uses system clock).")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to this file.")
	help := flag.Bool("help", false, "print this usage message.")

	flag.Parse()

	if *help {
		flag.Usage()
		os.Exit(1)
	}

	runtime.GOMAXPROCS(*threads)
	sparse.MaxProcs = *threads
	fmt.Fprintf(os.Stderr, "Using %d threads.\n", runtime.GOMAXPROCS(0))
	if *cpuprofile != "" {
		if profile, e = os.Create(*cpuprofile); e != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.", e)
			os.Exit(1)
		}
		fmt.Fprintf(os.Stderr, "Writing CPU profile data to %s\n", *cpuprofile)
		if e = pprof.StartCPUProfile(profile); e != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.", e)
			os.Exit(1)
		}
		defer pprof.StopCPUProfile()
	}

	if *inName == "" {
		fmt.Fprintln(os.Stderr, "Reading table from stdin.")
		in = bufio.NewReader(os.Stdin)
	} else {
		f, err := os.Open(*inName)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.", err)
			os.Exit(1)
		}
		defer f.Close()
		in = bufio.NewReader(f)
		fmt.Fprintf(os.Stderr, "Reading table from `%s'.\n", *inName)
	}

	if *outName == "" {
		fmt.Fprintln(os.Stderr, "Writing output to stdout.")
		out = bufio.NewWriter(os.Stdout)
	} else {
		f, err := os.Create(*outName)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error: %v.", err)
			os.Exit(1)
		}
		defer f.Close()
		out = bufio.NewWriter(f)
		fmt.Fprintf(os.Stderr, "Writing output to `%s'.\n", *outName)
	}
	// Deferred last so that it runs first: buffered output must be flushed
	// before the underlying file is closed.
	defer out.Flush()

	var colNames, rowNames []string
	array := make([][]float64, 0)

	header, err := in.ReadString('\n')
	if err != nil {
		fmt.Fprintln(os.Stderr, "No table to read")
		fmt.Fprintln(os.Stderr)
		os.Exit(1)
	}
	// The header must be split on the same delimiter as the data rows,
	// otherwise any -sep other than tab fails the column-count check below.
	// The first header field sits above the row names and is dropped.
	colNames = strings.Split(strings.TrimSpace(header), *sep)[1:]

	// count is the 1-based line number in the input; the header was line 1.
	for count := 2; ; count++ {
		line, readErr := in.ReadString('\n')
		line = strings.TrimSpace(line)
		if readErr != nil && line == "" {
			break
		}
		// When readErr is set but line is not empty, this is a final row
		// without a trailing newline: process it, then stop.
		// NOTE(review): a non-EOF read error that returns partial data is
		// treated the same way.

		row := strings.Split(line, *sep)
		if len(row) != len(colNames)+1 {
			fmt.Fprintf(os.Stderr, "Table row mismatch at line %d.\n", count)
			os.Exit(1)
		}
		rowData := make([]float64, len(row)-1)
		for i, val := range row[1:] {
			if rowData[i], e = strconv.ParseFloat(val, 64); e != nil {
				fmt.Fprintf(os.Stderr, "Float conversion error %v at line %d element %d.\n", e, count, i)
				os.Exit(1)
			}
		}
		rowNames = append(rowNames, row[0])
		array = append(array, rowData)

		if readErr != nil {
			break
		}
	}

	var dataMatrix *sparse.Sparse
	func() {
		// A panic raised while building the matrix is turned into a regular
		// error exit.
		defer func() {
			if r := recover(); r != nil {
				fmt.Fprintf(os.Stderr, "Error: %v.", r)
				os.Exit(1)
			}
		}()
		dataMatrix = sparse.Matrix(array)
	}()

	// Map every non-zero element to 1 so that the sum counts non-zero cells.
	f := func(i, j int, v float64) float64 {
		if dataMatrix.At(i, j) != 0 {
			return 1
		}
		return 0
	}
	nonZero := dataMatrix.Apply(f).Sum()

	if *transpose {
		colNames, rowNames = rowNames, colNames
		dataMatrix = dataMatrix.T()
	}
	rows, cols := dataMatrix.Dims()

	density := nonZero / float64(rows*cols)

	if *seed == -1 {
		*seed = time.Now().UnixNano()
	}
	fmt.Fprintf(os.Stderr, "Using %v as random seed.\n", *seed)
	rand.Seed(*seed)

	fmt.Fprintf(os.Stderr, "Dimensions of matrix = (%v, %v)\nDensity = %.3f %%\n%v\n", rows, cols, (density)*100, dataMatrix)

	for run := 0; run < *rep; run++ {
		if *rep > 1 {
			fmt.Fprintf(os.Stderr, "Replicate #%d\n", run+1)
		}
		Wo := sparse.Random(rows, *cat, density**sf)
		Ho := sparse.Random(*cat, cols, density**sf)

		W, H, ok := nmf.Factors(dataMatrix, Wo, Ho, *tol, *iter, time.Duration(*limit)*time.Second)

		fmt.Fprintf(os.Stderr, "norm(H) = %v norm(W) = %v\n\nFinished = %v\n\n", H.Norm(matrix.Fro), W.Norm(matrix.Fro), ok)

		printFeature(out, run, dataMatrix, W, H, rowNames, colNames)
	}
}