Пример #1
0
// remoteQueryTaxid2TaxonByFile reads queries from dataFile (one per line,
// empty lines skipped), groups them into chunks of chunkSize and passes each
// chunk to taxon.RemoteQueryTaxid2Taxon together with host and port. At most
// `threads` requests are in flight at a time. Every returned taxon is printed
// to stdout as indented JSON; responses whose Status is not "OK" are logged.
//
// A non-positive chunkSize falls back to 1000.
func remoteQueryTaxid2TaxonByFile(host string, port int, dataFile string, chunkSize int, threads int) {
	if chunkSize <= 0 {
		chunkSize = 1000
	}
	fn := func(line string) (interface{}, bool, error) {
		line = strings.TrimRight(line, "\n")
		if line == "" {
			return "", false, nil
		}
		return line, true, nil
	}
	reader, err := breader.NewBufferedReader(dataFile, threads, chunkSize, fn)
	checkError(err)

	chResults := make(chan taxon.MessageTaxid2TaxonMap, threads)

	// receive result and print
	chDone := make(chan int)
	go func() {
		for msg := range chResults {
			if msg.Status != "OK" {
				log.Error(msg.Message)
			}
			for taxid, item := range msg.Taxons {
				fmt.Printf("Query TaxIDs: %s\n", taxid)
				bs, err := json.MarshalIndent(item, "", "  ")
				checkError(err)
				fmt.Printf("Taxon: %s\n\n", string(bs))
			}
		}
		chDone <- 1
	}()

	// querying
	var wg sync.WaitGroup
	// tokens bounds the number of concurrently running query goroutines.
	tokens := make(chan int, threads)
	for chunk := range reader.Ch {
		// Surface read errors instead of silently querying partial data.
		if chunk.Err != nil {
			checkError(chunk.Err)
			break
		}
		tokens <- 1
		wg.Add(1)

		queries := make([]string, len(chunk.Data))
		for i, data := range chunk.Data {
			queries[i] = data.(string)
		}

		go func(queries []string) {
			defer func() {
				wg.Done()
				<-tokens
			}()

			// RemoteQueryTaxid2Taxon returns only a message; failures are
			// reported through msg.Status and handled by the printer.
			chResults <- taxon.RemoteQueryTaxid2Taxon(host, port, queries)
		}(queries)
	}
	wg.Wait()
	close(chResults)
	<-chDone
}
Пример #2
0
// ImportGiTaxid loads a two-column, tab-separated file (gi_taxid_nucl or
// gi_taxid_prot) into the given bucket of the bolt database at dbFile.
//
// Lines that are empty, start with '#', do not have exactly two fields, or
// have an empty field are skipped. When force is true the bucket is deleted
// first. A non-positive chunkSize falls back to 1000000.
func ImportGiTaxid(dbFile string, bucket string, dataFile string, chunkSize int, force bool) {
	db, err := bolt.Open(dbFile, 0600, nil)
	checkError(err)
	defer db.Close()

	if force {
		err = deleteBucket(db, bucket)
		checkError(err)
		log.Info("Old database deleted: %s", bucket)
	}

	if chunkSize < 1 {
		chunkSize = 1000000
	}

	// parseLine turns one raw line into a [gi, taxid] pair.
	parseLine := func(line string) (interface{}, bool, error) {
		line = strings.TrimRight(line, "\n")
		if len(line) == 0 || line[0] == '#' {
			return nil, false, nil
		}
		fields := strings.Split(line, "\t")
		if len(fields) != 2 {
			return nil, false, nil
		}
		if fields[0] == "" || fields[1] == "" {
			return nil, false, nil
		}
		return fields, true, nil
	}

	reader, err := breader.NewBufferedReader(dataFile, runtime.NumCPU(), chunkSize, parseLine)
	checkError(err)

	total := 0
	for chunk := range reader.Ch {
		if chunk.Err != nil {
			checkError(chunk.Err)
			return
		}

		records := make([][]string, len(chunk.Data))
		for idx, data := range chunk.Data {
			// Only slice-typed payloads are converted; anything else leaves
			// the corresponding record nil.
			if reflect.TypeOf(data).Kind() != reflect.Slice {
				continue
			}
			value := reflect.ValueOf(data)
			fields := make([]string, value.Len())
			for j := range fields {
				fields[j] = value.Index(j).String()
			}
			records[idx] = fields
		}
		write2db(records, db, bucket)
		total += len(records)
		log.Info("%d records imported to %s", total, dbFile)
	}
}
Пример #3
0
// remoteQueryGi2TaxidByFile reads GIs from dataFile (one per line, surrounding
// whitespace trimmed, blank lines skipped), groups them into chunks of
// chunkSize and passes each chunk to taxon.RemoteQueryGi2Taxid together with
// host, port and dataType. At most `threads` requests are in flight at a time.
// Each returned gi/taxid pair is printed to stdout as a tab-separated line;
// responses whose Status is not "OK" are logged.
//
// A non-positive chunkSize falls back to 1000.
func remoteQueryGi2TaxidByFile(host string, port int, dataType string, dataFile string, chunkSize int, threads int) {
	if chunkSize <= 0 {
		chunkSize = 1000
	}
	fn := func(line string) (interface{}, bool, error) {
		line = strings.TrimSpace(strings.TrimRight(line, "\n"))
		if line == "" {
			return "", false, nil
		}
		return line, true, nil
	}
	reader, err := breader.NewBufferedReader(dataFile, threads, chunkSize, fn)
	checkError(err)

	chResults := make(chan taxon.MessageGI2TaxidMap, threads)

	// receive result and print
	chDone := make(chan int)
	go func() {
		for msg := range chResults {
			if msg.Status != "OK" {
				log.Error(msg.Message)
			}
			for gi, taxid := range msg.Taxids {
				fmt.Printf("%s\t%s\n", gi, taxid)
			}
		}
		chDone <- 1
	}()

	// querying
	var wg sync.WaitGroup
	// tokens bounds the number of concurrently running query goroutines.
	tokens := make(chan int, threads)
	for chunk := range reader.Ch {
		// Surface read errors instead of silently querying partial data.
		if chunk.Err != nil {
			checkError(chunk.Err)
			break
		}
		tokens <- 1
		wg.Add(1)

		gis := make([]string, len(chunk.Data))
		for i, data := range chunk.Data {
			gis[i] = data.(string)
		}

		go func(gis []string) {
			defer func() {
				wg.Done()
				<-tokens
			}()

			// RemoteQueryGi2Taxid returns only a message; failures are
			// reported through msg.Status and handled by the printer.
			chResults <- taxon.RemoteQueryGi2Taxid(host, port, dataType, gis)
		}(gis)
	}
	wg.Wait()
	close(chResults)
	<-chDone
}
Пример #4
0
// ImportNodes reads data from nodes.dmp and writes it to a bolt database.
//
// Each line has its trailing "\t|" removed and is split on "\t|\t"; lines that
// are empty or do not yield exactly 13 fields are skipped. Every parsed node
// is serialized with ToJSON and stored as a (TaxID, JSON) record in bucket.
// When force is true the bucket is deleted first. A non-positive batchSize
// falls back to 10000.
func ImportNodes(dbFile string, bucket string, dataFile string, batchSize int, force bool) {
	db, err := bolt.Open(dbFile, 0600, nil)
	checkError(err)
	defer db.Close()

	if force {
		err = deleteBucket(db, bucket)
		checkError(err)
		log.Info("Old database deleted: %s", bucket)
	}

	if batchSize <= 0 {
		batchSize = 10000
	}

	re := regexp.MustCompile(`\t\|$`)
	fn := func(line string) (interface{}, bool, error) {
		line = strings.TrimRight(line, "\n")
		if line == "" {
			return nil, false, nil
		}

		items := strings.Split(re.ReplaceAllString(line, ""), "\t|\t")
		if len(items) != 13 {
			return nil, false, nil
		}
		return nodes.NodeFromArgs(items), true, nil
	}

	reader, err := breader.NewBufferedReader(dataFile, runtime.NumCPU(), batchSize, fn)
	checkError(err)

	n := 0
	for chunk := range reader.Ch {
		if chunk.Err != nil {
			checkError(chunk.Err)
			return
		}

		records := make([][]string, len(chunk.Data))
		for i, data := range chunk.Data {
			node := data.(nodes.Node)
			nodeJSONStr, err := node.ToJSON()
			if err != nil {
				// Report the serialization error itself; chunk.Err is nil
				// here and would hide the failure.
				checkError(err)
				return
			}
			records[i] = []string{node.TaxID, nodeJSONStr}
		}
		write2db(records, db, bucket)
		n += len(records)
		log.Info("%d records imported to %s", n, dbFile)
	}
}
Пример #5
0
// ReadFeatures scans a BED file and recognizes its "browser" and "track"
// header lines.
//
// The original comment states that the available type values for n are
// 3, 4, 5 and 6; the current body never reads n. The implementation is
// incomplete: parsed header records are only reported by printing their
// reflect.Kind to stdout, the returned feature slice is always empty and the
// returned map is always nil.
//
// An error is returned when the file does not exist, when the reader cannot
// be created, or when a chunk carries an error (for example
// ErrBadBrowserLine for a "browser" line with fewer than three
// space-separated items).
func ReadFeatures(file string, n int) ([]Feature, map[string]map[string]string, error) {
	if _, err := os.Stat(file); os.IsNotExist(err) {
		return nil, nil, err
	}
	fn := func(line string) (interface{}, bool, error) {
		// Guard against empty input before indexing, and use prefix tests so
		// that lines shorter than the keyword cannot cause a slice panic.
		if len(line) == 0 || line[0] == '#' {
			return nil, false, nil
		}
		if strings.HasPrefix(line, "browser") {
			items := strings.Split(strings.TrimRight(line, "\n"), " ")
			if len(items) < 3 {
				return nil, false, ErrBadBrowserLine
			}
			details := make(map[string]string)
			details[items[1]] = items[2]
			return meta{"browser", details}, true, nil
		}
		if strings.HasPrefix(line, "track") {
			details := make(map[string]string)
			found := TrackItemRegexp.FindAllStringSubmatch(line, -1)
			for _, sub := range found {
				// NOTE(review): sub[0] is the whole match and sub[1] the first
				// capture group — confirm the key was not meant to be a group.
				details[sub[0]] = sub[1]
			}
			return meta{"track", details}, true, nil
		}
		return nil, false, nil
	}

	reader, err := breader.NewBufferedReader(file, runtime.NumCPU(), 100, fn)
	if err != nil {
		return nil, nil, err
	}
	features := []Feature{}
	for chunk := range reader.Ch {
		if chunk.Err != nil {
			return nil, nil, chunk.Err
		}
		for _, data := range chunk.Data {
			fmt.Println(reflect.TypeOf(data).Kind())
		}
	}
	return features, nil, nil
}
Пример #6
0
// queryGi2TaxidByFile looks up the taxid of every GI listed in dataFile (one
// per line, whitespace trimmed, blank lines skipped) through
// taxon.QueryGi2Taxid and prints "gi<TAB>taxid" lines to stdout.
//
// Lookups run on chunks of chunkSize GIs, with at most `threads` chunks being
// processed concurrently, each on a database handle borrowed from a pool
// created with taxon.NewDBPool. A non-positive chunkSize falls back to 10000.
func queryGi2TaxidByFile(dbFilePath string, queryType string, dataFile string, chunkSize int, threads int) {
	if chunkSize < 1 {
		chunkSize = 10000
	}
	parseLine := func(line string) (interface{}, bool, error) {
		if trimmed := strings.TrimSpace(strings.TrimRight(line, "\n")); trimmed != "" {
			return trimmed, true, nil
		}
		return "", false, nil
	}
	reader, err := breader.NewBufferedReader(dataFile, runtime.NumCPU(), chunkSize, parseLine)
	checkError(err)

	pool := taxon.NewDBPool(dbFilePath, threads)
	results := make(chan [][]string, threads)

	// Printer: drains results until the channel is closed, then signals.
	printed := make(chan int)
	go func() {
		for pair := range results {
			ids, hits := pair[0], pair[1]
			for idx := range ids {
				fmt.Printf("%s\t%s\n", ids[idx], hits[idx])
			}
		}
		printed <- 1
	}()

	// Workers: one goroutine per chunk, throttled by the slots channel.
	var workers sync.WaitGroup
	slots := make(chan int, threads)
	for chunk := range reader.Ch {
		if chunk.Err != nil {
			checkError(chunk.Err)
			break
		}
		slots <- 1
		workers.Add(1)

		batch := make([]string, len(chunk.Data))
		for idx := range chunk.Data {
			batch[idx] = chunk.Data[idx].(string)
		}

		go func(batch []string) {
			db := pool.GetDB()
			defer func() {
				pool.ReleaseDB(db)
				workers.Done()
				<-slots
			}()

			hits, err := taxon.QueryGi2Taxid(db, queryType, batch)
			checkError(err)
			results <- [][]string{batch, hits}
		}(batch)
	}
	workers.Wait()
	close(results)
	<-printed
}
Пример #7
0
// ImportNames reads data from names.dmp and writes it to a bolt database.
//
// Each line has its trailing "\t|" removed and is split on "\t|\t"; lines that
// are empty or do not yield exactly 4 fields are skipped. Records sharing a
// TaxID are merged with nodes.MergeNames, so the whole file is held in memory
// before anything is written. Each merged name is then serialized with ToJSON
// and stored as a (TaxID, JSON) record in bucket, in batches of chunkSize.
//
// When force is true the bucket is deleted first. A non-positive chunkSize
// falls back to 10000.
func ImportNames(dbFile string, bucket string, dataFile string, chunkSize int, force bool) {
	db, err := bolt.Open(dbFile, 0600, nil)
	checkError(err)
	defer db.Close()

	if force {
		err = deleteBucket(db, bucket)
		checkError(err)
		log.Info("Old database deleted: %s", bucket)
	}

	if chunkSize <= 0 {
		chunkSize = 10000
	}

	re := regexp.MustCompile(`\t\|$`)
	fn := func(line string) (interface{}, bool, error) {
		line = strings.TrimRight(line, "\n")
		if line == "" {
			return nil, false, nil
		}
		items := strings.Split(re.ReplaceAllString(line, ""), "\t|\t")
		if len(items) != 4 {
			return nil, false, nil
		}
		return nodes.NameFromArgs(items), true, nil
	}

	reader, err := breader.NewBufferedReader(dataFile, runtime.NumCPU(), chunkSize, fn)
	checkError(err)

	names := make(map[string]nodes.Name)
	n := 0
	for chunk := range reader.Ch {
		if chunk.Err != nil {
			checkError(chunk.Err)
			return
		}

		for _, data := range chunk.Data {
			name := data.(nodes.Name)
			if prev, ok := names[name.TaxID]; ok {
				names[name.TaxID] = nodes.MergeNames(prev, name)
			} else {
				names[name.TaxID] = name
			}
		}
		n += len(chunk.Data)
		log.Info("%d records readed", n)
	}

	chResults := make(chan []string, runtime.NumCPU())

	// write to db
	chDone := make(chan int)
	go func() {
		records := make([][]string, 0, chunkSize)
		n := 0
		for s := range chResults {
			records = append(records, s)
			n++
			if len(records) == chunkSize {
				write2db(records, db, bucket)
				log.Info("%d records imported to %s", n, dbFile)
				// Start a fresh batch rather than reusing the backing array,
				// in case write2db keeps a reference to the slice.
				records = make([][]string, 0, chunkSize)
			}
		}
		// Flush the final partial batch; without this the last
		// n % chunkSize records would never reach the database.
		if len(records) > 0 {
			write2db(records, db, bucket)
		}
		log.Info("%d records imported to %s", n, dbFile)
		chDone <- 1
	}()

	// name to json
	// tokens bounds the number of concurrently running serializers.
	tokens := make(chan int, runtime.NumCPU())
	var wg sync.WaitGroup
	for _, name := range names {
		tokens <- 1
		wg.Add(1)
		go func(name nodes.Name) {
			defer func() {
				wg.Done()
				<-tokens
			}()
			nameJSONStr, err := name.ToJSON()
			checkError(err)
			chResults <- []string{name.TaxID, nameJSONStr}
		}(name)
	}
	wg.Wait()
	close(chResults)
	<-chDone
}
Пример #8
0
// ReadFilteredFeatures returns the GTF features of a file, optionally
// restricted to specific chromosomes and feature types.
//
// chrs and feats are matched case-insensitively against columns 1 and 3; an
// empty list disables the respective filter. attrs selects, case-insensitively,
// which attribute tags of column 9 are kept; attributes whose tag is not
// listed are dropped, so an empty attrs yields features without attributes.
//
// Lines that are empty, start with '#', or do not have exactly 9 tab-separated
// columns are skipped. An error is returned when the file does not exist,
// when the reader cannot be created, or when a selected line has an invalid
// start, end, score, strand or frame.
func ReadFilteredFeatures(file string, chrs []string, feats []string, attrs []string) ([]Feature, error) {
	if _, err := os.Stat(file); os.IsNotExist(err) {
		return nil, err
	}
	chrsMap := make(map[string]struct{}, len(chrs))
	for _, chr := range chrs {
		chrsMap[strings.ToLower(chr)] = struct{}{}
	}

	featsMap := make(map[string]struct{}, len(feats))
	for _, f := range feats {
		featsMap[strings.ToLower(f)] = struct{}{}
	}

	attrsMap := make(map[string]struct{}, len(attrs))
	for _, f := range attrs {
		attrsMap[strings.ToLower(f)] = struct{}{}
	}

	fn := func(line string) (interface{}, bool, error) {
		if len(line) == 0 || line[0] == '#' {
			return nil, false, nil
		}
		line = strings.TrimRight(line, "\r\n")
		items := strings.Split(line, "\t")

		if len(items) != 9 {
			return nil, false, nil
		}

		if len(chrs) > 0 { // selected chrs
			if _, ok := chrsMap[strings.ToLower(items[0])]; !ok {
				return nil, false, nil
			}
		}

		if len(feats) > 0 { // selected features
			if _, ok := featsMap[strings.ToLower(items[2])]; !ok {
				return nil, false, nil
			}
		}

		start, err := strconv.Atoi(items[3])
		if err != nil {
			return nil, false, fmt.Errorf("bad start: %s", items[3])
		}

		end, err := strconv.Atoi(items[4])
		if err != nil {
			return nil, false, fmt.Errorf("bad end: %s", items[4])
		}

		// score, strand and frame are optional; "." leaves them nil.
		var score *float64
		if items[5] != "." {
			s, err := strconv.ParseFloat(items[5], 64)
			if err != nil {
				return nil, false, fmt.Errorf("bad score: %s", items[5])
			}
			score = &s
		}

		var strand *string
		if items[6] != "." {
			s := items[6]
			if !(s == "+" || s == "-") {
				return nil, false, fmt.Errorf("illigal strand: %s", s)
			}
			strand = &s
		}

		var frame *int
		if items[7] != "." {
			f, err := strconv.Atoi(items[7])
			if err != nil {
				return nil, false, fmt.Errorf("bad frame: %s", items[7])
			}
			if !(f == 0 || f == 1 || f == 2) {
				return nil, false, fmt.Errorf("illigal frame: %d", f)
			}
			frame = &f
		}

		feature := Feature{items[0], items[1], items[2], start, end, score, strand, frame, nil}

		tagValues := strings.Split(items[8], "; ")
		if len(tagValues) > 0 {
			feature.Attributes = []Attribute{}
			// NOTE(review): the last element of the split is deliberately not
			// visited, as in the original — presumably it is trailing text
			// after the final "; "; confirm against real input.
			for _, tagValue := range tagValues[0 : len(tagValues)-1] {
				items2 := strings.SplitN(tagValue, " ", 2)
				tag := items2[0]
				// attrsMap keys are lower-cased, so the lookup must be too.
				if _, ok := attrsMap[strings.ToLower(tag)]; !ok {
					continue
				}
				// A tag without a value has no second item; treat the value
				// as empty instead of indexing out of range.
				value := ""
				if len(items2) == 2 {
					value = items2[1]
				}
				// Strip the first and last character (the surrounding quotes
				// of a quoted value); anything of length <= 2 becomes "".
				if len(value) > 2 {
					value = value[1 : len(value)-1]
				} else {
					value = ""
				}
				feature.Attributes = append(feature.Attributes, Attribute{tag, value})
			}
		}
		return feature, true, nil
	}
	reader, err := breader.NewBufferedReader(file, Threads, 100, fn)
	if err != nil {
		return nil, err
	}
	features := []Feature{}
	for chunk := range reader.Ch {
		if chunk.Err != nil {
			return nil, chunk.Err
		}
		for _, data := range chunk.Data {
			features = append(features, data.(Feature))
		}
	}
	return features, nil
}
Пример #9
0
// remoteName2TaxIDByFile reads names from dataFile (one per line, empty lines
// skipped), groups them into chunks of chunkSize and passes each chunk to
// taxon.RemoteQueryName2TaxID together with host, port, useRegexp and
// nameClass. At most `threads` requests are in flight at a time. For every
// queried name one line "name<TAB>taxid(scientific name),..." is printed to
// stdout; responses whose Status is not "OK" are logged.
//
// A non-positive chunkSize falls back to 1000.
func remoteName2TaxIDByFile(host string, port int, useRegexp bool, nameClass string, dataFile string, chunkSize int, threads int) {
	if chunkSize <= 0 {
		chunkSize = 1000
	}
	fn := func(line string) (interface{}, bool, error) {
		line = strings.TrimRight(line, "\n")
		if line == "" {
			return "", false, nil
		}
		return line, true, nil
	}
	reader, err := breader.NewBufferedReader(dataFile, threads, chunkSize, fn)
	checkError(err)

	chResults := make(chan taxon.MssageName2TaxIDMap, threads)

	// receive result and print
	chDone := make(chan int)
	go func() {
		for msg := range chResults {
			if msg.Status != "OK" {
				log.Error(msg.Message)
			}
			for name, items := range msg.TaxIDs {
				idnames := make([]string, len(items))
				for i, item := range items {
					idnames[i] = fmt.Sprintf("%d(%s)", item.TaxID, item.ScientificName)
				}
				fmt.Printf("%s\t%s\n", name, strings.Join(idnames, ","))
			}
		}
		chDone <- 1
	}()

	// querying
	var wg sync.WaitGroup
	// tokens bounds the number of concurrently running query goroutines.
	tokens := make(chan int, threads)
	for chunk := range reader.Ch {
		// Surface read errors instead of silently querying partial data.
		if chunk.Err != nil {
			checkError(chunk.Err)
			break
		}
		tokens <- 1
		wg.Add(1)

		queries := make([]string, len(chunk.Data))
		for i, data := range chunk.Data {
			queries[i] = data.(string)
		}

		go func(queries []string) {
			defer func() {
				wg.Done()
				<-tokens
			}()

			// RemoteQueryName2TaxID returns only a message; failures are
			// reported through msg.Status and handled by the printer.
			chResults <- taxon.RemoteQueryName2TaxID(host, port, useRegexp, nameClass, queries)
		}(queries)
	}
	wg.Wait()
	close(chResults)
	<-chDone
}