Example #1
0
File: ct.go Project: jasonmoo/cdist
// main tallies how often each word or rune occurs in its inputs and prints
// the tallies, highest count first, as "key: count" lines on stdout.
//
// Inputs are the files named by the positional arguments, or stdin when no
// positional arguments are given. When *words is set, the inputs are split on
// every rune that is neither a letter nor a number; if *stem is also set each
// resulting word is passed through english.Stem before it is counted.
// Otherwise every rune is counted individually, keyed by its
// strconv.QuoteRune form. A summary line with dict.Len() and the elapsed time
// is written to stderr.
//
// Any I/O error other than io.EOF terminates the program via log.Fatal.
//
// NOTE(review): words and stem are declared outside this function
// (presumably flag.Bool values), and flag.Parse is not called here — confirm
// it runs before main reads them.
func main() {

	var inputs []io.Reader

	if flag.NArg() > 0 {
		for _, file_path := range flag.Args() {
			file, err := os.Open(file_path)
			if err != nil {
				log.Fatal(err)
			}
			inputs = append(inputs, file)
		}
	} else {
		inputs = []io.Reader{os.Stdin}
	}

	start := time.Now()

	// filter reports whether c separates words: anything that is neither a
	// letter nor a number.
	filter := func(c rune) bool {
		return !unicode.IsLetter(c) && !unicode.IsNumber(c)
	}

	switch {
	case *words:

		dict := oc.NewOc()

		if *stem {

			const chunksize = 64 << 10

			// A single goroutine drains stems and is the only one that
			// touches dict while the stemming workers are running.
			stems := make(chan string, chunksize)
			stemswg := new(sync.WaitGroup)
			stemswg.Add(1)
			go func() {
				for word := range stems {
					dict.Increment(word, 1)
				}
				stemswg.Done()
			}()

			wg := new(sync.WaitGroup)

			for _, input := range inputs {
				br := bufio.NewReaderSize(input, 1<<20)
				for {
					// A fresh buffer per chunk: ownership passes to the
					// worker goroutine started below.
					buf := make([]byte, chunksize)

					n, err := br.Read(buf)
					if err != nil && err != io.EOF {
						log.Fatal(err)
					}
					if n == 0 {
						break
					}
					buf = buf[:n]

					// read to a newline to prevent
					// spanning a word across two buffers
					if buf[len(buf)-1] != '\n' {
						extra, err := br.ReadBytes('\n')
						if err != nil && err != io.EOF {
							log.Fatal(err)
						}
						buf = append(buf, extra...)
					}

					// One worker per chunk splits and stems its words.
					wg.Add(1)
					go func(b []byte) {
						for _, word := range strings.FieldsFunc(string(b), filter) {
							stems <- english.Stem(word, false)
						}
						wg.Done()
					}(buf)
				}
			}

			// Close stems only after every worker has finished sending,
			// then wait for the counting goroutine to drain it.
			wg.Wait()
			close(stems)
			stemswg.Wait()

		} else {
			for _, input := range inputs {
				br := bufio.NewReader(input)
				for {
					line, err := br.ReadString('\n')
					if err != nil && err != io.EOF {
						log.Fatal(err)
					}

					// ReadString returns whatever it read before hitting
					// EOF, so count the line first; otherwise a final line
					// without a trailing newline would be dropped.
					for _, word := range strings.FieldsFunc(line, filter) {
						dict.Increment(word, 1)
					}

					if err == io.EOF {
						break
					}
				}
			}
		}

		dict.SortByCt(oc.DESC)

		for dict.Next() {
			key, ct := dict.KeyValue()
			fmt.Printf("%s: %d\n", key, ct)
		}

		fmt.Fprintf(os.Stderr, "%d words counted in %s\n", dict.Len(), time.Since(start))

	default:

		dict := oc.NewOc()

		for _, input := range inputs {
			br := bufio.NewReader(input)
			for {
				r, _, err := br.ReadRune()
				if err != nil {
					if err == io.EOF {
						break
					}
					log.Fatal(err)
				}
				// Key by the quoted form so control and whitespace runes
				// print legibly.
				dict.Increment(strconv.QuoteRune(r), 1)
			}
		}

		dict.SortByCt(oc.DESC)

		for dict.Next() {
			key, ct := dict.KeyValue()
			fmt.Printf("%s: %d\n", key, ct)
		}

		fmt.Fprintf(os.Stderr, "%d runes counted in %s\n", dict.Len(), time.Since(start))

	}

}
Example #2
0
// main fills a 1<<20-element []int64 with its own indices, aliases the same
// memory as a []byte, and records how many bytes that byte slice occupies
// raw and after gzip, JSON, gob, msgpack and bson encoding (each also
// gzipped). The sizes are printed in ascending order.
//
// NOTE(review): several labels say "[]int64", but the value handed to every
// encoder is the []byte view, not nums. Errors from the encoders, the gzip
// writer and bson.Marshal are all discarded, so a failed encoding shows up
// only as a small size.
func main() {

	fmt.Println("running...")
	defer fmt.Println("done!")

	nums := make([]int64, 1<<20)
	for i := range nums {
		nums[i] = int64(i)
	}

	// Point a byte slice header at nums' backing array; 8 is the size of
	// an int64 in bytes.
	var raw []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&raw))
	hdr.Data = (uintptr)(unsafe.Pointer(&nums[0]))
	hdr.Cap = cap(nums) * 8
	hdr.Len = len(nums) * 8

	sizes := oc.NewOc()

	sizes.Increment("[]int64", len(nums)*8)
	sizes.Increment("[]byte", len(raw))

	var out bytes.Buffer
	zw := gzip.NewWriter(&out)

	// note records the current length of out under label.
	note := func(label string) {
		sizes.Increment(label, out.Len())
	}
	// regzip empties out and restarts the gzip stream on top of it.
	regzip := func() {
		out.Reset()
		zw.Reset(&out)
	}

	// raw bytes, gzipped
	zw.Write(raw)
	zw.Close()
	note("gzip []byte")

	// json
	out.Reset()
	json.NewEncoder(&out).Encode(raw)
	note("json []int64")

	regzip()
	json.NewEncoder(zw).Encode(raw)
	zw.Close()
	note("gzip json []int64")

	// gob
	out.Reset()
	gob.NewEncoder(&out).Encode(raw)
	note("gob []int64")

	regzip()
	gob.NewEncoder(zw).Encode(raw)
	zw.Close()
	note("gzip gob []int64")

	// msgpack
	out.Reset()
	msgpack.NewEncoder(&out).Encode(raw)
	note("msgpack []int64")

	regzip()
	msgpack.NewEncoder(zw).Encode(raw)
	zw.Close()
	note("gzip msgpack []int64")

	// bson marshals straight to a byte slice, so only the gzipped form
	// goes through out.
	encoded, _ := bson.Marshal(raw)
	sizes.Increment("bson []int64", len(encoded))

	regzip()
	zw.Write(encoded)
	zw.Close()
	note("gzip bson []int64")

	sizes.SortByCt(oc.ASC)

	for sizes.Next() {
		fmt.Println(sizes.KeyValue())
	}

}
Example #3
0
// TestHashSource prints one table row per entry in names, plus a final
// "math/rand" row for rand.NewSource, all seeded with the same value.
//
// For each source it makes 1<<20 rounds of one Int63 draw followed by one
// Intn(16) draw. The Intn results are tallied per value and the row reports
// the smallest and largest tally, their difference, and one minus that
// difference divided by the number of rounds. The Int63 draws are passed to
// getStats for the mean and standard deviation columns.
//
// The test fails only when a hash-backed source leaves one of the 16 Intn
// values unseen; the math/rand row is not checked.
func TestHashSource(t *testing.T) {

	const (
		buckets = 16
		draws   = 1 << 20

		rowFormat = "%s\t%d\t%d\t%d\t%f\t%.0f\t%.0f\n"
	)

	out := tabwriter.NewWriter(os.Stdout, 16, 8, 1, '\t', 0)
	defer out.Flush()

	fmt.Fprintln(out, "name\tmin\tmax\tdev\tdist\tmean\tstddev")

	seed := NewCryptoRandSeed()

	for _, name := range names {

		rng := rand.New(NewHashSource(hashes[name], seed))

		counts := oc.NewOc()
		samples := make([]float64, 0, draws)

		// Int63 and Intn share one generator, so the call order within a
		// round matters.
		for n := 0; n < draws; n++ {
			samples = append(samples, float64(rng.Int63()))
			counts.Increment(strconv.Itoa(rng.Intn(buckets)), 1)
		}

		if got := counts.Len(); got != buckets {
			t.Errorf("Expected full distribution across set for %s, got %d", name, got)
		}

		counts.SortByCt(oc.DESC)

		// lo == 0 means "not set yet".
		var lo, hi int64
		for counts.Next() {
			_, ct := counts.KeyValue()
			c := int64(ct)
			if c > hi {
				hi = c
			}
			if lo == 0 || c < lo {
				lo = c
			}
		}

		summary := getStats(samples)
		spread := hi - lo
		fmt.Fprintf(out, rowFormat, name, lo, hi, spread, 1-(float64(spread)/draws), summary.mean, summary.stdDev)

		// Flush per row so each hash's result appears as soon as it is ready.
		out.Flush()
	}

	// Baseline: the same measurement against math/rand's own source.
	rng := rand.New(rand.NewSource(seed))

	counts := oc.NewOc()
	samples := make([]float64, 0, draws)

	for n := 0; n < draws; n++ {
		samples = append(samples, float64(rng.Int63()))
		counts.Increment(strconv.Itoa(rng.Intn(buckets)), 1)
	}

	counts.SortByCt(oc.DESC)

	var lo, hi int64
	for counts.Next() {
		_, ct := counts.KeyValue()
		c := int64(ct)
		if c > hi {
			hi = c
		}
		if lo == 0 || c < lo {
			lo = c
		}
	}

	summary := getStats(samples)
	spread := hi - lo
	fmt.Fprintf(out, rowFormat, "math/rand", lo, hi, spread, 1-(float64(spread)/draws), summary.mean, summary.stdDev)

}