Exemplo n.º 1
0
// startStats blocks and periodically logs transaction statistics (throughput,
// success rates, durations, ...).
// TODO(tschottdorf): Use a proper metrics subsystem for this (+the store-level
// stats).
// TODO(mrtracy): Add this to TimeSeries.
func (tc *TxnCoordSender) startStats() {
	res := time.Millisecond // for duration logging resolution
	lastNow := tc.clock.PhysicalNow()
	for {
		select {
		case <-time.After(statusLogInterval):
			if !log.V(1) {
				continue
			}

			tc.Lock()
			curStats := tc.txnStats
			tc.txnStats = txnCoordStats{}
			tc.Unlock()

			now := tc.clock.PhysicalNow()

			// Tests have weird clocks.
			if now-lastNow <= 0 {
				continue
			}

			num := len(curStats.durations)
			dMax := time.Duration(stats.Max(curStats.durations))
			dMean := time.Duration(stats.Mean(curStats.durations))
			dDev := time.Duration(stats.StdDevP(curStats.durations))
			rMax := stats.Max(curStats.restarts)
			rMean := stats.Mean(curStats.restarts)
			rDev := stats.StdDevP(curStats.restarts)

			rate := float64(int64(num)*int64(time.Second)) / float64(now-lastNow)
			var pCommitted, pAbandoned, pAborted float32
			if num > 0 {
				pCommitted = 100 * float32(curStats.committed) / float32(num)
				pAbandoned = 100 * float32(curStats.abandoned) / float32(num)
				pAborted = 100 * float32(curStats.aborted) / float32(num)
			}
			log.Infof("txn coordinator: %.2f txn/sec, %.2f/%.2f/%.2f %%cmmt/abrt/abnd, %s/%s/%s avg/σ/max duration, %.1f/%.1f/%.1f avg/σ/max restarts (%d samples)",
				rate, pCommitted, pAborted, pAbandoned, util.TruncateDuration(dMean, res),
				util.TruncateDuration(dDev, res), util.TruncateDuration(dMax, res),
				rMean, rDev, rMax, num)
			lastNow = now
		case <-tc.stopper.ShouldStop():
			return
		}
	}
}
Exemplo n.º 2
0
// printStats prints the time it took for rebalancing to finish and the final
// standard deviation of replica counts across stores.
func (at *allocatorTest) printRebalanceStats(db *gosql.DB, host string, adminPort int) error {
	// TODO(cuongdo): Output these in a machine-friendly way and graph.

	// Output time it took to rebalance.
	{
		var rebalanceIntervalStr string
		var rebalanceInterval time.Duration
		q := `SELECT (SELECT MAX(timestamp) FROM rangelog) - ` +
			`(select MAX(timestamp) FROM eventlog WHERE eventType='` + string(sql.EventLogNodeJoin) + `')`
		if err := db.QueryRow(q).Scan(&rebalanceIntervalStr); err != nil {
			return err
		}
		rebalanceInterval, err := time.ParseDuration(rebalanceIntervalStr)
		if err != nil {
			return err
		}
		if rebalanceInterval < 0 {
			// This can happen with single-node clusters.
			rebalanceInterval = time.Duration(0)
		}
		log.Infof("cluster took %s to rebalance", rebalanceInterval)
	}

	// Output # of range events that occurred. All other things being equal,
	// larger numbers are worse and potentially indicate thrashing.
	{
		var rangeEvents int64
		q := `SELECT COUNT(*) from rangelog`
		if err := db.QueryRow(q).Scan(&rangeEvents); err != nil {
			return err
		}
		log.Infof("%d range events", rangeEvents)
	}

	// Output standard deviation of the replica counts for all stores.
	{
		var client http.Client
		var nodesResp serverpb.NodesResponse
		url := fmt.Sprintf("http://%s:%d/_status/nodes", host, adminPort)
		if err := util.GetJSON(client, url, &nodesResp); err != nil {
			return err
		}
		var replicaCounts stats.Float64Data
		for _, node := range nodesResp.Nodes {
			for _, ss := range node.StoreStatuses {
				replicaCounts = append(replicaCounts, float64(ss.Metrics["replicas"]))
			}
		}
		stddev, err := stats.StdDevP(replicaCounts)
		if err != nil {
			return err
		}
		log.Infof("stddev(replica count) = %.2f", stddev)
	}

	return nil
}
Exemplo n.º 3
0
func (at *allocatorTest) stdDev() (float64, error) {
	host := at.f.Hostname(0)
	var client http.Client
	var nodesResp serverpb.NodesResponse
	url := fmt.Sprintf("http://%s:%s/_status/nodes", host, adminPort)
	if err := httputil.GetJSON(client, url, &nodesResp); err != nil {
		return 0, err
	}
	var replicaCounts stats.Float64Data
	for _, node := range nodesResp.Nodes {
		for _, ss := range node.StoreStatuses {
			replicaCounts = append(replicaCounts, float64(ss.Metrics["replicas"]))
		}
	}
	stdDev, err := stats.StdDevP(replicaCounts)
	if err != nil {
		return 0, err
	}
	return stdDev, nil
}
Exemplo n.º 4
0
// startStats blocks and periodically logs transaction statistics (throughput,
// success rates, durations, ...). Note that this only captures write txns,
// since read-only txns are stateless as far as TxnCoordSender is concerned.
// stats).
// TODO(mrtracy): Add this to TimeSeries.
func (tc *TxnCoordSender) startStats() {
	res := time.Millisecond // for duration logging resolution
	lastNow := tc.clock.PhysicalNow()
	for {
		select {
		case <-time.After(statusLogInterval):
			if !log.V(1) {
				continue
			}

			tc.Lock()
			curStats := tc.txnStats
			tc.txnStats = txnCoordStats{}
			tc.Unlock()

			now := tc.clock.PhysicalNow()

			// Tests have weird clocks.
			if now-lastNow <= 0 {
				continue
			}

			num := len(curStats.durations)
			// Only compute when non-empty input.
			var dMax, dMean, dDev, rMax, rMean, rDev float64
			var err error
			if num > 0 {
				// There should never be an error in the below
				// computations.
				dMax, err = stats.Max(curStats.durations)
				if err != nil {
					panic(err)
				}
				dMean, err = stats.Mean(curStats.durations)
				if err != nil {
					panic(err)
				}
				dDev, err = stats.StdDevP(curStats.durations)
				if err != nil {
					panic(err)
				}
				rMax, err = stats.Max(curStats.restarts)
				if err != nil {
					panic(err)
				}
				rMean, err = stats.Mean(curStats.restarts)
				if err != nil {
					panic(err)
				}
				rDev, err = stats.StdDevP(curStats.restarts)
				if err != nil {
					panic(err)
				}
			}

			rate := float64(int64(num)*int64(time.Second)) / float64(now-lastNow)
			var pCommitted, pAbandoned, pAborted float32

			if fNum := float32(num); fNum > 0 {
				pCommitted = 100 * float32(curStats.committed) / fNum
				pAbandoned = 100 * float32(curStats.abandoned) / fNum
				pAborted = 100 * float32(curStats.aborted) / fNum
			}
			log.Infof(
				"txn coordinator: %.2f txn/sec, %.2f/%.2f/%.2f %%cmmt/abrt/abnd, %s/%s/%s avg/σ/max duration, %.1f/%.1f/%.1f avg/σ/max restarts (%d samples)",
				rate, pCommitted, pAborted, pAbandoned,
				util.TruncateDuration(time.Duration(dMean), res),
				util.TruncateDuration(time.Duration(dDev), res),
				util.TruncateDuration(time.Duration(dMax), res),
				rMean, rDev, rMax, num,
			)
			lastNow = now
		case <-tc.stopper.ShouldStop():
			return
		}
	}
}