Beispiel #1
0
// calculateConfidenceInterval derives a confidence interval for the difference
// between the mean response of the rows in s.t0 and the mean response of the
// rows in s.t1.
//
// The interval is (m0 - m1) ± zScore*sqrt(sd1^2/n1 + sd0^2/n0), where zScore is
// defined outside this function. Errors returned by the stats helpers are
// discarded, so empty groups are not reported to the caller.
func calculateConfidenceInterval(s scoreResult) confInterval {
	// Collect the response value of every row, one slice per treatment group.
	var ctrl, treat []float64
	for i := range s.t0 {
		ctrl = append(ctrl, s.t0[i].y)
	}
	for i := range s.t1 {
		treat = append(treat, s.t1[i].y)
	}

	// Summary statistics for each group.
	meanCtrl, _ := stats.Mean(ctrl)
	sdCtrl, _ := stats.StandardDeviation(ctrl)
	nCtrl := float64(len(ctrl))

	meanTreat, _ := stats.Mean(treat)
	sdTreat, _ := stats.StandardDeviation(treat)
	nTreat := float64(len(treat))

	// Difference in sample means and the half-width of the interval around it.
	delta := meanCtrl - meanTreat
	halfWidth := zScore * math.Sqrt(sdTreat*sdTreat/nTreat+sdCtrl*sdCtrl/nCtrl)

	var ci confInterval
	ci.min = delta - halfWidth
	ci.max = delta + halfWidth
	// min - max, i.e. the interval width with a negative sign.
	ci.diff = ci.min - ci.max

	// The interval bounds shifted by the treatment-1 mean.
	ci.t1Max = meanTreat + ci.max
	ci.t1Min = meanTreat + ci.min
	// The (negative) width expressed in treatment-1 standard deviations.
	ci.diffSd = ci.diff / sdTreat

	// Midpoint of the interval.
	ci.middle = (ci.min + ci.max) / 2

	return ci
}
Beispiel #2
0
// NormalConfidenceInterval returns the lower and upper bounds of a 95%
// confidence interval for the mean of nums: mean ± z * sd / sqrt(len(nums)).
//
// Errors returned by the stats helpers are discarded, so the result for an
// empty slice is whatever those helpers yield for empty input.
//
// See https://github.com/hermanschaaf/stats/blob/master/stats.go
func NormalConfidenceInterval(nums []float64) (lower float64, upper float64) {
	// Critical value for 95% confidence for the mean, http://bit.ly/Mm05eZ
	const z95 = 1.95996

	center, _ := stats.Mean(nums)
	sd, _ := stats.StandardDeviation(nums)

	// Standard error of the mean, then the half-width of the interval.
	stdErr := sd / math.Sqrt(float64(len(nums)))
	margin := stdErr * z95

	lower = center - margin
	upper = center + margin
	return lower, upper
}
Beispiel #3
0
// report logs write-latency statistics once per second and never returns.
//
// On every tick it takes the lock, detaches the accumulated s.writeTimes and
// resets the field to nil, then releases the lock before computing and logging
// the sample count, mean and standard deviation.
func (s *statistics) report() {
	ticks := time.Tick(time.Second)
	for {
		<-ticks

		// Swap the sample slice out under the lock so the statistics below
		// are computed without holding it.
		s.Lock()
		samples := s.writeTimes
		s.writeTimes = nil
		s.Unlock()

		// The stats functions return an error only when the input is empty,
		// so the errors are deliberately ignored.
		avg, _ := stats.Mean(samples)
		sd, _ := stats.StandardDeviation(samples)
		log.Infof("wrote %d messages, latency mean=%s, stddev=%s",
			len(samples), time.Duration(avg), time.Duration(sd))
	}
}
Beispiel #4
0
// evalScore computes the treatment-effect score for one partition of a data
// set.
//
// d holds the rows of the partition, rc the row criteria that selected them and
// dataSetId the data set they belong to; all three are stored on the result
// unchanged. Rows are split into treatment 0 and everything else (treatment 1).
//
// The returned score is 0, and the t0/t1 fields are left unset, when
//   - the partition has rowThreshhold rows or fewer,
//   - either treatment group is empty, or
//   - calculateConfidenceInterval2 reports that the intervals overlap.
//
// Otherwise the score is
//
//	cohensd / sqrt(cohensd^2 + a)
//
// with cohensd = (mean1 - mean0) / sPooled,
// sPooled = sqrt((St^2 + Sc^2) / 2) (see http://www.uccs.edu/~lbecker/)
// and a = Nt + Nc. A negative score means the treatment-1 mean is the smaller
// one.
func evalScore(d []coreData, rc []rowCriteria, dataSetId int) scoreResult {
	var s scoreResult
	s.rc = rc
	s.d = d
	s.dataSetId = dataSetId

	// Partitions at or below the minimum row threshold are not scored. Every
	// row lands in exactly one group below, so no second size check is needed.
	if len(d) <= rowThreshhold {
		return s
	}

	// Split the rows by treatment and keep each group's responses for the
	// statistics below.
	var (
		t0, t1   []coreData
		t0s, t1s []float64
	)
	for _, row := range d {
		if row.treatment == 0 {
			t0 = append(t0, row)
			t0s = append(t0s, row.y)
		} else {
			t1 = append(t1, row)
			t1s = append(t1s, row.y)
		}
	}

	// Must have at least one row in each group.
	if len(t0) == 0 || len(t1) == 0 {
		return s
	}

	// Both groups are non-empty here; the stats errors are ignored on the
	// assumption that they only signal empty input.
	mean0, _ := stats.Mean(t0s)
	mean1, _ := stats.Mean(t1s)
	sdT, _ := stats.StandardDeviation(t1s)
	sdC, _ := stats.StandardDeviation(t0s)
	nT := float64(len(t1s))
	nC := float64(len(t0s))

	// If the confidence intervals overlap the partition is not a valid range.
	ci := calculateConfidenceInterval2(nT, nC, mean1, mean0, sdT, sdC)
	if ci.overlap {
		return s
	}

	s.t0 = t0
	s.t1 = t1

	// Pooled standard deviation as the root mean square of the two group SDs.
	// NOTE(review): when both group SDs are 0 this is 0 and the score below
	// becomes NaN; confirm whether callers rely on that.
	sPooled := math.Sqrt((sdT*sdT + sdC*sdC) / 2)
	cohensd := (mean1 - mean0) / sPooled

	// Correction term, algebraically (Nt+Nc)^2 / (Nt+Nc).
	// NOTE(review): the usual d-to-r conversion for unequal group sizes uses
	// (Nt+Nc)^2 / (Nt*Nc); confirm which one is intended before changing it,
	// since any tuned score cutoff depends on the current formula.
	a := nT + nC

	s.score = cohensd / math.Sqrt(cohensd*cohensd+a)

	return s
}
Beispiel #5
0
// main reads the data, configures the experiment globals, and then for each
// experiment evaluates every data set, prints per-data-set statistics, and
// writes the selected top scores. Start and end timestamps are printed in
// RFC 3339 format, followed by the per-experiment min/max trackers.
func main() {
	fmt.Println(time.Now().Format(time.RFC3339))

	// NOTE(review): rand.Seed is deprecated in recent Go releases; confirm it
	// still has the intended effect with the toolchain in use.
	rand.Seed(1)

	// Read in data
	readData()

	// Build the single level containing all row criteria; it is the starting
	// point for set creation.
	levelOne = fullOneLevel()

	// levels has not been assigned in this function yet at this point.
	outputRowCriteria(levels)

	// Experiment-wide settings.
	rand_numSets = 1000
	rand_maxSetMembers = 5
	maxExperiments = 1
	scoreCutoff = -0.89
	rowThreshhold = 2
	zScore = 2.58

	var perExperimentMin, perExperimentMax []float64

	for experiment := 1; experiment <= maxExperiments; experiment++ {
		// Per-experiment adjustments; all steps are currently zero.
		rand_numSets += 0
		rand_maxSetMembers += 0
		scoreCutoff += -0.00
		zScore += 0.0

		// State collected over this experiment. minScore starts at -100 and is
		// raised to any larger top score; maxScore starts at 0 and is lowered
		// to any smaller top score.
		var scores []scoreResult
		minScore, maxScore := -100.0, 0.0

		levels = fullFourLevel()
		fmt.Printf("sets count: %d, max set members: %d, level 1 count: %d, rowThreshhold: %d, scoreCutoff: %f, zScore: %f\n", len(levels), rand_maxSetMembers+2, len(levelOne), rowThreshhold, scoreCutoff, zScore)

		for dataSetId := 1; dataSetId <= datasets; dataSetId++ {
			// All scores for this data set, in sorted order.
			s := levelEval(dataSetId)
			sort.Sort(scoreResults(s))

			if len(s) > 0 {
				// The first entry after sorting is taken as the data set's
				// result.
				best := s[0]
				scores = append(scores, best)
				fmt.Printf("%d, %f \n", best.dataSetId, best.score)

				if best.score > minScore {
					minScore = best.score
				}
				if best.score < maxScore {
					maxScore = best.score
				}
			}

			// Summarise the strictly negative scores of this data set.
			var negatives []float64
			for _, item := range s {
				if item.score < 0.0 {
					negatives = append(negatives, item.score)
				}
			}

			median, _ := stats.Median(negatives)
			sd, _ := stats.StandardDeviation(negatives)
			lowest, _ := stats.Min(negatives)
			highest, _ := stats.Max(negatives)
			fmt.Printf("dataset: %d, median: %f, sd: %f, min: %f, max: %f, len: %d\n", dataSetId, median, sd, lowest, highest, len(negatives))
		}

		perExperimentMin = append(perExperimentMin, minScore)
		perExperimentMax = append(perExperimentMax, maxScore)

		outputScores(scores)
		// Write output file
		outputResults(scores)
	}

	fmt.Println(time.Now().Format(time.RFC3339))

	// Output the min and max trackers of every experiment.
	for _, v := range perExperimentMin {
		fmt.Printf("min: %f, ", v)
	}
	fmt.Println()
	for _, v := range perExperimentMax {
		fmt.Printf("max: %f, ", v)
	}
}
Beispiel #6
0
// apply runs one transformation over data and returns the resulting values
// together with any mappings the transformation produced (nil for operations
// that create none).
//
// transformation.Operation selects the behaviour and
// transformation.Parameters supplies its arguments:
//   - "toDate": [current layout, new layout]. Each value is parsed with the
//     current layout and re-formatted; values that fail to parse are logged
//     and left unchanged.
//   - "setNull": every value x for which arrayPos(x, parameters) is not -1 is
//     replaced by the empty string.
//   - "standardize": [type], one of "min-max", "z-score" or "decimal". A
//     column with zero range (min-max) or zero standard deviation (z-score)
//     is mapped to 0 rather than NaN. Any other type leaves data untouched.
//   - "binPercent": the parameters are converted to ints, sorted and used to
//     build percentile mappings, which are then applied to data.
//   - "fuzzyMap": [datasource GUID, match, put]. Only the GUID is read here.
//     Each distinct value is fuzzy-matched concurrently, so the order of the
//     returned mappings is not deterministic.
//
// Any other operation returns data unchanged. Most operations overwrite the
// elements of data in place. A wrong parameter count or a failed lookup or
// statistic terminates the process through log.Fatal.
func apply(data []string, transformation templates.Transformation) ([]string, []Mapping) {
	p := transformation.Parameters
	var mapping []Mapping

	switch transformation.Operation {
	case "toDate":
		if len(p) != 2 {
			log.Fatal("toDate transformation requires 2 parameters:  current format, new format")
		}

		oldFormat := p[0]
		newFormat := p[1]

		for i, x := range data {
			parsed, err := time.Parse(oldFormat, x)
			if err != nil {
				log.Print("Error parsing date with index ", i, " with format: ", oldFormat)
				continue
			}
			data[i] = parsed.Format(newFormat)
		}
	case "setNull":
		for i, x := range data {
			if arrayPos(x, p) != -1 {
				data[i] = ""
			}
		}
	case "standardize":
		if len(p) != 1 {
			log.Fatal("standardize transformation requires 1 parameter:  type (min-max|z-score|decimal)")
		}

		switch p[0] {
		case "min-max":
			values := strArrToFloatArr(data)
			lo, err := stats.Min(values)
			if err != nil {
				log.Fatal("Error finding minimum of data: ", err)
			}
			hi, err := stats.Max(values)
			if err != nil {
				log.Fatal("Error finding maximum of data: ", err)
			}

			span := hi - lo
			if span == 0 {
				// Constant column: x - lo is 0 for every value, so dividing
				// by 1 yields 0 instead of the NaN that 0/0 would produce.
				span = 1
			}

			for i, x := range values {
				data[i] = floatToString((x - lo) / span)
			}
		case "z-score":
			values := strArrToFloatArr(data)
			mean, err := stats.Mean(values)
			if err != nil {
				log.Fatal("Error finding mean of data: ", err)
			}
			sd, err := stats.StandardDeviation(values)
			if err != nil {
				log.Fatal("Error finding standard deviation of data: ", err)
			}
			if sd == 0 {
				// No spread: every value equals the mean, so map to 0 rather
				// than dividing 0 by 0.
				sd = 1
			}

			for i, x := range values {
				data[i] = floatToString((x - mean) / sd)
			}
		case "decimal":
			values := strArrToFloatArr(data)
			hi, err := stats.Max(values)
			if err != nil {
				log.Fatal("Error finding maximum of data: ", err)
			}
			lo, err := stats.Min(values)
			if err != nil {
				log.Fatal("Error finding minimum of data: ", err)
			}

			// Largest magnitude in the column.
			var maxAbs float64
			if math.Abs(hi) > math.Abs(lo) {
				maxAbs = math.Abs(hi)
			} else {
				maxAbs = math.Abs(lo)
			}

			// Scale by the power of ten that covers the largest magnitude.
			// An all-zero column keeps exponent 0, because Log10(0) is -Inf
			// and cannot be converted to an int meaningfully.
			exp := 0
			if maxAbs > 0 {
				exp = int(math.Ceil(math.Log10(maxAbs)))
			}
			scale := math.Pow10(exp)

			for i, x := range values {
				data[i] = floatToString(x / scale)
			}
		}
	case "binPercent":
		table := NewPivotTable(data)
		intP := strArrToIntArr(p)
		sort.Ints(intP)
		ps := NewPercentileService(*table, intP)
		mapping = ps.CreateMappings()
		ps.Bin(mapping, data)
	case "fuzzyMap":
		if len(p) != 3 {
			log.Fatal("fuzzyMap transformation requires 3 parameters:  datasource GUID, match, put")
		}

		dsGUID := p[0]
		ds := datasources.NewDatasourceService(database.GetDatabase())
		dsObj, err := ds.GetDatasource(dsGUID)
		if err != nil {
			log.Fatal("Error finding Datasource: ", err)
		}

		var (
			wg sync.WaitGroup
			mu sync.Mutex // guards mapping while the workers append to it
		)
		for _, datum := range getDistinctValues(data) {
			wg.Add(1)
			go func(datum string, dsObj datasources.Datasource) {
				// Deferred first so the WaitGroup is released on every exit
				// path of the worker.
				defer wg.Done()

				result := fuzzyMap(datum, dsObj.Settings)
				fuzzyMapping := NewMapping(datum, result)

				// Appending to a shared slice from several goroutines is a
				// data race without this lock.
				mu.Lock()
				mapping = append(mapping, *fuzzyMapping)
				mu.Unlock()
			}(datum, dsObj)
		}
		wg.Wait()
		data = applyMappings(mapping, data)
	}

	return data, mapping
}