func calculateConfidenceInterval(s scoreResult) confInterval {
	var t0s []float64
	var t1s []float64

	// Partition the data into treatment 0 and treatment 1
	// and save the response for evaluation.
	for _, each := range s.t0 {
		t0s = append(t0s, each.y)
	}
	for _, each := range s.t1 {
		t1s = append(t1s, each.y)
	}

	var ci confInterval

	// Other z-scores to experiment with: 1.645 (90%), 1.96 (95%), 2.58 (99%)
	// http://www.dummies.com/how-to/content/creating-a-confidence-interval-for-the-difference-.html

	var m0, _ = stats.Mean(t0s)
	var n0 = float64(len(t0s))
	var sd0, _ = stats.StandardDeviation(t0s)

	var m1, _ = stats.Mean(t1s)
	var n1 = float64(len(t1s))
	var sd1, _ = stats.StandardDeviation(t1s)

	// Difference in sample means +/- z * standard error of the difference.
	var mDiff = m0 - m1
	var sd0s = sd0 * sd0
	var sd1s = sd1 * sd1
	ci.min = mDiff - zScore*math.Sqrt(sd1s/n1+sd0s/n0)
	ci.max = mDiff + zScore*math.Sqrt(sd1s/n1+sd0s/n0)
	ci.diff = ci.min - ci.max // min - max, so this is the negative interval width
	ci.t1Max = m1 + ci.max
	ci.t1Min = m1 + ci.min
	ci.diffSd = ci.diff / sd1

	// How close the score is to the middle of the confidence interval.
	ci.middle = (ci.min + ci.max) / 2
	//ci.closeness = math.Abs(s.score - ci.middle)
	//ci.closeness = math.Abs(ci.diffSd - s.score)

	//fmt.Printf("conf interval: %f to %f, conf diff: %f, t1: %f, t1max: %f, t1min: %f, diffSd: %f\n",
	//	ci.min, ci.max, ci.diff, m1, ci.t1Max, ci.t1Min, ci.diffSd)

	return ci
}
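// For reference, a minimal self-contained sketch of the same two-sample
// z-interval used above, not part of the original source. It assumes the
// stats package is github.com/montanaflynn/stats; twoSampleCI is a
// hypothetical helper name.
func twoSampleCI(t0s, t1s []float64, z float64) (lo, hi float64) {
	m0, _ := stats.Mean(t0s)
	m1, _ := stats.Mean(t1s)
	sd0, _ := stats.StandardDeviation(t0s)
	sd1, _ := stats.StandardDeviation(t1s)
	n0, n1 := float64(len(t0s)), float64(len(t1s))
	// Standard error of the difference between the two sample means.
	se := math.Sqrt(sd1*sd1/n1 + sd0*sd0/n0)
	return (m0 - m1) - z*se, (m0 - m1) + z*se
}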
// NormalConfidenceInterval returns the 95% confidence interval for the mean
// of nums, using the normal approximation.
// https://github.com/hermanschaaf/stats/blob/master/stats.go
func NormalConfidenceInterval(nums []float64) (lower float64, upper float64) {
	conf := 1.95996 // z-score for 95% confidence for the mean, http://bit.ly/Mm05eZ
	mean, _ := stats.Mean(nums)
	dev, _ := stats.StandardDeviation(nums)
	dev = dev / math.Sqrt(float64(len(nums)))
	return mean - dev*conf, mean + dev*conf
}
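// Hypothetical usage sketch, not in the original source: a 95% interval for
// a small sample of latencies.
func exampleNormalCI() {
	latencies := []float64{12.1, 9.8, 11.4, 10.9, 13.0}
	lo, hi := NormalConfidenceInterval(latencies)
	fmt.Printf("mean is in [%f, %f] with 95%% confidence\n", lo, hi)
}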
// report logs write-latency statistics once per second, draining the
// accumulated samples on each tick.
func (s *statistics) report() {
	for range time.Tick(time.Second) {
		s.Lock()
		writeTimes := s.writeTimes
		s.writeTimes = nil
		s.Unlock()

		// The stats functions return an error only when the input is empty.
		mean, _ := stats.Mean(writeTimes)
		stddev, _ := stats.StandardDeviation(writeTimes)
		log.Infof("wrote %d messages, latency mean=%s, stddev=%s",
			len(writeTimes), time.Duration(mean), time.Duration(stddev))
	}
}
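// The statistics type is defined elsewhere in the source. A plausible shape,
// shown here only as an assumption so report() reads in context: a mutex
// guarding a slice of latencies stored as float64 nanoseconds (which is why
// report() can cast the mean back to time.Duration). The record helper is
// hypothetical.
type statistics struct {
	sync.Mutex
	writeTimes []float64 // observed write latencies, in nanoseconds
}

// record appends one observed latency (hypothetical helper).
func (s *statistics) record(d time.Duration) {
	s.Lock()
	s.writeTimes = append(s.writeTimes, float64(d))
	s.Unlock()
}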
// evalScore calculates, for a partition in the set of data, the effective
// treatment score using (mean t1 - mean t0) / pooled standard deviation.
func evalScore(d []coreData, rc []rowCriteria, dataSetId int) scoreResult {
	var s scoreResult
	s.rc = rc
	s.d = d
	s.score = 0
	s.dataSetId = dataSetId

	// Check for the minimum row threshold.
	if len(d) <= rowThreshhold {
		return s
	}

	var t0 []coreData
	var t1 []coreData
	var t0s []float64
	var t1s []float64
	var allTs []float64

	// Partition the data into treatment 0 and treatment 1
	// and save the response for evaluation.
	for _, each := range d {
		// Save all responses for a later SD calculation.
		allTs = append(allTs, each.y)
		if each.treatment == 0 {
			t0 = append(t0, each)
			t0s = append(t0s, each.y)
		} else {
			t1 = append(t1, each)
			t1s = append(t1s, each.y)
		}
	}

	// Must have the minimum threshold of records.
	if len(t0)+len(t1) < rowThreshhold {
		return s
	}
	// Must have at least one record in each group.
	if len(t0) == 0 || len(t1) == 0 {
		return s
	}

	// Calculate the mean of each group (the median is another option to
	// experiment with).
	var mean0, _ = stats.Mean(t0s)
	var mean1, _ = stats.Mean(t1s)
	//var meanAll, _ = stats.Mean(allTs)
	//var sd, _ = stats.StandardDeviationPopulation(allTs)

	var St, _ = stats.StandardDeviation(t1s)
	var Sc, _ = stats.StandardDeviation(t0s)
	var Nt = float64(len(t1s))
	var Nc = float64(len(t0s))

	var ci = calculateConfidenceInterval2(Nt, Nc, mean1, mean0, St, Sc)

	// If the confidence intervals overlap then the range is not valid.
	if ci.overlap {
		return s
	}

	var St2 = St * St
	var Sc2 = Sc * Sc

	// Pooled standard deviation. The weighted form,
	// sqrt(((Nt-1)*St^2 + (Nc-1)*Sc^2) / (Nt+Nc)),
	// is another option; with similar group sizes the simpler form below
	// is used instead. http://www.uccs.edu/~lbecker/
	var sPooled = math.Sqrt((St2 + Sc2) / 2)

	s.t0 = t0
	s.t1 = t1

	// Score type 1: standardized mean difference (Cohen's d).
	// Subtract the two means (t1 - t0); we want t1 to be smaller,
	// so good scores are negative.
	var meanDifference = mean1 - mean0
	var cohensd = meanDifference / sPooled

	// Correction term for converting d to a correlation r with unequal
	// group sizes: a = (Nt+Nc)^2 / (Nt*Nc).
	var a = ((Nt + Nc) * (Nt + Nc)) / (Nt * Nc)

	// Score type 5: r = d / sqrt(d^2 + 4), exact when Nt == Nc.
	s.score = cohensd / math.Sqrt((cohensd*cohensd)+4)
	// Score type 6 (overwrites type 5): r = d / sqrt(d^2 + a).
	s.score = cohensd / math.Sqrt((cohensd*cohensd)+a)
	//s.score = (mean1/St - mean0/Sc) / St

	return s
}
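// A quick numeric check of the score-type-6 conversion, not in the original
// source; the function name and values are hypothetical. With Nt = 20,
// Nc = 5 and d = 1.0, a = (25*25)/(20*5) = 6.25 and r = 1/sqrt(1+6.25).
func exampleScoreType6() {
	d := 1.0
	Nt, Nc := 20.0, 5.0
	a := ((Nt + Nc) * (Nt + Nc)) / (Nt * Nc)
	r := d / math.Sqrt(d*d+a)
	fmt.Printf("d=%f -> r=%f\n", d, r) // d=1.000000 -> r=0.371391
}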
func main() {
	t := time.Now()
	fmt.Println(t.Format(time.RFC3339))

	rand.Seed(1)

	// Read in data.
	readData()

	// Set one level with all row criteria;
	// this is used to start the set creation.
	levelOne = fullOneLevel()
	//levels = fullTwoLevel()
	outputRowCriteria(levels)

	// Experiment variables.
	rand_numSets = 1000
	rand_maxSetMembers = 5
	maxExperiments = 1

	var expMin []float64
	var expMax []float64

	scoreCutoff = -0.89
	rowThreshhold = 2
	zScore = 2.58

	for experiment := 1; experiment <= maxExperiments; experiment++ {
		// Experiment variables that change per experiment.
		rand_numSets += 0
		rand_maxSetMembers += 0
		scoreCutoff += -0.00
		zScore += 0.0

		// Set up per-experiment variables.
		var scores []scoreResult
		var minScore float64 = 100
		var maxScore float64 = -100

		levels = fullFourLevel() //randLevels()

		fmt.Printf("sets count: %d, max set members: %d, level 1 count: %d, rowThreshhold: %d, scoreCutoff: %f, zScore: %f\n",
			len(levels), rand_maxSetMembers+2, len(levelOne), rowThreshhold, scoreCutoff, zScore)

		for dataSetId := 1; dataSetId <= datasets; dataSetId++ {
			s := levelEval(dataSetId)
			sort.Sort(scoreResults(s))

			// s contains a sorted list of scores for one dataset;
			// this is where we can get some info on that data.
			//outputScoreList(s)
			if len(s) > 0 {
				//var sEval = evaluateScores(s)
				// Pick the top score.
				var sEval = s[0]
				scores = append(scores, sEval)
				fmt.Printf("%d, %f \n", sEval.dataSetId, sEval.score)
				if sEval.score < minScore {
					minScore = sEval.score
				}
				if sEval.score > maxScore {
					maxScore = sEval.score
				}
			}

			// For all scores in this set, write out the median and standard deviation.
			var set []float64
			for _, scoreItem := range s {
				if scoreItem.score < 0.0 {
					set = append(set, scoreItem.score)
				}
			}
			var median, _ = stats.Median(set)
			var sd, _ = stats.StandardDeviation(set)
			var min, _ = stats.Min(set)
			var max, _ = stats.Max(set)
			fmt.Printf("dataset: %d, median: %f, sd: %f, min: %f, max: %f, len: %d\n",
				dataSetId, median, sd, min, max, len(set))
		}

		expMin = append(expMin, minScore)
		expMax = append(expMax, maxScore)

		//scoreCutoff = (minScore * (percentRofMin / 100.0)) + minScore
		//fmt.Printf(" scoreCutoff: %f \n", scoreCutoff)

		outputScores(scores)

		// Write the output file.
		outputResults(scores)

		// Compare to training truth data.
		// compareTrainingDataWithResults()
	}

	t = time.Now()
	fmt.Println(t.Format(time.RFC3339))

	// Output min and max scores per experiment.
	for _, each := range expMin {
		fmt.Printf("min: %f, ", each)
	}
	fmt.Println()
	for _, each := range expMax {
		fmt.Printf("max: %f, ", each)
	}
}
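// scoreResults is defined elsewhere in the source. A plausible definition,
// assumed here so sort.Sort(scoreResults(s)) reads in context, sorts
// ascending by score so that s[0] holds the most negative (best) score.
// This sketch is an assumption, not the original definition.
type scoreResults []scoreResult

func (a scoreResults) Len() int           { return len(a) }
func (a scoreResults) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a scoreResults) Less(i, j int) bool { return a[i].score < a[j].score }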
// apply transforms an array of data according to the given transformation.
func apply(data []string, transformation templates.Transformation) ([]string, []Mapping) {
	p := transformation.Parameters
	var wg sync.WaitGroup
	var mapping []Mapping

	switch transformation.Operation {
	case "toDate":
		if len(p) != 2 {
			log.Fatal("toDate transformation requires 2 parameters: current format, new format")
		}
		oldFormat := p[0]
		newFormat := p[1]
		for i, x := range data {
			y, err := time.Parse(oldFormat, x)
			if err != nil {
				log.Print("Error parsing date at index ", i, " with format: ", oldFormat)
			} else {
				data[i] = y.Format(newFormat)
			}
		}
	case "setNull":
		for i, x := range data {
			if arrayPos(x, p) != -1 {
				data[i] = ""
			}
		}
	case "standardize":
		if len(p) != 1 {
			log.Fatal("standardize transformation requires 1 parameter: type (min-max|z-score|decimal)")
		}
		stype := p[0]
		switch stype {
		case "min-max":
			newData := strArrToFloatArr(data)
			min, err := stats.Min(newData)
			if err != nil {
				log.Fatal("Error finding minimum of data: ", err)
			}
			max, err := stats.Max(newData)
			if err != nil {
				log.Fatal("Error finding maximum of data: ", err)
			}
			srange := max - min
			for i, x := range newData {
				data[i] = floatToString((x - min) / srange)
			}
		case "z-score":
			newData := strArrToFloatArr(data)
			mean, err := stats.Mean(newData)
			if err != nil {
				log.Fatal("Error finding mean of data: ", err)
			}
			sd, err := stats.StandardDeviation(newData)
			if err != nil {
				log.Fatal("Error finding standard deviation of data: ", err)
			}
			for i, x := range newData {
				data[i] = floatToString((x - mean) / sd)
			}
		case "decimal":
			newData := strArrToFloatArr(data)
			max, err := stats.Max(newData)
			if err != nil {
				log.Fatal("Error finding maximum of data: ", err)
			}
			min, err := stats.Min(newData)
			if err != nil {
				log.Fatal("Error finding minimum of data: ", err)
			}
			// Scale by the smallest power of ten that brings every
			// value into (-1, 1).
			maxAbs := math.Max(math.Abs(max), math.Abs(min))
			c := math.Ceil(math.Log10(maxAbs))
			for i, x := range newData {
				data[i] = floatToString(x / math.Pow10(int(c)))
			}
		}
	case "binPercent":
		table := NewPivotTable(data)
		intP := strArrToIntArr(p)
		sort.Ints(intP)
		ps := NewPercentileService(*table, intP)
		mapping = ps.CreateMappings()
		ps.Bin(mapping, data)
	case "fuzzyMap":
		if len(p) != 3 {
			log.Fatal("fuzzyMap transformation requires 3 parameters: datasource GUID, match, put")
		}
		dsGUID := p[0]
		ds := datasources.NewDatasourceService(database.GetDatabase())
		dsObj, err := ds.GetDatasource(dsGUID)
		if err != nil {
			log.Fatal("Error finding Datasource: ", err)
		}
		distinctValues := getDistinctValues(data)
		var mu sync.Mutex // guards mapping; the goroutines below append concurrently
		for i, datum := range distinctValues {
			wg.Add(1)
			go func(i int, datum string, dsObj datasources.Datasource) {
				defer wg.Done()
				result := fuzzyMap(datum, dsObj.Settings)
				fuzzyMapping := NewMapping(datum, result)
				mu.Lock()
				mapping = append(mapping, *fuzzyMapping)
				mu.Unlock()
			}(i, datum, dsObj)
		}
		wg.Wait()
		data = applyMappings(mapping, data)
	}
	return data, mapping
}
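// Hypothetical usage sketch, not in the original source: min-max
// standardization of a numeric column. Assumes templates.Transformation has
// Operation and Parameters fields, as used above.
func exampleApplyMinMax() {
	col := []string{"2", "4", "6", "10"}
	tr := templates.Transformation{
		Operation:  "standardize",
		Parameters: []string{"min-max"},
	}
	scaled, _ := apply(col, tr)
	fmt.Println(scaled) // values rescaled into [0, 1]: 0, 0.25, 0.5, 1
}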