Example #1
func (p *Predictor) SetDatasetPercentage(training int, testing int) error {
	if training+testing != 100 {
		return errors.New("Error: Sum of training and testing dataset percentages must equal 100")
	}

	p.trainingDatasetPercentage = training
	p.testingDatasetPercentage = testing

	return nil
}
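A minimal usage sketch, assuming a *Predictor value already exists (no constructor appears in these examples):

func exampleSetSplit(p *Predictor) {
	//70 + 30 == 100, so this succeeds
	if err := p.SetDatasetPercentage(70, 30); err != nil {
		fmt.Println(err)
	}
	//70 + 40 == 110, so this returns the sum-to-100 error
	if err := p.SetDatasetPercentage(70, 40); err != nil {
		fmt.Println(err)
	}
}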
Example #2
func (r *Regression) AddDataPoint(data DataPoint) error {
	numVariables := len(r.variableNames)
	if len(data.Variables) != numVariables {
		return errors.New("Error: Number of variables in the data != in the model")
	}

	r.dataPoints = append(r.dataPoints, data)
	r.initialized = true

	return nil
}
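A sketch of feeding one observation in; the DataPoint field names (Variables, Result) are those used by the examples on this page:

func exampleAddPoint(r *Regression) error {
	point := DataPoint{
		Variables: []float64{2.5, 1.0, 3.7}, //length must equal len(r.variableNames)
		Result:    1.0,                      //binary outcome: 0.0 or 1.0
	}
	return r.AddDataPoint(point)
}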
Example #3
//Odds ratio is e^coefficient
func (r *Regression) computeOddsRatio() error {
	length := len(r.model.Coefficients)
	if length == 0 {
		return errors.New("Error: Coefficients in models are not generated yet")
	}

	r.model.OddsRatio = make([]float64, length)
	for i := 0; i < length; i++ {
		odds := math.Exp(r.model.Coefficients[i])
		r.model.OddsRatio[i] = odds
	}

	return nil
}
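For intuition, a worked value: a coefficient of ln 2 ≈ 0.6931 maps to an odds ratio of exactly 2, i.e. a one-unit increase in that variable doubles the odds of the positive outcome:

func exampleOddsRatio() {
	coeff := math.Log(2)         //≈ 0.6931
	fmt.Println(math.Exp(coeff)) //2: each unit increase in x doubles the odds
}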
Example #4
//The Wald statistic is coefficient / standard error; its p-value follows from comparing it to the standard normal distribution
func (r *Regression) computeWaldStatistic() error {
	length := len(r.model.Coefficients)
	if length == 0 {
		return errors.New("Error: Coefficients in models are not generated yet")
	}

	//Assume standard error exists when coefficient is already computed
	r.model.WaldStatistics = make([]float64, length)
	for i := 0; i < length; i++ {
		wald := r.model.Coefficients[i] / r.model.StandardErrors[i]
		r.model.WaldStatistics[i] = wald
	}

	return nil
}
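The method stores the z statistic itself, not a p-value. The two-sided p-value would come from the standard normal CDF; a sketch of that conversion (this helper is not part of the library above):

//p = 2 * (1 - Phi(|z|)), where Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))
//is the standard normal CDF
func waldPValue(z float64) float64 {
	phi := 0.5 * (1 + math.Erf(math.Abs(z)/math.Sqrt2))
	return 2 * (1 - phi)
}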
Example #5
//lower = coefficient - 1.96 * standard error, upper = coefficient + 1.96 * standard error
func (r *Regression) computeConfidenceInterval() error {
	length := len(r.model.Coefficients)
	if length == 0 {
		return errors.New("Error: Coefficients in models are not generated yet")
	}

	//Assume standard error exists when coefficient is already computed
	r.model.LowerConfidenceIntervals = make([]float64, length)
	r.model.UpperConfidenceIntervals = make([]float64, length)

	for i := 0; i < length; i++ {
		offset := 1.96 * r.model.StandardErrors[i]
		r.model.LowerConfidenceIntervals[i] = r.model.Coefficients[i] - offset
		r.model.UpperConfidenceIntervals[i] = r.model.Coefficients[i] + offset
	}

	return nil
}
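A worked interval (1.96 is the 0.975 quantile of the standard normal, hence 95% coverage); exponentiating the bounds also gives an interval for the odds ratio:

func exampleConfidenceInterval() {
	coeff, se := 0.6931, 0.25
	lower := coeff - 1.96*se //0.2031
	upper := coeff + 1.96*se //1.1831
	fmt.Printf("coefficient 95%% CI: [%.4f, %.4f]\n", lower, upper)
	fmt.Printf("odds ratio 95%% CI: [%.4f, %.4f]\n", math.Exp(lower), math.Exp(upper))
}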
Example #6
//Note: W[t-1] is nxn which could be huge so instead of computing b[t] = b[t-1] + inv(X'W[t-1]X)X'(y - p[t-1]) directly
//we compute the W[t-1]X part, without the use of W.
//
//Since W is derived from the prob vector and W has non-0.0 elements only on the diagonal we can avoid a ton of work
//by using the prob vector directly and not computing W at all.
//
//Some research papers refer to the product W[t-1]X as X~, hence the name of this method.
//Ex: if xMatrix is 10x4 then W would be 10x10 so WX would be 10x4 -- the same size as X
func (r *Regression) computeXtilde(pVector matrix.Matrix, xMatrix matrix.Matrix) (matrix.Matrix, error) {
	pRows := pVector.Rows()
	xRows := xMatrix.Rows()
	xCols := xMatrix.Cols()

	if pRows != xRows {
		return nil, errors.New("The pVector and xMatrix are not compatible in computeXtilde")
	}

	//This is not a full matrix multiplication: the p column vector is applied element-wise down each column of X.
	result := matrix.Zeros(pRows, xCols)
	for i := 0; i < pRows; i++ {
		for j := 0; j < xCols; j++ {
			pVal := pVector.Get(i, 0)
			xVal := xMatrix.Get(i, j)
			result.Set(i, j, pVal*(1.0-pVal)*xVal) //Note the p(1-p)
		}
	}

	return result, nil
}
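For comparison, the naive version the comment warns against: materializing the n x n diagonal matrix W with w[i][i] = p[i]*(1-p[i]) and multiplying it out. A sketch using only the matrix primitives already seen here, useful for cross-checking computeXtilde on small inputs:

func computeXtildeNaive(pVector matrix.Matrix, xMatrix matrix.Matrix) matrix.Matrix {
	n := xMatrix.Rows()
	cols := xMatrix.Cols()

	//The n x n allocation computeXtilde avoids
	w := matrix.Zeros(n, n)
	for i := 0; i < n; i++ {
		pVal := pVector.Get(i, 0)
		w.Set(i, i, pVal*(1.0-pVal))
	}

	//Full multiplication W*X; only k == i contributes since W is diagonal
	result := matrix.Zeros(n, cols)
	for i := 0; i < n; i++ {
		for j := 0; j < cols; j++ {
			sum := 0.0
			for k := 0; k < n; k++ {
				sum += w.Get(i, k) * xMatrix.Get(k, j)
			}
			result.Set(i, j, sum)
		}
	}
	return result
}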
Example #7
//How good are the predictions? (using an already-calculated prob vector)
//Note: it is possible that a model with better (lower) MSE than a second model could give worse predictive accuracy.
func (r *Regression) meanSquaredError(pVector matrix.Matrix, yVector matrix.Matrix) (mse float64, err error) {
	pRows := pVector.Rows()
	yRows := yVector.Rows()

	if pRows != yRows {
		err = errors.New("Error: The prob vector and the y vector are not compatible")
		return 0.0, err
	}

	if pRows == 0 {
		return 0.0, nil
	}

	result := 0.0
	for i := 0; i < pRows; i++ {
		result += (pVector.Get(i, 0) - yVector.Get(i, 0)) * (pVector.Get(i, 0) - yVector.Get(i, 0))
	}
	mse = result / float64(pRows)

	return mse, nil
}
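A worked call: probabilities (0.9, 0.2) against outcomes (1.0, 0.0) give ((0.9-1.0)^2 + (0.2-0.0)^2) / 2 = 0.025:

func exampleMSE(r *Regression) (float64, error) {
	p := matrix.Zeros(2, 1)
	p.Set(0, 0, 0.9)
	p.Set(1, 0, 0.2)

	y := matrix.Zeros(2, 1)
	y.Set(0, 0, 1.0)
	y.Set(1, 0, 0.0)

	//((0.9-1.0)^2 + (0.2-0.0)^2) / 2 = (0.01 + 0.04) / 2 = 0.025
	return r.meanSquaredError(p, y)
}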
Example #8
func (r *Regression) Predict(testData DataPoint) (predicted float64, err error) {
	//Create matrix for independent variables
	numVariables := len(testData.Variables)
	testVariables := matrix.Zeros(1, numVariables+1)

	for i := 0; i < numVariables; i++ {
		if i == 0 {
			testVariables.Set(0, 0, 1)
		} else {
			testVariables.Set(0, i, testData.Variables[i-i])
		}
	}

	//Create coefficient matrix from already generated model
	coeffLen := len(r.model.Coefficients)
	bVector := matrix.Zeros(coeffLen, 1)

	for i := 0; i < coeffLen; i++ {
		bVector.Set(i, 0, r.model.Coefficients[i])
	}

	//Error checking first
	xCols := testVariables.Cols()
	bRows := bVector.Rows()
	if xCols != bRows {
		return 0.0, errors.New("Error:Bad dimensions for xMatrix or bVector in Predict()")
	}

	//Calculate probability vector
	pVector, err := r.constructProbVector(testVariables, bVector)
	if err != nil {
		return 0.0, err
	}

	return pVector.Get(0, 0), nil
}
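A sketch of scoring one new observation once a model exists (see Example #9 for training):

func examplePredict(r *Regression) {
	prob, err := r.Predict(DataPoint{Variables: []float64{2.5, 1.0, 3.7}})
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("P(y=1) = %.3f\n", prob) //TestModel thresholds this at 0.50
}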
Example #9
func (r *Regression) GenerateModel(maxIteration int) error {
	if !r.initialized {
		return errors.New("Error: Need some data to perform regression")
	}

	numData := len(r.dataPoints)
	numVariables := len(r.variableNames)

	if numData <= numVariables {
		return errors.New("Error: Datapoints must exceed variables")
	}

	//Create training data matrix for observed (result) and (independent) variables
	trainingObserved := matrix.Zeros(numData, 1)
	trainingVariables := matrix.Zeros(numData, numVariables+1)

	//Copy data to matrix
	for i := 0; i < numData; i++ {
		trainingObserved.Set(i, 0, r.dataPoints[i].Result)
		for j := 0; j < numVariables+1; j++ {
			if j == 0 {
				trainingVariables.Set(i, 0, 1)
			} else {
				trainingVariables.Set(i, j, r.dataPoints[i].Variables[j-1])
			}
		}
	}

	if r.debugMode {
		r.debugContext.Infof("\n---------- VARIABLES ---------\n%s", trainingVariables.String())
		r.debugContext.Infof("\n-------- OBSERVED -------\n%s", trainingObserved.String())
	}

	//Initialize model arrays
	r.model.StandardErrors = make([]float64, numVariables+1)

	//Newton-Raphson algorithm stop conditions
	epsilon := 0.01      //Stop if all coefficients change by less than this | algorithm has converged
	jumpFactor := 1000.0 //Stop if any new coefficient jumps too much | algorithm is spinning out of control

	//Use Newton-Raphson to find coefficients that best fit training data
	err := r.computeBestCoefficients(trainingVariables, trainingObserved, maxIteration, epsilon, jumpFactor)
	if err != nil {
		return err
	}

	//Compute odds ratio of generated coefficients
	err = r.computeOddsRatio()
	if err != nil {
		return err
	}

	//Compute pValue of wald statistics from the generated coefficients
	err = r.computeWaldStatistic()
	if err != nil {
		return err
	}

	//Compute confidence interval of generated coefficients
	err = r.computeConfidenceInterval()
	if err != nil {
		return err
	}

	//Compute log likelihood of the model
	err = r.computeLogLikelihood()
	if err != nil {
		return err
	}

	//Compute deviance of the model
	r.computeDeviance()

	//Compute chi-square value
	r.computeChiSquare()

	return nil
}
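Putting the pieces together, a training sketch. How a *Regression is constructed is not shown on this page, so r is taken as given; this code must also live in the same package to read the unexported model field:

func exampleTrain(r *Regression, training []DataPoint) error {
	for _, point := range training {
		if err := r.AddDataPoint(point); err != nil {
			return err
		}
	}

	//Cap Newton-Raphson at 1000 iterations (a value Example #12 calls reasonable)
	if err := r.GenerateModel(1000); err != nil {
		return err
	}

	//GenerateModel populates the per-coefficient statistics computed above
	for i, coeff := range r.model.Coefficients {
		fmt.Printf("b[%d]=%.4f OR=%.4f z=%.4f CI=[%.4f, %.4f]\n",
			i, coeff,
			r.model.OddsRatio[i],
			r.model.WaldStatistics[i],
			r.model.LowerConfidenceIntervals[i],
			r.model.UpperConfidenceIntervals[i])
	}
	return nil
}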
Example #10
func (r *Regression) TestModel(testData []DataPoint) (accuracy float64, err error) {
	numData := len(testData)
	if numData == 0 {
		return 0.0, errors.New("Error: Test data is empty")
	}
	numVariables := len(testData[0].Variables)

	//Create test data matrix for observed (result) and (independent) variables
	testObserved := matrix.Zeros(numData, 1)
	testVariables := matrix.Zeros(numData, numVariables+1)

	//Copy data to matrix
	for i := 0; i < numData; i++ {
		testObserved.Set(i, 0, testData[i].Result)
		for j := 0; j < numVariables+1; j++ {
			if j == 0 {
				testVariables.Set(i, 0, 1)
			} else {
				testVariables.Set(i, j, testData[i].Variables[j-1])
			}
		}
	}

	//Create coefficient matrix from already generated model
	coeffLen := len(r.model.Coefficients)
	bVector := matrix.Zeros(coeffLen, 1)
	for i := 0; i < coeffLen; i++ {
		bVector.Set(i, 0, r.model.Coefficients[i])
	}

	//Error checking first
	xRows := testVariables.Rows()
	xCols := testVariables.Cols()
	yRows := testObserved.Rows()
	bRows := bVector.Rows()
	if xCols != bRows || xRows != yRows {
		return 0.0, errors.New("Error:Bad dimensions for xMatrix or yVector or bVector in TestModel()")
	}

	pVector, err := r.constructProbVector(testVariables, bVector)
	if err != nil {
		return 0.0, err
	}

	pRows := pVector.Rows()
	if pRows != xRows {
		return 0.0, errors.New("Error:Unequal rows in prob vector and design matrix in TestModel()")
	}

	//Initiate cases
	numberCasesCorrect := 0
	numberCasesWrong := 0

	for i := 0; i < yRows; i++ {
		pVal := pVector.Get(i, 0)
		observedVal := testObserved.Get(i, 0)

		if r.debugMode {
			r.debugContext.Infof("\nPredicted vs Test Result: %v vs %v\n", pVal, observedVal)
		}

		if pVal >= 0.50 && observedVal == 1.0 {
			numberCasesCorrect += 1
		} else if pVal < 0.50 && observedVal == 0.0 {
			numberCasesCorrect += 1
		} else {
			numberCasesWrong += 1
		}
	}

	//Calculate correct prediction percentage (total == numData > 0, guarded above)
	total := numberCasesCorrect + numberCasesWrong
	correctPercentage := (100.0 * float64(numberCasesCorrect)) / float64(total)

	if r.debugMode {
		r.debugContext.Infof("\nCorrect case vs Wrong case: %v vs %v\n", numberCasesCorrect, numberCasesWrong)
		r.debugContext.Infof("\nCorrect predicted percentage: %v", correctPercentage)
	}

	return correctPercentage, nil
}
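A hold-out evaluation sketch; the 70/30 split mirrors Example #1, and the slicing is illustrative rather than library code:

func exampleEvaluate(r *Regression, all []DataPoint) (float64, error) {
	cut := len(all) * 70 / 100
	for _, point := range all[:cut] {
		if err := r.AddDataPoint(point); err != nil {
			return 0.0, err
		}
	}
	if err := r.GenerateModel(1000); err != nil {
		return 0.0, err
	}
	//Percentage of held-out cases called correctly at the 0.50 threshold
	return r.TestModel(all[cut:])
}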
Example #11
//Log likelihood: ln L = sum over i of [Yi * ln(Pi) + (1 - Yi) * ln(1 - Pi)]
func (r *Regression) computeLogLikelihood() error {
	numData := len(r.dataPoints)
	numVariables := len(r.dataPoints[0].Variables)

	//Create test data matrix for observed (result) and (independent) variables
	testObserved := matrix.Zeros(numData, 1)
	testVariables := matrix.Zeros(numData, numVariables+1)

	//Copy data to matrix
	for i := 0; i < numData; i++ {
		testObserved.Set(i, 0, r.dataPoints[i].Result)
		for j := 0; j < numVariables+1; j++ {
			if j == 0 {
				testVariables.Set(i, 0, 1)
			} else {
				testVariables.Set(i, j, r.dataPoints[i].Variables[j-1])
			}
		}
	}

	//Error check the coefficient
	coeffLen := len(r.model.Coefficients)
	if coeffLen == 0 {
		return errors.New("Error: Coefficients in models are not generated yet")
	}

	//Create coefficient matrix from already generated model
	bVector := matrix.Zeros(coeffLen, 1)
	for i := 0; i < coeffLen; i++ {
		bVector.Set(i, 0, r.model.Coefficients[i])
	}

	//Error checking again
	xRows := testVariables.Rows()
	xCols := testVariables.Cols()
	yRows := testObserved.Rows()
	bRows := bVector.Rows()
	if xCols != bRows || xRows != yRows {
		return errors.New("Error:Bad dimensions for xMatrix or yVector or bVector in computeLogLikelihood()")
	}

	pVector, err := r.constructProbVector(testVariables, bVector)
	if err != nil {
		return err
	}

	pRows := pVector.Rows()
	if pRows != xRows {
		return errors.New("Error:Unequal rows in prob vector and design matrix in computeLogLikelihood()")
	}

	//Accumulate the per-observation log-likelihood terms
	logLikelihood := 0.0

	for i := 0; i < yRows; i++ {
		pVal := pVector.Get(i, 0)
		observedVal := testObserved.Get(i, 0)

		current := 0.0
		if observedVal == 0.0 {
			current = math.Log(1 - pVal)
		} else if observedVal == 1.0 {
			current = math.Log(pVal)
		}

		if r.debugMode {
			r.debugContext.Infof("\nY is %v and P is %v", observedVal, pVal)
			r.debugContext.Infof("\n(Yi * ln Pi) + (1 - Yi) * ln (1 - Pi): %v", current)
		}

		logLikelihood += current
	}

	if r.debugMode {
		r.debugContext.Infof("\nLog likelihood: %v", logLikelihood)
	}

	//Save the calculated log likelihood result
	r.model.LogLikelihood = logLikelihood

	return nil
}
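Two worked terms of the sum show why this is a sensible fit measure: a confident correct prediction costs little, a confident wrong one costs a lot:

func exampleLogLikelihoodTerms() {
	fmt.Println(math.Log(0.8)) //Y=1, P=0.8: ln(P) ≈ -0.223 (confident and right)
	fmt.Println(math.Log(0.2)) //Y=0, P=0.8: ln(1-P) ≈ -1.609 (confident and wrong)
}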
Example #12
//Use the Newton-Raphson technique to estimate logistic regression beta parameters: b[t] = b[t-1] + inv(X'W[t-1]X)X'(Y - p[t-1])
//- xTrainingVector is a design matrix of predictor variables where the first column is augmented with all 1.0 to represent dummy x values for the b0 constant
//- yTrainingVector is a column vector of binary (0.0 or 1.0) dependent variables
//- maxIterations is the maximum number of times to iterate in the algorithm. A value of 1000 is reasonable.
//- epsilon is a closeness parameter: if all new b[i] values after an iteration are within epsilon of
// 	the old b[i] values, we assume the algorithm has converged and we return. A value like 0.001 is often reasonable.
//- jumpFactor stops the algorithm if any new beta value is jumpFactor times greater than the old value. A value of 1000.0 seems reasonable.
//
//The return is a column vector of the beta estimates: b[0] is the constant, b[1] for x1, etc.
//
//Note: There is a lot that can go wrong here. The algorithm involves computing a matrix inverse (see MatrixInverse), which fails
//if the inverse cannot be computed. The Newton-Raphson algorithm can also generate beta values that tend towards infinity.
//If anything bad happens, the result is the best beta values known at the time (which could be all 0.0 values, but never nil).
func (r *Regression) computeBestCoefficients(xTrainingVector matrix.Matrix, yTrainingVector matrix.Matrix, maxIteration int, epsilon float64, jumpFactor float64) error {
	//Error checking for the matrix length first
	xRows := xTrainingVector.Rows()
	xCols := xTrainingVector.Cols()

	if xRows != yTrainingVector.Rows() {
		return errors.New("Error: Training vectors are not compatible to generate model")
	}

	//Initial coefficients (matrix.Zeros already initializes every entry to 0.0)
	coeffVector := matrix.Zeros(xCols, 1)

	if r.debugMode {
		r.debugContext.Infof("\nInitial coefficients vector:\n%s", coeffVector.String())
	}

	//Current best coefficients
	bestCoeffVector := coeffVector.Copy()

	//A column vector of the probabilities of each row using the b[i] values and the x[i] values
	pVector, err := r.constructProbVector(xTrainingVector, coeffVector)
	if err != nil {
		return err
	}

	if r.debugMode {
		r.debugContext.Infof("\nInitial probabilities vector:\n%s", pVector.String())
	}

	//Check MSE of the prediction
	mse, err := r.meanSquaredError(pVector, yTrainingVector)
	if err != nil {
		return err
	}

	if r.debugMode {
		r.debugContext.Infof("\nInitial MSE:\n%f", mse)
	}

	//How many times are the new betas worse (i.e., give worse MSE) than the current betas
	timesWorse := 0
	for i := 0; i < maxIteration; i++ {
		//Generate new beta values using Newton-Raphson. Could return nil
		newCoeffVector := r.constructNewCoefficientsVector(coeffVector, xTrainingVector, yTrainingVector, pVector)
		if newCoeffVector == nil {
			if r.debugMode {
				r.debugContext.Infof("\nNew coefficients vector is null | current product cannot be inverted -- stopping")
			}
			break
		}

		if r.debugMode {
			r.debugContext.Infof("\nNew calculated coefficients vector:\n%s", newCoeffVector.String())
		}

		//We are done because of no significant change in beta[]
		if r.noChange(coeffVector, newCoeffVector, epsilon) {
			if r.debugMode {
				r.debugContext.Infof("\nNo significant change between old beta values and new beta values -- stopping")
			}
			break
		}

		//Any new beta more than jumpFactor times greater than old?
		if r.outOfControl(coeffVector, newCoeffVector, jumpFactor) {
			if r.debugMode {
				r.debugContext.Infof("\nThe new coefficients vector has at least one value which changed by a factor of %s -- stopping", jumpFactor)
			}
			break
		}

		pVector, err = r.constructProbVector(xTrainingVector, newCoeffVector)
		if err != nil {
			return err
		}

		if r.debugMode {
			r.debugContext.Infof("\nNew calculated probabilities vector:\n%s", pVector.String())
		}

		newMSE, err := r.meanSquaredError(pVector, yTrainingVector)
		if err != nil {
			return err
		}

		if r.debugMode {
			r.debugContext.Infof("\nNew calculated MSE:\n%f", newMSE)
		}

		if newMSE > mse {
			//Update counter if newMSE is worse than the current one
			timesWorse += 1
			if timesWorse > 4 {
				if r.debugMode {
					r.debugContext.Infof("\nThe new coefficients vector produced worse predictions even after modification four times in a row -- stopping")
				}
				break
			}

			if r.debugMode {
				r.debugContext.Infof("\nThe new coefficients vector has produced probabilities which give worse predictions -- modifying new coefficients to halfway between old and new")
			}

			//Update current: instead of adopting the new betas outright, move halfway between old and new
			for k := 0; k < coeffVector.Rows(); k++ {
				val := coeffVector.Get(k, 0)
				newVal := newCoeffVector.Get(k, 0)
				coeffVector.Set(k, 0, (val+newVal)/2.0)
			}

			//Update current MSE
			mse = newMSE
		} else {
			if r.debugMode {
				r.debugContext.Infof("\nThe new coefficients vector has produced probabilities which give better predictions -- updating")
			}

			coeffVector = newCoeffVector.DenseMatrix() //Update current coefficients
			bestCoeffVector = coeffVector.Copy()       //Update best coefficients
			mse = newMSE                               //Update current MSE
			timesWorse = 0                             //Reset counter
		}

		if r.debugMode && i == maxIteration-1 {
			r.debugContext.Infof("\nExceeded max iterations -- stopping")
		}
	}

	if r.debugMode {
		r.debugContext.Infof("\nBest coefficients vector:\n%s", bestCoeffVector.String())
	}

	//Done, put coefficients data from matrix to arrays
	length := bestCoeffVector.Rows()
	r.model.Coefficients = make([]float64, length)
	for i := 0; i < length; i++ {
		r.model.Coefficients[i] = bestCoeffVector.Get(i, 0)
	}

	return nil
}
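constructProbVector is called throughout these examples but never shown. A minimal sketch of what the call sites imply, i.e. that each row's probability is the logistic function of its linear predictor; treat this as an assumption about the implementation, not the library's actual code:

func constructProbVectorSketch(xMatrix matrix.Matrix, bVector matrix.Matrix) (matrix.Matrix, error) {
	if xMatrix.Cols() != bVector.Rows() {
		return nil, errors.New("Error: xMatrix and bVector are not compatible")
	}

	result := matrix.Zeros(xMatrix.Rows(), 1)
	for i := 0; i < xMatrix.Rows(); i++ {
		z := 0.0
		for j := 0; j < xMatrix.Cols(); j++ {
			z += xMatrix.Get(i, j) * bVector.Get(j, 0) //linear predictor x[i] . b
		}
		result.Set(i, 0, 1.0/(1.0+math.Exp(-z))) //logistic function
	}
	return result, nil
}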