예제 #1
0
// DatasetFromPath reads the CSV file at filepath and builds a dataset.Dataset
// from its records.
//
// The first record is read and discarded (presumably a header row — confirm
// against the expected file format). The second record is handed to
// columntype.StringsToColumnTypes to determine the column types, and is then
// also added to the dataset as its first row, followed by every remaining
// record in the file.
//
// targetStartInclusive and targetEndExclusive are validated against the number
// of columns with targetOutOfBounds and then passed to featureColumnIndices
// and targetColumnIndices to decide which columns are features and which are
// targets.
//
// A nil dataset and a csvparseerrors error are returned when:
//   - the file cannot be opened,
//   - either of the first two records cannot be read,
//   - the column types cannot be parsed from the second record,
//   - the target range is out of bounds for the number of columns,
//   - a record cannot be added to the dataset as a row, or
//   - the CSV reader fails with anything other than io.EOF.
func DatasetFromPath(filepath string, targetStartInclusive, targetEndExclusive int) (dataset.Dataset, error) {
	file, err := os.Open(filepath)
	if err != nil {
		return nil, csvparseerrors.NewUnableToOpenFileError(filepath, err)
	}
	// The file is opened read-only, so an error from Close cannot lose data
	// and is deliberately not reported.
	defer file.Close()

	reader := csv.NewReader(file)

	// Skip the first record, but do not let a failure to read it be
	// overwritten by the result of the next read.
	if _, err = reader.Read(); err != nil {
		return nil, csvparseerrors.NewUnableToReadTwoLinesError(filepath, err)
	}

	line, err := reader.Read()
	if err != nil {
		return nil, csvparseerrors.NewUnableToReadTwoLinesError(filepath, err)
	}

	columnTypes, err := columntype.StringsToColumnTypes(line)
	if err != nil {
		return nil, csvparseerrors.NewUnableToParseColumnTypesError(filepath, err)
	}

	numColumns := len(columnTypes)
	if targetOutOfBounds(targetStartInclusive, targetEndExclusive, numColumns) {
		return nil, csvparseerrors.NewTargetOutOfBoundsError(filepath, targetStartInclusive, targetEndExclusive, numColumns)
	}

	newDataset := dataset.NewDataset(
		featureColumnIndices(targetStartInclusive, targetEndExclusive, numColumns),
		targetColumnIndices(targetStartInclusive, targetEndExclusive, numColumns),
		columnTypes,
	)

	// line still holds the record used to determine the column types, so the
	// first iteration adds that record as a row before any further reads.
	for ; err == nil; line, err = reader.Read() {
		if err = newDataset.AddRowFromStrings(line); err != nil {
			return nil, csvparseerrors.NewUnableToParseRowError(filepath, err)
		}
	}
	// The loop only exits with a non-nil err; io.EOF is the normal end of input.
	if err != io.EOF {
		return nil, csvparseerrors.NewGenericError(filepath, err)
	}

	return newDataset, nil
}
예제 #2
0
			err         error
		)

		JustBeforeEach(func() {
			trainingSet, testSet, err = crossvalidation.SplitDataset(
				originalSet,
				trainingRatio,
				rand.NewSource(5330), // SEED
			)
		})

		BeforeEach(func() {
			columnTypes, columnTypesError := columntype.StringsToColumnTypes([]string{"0"})
			Ω(columnTypesError).ShouldNot(HaveOccurred())

			originalSet = dataset.NewDataset([]int{}, []int{0}, columnTypes)
		})

		Context("when the training ratio negative", func() {
			BeforeEach(func() {
				trainingRatio = -0.67
			})

			It("errors", func() {
				Ω(err).Should(HaveOccurred())
			})
		})

		Context("when the training ratio is greater than 1", func() {
			BeforeEach(func() {
				trainingRatio = 1.67
예제 #3
0
	"github.com/amitkgupta/goodlearn/data/row"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

var _ = Describe("Dataset", func() {
	var ds dataset.Dataset

	Describe("AllFeaturesFloats", func() {
		Context("When all features are floats", func() {
			BeforeEach(func() {
				columnTypes, err := columntype.StringsToColumnTypes([]string{"1.0", "1.0"})
				Ω(err).ShouldNot(HaveOccurred())

				ds = dataset.NewDataset([]int{0, 1}, []int{}, columnTypes)
			})

			It("Returns true", func() {
				Ω(ds.AllFeaturesFloats()).Should(BeTrue())
			})
		})

		Context("When not all features are floats", func() {
			BeforeEach(func() {
				columnTypes, err := columntype.StringsToColumnTypes([]string{"x", "1.0"})
				Ω(err).ShouldNot(HaveOccurred())

				ds = dataset.NewDataset([]int{0, 1}, []int{}, columnTypes)
			})
예제 #4
0
		})
	})

	Describe("Train", func() {
		var trainingData dataset.Dataset

		BeforeEach(func() {
			kNNClassifier, _ = knn.NewKNNClassifier(1)
		})

		Context("When the dataset is empty", func() {
			BeforeEach(func() {
				columnTypes, err := columntype.StringsToColumnTypes([]string{"hi", "0", "0"})
				Ω(err).ShouldNot(HaveOccurred())

				trainingData = dataset.NewDataset([]int{1, 2}, []int{0}, columnTypes)
			})

			It("Returns an error", func() {
				err := kNNClassifier.Train(trainingData)
				Ω(err).Should(HaveOccurred())
				Ω(err).Should(BeAssignableToTypeOf(knnerrors.EmptyTrainingDatasetError{}))
			})
		})

		Context("When the dataset's features are not all floats", func() {
			BeforeEach(func() {
				columnTypes, err := columntype.StringsToColumnTypes([]string{"hi", "bye", "0"})
				Ω(err).ShouldNot(HaveOccurred())

				trainingData = dataset.NewDataset([]int{1, 2}, []int{0}, columnTypes)
			var err error
			estimator, err = gradientdescentestimator.NewGradientDescentParameterEstimator(
				0.3,
				0.3,
				100,
				lossGradient,
			)
			Ω(err).ShouldNot(HaveOccurred())
		})

		Context("Given a dataset with non-float features", func() {
			BeforeEach(func() {
				columnTypes, err := columntype.StringsToColumnTypes([]string{"x", "1.0"})
				Ω(err).ShouldNot(HaveOccurred())

				trainingSet = dataset.NewDataset([]int{0}, []int{1}, columnTypes)

				err = trainingSet.AddRowFromStrings([]string{"hi", "24"})
				Ω(err).ShouldNot(HaveOccurred())
			})

			It("Returns an error", func() {
				err := estimator.Train(trainingSet)
				Ω(err).Should(BeAssignableToTypeOf(gdeErrors.NonFloatFeaturesError{}))
			})
		})

		Context("Given a dataset with a non-float target", func() {
			BeforeEach(func() {
				columnTypes, err := columntype.StringsToColumnTypes([]string{"x", "1.0"})
				Ω(err).ShouldNot(HaveOccurred())
		BeforeEach(func() {
			trueParameters = []float64{2, -3, 4}

			var err error
			estimator, err = gradientdescentestimator.NewGradientDescentParameterEstimator(
				0.001,
				0.000005,
				1000,
				gradientdescentestimator.LinearModelLeastSquaresLossGradient,
			)
			Ω(err).ShouldNot(HaveOccurred())

			columnTypes, err := columntype.StringsToColumnTypes([]string{"1.0", "1.0", "1.0"})
			Ω(err).ShouldNot(HaveOccurred())

			trainingSet := dataset.NewDataset([]int{0, 1}, []int{2}, columnTypes)
			for i := 0; i < 20; i++ {
				for j := 0; j < 20; j++ {
					x0 := -1.9 + 0.2*float64(i)
					x1 := -1.9 + 0.2*float64(j)
					y := trueParameters[0]*x0 + trueParameters[1]*x1 + trueParameters[2] + 0.1*rand.NormFloat64()

					err = trainingSet.AddRowFromStrings([]string{
						fmt.Sprintf("%.10f", x0),
						fmt.Sprintf("%.10f", x1),
						fmt.Sprintf("%.10f", y),
					})
					Ω(err).ShouldNot(HaveOccurred())
				}
			}