Пример #1
0
// TestComputeInstanceDerivative runs MaxEntComputeInstanceDerivative on three
// instances against a fixed 2x3 weight matrix and compares every entry of the
// resulting derivative matrix with a reference value.
func TestComputeInstanceDerivative(t *testing.T) {
	const tolerance = 0.0001

	weights := util.NewMatrix(2, 3)
	weights.GetValues(0).SetValues([]float64{1, 2, 3})
	weights.GetValues(1).SetValues([]float64{3, 4, 5})

	// The derivative matrix and the instance are reused across all cases; the
	// test never resets the derivative matrix between calls.
	de := util.NewMatrix(2, 3)
	instance := data.Instance{
		Features: util.NewVector(3),
		Output:   &data.InstanceOutput{},
	}

	cases := []struct {
		features []float64
		label    int
		want     [2][3]float64
	}{
		{
			features: []float64{1, 0.3, 0.4},
			label:    0,
			want: [2][3]float64{
				{0.0322, 0.0096, 0.0128},
				{0.9658, 0.2897, 0.3863},
			},
		},
		{
			features: []float64{1, 0.6, 0.7},
			label:    1,
			want: [2][3]float64{
				{-0.9900, -0.5940, -0.6930},
				{0.9899, 0.5939, 0.6929},
			},
		},
		{
			features: []float64{1, 0.4, 0.2},
			label:    2,
			want: [2][3]float64{
				{0.0390, 0.0156, 0.0078},
				{-0.0425, -0.0170, -0.0085},
			},
		},
	}

	for _, c := range cases {
		instance.Features.SetValues(c.features)
		instance.Output.Label = c.label
		MaxEntComputeInstanceDerivative(weights, &instance, de)

		// Check the derivative row by row, column by column.
		for row, wantRow := range c.want {
			for col, want := range wantRow {
				util.ExpectNear(t, want, de.Get(row, col), tolerance)
			}
		}
	}
}
Пример #2
0
// Predict computes the label distribution of instance under the classifier's
// weights and returns it together with the most probable label.
//
// When instance.NamedFeatures is set, instance.Features is overwritten with a
// sparse vector built through classifier.FeatureDictionary (index 0 always
// holds the constant 1). If the dictionary is missing in that case, an empty
// InstanceOutput is returned.
//
// Label 0 is the reference class with an implicit score of 0; row i-1 of
// classifier.Weights scores label i. Output.LabelString is filled in only when
// classifier.LabelDictionary is available.
func (classifier *MaxEntClassifier) Predict(instance *data.Instance) data.InstanceOutput {
	output := data.InstanceOutput{}

	// Convert NamedFeatures into Features when they are in use.
	if instance.NamedFeatures != nil {
		if classifier.FeatureDictionary == nil {
			return output
		}
		instance.Features = util.NewSparseVector()
		// Feature 0 is always the constant 1.
		instance.Features.Set(0, 1.0)

		for k, v := range instance.NamedFeatures {
			id := classifier.FeatureDictionary.TranslateIdFromName(k)
			instance.Features.Set(id, v)
		}
	}

	// Collect the raw (log-space) score of every label first. scores[0] stays
	// 0 for the reference label.
	scores := make([]float64, classifier.NumLabels)
	mostPossibleLabel := 0
	maxScore := float64(0)
	for iLabel := 1; iLabel < classifier.NumLabels; iLabel++ {
		sum := float64(0)
		for _, k := range classifier.Weights.GetValues(iLabel - 1).Keys() {
			sum += classifier.Weights.Get(iLabel-1, k) * instance.Features.Get(k)
		}
		scores[iLabel] = sum
		// Comparing raw scores selects the same label as comparing their
		// exponentials, because exp is monotonic. Ties keep the lower label.
		if sum > maxScore {
			mostPossibleLabel = iLabel
			maxScore = sum
		}
	}

	// Subtract the largest score before exponentiating so that math.Exp can
	// never overflow to +Inf (which would turn the normalised distribution
	// into NaN). The shift cancels out in the normalisation below.
	output.LabelDistribution = util.NewVector(classifier.NumLabels)
	z := float64(0)
	for iLabel, score := range scores {
		exp := math.Exp(score - maxScore)
		z += exp
		output.LabelDistribution.Set(iLabel, exp)
	}
	output.LabelDistribution.Scale(1 / z)
	output.Label = mostPossibleLabel

	if classifier.LabelDictionary != nil {
		output.LabelString = classifier.LabelDictionary.GetNameFromId(output.Label)
	}

	return output
}
Пример #3
0
// TrainOnOneInstance feeds a single training sample to the online classifier.
//
// The sample is first scored with the current weights and handed to the
// evaluator, then its derivative is added to the running batch derivative.
// Once BatchSize samples have been accumulated, the weights are updated and
// the batch state is reset. Instances without an Output are ignored (after any
// named-feature conversion has taken place).
func (classifier *OnlineSGDClassifier) TrainOnOneInstance(instance *data.Instance) {
	// Turn the sample's named features into a sparse vector, adding them to
	// the feature dictionary.
	if instance.NamedFeatures != nil {
		instance.Features = nil
		data.ConvertNamedFeatures(instance, classifier.featureDictionary)
	}

	// An instance without an output carries no label to learn from.
	if instance.Output == nil {
		return
	}

	// Translate the label string of the sample into its integer ID.
	if labelName := instance.Output.LabelString; labelName != "" {
		instance.Output.Label = classifier.labelDictionary.GetIdFromName(labelName)
	}

	// Predict with the current weights and record the result.
	prediction := classifier.Predict(instance)
	classifier.evaluator.Evaluate(*instance.Output, prediction)

	// Accumulate this instance's derivative into the batch derivative.
	classifier.instanceDerivative.Clear()
	supervised.MaxEntComputeInstanceDerivative(
		classifier.weights, instance, classifier.instanceDerivative)
	classifier.derivative.Increment(classifier.instanceDerivative, 1.0)
	classifier.instancesProcessed++

	// Keep accumulating until a full batch has been seen.
	if classifier.instancesProcessed < classifier.options.BatchSize {
		return
	}

	// NOTE(review): both scalings below divide by NumInstancesForEvaluation
	// rather than BatchSize; presumably intentional, confirm with the author.
	evaluationInstances := float64(classifier.options.NumInstancesForEvaluation)

	// Add the regularization term.
	regularization := optimizer.ComputeRegularization(
		classifier.weights, classifier.options.Optimizer)
	classifier.derivative.Increment(regularization, 1.0/evaluationInstances)

	// Update the weights according to the learning rate.
	step := -1 * classifier.options.Optimizer.LearningRate / evaluationInstances
	classifier.weights.Increment(classifier.derivative, step)

	// Reset the batch state.
	classifier.derivative.Clear()
	classifier.instancesProcessed = 0
}
Пример #4
0
// LoadLibSVMDataset reads the libsvm-format file at path into an in-memory
// dataset.
//
// Every non-empty line has the form "label index:value index:value ...",
// separated by single spaces. Labels are mapped to integer IDs in order of
// first appearance. With usingSparseRepresentation each instance stores its
// features in NamedFeatures, keyed by the index text exactly as written;
// otherwise a dense vector of size maxIndex+1 is used, whose element 0 holds
// the constant term 1.
//
// The process is terminated through log.Fatal/log.Fatalf when the file cannot
// be read, when a feature token is malformed, or when the largest feature
// index is smaller than 2.
func LoadLibSVMDataset(path string, usingSparseRepresentation bool) data.Dataset {
	log.Print("载入libsvm格式文件", path)

	content, err := ioutil.ReadFile(path)
	if err != nil {
		log.Fatalf("无法打开文件\"%v\",错误提示:%v\n", path, err)
	}
	// Normalise Windows line endings so that a trailing "\r" does not end up
	// glued to the last value of each line.
	text := strings.Replace(string(content), "\r\n", "\n", -1)
	lines := strings.Split(text, "\n")

	// First pass: collect the label set and the largest feature index, and
	// validate every feature token so the second pass cannot fail.
	maxFeature := 0
	labels := make(map[string]int)
	for lineIndex, l := range lines {
		if l == "" {
			continue
		}
		fields := strings.Split(l, " ")

		if _, ok := labels[fields[0]]; !ok {
			// IDs are handed out in order of first appearance.
			labels[fields[0]] = len(labels)
		}

		for _, field := range fields[1:] {
			if field == "" {
				continue
			}
			_, fid, _ := mustParseLibSVMFeature(field, lineIndex+1)
			if fid > maxFeature {
				maxFeature = fid
			}
		}
	}

	if maxFeature < 2 {
		log.Fatal("文件输入格式不合法")
	}
	log.Printf("feature 数目 %d", maxFeature)
	log.Printf("label 数目 %d", len(labels))

	// Second pass: build one instance per non-empty line.
	set := data.NewInmemDataset()
	for lineIndex, l := range lines {
		if l == "" {
			continue
		}
		fields := strings.Split(l, " ")

		instance := new(data.Instance)
		instance.Output = &data.InstanceOutput{
			Label:       labels[fields[0]],
			LabelString: fields[0],
		}
		if usingSparseRepresentation {
			instance.NamedFeatures = make(map[string]float64)
		} else {
			instance.Features = util.NewVector(maxFeature + 1)
			// Constant term.
			instance.Features.Set(0, 1)
		}

		for _, field := range fields[1:] {
			if field == "" {
				continue
			}
			name, fid, value := mustParseLibSVMFeature(field, lineIndex+1)
			if usingSparseRepresentation {
				instance.NamedFeatures[name] = value
			} else {
				instance.Features.Set(fid, value)
			}
		}
		set.AddInstance(instance)
	}

	set.Finalize()

	log.Print("载入数据样本数目 ", set.NumInstances())

	return set
}

// mustParseLibSVMFeature parses one "index:value" token taken from line
// lineNumber (1-based). It returns the index text as written, the index as an
// integer and the value. The process is terminated through log.Fatalf when
// the ":" separator is missing, the index is not an integer >= 1, or the value
// is not a valid float.
func mustParseLibSVMFeature(token string, lineNumber int) (string, int, float64) {
	fs := strings.Split(token, ":")
	if len(fs) < 2 {
		log.Fatalf("文件输入格式不合法:第%d行无法解析特征\"%v\"\n", lineNumber, token)
	}
	fid, err := strconv.Atoi(fs[0])
	if err != nil || fid < 1 {
		log.Fatalf("文件输入格式不合法:第%d行无法解析特征\"%v\"\n", lineNumber, token)
	}
	value, err := strconv.ParseFloat(fs[1], 64)
	if err != nil {
		log.Fatalf("文件输入格式不合法:第%d行无法解析特征\"%v\"\n", lineNumber, token)
	}
	return fs[0], fid, value
}
Пример #5
0
// TestTrain trains max-entropy classifiers on four dense instances, writes the
// gradient-descent model to "test.mlf", loads it back and checks the label
// predicted for each training instance.
func TestTrain(t *testing.T) {
	samples := []struct {
		features []float64
		label    int
		want     string // expected predicted label, as passed to util.Expect
	}{
		{[]float64{1, 1, 1, 3}, 0, "0"},
		{[]float64{1, 3, 1, 5}, 0, "0"},
		{[]float64{1, 3, 4, 7}, 1, "1"},
		{[]float64{1, 2, 8, 6}, 1, "1"},
	}

	set := data.NewInmemDataset()
	instances := make([]*data.Instance, 0, len(samples))
	for _, sample := range samples {
		instance := new(data.Instance)
		instance.Features = util.NewVector(4)
		instance.Features.SetValues(sample.features)
		instance.Output = &data.InstanceOutput{Label: sample.label}
		set.AddInstance(instance)
		instances = append(instances, instance)
	}
	set.Finalize()

	gdTrainer := NewMaxEntClassifierTrainer(TrainerOptions{
		Optimizer: optimizer.OptimizerOptions{
			OptimizerName:         "gd",
			RegularizationScheme:  2,
			RegularizationFactor:  1,
			LearningRate:          0.1,
			ConvergingDeltaWeight: 1e-6,
			ConvergingSteps:       3,
			MaxIterations:         0,
			GDBatchSize:           0, // full batch
		},
	})

	lbfgsTrainer := NewMaxEntClassifierTrainer(TrainerOptions{
		Optimizer: optimizer.OptimizerOptions{
			OptimizerName:         "lbfgs",
			RegularizationScheme:  2,
			RegularizationFactor:  1,
			LearningRate:          1,
			ConvergingDeltaWeight: 1e-6,
			ConvergingSteps:       3,
			MaxIterations:         0,
		},
	})
	// The L-BFGS model is trained but its result is not inspected.
	lbfgsTrainer.Train(set)

	// Round-trip the gradient-descent model through a file before predicting.
	trained := gdTrainer.Train(set)
	trained.Write("test.mlf")
	model := LoadModel("test.mlf")

	for i, sample := range samples {
		util.Expect(t, sample.want, model.Predict(instances[i]).Label)
	}
}
Пример #6
0
// TestTrainWithNamedFeatures mirrors TestTrain but describes every instance
// through NamedFeatures instead of a dense feature vector. The
// gradient-descent model is written to "test.mlf", loaded back and used to
// predict the label of each training instance.
func TestTrainWithNamedFeatures(t *testing.T) {
	samples := []struct {
		features map[string]float64
		label    int
		want     string // expected predicted label, as passed to util.Expect
	}{
		{map[string]float64{"1": 1, "2": 1, "3": 1, "4": 3}, 0, "0"},
		{map[string]float64{"1": 1, "2": 3, "3": 1, "4": 5}, 0, "0"},
		{map[string]float64{"1": 1, "2": 3, "3": 4, "4": 7}, 1, "1"},
		{map[string]float64{"1": 1, "2": 2, "3": 8, "4": 6}, 1, "1"},
	}

	set := data.NewInmemDataset()
	instances := make([]*data.Instance, 0, len(samples))
	for _, sample := range samples {
		instance := new(data.Instance)
		instance.NamedFeatures = sample.features
		instance.Output = &data.InstanceOutput{Label: sample.label}
		set.AddInstance(instance)
		instances = append(instances, instance)
	}
	set.Finalize()

	gdTrainer := NewMaxEntClassifierTrainer(TrainerOptions{
		Optimizer: optimizer.OptimizerOptions{
			OptimizerName:         "gd",
			RegularizationScheme:  2,
			RegularizationFactor:  1,
			LearningRate:          0.1,
			ConvergingDeltaWeight: 1e-6,
			ConvergingSteps:       3,
			MaxIterations:         0,
			GDBatchSize:           0, // full batch
		},
	})

	lbfgsTrainer := NewMaxEntClassifierTrainer(TrainerOptions{
		Optimizer: optimizer.OptimizerOptions{
			OptimizerName:         "lbfgs",
			RegularizationScheme:  2,
			RegularizationFactor:  1,
			LearningRate:          1,
			ConvergingDeltaWeight: 1e-6,
			ConvergingSteps:       3,
			MaxIterations:         0,
		},
	})
	// The L-BFGS model is trained but its result is not inspected.
	lbfgsTrainer.Train(set)

	// Round-trip the gradient-descent model through a file before predicting.
	trained := gdTrainer.Train(set)
	trained.Write("test.mlf")
	model := LoadModel("test.mlf")

	for i, sample := range samples {
		util.Expect(t, sample.want, model.Predict(instances[i]).Label)
	}
}