Example #1
0
// tag assigns each row of vecs the index of its nearest centroid in means and
// returns the assignments, one per row of vecs.
//
// oldTags is purely an optimization hint: when oldTags[i] is a valid index
// into means, the search for row i starts from that centroid, which lets the
// triangle-inequality test below skip most of the other centroids. A nil or
// short oldTags, or an entry that is out of range for means, is tolerated and
// the search for that row simply starts from centroid 0.
//
// Panics if means is empty.
func tag(vecs, means [][]float64, oldTags []int) []int {
	if len(means) == 0 {
		panic("Cannot tag on 0 centroids.")
	}

	// Pairwise distances between centroids. Each pair is measured once and
	// mirrored; this assumes vectors.L2 is symmetric, as a distance should be.
	// The diagonal stays at zero and is never consulted.
	meansd := make([][]float64, len(means))
	for i := range meansd {
		meansd[i] = make([]float64, len(means))
	}
	for i := range means {
		for j := i + 1; j < len(means); j++ {
			d := vectors.L2(means[i], means[j])
			meansd[i][j] = d
			meansd[j][i] = d
		}
	}

	tags := make([]int, len(vecs))
	for i, vec := range vecs {
		// Start from the previous assignment when it is usable.
		best := 0
		if i < len(oldTags) && oldTags[i] >= 0 && oldTags[i] < len(means) {
			best = oldTags[i]
		}
		d := vectors.L2(means[best], vec)

		for j := range means {
			// Triangle inequality: if centroid j is at least 2*d away from
			// the current best centroid, it is at least d away from vec and
			// cannot be strictly nearer, so its distance need not be computed.
			if j == best || meansd[j][best] >= 2*d {
				continue
			}
			if dj := vectors.L2(means[j], vec); dj < d {
				d, best = dj, j
			}
		}
		tags[i] = best
	}

	return tags
}
Example #2
0
// MeanSquaredError returns the mean, over all rows of vecs, of the squared
// distance between a row and the mean its tag points at. Distances are
// measured with vectors.L2. An empty vecs yields 0.
//
// Panics if tags and vecs differ in length.
func MeanSquaredError(vecs, means [][]float64, tags []int) float64 {
	n := len(vecs)
	if len(tags) != n {
		panic(fmt.Sprintf("Non-matching lengths of matrix and tags: %d, %d",
			n, len(tags)))
	}
	if n == 0 {
		return 0
	}

	total := 0.0
	for i, t := range tags {
		l := vectors.L2(means[t], vecs[i])
		total += l * l
	}

	return total / float64(n)
}
Example #3
0
// initialMeans picks k starting means from the rows of vecs using K-means++
// seeding and returns them as freshly allocated copies (the rows of vecs are
// never aliased).
//
// The first mean is a uniformly random row. Every later mean is drawn with
// probability proportional to the squared distance of each row from its
// nearest already-chosen mean. Several candidates are drawn per mean
// (2 + int(ln k) of them) and the one that gives the largest total reduction
// in row-to-nearest-mean distance is kept.
//
// vecs must be non-empty when k > 0.
func initialMeans(vecs [][]float64, k int) [][]float64 {
	result := make([][]float64, k)
	perm := rand.Perm(len(vecs))
	numTrials := 2 + int(math.Log(float64(k)))

	probs := make([]float64, len(vecs))    // Sampling weight of each vector.
	nearest := make([]int, len(vecs))      // Index of nearest mean to each vector.
	distance := make([]float64, len(vecs)) // Distance to nearest mean.
	mdistance := make([][]float64, k)      // Distance between means.
	for i := range mdistance {
		mdistance[i] = make([]float64, k)
	}

	// Pick each mean.
	for i := range result {
		result[i] = make([]float64, len(vecs[0]))

		// First mean is a random vector; every row's nearest mean is mean 0,
		// which is what the zero-initialized nearest slice already says.
		if i == 0 {
			copy(result[0], vecs[perm[0]])
			for _, j := range perm {
				distance[j] = vectors.L2(vecs[j], result[0])
			}
			continue
		}

		// Sampling weights are d^2. They depend only on distance, which does
		// not change while candidates are being tried, so compute them once
		// per mean rather than once per trial.
		sum := 0.0
		for _, j := range perm {
			probs[j] = distance[j] * distance[j]
			sum += probs[j]
		}

		// Find next mean.
		bestCandidate := -1
		bestImprovement := -math.MaxFloat64

		for t := 0; t < numTrials; t++ { // Make a few attempts.
			// Pick element with probability relative to d^2. The walk is
			// bounded by the last index: sum is accumulated in a different
			// order than it is subtracted here, so rounding could otherwise
			// leave r above the final weight and run past the end of probs.
			r := rand.Float64() * sum
			newMean := 0
			for newMean < len(probs)-1 && r > probs[newMean] {
				r -= probs[newMean]
				newMean++
			}
			copy(result[i], vecs[newMean])

			// Update distances from the candidate to the existing means.
			for j := range mdistance[:i] {
				mdistance[j][i] = vectors.L2(result[i], result[j])
				mdistance[i][j] = mdistance[j][i]
			}

			// Check improvement. By the triangle inequality a row can only
			// get closer to the candidate than to its current nearest mean
			// if the two means are less than 2*distance[j] apart.
			newImprovement := 0.0
			for j := range vecs {
				if mdistance[i][nearest[j]] < 2*distance[j] {
					d := vectors.L2(vecs[j], result[i])
					d = math.Min(distance[j], d)
					newImprovement += distance[j] - d
				}
			}
			if newImprovement > bestImprovement {
				bestCandidate = newMean
				bestImprovement = newImprovement
			}
		}

		// Commit the winning candidate; result[i] currently holds whichever
		// candidate was tried last.
		copy(result[i], vecs[bestCandidate])

		// Update distances.
		for j := range mdistance[:i] { // From new mean to other means.
			mdistance[j][i] = vectors.L2(result[i], result[j])
			mdistance[i][j] = mdistance[j][i]
		}
		for j := range vecs { // From vecs to nearest means.
			if mdistance[i][nearest[j]] < 2*distance[j] {
				if d := vectors.L2(vecs[j], result[i]); d < distance[j] {
					distance[j] = d
					nearest[j] = i
				}
			}
		}
	}

	return result
}