// Tags each row with the index of its nearest centroid. The old tags are used // for optimization. func tag(vecs, means [][]float64, oldTags []int) []int { if len(means) == 0 { panic("Cannot tag on 0 centroids.") } // Create a distance matrix of means from one another. meansd := make([][]float64, len(means)) for i := range meansd { meansd[i] = make([]float64, len(means)) for j := range means { meansd[i][j] = vectors.L2(means[i], means[j]) } } tags := make([]int, len(vecs)) // Go over vectors. for i := range vecs { // Find nearest centroid. tags[i] = oldTags[i] d := vectors.L2(means[oldTags[i]], vecs[i]) for j := 0; j < len(means); j++ { // Use triangle inequality to skip means that are too distant. if j == tags[i] || meansd[j][tags[i]] >= 2*d { continue } dj := vectors.L2(means[j], vecs[i]) if dj < d { d = dj tags[i] = j } } } return tags }
// Calculates the average squared-distance of elements from their assigned // means. func MeanSquaredError(vecs, means [][]float64, tags []int) float64 { if len(tags) != len(vecs) { panic(fmt.Sprintf("Non-matching lengths of matrix and tags: %d, %d", len(vecs), len(tags))) } if len(vecs) == 0 { return 0 } d := 0.0 for i := range tags { dist := vectors.L2(means[tags[i]], vecs[i]) d += dist * dist } return d / float64(len(vecs)) }
// Picks the initial means with the K-means++ algorithm. func initialMeans(vecs [][]float64, k int) [][]float64 { result := make([][]float64, k) perm := rand.Perm(len(vecs)) numTrials := 2 + int(math.Log(float64(k))) probs := make([]float64, len(vecs)) // Probability of each vector. nearest := make([]int, len(vecs)) // Index of nearest mean to each vector. distance := make([]float64, len(vecs)) // Distance to nearest mean. mdistance := make([][]float64, k) // Distance between means. for i := range mdistance { mdistance[i] = make([]float64, k) } // Pick each mean. for i := range result { result[i] = make([]float64, len(vecs[0])) // First mean is first vector. if i == 0 { copy(result[0], vecs[perm[0]]) for _, j := range perm { distance[j] = vectors.L2(vecs[j], result[0]) } continue } // Find next mean. bestCandidate := -1 bestImprovement := -math.MaxFloat64 for t := 0; t < numTrials; t++ { // Make a few attempts. sum := 0.0 for _, j := range perm { probs[j] = distance[j] * distance[j] sum += probs[j] } // Pick element with probability relative to d^2. r := rand.Float64() * sum newMean := 0 for r > probs[newMean] { r -= probs[newMean] newMean++ } copy(result[i], vecs[newMean]) // Update distances from new mean to other means. for j := range mdistance[:i] { mdistance[j][i] = vectors.L2(result[i], result[j]) mdistance[i][j] = mdistance[j][i] } // Check improvement. newImprovement := 0.0 for j := range vecs { if mdistance[i][nearest[j]] < 2*distance[j] { d := vectors.L2(vecs[j], result[i]) d = math.Min(distance[j], d) newImprovement += distance[j] - d } } if newImprovement > bestImprovement { bestCandidate = newMean bestImprovement = newImprovement } } copy(result[i], vecs[bestCandidate]) // Update distances. for j := range mdistance[:i] { // From new mean to other means. mdistance[j][i] = vectors.L2(result[i], result[j]) mdistance[i][j] = mdistance[j][i] } for j := range vecs { // From vecs to nearest means. if mdistance[i][nearest[j]] < 2*distance[j] { d := vectors.L2(vecs[j], result[i]) if d < distance[j] { distance[j] = math.Min(distance[j], d) nearest[j] = i } } } } return result }