Пример #1
0
/*
type Feature struct {
	id   string
	data []float64
}

// Len of the set
func (f Feature) Len() int {
	return len(f.data)
}

// Get the ith
func (f Feature) Get(i int) float32 {
	return f.data[i]
}

type Features []Feature

func (f Features) Len() int {
	return len(f)
}

func (f Features) Values(i int) []float64 {
	return []float64(f[i].data)
}

*/
// TestClusterSimple runs mean-shift clustering over a small hand-built set of
// 2-D points and logs the members of every resulting cluster. The only
// assertion is that clustering finishes without an error; the grouping itself
// is written to the test log for manual inspection.
func TestClusterSimple(t *testing.T) {
	t.Log("hey hey")

	points := [][]float64{
		// first group
		{1.0, 2.0}, {1.1, 1.2}, {1.3, 1.1},
		// second group
		{-1.0, 0.9}, {-0.8, 1.1}, {-1.1, 1.0},
		// remaining three points, intended as outliers
		{0.0, 0.0}, {1.0, -1.0}, {-1.0, -1.0},
	}

	// Wrap every raw vector in a Feature; all of them share the id "0".
	samples := make(Features, len(points))
	for idx, vec := range points {
		samples[idx] = Feature{id: "0", data: vec}
	}

	t.Log(points)

	kernel := meanshift.NewTruncGauss(.60, 3)
	clusterer := meanshift.New(samples, kernel, 0.1, 5)
	assert.Nil(t, clusterer.Cluster())

	// Dump each cluster: a header line followed by its member features.
	for _, center := range clusterer.Centers() {
		t.Log("new group")
		for _, member := range center.Members() {
			t.Log(samples[member])
		}
	}
}
Пример #2
0
// ClusterOverlap scores how strongly two articles overlap by clustering their
// vectors together.
//
// The features built from both articles are pooled and clustered with mean
// shift. A cluster counts as an overlap when it holds at least one feature
// from each article. Every overlapping cluster adds to the score based on the
// relevance of the features it holds and on the strength of the cluster
// itself (how well its members' vectors align with each other).
//
// main and related hold each article's vectors keyed by string (presumably
// the word); mainRelevance and relatedRelevance are passed to
// buildFeatureArray alongside the matching vector map.
//
// It returns the accumulated score and the number of overlapping clusters.
// (0, 0) is returned when either article's total relevance is below the
// minimum threshold or when clustering reports an error.
func ClusterOverlap(main, related map[string]word2vec.Vector, mainRelevance, relatedRelevance map[string]float32) (float32, int) {
	// Below this total relevance an article is treated as carrying no usable
	// signal. The same guard keeps the significance divisions further down
	// away from a (near) zero denominator.
	const minTotalRelevance = 0.0001

	// NOTE(review): the second return value of buildFeatureArray is discarded
	// here; confirm that it is safe to ignore.
	mainVecs, _ := buildFeatureArray(main, mainRelevance, mainArticleID)
	relatedVecs, _ := buildFeatureArray(related, relatedRelevance, relatedArticleID)

	// Pool both articles into one feature set: main first, related after.
	features := make(Features, len(mainVecs)+len(relatedVecs))
	var totalMainRel float32
	for i := range mainVecs {
		features[i] = mainVecs[i]
		totalMainRel += mainVecs[i].relevance
	}

	var totalRelatedRel float32
	for i := range relatedVecs {
		features[i+len(mainVecs)] = relatedVecs[i]
		totalRelatedRel += relatedVecs[i].relevance
	}

	// need to have some kind of total relevance
	if totalRelatedRel < minTotalRelevance || totalMainRel < minTotalRelevance {
		return 0.0, 0
	}

	// TODO: look into adaptive bandwidth stuff
	shifter := meanshift.NewTruncGauss(0.60, 2.6010)
	clusterer := meanshift.New(features, shifter, 0.01, 10)
	if err := clusterer.Cluster(); err != nil {
		fmt.Println("err:", err)
		return 0.0, 0
	}

	numOverlaps := 0
	var score float32

	for _, c := range clusterer.Centers() {
		// Fetch the member list once per cluster instead of on every pass of
		// the loops below.
		members := c.Members()

		// Per-article member count and summed relevance within this cluster.
		numMains := 0
		numRels := 0
		var mainQuality float32
		var relatedQuality float32

		for _, i := range members {
			f := features[i]

			if f.which == mainArticleID {
				numMains++
				mainQuality += f.relevance
			} else {
				numRels++
				relatedQuality += f.relevance
			}
		}

		// Only clusters holding at least one feature of each article count.
		if numMains == 0 || numRels == 0 {
			continue
		}
		numOverlaps++

		// How much of each story this cluster "captures". Both totals are at
		// least minTotalRelevance thanks to the guard above, so no further
		// check on the denominators is needed here.
		mainSignificance := mainQuality / totalMainRel
		relSignificance := relatedQuality / totalRelatedRel

		// Cluster strength: sum of the dot products over every ordered pair
		// of distinct members. The original author expects the normalized
		// value to lie in [0:1].
		var clusterStrength float32
		for _, i := range members {
			f := features[i]
			var dsum float32
			for _, j := range members {
				if i == j {
					continue
				}
				dsum += float32(dotVecs(f.data, features[j].data))
			}
			clusterStrength += dsum
		}

		// Invariant check: a negative sum would make the square root below
		// meaningless.
		if clusterStrength < 0 {
			panic("oh nose!!! expected to only have pos vals")
		}

		// denom is the number of pairs visited: n*n minus the n skipped
		// self-pairs. An overlapping cluster has at least two members, so
		// denom is never zero.
		n := len(members)
		denom := float32(n*n - n)
		clusterStrength = float32(math.Sqrt(float64(clusterStrength / denom)))

		// link from a => b = %rel(a) * avg rel(b)
		relMain := relSignificance * (mainQuality / float32(numMains))
		mainRel := mainSignificance * (relatedQuality / float32(numRels))
		score += (relMain + mainRel) * clusterStrength
	}

	return score, numOverlaps
}