/* type Feature struct { id string data []float64 } // Len of the set func (f Feature) Len() int { return len(f.data) } // Get the ith func (f Feature) Get(i int) float32 { return f.data[i] } type Features []Feature func (f Features) Len() int { return len(f) } func (f Features) Values(i int) []float64 { return []float64(f[i].data) } */ func TestClusterSimple(t *testing.T) { t.Log("hey hey") // first 3 group 1 // 2nd 3 group 2 // 3rd 4 outliers vectors := [][]float64{{1.0, 2.0}, {1.1, 1.2}, {1.3, 1.1}, {-1.0, 0.9}, {-0.8, 1.1}, {-1.1, 1.0}, {0.0, 0.0}, {1.0, -1.0}, {-1.0, -1.0}} // build data features := make(Features, len(vectors)) for i := 0; i < len(vectors); i++ { features[i] = Feature{id: "0", data: vectors[i]} } t.Log(vectors) ms := meanshift.New(features, meanshift.NewTruncGauss(.60, 3), 0.1, 5) err := ms.Cluster() assert.Nil(t, err) for _, c := range ms.Centers() { t.Log("new group") for _, i := range c.Members() { f := features[i] t.Log(f) } } }
// ClusterOverlap for the two articles. // Care about the # of overlapping clusers and the "strength" of the overlapping clusters. // Calc strength based on relevance of the words in the cluster and the properties of the cluster its self. func ClusterOverlap(main, related map[string]word2vec.Vector, mainRelevance, relatedRelevance map[string]float32) (float32, int) { mainVecs, _ := buildFeatureArray(main, mainRelevance, mainArticleID) relatedVecs, _ := buildFeatureArray(related, relatedRelevance, relatedArticleID) allVecs := make(Features, len(mainVecs)+len(relatedVecs)) var totalMainRel float32 for i := range mainVecs { allVecs[i] = mainVecs[i] totalMainRel += mainVecs[i].relevance } var totalRelatedRel float32 for i := range relatedVecs { allVecs[i+len(mainVecs)] = relatedVecs[i] totalRelatedRel += relatedVecs[i].relevance } // need to have some kind of total relevance if totalRelatedRel < 0.0001 || totalMainRel < 0.0001 { return 0.0, 0 } // TODO: look into adaptive bandwidth stuff features := allVecs shifter := meanshift.NewTruncGauss(0.60, 2.6010) clusterer := meanshift.New(features, shifter, 0.01, 10) err := clusterer.Cluster() if err != nil { fmt.Println("err:", err) return 0.0, 0 } numOverlaps := 0 var score float32 for _, c := range clusterer.Centers() { numMains := 0 numRels := 0 var mainQuality float32 var relatedQuality float32 for _, i := range c.Members() { f := features[i] if f.which == mainArticleID { numMains++ mainQuality += f.relevance } else { numRels++ relatedQuality += f.relevance } } // found at least one of each article in the cluster if numMains > 0 && numRels > 0 { numOverlaps++ // how much of each story this cluster "captures" mainSignificance := mainQuality / totalMainRel relSignificance := relatedQuality / totalRelatedRel if totalRelatedRel < 0.001 || totalMainRel < 0.001 { panic("sig too low!!!") } // find cluster strength by doing a cluster covariance // clusterstrength is always [0:1] var clusterStrength float32 for _, i := range c.Members() { f := features[i] var dsum float32 for _, j := range c.Members() { if i == j { continue } ff := features[j] dsum += float32(dotVecs(f.data, ff.data)) } clusterStrength += dsum } if clusterStrength < 0 { panic("oh nose!!! expected to only have pos vals") } // denom is num itrs run, sub len c b/c we don't mul by ourselves // len(c) can't be one, so denom calc is OK denom := float32(len(c.Members())*len(c.Members()) - len(c.Members())) clusterStrength = float32(math.Sqrt(float64(clusterStrength / denom))) // link from a => b = %rel(a) * avg rel(b) relMain := relSignificance * (mainQuality / float32(numMains)) mainRel := mainSignificance * (relatedQuality / float32(numRels)) score += (relMain + mainRel) * clusterStrength } } return score, numOverlaps }