// GetWordTopicDist writes the per-topic word distributions of a trained model
// to wt. For every global and local topic it prints a topic header followed by
// the highest-probability words, their phi values, and their sampled counts.
func GetWordTopicDist(m *MGLDA, vocabulary []string, wt *bufio.Writer) {
	zGlCount := make([]int, m.GlobalK)
	zLocCount := make([]int, m.LocalK)
	wordGlCount := []map[int]int{}
	wordLocCount := []map[int]int{}
	for i := 0; i < m.GlobalK; i++ {
		wordGlCount = append(wordGlCount, map[int]int{})
	}
	for i := 0; i < m.LocalK; i++ {
		wordLocCount = append(wordLocCount, map[int]int{})
	}

	glog.Info("Get words distribution")
	// Count how often each word was assigned to each global/local topic.
	for d, doc := range *m.Docs {
		for s, sent := range doc.Sentenses {
			for w, wd := range sent.Words {
				r := m.Rdsn[d][s][w]
				z := m.Zdsn[d][s][w]
				if r == globalTopic {
					zGlCount[z] += 1
					wordGlCount[z][wd] += 1
				} else {
					zLocCount[z] += 1
					wordLocCount[z][wd] += 1
				}
			}
		}
	}
	glog.Info("Done dist")

	phiGl, phiLoc := m.WordDist()

	for i := 0; i < m.GlobalK; i++ {
		header := fmt.Sprintf("-- global topic: %d (%d words)\n", i, zGlCount[i])
		wt.WriteString(header)
		glog.Info(header)

		// Sort the topic's word probabilities and print the top words.
		rows := phiGl.RowCopy(i)
		idx := []int{}
		for j := 0; j < len(rows); j++ {
			idx = append(idx, j)
		}
		floats.Argsort(rows, idx)
		for j := len(idx) - 1; j > len(idx)-topicLimit; j-- {
			w := idx[j]
			tp := fmt.Sprintf("%s: %f (%d)\n", vocabulary[w], phiGl.Get(i, w), wordGlCount[i][w])
			wt.WriteString(tp)
			glog.Info(tp)
		}
	}

	for i := 0; i < m.LocalK; i++ {
		header := fmt.Sprintf("-- local topic: %d (%d words)\n", i, zLocCount[i])
		wt.WriteString(header)
		glog.Info(header)

		rows := phiLoc.RowCopy(i)
		idx := []int{}
		for j := 0; j < len(rows); j++ {
			idx = append(idx, j)
		}
		floats.Argsort(rows, idx)
		for j := len(idx) - 1; j > len(idx)-topicLimit; j-- {
			w := idx[j]
			tp := fmt.Sprintf("%s: %f (%d)\n", vocabulary[w], phiLoc.Get(i, w), wordLocCount[i][w])
			wt.WriteString(tp)
			glog.Info(tp)
		}
	}
}
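// Usage sketch (illustrative only): dump the topic/word report of a trained
// model to a file. The model value and vocabulary are assumed to come from the
// caller's own training pipeline; the helper name writeTopicReport and the
// output path are hypothetical. Needs the os package in addition to bufio.
func writeTopicReport(m *MGLDA, vocabulary []string, path string) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()

	wt := bufio.NewWriter(f)
	GetWordTopicDist(m, vocabulary, wt)
	return wt.Flush() // GetWordTopicDist does not flush the writer itself
}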
/*
Free up some synapses in this segment. We always free up inactive synapses
(lowest permanence freed up first) before we start to free up active ones.

param numToFree number of synapses to free up
param inactiveSynapseIndices list of the inactive synapse indices
*/
func (s *Segment) freeNSynapses(numToFree int, inactiveSynapseIndices []int) {
	// Make sure numToFree isn't larger than the total number of syns we have
	if numToFree > len(s.syns) {
		panic("Number to free cannot be larger than existing synapses.")
	}

	if s.tp.params.Verbosity >= 5 {
		fmt.Println("freeNSynapses with numToFree=", numToFree)
		fmt.Println("inactiveSynapseIndices= ", inactiveSynapseIndices)
	}

	var candidates []int

	// Remove the lowest perm inactive synapses first
	if len(inactiveSynapseIndices) > 0 {
		perms := make([]float64, len(inactiveSynapseIndices))
		for idx := range perms {
			// Look up the permanence of the synapse the index refers to,
			// not the synapse at the list position.
			perms[idx] = s.syns[inactiveSynapseIndices[idx]].Permanence
		}
		// Argsort requires an index slice of the same length as perms.
		indexes := make([]int, len(perms))
		floats.Argsort(perms, indexes) // ascending permanence; indexes holds original positions
		cSize := mathutil.Min(numToFree, len(perms))
		candidates = make([]int, cSize)
		for i := 0; i < cSize; i++ {
			candidates[i] = inactiveSynapseIndices[indexes[i]]
		}
	}

	// Do we need more? if so, remove the lowest perm active synapses too
	var activeSynIndices []int
	if len(candidates) < numToFree {
		for i := 0; i < len(s.syns); i++ {
			if !utils.ContainsInt(i, inactiveSynapseIndices) {
				activeSynIndices = append(activeSynIndices, i)
			}
		}
		perms := make([]float64, len(activeSynIndices))
		for i := range perms {
			perms[i] = s.syns[activeSynIndices[i]].Permanence
		}
		indexes := make([]int, len(perms))
		floats.Argsort(perms, indexes)
		moreToFree := numToFree - len(candidates)
		for i := 0; i < moreToFree; i++ {
			candidates = append(candidates, activeSynIndices[indexes[i]])
		}
	}

	if s.tp.params.Verbosity >= 4 {
		fmt.Printf("Deleting %v synapses from segment to make room for new ones: %v \n",
			len(candidates), candidates)
		fmt.Println("Before:", s.ToString())
	}

	// Delete candidate syns by copying undeleted ones to a new slice
	var newSyns []Synapse
	for idx, val := range s.syns {
		if !utils.ContainsInt(idx, candidates) {
			newSyns = append(newSyns, val)
		}
	}
	s.syns = newSyns

	if s.tp.params.Verbosity >= 4 {
		fmt.Println("After:", s.ToString())
	}
}
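// A minimal standalone sketch of the selection rule used by freeNSynapses:
// given permanence values for a set of candidate synapses, return the indices
// of the n lowest ones via floats.Argsort. The helper name is hypothetical and
// nothing calls it; it only illustrates the Argsort idiom used above.
func lowestPermanenceIndices(perms []float64, n int) []int {
	vals := append([]float64(nil), perms...) // Argsort sorts in place, so work on a copy
	idx := make([]int, len(vals))
	floats.Argsort(vals, idx) // ascending; idx[i] is the original position of vals[i]
	if n > len(idx) {
		n = len(idx)
	}
	return idx[:n]
}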
// Inference performs one collapsed Gibbs sampling sweep over every word in
// every document: for each word it removes the current window/topic assignment
// from the count matrices, computes the full conditional over all
// (window, global/local, topic) combinations, samples a new assignment, and
// adds it back to the counts.
func (m *MGLDA) Inference() {
	for d, doc := range *m.Docs {
		for s, sent := range doc.Sentenses {
			for w, wd := range sent.Words {
				v := m.Vdsn[d][s][w]
				r := m.Rdsn[d][s][w]
				z := m.Zdsn[d][s][w]

				// Remove the word's current assignment from the counts.
				if r == globalTopic {
					m.Nglzw.Set(z, wd, m.Nglzw.Get(z, wd)-1)
					m.Nglz.Set(z, 0, m.Nglz.Get(z, 0)-1)
					m.Ndvgl[d][s+v] -= 1
					m.Ndglz.Set(d, z, m.Ndglz.Get(d, z)-1)
					m.Ndgl.Set(d, 0, m.Ndgl.Get(d, 0)-1)
				} else {
					m.Nloczw.Set(z, wd, m.Nloczw.Get(z, wd)-1)
					m.Nlocz.Set(z, 0, m.Nlocz.Get(z, 0)-1)
					m.Ndvloc[d][s+v] -= 1
					m.Ndvlocz[d][s+v][z] -= 1
				}
				m.Ndsv[d][s][v] -= 1
				m.Nds[d][s] -= 1
				m.Ndv[d][s+v] -= 1

				// Build the unnormalized full conditional over every
				// (window vt, global/local, topic zt) combination.
				pvrz := []float64{}
				newVs := []int{}
				newRs := []string{}
				newZs := []int{}
				for vt := 0; vt < m.T; vt++ {
					for zt := 0; zt < m.GlobalK; zt++ {
						newVs = append(newVs, vt)
						newRs = append(newRs, globalTopic)
						newZs = append(newZs, zt)
						term1 := (m.Nglzw.Get(zt, wd) + m.GlobalBeta) /
							(m.Nglz.Get(zt, 0) + float64(m.W)*m.GlobalBeta)
						term2 := (m.Ndsv[d][s][vt] + m.Gamma) /
							(m.Nds[d][s] + float64(m.T)*m.Gamma)
						term3 := (m.Ndvgl[d][s+vt] + m.GlobalAlphaMix) /
							(m.Ndv[d][s+vt] + m.GlobalAlphaMix + m.LocalAlphaMix)
						term4 := (m.Ndglz.Get(d, zt) + m.GlobalAlpha) /
							(m.Ndgl.Get(d, 0) + float64(m.GlobalK)*m.GlobalAlpha)
						pvrz = append(pvrz, term1*term2*term3*term4)
					}
					for zt := 0; zt < m.LocalK; zt++ {
						newVs = append(newVs, vt)
						newRs = append(newRs, localTopic)
						newZs = append(newZs, zt)
						term1 := (m.Nloczw.Get(zt, wd) + m.LocalBeta) /
							(m.Nlocz.Get(zt, 0) + float64(m.W)*m.LocalBeta)
						term2 := (m.Ndsv[d][s][vt] + m.Gamma) /
							(m.Nds[d][s] + float64(m.T)*m.Gamma)
						term3 := (m.Ndvloc[d][s+vt] + m.LocalAlphaMix) /
							(m.Ndv[d][s+vt] + m.GlobalAlphaMix + m.LocalAlphaMix)
						term4 := (m.Ndvlocz[d][s+vt][zt] + m.LocalAlpha) /
							(m.Ndvloc[d][s+vt] + float64(m.LocalK)*m.LocalAlpha)
						pvrz = append(pvrz, term1*term2*term3*term4)
					}
				}

				// Sample from the multinomial distribution: draw 100 times
				// from the normalized weights and keep the most frequent index.
				origIdx := []int{}
				var sum float64
				for j, item := range pvrz {
					sum += item
					origIdx = append(origIdx, j)
				}
				floats.Argsort(pvrz, origIdx) // ascending; origIdx tracks original positions
				var randIdx int
				idxCount := map[int]int{}
				for i := 0; i < 100; i++ {
					var partialSum float64
					threshold := rand.Float64()
					for j := len(pvrz) - 1; j >= 0; j-- {
						partialSum += pvrz[j] / sum
						if partialSum >= threshold {
							idxCount[origIdx[j]] += 1
							break
						}
					}
				}
				var maxCount int
				for idx, cnt := range idxCount {
					if cnt > maxCount {
						maxCount = cnt
						randIdx = idx
					}
				}
				newV := newVs[randIdx]
				newR := newRs[randIdx]
				newZ := newZs[randIdx]

				// Add the new assignment back to the counts.
				if newR == globalTopic {
					m.Nglzw.Set(newZ, wd, m.Nglzw.Get(newZ, wd)+1)
					m.Nglz.Set(newZ, 0, m.Nglz.Get(newZ, 0)+1)
					m.Ndvgl[d][s+newV] += 1
					m.Ndglz.Set(d, newZ, m.Ndglz.Get(d, newZ)+1)
					m.Ndgl.Set(d, 0, m.Ndgl.Get(d, 0)+1)
				} else {
					m.Nloczw.Set(newZ, wd, m.Nloczw.Get(newZ, wd)+1)
					m.Nlocz.Set(newZ, 0, m.Nlocz.Get(newZ, 0)+1)
					m.Ndvloc[d][s+newV] += 1
					m.Ndvlocz[d][s+newV][newZ] += 1
				}
				m.Ndsv[d][s][newV] += 1
				m.Nds[d][s] += 1
				m.Ndv[d][s+newV] += 1
				m.Vdsn[d][s][w] = newV
				m.Rdsn[d][s][w] = newR
				m.Zdsn[d][s][w] = newZ
			}
		}
	}
}
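// For reference, a minimal single-draw categorical sampler over unnormalized
// weights. Inference above instead draws 100 samples and keeps the most
// frequent index; this helper is only an illustrative sketch with a
// hypothetical name and is not called anywhere in the model.
func sampleIndex(weights []float64) int {
	var sum float64
	for _, w := range weights {
		sum += w
	}
	threshold := rand.Float64() * sum
	var partial float64
	for i, w := range weights {
		partial += w
		if partial >= threshold {
			return i
		}
	}
	return len(weights) - 1 // guard against floating-point round-off
}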