func (l *LdaModeler) Model(tokenss []util.Document) {
    // Per-repetition results; these are averaged at the end.
    theta_hat_ds_samples := make([]*[][]float64, l.repetitions)
    phi_hats_samples := make([]*[]util.TokenProbMap, l.repetitions)
    log_likelihood_samples := make([]float64, l.repetitions)
    perplexity_samples := make([]float64, l.repetitions)

    var initial_topicss [][]int

    if l.parallel && l.repetitions > 1 {
        // Run a single pass first so that later repetitions can start from its topic assignments.
        temp_res := l.model_one_pass(&tokenss, l.seed, initial_topicss)
        theta_hat_ds_samples[0] = temp_res.theta_hat_ds
        phi_hats_samples[0] = temp_res.phi_hats
        log_likelihood_samples[0] = temp_res.log_likelihood
        perplexity_samples[0] = temp_res.perplexity
        initial_topicss = l.last_model_token_topic_sample_assignments

        var wg sync.WaitGroup
        res_chan := make(chan *ModelResults, l.repetitions)
        for rep_num := 1; rep_num < l.repetitions; rep_num++ {
            wg.Add(1)
            go func(r_num int, init_tss [][]int) {
                // Each repetition gets its own modeler so that goroutines never share mutable state.
                m := NewModeler(l.ntopics, l.alpha, l.beta, l.iterations, l.repetitions, l.seed, l.parallel)
                res_chan <- m.model_one_pass(&tokenss, m.seed+r_num, init_tss)
                wg.Done()
            }(rep_num, initial_topicss)
        }
        wg.Wait()
        for rep_num := 1; rep_num < l.repetitions; rep_num++ {
            this_res := <-res_chan // get a result out of the channel (order does not matter for averaging)
            theta_hat_ds_samples[rep_num] = this_res.theta_hat_ds
            phi_hats_samples[rep_num] = this_res.phi_hats
            log_likelihood_samples[rep_num] = this_res.log_likelihood
            perplexity_samples[rep_num] = this_res.perplexity
        }
    } else {
        for rep_num := 0; rep_num < l.repetitions; rep_num++ {
            log.Printf("Starting repetition %d of %d.\n", rep_num+1, l.repetitions)
            temp_res := l.model_one_pass(&tokenss, l.seed+rep_num, initial_topicss)
            theta_hat_ds_samples[rep_num] = temp_res.theta_hat_ds
            phi_hats_samples[rep_num] = temp_res.phi_hats
            log_likelihood_samples[rep_num] = temp_res.log_likelihood
            perplexity_samples[rep_num] = temp_res.perplexity
            if initial_topicss == nil {
                initial_topicss = l.last_model_token_topic_sample_assignments
            }
        }
    }

    // Average the per-repetition samples.
    // theta_hat_ds: for each document, sum its topic mixture across repetitions, then L1-normalize.
    theta_hat_ds := make([][]float64, len(*theta_hat_ds_samples[0]))
    for doc_idx := 0; doc_idx < len(*theta_hat_ds_samples[0]); doc_idx++ { // for each document
        // Collect this document's theta_hat_d from every repetition
        // (roughly equivalent to zip(*theta_hat_ds_samples) in Python).
        theta_hat_dis := make([][]float64, len(theta_hat_ds_samples))
        for rep_idx := 0; rep_idx < len(theta_hat_ds_samples); rep_idx++ {
            this_rep := *theta_hat_ds_samples[rep_idx]
            theta_hat_dis[rep_idx] = this_rep[doc_idx]
        }
        acc_vector := make([]float64, len(theta_hat_dis[0]))
        for _, th_di := range theta_hat_dis {
            acc_vector, _ = util.SumVectors(acc_vector, th_di)
        }
        theta_hat_ds[doc_idx] = util.L1NormalizeVector(acc_vector)
    }

    // phi_hats: for each topic, sum and normalize its token-probability maps across repetitions.
    phi_hats := make([]util.TokenProbMap, len(*phi_hats_samples[0]))
    for topic_idx := 0; topic_idx < len(*phi_hats_samples[0]); topic_idx++ {
        // build the list of this topic's token-probability maps, one per repetition
        this_topic := make([]util.TokenProbMap, len(phi_hats_samples))
        for rep_idx := 0; rep_idx < len(phi_hats_samples); rep_idx++ {
            this_topic[rep_idx] = (*phi_hats_samples[rep_idx])[topic_idx]
        }
        phi_hats[topic_idx] = util.SumAndNormalizeListOfDicts(this_topic)
    }

    // simple arithmetic mean for log-likelihood & perplexity
    log_likelihood := util.SumFloat(log_likelihood_samples) / float64(len(log_likelihood_samples))
    perplexity := util.SumFloat(perplexity_samples) / float64(len(perplexity_samples))

    l.last_model_document_topic_probability_assignments = theta_hat_ds
    l.last_model_topic_token_probability_assignments = phi_hats
    l.last_model_token_log_likelihood_given_topic_model = log_likelihood
    l.last_model_sample_token_perplexity_given_topic_model = perplexity
    log.Println("LDA - done.")
}
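// The averaging above reduces, per document, to an element-wise sum of the
// per-repetition topic mixtures followed by an L1 normalization, which is what
// util.SumVectors and util.L1NormalizeVector accomplish together. The helper
// below is a minimal, self-contained sketch of that step for illustration only;
// it is not called by the modeler, and its name is hypothetical.
func averageTopicMixturesSketch(perRepMixtures [][]float64) []float64 {
    if len(perRepMixtures) == 0 {
        return nil
    }
    acc := make([]float64, len(perRepMixtures[0]))
    for _, mixture := range perRepMixtures {
        for j, p := range mixture { // element-wise sum across repetitions
            acc[j] += p
        }
    }
    total := 0.0
    for _, v := range acc {
        total += v
    }
    if total == 0 {
        return acc
    }
    for j := range acc { // L1-normalize so the averaged mixture sums to 1
        acc[j] /= total
    }
    return acc
}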
func (l *LdaModeler) model_one_pass(tokenss_ptr *[]util.Document, seed int, initial_topicss [][]int) *ModelResults {
    tokenss := *tokenss_ptr
    ntopics := l.ntopics
    ndocuments := len(tokenss)
    range_ntopics := util.PyRange(ntopics)
    range_ndocuments := util.PyRange(ndocuments)

    // set up a deterministic random number generator seeded from the corpus size and the supplied seed
    randomizer := rand.New(rand.NewSource(int64(len(tokenss) + seed)))

    // topicss is a document-token matrix whose values are topic assignments.
    topicss := make([][]int, len(tokenss))
    if initial_topicss == nil {
        // set up a new, randomly initialized matrix
        for i, doc := range tokenss {
            doc_topics := make([]int, len(doc))
            for j := range doc {
                // randomly assign each token in the document to a topic
                doc_topics[j] = randomizer.Intn(ntopics)
            }
            topicss[i] = doc_topics
        }
    } else {
        // we were passed a topic matrix; make our own deep copy of it
        for i, doc := range initial_topicss {
            // NOTE: we cannot simply assign topicss[i] = doc; that would share the underlying
            // array (a shallow copy), which invites data races when repetitions run in parallel.
            topicss[i] = make([]int, len(doc))
            copy(topicss[i], doc)
        }
    }

    // compute document-topic counts and topic-word counts
    document_tokens_counts := make([]int, len(tokenss))
    for idx, doc := range tokenss {
        document_tokens_counts[idx] = len(doc)
    }
    document_topics_counts := make([][]int, len(tokenss))
    // Go maps return the zero value for missing keys (with an optional second return value
    // indicating presence): http://golang.org/doc/effective_go.html#maps
    topic_words_counts := make([]map[util.Token]int, ntopics)
    // n.b.: the values in a freshly made slice are the zero value for the element type.
    total_topic_counts := make([]int, ntopics)
    for i, tokens := range tokenss { // for each document
        topics := topicss[i]
        counts := make([]int, ntopics)
        for j, token := range tokens { // for each token
            topic := topics[j]
            counts[topic] += 1 // count of this topic's assignments in this document
            if topic_words_counts[topic] == nil {
                topic_words_counts[topic] = map[util.Token]int{}
            }
            topic_words_counts[topic][token] += 1 // count of this token's assignments to this topic
            total_topic_counts[topic] += 1        // total count of this topic across the corpus
        }
        document_topics_counts[i] = counts
    }

    all_keys := make([][]util.Token, len(range_ntopics)) // per-topic vocabulary lists
    for i := range range_ntopics {
        all_keys[i] = util.KeysFromMap(topic_words_counts[i])
    }
    all_tokens := util.SetFromLists(all_keys)

    // Dirichlet smoothing parameters
    alpha := l.alpha
    beta := l.beta
    W := len(all_tokens) // vocabulary size: number of unique tokens in the corpus
    T := ntopics         // number of topics
    betaW := beta * float64(W)
    alphaT := alpha * float64(T)

    // Loop over all documents and all tokens, resampling topic assignments and adjusting counts.
    proportional_probabilities := make([]float64, ntopics) // unnormalized probability of each topic
    fixups := make([]int, ntopics)                         // which topics need count adjustment for the current token

    for iteration := 0; iteration < l.iterations; iteration++ {
        change_count := 0
        for t_idx, tokens := range tokenss { // for each document
            topics := topicss[t_idx]
            document_index := range_ndocuments[t_idx]
            current_document_topics_counts := document_topics_counts[document_index]
            current_document_tokens_count := document_tokens_counts[document_index]
            n_di_minus_i := float64(current_document_tokens_count - 1)
            for token_index, token := range tokens { // for each token
                // Based on:
                // Griffiths TL, Steyvers M. Finding scientific topics.
                // Proceedings of the National Academy of Sciences of the United States of America. 2004;101(Suppl 1):5228-5235.
                // get the current topic assignment for this token:
                topic := topics[token_index]
                // Compute the conditional probability of each topic. The "fixups" slice is an
                // optimization that excludes the current token's own assignment from the counts
                // without branching inside the inner loop.
                fixups[topic] = 1
                total_proportional_probabilities := 0.0
                for _, j := range range_ntopics { // for each topic
                    fixup := fixups[j]
                    n_wi_minus_i_j := float64(topic_words_counts[j][token] - fixup)     // fixup is zero except for the current topic
                    n_di_minus_i_j := float64(current_document_topics_counts[j] - fixup) // ditto
                    n_dot_minus_i_j := float64(total_topic_counts[j] - fixup)
                    // Eq. 5 from the paper above:
                    //   P(z_i = j | z_-i, w) ∝ (n^(w_i)_{-i,j} + beta) / (n^(.)_{-i,j} + W*beta)
                    //                        * (n^(d_i)_{-i,j} + alpha) / (n^(d_i)_{-i} + T*alpha)
                    p_token_topic := (n_wi_minus_i_j + beta) / (n_dot_minus_i_j + betaW)
                    p_topic_document := (n_di_minus_i_j + alpha) / (n_di_minus_i + alphaT)
                    p := p_topic_document * p_token_topic
                    proportional_probabilities[j] = p
                    total_proportional_probabilities += p
                } // end for topics
                fixups[topic] = 0

                // resample this token's topic from the unnormalized conditional distribution
                new_topic := l.resample(randomizer.Float64(), proportional_probabilities, total_proportional_probabilities)

                // update assignments & counts
                if new_topic != topic {
                    // update the topic label for this token:
                    topics[token_index] = new_topic
                    // update total topic counts:
                    total_topic_counts[topic] -= 1
                    total_topic_counts[new_topic] += 1
                    // update document-topic counts:
                    current_document_topics_counts[topic] -= 1
                    current_document_topics_counts[new_topic] += 1
                    // update topic-word counts:
                    topic_words_counts[topic][token] -= 1
                    topic_words_counts[new_topic][token] += 1
                    // count changes for this pass
                    change_count += 1
                }
            } // end for tokens
        } // end for documents
        if iteration%100 == 0 {
            log.Printf("LDA - iteration %d of %d (%d topic reassignments this pass).\n", iteration, l.iterations, change_count)
        }
    } // end for iterations

    // document-topic assignments (theta_hat_d_j)
    theta_hat_ds := make([][]float64, ndocuments)
    for document_index := 0; document_index < ndocuments; document_index++ {
        document_token_count := document_tokens_counts[document_index]
        theta_hat_d := make([]float64, ntopics)
        document_topics_count := document_topics_counts[document_index]
        if document_token_count > 0 {
            for j := 0; j < ntopics; j++ {
                p := (float64(document_topics_count[j]) + alpha) / (float64(document_token_count) + alphaT)
                theta_hat_d[j] = p
            }
        } else {
            // degenerate document with no tokens: assign equal probability to every topic
            // (note the float64 conversion before dividing, to avoid integer division yielding zero)
            for j := range range_ntopics {
                theta_hat_d[j] = 1.0 / float64(ntopics)
            }
        }
        theta_hat_ds[document_index] = theta_hat_d
    }

    // compute topic-token assignments (phi_hat_w_j in the paper)
    phi_hats := make([]util.TokenProbMap, ntopics)
    for t := 0; t < ntopics; t++ { // for each topic
        dx := util.TokenProbMap{}
        for token, top_tok_count := range topic_words_counts[t] { // for each token
            dx[token] = util.Probability((float64(top_tok_count) + beta) / (float64(total_topic_counts[t]) + betaW))
        }
        phi_hats[t] = dx
    }

    // compute the log-likelihood of the tokens given the topic model; Eq. 2 in the Griffiths & Steyvers paper
    part_1, _ := math.Lgamma(float64(W) * beta) // Lgamma returns the log-gamma value and a sign indicator; we don't need the latter here
    part_2, _ := math.Lgamma(beta)
    log_likelihood := float64(T) * (part_1 - (float64(W) * part_2))
    for t := 0; t < ntopics; t++ {
        for _, w := range all_tokens {
            n_t_w := topic_words_counts[t][w]
            ntw_gamma, _ := math.Lgamma(float64(n_t_w) + beta)
            log_likelihood += ntw_gamma
        }
        n_dot_t := total_topic_counts[t]
        ndt_gamma, _ := math.Lgamma(float64(n_dot_t) + betaW)
        log_likelihood -= ndt_gamma
    }
    log.Printf("LDA - log-likelihood of data given model: %0.8e\n", log_likelihood)

    // Per-token perplexity of the sample. See the definition in:
    // Chemudugunta C, Smyth P, Steyvers M. Modeling General and Specific Aspects of Documents with a Probabilistic Topic Model.
    // In: Advances in Neural Information Processing Systems 19: Proceedings of the 2006 Conference. MIT Press; 2007. p. 241.
    //
    // Modified implementation: add the logs of the per-token P(w_i) rather than multiplying the
    // P(w_i) and then taking the log, in order to avoid underflowing to zero. That is,
    //   perplexity = 2^(-(1/N) * sum_i log2 P(w_i)),  with  P(w_i) = sum_j phi_hat[j][w_i] * theta_hat_d[j].
    perplexity := 0.0
    ntokens := 0
    for doc_idx, tokens := range tokenss { // for each document
        theta_hat_d := theta_hat_ds[doc_idx]
        for _, w := range tokens { // for each token
            temp_phi_hat_theta_hat := make([]float64, len(range_ntopics))
            for j, z := range range_ntopics { // for each topic
                temp_phi_hat_theta_hat[j] = float64(phi_hats[z][w]) * theta_hat_d[z]
            }
            perplexity += math.Log2(util.SumFloat(temp_phi_hat_theta_hat))
            ntokens += 1
        }
    }
    perplexity = math.Pow(2.0, (-perplexity / float64(ntokens)))
    log.Printf("LDA - mean sample token perplexity of data given model: %0.4f\n", perplexity)

    // save a final snapshot of the token-topic assignments (used to warm-start later repetitions)
    if l.last_model_token_topic_sample_assignments == nil {
        l.last_model_token_topic_sample_assignments = topicss
    }

    // return this pass's results sample
    return &ModelResults{&theta_hat_ds, &phi_hats, log_likelihood, perplexity}
}
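// l.resample (not shown in this section) draws a new topic from the unnormalized conditional
// distribution computed in the Gibbs loop above. The sketch below assumes it performs standard
// roulette-wheel (inverse-CDF) selection: rescale a uniform draw in [0, 1) by the probability
// total, then walk the cumulative sum until it is exceeded. The function name and exact behavior
// here are assumptions for illustration only; the modeler's own method may differ.
func resampleSketch(uniformDraw float64, proportionalProbs []float64, total float64) int {
    target := uniformDraw * total // rescale the uniform draw onto [0, total)
    cumulative := 0.0
    for topic, p := range proportionalProbs {
        cumulative += p
        if target < cumulative {
            return topic
        }
    }
    // fall back to the last topic if floating-point rounding left target >= cumulative
    return len(proportionalProbs) - 1
}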
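// The perplexity computed in model_one_pass reduces to 2^(-(1/N) * sum_i log2 P(w_i)), where N is
// the token count and P(w_i) is the token's probability under the fitted model. The helper below
// is a self-contained sketch of that final reduction over per-token log2 probabilities; it is
// illustrative only and is not called by the modeler.
func perplexityFromLog2ProbsSketch(tokenLog2Probs []float64) float64 {
    if len(tokenLog2Probs) == 0 {
        return math.Inf(1) // no tokens: perplexity is undefined; report +Inf
    }
    sum := 0.0
    for _, lp := range tokenLog2Probs {
        sum += lp
    }
    return math.Pow(2.0, -sum/float64(len(tokenLog2Probs)))
}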