func main() { // Definition of Command-Line Params param_data_file := flag.String("f", "", "path to a file containing News-titles") // param_interactively := flag.Bool("i", false, "run interactively") param_purge := flag.Bool("purge", false, "reinitialize the database, purging all existing data") param_cosine := flag.Bool("cosine", false, "calculate CosineSimilarity instead of ngram-clusering") param_title := flag.String("title", "", "print out similar news-titles") param_count := flag.Int("n", 3, "how many similar news titles should be printed?") flag.Parse() data.InitializeDatabase("root", "") data.Connect(database_name, *param_purge) if len(*param_data_file) > 0 { data.ParseFile(*param_data_file) } if !*param_cosine { if len(*param_title) > 0 { for _, match := range bestmatches.GetBestMatches(*param_title, *param_count) { fmt.Println(match) } } } if *param_cosine { titleCount := data.GetCountOfTitles() // 6. Prepare to calculate distances var title_1 int var title_2 int fmt.Printf("Enter number for first title (%d): ", titleCount) fmt.Scanln(&title_1) fmt.Printf("Enter number for second title (%d): ", titleCount) fmt.Scanln(&title_2) data_1 := data.GetTrigramsByTitle(data.GetNewsTitle(title_1)) data_2 := data.GetTrigramsByTitle(data.GetNewsTitle(title_2)) fmt.Printf("+-----------------------------+\n"+ "| Cosine similarity: %f |\n"+ "+-----------------------------+", ai.CosineSimilarity(ai.NormalizeTwoVectors(data_1, data_2))) } }
// returns n=count titles matching to the title func GetBestMatches(title string, count int) []string { var trigram_matches []int // first calculate ngrams of the search string for _, trigram := range ngram.BuildNGram(title, 3) { for _, match := range data.GetIdsOfTrigram(trigram) { trigram_matches = append(trigram_matches, match) } } // TODO: now get the *count* most frequent news ids frequencies := map[int]int{} for _, id := range trigram_matches { _, contains := frequencies[id] if contains { frequencies[id] += 1 } else { frequencies[id] = 1 } } // now sort according to the count sorted := SortByFrequency(trigram_matches, frequencies) return_val := []int{} for i := 0; i <= count; i += 1 { return_val = append(return_val, sorted[i]) } // get the according titles titles := []string{} for id := range return_val { titles = append(titles, data.GetNewsTitle(id)) } return titles }