Beispiel #1
0
// main parses the command-line flags, prepares the database and then runs in
// one of two modes:
//
//   - default: if -title is given, print the titles returned by
//     bestmatches.GetBestMatches for it (-n controls how many are requested);
//   - -cosine: interactively read two title numbers from stdin and print the
//     cosine similarity of their trigram data.
//
// If -f is given, the file is handed to data.ParseFile before either mode runs.
func main() {
	// Definition of command-line params.
	dataFile := flag.String("f", "", "path to a file containing News-titles")
	//	interactively := flag.Bool("i", false, "run interactively")
	purge := flag.Bool("purge", false, "reinitialize the database, purging all existing data")
	cosine := flag.Bool("cosine", false, "calculate CosineSimilarity instead of ngram-clusering")
	title := flag.String("title", "", "print out similar news-titles")
	count := flag.Int("n", 3, "how many similar news titles should be printed?")

	flag.Parse()

	data.InitializeDatabase("root", "")
	data.Connect(database_name, *purge)

	if *dataFile != "" {
		data.ParseFile(*dataFile)
	}

	// Trigram matching mode: nothing to do unless a search title was supplied.
	if !*cosine {
		if *title != "" {
			for _, match := range bestmatches.GetBestMatches(*title, *count) {
				fmt.Println(match)
			}
		}
		return
	}

	// Cosine mode: the prompts show the value of data.GetCountOfTitles in
	// parentheses as a hint for the user.
	titleCount := data.GetCountOfTitles()

	var first, second int

	fmt.Printf("Enter number for first title (%d): ", titleCount)
	// Stop on unparsable input instead of silently continuing with 0.
	if _, err := fmt.Scanln(&first); err != nil {
		fmt.Println("invalid number for first title:", err)
		return
	}
	fmt.Printf("Enter number for second title (%d): ", titleCount)
	if _, err := fmt.Scanln(&second); err != nil {
		fmt.Println("invalid number for second title:", err)
		return
	}

	firstTrigrams := data.GetTrigramsByTitle(data.GetNewsTitle(first))
	secondTrigrams := data.GetTrigramsByTitle(data.GetNewsTitle(second))

	// The results of NormalizeTwoVectors are forwarded directly to
	// CosineSimilarity, so the nested call form is kept as is.
	fmt.Printf("+-----------------------------+\n"+
		"| Cosine similarity: %f |\n"+
		"+-----------------------------+",
		ai.CosineSimilarity(ai.NormalizeTwoVectors(firstTrigrams, secondTrigrams)))
}
Beispiel #2
0
// GetBestMatches returns the titles of up to count news items whose trigrams
// overlap most with those of title.
//
// Every trigram of title is looked up with data.GetIdsOfTrigram, the number of
// hits per news id is counted, and the ids are ranked by SortByFrequency. The
// first count ranked ids are then resolved to titles via data.GetNewsTitle.
//
// If fewer than count ids are ranked, all of them are returned. A non-positive
// count, or a title without any matching trigram, yields an empty non-nil
// slice.
func GetBestMatches(title string, count int) []string {
	// Collect the ids of all news items sharing a trigram with the search
	// string; an id is recorded once per matching trigram.
	var trigramMatches []int
	for _, trigram := range ngram.BuildNGram(title, 3) {
		for _, match := range data.GetIdsOfTrigram(trigram) {
			trigramMatches = append(trigramMatches, match)
		}
	}

	// Count how often each news id was hit.
	frequencies := make(map[int]int)
	for _, id := range trigramMatches {
		frequencies[id]++
	}

	// Rank the ids according to their hit count.
	// NOTE(review): trigramMatches still contains one entry per hit; if
	// SortByFrequency does not collapse duplicates, the same title can appear
	// more than once in the result — confirm against its implementation.
	sorted := SortByFrequency(trigramMatches, frequencies)

	// Clamp the requested amount to what is available so that short or empty
	// rankings do not cause an out-of-range access.
	limit := count
	if limit > len(sorted) {
		limit = len(sorted)
	}
	if limit < 0 {
		limit = 0
	}

	// Resolve the selected ids (not their positions in the slice) to titles.
	titles := make([]string, 0, limit)
	for _, id := range sorted[:limit] {
		titles = append(titles, data.GetNewsTitle(id))
	}

	return titles
}