Exemple #1
0
// getWords loads the whole file at filePath, segments its text with tokenizer,
// and returns the result of passing those segments through analyzer.Normalize
// with the package-level cutset. If the file cannot be read, the read error is
// returned unchanged together with a nil slice.
func getWords(filePath string, tokenizer *goseg.Tokenizer) ([]string, error) {
	content, readErr := ioutil.ReadFile(filePath)
	if readErr != nil {
		return nil, readErr
	}

	// Cut is fed runes, so convert the raw file bytes via string first.
	runes := []rune(string(content))
	segments := tokenizer.Cut(runes)
	normalized := analyzer.Normalize(segments, cutset)
	return normalized, nil
}
Exemple #2
0
// main trains a two-class (Good/Bad) classifier from the mail files listed in
// trainingData and writes the trained classifier to the output path.
//
// For every training item it walks the regular files directly inside
// item.folder (sub-directories are skipped), parses each one as a POP3 mail,
// tokenizes "subject + space + content", and feeds the normalized words to the
// classifier under item.class. After each folder it prints the number of
// mails processed, the elapsed time, and the throughput in mails and bytes
// per second.
//
// Any failure (loading the dictionary, listing a folder, parsing a mail, or
// saving the classifier) terminates the program via log.Fatal.
func main() {
	classifier := bayesian.NewClassifier(Good, Bad)
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		log.Fatal(err)
	}

	for _, item := range trainingData {
		log.Printf("Traning %s", item.folder)
		totalNum := 0
		var totalSize int64

		startTime := time.Now()
		fis, err := ioutil.ReadDir(item.folder)
		if err != nil {
			log.Fatal(err)
		}

		for _, fi := range fis {
			if fi.IsDir() {
				continue
			}
			totalSize += fi.Size()

			filePath := filepath.Join(item.folder, fi.Name())
			mail := mailfile.NewPOP3Mail(filePath)
			if err = mail.Parse(); err != nil {
				log.Fatal(err)
			}

			// Close the mail as soon as the post has been extracted, before
			// inspecting the error, so it is released on both paths.
			post, err := mailpost.Parse(mail)
			mail.Close()
			if err != nil {
				log.Fatalf("Err: %v, Mail:%s", err, mail.Path())
			}

			words := analyzer.Normalize(tokenizer.Cut([]rune(post.Subject+" "+post.Content)), cutset)
			classifier.Learn(words, item.class)
			totalNum++
		}

		// Measure once so the printed duration and both rates agree.
		elapsed := time.Since(startTime)

		// Guard against a zero duration (coarse clocks, empty folders): the
		// division would yield Inf/NaN, and converting that to uint64 is
		// implementation-defined.
		var mailsPerSec, bytesPerSec float64
		if secs := elapsed.Seconds(); secs > 0 {
			mailsPerSec = float64(totalNum) / secs
			bytesPerSec = float64(totalSize) / secs
		}

		fmt.Printf("TotalNum: %d\n", totalNum)
		fmt.Printf("Elapsed: %s, TPS(Mail): %f, TPS(FileSize): %s\n",
			elapsed,
			mailsPerSec,
			fileutil.Humanize(uint64(bytesPerSec)))
	}

	// A failed save must not go unnoticed: without the output file the whole
	// training run is lost.
	if err := classifier.WriteToFile(output); err != nil {
		log.Fatal(err)
	}
}