func getWords(filePath string, tokenizer *goseg.Tokenizer) ([]string, error) { bytes, err := ioutil.ReadFile(filePath) if err != nil { return nil, err } words := tokenizer.Cut([]rune(string(bytes))) return analyzer.Normalize(words, cutset), nil }
func main() { classifier := bayesian.NewClassifier(Good, Bad) tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath) if err != nil { log.Fatal(err) } for _, item := range trainingData { log.Printf("Traning %s", item.folder) totalNum := 0 var totalSize int64 startTime := time.Now() fis, err := ioutil.ReadDir(item.folder) if err != nil { log.Fatal(err) } for _, fi := range fis { if fi.IsDir() { continue } totalSize += fi.Size() filePath := filepath.Join(item.folder, fi.Name()) mail := mailfile.NewPOP3Mail(filePath) if err = mail.Parse(); err != nil { log.Fatal(err) } post, err := mailpost.Parse(mail) mail.Close() if err != nil { log.Fatalf("Err: %v, Mail:%s", err, mail.Path()) } words := analyzer.Normalize(tokenizer.Cut([]rune(post.Subject+" "+post.Content)), cutset) classifier.Learn(words, item.class) totalNum += 1 } elapsed := time.Now().Sub(startTime) fmt.Printf("TotalNum: %d\n", totalNum) fmt.Printf("Elapsed: %s, TPS(Mail): %f, TPS(FileSize): %s\n", time.Now().Sub(startTime), float64(totalNum)/(float64(elapsed)/float64(time.Second)), fileutil.Humanize(uint64(float64(totalSize)/(float64(elapsed)/float64(time.Second))))) } classifier.WriteToFile(output) }