// Extract term frequencies from each mail folder and write a frequency
// dictionary, skipping the most frequent (least discriminative) terms.
func main() {
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		log.Fatal(err)
	}
	for _, item := range allData {
		fis, err := ioutil.ReadDir(item.folder)
		if err != nil {
			log.Fatal(err)
		}
		termFreq := make(map[string]int)
		for _, fi := range fis {
			if fi.IsDir() {
				continue
			}
			filePath := filepath.Join(item.folder, fi.Name())
			mail := mailfile.NewPOP3Mail(filePath)
			if err = mail.Parse(); err != nil {
				log.Fatal(err)
			}
			post, err := mailpost.Parse(mail)
			mail.Close()
			if err != nil {
				log.Fatalf("Err: %v, Mail: %s", err, mail.Path())
			}
			// Tokenize subject and body together, then count each
			// trimmed term that is longer than one byte.
			words := tokenizer.Cut([]rune(post.Subject + " " + post.Content))
			for _, word := range words {
				key := strings.Trim(word, cutset)
				if len(key) > 1 {
					termFreq[key]++
				}
			}
		}
		pairList := sortMapByValue(termFreq)
		output, err := os.OpenFile(item.output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644)
		if err != nil {
			log.Fatal(err)
		}
		writer := bufio.NewWriter(output)
		// Skip the top topTermRatio fraction of terms; they occur too
		// often to separate the classes.
		offset := int(float64(len(pairList)) * topTermRatio)
		for _, pair := range pairList[offset:] {
			if len([]rune(pair.Key)) > termMinLength {
				writer.WriteString(pair.Key + " " + strconv.Itoa(pair.Value) + "\n")
			}
		}
		writer.Flush()
		output.Close()
	}
}
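// sortMapByValue is not shown above. A minimal sketch, assuming it
// returns the term/frequency pairs sorted by descending count (so that
// pairList[offset:] skips the most frequent terms). The Pair name is an
// assumption, not part of the original source; needs the "sort" import.
type Pair struct {
	Key   string
	Value int
}

func sortMapByValue(m map[string]int) []Pair {
	pairs := make([]Pair, 0, len(m))
	for k, v := range m {
		pairs = append(pairs, Pair{k, v})
	}
	// Highest frequency first.
	sort.Slice(pairs, func(i, j int) bool { return pairs[i].Value > pairs[j].Value })
	return pairs
}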
// Train the Good/Bad classifier from labelled mail folders, report
// throughput, and persist the result to disk.
func main() {
	classifier := bayesian.NewClassifier(Good, Bad)
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		log.Fatal(err)
	}
	for _, item := range trainingData {
		log.Printf("Training %s", item.folder)
		totalNum := 0
		var totalSize int64
		startTime := time.Now()
		fis, err := ioutil.ReadDir(item.folder)
		if err != nil {
			log.Fatal(err)
		}
		for _, fi := range fis {
			if fi.IsDir() {
				continue
			}
			totalSize += fi.Size()
			filePath := filepath.Join(item.folder, fi.Name())
			mail := mailfile.NewPOP3Mail(filePath)
			if err = mail.Parse(); err != nil {
				log.Fatal(err)
			}
			post, err := mailpost.Parse(mail)
			mail.Close()
			if err != nil {
				log.Fatalf("Err: %v, Mail: %s", err, mail.Path())
			}
			// Tokenize, normalize, and feed the words to the classifier
			// under the folder's class label.
			words := analyzer.Normalize(tokenizer.Cut([]rune(post.Subject+" "+post.Content)), cutset)
			classifier.Learn(words, item.class)
			totalNum++
		}
		elapsed := time.Since(startTime)
		seconds := float64(elapsed) / float64(time.Second)
		fmt.Printf("TotalNum: %d\n", totalNum)
		fmt.Printf("Elapsed: %s, TPS(Mail): %f, TPS(FileSize): %s\n",
			elapsed,
			float64(totalNum)/seconds,
			fileutil.Humanize(uint64(float64(totalSize)/seconds)))
	}
	if err := classifier.WriteToFile(output); err != nil {
		log.Fatal(err)
	}
}
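// analyzer.Normalize is used above but not defined here. A plausible
// sketch, mirroring the trim-and-filter loop of the term extractor:
// strip the cutset characters from each token and drop anything too
// short to be a useful feature. This is an assumption, not the
// package's actual implementation.
func Normalize(words []string, cutset string) []string {
	normalized := make([]string, 0, len(words))
	for _, word := range words {
		key := strings.Trim(word, cutset)
		if len(key) > 1 {
			normalized = append(normalized, key)
		}
	}
	return normalized
}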
// NewBayesianAnalyzer loads the tokenizer dictionary and a previously
// trained classifier from disk. No updater is attached, so the
// classifier is read-only after construction.
func NewBayesianAnalyzer(trainingDataFilePath string, dictDataFilePath string) (*BayesianAnalyzer, error) {
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		return nil, err
	}
	coordinator := &sync.RWMutex{}
	classifier, err := bayesian.NewClassifierFromFile(trainingDataFilePath)
	if err != nil {
		return nil, err
	}
	return &BayesianAnalyzer{tokenizer, classifier, nil, coordinator}, nil
}
// NewBayesianAnalyzerWithUpdater additionally wires up a DelayedUpdater
// that shares the analyzer's RWMutex, so classifier updates are
// coordinated with concurrent reads.
func NewBayesianAnalyzerWithUpdater(trainingDataFilePath string, dictDataFilePath string, updateDelay time.Duration) (*BayesianAnalyzer, error) {
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		return nil, err
	}
	coordinator := &sync.RWMutex{}
	classifier, err := bayesian.NewClassifierFromFile(trainingDataFilePath)
	if err != nil {
		return nil, err
	}
	updater := NewDelayedUpdater(classifier, trainingDataFilePath, updateDelay, coordinator)
	return &BayesianAnalyzer{tokenizer, classifier, updater, coordinator}, nil
}
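// A hedged usage sketch for the two constructors above; the wrapper
// function, file names, and delay are illustrative placeholders only.
func exampleConstruction() {
	// The updater variant lets a retrained classifier be swapped in
	// behind the shared RWMutex while readers keep using the analyzer.
	analyzer, err := NewBayesianAnalyzerWithUpdater("bayesian.dat", "dict.txt", 30*time.Second)
	if err != nil {
		log.Fatal(err)
	}
	_ = analyzer
}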
// Bootstrap a classifier from two seed word lists and persist it.
func main() {
	classifier := bayesian.NewClassifier(Good, Bad)
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	checkErr(err)

	goodWords, err := getWords("goodwords.txt", tokenizer)
	checkErr(err)
	fmt.Printf("Normalized Good Words:\n%s\n", goodWords)
	fmt.Println("")

	badWords, err := getWords("badwords.txt", tokenizer)
	checkErr(err)
	fmt.Printf("Normalized Bad Words:\n%s\n", badWords)

	classifier.Learn(goodWords, Good)
	classifier.Learn(badWords, Bad)
	checkErr(classifier.WriteToFile(output))
}
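// Once written to disk, the classifier can be loaded back and queried.
// A minimal sketch, assuming the bayesian package follows the
// jbrukh/bayesian-style API used above (NewClassifierFromFile,
// LogScores); the classify wrapper itself is a placeholder name.
func classify(words []string) {
	classifier, err := bayesian.NewClassifierFromFile(output)
	if err != nil {
		log.Fatal(err)
	}
	// LogScores returns the per-class log-likelihoods and the index of
	// the most likely class (0 = Good, 1 = Bad in the order registered).
	scores, likely, _ := classifier.LogScores(words)
	fmt.Printf("scores: %v, most likely class index: %d\n", scores, likely)
}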