func main() { tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath) if err != nil { log.Fatal(err) } for _, item := range allData { fis, err := ioutil.ReadDir(item.folder) termFreq := make(map[string]int) if err != nil { log.Fatal(err) } for _, fi := range fis { if fi.IsDir() { continue } filePath := filepath.Join(item.folder, fi.Name()) mail := mailfile.NewPOP3Mail(filePath) if err = mail.Parse(); err != nil { log.Fatal(err) } post, err := mailpost.Parse(mail) mail.Close() if err != nil { log.Fatalf("Err: %v, Mail:%s", err, mail.Path()) } words := tokenizer.Cut([]rune(post.Subject + " " + post.Content)) for _, word := range words { key := strings.Trim(word, cutset) if len(key) > 1 { termFreq[key] = termFreq[key] + 1 } } } pairList := sortMapByValue(termFreq) output, err := os.OpenFile(item.output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644) writer := bufio.NewWriter(output) offset := int(float64(len(pairList)) * topTermRatio) for _, pair := range pairList[offset:] { if len([]rune(pair.Key)) > termMinLength { writer.WriteString(pair.Key + " " + strconv.Itoa(pair.Value) + "\n") } } writer.Flush() output.Close() } }
func main() { classifier := bayesian.NewClassifier(Good, Bad) tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath) if err != nil { log.Fatal(err) } for _, item := range trainingData { log.Printf("Traning %s", item.folder) totalNum := 0 var totalSize int64 startTime := time.Now() fis, err := ioutil.ReadDir(item.folder) if err != nil { log.Fatal(err) } for _, fi := range fis { if fi.IsDir() { continue } totalSize += fi.Size() filePath := filepath.Join(item.folder, fi.Name()) mail := mailfile.NewPOP3Mail(filePath) if err = mail.Parse(); err != nil { log.Fatal(err) } post, err := mailpost.Parse(mail) mail.Close() if err != nil { log.Fatalf("Err: %v, Mail:%s", err, mail.Path()) } words := analyzer.Normalize(tokenizer.Cut([]rune(post.Subject+" "+post.Content)), cutset) classifier.Learn(words, item.class) totalNum += 1 } elapsed := time.Now().Sub(startTime) fmt.Printf("TotalNum: %d\n", totalNum) fmt.Printf("Elapsed: %s, TPS(Mail): %f, TPS(FileSize): %s\n", time.Now().Sub(startTime), float64(totalNum)/(float64(elapsed)/float64(time.Second)), fileutil.Humanize(uint64(float64(totalSize)/(float64(elapsed)/float64(time.Second))))) } classifier.WriteToFile(output) }
func (cif *ContentInspectionFilter) Filter(mail mailfile.Mail) Result { log.Printf("Run %s, Mail:%s\n", cif, mail.Name()) cif.total.Inc(1) post, err := mailpost.Parse(mail) mail.Close() if err != nil { cif.malformed.Inc(1) log.Printf("ContentInspectionFilter: Err:%v, Mail:%s\n", err, mail.Path()) return cif.next.Filter(mail) } class := cif.anlz.Test(post.Subject + " " + post.Content) cif.counters[class].Inc(1) if cif.allPass || analyzer.Good == class { return cif.next.Filter(mail) } return Quarantine }
func (cih *ContentInspectionUpdater) Update(mail mailfile.Mail) { if leaner, ok := cih.anlz.(analyzer.Learner); ok { log.Printf("Run %s, Mail:%s\n", cih, mail.Name()) cih.total.Inc(1) post, err := mailpost.Parse(mail) mail.Close() if err != nil { cih.malformed.Inc(1) log.Printf("ContentInspectionUpdater: Err:%v, Mail:%s\n", err, mail.Path()) return } leaner.Learn(post.Subject, cih.class) leaner.Learn(post.Content, cih.class) err = os.Remove(mail.Path()) if err != nil { log.Println(err) } } }
func main() { anlz, err := analyzer.NewBayesianAnalyzer(traningDataFilePath, dictDataFilePath) if err != nil { log.Fatal(err) } for _, item := range testData { log.Printf("Testing %s", item.folder) totalNum, totalError, totalNeutral := 0, 0, 0 var totalSize int64 fis, err := ioutil.ReadDir(item.folder) if err != nil { log.Fatal(err) } startTime := time.Now() for _, fi := range fis { if fi.IsDir() { continue } totalSize += fi.Size() totalNum += 1 mailFilePath := filepath.Join(item.folder, fi.Name()) mail := mailfile.NewPOP3Mail(mailFilePath) if err = mail.Parse(); err != nil { log.Fatal(err) } post, err := mailpost.Parse(mail) mail.Close() if err != nil { log.Fatalf("Err: %v, Mail:%s", err, mail.Path()) } class := anlz.Test(post.Subject + " " + post.Content) switch { case analyzer.Neutral == class: totalNeutral += 1 fmt.Println(ansi.Color(mailFilePath, "cyan+b")) case item.class != class: totalError += 1 fmt.Println(ansi.Color(mailFilePath, "red+b")) } } elapsed := time.Now().Sub(startTime) fmt.Printf("TotalNum: %d, TotalError: %d, ErrRate: %f, TotalNeutral:%d, Confident:%f\n", totalNum, totalError, float64(totalError)/float64(totalNum), totalNeutral, float64(totalNum-totalNeutral)/float64(totalNum)) fmt.Printf("Elapsed: %s, TPS(Mail): %f, TPS(FileSize): %s\n", time.Now().Sub(startTime), float64(totalNum)/(float64(elapsed)/float64(time.Second)), fileutil.Humanize(uint64(float64(totalSize)/(float64(elapsed)/float64(time.Second))))) } }