예제 #1
0
// main builds one term-frequency file per entry of allData.
//
// For every regular file in item.folder it parses the file as a POP3 mail,
// tokenizes "subject + space + content", trims each token with cutset and
// counts the tokens that are still longer than one byte. The counts are then
// ordered by sortMapByValue and the tail of that list (everything from the
// topTermRatio offset onward) is written to item.output as "term count" lines.
//
// Any I/O or parse failure aborts the program through log.Fatal.
func main() {
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		log.Fatal(err)
	}

	for _, item := range allData {
		fis, err := ioutil.ReadDir(item.folder)
		if err != nil {
			log.Fatal(err)
		}

		termFreq := make(map[string]int)

		for _, fi := range fis {
			if fi.IsDir() {
				continue
			}

			filePath := filepath.Join(item.folder, fi.Name())
			mail := mailfile.NewPOP3Mail(filePath)
			if err = mail.Parse(); err != nil {
				log.Fatal(err)
			}

			post, err := mailpost.Parse(mail)
			mail.Close()
			if err != nil {
				log.Fatalf("Err: %v, Mail:%s", err, mail.Path())
			}

			words := tokenizer.Cut([]rune(post.Subject + " " + post.Content))

			for _, word := range words {
				key := strings.Trim(word, cutset)
				// len counts bytes here, so this drops empty and single-byte tokens.
				if len(key) > 1 {
					termFreq[key]++
				}
			}
		}

		pairList := sortMapByValue(termFreq)

		// The open error must be checked: on failure output is nil and every
		// later write would fail without the result file ever being produced.
		output, err := os.OpenFile(item.output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644)
		if err != nil {
			log.Fatal(err)
		}
		writer := bufio.NewWriter(output)

		// Skip the leading topTermRatio fraction of the sorted list.
		// NOTE(review): assumes topTermRatio is within [0, 1]; a larger value
		// would make the slice expression below panic - confirm at its definition.
		offset := int(float64(len(pairList)) * topTermRatio)

		for _, pair := range pairList[offset:] {
			// Length is measured in runes so multi-byte terms are judged by
			// character count rather than byte count.
			if len([]rune(pair.Key)) > termMinLength {
				// A write error is retained by the bufio.Writer and surfaces
				// from Flush below, so it is not checked per line.
				writer.WriteString(pair.Key + " " + strconv.Itoa(pair.Value) + "\n")
			}
		}

		if err := writer.Flush(); err != nil {
			output.Close()
			log.Fatal(err)
		}
		if err := output.Close(); err != nil {
			log.Fatal(err)
		}
	}
}
예제 #2
0
// main trains a two-class (Good, Bad) bayesian classifier and saves it.
//
// For every entry of trainingData it parses each regular file in item.folder
// as a POP3 mail, tokenizes "subject + space + content", normalizes the tokens
// with cutset and feeds them to the classifier under item.class. After each
// folder it prints the number of mails, the elapsed time and the throughput in
// mails and in bytes per second. Finally the classifier is written to output.
//
// Any I/O, parse or save failure aborts the program through log.Fatal.
func main() {
	classifier := bayesian.NewClassifier(Good, Bad)
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		log.Fatal(err)
	}

	for _, item := range trainingData {
		log.Printf("Traning %s", item.folder)
		totalNum := 0
		var totalSize int64

		startTime := time.Now()
		fis, err := ioutil.ReadDir(item.folder)
		if err != nil {
			log.Fatal(err)
		}

		for _, fi := range fis {
			if fi.IsDir() {
				continue
			}
			totalSize += fi.Size()

			filePath := filepath.Join(item.folder, fi.Name())
			mail := mailfile.NewPOP3Mail(filePath)
			if err = mail.Parse(); err != nil {
				log.Fatal(err)
			}

			post, err := mailpost.Parse(mail)
			mail.Close()
			if err != nil {
				log.Fatalf("Err: %v, Mail:%s", err, mail.Path())
			}

			words := analyzer.Normalize(tokenizer.Cut([]rune(post.Subject+" "+post.Content)), cutset)
			classifier.Learn(words, item.class)
			totalNum++
		}

		// Measure once so the printed duration and both throughput figures
		// are derived from the same interval.
		elapsed := time.Since(startTime)
		seconds := float64(elapsed) / float64(time.Second)

		fmt.Printf("TotalNum: %d\n", totalNum)
		fmt.Printf("Elapsed: %s, TPS(Mail): %f, TPS(FileSize): %s\n",
			elapsed,
			float64(totalNum)/seconds,
			fileutil.Humanize(uint64(float64(totalSize)/seconds)))
	}

	// A failed save must not pass silently: the trained model is the only
	// product of this program.
	if err := classifier.WriteToFile(output); err != nil {
		log.Fatal(err)
	}
}
// Filter classifies the mail by its subject and content.
//
// The mail is parsed and then closed. A mail that cannot be parsed is counted
// as malformed, logged, and handed to the next filter unchanged. Otherwise the
// counter for the detected class is incremented; the mail goes on to the next
// filter when allPass is set or the class is analyzer.Good, and is answered
// with Quarantine in every other case.
func (cif *ContentInspectionFilter) Filter(mail mailfile.Mail) Result {
	log.Printf("Run %s, Mail:%s\n", cif, mail.Name())
	cif.total.Inc(1)

	parsed, parseErr := mailpost.Parse(mail)
	mail.Close()
	if parseErr != nil {
		cif.malformed.Inc(1)
		log.Printf("ContentInspectionFilter: Err:%v, Mail:%s\n", parseErr, mail.Path())
		return cif.next.Filter(mail)
	}

	text := parsed.Subject + " " + parsed.Content
	verdict := cif.anlz.Test(text)
	cif.counters[verdict].Inc(1)

	// Only a non-Good verdict with allPass disabled stops the chain.
	if !cif.allPass && verdict != analyzer.Good {
		return Quarantine
	}
	return cif.next.Filter(mail)
}
// Update teaches the analyzer the subject and content of mail under cih.class
// and then deletes the mail file.
//
// Nothing happens when the configured analyzer does not implement
// analyzer.Learner. A mail that cannot be parsed is counted as malformed,
// logged, and left on disk. A failure to remove the file is only logged.
func (cih *ContentInspectionUpdater) Update(mail mailfile.Mail) {
	learner, canLearn := cih.anlz.(analyzer.Learner)
	if !canLearn {
		return
	}

	log.Printf("Run %s, Mail:%s\n", cih, mail.Name())
	cih.total.Inc(1)

	parsed, parseErr := mailpost.Parse(mail)
	mail.Close()
	if parseErr != nil {
		cih.malformed.Inc(1)
		log.Printf("ContentInspectionUpdater: Err:%v, Mail:%s\n", parseErr, mail.Path())
		return
	}

	// Subject and content are learned as two separate samples.
	learner.Learn(parsed.Subject, cih.class)
	learner.Learn(parsed.Content, cih.class)

	if removeErr := os.Remove(mail.Path()); removeErr != nil {
		log.Println(removeErr)
	}
}
예제 #5
0
// main evaluates a bayesian analyzer against labelled mail folders.
//
// For every entry of testData it parses each regular file in item.folder as a
// POP3 mail and classifies "subject + space + content". A Neutral verdict is
// counted and its path printed in cyan; any other verdict that differs from
// item.class is counted as an error and its path printed in red. After each
// folder it prints the totals, the error and confidence ratios, the elapsed
// time and the throughput in mails and in bytes per second.
//
// Any I/O or parse failure aborts the program through log.Fatal.
func main() {
	anlz, err := analyzer.NewBayesianAnalyzer(traningDataFilePath, dictDataFilePath)
	if err != nil {
		log.Fatal(err)
	}

	for _, item := range testData {
		log.Printf("Testing %s", item.folder)

		var (
			totalNum     int
			totalError   int
			totalNeutral int
			totalSize    int64
		)

		entries, err := ioutil.ReadDir(item.folder)
		if err != nil {
			log.Fatal(err)
		}

		startTime := time.Now()
		for _, entry := range entries {
			if entry.IsDir() {
				continue
			}
			totalSize += entry.Size()
			totalNum++

			mailFilePath := filepath.Join(item.folder, entry.Name())
			mail := mailfile.NewPOP3Mail(mailFilePath)
			if err = mail.Parse(); err != nil {
				log.Fatal(err)
			}

			parsed, err := mailpost.Parse(mail)
			mail.Close()
			if err != nil {
				log.Fatalf("Err: %v, Mail:%s", err, mail.Path())
			}

			verdict := anlz.Test(parsed.Subject + " " + parsed.Content)

			// Neutral takes precedence: it is never counted as an error.
			if verdict == analyzer.Neutral {
				totalNeutral++
				fmt.Println(ansi.Color(mailFilePath, "cyan+b"))
			} else if verdict != item.class {
				totalError++
				fmt.Println(ansi.Color(mailFilePath, "red+b"))
			}
		}

		elapsed := time.Since(startTime)
		seconds := float64(elapsed) / float64(time.Second)

		errRate := float64(totalError) / float64(totalNum)
		confident := float64(totalNum-totalNeutral) / float64(totalNum)

		fmt.Printf("TotalNum: %d, TotalError: %d, ErrRate: %f, TotalNeutral:%d, Confident:%f\n",
			totalNum, totalError, errRate, totalNeutral, confident)
		fmt.Printf("Elapsed: %s, TPS(Mail): %f, TPS(FileSize): %s\n",
			time.Since(startTime),
			float64(totalNum)/seconds,
			fileutil.Humanize(uint64(float64(totalSize)/seconds)))
	}
}