Example #1
func main() {
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		log.Fatal(err)
	}

	for _, item := range allData {
		// Read every file in this folder and count term frequencies.
		fis, err := ioutil.ReadDir(item.folder)
		if err != nil {
			log.Fatal(err)
		}

		termFreq := make(map[string]int)

		for _, fi := range fis {
			if fi.IsDir() {
				continue
			}

			filePath := filepath.Join(item.folder, fi.Name())
			mail := mailfile.NewPOP3Mail(filePath)
			if err = mail.Parse(); err != nil {
				log.Fatal(err)
			}

			post, err := mailpost.Parse(mail)
			mail.Close()
			if err != nil {
				log.Fatalf("Err: %v, Mail:%s", err, mail.Path())
			}

			// Tokenize the mail subject and body into terms.
			words := tokenizer.Cut([]rune(post.Subject + " " + post.Content))

			for _, word := range words {
				key := strings.Trim(word, cutset)
				if len(key) > 1 {
					termFreq[key] = termFreq[key] + 1
				}
			}
		}

		pairList := sortMapByValue(termFreq)

		output, err := os.OpenFile(item.output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644)
		if err != nil {
			log.Fatal(err)
		}
		writer := bufio.NewWriter(output)

		// Skip the first topTermRatio fraction of the sorted pairs; the remaining
		// terms (those longer than termMinLength) are written with their counts.
		offset := int(float64(len(pairList)) * topTermRatio)

		for _, pair := range pairList[offset:] {
			if len([]rune(pair.Key)) > termMinLength {
				writer.WriteString(pair.Key + " " + strconv.Itoa(pair.Value) + "\n")
			}
		}
		writer.Flush()
		output.Close()

	}
}
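
Example #1 relies on a sortMapByValue helper that is not shown. The sketch below is one plausible implementation, not the original: the Key and Value field names come from the loop above, while the Pair type itself, the ascending sort order, and the use of sort.Slice are assumptions.

import "sort"

// Hypothetical Pair type; only the Key/Value field names are taken from Example #1.
type Pair struct {
	Key   string
	Value int
}

// sortMapByValue (sketch): turn the frequency map into a slice sorted by count.
// Ascending order is an assumption; with it, pairList[offset:] keeps the more
// frequent terms.
func sortMapByValue(m map[string]int) []Pair {
	pairs := make([]Pair, 0, len(m))
	for k, v := range m {
		pairs = append(pairs, Pair{Key: k, Value: v})
	}
	sort.Slice(pairs, func(i, j int) bool { return pairs[i].Value < pairs[j].Value })
	return pairs
}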
Example #2
func main() {
	classifier := bayesian.NewClassifier(Good, Bad)
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		log.Fatal(err)
	}

	for _, item := range trainingData {
		log.Printf("Traning %s", item.folder)
		totalNum := 0
		var totalSize int64

		startTime := time.Now()
		fis, err := ioutil.ReadDir(item.folder)

		if err != nil {
			log.Fatal(err)
		}

		for _, fi := range fis {
			if fi.IsDir() {
				continue
			}
			totalSize += fi.Size()

			filePath := filepath.Join(item.folder, fi.Name())
			mail := mailfile.NewPOP3Mail(filePath)
			if err = mail.Parse(); err != nil {
				log.Fatal(err)
			}

			post, err := mailpost.Parse(mail)
			mail.Close()
			if err != nil {
				log.Fatalf("Err: %v, Mail:%s", err, mail.Path())
			}

			// Tokenize and normalize the mail, then feed the words to the
			// classifier as training data for this folder's class.
			words := analyzer.Normalize(tokenizer.Cut([]rune(post.Subject+" "+post.Content)), cutset)
			classifier.Learn(words, item.class)
			totalNum++
		}

		elapsed := time.Since(startTime)

		fmt.Printf("TotalNum: %d\n", totalNum)
		fmt.Printf("Elapsed: %s, TPS(Mail): %f, TPS(FileSize): %s\n",
			elapsed,
			float64(totalNum)/(float64(elapsed)/float64(time.Second)),
			fileutil.Humanize(uint64(float64(totalSize)/(float64(elapsed)/float64(time.Second)))))

	}

	if err := classifier.WriteToFile(output); err != nil {
		log.Fatal(err)
	}
}
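
Example #2 replaces the inline strings.Trim loop from Example #1 with an analyzer.Normalize helper whose implementation is not shown here. The sketch below is a hypothetical reconstruction based on Example #1's inline logic (trim the cutset characters and drop one-character tokens); the real helper may differ.

import "strings"

// Normalize (sketch): trim cutset characters from each token and keep only the
// tokens longer than one byte, mirroring the inline loop in Example #1.
func Normalize(words []string, cutset string) []string {
	normalized := make([]string, 0, len(words))
	for _, word := range words {
		key := strings.Trim(word, cutset)
		if len(key) > 1 {
			normalized = append(normalized, key)
		}
	}
	return normalized
}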
Example #3
func NewBayesianAnalyzer(traningDataFilePath string, dictDataFilePath string) (*BayesianAnalyzer, error) {
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		return nil, err
	}

	coordinator := &sync.RWMutex{}
	classifier, err := bayesian.NewClassifierFromFile(traningDataFilePath)
	if err != nil {
		return nil, err
	}

	return &BayesianAnalyzer{tokenizer, classifier, nil, coordinator}, nil
}
Example #4
func NewBayesianAnalyzerWithUpdater(traningDataFilePath string, dictDataFilePath string, updateDelay time.Duration) (*BayesianAnalyzer, error) {
	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		return nil, err
	}

	coordinator := &sync.RWMutex{}
	classifier, err := bayesian.NewClassifierFromFile(traningDataFilePath)
	if err != nil {
		return nil, err
	}

	// The updater shares the analyzer's RWMutex and training-data path, so
	// delayed classifier updates can be coordinated with readers.
	updater := NewDelayedUpdater(classifier, traningDataFilePath, updateDelay, coordinator)

	return &BayesianAnalyzer{tokenizer, classifier, updater, coordinator}, nil
}
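
A minimal usage sketch for the two constructors above. Only the signatures shown in Examples #3 and #4 come from the source; the file paths, the 30-second delay, and the surrounding main function are placeholders, and any BayesianAnalyzer methods you would call afterwards are not part of these examples.

func main() {
	// Placeholder paths and delay; the constructor signature comes from Example #4.
	analyzer, err := NewBayesianAnalyzerWithUpdater("bayesian.data", "dict.data", 30*time.Second)
	if err != nil {
		log.Fatal(err)
	}
	_ = analyzer // further use depends on BayesianAnalyzer methods not shown here
}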
Example #5
func main() {
	classifier := bayesian.NewClassifier(Good, Bad)

	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	checkErr(err)

	goodWords, err := getWords("goodwords.txt", tokenizer)
	checkErr(err)
	fmt.Printf("Normalized Good Words:\n%s\n", goodWords)

	fmt.Println("")

	badWords, err := getWords("badwords.txt", tokenizer)
	checkErr(err)
	fmt.Printf("Normalized Bad Words:\n%s\n", badWords)

	classifier.Learn(goodWords, Good)
	classifier.Learn(badWords, Bad)

	err = classifier.WriteToFile(output)
	checkErr(err)
}
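
Examples #2 and #5 persist the trained classifier with WriteToFile; the matching read path is the NewClassifierFromFile call already used in Examples #3 and #4. The sketch below loads such a file and scores a tokenized document with LogScores from the jbrukh/bayesian package (the API that the Good/Bad classes, Learn, and WriteToFile calls above belong to); the file name and the sample words are placeholders.

func main() {
	// Load the classifier written by Example #5; "bayesian.data" is a placeholder name.
	classifier, err := bayesian.NewClassifierFromFile("bayesian.data")
	if err != nil {
		log.Fatal(err)
	}

	// LogScores returns the per-class log scores and the index of the most
	// likely class for the given document.
	words := []string{"free", "offer", "click"}
	scores, likely, _ := classifier.LogScores(words)
	fmt.Printf("scores: %v, most likely class index: %d\n", scores, likely)
}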