示例#1
0
// mainMerge runs the segment-and-merge pipeline on the 125.txt test article.
//
// It loads the sign, stop-word and Sogou dictionary files, segments the
// article, prints the number of segments, and dumps them to
// ../data/test-125.log. The segments are then passed through
// occurrence.FilterSegment together with the stop-word set, handed to
// occurrence.Merge with the values 6 and 0.010 (presumably frequency and
// score thresholds; confirm against occurrence.Merge), and the resulting
// pair terms are written to ../data/main-test-125-merge-merge.log.
func mainMerge() {
	freqGate := 6
	scoreGate := float32(0.010)

	signSet := dict.NewSign("../data/dictionary/sign.txt")
	stopSet := dict.NewSign("../data/dictionary/stopwords.txt")
	lexicon := dict.NewDictionary("../data/dictionary/sogoudictionary.txt")

	text := util.ReadFile("../data/testdata/125.txt")
	segs := segment.SegmentDoc(text, signSet, lexicon)
	fmt.Println(len(segs))

	// Dump the raw segmentation before any filtering is applied.
	util.WriteFile("../data/test-125.log", segment.GetSegmentStr(segs))

	segs = occurrence.FilterSegment(segs, stopSet)

	merged := occurrence.Merge(segs, freqGate, scoreGate)
	util.WriteFile("../data/main-test-125-merge-merge.log", term.GetPairTermStr(merged))
}
// LogSegments writes a textual dump of segments to ../data/segment.log.
//
// The dump starts with the line "Output words: " and is followed by one
// record per segment of the form "<start> <end> <text>", each terminated by
// a tab character. No newline is written between records.
func LogSegments(segments []*segment.Segment) {
	var out bytes.Buffer
	out.WriteString("Output words: \n")

	for i := range segments {
		seg := segments[i]
		// Format straight into the buffer instead of building a temporary string.
		fmt.Fprintf(&out, "%d %d %s\t", seg.Start(), seg.End(), seg.Text())
	}

	util.WriteFile("../data/segment.log", out.String())
}
示例#3
0
// Test_Merge125 runs segmentation, filtering and merging on the 125.txt test
// article and writes the result of each stage to its own log file under
// ../data. It makes no assertions on t; it only produces the log files.
func Test_Merge125(t *testing.T) {
	signSet := dict.NewSign("../data/dictionary/sign.txt")
	stopSet := dict.NewSign("../data/dictionary/stopwords.txt")
	lexicon := dict.NewDictionary("../data/dictionary/sogoudictionary.txt")

	text := util.ReadFile("../data/testdata/125.txt")

	// Stage 1: every segment produced by the segmenter.
	segs := segment.SegmentDoc(text, signSet, lexicon)
	util.WriteFile("../data/test-125-segment-all.log", segment.GetSegmentStr(segs))

	// Stage 2: segments after occurrence.FilterSegment with the stop-word set.
	segs = occurrence.FilterSegment(segs, stopSet)
	util.WriteFile("../data/test-125-segment-filter.log", segment.GetSegmentStr(segs))

	// Stage 3: pair terms returned by occurrence.Merge.
	merged := occurrence.Merge(segs, 4, 15.0)
	util.WriteFile("../data/test-125-merge-merge.log", term.GetPairTermStr(merged))
}
示例#4
0
// Test_SegmentMerger runs two rounds of occurrence analysis on the 125-2.txt
// test article and writes the output of each step to log files under ../data.
//
// Round one builds an Occurrence from the filtered segments, extracts pair
// terms with GetPairTerms(10.0), and merges them back into the segment list
// via occurrence.MergeSegment. Round two repeats the occurrence analysis on
// the merged segments. It makes no assertions on t.
func Test_SegmentMerger(t *testing.T) {
	signSet := dict.NewSign("../data/dictionary/sign.txt")
	stopSet := dict.NewSign("../data/dictionary/stopwords.txt")
	lexicon := dict.NewDictionary("../data/dictionary/sogoudictionary.txt")

	text := util.ReadFile("../data/testdata/125-2.txt")
	segs := occurrence.FilterSegment(segment.SegmentDoc(text, signSet, lexicon), stopSet)
	fmt.Println(len(segs))
	util.WriteFile("../data/test-125-2.log", segment.GetSegmentStr(segs))

	// First round: occurrence statistics over the filtered segments.
	first := occurrence.NewOccurrence()
	first.AddSegments(segs, 3)
	first.Compute()
	first.Output()

	pairs := first.GetPairTerms(10.0)
	util.WriteFile("../data/test-125-2-occur.log", term.GetPairTermStr(pairs))

	// Merge the first-round pair terms back into the segment sequence.
	mergedSegs := occurrence.MergeSegment(segs, pairs)
	fmt.Println(len(mergedSegs))
	util.WriteFile("../data/test-125-2-merge.log", segment.GetSegmentStr(mergedSegs))

	// Second round: the same analysis over the merged segments.
	second := occurrence.NewOccurrence()
	second.AddSegments(mergedSegs, 1)
	second.Compute()
	second.Output()

	pairs = second.GetPairTerms(10.0)
	util.WriteFile("../data/test-125-2-second-merge.log", term.GetPairTermStr(pairs))
}
// Test_Segment segments the 125-1.txt test article and writes the
// concatenated ToString() output of every segment, followed by ": <count>",
// to ../data/docsegment_test_125-1.log. The segment count is also printed to
// standard output. It makes no assertions on t.
func Test_Segment(t *testing.T) {
	text := util.ReadFile("../data/testdata/125-1.txt")
	signSet := dict.NewSign("../data/dictionary/sign.txt")
	lexicon := dict.NewDictionary("../data/dictionary/sogoudictionary.txt")

	segs := segment.SegmentDoc(text, signSet, lexicon)
	count := len(segs)
	fmt.Println(count)

	var dump string
	for _, s := range segs {
		dump = dump + s.ToString()
	}
	// Trailer recording how many segments were written.
	dump = dump + fmt.Sprintf(": %d", count)

	util.WriteFile("../data/docsegment_test_125-1.log", dump)
}
示例#6
0
// writeOutput writes pairTerms as CSV next to the input file.
//
// The output name is file.filename with its extension replaced by ".csv",
// placed in file.folder. Each pair term becomes one "key,frequency,score"
// line. The final path is printed to standard output.
func writeOutput(file *FilePath, pairTerms []*term.PairTerm) {
	// Strip only the trailing extension. Searching for the first occurrence
	// of the extension (as strings.Index does) truncates names such as
	// "a.txt.b.txt" too early and yields an empty base name when the file
	// has no extension at all.
	ext := filepath.Ext(file.filename)
	base := strings.TrimSuffix(file.filename, ext)
	outfile := filepath.Join(file.folder, base+".csv")

	// Accumulate rows in a builder to avoid quadratic string concatenation.
	var rows strings.Builder
	for _, pt := range pairTerms {
		fmt.Fprintf(&rows, "%s,%d,%f\n", pt.GetKey(), pt.GetFrequency(), pt.GetScore())
	}

	util.WriteFile(outfile, rows.String())
	fmt.Println("Store the word in: ", outfile)
}
// writeOutput writes the pair terms whose frequency is strictly greater than
// outputFreq as CSV next to the input file.
//
// The output name is file.filename with its extension replaced by ".csv",
// placed in file.folder. The content starts with a header row, followed by
// one "key,frequency,score" line per retained pair term. Before writing, the
// text is converted with a mahonia encoder created for the package-level
// Encoding value. The final path is reported through the standard logger.
func writeOutput(file *FilePath, pairTerms []*term.PairTerm, outputFreq int) {
	// Strip only the trailing extension. Searching for the first occurrence
	// of the extension (as strings.Index does) truncates names such as
	// "a.txt.b.txt" too early and yields an empty base name when the file
	// has no extension at all.
	ext := filepath.Ext(file.filename)
	base := strings.TrimSuffix(file.filename, ext)
	outfile := filepath.Join(file.folder, base+".csv")

	// Accumulate rows in a builder to avoid quadratic string concatenation.
	var rows strings.Builder
	rows.WriteString("短语,频率,分数\n")
	for _, pt := range pairTerms {
		if pt.GetFrequency() > outputFreq {
			fmt.Fprintf(&rows, "%s,%d,%f\n", pt.GetKey(), pt.GetFrequency(), pt.GetScore())
		}
	}

	encoder := mahonia.NewEncoder(Encoding)
	util.WriteFile(outfile, encoder.ConvertString(rows.String()))
	log.Printf("短语存储到文件: %s", outfile)
}