// mainMerge runs the full extraction pipeline on one test article: segment the
// text, filter stopwords, then merge adjacent terms whose co-occurrence passes
// the frequency and score thresholds.
func mainMerge() {
	freqDoor := 6
	scoreDoor := float32(0.010)

	sign := dict.NewSign("../data/dictionary/sign.txt")
	stop := dict.NewSign("../data/dictionary/stopwords.txt")
	d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt")

	article := util.ReadFile("../data/testdata/125.txt")
	allsegs := segment.SegmentDoc(article, sign, d)
	fmt.Println(len(allsegs))

	str := segment.GetSegmentStr(allsegs)
	util.WriteFile("../data/test-125.log", str)

	allsegs = occurrence.FilterSegment(allsegs, stop)

	pairTerms := occurrence.Merge(allsegs, freqDoor, scoreDoor)
	str = term.GetPairTermStr(pairTerms)
	util.WriteFile("../data/main-test-125-merge-merge.log", str)
}
// LogSegments dumps each segment's start offset, end offset and text to
// ../data/segment.log for manual inspection.
func LogSegments(segments []*segment.Segment) {
	format := "%d %d %s\t"
	outBuf := bytes.NewBufferString("Output words: \n")
	for _, v := range segments {
		str := fmt.Sprintf(format, v.Start(), v.End(), v.Text())
		outBuf.WriteString(str)
	}
	util.WriteFile("../data/segment.log", outBuf.String())
}
// Test_Merge125 checks the segment -> filter -> merge pipeline on article 125,
// writing the intermediate and final results to log files for inspection.
func Test_Merge125(t *testing.T) {
	sign := dict.NewSign("../data/dictionary/sign.txt")
	stop := dict.NewSign("../data/dictionary/stopwords.txt")
	d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt")

	article := util.ReadFile("../data/testdata/125.txt")
	allsegs := segment.SegmentDoc(article, sign, d)
	str := segment.GetSegmentStr(allsegs)
	util.WriteFile("../data/test-125-segment-all.log", str)

	allsegs = occurrence.FilterSegment(allsegs, stop)
	str = segment.GetSegmentStr(allsegs)
	util.WriteFile("../data/test-125-segment-filter.log", str)

	pairTerms := occurrence.Merge(allsegs, 4, 15.0)
	str = term.GetPairTermStr(pairTerms)
	util.WriteFile("../data/test-125-merge-merge.log", str)
}
// Test_SegmentMerger runs two passes of co-occurrence analysis: the first pass
// scores co-occurring pairs, MergeSegment folds the discovered pairs back into
// the segment stream, and the second pass re-scores pairs over the merged segments.
func Test_SegmentMerger(t *testing.T) {
	sign := dict.NewSign("../data/dictionary/sign.txt")
	stop := dict.NewSign("../data/dictionary/stopwords.txt")
	d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt")

	article := util.ReadFile("../data/testdata/125-2.txt")
	allsegs := segment.SegmentDoc(article, sign, d)
	allsegs = occurrence.FilterSegment(allsegs, stop)
	fmt.Println(len(allsegs))
	str := segment.GetSegmentStr(allsegs)
	util.WriteFile("../data/test-125-2.log", str)

	// First pass: score co-occurring pairs over the filtered segments.
	occur := occurrence.NewOccurrence()
	occur.AddSegments(allsegs, 3)
	occur.Compute()
	occur.Output()
	pairTerms := occur.GetPairTerms(10.0)
	str = term.GetPairTermStr(pairTerms)
	util.WriteFile("../data/test-125-2-occur.log", str)

	// Merge the discovered pairs back into the segment stream.
	newSegments := occurrence.MergeSegment(allsegs, pairTerms)
	fmt.Println(len(newSegments))
	str = segment.GetSegmentStr(newSegments)
	util.WriteFile("../data/test-125-2-merge.log", str)

	// Second pass: re-score pairs over the merged segments.
	occur1 := occurrence.NewOccurrence()
	occur1.AddSegments(newSegments, 1)
	occur1.Compute()
	occur1.Output()
	pairTerms = occur1.GetPairTerms(10.0)
	str = term.GetPairTermStr(pairTerms)
	util.WriteFile("../data/test-125-2-second-merge.log", str)
}
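// Illustrative sketch (not part of the original tests): the two-pass merge in
// Test_SegmentMerger factored into a helper, using only calls that appear in
// the test above. The parameters mirror the test's values and the helper name
// is hypothetical; the real project may structure this differently.
func mergeTwice(allsegs []*segment.Segment) []*term.PairTerm {
	// First pass over the filtered segments.
	occur := occurrence.NewOccurrence()
	occur.AddSegments(allsegs, 3)
	occur.Compute()
	pairTerms := occur.GetPairTerms(10.0)

	// Fold the discovered pairs back into the segment stream.
	merged := occurrence.MergeSegment(allsegs, pairTerms)

	// Second pass over the merged segments.
	second := occurrence.NewOccurrence()
	second.AddSegments(merged, 1)
	second.Compute()
	return second.GetPairTerms(10.0)
}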
// Test_Segment segments a single article and writes the concatenated segments,
// followed by the segment count, to a log file.
func Test_Segment(t *testing.T) {
	article := util.ReadFile("../data/testdata/125-1.txt")
	sign := dict.NewSign("../data/dictionary/sign.txt")
	d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt")

	allsegs := segment.SegmentDoc(article, sign, d)
	fmt.Println(len(allsegs))

	final := ""
	for _, seg := range allsegs {
		final += seg.ToString()
	}
	final += fmt.Sprintf(": %d", len(allsegs))
	util.WriteFile("../data/docsegment_test_125-1.log", final)
}
// writeOutput writes the pair terms as a CSV (key, frequency, score) next to
// the input file, replacing its extension with .csv.
func writeOutput(file *FilePath, pairTerms []*term.PairTerm) {
	ext := filepath.Ext(file.filename)
	base := strings.TrimSuffix(file.filename, ext)
	outfile := filepath.Join(file.folder, base+".csv")

	format := "%s,%d,%f\n"
	str := ""
	for _, pt := range pairTerms {
		str += fmt.Sprintf(format, pt.GetKey(), pt.GetFrequency(), pt.GetScore())
	}
	util.WriteFile(outfile, str)
	fmt.Println("Store the words in: ", outfile)
}
// writeOutput writes the pair terms whose frequency exceeds outputFreq as a CSV
// (phrase, frequency, score) next to the input file, converting the text to the
// configured output encoding before writing.
func writeOutput(file *FilePath, pairTerms []*term.PairTerm, outputFreq int) {
	ext := filepath.Ext(file.filename)
	base := strings.TrimSuffix(file.filename, ext)
	outfile := filepath.Join(file.folder, base+".csv")

	format := "%s,%d,%f\n"
	str := "短语,频率,分数\n" // CSV header: phrase, frequency, score
	for _, pt := range pairTerms {
		if pt.GetFrequency() > outputFreq {
			str += fmt.Sprintf(format, pt.GetKey(), pt.GetFrequency(), pt.GetScore())
		}
	}

	// Convert the UTF-8 string to the configured output encoding (e.g. GBK)
	// before writing, so the CSV opens correctly in legacy viewers.
	encoder := mahonia.NewEncoder(Encoding)
	str = encoder.ConvertString(str)
	util.WriteFile(outfile, str)
	log.Printf("短语存储到文件: %s", outfile) // "phrases stored to file: %s"
}
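// Illustrative sketch (not from the original sources): one way the CSV writer
// above might be invoked from this package. It assumes FilePath is a plain
// struct whose folder and filename fields are exactly those referenced in
// writeOutput; the real construction of FilePath may differ.
func writeOutputExample(pairTerms []*term.PairTerm) {
	file := &FilePath{
		folder:   "../data/testdata", // directory the .csv is written into
		filename: "125.txt",          // extension is replaced with .csv
	}
	// Keep only phrases that occur more than 5 times.
	writeOutput(file, pairTerms, 5)
}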