func Test_MPSegment(t *testing.T) { d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") text := util.ReadFile("../data/testdata/125-1.txt") out := segment.SegmentSentence_MP(text, d) fmt.Println(out) }
func Test_SegmentSentenceMP(t *testing.T) { d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") text := util.ReadFile("../data/testdata/125-1.txt") segments := segment.SegmentSentenceMP([]rune(text), 0, d) //str := segment.Output(segments) fmt.Println(len(segments)) }
func mainMerge() { var freqDoor int var scoreDoor float32 freqDoor = 6 scoreDoor = 0.010 sign := dict.NewSign("../data/dictionary/sign.txt") stop := dict.NewSign("../data/dictionary/stopwords.txt") d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") article := util.ReadFile("../data/testdata/125.txt") allsegs := segment.SegmentDoc(article, sign, d) fmt.Println(len(allsegs)) str := segment.GetSegmentStr(allsegs) util.WriteFile("../data/test-125.log", str) allsegs = occurrence.FilterSegment(allsegs, stop) //occur := occurrence.NewOccurrence() //occur.AddSegments(allsegs, stop) //occur.Compute() //occur.Output() pairTerms := occurrence.Merge(allsegs, freqDoor, scoreDoor) str = term.GetPairTermStr(pairTerms) util.WriteFile("../data/main-test-125-merge-merge.log", str) }
func Test_SplitDocument(t *testing.T) { article := util.ReadFile("../data/testdata/125-1.txt") //fmt.Println(article) article = segment.DeleteSpaceChar(article) sign := dict.NewSign("../data/dictionary/sign.txt") sentences := segment.SplitSentence([]rune(article), sign) d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") fmt.Println("Start====") allsegs := make([]*segment.Segment, 0) for _, sentence := range sentences { segments := segment.SegmentSentenceMP(sentence.Buffer(), sentence.Start(), d) //fmt.Println(len(segments)) //fmt.Println(segments) //str := "" //for _, seg := range segments { // str += seg.ToString() //} //fmt.Println(sentence.Start(), str) allsegs = append(allsegs, segments...) } fmt.Println(len(allsegs)) }
func (d *Document) load(filename string) { article := util.ReadFile(filename) article = DeleteSpaceChar(article) d.filename = filename d.buf = []rune(article) }
func Test_FMMSegement(t *testing.T) { seg := segment.NewFMMSegment("../data/dictionary/CoreNatureDictionary.mini.txt") content := util.ReadFile("../data/testdata/125-1.txt") words := seg.Segment(content) for _, w := range words { fmt.Println(w) } }
func handlePath(root string) { ws := NewWordSetting() files := getFilePath(root) for _, f := range files { fullfilepath := filepath.Join(f.folder, f.filename) fmt.Println("Handle the file: ", fullfilepath) content := util.ReadFile(fullfilepath) pairTerm := getWords(content, ws) writeOutput(f, pairTerm) } }
func Test_SplitSentence(t *testing.T) { article := util.ReadFile("../data/testdata/125-1.txt") //fmt.Println(article) article = segment.DeleteSpaceChar(article) d := dict.NewSign("../data/dictionary/sign.txt") sentences := segment.SplitSentence([]rune(article), d) fmt.Println(len(sentences)) //for _, s := range sentences { // fmt.Println(s.ToString()) //} }
func Test_DNASegment(t *testing.T) { s := segment.NewDNASegment() //s.InitDict("../data/dictionary/dictionary.txt") s.InitDict("../data/dictionary/sogoudictionary.txt") text := util.ReadFile("../data/testdata/125-1.txt") fmt.Println(len(text)) segs := s.MPSeg(text) fmt.Println(len(segs)) fmt.Println(segs) fmt.Println(306234192.0 / 301869396788.0) }
func Test_Merge(t *testing.T) { sign := dict.NewSign("../data/dictionary/sign.txt") stop := dict.NewSign("../data/dictionary/stopwords.txt") d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") article := util.ReadFile("../data/testdata/125-2.txt") allsegs := segment.SegmentDoc(article, sign, d) allsegs = occurrence.FilterSegment(allsegs, stop) pairTerms := occurrence.Merge(allsegs, 4, 15.0) str := term.GetPairTermStr(pairTerms) util.WriteFile("../data/test-125-2-merge-merge.log", str) }
func handlePath(root string) { ws := NewWordSetting() decoder := mahonia.NewDecoder(Encoding) files := getFilePath(root) for _, f := range files { fullfilepath := filepath.Join(f.folder, f.filename) log.Printf("正在处理文件: %s", fullfilepath) content := util.ReadFile(fullfilepath) //if ret, ok := decoder.ConvertStringOK(content); ok { // content = ret //} content = decoder.ConvertString(content) pairTerm := getWords(content, ws) writeOutput(f, pairTerm, ws.outputFreq) } }
func Test_Segment(t *testing.T) { article := util.ReadFile("../data/testdata/125-1.txt") sign := dict.NewSign("../data/dictionary/sign.txt") //sentences := SplitSentence([]rune(text), sign) d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") allsegs := segment.SegmentDoc(article, sign, d) fmt.Println(len(allsegs)) final := "" for _, seg := range allsegs { final += seg.ToString() } final += fmt.Sprintf(": %d", len(allsegs)) util.WriteFile("../data/docsegment_test_125-1.log", final) }
func Test_SegmentMerger(t *testing.T) { sign := dict.NewSign("../data/dictionary/sign.txt") stop := dict.NewSign("../data/dictionary/stopwords.txt") d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") article := util.ReadFile("../data/testdata/125-2.txt") allsegs := segment.SegmentDoc(article, sign, d) allsegs = occurrence.FilterSegment(allsegs, stop) fmt.Println(len(allsegs)) str := segment.GetSegmentStr(allsegs) util.WriteFile("../data/test-125-2.log", str) occur := occurrence.NewOccurrence() occur.AddSegments(allsegs, 3) occur.Compute() occur.Output() pairTerms := occur.GetPairTerms(10.0) str = term.GetPairTermStr(pairTerms) util.WriteFile("../data/test-125-2-occur.log", str) newSegments := occurrence.MergeSegment(allsegs, pairTerms) fmt.Println(len(newSegments)) str = segment.GetSegmentStr(newSegments) //fmt.Println(str) util.WriteFile("../data/test-125-2-merge.log", str) occur1 := occurrence.NewOccurrence() occur1.AddSegments(newSegments, 1) occur1.Compute() occur1.Output() pairTerms = occur1.GetPairTerms(10.0) str = term.GetPairTermStr(pairTerms) util.WriteFile("../data/test-125-2-second-merge.log", str) }