func mainMerge() { var freqDoor int var scoreDoor float32 freqDoor = 6 scoreDoor = 0.010 sign := dict.NewSign("../data/dictionary/sign.txt") stop := dict.NewSign("../data/dictionary/stopwords.txt") d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") article := util.ReadFile("../data/testdata/125.txt") allsegs := segment.SegmentDoc(article, sign, d) fmt.Println(len(allsegs)) str := segment.GetSegmentStr(allsegs) util.WriteFile("../data/test-125.log", str) allsegs = occurrence.FilterSegment(allsegs, stop) //occur := occurrence.NewOccurrence() //occur.AddSegments(allsegs, stop) //occur.Compute() //occur.Output() pairTerms := occurrence.Merge(allsegs, freqDoor, scoreDoor) str = term.GetPairTermStr(pairTerms) util.WriteFile("../data/main-test-125-merge-merge.log", str) }
func Test_Merge(t *testing.T) { sign := dict.NewSign("../data/dictionary/sign.txt") stop := dict.NewSign("../data/dictionary/stopwords.txt") d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") article := util.ReadFile("../data/testdata/125-2.txt") allsegs := segment.SegmentDoc(article, sign, d) allsegs = occurrence.FilterSegment(allsegs, stop) pairTerms := occurrence.Merge(allsegs, 4, 15.0) str := term.GetPairTermStr(pairTerms) util.WriteFile("../data/test-125-2-merge-merge.log", str) }
func NewWordSetting() *WordSetting { sign := dict.NewSign("../data/dictionary/sign.txt") stop := dict.NewSign("../data/dictionary/stopwords.txt") d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") s := &WordSetting{ freqDoor: 6, scoreDoor: 0.01, signDict: sign, stopDict: stop, wordDict: d, } return s }
func Test_Stop(t *testing.T) { s := dict.NewSign("../data/dictionary/stopwords.txt") word := "out of" ret := s.IsContain(word) fmt.Println(ret) }
func Test_SplitDocument(t *testing.T) { article := util.ReadFile("../data/testdata/125-1.txt") //fmt.Println(article) article = segment.DeleteSpaceChar(article) sign := dict.NewSign("../data/dictionary/sign.txt") sentences := segment.SplitSentence([]rune(article), sign) d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") fmt.Println("Start====") allsegs := make([]*segment.Segment, 0) for _, sentence := range sentences { segments := segment.SegmentSentenceMP(sentence.Buffer(), sentence.Start(), d) //fmt.Println(len(segments)) //fmt.Println(segments) //str := "" //for _, seg := range segments { // str += seg.ToString() //} //fmt.Println(sentence.Start(), str) allsegs = append(allsegs, segments...) } fmt.Println(len(allsegs)) }
func NewWordSetting() *WordSetting { sign := dict.NewSign(SignDictionary) stop := dict.NewSign(StopDictionary) d := dict.NewDictionary(NormalDictionary) config := LoadConfig() s := &WordSetting{ freqDoor: config.FreqDoor, scoreDoor: config.ScoreDoor, outputFreq: config.OutputFreq, signDict: sign, stopDict: stop, wordDict: d, } return s }
func Test_Sign(t *testing.T) { s := dict.NewSign("../data/dictionary/sign.txt") end := "。" fmt.Println(len([]rune(end))) ret := s.IsContain(end) fmt.Println(ret) ret = s.IsContain(".") fmt.Println(ret) }
func Test_SplitSentence(t *testing.T) { article := util.ReadFile("../data/testdata/125-1.txt") //fmt.Println(article) article = segment.DeleteSpaceChar(article) d := dict.NewSign("../data/dictionary/sign.txt") sentences := segment.SplitSentence([]rune(article), d) fmt.Println(len(sentences)) //for _, s := range sentences { // fmt.Println(s.ToString()) //} }
func Test_SegmentMerger(t *testing.T) { sign := dict.NewSign("../data/dictionary/sign.txt") stop := dict.NewSign("../data/dictionary/stopwords.txt") d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") article := util.ReadFile("../data/testdata/125-2.txt") allsegs := segment.SegmentDoc(article, sign, d) allsegs = occurrence.FilterSegment(allsegs, stop) fmt.Println(len(allsegs)) str := segment.GetSegmentStr(allsegs) util.WriteFile("../data/test-125-2.log", str) occur := occurrence.NewOccurrence() occur.AddSegments(allsegs, 3) occur.Compute() occur.Output() pairTerms := occur.GetPairTerms(10.0) str = term.GetPairTermStr(pairTerms) util.WriteFile("../data/test-125-2-occur.log", str) newSegments := occurrence.MergeSegment(allsegs, pairTerms) fmt.Println(len(newSegments)) str = segment.GetSegmentStr(newSegments) //fmt.Println(str) util.WriteFile("../data/test-125-2-merge.log", str) occur1 := occurrence.NewOccurrence() occur1.AddSegments(newSegments, 1) occur1.Compute() occur1.Output() pairTerms = occur1.GetPairTerms(10.0) str = term.GetPairTermStr(pairTerms) util.WriteFile("../data/test-125-2-second-merge.log", str) }
func Test_Segment(t *testing.T) { article := util.ReadFile("../data/testdata/125-1.txt") sign := dict.NewSign("../data/dictionary/sign.txt") //sentences := SplitSentence([]rune(text), sign) d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") allsegs := segment.SegmentDoc(article, sign, d) fmt.Println(len(allsegs)) final := "" for _, seg := range allsegs { final += seg.ToString() } final += fmt.Sprintf(": %d", len(allsegs)) util.WriteFile("../data/docsegment_test_125-1.log", final) }
func Test_Occurrence_Compute(t *testing.T) { //var segmenter sego.Segmenter //segmenter.LoadDictionary("C:/Go/thirdpartlib/src/github.com/huichen/sego/data/dictionary.txt") filename := "../data/testdata/125-2.txt" buf, err := ioutil.ReadFile(filename) if err != nil { fmt.Println(err) panic(err) } sign := dict.NewSign("../data/dictionary/sign.txt") d := dict.NewDictionary("../data/dictionary/sogoudictionary.txt") //stop := dict.NewSign("../data/dictionary/stopwords.txt.txt") //segments := segmenter.Segment(buf) segments := segment.SegmentDoc(string(buf), sign, d) fmt.Println(len(segments)) LogSegments(segments) occur := occurrence.NewOccurrence() occur.AddSegments(segments, 3) occur.Compute(1) occur.Output() }