func NewLocutionStatus() *LocutionStatus {
	return &LocutionStatus{
		accMW:      set.New(),
		longestMW:  set.New(),
		mwAnalysis: list.New(),
		components: make([]*Word, 0),
	}
}
func (this *MITIE) Process(body string) *list.List {
	tokens := C.mitie_tokenize(C.CString(body))
	defer C.mitie_free(unsafe.Pointer(tokens))
	dets := C.mitie_extract_entities(this.ner, tokens)
	defer C.mitie_free(unsafe.Pointer(dets))
	num_dets := C.mitie_ner_get_num_detections(dets)
	duplicates := set.New()
	entities := list.New()
	for i := 0; i < int(num_dets); i++ {
		centity := C.get_entity(tokens, dets, C.ulong(i))
		model := C.GoString(centity.model)
		score := float64(centity.score)
		value := C.GoString(centity.value)
		// Skip duplicate value/model pairs.
		key := fmt.Sprintf("%s:%s", value, model)
		if duplicates.Has(key) {
			continue
		}
		duplicates.Add(key)
		// Keep only reasonably confident detections.
		if score > 0.5 {
			entity := models.NewEntity(model, score, value)
			entities.PushBack(entity)
		}
	}
	return entities
}
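// Illustrative only: a minimal sketch of consuming the list returned by
// Process. The type assertion and the GetValue accessor mirror how Workflow
// iterates mitieEntities further below; the printEntities name itself is
// hypothetical.
func printEntities(m *MITIE, body string) {
	for e := m.Process(body).Front(); e != nil; e = e.Next() {
		entity := e.Value.(*models.Entity)
		fmt.Println(entity.GetValue())
	}
}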
func TestJobQueuePushWaitStress(t *testing.T) {
	RegisterTestingT(t)
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}
	pushed := set.New()
	popped := set.New()
	q := newJobListQueue()
	Ω(q).ShouldNot(BeNil())
	wg := sync.WaitGroup{}
	N := 50 * 1000
	wg.Add(N)
	for i := 0; i < N; i++ {
		pusher, item := i%2 == 0, i/2
		if pusher {
			go func(item int) {
				defer wg.Done()
				pushed.Add(item)
				q.Push(item)
			}(item)
		} else {
			go func() {
				defer wg.Done()
				item, ok := q.WaitForJob()
				Ω(ok).ShouldNot(BeFalse())
				popped.Add(item)
			}()
		}
	}
	wg.Wait()
	unpopped := set.Difference(pushed, popped)
	Ω(set.IntSlice(unpopped)).Should(BeEmpty())
	unpushed := set.Difference(popped, pushed)
	Ω(set.IntSlice(unpushed)).Should(BeEmpty())
	Ω(set.IntSlice(pushed)).Should(HaveLen(N / 2))
	Ω(set.IntSlice(popped)).Should(HaveLen(N / 2))
}
func NewDisambiguator(disFile string) *Disambiguator {
	this := Disambiguator{
		wnids: make(map[string]*Synset),
		binds: make(map[string]*set.Set),
	}
	fileString, err := ioutil.ReadFile(disFile)
	if err != nil {
		LOG.Panic("Error loading file " + disFile)
	}
	lines := strings.Split(string(fileString), "\n")
	for _, line := range lines {
		if line == "" {
			continue
		}
		items := Split(line, "\t")
		sscope := items[0]
		scope := DOCUMENT_SCOPE
		if sscope == "d" {
			scope = DOCUMENT_SCOPE
		} else if sscope == "sd" {
			scope = SENTENCE_SCOPE
		} else if sscope == "sb" {
			scope = SENTENCE_BIND
		} else if sscope == "nd" {
			scope = ND_SCOPE
		}
		switch scope {
		case DOCUMENT_SCOPE, SENTENCE_SCOPE, ND_SCOPE:
			{
				lemma := items[1]
				wnid := items[2]
				pos, _ := strconv.ParseFloat(items[3], 64)
				neg, _ := strconv.ParseFloat(items[4], 64)
				domain := items[5][1:]
				score, _ := strconv.Atoi(items[6])
				gloss := items[7]
				syn := NewSynset(scope, lemma, wnid, pos, neg, domain, score, gloss)
				this.wnids[wnid] = syn
				break
			}
		case SENTENCE_BIND:
			{
				key := items[1][1:]
				for i := 2; i < len(items); i++ {
					if this.binds[key] == nil {
						this.binds[key] = set.New()
					}
					this.binds[key].Add(items[i][1:])
				}
			}
		}
	}
	return &this
}
func (this *Dictionary) ParseDictEntry(data string, lems *list.List) bool {
	// data holds space-separated lemma/tag pairs: "lemma1 tag1 lemma2 tag2 ...".
	aux := make(map[string]*set.Set)
	dataItems := Split(data, " ")
	sl := set.New()
	for i := 0; i < len(dataItems)-1; i = i + 2 {
		lemma := dataItems[i]
		sl.Add(lemma)
		if i == len(dataItems) {
			return false
		}
		tag := dataItems[i+1]
		l := aux[lemma]
		if l == nil {
			l = set.New()
			aux[lemma] = l
		}
		l.Add(tag)
	}
	ll := list.New()
	for _, l := range sl.List() {
		ll.PushBack(l.(string))
	}
	this.SortList(ll, this.lemmaPrefs)
	for k := ll.Front(); k != nil; k = k.Next() {
		l := aux[k.Value.(string)]
		lt := list.New()
		for _, s := range l.List() {
			lt.PushBack(s.(string))
		}
		this.SortList(lt, this.posPrefs)
		lems.PushBack(Pair{k.Value.(string), lt})
	}
	return true
}
func NewAnalysis(lemma string, tag string) *Analysis {
	return &Analysis{
		lemma:         lemma,
		tag:           tag,
		prob:          -1.0,
		distance:      -1.0,
		senses:        list.New(),
		retok:         list.New(),
		selectedKBest: set.New(),
	}
}
func TestQueueStress(t *testing.T) {
	RegisterTestingT(t)
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}
	addedJobs := set.New()
	processedJobs := set.New()
	q := New("stress", 100, func(j Job) {
		time.Sleep(3 * time.Millisecond)
		processedJobs.Add(j)
	})
	Ω(q.Start()).Should(BeNil())
	N := 5 * 1000
	wg := sync.WaitGroup{}
	wg.Add(N)
	for i := 0; i < N; i++ {
		go func(item int) {
			defer wg.Done()
			addedJobs.Add(item)
			Ω(q.Add(item)).Should(BeNil())
		}(i)
	}
	// Wait until all jobs have been added to the queue.
	wg.Wait()
	Ω(q.Stop(true)).Should(BeNil())
	unprocessed := set.Difference(addedJobs, processedJobs)
	Ω(set.IntSlice(unprocessed)).Should(BeEmpty())
	unadded := set.Difference(processedJobs, addedJobs)
	Ω(set.IntSlice(unadded)).Should(BeEmpty())
	Ω(set.IntSlice(addedJobs)).Should(HaveLen(N))
	Ω(set.IntSlice(processedJobs)).Should(HaveLen(N))
}
func (this *HMMTagger) FindStates(sent *Sentence) *list.List {
	st := set.New()
	ls := list.New()
	w2 := sent.Front()
	TRACE(3, "obtaining the states that may have emitted the initial word: "+w2.Value.(*Word).getForm(), MOD_HMM)
	for a2 := w2.Value.(*Word).selectedBegin(0).Element; a2 != nil; a2 = a2.Next() {
		st.Add(&Bigram{"0", this.Tags.GetShortTag(a2.Value.(*Analysis).getTag())})
	}
	ls.PushBack(st)
	for w1, w2 := w2, w2.Next(); w1 != nil && w2 != nil; w1, w2 = w2, w2.Next() {
		TRACE(3, "obtaining the states that may have emitted the word: "+w2.Value.(*Word).getForm(), MOD_HMM)
		st := set.New()
		for a1 := w1.Value.(*Word).selectedBegin(0).Element; a1 != nil; a1 = a1.Next() {
			for a2 := w2.Value.(*Word).selectedBegin(0).Element; a2 != nil; a2 = a2.Next() {
				st.Add(&Bigram{this.Tags.GetShortTag(a1.Value.(*Analysis).getTag()), this.Tags.GetShortTag(a2.Value.(*Analysis).getTag())})
			}
		}
		ls.PushBack(st)
	}
	return ls
}
func (this *Probability) guesser(w *Word, mass float64) float64 {
	form := w.getLCForm()
	sum := If(w.getNAnalysis() > 0, mass, 0.0).(float64)
	sum2 := 0.0
	TRACE(2, "Initial sum="+strconv.FormatFloat(sum, 'f', 3, 64), MOD_PROBABILITY)
	// Collect the short tags already present among the word's analyses.
	stags := set.New()
	for li := w.Front(); li != nil; li = li.Next() {
		stags.Add(this.Tags.GetShortTag(li.Value.(*Analysis).getTag()))
	}
	la := list.New()
	for k, v := range this.unkTags {
		TRACE(2, "   guesser checking tag "+k, MOD_PROBABILITY)
		hasit := stags.Has(this.Tags.GetShortTag(k))
		if !hasit {
			p := this.computeProbability(k, v, form)
			a := NewAnalysis(form, k)
			a.setProb(p)
			if p >= this.ProbabilityThreshold {
				sum += p
				w.addAnalysis(a)
				TRACE(2, "    added. sum is:"+strconv.FormatFloat(sum, 'f', 3, 64), MOD_PROBABILITY)
			} else {
				sum2 += p
				la.PushBack(a)
			}
		}
	}
	// If no analysis passed the threshold, fall back to the low-probability ones.
	if w.getNAnalysis() == 0 {
		w.setAnalysis(List2Array(la)...)
		sum = sum2
	}
	return sum
}
func (this *Affixes) GenerateRoots(kind int, suf *sufrule, rt string) *set.Set {
	cand := set.New()
	var term, r string
	var pe int
	cand.Clear()
	term = suf.term
	TRACE(3, "Possible terminations/beginnings: "+term, MOD_AFFIX)
	// term is a "|"-separated list of terminations/beginnings; "*" means empty.
	pe = strings.Index(term, "|")
	for pe > -1 {
		r = term[0:pe]
		if r == "*" {
			r = ""
		}
		if kind == SUF {
			TRACE(3, "Adding to t_roots the element "+rt+r, MOD_AFFIX)
			cand.Add(rt + r)
		} else if kind == PREF {
			TRACE(3, "Adding to t_roots the element "+r+rt, MOD_AFFIX)
			cand.Add(r + rt)
		}
		term = term[pe+1:]
		pe = strings.Index(term, "|")
	}
	if term == "*" {
		term = ""
	}
	if kind == SUF {
		TRACE(3, "Adding to t_roots the element "+rt+term, MOD_AFFIX)
		cand.Add(rt + term)
	} else if kind == PREF {
		TRACE(3, "Adding to t_roots the element "+term+rt, MOD_AFFIX)
		cand.Add(term + rt)
	}
	return cand
}
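// Illustrative only: for a suffix rule whose term field is "a|as|*" and the
// root "cant", GenerateRoots(SUF, rule, "cant") yields the candidate set
// {"canta", "cantas", "cant"}, since "*" stands for the empty termination.
// The concrete rule values here are made up for the example.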
func NewAnalysisFromAnalysis(a *Analysis) *Analysis {
	this := Analysis{
		lemma:         a.lemma,
		tag:           a.tag,
		prob:          a.prob,
		distance:      a.distance,
		senses:        list.New(),
		retok:         list.New(),
		selectedKBest: set.New(a.selectedKBest),
	}
	for s := a.senses.Front(); s != nil; s = s.Next() {
		this.senses.PushBack(s.Value.(*Senses))
	}
	for r := a.retok.Front(); r != nil; r = r.Next() {
		this.retok.PushBack(r.Value.(*Word))
	}
	return &this
}
func NewGrammar(fname string) *Grammar {
	this := Grammar{
		RulesMap:    make(RulesMap),
		nonterminal: set.New(),
		wild:        make(RulesMap),
		filemap:     make(RulesMap),
		prior:       make(map[string]int),
		hidden:      set.New(),
		flat:        set.New(),
		notop:       set.New(),
		onlytop:     set.New(),
	}
	MAX := 32
	var tok, stat, newstat, i, j int
	what := 0
	var trans [32][32]int
	first := false
	wildcard := false
	var head, err, categ, name string
	ls := list.New()
	var priorVal int
	// Finite-state transition table driving the grammar-file parser.
	for i = 0; i < MAX; i++ {
		for j = 0; j < MAX; j++ {
			trans[i][j] = 0
		}
	}
	trans[1][GRAMMAR_COMMENT] = 1
	trans[1][GRAMMAR_CATEGORY] = 2
	trans[1][GRAMMAR_PRIOR] = 6
	trans[1][GRAMMAR_START] = 8
	trans[1][GRAMMAR_HIDDEN] = 6
	trans[1][GRAMMAR_FLAT] = 6
	trans[1][GRAMMAR_NOTOP] = 6
	trans[1][GRAMMAR_ONLYTOP] = 6
	trans[2][GRAMMAR_ARROW] = 3
	trans[3][GRAMMAR_CATEGORY] = 4
	trans[3][GRAMMAR_HEAD] = 10
	trans[4][GRAMMAR_COMMA] = 3
	trans[4][GRAMMAR_BAR] = 3
	trans[4][GRAMMAR_DOT] = 1
	trans[4][GRAMMAR_LEMMA] = 5
	trans[4][GRAMMAR_FORM] = 5
	trans[4][GRAMMAR_FILENAME] = 5
	trans[5][GRAMMAR_COMMA] = 3
	trans[5][GRAMMAR_BAR] = 3
	trans[5][GRAMMAR_DOT] = 1
	trans[6][GRAMMAR_CATEGORY] = 7
	trans[7][GRAMMAR_CATEGORY] = 7
	trans[7][GRAMMAR_DOT] = 1
	trans[8][GRAMMAR_CATEGORY] = 9
	trans[9][GRAMMAR_DOT] = 1
	trans[10][GRAMMAR_CATEGORY] = 4
	// Lexer rules mapping regular expressions to grammar tokens.
	rules := make([]*Pair, 0)
	rules = append(rules, &Pair{regexp.MustCompile("[ \\t\\n\\r]+"), 0})
	rules = append(rules, &Pair{regexp.MustCompile("%.*"), GRAMMAR_COMMENT})
	rules = append(rules, &Pair{regexp.MustCompile("==>"), GRAMMAR_ARROW})
	rules = append(rules, &Pair{regexp.MustCompile("\\([[:alpha:]_'·\\-]+\\)"), GRAMMAR_FORM})
	rules = append(rules, &Pair{regexp.MustCompile("<[[:lower:]_'·\\-]+>"), GRAMMAR_LEMMA})
	rules = append(rules, &Pair{regexp.MustCompile("\\(\\\"([A-Za-z]:)?[[:alnum:]_\\-\\./\\\\]+\\\"\\)"), GRAMMAR_FILENAME})
	rules = append(rules, &Pair{regexp.MustCompile("<\\\"([A-Za-z]:)?[[:alnum:]_\\-\\./\\\\]+\\\">"), GRAMMAR_FILENAME})
	rules = append(rules, &Pair{regexp.MustCompile("[A-Za-z][\\-A-Za-z0-9]*[*]?"), GRAMMAR_CATEGORY})
	rules = append(rules, &Pair{regexp.MustCompile("@PRIOR"), GRAMMAR_PRIOR})
	rules = append(rules, &Pair{regexp.MustCompile("@START"), GRAMMAR_START})
	rules = append(rules, &Pair{regexp.MustCompile("@HIDDEN"), GRAMMAR_HIDDEN})
	rules = append(rules, &Pair{regexp.MustCompile("@FLAT"), GRAMMAR_FLAT})
	rules = append(rules, &Pair{regexp.MustCompile("@NOTOP"), GRAMMAR_NOTOP})
	rules = append(rules, &Pair{regexp.MustCompile("@ONLYTOP"), GRAMMAR_ONLYTOP})
	rules = append(rules, &Pair{regexp.MustCompile("\\|"), GRAMMAR_BAR})
	rules = append(rules, &Pair{regexp.MustCompile("\\."), GRAMMAR_DOT})
	rules = append(rules, &Pair{regexp.MustCompile(","), GRAMMAR_COMMA})
	rules = append(rules, &Pair{regexp.MustCompile("\\+"), GRAMMAR_HEAD})
	fl := NewLexer(rules)
	filestr, e := ioutil.ReadFile(fname)
	if e != nil {
		CRASH("Error opening file "+fname, MOD_GRAMMAR)
	}
	gov := 0
	havegov := false
	stat = 1
	priorVal = 1
	stream := string(filestr)
	err = ""
	for {
		tok = fl.getToken(stream)
		if tok == -1 {
			break
		}
		newstat = trans[stat][tok]
		switch newstat {
		case 0:
			{
				if tok == GRAMMAR_COMMENT {
					err = "Unexpected comment. Missing dot ending previous rule/directive ?"
				}
				if err == "" {
					err = "Unexpected '" + fl.getText() + "' found."
				}
				LOG.Warnln("File "+fname+", line "+strconv.Itoa(fl.lineno())+":"+err, MOD_GRAMMAR)
				// Skip tokens until the end of the offending rule/directive.
				for tok > -1 && tok != GRAMMAR_DOT {
					tok = fl.getToken(stream)
				}
				newstat = 1
				break
			}
		case 1:
			{
				if tok == GRAMMAR_DOT && (stat == 4 || stat == 5) {
					ls.PushBack(categ)
					if !havegov {
						gov = GRAMMAR_DEFGOV
						if ls.Len() != 1 {
							err = "Non-unary rule with no governor. First component taken as governor."
							LOG.Warnln("File "+fname+", line "+strconv.Itoa(fl.lineno())+":"+err, MOD_GRAMMAR)
						}
					}
					this.newRule(head, ls, wildcard, gov)
					gov = GRAMMAR_NOGOV
					havegov = false
				}
				break
			}
		case 2:
			{
				head = fl.getText()
				this.nonterminal.Add(head)
				break
			}
		case 3:
			{
				if tok == GRAMMAR_ARROW {
					ls = list.New()
					first = true
					wildcard = false
				} else if tok == GRAMMAR_COMMA {
					ls.PushBack(categ)
				} else if tok == GRAMMAR_BAR {
					ls.PushBack(categ)
					if !havegov {
						gov = GRAMMAR_DEFGOV
						if ls.Len() != 1 {
							err = "Non-unary rule with no governor. First component taken as governor."
							LOG.Warnln("File "+fname+", line "+strconv.Itoa(fl.lineno())+":"+err, MOD_GRAMMAR)
						}
					}
					this.newRule(head, ls, wildcard, gov)
					gov = GRAMMAR_NOGOV
					havegov = false
					ls = list.New()
				}
				break
			}
		case 4:
			{
				categ = fl.getText()
				if first && strings.Index(categ, "*") > -1 {
					wildcard = true
				}
				first = false
				break
			}
		case 5:
			{
				name = fl.getText()
				categ = categ + name
				if tok == GRAMMAR_FILENAME {
					var sname string
					sname = name[2 : len(name)-2]
					sname = fname[0:strings.LastIndex(fname, "/")+1] + "/" + sname
					fs, e := ioutil.ReadFile(sname)
					if e != nil {
						LOG.Stackln("Error opening file " + sname)
					}
					var op, clo string
					if string(name[0]) == "<" {
						op = "<"
						clo = ">"
					} else if string(name[0]) == "(" {
						op = "("
						clo = ")"
					}
					lines := Split(string(fs), "\n")
					for _, line := range lines {
						lfm, ok := this.filemap[op+line+clo]
						if !ok {
							this.filemap[op+line+clo] = list.New()
							lfm = this.filemap[op+line+clo]
						}
						exists := false
						for l := lfm.Front(); l != nil && !exists; l = l.Next() {
							if l.Value.(string) == name {
								exists = true
								break
							}
						}
						if !exists {
							lfm.PushBack(name)
						}
					}
				}
				break
			}
		case 6:
			{
				what = tok
				break
			}
		case 7:
			{
				categ = fl.getText()
				if this.nonterminal.Has(categ) {
					switch what {
					case GRAMMAR_PRIOR:
						{
							_, ok := this.prior[categ]
							if !ok {
								this.prior[categ] = priorVal
								priorVal++
							}
							break
						}
					case GRAMMAR_HIDDEN:
						{
							this.hidden.Add(categ)
							break
						}
					case GRAMMAR_FLAT:
						{
							this.flat.Add(categ)
							break
						}
					case GRAMMAR_NOTOP:
						{
							this.notop.Add(categ)
							break
						}
					case GRAMMAR_ONLYTOP:
						{
							this.onlytop.Add(categ)
							break
						}
					default:
						break
					}
				} else {
					err = "Terminal symbol '" + fl.getText() + "' not allowed in directive."
					newstat = 0
				}
				break
			}
		case 8:
			{
				if this.start != "" {
					err = "@START specified more than once."
					newstat = 0
				}
				break
			}
		case 9:
			{
				this.start = fl.getText()
				if !this.nonterminal.Has(this.start) {
					this.nonterminal.Add(this.start)
				}
				break
			}
		case 10:
			{
				gov = ls.Len()
				havegov = true
				break
			}
		default:
			break
		}
		stat = newstat
	}
	if this.start == "" {
		err = "@START symbol not specified."
		LOG.Warnln("File " + fname + ", line " + strconv.Itoa(fl.lineno()) + ":" + err)
	}
	if this.hidden.Has(this.start) {
		err = "@START symbol cannot be @HIDDEN."
		LOG.Warnln("File " + fname + ", line " + strconv.Itoa(fl.lineno()) + ":" + err)
	}
	if this.notop.Has(this.start) {
		err = "@START symbol cannot be @NOTOP."
		LOG.Warnln("File " + fname + ", line " + strconv.Itoa(fl.lineno()) + ":" + err)
	}
	for _, x := range this.onlytop.List() {
		if this.hidden.Has(x.(string)) {
			err = "@HIDDEN directive for '" + (x.(string)) + "' overrides @ONLYTOP."
			LOG.Warnln("File " + fname + ", line " + strconv.Itoa(fl.lineno()) + ":" + err)
		}
	}
	/*
		for k, v := range this.filemap {
			//println("FILEMAP ===== ", k, " =====")
			for i := v.Front(); i != nil; i = i.Next() {
				//println(i.Value.(string))
			}
		}
		for k, v := range this.RulesMap {
			//println("===== " + k + " =====")
			for i := v.Front(); i != nil; i = i.Next() {
				//println(i.Value.(*Rule).getHead(), i.Value.(*Rule).getRight().Front().Value.(string))
			}
		}
		for k, v := range this.wild {
			//println("===== " + k + " =====")
			for i := v.Front(); i != nil; i = i.Next() {
				//println(i.Value.(*Rule).getHead(), i.Value.(*Rule).getRight().Front().Value.(string))
			}
		}
	*/
	TRACE(3, "Grammar loaded", MOD_GRAMMAR)
	return &this
}
func (this *NLPEngine) Workflow(document *models.DocumentEntity, output chan *models.DocumentEntity) {
	defer func() {
		if r := recover(); r != nil {
			err, _ := r.(error)
			if err != nil {
				output <- nil //err.Error()
			} else {
				output <- nil
			}
		}
	}()
	document.Init()
	tokens := list.New()
	url := document.Url
	content := document.Content
	// If only a URL is given, crawl it and fill in the document fields.
	if url != "" && content == "" {
		crawler := NewDefaultCrawler()
		article := crawler.Analyze(url)
		document.Title = article.Title
		document.Description = article.MetaDescription
		document.Keywords = article.MetaKeywords
		document.TopImage = article.TopImage
		document.Content = article.CleanedText
	}
	body := StringsAppend(document.Title, document.Description, document.Keywords, document.Content)
	// Tokenize and split the text into sentences.
	if this.tokenizer != nil {
		this.tokenizer.Tokenize(body, 0, tokens)
	}
	sentences := list.New()
	if this.splitter != nil {
		sid := this.splitter.OpenSession()
		this.splitter.Split(sid, tokens, true, sentences)
		this.splitter.CloseSession(sid)
	}
	// Run whichever per-sentence analyzers are configured.
	for ss := sentences.Front(); ss != nil; ss = ss.Next() {
		s := ss.Value.(*Sentence)
		if this.morfo != nil {
			this.morfo.Analyze(s)
		}
		if this.sense != nil {
			this.sense.Analyze(s)
		}
		if this.tagger != nil {
			this.tagger.Analyze(s)
		}
		if this.shallowParser != nil {
			this.shallowParser.Analyze(s)
		}
	}
	if this.dsb != nil {
		this.dsb.Analyze(sentences)
	}
	entities := make(map[string]int64)
	for ss := sentences.Front(); ss != nil; ss = ss.Next() {
		se := models.NewSentenceEntity()
		body := ""
		s := ss.Value.(*Sentence)
		for ww := s.Front(); ww != nil; ww = ww.Next() {
			w := ww.Value.(*Word)
			a := w.Front().Value.(*Analysis)
			te := models.NewTokenEntity(w.getForm(), a.getLemma(), a.getTag(), a.getProb())
			if a.getTag() == TAG_NP {
				entities[w.getForm()]++
			}
			body += w.getForm() + " "
			se.AddTokenEntity(te)
		}
		body = strings.Trim(body, " ")
		se.SetBody(body)
		se.SetSentence(s)
		document.AddSentenceEntity(se)
	}
	// Collect MITIE entities and report proper nouns MITIE did not recognize.
	tempEntities := set.New()
	mitieEntities := this.mitie.Process(body)
	for e := mitieEntities.Front(); e != nil; e = e.Next() {
		entity := e.Value.(*models.Entity)
		tempEntities.Add(entity.GetValue())
	}
	for name, frequency := range entities {
		name = strings.Replace(name, "_", " ", -1)
		if !tempEntities.Has(name) {
			document.AddUnknownEntity(name, frequency)
		}
	}
	document.Entities = mitieEntities
	output <- document
}
func NewNP(npFile string) *NP {
	this := NP{
		fun:            set.New(),
		punct:          set.New(),
		names:          set.New(),
		ignoreTags:     make(map[string]int),
		ignoreWords:    make(map[string]int),
		prefixes:       set.New(),
		suffixes:       set.New(),
		RENounAdj:      regexp.MustCompile(NP_RE_NA),
		REClosed:       regexp.MustCompile(NP_RE_CLO),
		REDateNumPunct: regexp.MustCompile(NP_RE_DNP),
	}
	this.NERModule = NewNERModule(npFile)
	this.final = set.New()
	cfg := NewConfigFile(false, "##")
	cfg.AddSection("Type", NP_NER_TYPE)
	cfg.AddSection("FunctionWords", NP_FUNCTION)
	cfg.AddSection("SpecialPunct", NP_SPECIAL)
	cfg.AddSection("Names", NP_NAMES)
	cfg.AddSection("Ignore", NP_NE_IGNORE)
	cfg.AddSection("RE_NounAdj", NP_REX_NOUNADJ)
	cfg.AddSection("RE_Closed", NP_REX_CLOSED)
	cfg.AddSection("RE_DateNumPunct", NP_REX_DATNUMPUNT)
	cfg.AddSection("Affixes", NP_AFFIXES)
	cfg.skipUnknownSections = true
	if !cfg.Open(npFile) {
		CRASH("Error opening file "+npFile, MOD_NER)
	}
	line := ""
	for cfg.GetContentLine(&line) {
		items := Split(line, " ")
		switch cfg.GetSection() {
		case NP_NER_TYPE:
			{
				if strings.ToLower(line) != "basic" {
					CRASH("Invalid configuration file for 'basic' NER, "+npFile, MOD_NER)
				}
				break
			}
		case NP_FUNCTION:
			{
				this.fun.Add(line)
				break
			}
		case NP_SPECIAL:
			{
				this.punct.Add(line)
				break
			}
		case NP_NAMES:
			{
				this.names.Add(line)
				break
			}
		case NP_NE_IGNORE:
			{
				key := items[0]
				tpe, _ := strconv.Atoi(items[1])
				if IsCapitalized(key) {
					this.ignoreTags[key] = tpe + 1
				} else {
					this.ignoreWords[key] = tpe + 1
				}
				break
			}
		case NP_REX_NOUNADJ:
			{
				this.RENounAdj = regexp.MustCompile(line)
				break
			}
		case NP_REX_CLOSED:
			{
				this.REClosed = regexp.MustCompile(line)
				break
			}
		case NP_REX_DATNUMPUNT:
			{
				this.REDateNumPunct = regexp.MustCompile(line)
				break
			}
		case NP_AFFIXES:
			{
				word := items[0]
				tpe := items[1]
				if tpe == "SUF" {
					this.suffixes.Add(word)
				} else if tpe == "PRE" {
					this.prefixes.Add(word)
				} else {
					WARNING("Ignored affix with unknown type '"+tpe+"' in file", MOD_NER)
				}
				break
			}
		}
	}
	// Automaton states and transitions for proper-noun recognition.
	this.initialState = NP_ST_IN
	this.stopState = NP_ST_STOP
	this.final.Add(NP_ST_NP)
	this.final.Add(NP_ST_SUF)
	var s, t int
	for s = 0; s < AUTOMAT_MAX_STATES; s++ {
		for t = 0; t < AUTOMAT_MAX_TOKENS; t++ {
			this.trans[s][t] = NP_ST_STOP
		}
	}
	this.trans[NP_ST_IN][NP_TK_sUnkUpp] = NP_ST_NP
	this.trans[NP_ST_IN][NP_TK_sNounUpp] = NP_ST_NP
	this.trans[NP_ST_IN][NP_TK_mUpper] = NP_ST_NP
	this.trans[NP_ST_IN][NP_TK_mPref] = NP_ST_PREF
	this.trans[NP_ST_PREF][NP_TK_mPref] = NP_ST_PREF
	this.trans[NP_ST_PREF][NP_TK_mUpper] = NP_ST_NP
	this.trans[NP_ST_NP][NP_TK_mUpper] = NP_ST_NP
	this.trans[NP_ST_NP][NP_TK_mFun] = NP_ST_FUN
	this.trans[NP_ST_NP][NP_TK_mSuf] = NP_ST_SUF
	this.trans[NP_ST_FUN][NP_TK_mUpper] = NP_ST_NP
	this.trans[NP_ST_FUN][NP_TK_mFun] = NP_ST_FUN
	this.trans[NP_ST_SUF][NP_TK_mSuf] = NP_ST_SUF
	LOG.Trace("analyzer successfully created")
	return &this
}
func NewAffixes(sufFile string) *Affixes {
	this := Affixes{}
	filestr, err := ioutil.ReadFile(sufFile)
	if err != nil {
		CRASH("Error opening file "+sufFile, MOD_AFFIX)
		return nil
	}
	lines := strings.Split(string(filestr), "\n")
	this.Longest[SUF] = 0
	this.Longest[PREF] = 0
	kind := -1
	for _, line := range lines {
		if line != "" && !strings.HasPrefix(line, "#") {
			items := Split(line, "\t")
			if line == "<Suffixes>" {
				kind = SUF
			} else if line == "<Prefixes>" {
				kind = PREF
			} else if line == "</Suffixes>" {
				kind = -1
			} else if line == "</Prefixes>" {
				kind = -1
			} else if kind == SUF || kind == PREF {
				// Tab-separated rule fields, in the order read below:
				// key, termination, condition, output tag, acc, enc, nomore,
				// lemma spec, always flag, retokenization spec.
				key := items[0]
				term := items[1]
				cond := items[2]
				output := items[3]
				acc := items[4]
				enc := items[5]
				nomore := items[6]
				lema := items[7]
				always := items[8]
				retok := items[9]
				suf := NewSufRuleFromRexEx(cond)
				suf.term = term
				suf.output = output
				suf.acc, _ = strconv.Atoi(acc)
				suf.enc, _ = strconv.Atoi(enc)
				suf.nomore, _ = strconv.Atoi(nomore)
				suf.lema = lema
				suf.always, _ = strconv.Atoi(always)
				suf.retok = retok
				if suf.retok == "-" {
					suf.retok = ""
				}
				if this.affix[kind] == nil {
					this.affix[kind] = make(map[string]*set.Set)
				}
				if this.affix[kind][key] == nil {
					this.affix[kind][key] = set.New()
				}
				this.affix[kind][key].Add(suf)
				if suf.always == 1 {
					if this.affixAlways[kind] == nil {
						this.affixAlways[kind] = make(map[string]*set.Set)
					}
					if this.affixAlways[kind][key] == nil {
						this.affixAlways[kind][key] = set.New()
					}
					this.affixAlways[kind][key].Add(suf)
				}
				if this.ExistingLength[kind] == nil {
					this.ExistingLength[kind] = set.New()
				}
				this.ExistingLength[kind].Add(len(key))
				if len(key) > this.Longest[kind] {
					this.Longest[kind] = len(key)
				}
			}
		}
	}
	TRACE(3, "analyzer successfully created", MOD_AFFIX)
	return &this
}
func NewSemanticDB(wsdFile string) *SemanticDB {
	this := SemanticDB{
		posMap: list.New(),
	}
	var formFile, dictFile, wnFile string
	path := wsdFile[0:strings.LastIndex(wsdFile, "/")]
	posset := set.New()
	cfg := NewConfigFile(true, "")
	cfg.AddSection("WNposMap", SEMDB_WN_POS_MAP)
	cfg.AddSection("DataFiles", SEMDB_DATA_FILES)
	if !cfg.Open(wsdFile) {
		LOG.Panic("Error opening configuration file " + wsdFile)
	}
	line := ""
	for cfg.GetContentLine(&line) {
		items := Split(line, " ")
		switch cfg.GetSection() {
		case SEMDB_WN_POS_MAP:
			{
				r := PosMapRule{}
				r.pos = items[0]
				r.wnpos = items[1]
				r.lemma = items[2]
				this.posMap.PushBack(r)
				if r.lemma != "L" && r.lemma != "F" {
					posset.Add(r.lemma)
				}
				break
			}
		case SEMDB_DATA_FILES:
			{
				key := items[0]
				fname := items[1]
				if key == "formDictFile" {
					formFile = path + "/" + strings.Replace(fname, "./", "", -1)
				} else if key == "senseDictFile" {
					dictFile = path + "/" + strings.Replace(fname, "./", "", -1)
				} else if key == "wnFile" {
					wnFile = path + "/" + strings.Replace(fname, "./", "", -1)
				}
				break
			}
		default:
			break
		}
	}
	if formFile == "" || posset.Size() == 0 {
		this.formDict = nil
	} else {
		fileString, err := ioutil.ReadFile(formFile)
		if err != nil {
			LOG.Panic("Error loading file " + formFile)
		}
		lines := strings.Split(string(fileString), "\n")
		this.formDict = NewDatabase(DB_MAP)
		for _, line := range lines {
			items := Split(line, " ")
			form := items[0]
			for i := 1; i < len(items); i = i + 2 {
				lemma := items[i]
				tag := items[i+1]
				if posset.Has(tag) {
					this.formDict.addDatabase(lemma+" "+tag, form)
				}
			}
		}
	}
	if dictFile == "" {
		this.senseDB = nil
	} else {
		fileString, err := ioutil.ReadFile(dictFile)
		if err != nil {
			LOG.Panic("Error loading file " + dictFile)
		}
		lines := strings.Split(string(fileString), "\n")
		this.senseDB = NewDatabase(DB_MAP)
		for _, line := range lines {
			items := Split(line, " ")
			sens := items[0]
			tag := sens[strings.Index(sens, "-")+1:]
			for i := 1; i < len(items); i++ {
				wd := items[i]
				this.senseDB.addDatabase("S:"+sens, wd)
				this.senseDB.addDatabase("W:"+wd+":"+tag, sens)
			}
		}
	}
	if wnFile == "" {
		this.wndb = nil
	} else {
		this.wndb = NewDatabaseFromFile(wnFile)
	}
	return &this
}
func (this *Affixes) lookForCombinedAffixes(suff map[string]*set.Set, pref map[string]*set.Set, w *Word, dic *Dictionary) {
	var i, j, ln int
	var lws, formSuf, formPref, formRoot string
	lws = w.getLCForm()
	ln = len(lws)
	var rulesS *set.Set
	var rulesP *set.Set
	var candidates, cand1 *set.Set
	for i = 1; i <= this.Longest[SUF] && i < ln; i++ {
		if this.ExistingLength[SUF].Has(i) == false {
			TRACE(4, "No suffixes of size "+strconv.Itoa(i), MOD_AFFIX)
			continue
		}
		for j = 1; j <= this.Longest[PREF] && j <= ln-i; j++ {
			if this.ExistingLength[PREF].Has(j) == false {
				TRACE(4, "No prefixes of size "+strconv.Itoa(j), MOD_AFFIX)
				continue
			}
			formSuf = lws[ln-i:]
			formPref = lws[0:j]
			rulesS = suff[formSuf]
			if rulesS == nil || rulesS.Size() == 0 {
				TRACE(3, "No rules for suffix "+formSuf+" (size "+strconv.Itoa(i)+")", MOD_AFFIX)
				continue
			}
			rulesP = pref[formPref]
			if rulesP == nil || rulesP.Size() == 0 {
				TRACE(3, "No rules for prefix "+formPref+" (size "+strconv.Itoa(j)+")", MOD_AFFIX)
				continue
			}
			formRoot = lws[0 : ln-i][j:]
			TRACE(3, "Trying a decomposition: "+formPref+"+"+formRoot+"+"+formSuf, MOD_AFFIX)
			TRACE(3, "Found "+strconv.Itoa(rulesS.Size())+" rules for suffix "+formSuf+" (size "+strconv.Itoa(i)+")", MOD_AFFIX)
			TRACE(3, "Found "+strconv.Itoa(rulesP.Size())+" rules for prefix "+formPref+" (size "+strconv.Itoa(j)+")", MOD_AFFIX)
			//wfid := w.foundInDict()
			lrulesS := rulesS.List()
			lrulesP := rulesP.List()
			for s := 0; s < rulesS.Size(); s++ {
				sufit := lrulesS[s].(*sufrule)
				for p := 0; p < rulesP.Size(); p++ {
					prefit := lrulesP[p].(*sufrule)
					// Generate candidate roots by undoing the suffix, then the prefix.
					candidates = set.New()
					cand1 = this.GenerateRoots(SUF, sufit, formRoot)
					this.accen.FixAccentutation(cand1, sufit)
					lcand1 := cand1.List()
					for _, c := range lcand1 {
						cand2 := this.GenerateRoots(PREF, prefit, c.(string))
						this.accen.FixAccentutation(cand2, prefit)
						candidates.Add(cand2)
					}
				}
			}
		}
	}
}
func NewTokenizer(tokenizerFile string) *Tokenizer {
	this := Tokenizer{
		abrevs:  set.New(),
		rules:   list.New(),
		matches: make(map[string]int),
	}
	cfg := NewConfigFile(false, "##")
	cfg.AddSection("Macros", TOKENIZER_MACROS)
	cfg.AddSection("RegExps", TOKENIZER_REGEXPS)
	cfg.AddSection("Abbreviations", TOKENIZER_ABBREV)
	if !cfg.Open(tokenizerFile) {
		LOG.Panic("Error opening file " + tokenizerFile)
	}
	macros := list.New()
	rul := false
	var ci string
	line := ""
	for cfg.GetContentLine(&line) {
		items := Split(line, " ")
		switch cfg.GetSection() {
		case TOKENIZER_MACROS:
			{
				if rul {
					LOG.Panic("Error reading tokenizer configuration. Macros must be defined before rules.")
				}
				mname := items[0]
				mvalue := items[1]
				macros.PushBack(Pair{mname, mvalue})
				LOG.Trace("Read macro " + mname + ": " + mvalue)
				break
			}
		case TOKENIZER_REGEXPS:
			{
				var substr int
				comm := items[0]
				substr, _ = strconv.Atoi(items[1])
				re := items[2]
				rul = true
				// Expand {macro} references inside the regular expression.
				for i := macros.Front(); i != nil; i = i.Next() {
					mname := "{" + i.Value.(Pair).first.(string) + "}"
					mvalue := i.Value.(Pair).second.(string)
					p := strings.Index(re, mname)
					for p > -1 {
						re = strings.Replace(re, mname, mvalue, -1)
						p = strings.Index(re[p:], mname)
					}
				}
				if len(items) > 3 {
					ci = items[3]
				}
				if ci == "CI" {
					// Case-insensitive rule.
					newre := "(?i)" + re
					x, err := regexp.Compile(newre)
					if err == nil {
						this.rules.PushBack(Pair{comm, x})
					} else {
						LOG.Warn("Rule " + comm + " [" + newre + "] failed to be compiled")
					}
				} else {
					x, err := regexp.Compile(re)
					if err == nil {
						this.rules.PushBack(Pair{comm, x})
					} else {
						LOG.Warn("Rule " + comm + " [" + re + "] failed to be compiled")
					}
				}
				this.matches[comm] = substr
				LOG.Trace("Stored rule " + comm + " " + re + " " + strconv.Itoa(substr))
				break
			}
		case TOKENIZER_ABBREV:
			{
				this.abrevs.Add(line)
				break
			}
		default:
			break
		}
	}
	LOG.Trace("analyzer successfully created")
	return &this
}
func NewSplitter(splitterFile string) *Splitter {
	this := Splitter{
		starters: set.New(),
		enders:   make(map[string]bool),
		markers:  make(map[string]int),
	}
	cfg := NewConfigFile(false, "##")
	cfg.AddSection("General", SPLITTER_GENERAL)
	cfg.AddSection("Markers", SPLITTER_MARKERS)
	cfg.AddSection("SentenceEnd", SPLITTER_SENT_END)
	cfg.AddSection("SentenceStart", SPLITTER_SENT_START)
	if !cfg.Open(splitterFile) {
		CRASH("Error opening file "+splitterFile, MOD_SPLITTER)
	}
	this.SPLIT_AllowBetweenMarkers = true
	this.SPLIT_MaxWords = 0
	nmk := 1
	line := ""
	for cfg.GetContentLine(&line) {
		items := Split(line, " ")
		switch cfg.GetSection() {
		case SPLITTER_GENERAL:
			{
				name := items[0]
				if name == "AllowBetweenMarkers" {
					this.SPLIT_AllowBetweenMarkers, _ = strconv.ParseBool(items[1])
				} else if name == "MaxWords" {
					this.SPLIT_MaxWords, _ = strconv.ParseInt(items[1], 10, 64)
				} else {
					LOG.Panic("Unexpected splitter option " + name)
				}
				break
			}
		case SPLITTER_MARKERS:
			{
				// Distinct open/close markers get +n/-n codes; identical ones share a SAME+n code.
				open := items[0]
				close := items[1]
				if open != close {
					this.markers[open] = nmk
					this.markers[close] = -nmk
				} else {
					this.markers[open] = SAME + nmk
					this.markers[close] = SAME + nmk
				}
				nmk++
				break
			}
		case SPLITTER_SENT_END:
			{
				name := items[0]
				value, _ := strconv.ParseBool(items[1])
				this.enders[name] = !value
				break
			}
		case SPLITTER_SENT_START:
			{
				this.starters.Add(line)
				break
			}
		default:
			break
		}
	}
	LOG.Trace("Analyzer successfully created")
	return &this
}
func (this *Locutions) ComputeToken(state int, j *list.Element, se *Sentence) int {
	st := se.getProcessingStatus().(*LocutionStatus)
	if st.components == nil {
		st.components = make([]*Word, 0)
	}
	st.components = append(st.components, j.Value.(*Word))
	var form, lem, tag string
	form = j.Value.(*Word).getLCForm()
	token := LOCUTIONS_TK_other
	acc := set.New()
	mw := false
	pref := false
	if j.Value.(*Word).Len() == 0 {
		// The word has no analyses yet: match on the form only.
		LOG.Trace("checking (" + form + ")")
		if st.accMW.Size() == 0 {
			this.check(form, acc, &mw, &pref, st)
		} else {
			for _, i := range st.accMW.List() {
				LOG.Trace("   acc_mw: [" + i.(string) + "]")
				this.check(i.(string)+"_"+form, acc, &mw, &pref, st)
			}
		}
	} else {
		first := j.Value.(*Word).Front()
		if this.onlySelected {
			first = j.Value.(*Word).selectedBegin(0).Element
			LOG.Trace("Only selected is set.")
		}
		for a := first; a != nil; a = a.Next() {
			bm := false
			bp := false
			lem = "<" + a.Value.(*Analysis).getLemma() + ">"
			tag = a.Value.(*Analysis).getTag()
			if this.Tags != nil {
				tag = this.Tags.GetShortTag(tag)
			}
			LOG.Trace("checking (" + form + "," + lem + "," + tag + ")")
			if st.accMW.Size() == 0 {
				this.check(form, acc, &bm, &bp, st)
				this.check(lem, acc, &bm, &bp, st)
				if this.check(tag, acc, &bm, &bp, st) {
					j.Value.(*Word).unselectAllAnalysis(0)
					a.Value.(*Analysis).markSelected(0)
				}
				mw = mw || bm
				pref = pref || bp
			} else {
				for _, i := range st.accMW.List() {
					LOG.Trace("   acc_mw: [" + i.(string) + "]")
					this.check(i.(string)+"_"+form, acc, &bm, &bp, st)
					this.check(i.(string)+"_"+lem, acc, &bm, &bp, st)
					if this.check(i.(string)+"_"+tag, acc, &bm, &bp, st) {
						j.Value.(*Word).unselectAllAnalysis(0)
						a.Value.(*Analysis).markSelected(0)
					}
					mw = mw || bm
					pref = pref || bp
				}
			}
		}
	}
	LOG.Trace("   result: " + If(mw, "MW", "noMW").(string) + "," + If(pref, "PREF", "noPref").(string))
	if mw {
		token = LOCUTIONS_TK_mw
	} else if pref {
		token = LOCUTIONS_TK_pref
	}
	st.overLongest++
	st.accMW = acc
	LOG.Trace("Encoded word: [" + form + "," + lem + "," + tag + "] token=" + strconv.Itoa(token))
	return token
}
func NewLocutions(locFile string) *Locutions {
	this := Locutions{
		locut:    make(map[string]string),
		prefixes: set.New(),
	}
	/*
		cfg := NewConfigFile(false, "##")
		cfg.AddSection("TagSetFile", LOCUTIONS_TAGSET)
		cfg.AddSection("Multiwords", LOCUTIONS_MULTIWORDS)
		cfg.AddSection("OnlySelected", LOCUTIONS_ONLYSELECTED)
	*/
	filestr, err := ioutil.ReadFile(locFile)
	if err != nil {
		LOG.Panic("Error opening file " + locFile)
	}
	lines := strings.Split(string(filestr), "\n")
	for _, line := range lines {
		this.addLocution(line)
	}
	/*
		if !cfg.Open(locFile) {
			CRASH("Error opening file "+locFile, MOD_LOCUTIONS)
		}
		line := ""
		for cfg.GetContentLine(&line) {
			switch cfg.GetSection() {
			case LOCUTIONS_MULTIWORDS:
				{
					this.addLocution(line)
					break
				}
			case LOCUTIONS_TAGSET:
				{
					path := locFile[0:strings.LastIndex(locFile, "/")]
					this.Tags = NewTagset(path + "/" + strings.Replace(line, "./", "", -1))
					break
				}
			case LOCUTIONS_ONLYSELECTED:
				{
					this.onlySelected = (line == "yes" || line == "true")
					break
				}
			default:
				break
			}
		}
	*/
	this.initialState = LOCUTIONS_ST_P
	this.stopState = LOCUTIONS_ST_STOP
	if this.final == nil {
		this.final = set.New()
	}
	this.final.Add(LOCUTIONS_ST_M)
	// Default every transition to the stop state, then set the valid ones.
	var s, t int
	for s = 0; s < AUTOMAT_MAX_STATES; s++ {
		for t = 0; t < AUTOMAT_MAX_TOKENS; t++ {
			this.trans[s][t] = LOCUTIONS_ST_STOP
		}
	}
	this.trans[LOCUTIONS_ST_P][LOCUTIONS_TK_pref] = LOCUTIONS_ST_P
	this.trans[LOCUTIONS_ST_P][LOCUTIONS_TK_prefL] = LOCUTIONS_ST_P
	this.trans[LOCUTIONS_ST_P][LOCUTIONS_TK_prefP] = LOCUTIONS_ST_P
	this.trans[LOCUTIONS_ST_P][LOCUTIONS_TK_mw] = LOCUTIONS_ST_M
	this.trans[LOCUTIONS_ST_P][LOCUTIONS_TK_mwL] = LOCUTIONS_ST_M
	this.trans[LOCUTIONS_ST_P][LOCUTIONS_TK_mwP] = LOCUTIONS_ST_M
	this.trans[LOCUTIONS_ST_M][LOCUTIONS_TK_pref] = LOCUTIONS_ST_P
	this.trans[LOCUTIONS_ST_M][LOCUTIONS_TK_prefL] = LOCUTIONS_ST_P
	this.trans[LOCUTIONS_ST_M][LOCUTIONS_TK_prefP] = LOCUTIONS_ST_P
	this.trans[LOCUTIONS_ST_M][LOCUTIONS_TK_mw] = LOCUTIONS_ST_M
	this.trans[LOCUTIONS_ST_M][LOCUTIONS_TK_mwL] = LOCUTIONS_ST_M
	this.trans[LOCUTIONS_ST_M][LOCUTIONS_TK_mwP] = LOCUTIONS_ST_M
	LOG.Trace("analyzer successfully created")
	return &this
}
func NewTagset(ftagset string) *TagSet {
	this := &TagSet{
		PAIR_SEP:  "=",
		MSD_SEP:   "|",
		feat:      make(map[string]string),
		val:       make(map[string]string),
		name:      make(map[string]string),
		nameInv:   make(map[string]string),
		direct:    make(map[string]*Pair),
		directInv: make(map[*set.Set]string),
		valInv:    make(map[string]string),
		shtagSize: make(map[string]*list.List),
	}
	cfg := NewConfigFile(false, "##")
	cfg.AddSection("DirectTranslations", DIRECT_TRANSLATIONS)
	cfg.AddSection("DecompositionRules", DECOMPOSITION_RULES)
	if !cfg.Open(ftagset) {
		CRASH("Error opening file "+ftagset, MOD_TAG_SET)
	}
	line := ""
	for cfg.GetContentLine(&line) {
		items := Split(line, " ")
		switch cfg.section {
		case DIRECT_TRANSLATIONS:
			{
				tag := items[0]
				shtag := items[1]
				msd := ""
				if len(items) > 2 {
					msd = items[2]
				}
				this.direct[tag] = &Pair{shtag, msd}
				this.directInv[set.New(msd, this.MSD_SEP)] = tag
				break
			}
		case DECOMPOSITION_RULES:
			{
				cat := items[0]
				shsz := items[1]
				if len(items) > 2 {
					pos := items[2]
					this.name[cat] = pos
					this.nameInv[pos] = cat
				}
				// Comma-separated short-tag sizes for this category.
				this.shtagSize[cat] = list.New()
				tokens := strings.Split(shsz, ",")
				for _, sitem := range tokens {
					item, _ := strconv.Atoi(sitem)
					this.shtagSize[cat].PushBack(item)
				}
				//TRACE(3, fmt.Sprintf("Read short tag size for %s (%s) %s\n", cat, pos, shsz), MOD_TAG_SET)
				i := 1
				if len(items) > 4 {
					msd := items[4]
					key := cat + "#" + strconv.Itoa(i)
					k := strings.Split(msd, "/")
					this.feat[key] = k[0]
					this.feat[cat+"#"+k[0]] = strconv.Itoa(i)
					v := strings.Split(k[1], ";")
					for j := 0; j < len(v); j++ {
						t := strings.Split(v[j], ":")
						this.val[key+"#"+strings.ToUpper(t[0])] = t[1]
						this.valInv[key+"#"+t[1]] = strings.ToUpper(t[0])
					}
					i++
				}
				break
			}
		default:
			break
		}
	}
	TRACE(1, "Module created successfully", MOD_TAG_SET)
	return this
}
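// Illustrative only: a minimal sketch of how the tagset is used elsewhere in
// this codebase (GetShortTag is the accessor called through this.Tags by the
// HMM tagger and the locutions module above). The file name and the tag
// string are placeholders, and shortTagExample itself is hypothetical.
func shortTagExample() {
	ts := NewTagset("tagset.dat")
	short := ts.GetShortTag("NCMS000")
	fmt.Println(short)
}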