コード例 #1
0
ファイル: create.go プロジェクト: sereg/morf
// removeExcess deletes from the sentenses table every row whose text is
// rejected by morfEng.getWordWithoutOmonemii (second return value false).
//
// Rows are read ordered by count, and the ids of rejected rows are removed
// with batched DELETE statements of up to 301 ids each. Progress is printed
// every 10000 scanned rows as "rejected - scanned - pending", and the total
// number of rejected rows is printed at the end. Any database error
// terminates the program via log.Fatal.
func removeExcess() {
	// Number of ids sent per DELETE statement.
	const batchSize = 301

	dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable",
		DB_USER, DB_PASSWORD, DB_NAME)
	db, err := sql.Open("postgres", dbinfo)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	rows, err := db.Query(`SELECT id, text, count FROM sentenses ORDER BY count ASC `)
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	var text string
	var countElement, id, i, chet int
	rubbx = f.NewRubAr()
	var gref Graf.Graf
	gref.Init()
	morfEng.InitEngl()

	sqlStart := `DELETE FROM sentenses WHERE 1=1 AND (`
	buf := bytes.NewBuffer(nil)
	pending := 0 // ids currently collected in buf

	// flush executes the DELETE for the ids collected so far, if any.
	flush := func() {
		if pending == 0 {
			return
		}
		query := sqlStart + buf.String() + ")"
		if _, err := db.Exec(query); err != nil {
			fmt.Println(query)
			log.Fatal(err)
		}
		pending = 0
		buf.Reset()
	}

	for rows.Next() {
		if err := rows.Scan(&id, &text, &countElement); err != nil {
			log.Fatal(err)
		}
		if _, prov := morfEng.getWordWithoutOmonemii(text); !prov {
			i++
			if pending > 0 {
				buf.WriteString(` or `)
			}
			buf.WriteString(`id = ` + strconv.Itoa(id))
			pending++
			if pending >= batchSize {
				flush()
			}
		}
		chet++
		if chet%10000 == 0 {
			fmt.Printf("%d - %d - %d\r\n", i, chet, pending)
		}
	}
	// Delete the ids of the last, partially filled batch; without this the
	// tail of the rejected rows would stay in the table.
	flush()
	if err := rows.Err(); err != nil {
		log.Fatal(err)
	}

	fmt.Println(i)
}
コード例 #2
0
ファイル: createEnlish.go プロジェクト: sereg/morf
// testWithout reads /home/serega/file_work/text.txt, splits it on "\r\n" and
// treats the result as pairs of lines: a sentence followed by a JSON array of
// string arrays describing it. For each pair it compares the number of items
// returned by morfEng.getIshForSentence for the sentence with the number of
// entries in the (lower-cased) JSON, and finally prints how many pairs agreed
// as "norm - N".
//
// Processing stops after 100001 pairs. A trailing sentence without a JSON
// line is ignored. Errors opening or reading the file are printed and abort
// the run; a JSON line that fails to parse counts as a mismatch.
func testWithout() {
	buf := bytes.NewBuffer(nil)
	file, err := os.Open("/home/serega/file_work/text.txt")
	if err != nil {
		fmt.Println(err)
		return
	}
	_, err = io.Copy(buf, file)
	file.Close()
	if err != nil {
		fmt.Println(err)
		return
	}
	var gref Graf.Graf
	gref.Init()
	morfEng.InitEngl()
	i := 0
	// The tables below are disabled reference data (tag names and their
	// descriptions / form mappings) kept from the original source.
	//zn :=  make(map[string]string)
	//zn =  map[string]string{
	//	"cc":   "coordinating conjunction",
	//	"cd":   "cardinal number",
	//	"rb":   "adverb",
	//	"rbr":  "adverb, comparative",
	//	"rbs":  "adverb, superlative",
	//	"jjs":  "adjective, superlative",
	//	"jj":   "adjective",
	//	"jjr":  "adjective, comparative",
	//	"vb":   "verb, base form",
	//	"vbd":  "verb, past tense",
	//	"vbn":  "verb, past participle",
	//	"vbz":  "verb,3rd ps. sing. present",
	//	"vbg":  "verb, gerund/present participle",
	//	"vbp":  "verb, non-3rd ps. sing. present",
	//	"nn":   "noun, singular or masps",
	//	"nns":  "noun, plural",
	//	"nnp":  "proper noun, singular",
	//	"nnps": "proper noun plural",
	//	"dt":   "determiner",
	//	"ex":   "existential there",
	//	"rp":   "particle",
	//	"to":   "to",
	//	"uh":   "interjection",
	//	"ls":   "list item marker",
	//	"md":   "modal",
	//	"wdt":  "wh-determiner",
	//	"pdt":  "predeterminer",
	//	"wp":   "wh-pronoun",
	//	"pos":  "possessive ending",
	//	"wp$":  "possessive wh-pronoun",
	//	"in":   "preposition/subord. conjunction",
	//	"prp":  "personal pronoun",
	//	"prp$": "possessive pronoun",
	//	"wrb":  "wh-adverb",
	//	"(":    "open parenthesis",
	//	")":    "close parenthesis",
	//	"``":   "open quote",
	//	",":    "comma",
	//	"''":   "close quote",
	//	".":    "period",
	//	"#":    "pound sign (currency marker)",
	//	"$":    "dollar sign (currency marker)",
	//	":":    "colon",
	//	"sym":  "symbol (mathematical or scientific)",
	//	"fw":   "foreign word",
	//}
	//findElement := map[string]string{
	//	"cc":   "coordinating conjunction",
	//	//"cd":   "cardinal number",
	//	"adv":   "adverb",
	//	"rb":   "adverb",
	//	"rbr":  "adverb, comparative",
	//	"rbs":  "adverb, superlative",
	//	"adj":   "adjective",
	//	"jj":   "adjective",
	//	"jjs":  "adjective, superlative",
	//	"jjr":  "adjective, comparative",
	//	"vb":   "verb, base form",
	//	"verb":   "verb, base form",
	//	"vbd":  "verb, past tense",
	//	"vbn":  "verb, past participle",
	//	"vbg":  "verb, gerund/present participle",
	//	"vbz":  "verb,3rd ps. sing. present",
	//	"vbp":  "verb, non-3rd ps. sing. present",
	//	"noun":   "noun, singular or masps",
	//	"nn":   "noun, singular or masps",
	//	"nns":  "noun, plural",
	//	"nnp":  "proper noun, singular",
	//	"nnps": "proper noun plural",
	//}
	//findElement1 := map[string]string{
	//	"dt":   "determiner",
	//	"ex":   "existential there",
	//	"rp":   "particle",
	//	"to":   "to",
	//	"uh":   "interjection",
	//	"ls":   "list item marker",
	//	"md":   "modal",
	//	"wdt":  "wh-determiner",
	//	"pdt":  "predeterminer",
	//	"wp":   "wh-pronoun",
	//	"pos":  "possessive ending",
	//	"wp$":  "possessive wh-pronoun",
	//	"in":   "preposition/subord. conjunction",
	//	"prp":  "personal pronoun",
	//	//"pron":  "personal pronoun",
	//	"prp$": "possessive pronoun",
	//	"wrb":  "wh-adverb",
	//	"vbz":  "verb,3rd ps. sing. present",
	//	"vbp":  "verb, non-3rd ps. sing. present",
	//	"rb":   "adverb",
	//}

	//difine := map[string]map[string]map[string]string{
	//	"cc":   {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}},
	//	"cd":   {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}},
	//	"rb":   {"form":{"advv1":""}, "base":{"adv":""}, "extra":{"adj":""}},
	//	"adv":   {"form":{"advv1":""}, "base":{"adv":""}, "extra":{"adj":""}},
	//	"rbr":  {"form":{"advv2":""}, "base":{"adv":""}, "extra":{"adj":""}},
	//	"rbs":  {"form":{"adjv3":""}, "base":{"adv":""}, "extra":{"adj":""}},
	//	"adj":   {"form":{"adjv1":""}, "base":{"adj":""}, "extra":{"adv":""}},
	//	"jj":   {"form":{"adjv1":""}, "base":{"adj":""}, "extra":{"adv":""}},
	//	"jjr":  {"form":{"adjv2":""}, "base":{"adj":""}, "extra":{"adv":""}},
	//	"jjs":  {"form":{"adjv3":""}, "base":{"adj":""}, "extra":{"adv":""}},
	//	"vb":   {"form":{"verbv1":"", "verbu":"", "verbu1":"", "verbu3":""}, "base":{"verb":""}, "extra":{}},
	//	"verb":   {"form":{"verbv1":"", "verbu":"", "verbu1":"", "verbu3":""}, "base":{"verb":""}, "extra":{}},
	//	"vbd":  {"form":{"verbv2":"", "verbv3":"", "verbu":"", "verbu1":"", "verbu2":"", "verbu3":""}, "base":{"verb":""}, "extra":{}},
	//	"vbn":  {"form":{"verbv2":"", "verbv3":"", "verbu":"", "verbu1":"", "verbu2":"", "verbu3":""}, "base":{"verb":""}, "extra":{}},
	//	"vbz":  {"form":{"verbv5":""}, "base":{"verb":""}, "extra":{}},
	//	"vbg":  {"form":{"verbv4":""}, "base":{"verb":""}, "extra":{}},
	//	"vbp":  {"form":{"verbv1":"", "verbu":"", "verbu1":"", "verbu3":""}, "base":{"verb":""}, "extra":{}},
	//	"noun":   {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}},
	//	"nn":   {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}},
	//	"nns":  {"form":{"nounv2":""}, "base":{"noun":""}, "extra":{}},
	//	"nnp":  {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}},
	//	"nnps": {"form":{"nounv2":""}, "base":{"noun":""}, "extra":{}},
	//	"conj": {"form":{"rp":""}, "base":{"noun":""}, "extra":{}},
	//}
	//frequence := make(map[string]int, 0)
	//noFound := make(map[string]map[string]int)
	//foundZ := make(map[string]map[string]int)
	baseFrequence := [][]string{}
	scet := 0 // number of pairs whose item counts agree
	arraySentense := regexp.MustCompile(`\r\n`).Split(buf.String(), -1)
	for k, text := range arraySentense {
		// Even indexes hold sentences, odd indexes hold their JSON.
		if k%2 != 0 {
			continue
		}
		// A sentence without a following JSON line (for example the empty
		// element produced by a trailing "\r\n") has nothing to compare to.
		if k+1 >= len(arraySentense) {
			break
		}
		i++
		jsontetext := strings.ToLower(arraySentense[k+1])
		var zn [][]string
		unmarshalErr := ffjson.Unmarshal([]byte(jsontetext), &zn)
		// insetFrequuence is never filled while the per-entry matching
		// below stays disabled, so a nil slice is recorded for each match.
		var insetFrequuence []string
		result, _ := morfEng.getIshForSentence(text)
		// Disabled per-entry matching against the difine table:
		//for _, v := range zn {
		//	if _, found := difine[v[1]]["form"][v1.types + v1.forma]; found {
		//		insetFrequuence = append(insetFrequuence, v1.types + v1.forma)
		//		//foundZ = addToMap(foundZ, v[1], "form")
		//		pr = true
		//		//localProv = true
		//		break
		//	}
		//}
		if unmarshalErr == nil && len(result) == len(zn) {
			scet++
			baseFrequence = append(baseFrequence, insetFrequuence)
		}
		if i%10000 == 0 {
			fmt.Println(i)
		}
		if i > 100000 {
			break
		}
	}

	fmt.Printf("norm - %d ", scet)

	//wodfFile, _ := os.Create("file/englishLemm/baseFrequence")
	//frub.Encode(baseFrequence, wodfFile)
}
コード例 #3
0
ファイル: create.go プロジェクト: sereg/morf
// spliteonPredl walks the files returned by dirHod for the OpenSubtitles
// text directory, splits each file into sentences with gref.PrepereText and
// inserts the sentences that are not yet stored into the sentenses table.
//
// A sentence is inserted only when it does not match `.*[-"].*`, is accepted
// by morfEng.getWordWithoutOmonemii, has more than 4 and fewer than 20 items,
// and is neither present in the table nor already queued in the current
// batch. Rows are written with multi-row INSERT statements of up to 301 rows,
// flushed at the end of every file. The file name and its index are printed
// after each file. Database errors terminate the program via log.Fatal;
// a file that cannot be read is logged and skipped.
func spliteonPredl() {
	// Number of rows sent per INSERT statement.
	const batchSize = 301
	const sqlStart = `INSERT INTO sentenses(trans, text, count) VALUES `

	dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable",
		DB_USER, DB_PASSWORD, DB_NAME)
	db, err := sql.Open("postgres", dbinfo)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	//allName = dirHod("/home/serega/Copy/code/text/book/",, allName)
	allName := dirHod("/media/serega/c48f1bd8-a939-4630-ab12-9787df1f1fa0/home/paralel/OpenSubtitles2016/xml/txt/", "", []string{})
	var gref Graf.Graf
	gref.Init()
	rubbx = f.NewRubAr()
	morfEng.InitEngl()
	buf := bytes.NewBuffer(nil)
	for k, f1 := range allName {
		if k < 30002 {
			// Start offset for the OpenSubtitles2016 directory.
			// NOTE(review): presumably earlier files were imported by a
			// previous run — confirm before changing.
			continue
		}
		buf.Reset()
		file, err := os.Open(f1)
		if err != nil {
			log.Printf("skipping %s: %v", f1, err)
			continue
		}
		_, err = io.Copy(buf, file)
		file.Close()
		if err != nil {
			log.Printf("skipping %s: %v", f1, err)
			continue
		}
		sentense := gref.PrepereText(buf.String())

		pending := 0 // rows currently collected in batch
		batch := bytes.NewBuffer(nil)
		// queued holds the sentences of the current, not yet executed batch;
		// the COUNT query below cannot see them, so they are tracked here to
		// avoid inserting the same sentence twice.
		queued := make(map[string]struct{})

		// flush executes the INSERT for the rows collected so far, if any.
		flush := func() {
			if pending == 0 {
				return
			}
			query := sqlStart + batch.String()
			if _, err := db.Exec(query); err != nil {
				fmt.Println(query)
				log.Fatal(err)
			}
			pending = 0
			batch.Reset()
			queued = make(map[string]struct{})
		}

		for _, s := range sentense {
			if rubbx.IsTru1(s, `.*[-"].*`) {
				continue
			}
			sentenseItem, prov := morfEng.getWordWithoutOmonemii(s)
			// Only sentences with 5..19 items are ever inserted, so reject
			// the others before paying for a database round trip.
			if !prov || len(sentenseItem) <= 4 || len(sentenseItem) >= 20 {
				continue
			}
			// NOTE(review): the statements below embed s in SQL text and rely
			// on this call for quoting (presumably it doubles single quotes —
			// confirm); bound parameters would be safer.
			s = rubbx.ReplaseRub(s, `''`, `'`)
			if _, dup := queued[s]; dup {
				continue
			}
			var cou int
			err := db.QueryRow(`SELECT count(*) as cou FROM sentenses WHERE text = '` + s + `' `).Scan(&cou)
			if err != nil {
				log.Fatal(err)
			}
			if cou != 0 {
				continue
			}
			if pending > 0 {
				batch.WriteString(`,`)
			}
			batch.WriteString(`('','` + s + `',` + strconv.Itoa(len(sentenseItem)) + `)`)
			queued[s] = struct{}{}
			pending++
			if pending >= batchSize {
				flush()
			}
		}
		// Write the last, partially filled batch of this file.
		flush()

		fmt.Println(f1)
		fmt.Println(k)
	}
}
コード例 #4
0
ファイル: create.go プロジェクト: sereg/morf
// fillTableWord rebuilds the words and word_use tables from the sentenses
// table.
//
// It loads every sentence (ordered by count), runs
// morfEng.getWordWithoutOmonemii on it and, for each resulting item that is
// not of type "number" or "punctuation", tries to extend it into a phrase
// using morfEng.fraz. Each item is keyed by result+types+forma; the function
// counts how often every key occurs and records the ids of the sentences it
// occurs in. The counts are then inserted into words, and for every key the
// generated words.id is looked up and linked to its sentence ids in word_use.
// Inserts are sent in multi-row statements of up to 301 rows. Progress
// counters are printed along the way. Database errors terminate the program
// via log.Fatal.
func fillTableWord() {
	// Number of rows sent per INSERT statement.
	const batchSize = 301

	dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable",
		DB_USER, DB_PASSWORD, DB_NAME)
	db, err := sql.Open("postgres", dbinfo)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	rows, err := db.Query(`SELECT id, text, count FROM sentenses ORDER BY count ASC `)
	if err != nil {
		log.Fatal(err)
	}
	rubbx = f.NewRubAr()
	var gref Graf.Graf
	gref.Init()
	type words struct {
		result string
		types  string
		forma  string
		count  int
	}
	type read struct {
		text         string
		countElement int
		id           int
	}
	word := make(map[string]words)
	wordUse := make(map[string][]int)
	morfEng.InitEngl()

	// Read the whole result set first so that its size is whatever the table
	// currently holds rather than a fixed number of rows.
	var readderArray []read
	for rows.Next() {
		var r read
		if err := rows.Scan(&r.id, &r.text, &r.countElement); err != nil {
			log.Fatal(err)
		}
		readderArray = append(readderArray, r)
	}
	if err := rows.Err(); err != nil {
		log.Fatal(err)
	}
	rows.Close()

	i := 0
	for idx := range readderArray {
		i++
		id := readderArray[idx].id
		sentenseItem, prov := morfEng.getWordWithoutOmonemii(readderArray[idx].text)
		if prov {
			// fraz lists the indexes of items that were matched as part of a
			// phrase; such an item is skipped when the loop reaches it.
			fraz := []int{}
			for k, s := range sentenseItem {
				if len(fraz) > 0 {
					if fraz[0] == k {
						fraz = fraz[1:]
						continue
					}
				}
				if s.types == "number" || s.types == "punctuation" {
					continue
				}
				if zn, found := morfEng.fraz[s.result]; found {
					// k1 is the position inside the candidate phrase, shift
					// the offset from k at which the phrase is expected.
					k1 := 0
					shift := 1
					for {
						fraz2 := false
						for _, v1 := range zn {
							if k+shift+k1 < len(sentenseItem) {
								val := sentenseItem[k+shift+k1].result
								if len(v1) > k1 && val == v1[k1] {
									fraz = append(fraz, k+shift+k1)
									if len(v1)-1 == k1 {
										// Whole candidate matched: merge its
										// words into this item.
										for jc := 0; jc < len(v1); jc++ {
											s.result += " " + sentenseItem[k+jc+shift].result
										}
										s.types = "phras"
										if shift == 1 {
											s.forma = ""
										} else {
											s.forma = "stm"
										}
										shift = 2
										break
									} else {
										fraz2 = true
									}
								}
							}
						}
						if !fraz2 && shift == 2 {
							break
						}
						if !fraz2 {
							shift++
						}
						k1++
					}
				}
				// NOTE(review): the INSERT/SELECT statements below embed this
				// text in SQL and rely on this call for quoting (presumably
				// it doubles single quotes — confirm).
				s.result = rubbx.ReplaseRub((s.result), `''`, `'`)
				keyWord := s.result + s.types + s.forma
				wordUse[keyWord] = append(wordUse[keyWord], id)
				word[keyWord] = words{s.result, s.types, s.forma, word[keyWord].count + 1}
			}
		}
		if i%1000 == 0 {
			fmt.Println(i)
		}
	}

	// Shared batching state for both insert phases.
	var sqlStart string
	buf := bytes.NewBuffer(nil)
	pending := 0 // rows currently collected in buf

	// flush executes the INSERT for the rows collected so far, if any.
	flush := func() {
		if pending == 0 {
			return
		}
		query := sqlStart + buf.String()
		if _, err := db.Exec(query); err != nil {
			fmt.Println(query)
			log.Fatal(err)
		}
		pending = 0
		buf.Reset()
	}
	// add queues one VALUES tuple and flushes when the batch is full.
	add := func(tuple string) {
		if pending > 0 {
			buf.WriteString(`,`)
		}
		buf.WriteString(tuple)
		pending++
		if pending >= batchSize {
			flush()
		}
	}

	jj := 0
	sqlStart = `INSERT INTO words (word_key, word, forma, type, frequensy) VALUES `
	for k, s := range word {
		add(`('` + k + `','` + s.result + `','` + (s.forma) + `','` + (s.types) + `',` + strconv.Itoa(s.count) + `)`)
		jj++
		if jj%10000 == 0 {
			fmt.Println(jj)
		}
	}
	flush()

	jj = 0
	sqlStart = `INSERT INTO word_use (id_word, id_sentense) VALUES `
	for k, s := range wordUse {
		var wordID int
		err := db.QueryRow(`SELECT id FROM words WHERE word_key = '` + k + `'`).Scan(&wordID)
		switch {
		case err == sql.ErrNoRows:
			// Without a words row there is no id to link to; linking to the
			// id of a previously processed word would store wrong data.
			log.Printf("no words row for key %q, skipping its sentences", k)
		case err != nil:
			log.Fatal(err)
		default:
			for _, s1 := range s {
				add(`('` + strconv.Itoa(wordID) + `',` + strconv.Itoa(s1) + `)`)
			}
		}
		jj++
		if jj%10000 == 0 {
			fmt.Println(jj)
		}
	}
	flush()
}