func createExceptionNoun() { name := "verb" buf := bytes.NewBuffer(nil) f, _ := os.Open("/home/serega/Copy/database/dict/" + name + ".exc") // Error handling elided for brevity. io.Copy(buf, f) // Error handling elided for brevity. f.Close() sentense := (buf.String()) buf.Reset() rubbx = frub.NewRubAr() sentense = strings.ToLower(sentense) lin := regexp.MustCompile(`\n`).Split(sentense, -1) //word := make(ByLength, 2000) //wordArray := make([]string, 2000) findIsh := make(map[string]string) //findForm := make(map[string]map[string]string) //fmt.Println(sentense) for _, s := range lin { words := regexp.MustCompile(` `).Split(s, -1) if len(words) == 2 { findIsh[words[0]] = words[1] } } fmt.Println(len(findIsh)) frub.CreateByZn(findIsh, "file/englishLemm/find"+name+".gob") wodfFile, _ := os.Create("file/englishLemm/find" + name) frub.Encode(findIsh, wodfFile) //frub.CreateByZn(findForm, "file/englishLemm/form" + name + ".gob") //wodfFile, _ = os.Create("file/englishLemm/form" + name) //frub.Encode(findForm, wodfFile) }
func (nn *MorfE) InitEngl() { nn.Graf.Init() nn.Unions = make(map[string]map[string]string, 0) frub.Decode(&nn.Unions, "file/englishLemm/noFounf") nn.frequence = make(map[string]int, 0) frub.GetByZn(&nn.frequence, "file/englishLemm/frequence1.gob") nn.fraz = make(map[string][][]string) frub.Decode(&nn.fraz, "file/englishLemm/fraz") nn.expVerb = make(map[string][]string, 0) frub.GetByZn(&nn.expVerb, "file/englishLemm/findverb1.gob") nn.expVerb1 = make(map[string]string, 0) frub.GetByZn(&nn.expVerb, "file/englishLemm/findverb.gob") nn.expAdj = make(map[string][]string, 0) frub.GetByZn(&nn.expAdj, "file/englishLemm/findadj1.gob") nn.expNoun = make(map[string]string, 0) frub.Decode(&nn.expNoun, "file/englishLemm/findnoun") nn.Verb = make(map[string]string, 0) frub.Decode(&nn.Verb, "file/englishLemm/verb") nn.Noun = make(map[string]string, 0) frub.Decode(&nn.Noun, "file/englishLemm/noun") nn.Adj = make(map[string]string, 0) frub.Decode(&nn.Adj, "file/englishLemm/adj") nn.Adv = make(map[string]string, 0) frub.Decode(&nn.Adv, "file/englishLemm/adv") nn.rubbx = frub.NewRubAr() }
func translateSentense() { dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", DB_USER, DB_PASSWORD, DB_NAME) db, err := sql.Open("postgres", dbinfo) if err != nil { log.Fatal(err) } defer db.Close() rows, err := db.Query(`SELECT id, text, count FROM sentenses1 WHERE trans = '' ORDER BY count ASC `) if err != nil { log.Fatal(err) } var text string var countElement, id, i, i1, j int buf := bytes.NewBuffer(nil) rubbx = f.NewRubAr() for rows.Next() { rows.Scan(&id, &text, &countElement) buf.WriteString(text) //fmt.Println(text) if i >= 0 { respons, _ := getTranslation("en", "ru", buf.String()) respons = rubbx.ReplaseRub(respons, `"`, `'`) mapResult := (spltOnDimiter(respons)) //sql := `UPDATE sentenses1 SET trans = '`+ respons +`' WHERE id = ` + strconv.Itoa(id) + ` ` //_, err = db.Exec(sql) //if err != nil { // fmt.Println(sql) // log.Fatal(err) //} for k, v := range mapResult { sql := `UPDATE sentenses1 SET trans = '` + v + `' WHERE id = ` + strconv.Itoa(k) + ` ` _, err = db.Exec(sql) if err != nil { fmt.Println(sql) log.Fatal(err) } } fmt.Println(buf.String()) fmt.Println(j) buf.Reset() i = 0 if i1 >= 0 { time.Sleep(0 * time.Second) i1 = 0 } else { i1++ } //break } else { buf.WriteString(` || ` + strconv.Itoa(id) + ` || `) i++ } j++ } }
func createEngWord() { name := "verb" buf := bytes.NewBuffer(nil) f, _ := os.Open("/home/serega/Copy/database/dict/data." + name) // Error handling elided for brevity. io.Copy(buf, f) // Error handling elided for brevity. f.Close() sentense := (buf.String()) buf.Reset() rubbx = frub.NewRubAr() sentense = strings.ToLower(sentense) lin := regexp.MustCompile(`\n`).Split(sentense, -1) word := make(map[string]string, 20000) for k, s := range lin { if k < 29 { continue } words := regexp.MustCompile(` `).Split(s, -1) cicl := true key := 4 for cicl { if len(words) > key { pr := rubbx.IsTru1(words[key], `^[a-zA-Z]+$`) if pr { word[words[key]] = name } else { if rubbx.IsTru1(words[key], `^[a-zA-Z_]+$`) { words[key] = rubbx.ReplaseRub(words[key], ` `, `_`) word[words[key]] = "fraz" } //break } } if len(words) > key+2 { if words[key+1] != "0" { cicl = false } key = key + 2 } else { cicl = false } } } frub.CreateByZn(word, "file/englishLemm/"+name+".gob") wodfFile, _ := os.Create("file/englishLemm/" + name) frub.Encode(word, wodfFile) fmt.Println(len(word)) }
func removeExcess() { dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", DB_USER, DB_PASSWORD, DB_NAME) db, err := sql.Open("postgres", dbinfo) if err != nil { log.Fatal(err) } defer db.Close() rows, err := db.Query(`SELECT id, text, count FROM sentenses ORDER BY count ASC `) if err != nil { log.Fatal(err) } var text string var countElement, id, i, j, chet int rubbx = f.NewRubAr() var gref Graf.Graf gref.Init() morfEng.InitEngl() sqlStart := `DELETE FROM sentenses WHERE 1=1 AND (` buf := bytes.NewBuffer(nil) for rows.Next() { rows.Scan(&id, &text, &countElement) _, prov := morfEng.getWordWithoutOmonemii(text) if !prov { i++ //fmt.Println(i) buf.WriteString(`id = ` + strconv.Itoa(id)) if j >= 300 { _, err = db.Exec(sqlStart + buf.String() + ")") if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } j = 0 buf.Reset() } else { buf.WriteString(` or `) j++ } } chet++ if chet%10000 == 0 { fmt.Printf("%d - %d - %d\r\n", i, chet, j) } } fmt.Println(i) }
// инициализация словарей func (nn *Morf) Init() { nn.ItogOk = make([]map[string][]map[string]string, 20503) nn.Search = make(map[string]map[string][]uint16, 234457) var wg sync.WaitGroup wg.Add(2) go func() { defer wg.Done() f.GetByZn(&nn.ItogOk, "file/itogOkI") }() go func() { defer wg.Done() f.GetByZn(&nn.Search, "file/search1") }() wg.Wait() nn.rubbx = f.NewRubAr() }
func extractFromQz() { soursStart := "/media/serega/c48f1bd8-a939-4630-ab12-9787df1f1fa0/home/paralel/OpenSubtitles2016/xml/en/" allGzFile := dirHod(soursStart, "txt", []string{}) rubbx = f.NewRubAr() fmt.Println(len(allGzFile)) sours := "/media/serega/c48f1bd8-a939-4630-ab12-9787df1f1fa0/home/paralel/OpenSubtitles2016/xml/en1/" for k, v := range allGzFile { //path := rubbx.ReplaseRub(v, ``, `[0-9]*\.xml.gz`) //name := strings.Replace(v, path, "", -1) ungzip(v, sours) //fmt.Println(name) if k%100 == 0 { fmt.Println(k) } //break } //fmt.Println(allGzFile) }
func markerOnEnglish() { dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", DB_USER, DB_PASSWORD, DB_NAME) db, err := sql.Open("postgres", dbinfo) if err != nil { log.Fatal(err) } defer db.Close() rows, err := db.Query(`SELECT id, trans, count FROM sentenses1 ORDER BY count ASC `) if err != nil { log.Fatal(err) } var text string var countElement, id, i int rubbx = f.NewRubAr() for rows.Next() { rows.Scan(&id, &text, &countElement) if !rubbx.IsTru1(text, `^[а-яА-Я ,.!?&\-0-9\:\;]*$`) { sql := `UPDATE sentenses1 SET engword = 'e' WHERE id = ` + strconv.Itoa(id) + ` ` _, err = db.Exec(sql) if err != nil { fmt.Println(sql) log.Fatal(err) } //fmt.Println(text) //fmt.Println("en") } if i > 2000 { //break } if i%1000 == 0 { fmt.Println(i) } i++ } }
func createException() { name := "verb1" buf := bytes.NewBuffer(nil) f, _ := os.Open("/home/serega/Copy/database/dict/" + name + ".exc") // Error handling elided for brevity. io.Copy(buf, f) // Error handling elided for brevity. f.Close() sentense := (buf.String()) buf.Reset() rubbx = frub.NewRubAr() sentense = strings.ToLower(sentense) lin := regexp.MustCompile(`\n`).Split(sentense, -1) //word := make(ByLength, 2000) //wordArray := make([]string, 2000) findIsh := make(map[string][]string) //findForm := make(map[string]map[string]string) //fmt.Println(sentense) for _, s := range lin { words := regexp.MustCompile(` `).Split(s, -1) if len(words) > 1 { //findForm[words[0]] = map[string]string{"v1":words[0]} words1 := regexp.MustCompile(`\/`).Split(words[1], -1) words2 := regexp.MustCompile(`\/`).Split(words[2], -1) //fmt.Println(words1) //findForm[words[0]]["v2"] = words1[0] for _, s1 := range words1 { types := "v2" if s1 == words[0] { types = "u1" } for _, s2 := range words2 { if s1 == s2 { if types == "v2" { types = "u2" } else { types = "u" } } } findIsh[s1] = []string{words[0], types} } //fmt.Println(words1) //findForm[words[0]]["v3"] = words1[0] for _, s1 := range words2 { types := "v3" if s1 == words[0] { types = "u3" } for _, s2 := range words1 { if s1 == s2 { if types == "v3" { types = "u2" } else { types = "u" } } } findIsh[s1] = []string{words[0], types} } } } frub.CreateByZn(findIsh, "file/englishLemm/find"+name+".gob") wodfFile, _ := os.Create("file/englishLemm/find" + name) frub.Encode(findIsh, wodfFile) //frub.CreateByZn(findForm, "file/englishLemm/form" + name + ".gob") //wodfFile, _ = os.Create("file/englishLemm/form" + name) //frub.Encode(findForm, wodfFile) }
func spliteonPredl() { dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", DB_USER, DB_PASSWORD, DB_NAME) db, err := sql.Open("postgres", dbinfo) if err != nil { log.Fatal(err) } defer db.Close() //allName = dirHod("/home/serega/Copy/code/text/book/",, allName) allName := dirHod("/media/serega/c48f1bd8-a939-4630-ab12-9787df1f1fa0/home/paralel/OpenSubtitles2016/xml/txt/", "", []string{}) //fmt.Println(allName) //echoArray(allName) var gref Graf.Graf gref.Init() rubbx = f.NewRubAr() morfEng.InitEngl() buf := bytes.NewBuffer(nil) for k, f1 := range allName { if k < 30002 { // for /media/serega/c48f1bd8-a939-4630-ab12-9787df1f1fa0/home/paralel/OpenSubtitles2016/xml/txt/ continue } buf.Reset() f, _ := os.Open(f1) // Error handling elided for brevity. io.Copy(buf, f) // Error handling elided for brevity. f.Close() //text, _ := html2text.FromString(buf.String()) sentense := gref.PrepereText(buf.String()) //echoArray(sentense) //fmt.Println(f1) //buf.Reset() i := 0 sqlStart := `INSERT INTO sentenses(trans, text, count) VALUES ` buf1 := bytes.NewBuffer(nil) for _, s := range sentense { _ = s if rubbx.IsTru1(s, `.*[-"].*`) { continue } sentenseItem, prov := morfEng.getWordWithoutOmonemii(s) if !prov || len(sentenseItem) < 4 || len(sentenseItem) > 20 { continue } s = rubbx.ReplaseRub(s, `''`, `'`) rows, err := db.Query(`SELECT count(*) as cou FROM sentenses WHERE text = '` + s + `' `) if err != nil { log.Fatal(err) } var cou string rows.Next() rows.Scan(&cou) rows.Close() // //fmt.Println(cou) if cou == `0` && len(sentenseItem) > 4 && len(sentenseItem) < 20 { //fmt.Println(s) buf1.WriteString(`('','` + s + `',` + strconv.Itoa(len(sentenseItem)) + `)`) if i >= 300 { _, err = db.Exec(sqlStart + buf1.String()) if err != nil { fmt.Println(sqlStart + buf1.String()) log.Fatal(err) } i = 0 buf1.Reset() } else { buf1.WriteString(`,`) i++ } } // // // // ////fmt.Println(" ---------- ") } if i != 0 { str := buf1.String() str = string([]byte(str)[:len(str)-1]) _, err = db.Exec(sqlStart + str) if err != nil { fmt.Println(sqlStart + buf1.String()) log.Fatal(err) } } // if k%1 == 0 { fmt.Println(f1) fmt.Println(k) } if k > 10 { //break } ////fmt.Println(f) //break } }
func fillTableWord() { dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", DB_USER, DB_PASSWORD, DB_NAME) db, err := sql.Open("postgres", dbinfo) if err != nil { log.Fatal(err) } defer db.Close() rows, err := db.Query(`SELECT id, text, count FROM sentenses ORDER BY count ASC `) if err != nil { log.Fatal(err) } var text string var countElement, id, i int rubbx = f.NewRubAr() var gref Graf.Graf gref.Init() type words struct { result string types string forma string count int } type read struct { text string countElement int id int } word := make(map[string]words, 0) word_use := make(map[string][]int, 0) readderArray := make([]read, 6762636) morfEng.InitEngl() localCou := 0 for rows.Next() { rows.Scan(&id, &text, &countElement) readderArray[localCou] = read{text, countElement, id} localCou++ } for _, rowFrom := range readderArray { i++ id = rowFrom.id text = rowFrom.text countElement = rowFrom.countElement sentenseItem, prov := morfEng.getWordWithoutOmonemii(text) if prov { fraz := []int{} for k, s := range sentenseItem { if len(fraz) > 0 { //fraz-- if fraz[0] == k { fraz = fraz[1:] continue } } if s.types != "number" && s.types != "punctuation" { if zn, found := morfEng.fraz[s.result]; found { k1 := 0 shift := 1 for { fraz2 := false find := false for _, v1 := range zn { if k+shift+k1 < len(sentenseItem) { val := sentenseItem[k+shift+k1].result if len(v1) > k1 && val == v1[k1] { fraz = append(fraz, k+shift+k1) find = true if len(v1)-1 == k1 { for jc := 0; jc < len(v1); jc++ { s.result += " " + sentenseItem[k+jc+shift].result } s.types = "phras" if shift == 1 { s.forma = "" } else { s.forma = "stm" } shift = 2 break } else { fraz2 = true } } } } if !find { } if !fraz2 && shift == 2 { break } if !fraz2 { shift++ } k1++ } } s.result = rubbx.ReplaseRub((s.result), `''`, `'`) keyWord := s.result + s.types + s.forma if _, found := word_use[keyWord]; found { word_use[keyWord] = append(word_use[keyWord], id) } else { word_use[keyWord] = []int{id} } if _, found := word[keyWord]; found { word[keyWord] = words{s.result, s.types, s.forma, word[keyWord].count + 1} } else { word[keyWord] = words{s.result, s.types, s.forma, 1} } } } } if i%1000 == 0 { fmt.Println(i) //break } if i > 100000 { //break } } //for k, v := range word { // if v.forma == "phras"{ // fmt.Printf("%s - %v+\n", k, v) // } //} //fmt.Println(word) //fmt.Println(word_use) i = 0 jj := 0 sqlStart := `INSERT INTO words (word_key, word, forma, type, frequensy) VALUES ` buf := bytes.NewBuffer(nil) for k, s := range word { buf.WriteString(`('` + k + `','` + s.result + `','` + (s.forma) + `','` + (s.types) + `',` + strconv.Itoa(s.count) + `)`) if i >= 300 { _, err = db.Exec(sqlStart + buf.String()) if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } i = 0 buf.Reset() } else { buf.WriteString(`,`) i++ } jj++ if jj%10000 == 0 { fmt.Println(jj) //break } } if i != 0 { str := buf.String() str = string([]byte(str)[:len(str)-1]) _, err = db.Exec(sqlStart + str) if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } } i = 0 jj = 0 sqlStart = `INSERT INTO word_use (id_word, id_sentense) VALUES ` buf = bytes.NewBuffer(nil) var id_wodr int for k, s := range word_use { rows1, _ := db.Query(`SELECT id FROM words WHERE word_key = '` + k + `'`) rows1.Next() rows1.Scan(&id_wodr) rows1.Close() for _, s1 := range s { buf.WriteString(`('` + strconv.Itoa(id_wodr) + `',` + strconv.Itoa(s1) + `)`) if i >= 300 { _, err = db.Exec(sqlStart + buf.String()) if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } i = 0 buf.Reset() } else { buf.WriteString(`,`) i++ } } jj++ if jj%10000 == 0 { fmt.Println(jj) //break } } if i != 0 { str := buf.String() str = string([]byte(str)[:len(str)-1]) _, err = db.Exec(sqlStart + str) if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } } }