func removeExcess() { dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", DB_USER, DB_PASSWORD, DB_NAME) db, err := sql.Open("postgres", dbinfo) if err != nil { log.Fatal(err) } defer db.Close() rows, err := db.Query(`SELECT id, text, count FROM sentenses ORDER BY count ASC `) if err != nil { log.Fatal(err) } var text string var countElement, id, i, j, chet int rubbx = f.NewRubAr() var gref Graf.Graf gref.Init() morfEng.InitEngl() sqlStart := `DELETE FROM sentenses WHERE 1=1 AND (` buf := bytes.NewBuffer(nil) for rows.Next() { rows.Scan(&id, &text, &countElement) _, prov := morfEng.getWordWithoutOmonemii(text) if !prov { i++ //fmt.Println(i) buf.WriteString(`id = ` + strconv.Itoa(id)) if j >= 300 { _, err = db.Exec(sqlStart + buf.String() + ")") if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } j = 0 buf.Reset() } else { buf.WriteString(` or `) j++ } } chet++ if chet%10000 == 0 { fmt.Printf("%d - %d - %d\r\n", i, chet, j) } } fmt.Println(i) }
func testWithout() { buf := bytes.NewBuffer(nil) buf.Reset() file, _ := os.Open("/home/serega/file_work/text.txt") // Error handling elided for brevity. io.Copy(buf, file) // Error handling elided for brevity. file.Close() var gref Graf.Graf gref.Init() morfEng.InitEngl() i := 0 //zn := make(map[string]string) //zn = map[string]string{ // "cc": "coordinating conjunction", // "cd": "cardinal number", // "rb": "adverb", // "rbr": "adverb, comparative", // "rbs": "adverb, superlative", // "jjs": "adjective, superlative", // "jj": "adjective", // "jjr": "adjective, comparative", // "vb": "verb, base form", // "vbd": "verb, past tense", // "vbn": "verb, past participle", // "vbz": "verb,3rd ps. sing. present", // "vbg": "verb, gerund/present participle", // "vbp": "verb, non-3rd ps. sing. present", // "nn": "noun, singular or masps", // "nns": "noun, plural", // "nnp": "proper noun, singular", // "nnps": "proper noun plural", // "dt": "determiner", // "ex": "existential there", // "rp": "particle", // "to": "to", // "uh": "interjection", // "ls": "list item marker", // "md": "modal", // "wdt": "wh-determiner", // "pdt": "predeterminer", // "wp": "wh-pronoun", // "pos": "possessive ending", // "wp$": "possessive wh-pronoun", // "in": "preposition/subord. conjunction", // "prp": "personal pronoun", // "prp$": "possessive pronoun", // "wrb": "wh-adverb", // "(": "open parenthesis", // ")": "close parenthesis", // "``": "open quote", // ",": "comma", // "''": "close quote", // ".": "period", // "#": "pound sign (currency marker)", // "$": "dollar sign (currency marker)", // ":": "colon", // "sym": "symbol (mathematical or scientific)", // "fw": "foreign word", //} //findElement := map[string]string{ // "cc": "coordinating conjunction", // //"cd": "cardinal number", // "adv": "adverb", // "rb": "adverb", // "rbr": "adverb, comparative", // "rbs": "adverb, superlative", // "adj": "adjective", // "jj": "adjective", // "jjs": "adjective, superlative", // "jjr": "adjective, comparative", // "vb": "verb, base form", // "verb": "verb, base form", // "vbd": "verb, past tense", // "vbn": "verb, past participle", // "vbg": "verb, gerund/present participle", // "vbz": "verb,3rd ps. sing. present", // "vbp": "verb, non-3rd ps. sing. present", // "noun": "noun, singular or masps", // "nn": "noun, singular or masps", // "nns": "noun, plural", // "nnp": "proper noun, singular", // "nnps": "proper noun plural", //} //findElement1 := map[string]string{ // "dt": "determiner", // "ex": "existential there", // "rp": "particle", // "to": "to", // "uh": "interjection", // "ls": "list item marker", // "md": "modal", // "wdt": "wh-determiner", // "pdt": "predeterminer", // "wp": "wh-pronoun", // "pos": "possessive ending", // "wp$": "possessive wh-pronoun", // "in": "preposition/subord. conjunction", // "prp": "personal pronoun", // //"pron": "personal pronoun", // "prp$": "possessive pronoun", // "wrb": "wh-adverb", // "vbz": "verb,3rd ps. sing. present", // "vbp": "verb, non-3rd ps. sing. present", // "rb": "adverb", //} //difine := map[string]map[string]map[string]string{ // "cc": {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}}, // "cd": {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}}, // "rb": {"form":{"advv1":""}, "base":{"adv":""}, "extra":{"adj":""}}, // "adv": {"form":{"advv1":""}, "base":{"adv":""}, "extra":{"adj":""}}, // "rbr": {"form":{"advv2":""}, "base":{"adv":""}, "extra":{"adj":""}}, // "rbs": {"form":{"adjv3":""}, "base":{"adv":""}, "extra":{"adj":""}}, // "adj": {"form":{"adjv1":""}, "base":{"adj":""}, "extra":{"adv":""}}, // "jj": {"form":{"adjv1":""}, "base":{"adj":""}, "extra":{"adv":""}}, // "jjr": {"form":{"adjv2":""}, "base":{"adj":""}, "extra":{"adv":""}}, // "jjs": {"form":{"adjv3":""}, "base":{"adj":""}, "extra":{"adv":""}}, // "vb": {"form":{"verbv1":"", "verbu":"", "verbu1":"", "verbu3":""}, "base":{"verb":""}, "extra":{}}, // "verb": {"form":{"verbv1":"", "verbu":"", "verbu1":"", "verbu3":""}, "base":{"verb":""}, "extra":{}}, // "vbd": {"form":{"verbv2":"", "verbv3":"", "verbu":"", "verbu1":"", "verbu2":"", "verbu3":""}, "base":{"verb":""}, "extra":{}}, // "vbn": {"form":{"verbv2":"", "verbv3":"", "verbu":"", "verbu1":"", "verbu2":"", "verbu3":""}, "base":{"verb":""}, "extra":{}}, // "vbz": {"form":{"verbv5":""}, "base":{"verb":""}, "extra":{}}, // "vbg": {"form":{"verbv4":""}, "base":{"verb":""}, "extra":{}}, // "vbp": {"form":{"verbv1":"", "verbu":"", "verbu1":"", "verbu3":""}, "base":{"verb":""}, "extra":{}}, // "noun": {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}}, // "nn": {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}}, // "nns": {"form":{"nounv2":""}, "base":{"noun":""}, "extra":{}}, // "nnp": {"form":{"nounv1":""}, "base":{"noun":""}, "extra":{}}, // "nnps": {"form":{"nounv2":""}, "base":{"noun":""}, "extra":{}}, // "conj": {"form":{"rp":""}, "base":{"noun":""}, "extra":{}}, //} var jsontetext, text string //frequence := make(map[string]int, 0) //noFound := make(map[string]map[string]int) //foundZ := make(map[string]map[string]int) baseFrequence := [][]string{} scet := 0 arraySentense := regexp.MustCompile(`\r\n`).Split(buf.String(), -1) for k, v := range arraySentense { _ = v if k%2 != 0 && k != 0 { continue } i++ jsontetext, text = arraySentense[k+1], v var zn [][]string jsontetext = strings.ToLower(jsontetext) ffjson.Unmarshal([]byte(jsontetext), &zn) prov := false var insetFrequuence []string result, _ := morfEng.getIshForSentence(text) //fmt.Println(text) //fmt.Println(result) //fmt.Println(arraySentense[k + 1]) //fmt.Println(zn) //break if len(result) == len(zn) { prov = true } for _, v := range zn { _ = v //if _, found := difine[v[1]]["form"][v1.types + v1.forma]; found { // insetFrequuence = append(insetFrequuence, v1.types + v1.forma) // //foundZ = addToMap(foundZ, v[1], "form") // pr = true // //localProv = true // break //} } if prov { scet++ baseFrequence = append(baseFrequence, insetFrequuence) //fmt.Println(jsontetext) } else { //fmt.Printf("%d failed: %d\r\n", len(result),len(zn)) //fmt.Println(text) //fmt.Println(zn) //fmt.Println(result) } if i%10000 == 0 { fmt.Println(i) //break } if i > 100000 { break } } fmt.Printf("norm - %d ", scet) //wodfFile, _ := os.Create("file/englishLemm/baseFrequence") //frub.Encode(baseFrequence, wodfFile) }
func spliteonPredl() { dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", DB_USER, DB_PASSWORD, DB_NAME) db, err := sql.Open("postgres", dbinfo) if err != nil { log.Fatal(err) } defer db.Close() //allName = dirHod("/home/serega/Copy/code/text/book/",, allName) allName := dirHod("/media/serega/c48f1bd8-a939-4630-ab12-9787df1f1fa0/home/paralel/OpenSubtitles2016/xml/txt/", "", []string{}) //fmt.Println(allName) //echoArray(allName) var gref Graf.Graf gref.Init() rubbx = f.NewRubAr() morfEng.InitEngl() buf := bytes.NewBuffer(nil) for k, f1 := range allName { if k < 30002 { // for /media/serega/c48f1bd8-a939-4630-ab12-9787df1f1fa0/home/paralel/OpenSubtitles2016/xml/txt/ continue } buf.Reset() f, _ := os.Open(f1) // Error handling elided for brevity. io.Copy(buf, f) // Error handling elided for brevity. f.Close() //text, _ := html2text.FromString(buf.String()) sentense := gref.PrepereText(buf.String()) //echoArray(sentense) //fmt.Println(f1) //buf.Reset() i := 0 sqlStart := `INSERT INTO sentenses(trans, text, count) VALUES ` buf1 := bytes.NewBuffer(nil) for _, s := range sentense { _ = s if rubbx.IsTru1(s, `.*[-"].*`) { continue } sentenseItem, prov := morfEng.getWordWithoutOmonemii(s) if !prov || len(sentenseItem) < 4 || len(sentenseItem) > 20 { continue } s = rubbx.ReplaseRub(s, `''`, `'`) rows, err := db.Query(`SELECT count(*) as cou FROM sentenses WHERE text = '` + s + `' `) if err != nil { log.Fatal(err) } var cou string rows.Next() rows.Scan(&cou) rows.Close() // //fmt.Println(cou) if cou == `0` && len(sentenseItem) > 4 && len(sentenseItem) < 20 { //fmt.Println(s) buf1.WriteString(`('','` + s + `',` + strconv.Itoa(len(sentenseItem)) + `)`) if i >= 300 { _, err = db.Exec(sqlStart + buf1.String()) if err != nil { fmt.Println(sqlStart + buf1.String()) log.Fatal(err) } i = 0 buf1.Reset() } else { buf1.WriteString(`,`) i++ } } // // // // ////fmt.Println(" ---------- ") } if i != 0 { str := buf1.String() str = string([]byte(str)[:len(str)-1]) _, err = db.Exec(sqlStart + str) if err != nil { fmt.Println(sqlStart + buf1.String()) log.Fatal(err) } } // if k%1 == 0 { fmt.Println(f1) fmt.Println(k) } if k > 10 { //break } ////fmt.Println(f) //break } }
func fillTableWord() { dbinfo := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", DB_USER, DB_PASSWORD, DB_NAME) db, err := sql.Open("postgres", dbinfo) if err != nil { log.Fatal(err) } defer db.Close() rows, err := db.Query(`SELECT id, text, count FROM sentenses ORDER BY count ASC `) if err != nil { log.Fatal(err) } var text string var countElement, id, i int rubbx = f.NewRubAr() var gref Graf.Graf gref.Init() type words struct { result string types string forma string count int } type read struct { text string countElement int id int } word := make(map[string]words, 0) word_use := make(map[string][]int, 0) readderArray := make([]read, 6762636) morfEng.InitEngl() localCou := 0 for rows.Next() { rows.Scan(&id, &text, &countElement) readderArray[localCou] = read{text, countElement, id} localCou++ } for _, rowFrom := range readderArray { i++ id = rowFrom.id text = rowFrom.text countElement = rowFrom.countElement sentenseItem, prov := morfEng.getWordWithoutOmonemii(text) if prov { fraz := []int{} for k, s := range sentenseItem { if len(fraz) > 0 { //fraz-- if fraz[0] == k { fraz = fraz[1:] continue } } if s.types != "number" && s.types != "punctuation" { if zn, found := morfEng.fraz[s.result]; found { k1 := 0 shift := 1 for { fraz2 := false find := false for _, v1 := range zn { if k+shift+k1 < len(sentenseItem) { val := sentenseItem[k+shift+k1].result if len(v1) > k1 && val == v1[k1] { fraz = append(fraz, k+shift+k1) find = true if len(v1)-1 == k1 { for jc := 0; jc < len(v1); jc++ { s.result += " " + sentenseItem[k+jc+shift].result } s.types = "phras" if shift == 1 { s.forma = "" } else { s.forma = "stm" } shift = 2 break } else { fraz2 = true } } } } if !find { } if !fraz2 && shift == 2 { break } if !fraz2 { shift++ } k1++ } } s.result = rubbx.ReplaseRub((s.result), `''`, `'`) keyWord := s.result + s.types + s.forma if _, found := word_use[keyWord]; found { word_use[keyWord] = append(word_use[keyWord], id) } else { word_use[keyWord] = []int{id} } if _, found := word[keyWord]; found { word[keyWord] = words{s.result, s.types, s.forma, word[keyWord].count + 1} } else { word[keyWord] = words{s.result, s.types, s.forma, 1} } } } } if i%1000 == 0 { fmt.Println(i) //break } if i > 100000 { //break } } //for k, v := range word { // if v.forma == "phras"{ // fmt.Printf("%s - %v+\n", k, v) // } //} //fmt.Println(word) //fmt.Println(word_use) i = 0 jj := 0 sqlStart := `INSERT INTO words (word_key, word, forma, type, frequensy) VALUES ` buf := bytes.NewBuffer(nil) for k, s := range word { buf.WriteString(`('` + k + `','` + s.result + `','` + (s.forma) + `','` + (s.types) + `',` + strconv.Itoa(s.count) + `)`) if i >= 300 { _, err = db.Exec(sqlStart + buf.String()) if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } i = 0 buf.Reset() } else { buf.WriteString(`,`) i++ } jj++ if jj%10000 == 0 { fmt.Println(jj) //break } } if i != 0 { str := buf.String() str = string([]byte(str)[:len(str)-1]) _, err = db.Exec(sqlStart + str) if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } } i = 0 jj = 0 sqlStart = `INSERT INTO word_use (id_word, id_sentense) VALUES ` buf = bytes.NewBuffer(nil) var id_wodr int for k, s := range word_use { rows1, _ := db.Query(`SELECT id FROM words WHERE word_key = '` + k + `'`) rows1.Next() rows1.Scan(&id_wodr) rows1.Close() for _, s1 := range s { buf.WriteString(`('` + strconv.Itoa(id_wodr) + `',` + strconv.Itoa(s1) + `)`) if i >= 300 { _, err = db.Exec(sqlStart + buf.String()) if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } i = 0 buf.Reset() } else { buf.WriteString(`,`) i++ } } jj++ if jj%10000 == 0 { fmt.Println(jj) //break } } if i != 0 { str := buf.String() str = string([]byte(str)[:len(str)-1]) _, err = db.Exec(sqlStart + str) if err != nil { fmt.Println(sqlStart + buf.String()) log.Fatal(err) } } }