// xstatsmeta counts, per metadata value, how often an XPath query matches in a
// corpus, and streams the result as HTML/JavaScript to the result iframe, or as
// plain text when a download is requested.
func xstatsmeta(q *Context) {

	var errval error
	var download bool

	var db *dbxml.Db
	var docs, docs2 *dbxml.Docs

	defer func() {
		if docs2 != nil {
			docs2.Close()
		}
		if docs != nil {
			docs.Close()
		}
		if db != nil {
			db.Close()
		}
		if errval != nil {
			updateError(q, errval, !download)
		}
		completedmeta(q, download)
		if !download {
			fmt.Fprintln(q.w, "</body>\n</html>")
		}
	}()

	var rows *sql.Rows

	now := time.Now()
	now2 := time.Now()

	if first(q.r, "d") != "" {
		download = true
	}

	itemselect := first(q.r, "item")

	methode := first(q.r, "mt")
	if methode != "dx" {
		methode = "std"
	}

	if download {
		contentType(q, "text/plain; charset=utf-8")
		q.w.Header().Set("Content-Disposition", "attachment; filename=telling.txt")
		cache(q)
	} else {
		contentType(q, "text/html; charset=utf-8")
		cache(q)
		fmt.Fprint(q.w, `<!DOCTYPE html>
<html>
<head>
<title></title>
<script type="text/javascript"><!--
function setvalue(n) {
    window.parent._fn.setmetaval(n);
}
function setmetavars(idx, lbl, fl, max, ac, bc) {
    window.parent._fn.setmetavars(idx, lbl, fl, max, ac, bc);
}
function setmetalines(idx, a, b) {
    window.parent._fn.setmetalines(idx, a, b);
}
function makemetatable(idx) {
    window.parent._fn.makemetatable(idx);
}
function f(s) {
    window.parent._fn.updatemeta(s);
}
function f1(s) {
    window.parent._fn.updatemetatop(s);
}
function c(i, j) {
    window.parent._fn.countmeta(i, j);
}
//--></script>
</head>
<body>
<script type="text/javascript">
window.parent._fn.startedmeta();
c("0", "0");
</script>
`)
		if ff, ok := q.w.(http.Flusher); ok {
			ff.Flush()
		}
	}

	prefix := getprefix(q)
	if !q.prefixes[prefix] {
		if download {
			fmt.Fprintf(q.w, "Invalid corpus: "+prefix)
		} else {
			updateText(q, "Invalid corpus: "+html.EscapeString(prefix))
		}
		return
	}

	query := first(q.r, "xpath")
	if query == "" {
		if download {
			fmt.Fprintln(q.w, "Query ontbreekt")
		} else {
			updateText(q, "Query ontbreekt")
		}
		return
	}

	if strings.Contains(query, "%") {
		rules := getMacrosRules(q)
		query = macroKY.ReplaceAllStringFunc(query, func(s string) string {
			return rules[s[1:len(s)-1]]
		})
	}

	var owner string
	var nlines uint64
	rows, errval = q.db.Query(fmt.Sprintf("SELECT `owner`,`nline` FROM `%s_info` WHERE `id` = %q", Cfg.Prefix, prefix))
	if logerr(errval) {
		return
	}
	for rows.Next() {
		errval = rows.Scan(&owner, &nlines)
		if logerr(errval) {
			rows.Close()
			return
		}
	}
	errval = rows.Err()
	if logerr(errval) {
		return
	}

	dactfiles := make([]string, 0)
	if strings.Contains(owner, "@") {
		dactfiles = append(dactfiles, filepath.Join(paqudir, "data", prefix, "data.dact"))
	} else {
		rows, errval = q.db.Query(fmt.Sprintf("SELECT `arch` FROM `%s_c_%s_arch` ORDER BY `id`", Cfg.Prefix, prefix))
		if logerr(errval) {
			return
		}
		for rows.Next() {
			var s string
			errval = rows.Scan(&s)
			if logerr(errval) {
				rows.Close()
				return
			}
			if strings.HasSuffix(s, ".dact") {
				dactfiles = append(dactfiles, s)
			}
		}
		errval = rows.Err()
		if logerr(errval) {
			return
		}
	}

	if len(dactfiles) == 0 {
		if download {
			fmt.Fprintln(q.w, "Er zijn geen dact-bestanden voor dit corpus")
		} else {
			updateText(q, "Er zijn geen dact-bestanden voor dit corpus")
		}
		return
	}

	if !q.hasmeta[prefix] {
		if download {
			fmt.Fprintln(q.w, "Geen metadata voor dit corpus")
		} else {
			updateText(q, "Geen metadata voor dit corpus")
		}
		return
	}

	metas := getMeta(q, prefix)
	metat := make(map[string]string)
	metai := make(map[string]int)
	tranges := make(map[string]map[string]int)
	dranges := make(map[string]*drange)
	franges := make(map[string]*frange)
	iranges := make(map[string]*irange)
	for _, m := range metas {
		metat[m.name] = m.mtype
		metai[m.name] = m.id
		if m.mtype == "TEXT" {
			tranges[m.name] = make(map[string]int)
			rows, errval = q.db.Query(fmt.Sprintf(
				"SELECT `idx`,`text` FROM `%s_c_%s_mval` WHERE `id` = %d",
				Cfg.Prefix, prefix, m.id))
			if logerr(errval) {
				return
			}
			for rows.Next() {
				var t string
				var i int
				errval = rows.Scan(&i, &t)
				if logerr(errval) {
					rows.Close()
					return
				}
				tranges[m.name][t] = i
			}
			errval = rows.Err()
			if logerr(errval) {
				return
			}
			continue
		}
		var indexed bool
		var size, dtype, imin, istep int
		var dmin, dmax time.Time
		var fmin, fstep float64
		row := q.db.QueryRow(fmt.Sprintf(
			"SELECT `indexed`, `size`, `dmin`, `dmax`, `dtype`, `fmin`, `fstep`, `imin`, `istep` FROM `%s_c_%s_minf` WHERE `id` = %d",
			Cfg.Prefix, prefix, m.id))
		errval = row.Scan(&indexed, &size, &dmin, &dmax, &dtype, &fmin, &fstep, &imin, &istep)
		if logerr(errval) {
			return
		}
		switch m.mtype {
		case "INT":
			iranges[m.name] = oldIrange(imin, istep, size, indexed)
		case "FLOAT":
			franges[m.name] = oldFrange(fmin, fstep, size)
		case "DATE", "DATETIME":
			dranges[m.name] = oldDrange(dmin, dmax, dtype, indexed)
		}
	} // for _, m := range metas

	queryparts := strings.Split(query, "+|+")

	telling := make(map[string]map[string][3]int)
	for _, m := range metas {
		telling[m.name] = make(map[string][3]int)
	}
	seen := make(map[string]bool)

	var chClose <-chan bool
	if f, ok := q.w.(http.CloseNotifier); ok {
		chClose = f.CloseNotify()
	} else {
		chClose = make(<-chan bool)
	}

	counter := 0
	for _, dactfile := range dactfiles {
		if !download && time.Now().Sub(now2) > 2*time.Second {
			updateCount(q, counter, len(seen))
			now2 = time.Now()
		}
		select {
		case <-chClose:
			logerr(errConnectionClosed)
			return
		default:
		}
		if Cfg.Dactx && methode == "dx" {
			dactfile += "x"
		}
		db, errval = dbxml.Open(dactfile)
		if logerr(errval) {
			return
		}
		var qu *dbxml.Query
		qu, errval = db.Prepare(queryparts[0])
		if logerr(errval) {
			return
		}
		done := make(chan bool, 1)
		interrupted := make(chan bool, 1)
		go func() {
			select {
			case <-chClose:
				interrupted <- true
				logerr(errConnectionClosed)
				qu.Cancel()
			case <-done:
			}
		}()
		docs, errval = qu.Run()
		if logerr(errval) {
			return
		}
		filename := ""
		var seenId map[string]bool
	NEXTDOC:
		for docs.Next() {
			if !download && time.Now().Sub(now2) > 2*time.Second {
				updateCount(q, counter, len(seen))
				now2 = time.Now()
			}
			matches := 0
			if len(queryparts) == 1 {
				name := docs.Name()
				if name != filename {
					filename = name
					seenId = make(map[string]bool)
				}
				alp := Alpino_ds{}
				errval = xml.Unmarshal([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<alpino_ds version="1.3">
`+docs.Match()+`
</alpino_ds>`), &alp)
				if logerr(errval) {
					return
				}
				sid := ""
				if alp.Node0 != nil {
					sid = alp.Node0.Id
					if alp.Node0.OtherId != "" {
						sid = alp.Node0.OtherId
					}
				}
				if !seenId[sid] {
					matches = 1
					seenId[sid] = true
				}
			} else {
				name := docs.Name()
				if name == filename {
					continue
				}
				filename = name
				seenId = make(map[string]bool)
				doctxt := fmt.Sprintf("[dbxml:metadata('dbxml:name')=%q]", filename)
				for i := 1; i < len(queryparts)-1; i++ {
					docs2, errval = db.Query(doctxt + queryparts[i])
					if logerr(errval) {
						return
					}
					if !docs2.Next() {
						docs2 = nil
						continue NEXTDOC
					}
					docs2.Close()
					docs2 = nil
				}
				docs2, errval = db.Query(doctxt + queryparts[len(queryparts)-1])
				if logerr(errval) {
					return
				}
				for docs2.Next() {
					alp := Alpino_ds{}
					errval = xml.Unmarshal([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<alpino_ds version="1.3">
`+docs2.Match()+`
</alpino_ds>`), &alp)
					if logerr(errval) {
						return
					}
					sid := ""
					if alp.Node0 != nil {
						sid = alp.Node0.Id
						if alp.Node0.OtherId != "" {
							sid = alp.Node0.OtherId
						}
					}
					if !seenId[sid] {
						seenId[sid] = true
						matches++
					}
				}
				docs2 = nil
			}
			if matches == 0 {
				continue
			}
			counter += matches

			alpino := Alpino_ds_meta{}
			errval = xml.Unmarshal([]byte(docs.Content()), &alpino)
			if logerr(errval) {
				return
			}
			values := make(map[string][]string)
			for _, m := range metas {
				values[m.name] = make([]string, 0)
			}
			for _, m := range alpino.Meta {
				values[m.Name] = append(values[m.Name], m.Value)
			}
			for _, m := range metas {
				if len(values[m.name]) == 0 {
					values[m.name] = append(values[m.name], "")
				}
			}
			f := dactfile + "\t" + docs.Name()
			c := 0
			if !seen[f] {
				seen[f] = true
				c = 1
			}
			for name := range values {
				for _, value := range values[name] {
					var idx int
					if value == "" {
						idx = 2147483647
					} else {
						switch metat[name] {
						case "TEXT":
							idx = tranges[name][value]
						case "INT":
							v, _ := strconv.Atoi(value)
							value, idx = iranges[name].value(v)
						case "FLOAT":
							v, _ := strconv.ParseFloat(value, 32) // 32 is the same precision as used by MySQL
							value, idx = franges[name].value(v)
						case "DATE":
							v, _ := time.Parse("2006-01-02", value)
							value, idx = dranges[name].value(v)
						case "DATETIME":
							v, _ := time.Parse("2006-01-02 15:04", value)
							value, idx = dranges[name].value(v)
						}
					}
					telling[name][value] = [3]int{idx, telling[name][value][1] + matches, telling[name][value][2] + c}
				}
			}
		} // for docs.Next()
		errval = docs.Error()
		docs = nil
		if logerr(errval) {
			return
		}
		db.Close()
		db = nil
		done <- true
		select {
		case <-interrupted:
			return
		default:
		}
	} // for _, dactfile := range dactfiles

	if !download {
		updateCount(q, counter, len(seen))
	}

	var buf bytes.Buffer

	pow10 := math.Pow10(int(math.Log10(float64(q.lines[prefix])) + .5))
	if pow10 < 10 {
		pow10 = 10
	}
	if !download {
		fmt.Fprintf(q.w, `<script type="text/javascript">
setvalue(%d);
</script>
`, int(pow10))
	} else {
		fmt.Fprintf(q.w, "# items: %d\n# zinnen: %d\n# n = %d\n", counter, len(seen), int(pow10))
	}

	for number, meta := range metas {
		items := make([]*MetaItem, 0, len(telling[meta.name]))
		for name := range telling[meta.name] {
			items = append(items, &MetaItem{
				text:  name,
				idx:   telling[meta.name][name][0],
				count: [2]int{telling[meta.name][name][1], telling[meta.name][name][2]},
			})
		}

		rows, errval = q.db.Query(fmt.Sprintf(
			"SELECT `idx`, `text`, `n` FROM `%s_c_%s_mval` WHERE `id`=%d ORDER BY `idx`",
			Cfg.Prefix, prefix, metai[meta.name]))
		if logerr(errval) {
			return
		}
		nn := make(map[int]int)
		values := make([]StructIS, 0)
		for rows.Next() {
			var idx, n int
			var txt string
			errval = rows.Scan(&idx, &txt, &n)
			if logerr(errval) {
				rows.Close()
				return
			}
			nn[idx] = n
			values = append(values, StructIS{idx, txt})
		}
		errval = rows.Err()
		if logerr(errval) {
			return
		}

		if !download {
			var hide string
			if itemselect != meta.name {
				hide = " hide"
			}
			var hex string
			for _, c := range meta.name {
				hex += fmt.Sprintf("%04x", uint16(c))
			}
			fmt.Fprintf(&buf, `
<div class="metasub%s" id="meta%s">
<p>
<b>%s</b> — <a href="javascript:void(0)" onclick="javascript:metahelp()">toelichting bij tabel</a>
<p>
<table>
<tr>
<td>per item:
<table class="right" id="meta%da">
</table>
<td class="next">per zin:
<table class="right" id="meta%db">
</table>
</table>
</div>
`, hide, hex, html.EscapeString(meta.name), number, number)
			updateText(q, buf.String())
			buf.Reset()

			fl := "right"
			max := 99999
			ac := 1
			bc := 2
			if meta.mtype == "TEXT" {
				fl = "left"
				max = METAMAX
				ac = 0
				bc = 0
			}
			fmt.Fprintf(q.w, `<script type="text/javascript">
setmetavars(%d,"%s","%s",%d,%d,%d);
setmetalines(%d`, number, meta.value, fl, max, ac, bc, number)
		}

		if metat[meta.name] != "TEXT" {
			sort.Sort(MetaItems(items))
		}

		for run := 0; run < 2; run++ {
			if !download {
				fmt.Fprint(q.w, ",[")
			}
			if metat[meta.name] == "TEXT" {
				if run == 0 {
					sort.Sort(MetaItems0(items))
				} else {
					sort.Sort(MetaItems1(items))
				}
			}
			lines := make([]Statline, 0)
			if download {
				if run == 0 {
					fmt.Fprintln(q.w, "# "+meta.name+" per item\t")
				} else {
					fmt.Fprintln(q.w, "# "+meta.name+" per zin\t")
				}
			}
			select {
			case <-chClose:
				logerr(errConnectionClosed)
				return
			default:
			}
			seen := make(map[int]*Statline)
			for _, item := range items {
				lines = append(lines, Statline{item.text, item.count[run], nn[item.idx], item.idx})
				seen[item.idx] = &lines[len(lines)-1]
			}
			if download || (meta.mtype != "TEXT" && len(seen)*NEEDALL > len(values)) {
				// add missing values (count==0)
				if meta.mtype == "TEXT" {
					for _, v := range values {
						if _, ok := seen[v.i]; !ok {
							lines = append(lines, Statline{v.s, 0, 1, v.i})
						}
					}
				} else {
					lines2 := make([]Statline, len(values))
					for i, v := range values {
						if s, ok := seen[v.i]; ok {
							lines2[i] = *s
						} else {
							lines2[i] = Statline{v.s, 0, 1, v.i}
						}
					}
					lines = lines2
				}
			}
			p := "\n"
			for _, line := range lines {
				if download {
					if run == 1 {
						v := int(.5 + pow10*float64(line.i)/float64(line.n))
						fmt.Fprintf(q.w, "%d\t%d\t%s\n", line.i, v, line.s)
					} else {
						fmt.Fprintf(q.w, "%d\t%s\n", line.i, line.s)
					}
				} else {
					fmt.Fprintf(q.w, "%s[%d,", p, line.i)
					p = ",\n"
					if run == 1 {
						v := int(.5 + pow10*float64(line.i)/float64(line.n))
						fmt.Fprintf(q.w, "%d,", v)
					}
					fmt.Fprintf(q.w, "%d,\"%s\"]", line.idx, line.s)
				}
			} // for _, line := range lines
			if !download {
				fmt.Fprintln(q.w, "]")
			}
		} // for run := 0; run < 2; run++
		if !download {
			fmt.Fprintf(q.w, `);
makemetatable(%d);
//--></script>
`, number)
		}
	} // for number, meta := range metas

	if !download {
		fmt.Fprintf(&buf, "<hr>tijd: %s\n<p>\n<a href=\"xstatsmeta?%s&d=1\">download</a>\n",
			tijd(time.Now().Sub(now)),
			strings.Replace(q.r.URL.RawQuery, "&", "&amp;", -1))
		updateText(q, buf.String())
		buf.Reset()
	}
}
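// The sketch below is illustrative only and is not part of the original PaQu
// code (the function name is made up). It shows the query convention shared by
// the handlers in this file: a combined query consists of parts separated by
// "+|+"; the first part is run against the whole dact file, and each following
// part is re-run per matching document, prefixed with a dbxml filter on the
// document name.
func exampleSplitQuery(fullquery, filename string) (first string, perDoc []string) {
	parts := strings.Split(fullquery, "+|+")
	doctxt := fmt.Sprintf("[dbxml:metadata('dbxml:name')=%q]", filename)
	perDoc = make([]string, 0, len(parts)-1)
	for _, p := range parts[1:] {
		perDoc = append(perDoc, doctxt+p)
	}
	return parts[0], perDoc
}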
// TAB: xpath
//
// xpath handles the "xpath" tab: it runs an XPath query over the dact files of
// a corpus, shows one page of matching sentences, and writes the forms that
// link to the statistics pages.
func xpath(q *Context) {
	prefix := getprefix(q)
	if !q.prefixes[prefix] {
		http.Error(q.w, "Invalid corpus: "+prefix, http.StatusPreconditionFailed)
		return
	}

	xpathmax := getxpathmax(q)

	methode := first(q.r, "mt")
	if methode != "dx" {
		methode = "std"
	}

	var errval error
	var db *dbxml.Db
	var docs *dbxml.Docs
	var loading bool
	defer func() {
		if docs != nil {
			docs.Close()
		}
		if db != nil {
			db.Close()
		}
		if loading {
			clearLoading(q.w)
		}
		if errval != nil {
			fmt.Fprintf(q.w, "<div class=\"error\">FOUT: %s</div>\n", html.EscapeString(errval.Error()))
		}
		html_footer(q)
	}()

	// HTML output: start of the page
	writeHead(q, "", 2)
	html_xpath_header(q)

	// HTML output: the form
	// The return value is true if a query was defined
	has_query := html_xpath_form(q, xpathmax)

	// If no query was defined: HTML output of a short help text, end of the page, and exit
	if !has_query {
		html_xpath_uitleg(q)
		return
	}

	var chClose <-chan bool
	if f, ok := q.w.(http.CloseNotifier); ok {
		chClose = f.CloseNotify()
	} else {
		chClose = make(<-chan bool)
	}

	_, errval = q.db.Exec(fmt.Sprintf("UPDATE `%s_info` SET `active` = NOW() WHERE `id` = %q", Cfg.Prefix, prefix))
	if logerr(errval) {
		return
	}

	offset := 0
	o, err := strconv.Atoi(first(q.r, "offset"))
	if err == nil {
		offset = o
	}
	if offset < 0 {
		offset = 0
	}

	fmt.Fprintln(q.w, "<hr>")

	now := time.Now()

	var owner string
	var nlines uint64
	var rows *sql.Rows
	rows, errval = q.db.Query(fmt.Sprintf("SELECT `owner`,`nline` FROM `%s_info` WHERE `id` = %q", Cfg.Prefix, prefix))
	if logerr(errval) {
		return
	}
	for rows.Next() {
		errval = rows.Scan(&owner, &nlines)
		if logerr(errval) {
			rows.Close()
			return
		}
	}
	errval = rows.Err()
	if logerr(errval) {
		return
	}

	dactfiles := make([]string, 0)
	global := false
	if strings.Contains(owner, "@") {
		dactfiles = append(dactfiles, filepath.Join(paqudir, "data", prefix, "data.dact"))
	} else {
		global = true
		rows, errval = q.db.Query(fmt.Sprintf("SELECT `arch` FROM `%s_c_%s_arch` ORDER BY `id`", Cfg.Prefix, prefix))
		if logerr(errval) {
			return
		}
		for rows.Next() {
			var s string
			errval = rows.Scan(&s)
			if logerr(errval) {
				rows.Close()
				return
			}
			if strings.HasSuffix(s, ".dact") {
				dactfiles = append(dactfiles, s)
			}
		}
		errval = rows.Err()
		if logerr(errval) {
			return
		}
	}

	if len(dactfiles) == 0 {
		fmt.Fprintln(q.w, "Er zijn geen dact-bestanden voor dit corpus")
		return
	}

	fmt.Fprintf(q.w, "<ol start=\"%d\" id=\"ol\" class=\"xpath\">\n</ol>\n", offset+1)
	fmt.Fprintln(q.w, "<div id=\"loading\"><img src=\"busy.gif\" alt=\"[bezig]\"> <span></span></div>")
	if ff, ok := q.w.(http.Flusher); ok {
		ff.Flush()
	}
	loading = true

	found := false
	curno := 0
	filename := ""
	curdac := ""
	xmlall := ""
	xmlparts := make([]string, 0)

	query := first(q.r, "xpath")
	fullquery := query
	if strings.Contains(query, "%") {
		rules := getMacrosRules(q)
		fullquery = macroKY.ReplaceAllStringFunc(query, func(s string) string {
			return rules[s[1:len(s)-1]]
		})
	}
	queryparts := strings.Split(fullquery, "+|+")

	var seen uint64
	for i, dactfile := range dactfiles {
		select {
		case <-chClose:
			logerr(errConnectionClosed)
			return
		default:
		}
		if Cfg.Dactx && methode == "dx" {
			if i == 0 {
				if _, err := os.Stat(dactfile + "x"); err != nil {
					methode = "std"
					fmt.Fprintln(q.w, `<script type="text/javascript"><!--
$('#ol').before('<div class="warning">Geen geëxpandeerde indexnodes beschikbaar voor dit corpus.<br>De standaardmethode wordt gebruikt.</div>');
//--></script>`)
				}
			}
			if methode == "dx" {
				dactfile += "x"
			}
		}
		if curno > offset+xpathmax {
			break
		}
		if seen > 0 {
			fmt.Fprintf(q.w, `<script type="text/javascript"><!--
$('#loading span').html('%.1f%%');
//--></script>
`, float64(seen)*100/float64(nlines))
			if ff, ok := q.w.(http.Flusher); ok {
				ff.Flush()
			}
		}
		errval = bugtest(dactfile, queryparts[0])
		if errval != nil {
			return
		}
		db, errval = dbxml.Open(dactfile)
		if logerr(errval) {
			return
		}
		var qu *dbxml.Query
		qu, errval = db.Prepare(queryparts[0])
		if logerr(errval) {
			return
		}
		done := make(chan bool, 1)
		interrupted := make(chan bool, 1)
		go func() {
			select {
			case <-chClose:
				interrupted <- true
				logerr(errConnectionClosed)
				qu.Cancel()
			case <-done:
			}
		}()
		docs, errval = qu.Run()
		if logerr(errval) {
			done <- true
			return
		}
		filename = ""
	NEXTDOC:
		for docs.Next() {
			name := docs.Name()
			newdoc := false
			if name != filename {
				if found && curno > offset && curno <= offset+xpathmax {
					found = false
					xpath_result(q, curno, curdac, filename, xmlall, xmlparts, prefix, global)
					xmlparts = xmlparts[0:0]
				}
				if len(queryparts) == 1 {
					curno++
					if curno > offset+xpathmax {
						docs.Close()
						continue
					}
				}
				curdac = dactfile
				filename = name
				newdoc = true
			}
			if len(queryparts) == 1 {
				found = true
				if curno > offset+xpathmax {
					docs.Close()
				} else {
					if curno > offset && curno <= offset+xpathmax {
						xmlall = docs.Content()
						xmlparts = append(xmlparts, docs.Match())
					}
				}
			} else if newdoc {
				newdoc = false
				doctxt := fmt.Sprintf("[dbxml:metadata('dbxml:name')=%q]", name)
				var docs2 *dbxml.Docs
				for i := 1; i < len(queryparts)-1; i++ {
					docs2, errval = db.Query(doctxt + queryparts[i])
					if logerr(errval) {
						done <- true
						return
					}
					if !docs2.Next() {
						continue NEXTDOC
					}
					docs2.Close()
				}
				docs2, errval = db.Query(doctxt + queryparts[len(queryparts)-1])
				if logerr(errval) {
					done <- true
					return
				}
				found = false
				for docs2.Next() {
					if !found {
						found = true
						curno++
						if curno > offset+xpathmax {
							docs.Close()
						}
					}
					if curno > offset && curno <= offset+xpathmax {
						xmlall = docs2.Content()
						xmlparts = append(xmlparts, docs2.Match())
					} else {
						docs2.Close()
					}
				}
			}
		} // for docs.Next()
		errval = docs.Error()
		docs = nil
		if logerr(errval) {
			done <- true
			return
		}
		if len(dactfiles) > 1 {
			if n, err := db.Size(); err == nil {
				seen += n
			}
		}
		db.Close()
		db = nil
		done <- true
		select {
		case <-interrupted:
			return
		default:
		}
		if found && curno > offset && curno <= offset+xpathmax {
			found = false
			xpath_result(q, curno, curdac, filename, xmlall, xmlparts, prefix, global)
			xmlparts = xmlparts[0:0]
		}
		if curno > offset+xpathmax {
			break
		}
	} // for _, dactfile := range dactfiles

	clearLoading(q.w)
	loading = false

	if curno == 0 {
		fmt.Fprintf(q.w, "geen match gevonden")
	}

	// Links to previous and next pages of results
	qs := "xpath=" + urlencode(query) + "&mt=" + methode
	if offset > 0 || curno > offset+xpathmax {
		if offset > 0 {
			fmt.Fprintf(q.w, "<a href=\"xpath?%s&offset=%d\">vorige</a>", qs, offset-xpathmax)
		} else {
			fmt.Fprint(q.w, "vorige")
		}
		fmt.Fprint(q.w, " | ")
		if curno > offset+xpathmax {
			fmt.Fprintf(q.w, "<a href=\"xpath?%s&offset=%d\">volgende</a>", qs, offset+xpathmax)
		} else {
			fmt.Fprint(q.w, "volgende")
		}
	}

	if q.auth && curno > 0 {
		fmt.Fprintf(q.w, `<p>
<form action="xsavez" method="POST" accept-charset="UTF-8" enctype="multipart/form-data">
<input type="hidden" name="xpath" value="%s">
<input type="hidden" name="db" value="%s">
<input type="hidden" name="mt" value="%s">
<input type="submit" value="nieuw corpus maken op basis van deze zoekopdracht">
</form>
`, html.EscapeString(first(q.r, "xpath")), html.EscapeString(prefix), methode)
	}

	fmt.Fprintln(q.w, "<hr><small>tijd:", tijd(time.Now().Sub(now)), "</small>")

	if curno == 0 {
		return
	}

	fmt.Fprintln(q.w, "<hr>")

	var metas []MetaType
	if q.hasmeta[prefix] {
		metas = getMeta(q, prefix)
	}

	// Links to statistics
	fmt.Fprintf(q.w, `
<p>
<div id="xstats">
<form action="javascript:$.fn.xpathstats()" name="xstatsform">
<input type="hidden" name="xpath" value="%s">
<input type="hidden" name="db" value="%s">
<input type="hidden" name="mt" value="%s">
Selecteer één tot vijf attributen:
<p>
`, html.EscapeString(query), html.EscapeString(prefix), methode)
	for i := 1; i <= 5; i++ {
		fmt.Fprintf(q.w, "<select name=\"attr%d\">\n<option value=\"\">--</option>\n", i)
		if q.hasmeta[prefix] {
			fmt.Fprintln(q.w, "<optgroup label=\"— metadata —\">")
			for _, m := range metas {
				fmt.Fprintf(q.w, "<option value=\":%s\">%s</option>\n", html.EscapeString(m.name), html.EscapeString(m.name))
			}
			fmt.Fprintln(q.w, "</optgroup>")
			fmt.Fprintln(q.w, "<optgroup label=\"— attributen —\">")
		}
		for _, s := range NodeTags {
			fmt.Fprintf(q.w, "<option>%s</option>\n", s)
		}
		if q.hasmeta[prefix] {
			fmt.Fprintln(q.w, "</optgroup>")
		}
		fmt.Fprintln(q.w, "</select>")
	}
	fmt.Fprint(q.w, `
<p>
<input type="submit" value="doe telling">
</form>
<p>
<iframe src="leeg.html" id="xframe" class="hide"></iframe>
<div id="result" class="hide"></div>
`)
	if q.hasmeta[prefix] {
		metahelp(q)
		fmt.Fprintln(q.w, `<p>
<div id="statsmeta" class="hide">
<div id="innermetatop"></div>
<div id="metacount" class="hide">
<table>
<tr><td>items:<td class="right" id="metacount1">
<tr><td>zinnen:<td class="right" id="metacount2">
</table>
</div>
<div id="innermeta"></div>
<img src="busy.gif" id="busymeta" class="hide" alt="aan het werk..." style="margin-top:1em">
</div>`)
	}
	fmt.Fprintln(q.w, "</div>")
}
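// A minimal sketch of the dbxml access pattern used by the handlers above
// (Open, Prepare, Run, then Next/Name/Content per document). It is for
// illustration only, is not called anywhere, and leaves out the cancellation
// and progress reporting that the real handlers need.
func exampleDbxmlScan(dactfile, xpathquery string) ([]string, error) {
	db, err := dbxml.Open(dactfile)
	if err != nil {
		return nil, err
	}
	defer db.Close()

	qu, err := db.Prepare(xpathquery)
	if err != nil {
		return nil, err
	}
	docs, err := qu.Run()
	if err != nil {
		return nil, err
	}
	defer docs.Close()

	// Collect the names of all documents with at least one match.
	names := make([]string, 0)
	seen := make(map[string]bool)
	for docs.Next() {
		if name := docs.Name(); !seen[name] {
			seen[name] = true
			names = append(names, name)
		}
	}
	return names, docs.Error()
}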
func unpackDact(data, xmldir, dact, stderr string, chKill chan bool) (tokens, nline int, err error) {
	os.Mkdir(xmldir, 0777)

	dc, err := dbxml.Open(dact)
	if err != nil {
		return 0, 0, fmt.Errorf(
			"Openen mislukt. PaQu kan geen dact-bestanden lezen die gemaakt zijn met DbXml nieuwer dan versie %d",
			dbxml_version_major())
	}
	defer dc.Close()

	var dbx *dbxml.Db
	if Cfg.Dactx {
		os.Remove(dact + "x")
		dbx, err = dbxml.Open(dact + "x")
		if err != nil {
			return 0, 0, err
		}
		defer dbx.Close()
	}

	docs, err := dc.All()
	if err != nil {
		return 0, 0, fmt.Errorf("Lezen dact-bestand: %s", err)
	}
	defer docs.Close()

	fperr, err := os.Create(stderr)
	if err != nil {
		return 0, 0, err
	}
	defer fperr.Close()

	fplines, err := os.Create(data + ".lines")
	if err != nil {
		return 0, 0, err
	}
	defer fplines.Close()

	tokens = 0
	nline = 0
	nd := -1
	sdir := ""
	for docs.Next() {
		select {
		case <-chGlobalExit:
			return 0, 0, errGlobalExit
		case <-chKill:
			return 0, 0, errKilled
		default:
		}
		nline++
		if nline%10000 == 1 {
			nd++
			sdir = fmt.Sprintf("%04d", nd)
			os.Mkdir(filepath.Join(xmldir, sdir), 0777)
		}
		name := docs.Name()
		if strings.HasSuffix(strings.ToLower(name), ".xml") {
			name = name[:len(name)-4]
		}
		encname := encode_filename(name)
		data := []byte(docs.Content())
		fp, err := os.Create(filepath.Join(xmldir, sdir, encname+".xml"))
		if err != nil {
			return 0, 0, err
		}
		_, err = fp.Write(data)
		fp.Close()
		if err != nil {
			return 0, 0, err
		}
		alpino := Alpino_ds_no_node{}
		err = xml.Unmarshal(data, &alpino)
		if err != nil {
			return 0, 0, fmt.Errorf("Parsen van %q uit dact-bestand: %s", docs.Name(), err)
		}
		tokens += len(strings.Fields(alpino.Sentence))
		fmt.Fprintf(fplines, "%s|%s\n", name, strings.TrimSpace(alpino.Sentence))
		for _, c := range alpino.Comments {
			if strings.HasPrefix(c.Comment, "Q#") {
				a := strings.SplitN(c.Comment, "|", 2)
				if len(a) == 2 {
					fmt.Fprintf(fperr, "Q#%s|%s\n", name, strings.TrimSpace(a[1]))
					break
				}
			}
		}
		if Cfg.Dactx {
			content, err := dactExpand(data)
			if err != nil {
				return 0, 0, err
			}
			if content == "" {
				content = string(data)
			}
			err = dbx.PutXml(docs.Name(), content, false)
			if err != nil {
				return 0, 0, err
			}
		}
	}
	return tokens, nline, nil
}
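// Illustration only (hypothetical helper, not in the original source): the
// directory sharding used by unpackDact above, which starts a new numbered
// subdirectory ("0000", "0001", ...) after every 10000 sentences.
func exampleShardDir(nline int) string {
	return fmt.Sprintf("%04d", (nline-1)/10000)
}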
func xsavez2(q *Context) {
	var fpz, fpgz *os.File
	var z *zip.Writer
	var gz *gzip.Reader
	var dact *dbxml.Db
	var docs *dbxml.Docs
	var dirname, fulldirname string
	var okall bool
	defer func() {
		if z != nil {
			z.Close()
		}
		if fpz != nil {
			fpz.Close()
		}
		if gz != nil {
			gz.Close()
		}
		if fpgz != nil {
			fpgz.Close()
		}
		if docs != nil {
			docs.Close()
		}
		if dact != nil {
			dact.Close()
		}
		if !okall {
			os.RemoveAll(fulldirname)
			q.db.Exec(fmt.Sprintf("DELETE FROM `%s_info` WHERE `id` = %q", Cfg.Prefix, dirname))
		}
	}()

	protected := 0

	if !q.auth {
		http.Error(q.w, "Je bent niet ingelogd", http.StatusUnauthorized)
		return
	}

	methode := firstf(q.form, "mt")
	if methode != "dx" {
		methode = "std"
	}

	corpora := make([]string, 0, len(q.form.Value["db"]))
	for _, c := range q.form.Value["db"] {
		if s := strings.TrimSpace(c); s != "" {
			corpora = append(corpora, s)
		}
	}
	for _, corpus := range corpora {
		if !q.prefixes[corpus] {
			http.Error(q.w, "Geen toegang tot corpus", http.StatusUnauthorized)
			return
		}
		if q.protected[corpus] || !q.myprefixes[corpus] {
			protected = 1
		}
	}
	if len(corpora) == 0 {
		writeHtml(q, "Fout", "Geen corpora gekozen")
		return
	}

	xpath := firstf(q.form, "xpath")
	if xpath == "" {
		writeHtml(q, "Fout", "Zoekterm ontbreekt")
		return
	}

	title := maxtitlelen(firstf(q.form, "title"))
	if title == "" {
		writeHtml(q, "Fout", "Titel ontbreekt")
		return
	}

	maxdup, _ := strconv.Atoi(firstf(q.form, "maxdup"))
	if maxdup < 1 || maxdup > Cfg.Maxdup {
		maxdup = Cfg.Maxdup
	}

	dirname, fulldirname, ok := beginNewCorpus(q, q.db, title, hErr)
	if !ok {
		return
	}

	fpz, err := os.Create(fulldirname + "/data")
	if hErr(q, err) {
		fpz = nil
		return
	}
	z = zip.NewWriter(fpz)

	linecount := 0
	for _, prefix := range corpora {
		if linecount == maxdup && maxdup > 0 {
			break
		}

		global, ok := isGlobal(q, prefix)
		if !ok {
			return
		}
		pathlen, ok := getPathLen(q, prefix, global, true)
		if !ok {
			return
		}

		dactfiles := make([]string, 0)
		if !global {
			dactfiles = append(dactfiles, fmt.Sprintf("%s/data/%s/data.dact", paqudir, prefix))
		} else {
			rows, err := q.db.Query(fmt.Sprintf("SELECT `arch` FROM `%s_c_%s_arch` ORDER BY `id`", Cfg.Prefix, prefix))
			if hErr(q, err) {
				return
			}
			for rows.Next() {
				var s string
				if hErr(q, rows.Scan(&s)) {
					rows.Close()
					return
				}
				if strings.HasSuffix(s, ".dact") {
					dactfiles = append(dactfiles, s)
				}
			}
			if hErr(q, rows.Err()) {
				return
			}
		}

		fullquery := xpath
		if strings.Contains(xpath, "%") {
			rules := getMacrosRules(q)
			fullquery = macroKY.ReplaceAllStringFunc(xpath, func(s string) string {
				return rules[s[1:len(s)-1]]
			})
		}
		queryparts := strings.Split(fullquery, "+|+")

		for _, dactfile := range dactfiles {
			if linecount == maxdup && maxdup > 0 {
				break
			}
			if Cfg.Dactx && methode == "dx" {
				dactfile += "x"
			}
			var data []byte
			dact, err = dbxml.Open(dactfile)
			if hErr(q, err) {
				dact = nil
				return
			}
			qu, err := dact.Prepare(queryparts[0])
			if hErr(q, err) {
				return
			}
			docs, err = qu.Run()
			if hErr(q, err) {
				docs = nil
				return
			}
			seen := make(map[string]bool)
		NEXTDOC:
			for docs.Next() {
				if linecount == maxdup && maxdup > 0 {
					break
				}
				filename := docs.Name()
				if seen[filename] {
					continue
				}
				seen[filename] = true
				found := false
				if len(queryparts) == 1 {
					found = true
					data = []byte(docs.Content())
				} else {
					doctxt := fmt.Sprintf("[dbxml:metadata('dbxml:name')=%q]", filename)
					for i := 1; i < len(queryparts)-1; i++ {
						docs2, err := dact.Query(doctxt + queryparts[i])
						if hErr(q, err) {
							return
						}
						if !docs2.Next() {
							continue NEXTDOC
						}
						docs2.Close()
					}
					docs2, err := dact.Query(doctxt + queryparts[len(queryparts)-1])
					if hErr(q, err) {
						return
					}
					found = false
					if docs2.Next() {
						found = true
						data = []byte(docs2.Content())
						docs2.Close()
					}
				}
				if !found {
					continue
				}
				newfile := filename
				if global {
					newfile = dactfile[pathlen:len(dactfile)-5] + "::" + filename
				}
				if len(corpora) > 1 {
					newfile = prefix + "/" + newfile
					data = xmlSetSource(data, prefix)
				}
				f, err := z.Create(newfile)
				if hErr(q, err) {
					return
				}
				if methode == "dx" {
					data, err = unexpandDact(data)
					if hErr(q, err) {
						return
					}
				}
				_, err = f.Write(data)
				if hErr(q, err) {
					return
				}
				linecount++
			} // for docs.Next()
			err = docs.Error()
			docs = nil
			if hErr(q, err) {
				return
			}
			dact.Close()
			dact = nil
		} // for range dactfiles
	} // for range corpora

	err = z.Close()
	z = nil
	if hErr(q, err) {
		return
	}
	fpz.Close()
	fpz = nil

	s := "xmlzip-d"
	if protected != 0 {
		s = "xmlzip-p"
	}
	newCorpus(q, q.db, dirname, title, s, protected, hErr, true)

	okall = true
}
func makeDact(dact, xml string, stripchar string, chKill chan bool) error {
	files, err := filenames2(xml, false)
	if err != nil {
		return err
	}

	os.Remove(dact)
	db, err := dbxml.Open(dact)
	if err != nil {
		return err
	}
	defer db.Close()

	var dbx *dbxml.Db
	if Cfg.Dactx {
		os.Remove(dact + "x")
		dbx, err = dbxml.Open(dact + "x")
		if err != nil {
			return err
		}
		defer dbx.Close()
	}

	for _, name := range files {
		select {
		case <-chGlobalExit:
			return errGlobalExit
		case <-chKill:
			return errKilled
		default:
		}
		data, err := ioutil.ReadFile(filepath.Join(xml, name))
		if err != nil {
			return err
		}
		name = decode_filename(name)
		if stripchar != "" {
			name = name[1+strings.Index(name, stripchar):]
		}
		err = db.PutXml(name, string(data), false)
		if err != nil {
			return err
		}
		if Cfg.Dactx {
			content, err := dactExpand(data)
			if err != nil {
				return err
			}
			if content == "" {
				content = string(data)
			}
			err = dbx.PutXml(name, content, false)
			if err != nil {
				return err
			}
		}
	}
	return nil
}
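// Minimal usage sketch (assumption, not part of the original code): makeDact
// packs every XML file below an xml directory into a new dact file, and
// unpackDact does the reverse while counting tokens and sentences. The paths
// are made up, error handling is omitted, and chKill would normally be
// supplied by the job runner.
func exampleDactRoundTrip() {
	chKill := make(chan bool)
	_ = makeDact("/tmp/example.dact", "/tmp/xml", "", chKill)
	_, _, _ = unpackDact("/tmp/data", "/tmp/xml2", "/tmp/example.dact", "/tmp/stderr.txt", chKill)
}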