func browserr(q *Context) { if !q.auth { http.Error(q.w, "Je bent niet ingelogd", http.StatusUnauthorized) return } db := first(q.r, "db") if !q.myprefixes[db] { http.Error(q.w, "Dat is niet je corpus", http.StatusUnauthorized) return } lbl := first(q.r, "s") coded := false if lbl == "" { http.Error(q.w, "Label ontbreekt", http.StatusPreconditionFailed) return } if !reBasicName.MatchString(lbl) { lbl = encode_filename(lbl) coded = true } contentType(q, "text/plain; charset=utf-8") datadir := filepath.Join(paqudir, "data", db) fp, err := os.Open(filepath.Join(datadir, "stderr.txt.gz")) if err != nil { sysErr(err) fmt.Fprintln(q.w, err) return } defer fp.Close() gz, err := gzip.NewReader(fp) if err != nil { sysErr(err) fmt.Fprintln(q.w, err) return } defer gz.Close() rd := util.NewReader(gz) state := 0 for { line, err := rd.ReadLineString() if err == io.EOF { break } if err != nil { sysErr(err) fmt.Fprintln(q.w, err) return } if state == 0 { if strings.HasPrefix(line, "****") { f := strings.Fields(line) if len(f) > 2 && f[1] == "parsing" { if coded { if f[2][5:] == lbl { state = 1 } else if len(f[2]) > 10 && f[2][9] == '-' && f[2][10:] == lbl && reCodedPrefix.MatchString(f[2]) { state = 1 } } else { if f[2] == lbl { state = 1 } } if state == 1 { fmt.Fprintln(q.w, line) } } } } else { fmt.Fprintln(q.w, line) if strings.HasPrefix(line, "****") { break } } } }
// TAB: browse (zinnen) func browse(q *Context) { if !q.auth { http.Error(q.w, "Je bent niet ingelogd", http.StatusUnauthorized) return } id := first(q.r, "id") if !q.myprefixes[id] { http.Error(q.w, "Dat is niet je corpus", http.StatusUnauthorized) return } datadir := filepath.Join(paqudir, "data", id) fp, err := os.Open(filepath.Join(datadir, "summary.txt.gz")) if err != nil { http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } defer fp.Close() gz, err := gzip.NewReader(fp) if err != nil { http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } defer gz.Close() rd := util.NewReader(gz) line, _ := rd.ReadLineString() a := strings.SplitN(line, "\t", 3) nline, _ := strconv.Atoi(a[0]) nerr, _ := strconv.Atoi(a[1]) // HTML-uitvoer van begin van de pagina writeHead(q, "Overzicht", 0) fmt.Fprintf(q.w, ` <script type="text/javascript"><!-- function formclear(f) { f.lbl.value = ""; } //--></script> Corpus: <b>%s</b> <p> Bron: %s <p> `, q.desc[id], a[2]) if nerr > 0 { fmt.Fprintf(q.w, ` Er waren problemen met %d van de %d zinnen: <p> <table class="corpora"> <tr><th>Label<th>Fout<th>Zin</tr> `, nerr, nline) lineno := 0 for { lineno++ line, e := rd.ReadLineString() if e != nil { break } a := strings.SplitN(line, "\t", 4) eo := "even" if lineno%2 == 1 { eo = "odd" } if lineno == 1 { eo += " first" } if lineno == nerr { eo += " last" } fmt.Fprintf(q.w, "<tr class=\"%s\"><td class=\"odd first\"><b><a href=\"browserr?db=%s&s=%s\" target=\"_blank\">%s</a></b><td class=\"even\">%s | %s<td class=\"odd\">%s", eo, id, html.EscapeString(a[0]), html.EscapeString(a[0]), html.EscapeString(a[1]), a[2], html.EscapeString(a[3])) } fmt.Fprint(q.w, "</table>\n<p>\n") } // HTML-uitvoer van het formulier fmt.Fprintf(q.w, ` <form action="browse" method="get" accept-charset="utf-8"> <input type="hidden" name="id" value="%s"> Label: <input type="text" name="lbl" size="20" value="%s"> <input type="submit" value="Zoeken"> <input type="button" value="Wissen" onClick="javascript:formclear(form)"> </form> `, id, html.EscapeString(first(q.r, "lbl"))) // Maximaal 2*ZINMAX matchende xml-bestanden opvragen offset := 0 o, err := strconv.Atoi(first(q.r, "offset")) if err == nil { offset = o } lbl := first(q.r, "lbl") query := "" if lbl != "" { query = fmt.Sprintf("WHERE `lbl` LIKE %q", lbl) } rows, err := q.db.Query( fmt.Sprintf( "SELECT `arch`,`file`,`sent`,`lbl` FROM `%s_c_%s_sent` %s LIMIT %d,%d", Cfg.Prefix, id, query, offset, 2*ZINMAX)) if doErr(q, err) { return } zinnen := make([]ZinArchFile, 0, 2*ZINMAX) nzin := 0 for rows.Next() { nzin++ var arch, file int var sent, lbl string err := rows.Scan(&arch, &file, &sent, &lbl) if doErr(q, err) { return } zinnen = append(zinnen, ZinArchFile{zin: sent, arch: arch, file: file}) } for i, zin := range zinnen { rows, err := q.db.Query( fmt.Sprintf( "SELECT `lbl` FROM `%s_c_%s_sent` WHERE `file` = %d AND `arch` = %d", Cfg.Prefix, id, zin.file, zin.arch)) if err == nil && rows.Next() { var lbl string rows.Scan(&lbl) rows.Close() zinnen[i].lbl = lbl } else { doErr(q, fmt.Errorf("Database error")) } } fmt.Fprintln(q.w, "<p>\n<dl>") for _, zin := range zinnen { fmt.Fprintf(q.w, "<dt><a href=\"tree?db=%s&arch=%d&file=%d\">%s</a>\n<dd>%s\n", id, zin.arch, zin.file, html.EscapeString(zin.lbl), html.EscapeString(zin.zin)) if !q.hasmeta[id] { continue } rows, err := q.db.Query(fmt.Sprintf( "SELECT `idx`,`type`,`name`,`tval`,`ival`,`fval`,`dval` FROM `%s_c_%s_meta` JOIN `%s_c_%s_midx` USING (`id`) "+ "WHERE `file` = %d AND `arch` = %d ORDER BY `name`,`idx`", Cfg.Prefix, id, Cfg.Prefix, id, zin.file, zin.arch)) if err == nil { pre := "<p>" for rows.Next() { var v, mtype, name, tval string var idx, ival int var fval float32 var dval time.Time rows.Scan(&idx, &mtype, &name, &tval, &ival, &fval, &dval) if idx == 2147483647 { continue } name = unHigh(name) switch mtype { case "TEXT": v = unHigh(tval) case "INT": v = iformat(ival) case "FLOAT": v = fmt.Sprintf("%g", fval) case "DATE": v = printDate(dval, false) case "DATETIME": v = printDate(dval, true) } fmt.Fprintf(q.w, "%s %s: %s\n", pre, html.EscapeString(name), html.EscapeString(v)) pre = "<br>" } } else { doErr(q, fmt.Errorf("Database error")) } } fmt.Fprint(q.w, "</dl>\n<p>\n") // Links naar volgende en vorige pagina's met resultaten qs := fmt.Sprintf("id=%s&lbl=%s", urlencode(id), urlencode(lbl)) if offset > 0 || nzin == 2*ZINMAX { if offset > 0 { fmt.Fprintf(q.w, "<a href=\"browse?%s&offset=%d\">vorige</a>", qs, offset-2*ZINMAX) } else { fmt.Fprint(q.w, "vorige") } fmt.Fprint(q.w, " | ") if nzin == 2*ZINMAX { fmt.Fprintf(q.w, "<a href=\"browse?%s&offset=%d\">volgende</a>", qs, offset+2*ZINMAX) } else { fmt.Fprint(q.w, "volgende") } } fmt.Fprint(q.w, "</body>\n</html>\n") }
// Run command, maar onderbreek het als chKill of chGlobalExit gesloten is func run(cmd *exec.Cmd, chKill chan bool, chPipe chan string) error { cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} chRet := make(chan error, 2) // deze functie schrijft twee keer op chRet go func() { if chPipe == nil { cmd.Start() chRet <- nil err := cmd.Wait() chRet <- err return } pipe, err := cmd.StdoutPipe() if err != nil { chRet <- nil chRet <- err return } cmd.Start() chRet <- nil var err1, err2 error rd := util.NewReader(pipe) for { var line string line, err1 = rd.ReadLineString() if err1 == io.EOF { err1 = nil break } if err1 != nil { break } chPipe <- line } close(chPipe) err2 = cmd.Wait() if err1 != nil { chRet <- err1 } else { chRet <- err2 } }() <-chRet // commando is gestart pgid, err := syscall.Getpgid(cmd.Process.Pid) if err != nil { // misschien betekent een fout alleen maar dat het process net klaar is logf("BIG TROUBLE: syscall.Getpgid(cmd.Process.Pid) error: %v", err) pgid = 0 } FORSELECT: for { select { case err = <-chRet: return err case <-chGlobalExit: break FORSELECT case <-chKill: break FORSELECT } } for _, sig := range []int{15, 9} { err = syscall.Kill(-pgid, syscall.Signal(sig)) if err != nil { logf("syscall.Kill(-pgid, %d) error: %v", sig, err) } if sig != 9 { time.Sleep(2 * time.Second) } } err = <-chRet return err }
func dowork(db *sql.DB, task *Process) (user string, title string, err error) { logf("WORKING: " + task.id) processLock.Lock() if task.nr > taskWorkNr { taskWorkNr = task.nr if taskWaitNr == taskWorkNr { // queue is leeg: reset counters om (ooit) overflow te voorkomen taskWaitNr = 0 taskWorkNr = 0 } } processLock.Unlock() _, err = db.Exec(fmt.Sprintf("UPDATE `%s_info` SET `status` = \"WORKING\", `nword` = 0 WHERE `id` = %q", Cfg.Prefix, task.id)) if err != nil { return } params := "unknown" isArch := false var rows *sql.Rows rows, err = db.Query(fmt.Sprintf("SELECT `description`,`owner`,`params` FROM `%s_info` WHERE `id` = %q", Cfg.Prefix, task.id)) if err != nil { return } if rows.Next() { err = rows.Scan(&title, &user, ¶ms) rows.Close() if err != nil { return } if strings.Contains(params, "-arch") { params = strings.Replace(params, "-arch", "", 1) isArch = true } } dirname := filepath.Join(paqudir, "data", task.id) data := filepath.Join(dirname, "data") xml := filepath.Join(dirname, "xml") dact := filepath.Join(dirname, "data.dact") stdout := filepath.Join(dirname, "stdout.txt") stderr := filepath.Join(dirname, "stderr.txt") summary := filepath.Join(dirname, "summary.txt") // gzip var fp *os.File fp, err = os.Open(data) if err != nil { return } b := make([]byte, 2) io.ReadFull(fp, b) fp.Close() if string(b) == "\x1F\x8B" { // gzip fpin, _ := os.Open(data) r, e := gzip.NewReader(fpin) if e != nil { fpin.Close() err = e return } fpout, _ := os.Create(data + ".tmp") _, err = io.Copy(fpout, r) fpout.Close() r.Close() fpin.Close() if err != nil { return } os.Rename(data+".tmp", data) } var ar *arch ar, err = NewArchReader(data) if err == nil { // als eerste bestand alpino-xml is // params is xmlzip // anders uitpakken en alles aan elkaar plakken in data, met newline aan eind van elk deel e := ar.Next() if e == io.EOF { ar.Close() err = LeegArchief return } if e != nil { ar.Close() err = e return } var b []byte b, err = ar.ReadN(1000) if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { ar.Close() return } if strings.Contains(string(b), "<alpino_ds") { if !strings.HasPrefix(params, "xmlzip") { params = "xmlzip" } setinvoer(db, params, task.id, false) ar.Close() } else { if strings.Contains(string(b), "<FoLiA") { params = "folia-arch" } else if strings.Contains(string(b), "<TEI") { params = "tei-arch" } isArch = true ar.Close() var fp *os.File fp, err = os.Create(data + ".unzip") if err != nil { return } ar, err = NewArchReader(data) for { e := ar.Next() if e == io.EOF { break } if e != nil { ar.Close() fp.Close() err = e return } if params == "folia-arch" || params == "tei-arch" { fmt.Fprintf(fp, "##PAQUFILE %s\n", hex.EncodeToString([]byte(ar.Name()))) } else { fmt.Fprintf(fp, "\n##PAQUFILE %s\n", ar.Name()) } var buf bytes.Buffer if params == "folia-arch" || params == "tei-arch" { err = ar.Copy(&buf) } else { err = ar.Copy(fp) } if err != nil { fp.Close() ar.Close() return } if params == "folia-arch" { folia(ar.Name()+".", &buf, fp) } else if params == "tei-arch" { tei(ar.Name()+".", &buf, fp) } else { fmt.Fprintln(fp) } } fp.Close() ar.Close() if params == "folia-arch" || params == "tei-arch" { os.Rename(data+".unzip", data+".tmp") } } } if params == "auto" { if isArch { params, err = invoersoort(db, data+".unzip", task.id) } else { params, err = invoersoort(db, data, task.id) } if err != nil { return } if params == "dact" { os.Rename(data, dact) } } if isArch { setinvoer(db, params, task.id, true) } defer func() { select { case <-chGlobalExit: err = errGlobalExit return case <-task.chKill: err = errKilled return default: } os.Remove(data + ".lines.tmp") os.Remove(data + ".tmp") os.Remove(data + ".tmp2") for _, f := range []string{data, data + ".lines", stdout, stderr, summary} { if f == data && (params == "dact" || strings.HasPrefix(params, "xmlzip")) { continue } logerr(gz(f)) } fnames, e := filenames2(xml, false) if logerr(e) { return } for _, fname := range fnames { select { case <-chGlobalExit: err = errGlobalExit return case <-task.chKill: err = errKilled return default: } logerr(gz(filepath.Join(xml, fname))) } if params == "dact" && !Cfg.Dact { os.Remove(dact) } if strings.HasPrefix(params, "xmlzip") { os.Remove(data) } }() select { case <-chGlobalExit: err = errGlobalExit return case <-task.chKill: err = errKilled return default: } if params == "dact" { var tokens, nlines int tokens, nlines, err = unpackDact(data, xml, dact, stderr, task.chKill) if err != nil { return } err = do_quotum(db, task.id, user, tokens, nlines) if err != nil { os.Remove(dact) os.Remove(dact + "x") os.Remove(data + ".lines") os.Remove(stderr) os.RemoveAll(xml) return } } else if strings.HasPrefix(params, "xmlzip") { var tokens, nlines int tokens, nlines, err = unpackXml(data, xml, stderr, task.chKill) if err != nil { return } err = do_quotum(db, task.id, user, tokens, nlines) if err != nil { os.Remove(data + ".lines") os.Remove(stderr) os.RemoveAll(xml) return } } else { // if params != (dact || xmlzip) reuse := false reuse_more := false if files, e := filenames2(xml, false); e == nil && len(files) > 0 { reuse = true if isArch { os.Remove(data + ".unzip") } done := make(map[string]bool) for _, f := range files { b, e := ioutil.ReadFile(filepath.Join(xml, f)) if e != nil { err = e return } if strings.Index(string(b), "</alpino_ds>") > 0 { done[f] = true } } fpin, e := os.Open(data + ".lines") if e != nil { err = e return } fpout, e := os.Create(data + ".lines.tmp") if e != nil { fpin.Close() err = e return } nword := 0 r := util.NewReader(fpin) for { line, e := r.ReadLineString() if e != nil { fpout.Close() fpin.Close() if e != io.EOF { err = e return } break } nword += len(strings.Fields(line)) key := line[:strings.Index(line, "|")] if !done[key+".xml"] { fmt.Fprintln(fpout, line) reuse_more = true } } _, err = db.Exec(fmt.Sprintf("UPDATE `%s_info` SET `nword` = %d WHERE `id` = %q", Cfg.Prefix, nword, task.id)) if err != nil { return } } else { // if !reuse var has_tok, has_lbl, is_xml, is_arch bool if strings.Contains(params, "-arch") { is_arch = true } if strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") { is_xml = true has_lbl = true has_tok = true } if strings.Contains(params, "-lbl") { has_lbl = true } if strings.Contains(params, "-tok") { has_tok = true } if is_xml { if !is_arch { var fpin, fpout *os.File fpin, err = os.Open(data) if err != nil { return } fpout, err = os.Create(data + ".tmp") if err != nil { fpin.Close() return } if params == "folia" { err = folia("", fpin, fpout) } else { err = tei("", fpin, fpout) } fpout.Close() fpin.Close() if err != nil { return } } } else { // pqtexter var pqtexter, unzip string if params == "run" { pqtexter = "-r" } else if has_lbl { pqtexter = "-l" } if isArch { unzip = ".unzip" } err = shell("pqtexter %s %s%s > %s.tmp 2>> %s", pqtexter, data, unzip, data, stderr).Run() if err != nil { return } if isArch { os.Remove(data + ".unzip") } } // verwijderen van commentaren en labels zonder tekst if params == "run" || strings.HasPrefix(params, "line") { var fpin, fpout *os.File fpin, err = os.Open(data + ".tmp") if err != nil { return } fpout, err = os.Create(data + ".tmp2") if err != nil { fpin.Close() return } rd := util.NewReader(fpin) for { line, e := rd.ReadLineString() if e != nil { break } if strings.TrimSpace(line) == "" || line[0] == '%' || reRunLabel.MatchString(line) { continue } fmt.Fprintln(fpout, line) } fpout.Close() fpin.Close() os.Remove(data + ".tmp") os.Rename(data+".tmp2", data+".tmp") } // ontdubbelen van labels if strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") || strings.Contains(params, "-lbl") { var fpin, fpout *os.File var dubbel bool fpin, err = os.Open(data + ".tmp") if err != nil { return } fpout, err = os.Create(data + ".tmp2") if err != nil { fpin.Close() return } rd := util.NewReader(fpin) seen := make(map[string]bool) for { line, e := rd.ReadLineString() if e != nil { break } if strings.HasPrefix(line, "##PAQULBL") { var b []byte b, err = hex.DecodeString(strings.Fields(line)[1]) if err != nil { fpout.Close() fpin.Close() return } lbl := string(b) if seen[lbl] { dubbel = true for i := 1; true; i++ { lbl2 := fmt.Sprintf("%s.dup.%d", lbl, i) if !seen[lbl2] { lbl = lbl2 break } } line = "##PAQULBL " + hex.EncodeToString([]byte(lbl)) } seen[lbl] = true } fmt.Fprintln(fpout, line) } fpout.Close() fpin.Close() if dubbel { os.Remove(data + ".tmp") os.Rename(data+".tmp2", data+".tmp") } else { os.Remove(data + ".tmp2") } } // tokenizer if !has_tok { var tok string if params == "run" { tok = "tokenize.sh" } else { tok = "tokenize_no_breaks.sh" } err = shell("$ALPINO_HOME/Tokenization/%s < %s.tmp > %s.tmp2 2>> %s", tok, data, data, stderr).Run() if err != nil { return } os.Rename(data+".tmp2", data+".tmp") } var fp, fpin *os.File fpin, err = os.Open(data + ".tmp") if err != nil { return } fp, err = os.Create(data + ".lines") if err != nil { fpin.Close() return } metalines := make([]string, 0) inmeta := false rd := util.NewReader(fpin) var filename, lbl string var tokens, nlines, i int var metaseen map[string]bool for { line, e := rd.ReadLineString() if e == io.EOF { break } if e != nil { err = e fp.Close() fpin.Close() return } if line == "" { continue } if strings.HasPrefix(line, "##PAQU") { a := strings.Fields(line) var val string if len(a) == 2 { b, e := hex.DecodeString(a[1]) if e != nil { err = e fp.Close() fpin.Close() return } val = strings.TrimSpace(string(b)) } if a[0] == "##PAQUFILE" { filename = val if strings.HasSuffix(filename, ".txt") || strings.HasSuffix(filename, ".doc") { filename = filename[:len(filename)-4] } i = 0 metalines = metalines[0:0] pp := strings.Split(filename, "/") for i, p := range pp { metalines = append(metalines, fmt.Sprintf("text\tpaqu.path%d\t%s", i+1, p)) } metaseen = make(map[string]bool) inmeta = true } else if a[0] == "##PAQULBL" { lbl = val } } else if strings.HasPrefix(line, "##META") { if !inmeta { metaseen = make(map[string]bool) inmeta = true } a := strings.Fields(line) if len(a) == 2 { b, e := hex.DecodeString(a[1]) if e != nil { err = e fp.Close() fpin.Close() return } f := strings.Fields(string(b)) if len(f) > 2 && f[2] == "=" { // als deze voor het eerst, dan alle oude met dezelfde naam wegdoen if !metaseen[f[1]] { metaseen[f[1]] = true for i := 0; i < len(metalines); i++ { if i == 0 && filename != "" { continue } if a := strings.Split(metalines[i], "\t"); a[1] == f[1] { metalines = append(metalines[:i], metalines[i+1:]...) i-- } } } if len(f) > 3 { metalines = append(metalines, fmt.Sprintf("%s\t%s\t%s", f[0], f[1], strings.Join(f[3:], " "))) } } } } else { inmeta = false tokens += len(strings.Fields(line)) var fname string if lbl == "" { if filename != "" { i++ fname = fmt.Sprintf("%04d/%04d-%s.%d", nlines/10000, nlines%10000, encode_filename(filename), i) } else { fname = fmt.Sprintf("%04d/%04d", nlines/10000, nlines%10000) } } else { fname = fmt.Sprintf("%04d/%04d-%s", nlines/10000, nlines%10000, encode_filename(lbl)) lbl = "" } fmt.Fprintf(fp, "%s|%s\n", fname, strings.TrimSpace(line)) nlines++ if len(metalines) > 0 { fpm, e := os.Create(filepath.Join(xml, fname+".meta")) if e != nil { err = e fp.Close() fpin.Close() return } for _, m := range metalines { fmt.Fprintln(fpm, m) } fpm.Close() } } } fp.Close() fpin.Close() os.Remove(data + ".tmp") err = do_quotum(db, task.id, user, tokens, nlines) if err != nil { os.Remove(data) os.Remove(data + ".lines") os.Remove(stderr) os.RemoveAll(xml) return } if err != nil { return } } // end if !reuse if !reuse || reuse_more { var ext string if reuse { ext = ".tmp" } var server, timeout string if Cfg.Alpinoserver != "" { server = "-s " + Cfg.Alpinoserver } if Cfg.Timeout > 0 { timeout = fmt.Sprint("-t ", Cfg.Timeout) } cmd := shell( `pqalpino -e half -l -T -q -n %d -d %s %s %s %s.lines%s >> %s 2>> %s`, Cfg.Maxtokens, xml, server, timeout, data, ext, stdout, stderr) err = run(cmd, task.chKill, nil) if err != nil { return } } } // end if params != (dact || xmlzip) nlines := 0 fp, e := os.Open(data + ".lines") if e != nil { err = e return } rd := util.NewReader(fp) for { _, e := rd.ReadLine() if e != nil { fp.Close() if e == io.EOF { break } else { err = e return } } nlines++ } errlines := make([]string, 0) fp, e = os.Open(stderr) if e != nil { err = e return } rd = util.NewReader(fp) errseen := make(map[string]bool) for { line, e := rd.ReadLineString() if e != nil { fp.Close() if e == io.EOF { break } else { err = e return } } if strings.HasPrefix(line, "Q#") { a := strings.Split(line, "|") ln := len(a) n, err := strconv.Atoi(a[ln-3]) if err == nil && n > 0 { continue } if ln > 5 { a[1] = strings.Join(a[1:ln-3], "|") } fname := a[0][2:] if params == "run" || strings.HasPrefix(params, "line") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") { fname = decode_filename(a[0][2:]) } if strings.Contains(params, "-lbl") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") { fname = fname[1+strings.Index(fname, "-"):] } // bij herstart worden mislukte zinnen opnieuw geparst, en mislukken dan opnieuw errline := fname + "\t" + a[ln-3] + "\t" + a[ln-2] + "\t" + a[1] + "\n" if !errseen[errline] { errlines = append(errlines, errline) errseen[errline] = true } } } fp, err = os.Create(summary) if err != nil { return } fmt.Fprintf(fp, "%d\t%d\t%s\n", nlines, len(errlines), invoertabel[params]) for _, line := range errlines { fmt.Fprint(fp, line) } fp.Close() p := regexp.QuoteMeta(xml + "/") d := "" if strings.Contains(params, "-lbl") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") || isArch { p += "[0-9]+/[0-9]+-" d = "-d" } else if params == "dact" || strings.HasPrefix(params, "xmlzip") { p += "[0-9]+/" d = "-d" } filenames, e := filenames2(xml, true) if e != nil { err = e return } for _, filename := range filenames { m := filepath.Join(xml, filename) x := m[:len(m)-4] + "xml" var xb, mb []byte xb, err = ioutil.ReadFile(x) if err != nil { os.Remove(m) continue } mb, err = ioutil.ReadFile(m) if err != nil { return } fp, err = os.Create(x + ".tmp") if err != nil { return } xt := string(xb) mt := strings.Split(strings.TrimSpace(string(mb)), "\n") i := strings.Index(xt, "<node") fmt.Fprint(fp, xt[:i], "<metadata>\n") for _, m := range mt { mm := strings.Split(m, "\t") fmt.Fprintf(fp, " <meta type=%q name=%q value=%q/>\n", html.EscapeString(strings.ToLower(mm[0])), html.EscapeString(mm[1]), html.EscapeString(mm[2])) } fmt.Fprint(fp, " </metadata>\n ", xt[i:]) fp.Close() os.Rename(x+".tmp", x) os.Remove(m) } cmd := shell( // optie -w i.v.m. recover() `find %s -name '*.xml' | sort | pqbuild -w -p %s %s -s %s %s %s 0 >> %s 2>> %s`, dirname, quote(p), d, filepath.Base(dirname), quote(title), quote(user), stdout, stderr) err = run(cmd, task.chKill, nil) if err != nil { return } if Cfg.Dact && params != "dact" { p := "" if strings.Contains(params, "-lbl") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") || isArch { p = "-" } else if strings.HasPrefix(params, "xmlzip") { p = "/" } err = makeDact(dact, xml, p, task.chKill) if err != nil { return } } return }
func download(q *Context) { id := first(q.r, "id") dl := first(q.r, "dl") params := q.params[id] if !q.myprefixes[id] { // misschien een corpus dat mislukt is rows, err := q.db.Query( fmt.Sprintf("SELECT `params` FROM `%s_info` WHERE `id` = %q AND `owner` = %q", Cfg.Prefix, id, q.user)) if err != nil { http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } if rows.Next() { err := rows.Scan(¶ms) rows.Close() if err != nil { http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } } else { http.Error(q.w, "Dat is niet je corpus", http.StatusUnauthorized) return } } if q.protected[id] && (dl == "xml" || dl == "dact") { http.Error(q.w, "Dat is afgeleid van een corpus dat niet van jou is", http.StatusUnauthorized) return } datadir := filepath.Join(paqudir, "data", id) var filename string switch dl { case "summary": filename = "summary.txt" case "stdout": filename = "stdout.txt" case "stderr": filename = "stderr.txt" case "zinnen": if !strings.Contains(params, "-lbl") && !strings.Contains(params, "-arch") && !strings.HasPrefix(params, "folia") && !strings.HasPrefix(params, "tei") { filename = "data.lines" } case "dact": case "xml": default: http.Error(q.w, "Ongeldige selectie: "+dl, http.StatusUnauthorized) return } if filename != "" { fp, err := os.Open(filepath.Join(datadir, filename+".gz")) if err != nil { http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } r, err := gzip.NewReader(fp) if err != nil { fp.Close() http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } q.w.Header().Set("Content-Type", "text/plain; charset=utf-8") io.Copy(q.w, r) r.Close() fp.Close() return } // dact if dl == "dact" { fp, err := os.Open(filepath.Join(datadir, "data.dact")) if err != nil { http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } q.w.Header().Set("Content-Type", "application/octet-stream") q.w.Header().Set("Content-Disposition", "attachment; filename="+id+".dact") io.Copy(q.w, fp) fp.Close() return } // data.lines met verkeerde labels if dl == "zinnen" { fp, err := os.Open(filepath.Join(datadir, "data.lines.gz")) if err != nil { http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } r, err := gzip.NewReader(fp) if err != nil { fp.Close() http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } q.w.Header().Set("Content-Type", "text/plain; charset=utf-8") rd := util.NewReader(r) for { line, err := rd.ReadLineString() if err != nil { break } a := strings.SplitN(line, "|", 2) lbl := decode_filename(a[0][1+strings.Index(a[0], "-"):]) fmt.Fprintf(q.w, "%s|%s\n", lbl, a[1]) } r.Close() fp.Close() return } // xml datadir = filepath.Join(datadir, "xml") files, err := filenames2(datadir, false) if err != nil { http.Error(q.w, err.Error(), http.StatusInternalServerError) logerr(err) return } q.w.Header().Set("Content-Type", "application/zip") q.w.Header().Set("Content-Disposition", "attachment; filename="+id+".zip") w := zip.NewWriter(q.w) for _, gzname := range files { fullgzname := filepath.Join(datadir, gzname) file, err := os.Stat(fullgzname) name := decode_filename(gzname[:len(gzname)-3]) if params == "dact" || strings.HasPrefix(params, "xmlzip") { name = name[1+strings.Index(name, "/"):] } else if strings.Contains(params, "-lbl") || strings.Contains(params, "-arch") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") { name = name[1+strings.Index(name, "-"):] } if err != nil { logerr(err) return } fh, err := zip.FileInfoHeader(file) if err != nil { logerr(err) return } fh.Name = filepath.Join(id, name) f, err := w.CreateHeader(fh) if err != nil { logerr(err) return } fp, err := os.Open(filepath.Join(datadir, gzname)) if err != nil { logerr(err) return } r, err := gzip.NewReader(fp) if err != nil { fp.Close() logerr(err) return } io.Copy(f, r) r.Close() fp.Close() } err = w.Close() if err != nil { logerr(err) return } }
func invoersoort(db *sql.DB, data, id string) (string, error) { set := func(soort string) (string, error) { return soort, setinvoer(db, soort, id, false) } fp, err := os.Open(data) if err != nil { return "", err } defer fp.Close() b := make([]byte, 200) n, _ := io.ReadFull(fp, b) fp.Seek(0, 0) if n >= 3 { s := string(b[:4]) if s == "\x00\x06\x15\x61" || s == "\x61\x15\x06\x00" || s == "\x00\x05\x31\x62" || s == "\x62\x31\x05\x00" { return set("dact") } } if n > 15 { s := string(b[12:16]) if s == "\x00\x06\x15\x61" || s == "\x61\x15\x06\x00" || s == "\x00\x05\x31\x62" || s == "\x62\x31\x05\x00" || s == "\x00\x04\x22\x53" || s == "\x53\x22\x04\x00" { return set("dact") } } if strings.Contains(string(b), "<FoLiA") { return set("folia") } if strings.Contains(string(b), "<TEI") { return set("tei") } lines := make([]string, 0, 20) rd := util.NewReader(fp) for i := 0; i < 20; i++ { line, e := rd.ReadLineString() if e != nil { break } line = strings.ToUpper(line) line = strings.Replace(line, "\000", "", -1) // utf-16, utf-32, grove methode if strings.TrimSpace(line) == "" || strings.HasPrefix(line, "##PAQU") || strings.HasPrefix(line, "##META") || line[0] == '%' || reRunLabel.MatchString(line) { i-- } else { lines = append(lines, line) } } ln := len(lines) if ln < 2 { return set("run") } endletter := 0 midpoint := 0 nlabel := 0 for _, line := range lines { if !reEndPoint.MatchString(line) { endletter++ } midpoint += len(reMidPoint.FindAllString(line, -1)) if strings.Contains(line, "|") { nlabel++ } } if nlabel < ln && (endletter > ln/3 || midpoint > endletter/2) { return set("run") } soort := "line" ntok := 0 for _, line := range lines { line = strings.TrimSpace(line) if strings.HasSuffix(line, " .") || strings.HasSuffix(line, " !") || strings.HasSuffix(line, " ?") { ntok++ } } if nlabel == ln { soort += "-lbl" } if ntok > (3*ln)/4 { soort += "-tok" } return set(soort) }
func main() { flag.Parse() if *opt_f == "" && flag.NArg() == 0 && util.IsTerminal(os.Stdin) && !*opt_a { fmt.Fprintf(os.Stderr, "\nUsage: %s [args] [text]\n\nargs with default values are:\n\n", os.Args[0]) flag.PrintDefaults() fmt.Fprintf(os.Stderr, "\nIf both -f and text are missing, read from stdin\n\n") return } extras := make([]string, 0) tc := textcat.NewTextCat() if *opt_p != "" { for _, i := range strings.Split(*opt_p, ",") { name := strings.Split(path.Base(i), ".")[0] extras = append(extras, name) e := tc.AddLanguage(name, i) util.CheckErr(e) } } if *opt_z { if *opt_r || *opt_b { for _, extra := range extras { tc.EnableLanguages(extra + ".raw") } } if *opt_b || !*opt_r { for _, extra := range extras { tc.EnableLanguages(extra + ".utf8") } } } else { if *opt_r || *opt_b { tc.EnableAllRawLanguages() } if *opt_b || !*opt_r { tc.EnableAllUtf8Languages() } } if *opt_i != "" { tc.DisableLanguages(strings.Split(*opt_i, ",")...) } if *opt_a { for _, i := range tc.ActiveLanguages() { fmt.Println(i) } return } if *opt_l { var r *util.Reader if *opt_f != "" { fp, err := os.Open(*opt_f) util.CheckErr(err) defer fp.Close() r = util.NewReader(fp) } else if flag.NArg() > 0 { b := bytes.NewBufferString(strings.Join(flag.Args(), " ")) r = util.NewReader(b) } else { r = util.NewReader(os.Stdin) } for { line, err := r.ReadLineString() if err == io.EOF { break } util.CheckErr(err) l, err := tc.Classify(line) if err != nil { fmt.Print(err) } else { fmt.Print(strings.Join(l, ",")) } fmt.Println("\t" + line) } return } var text string if *opt_f != "" { t, err := ioutil.ReadFile(*opt_f) util.CheckErr(err) text = string(t) } else if flag.NArg() > 0 { text = strings.Join(flag.Args(), " ") } else { t, err := ioutil.ReadAll(os.Stdin) util.CheckErr(err) text = string(t) } l, e := tc.Classify(text) if e != nil { fmt.Println(e) } else { fmt.Println(strings.Join(l, "\n")) } }