Exemplo n.º 1
0
// browserr writes, as plain text, the part of a corpus's parser log
// (data/<db>/stderr.txt.gz) that belongs to one sentence label.
//
// Request parameters (read with first):
//   - db: corpus id; must be present in q.myprefixes
//   - s:  sentence label; when it does not match reBasicName it is passed
//     through encode_filename before it is compared with the log
//
// Output starts at the "**** parsing <label>" line for the requested label
// and runs up to and including the next line that starts with "****".
// Nothing is written when the label does not occur in the log.
func browserr(q *Context) {

	if !q.auth {
		http.Error(q.w, "Je bent niet ingelogd", http.StatusUnauthorized)
		return
	}

	db := first(q.r, "db")
	if !q.myprefixes[db] {
		http.Error(q.w, "Dat is niet je corpus", http.StatusUnauthorized)
		return
	}

	lbl := first(q.r, "s")
	if lbl == "" {
		http.Error(q.w, "Label ontbreekt", http.StatusPreconditionFailed)
		return
	}
	coded := false
	if !reBasicName.MatchString(lbl) {
		lbl = encode_filename(lbl)
		coded = true
	}

	contentType(q, "text/plain; charset=utf-8")

	datadir := filepath.Join(paqudir, "data", db)

	fp, err := os.Open(filepath.Join(datadir, "stderr.txt.gz"))
	if err != nil {
		sysErr(err)
		fmt.Fprintln(q.w, err)
		return
	}
	defer fp.Close()
	gz, err := gzip.NewReader(fp)
	if err != nil {
		sysErr(err)
		fmt.Fprintln(q.w, err)
		return
	}
	defer gz.Close()

	rd := util.NewReader(gz)
	inSection := false
	for {
		line, err := rd.ReadLineString()
		if err == io.EOF {
			break
		}
		if err != nil {
			sysErr(err)
			fmt.Fprintln(q.w, err)
			return
		}

		if inSection {
			// copy everything up to and including the next "****" line
			fmt.Fprintln(q.w, line)
			if strings.HasPrefix(line, "****") {
				break
			}
			continue
		}

		// still searching for the header line of the requested label
		if !strings.HasPrefix(line, "****") {
			continue
		}
		f := strings.Fields(line)
		if len(f) <= 2 || f[1] != "parsing" {
			continue
		}
		name := f[2]
		switch {
		case !coded:
			inSection = name == lbl
		case len(name) > 5 && name[5:] == lbl:
			// the length guard prevents a slice-out-of-range panic on
			// log entries whose label field is shorter than five bytes
			inSection = true
		case len(name) > 10 && name[9] == '-' && name[10:] == lbl && reCodedPrefix.MatchString(name):
			inSection = true
		}
		if inSection {
			fmt.Fprintln(q.w, line)
		}
	}
}
Exemplo n.º 2
0
// TAB: browse (sentences)
//
// browse renders the overview page of one corpus:
//   - the header of data/<id>/summary.txt.gz (number of lines, number of
//     errors, source description) and, when there were errors, a table with
//     one row per failed sentence linking to browserr
//   - a search form for sentence labels
//   - at most 2*ZINMAX sentences from the corpus's `sent` table, optionally
//     filtered with `lbl` LIKE <lbl>, each followed by its metadata when
//     q.hasmeta[id] is set
//   - links to the previous and next page of results
//
// Request parameters (read with first): id (corpus, must be in
// q.myprefixes), lbl (optional LIKE pattern), offset (optional row offset;
// missing, invalid or negative values count as 0).
func browse(q *Context) {

	if !q.auth {
		http.Error(q.w, "Je bent niet ingelogd", http.StatusUnauthorized)
		return
	}

	id := first(q.r, "id")
	if !q.myprefixes[id] {
		http.Error(q.w, "Dat is niet je corpus", http.StatusUnauthorized)
		return
	}

	datadir := filepath.Join(paqudir, "data", id)
	fp, err := os.Open(filepath.Join(datadir, "summary.txt.gz"))
	if err != nil {
		http.Error(q.w, err.Error(), http.StatusInternalServerError)
		logerr(err)
		return
	}
	defer fp.Close()
	gz, err := gzip.NewReader(fp)
	if err != nil {
		http.Error(q.w, err.Error(), http.StatusInternalServerError)
		logerr(err)
		return
	}
	defer gz.Close()

	// First line of the summary: <lines> TAB <errors> TAB <source>.
	// A short or empty line is padded so a damaged summary cannot cause an
	// index-out-of-range panic; the counters then simply read as 0.
	rd := util.NewReader(gz)
	line, _ := rd.ReadLineString()
	a := strings.SplitN(line, "\t", 3)
	for len(a) < 3 {
		a = append(a, "")
	}
	nline, _ := strconv.Atoi(a[0])
	nerr, _ := strconv.Atoi(a[1])

	// HTML output: start of the page
	writeHead(q, "Overzicht", 0)
	fmt.Fprintf(q.w, `
<script type="text/javascript"><!--
  function formclear(f) {
    f.lbl.value = "";
  }
//--></script>
Corpus: <b>%s</b>
<p>
Bron: %s
<p>
`, q.desc[id], a[2])

	if nerr > 0 {
		fmt.Fprintf(q.w, `
Er waren problemen met %d van de %d zinnen:
<p>
<table class="corpora">
<tr><th>Label<th>Fout<th>Zin</tr>
`, nerr, nline)
		lineno := 0
		for {
			lineno++
			line, e := rd.ReadLineString()
			if e != nil {
				break
			}
			// label TAB code TAB message TAB sentence; pad short rows
			// instead of panicking on them
			a := strings.SplitN(line, "\t", 4)
			for len(a) < 4 {
				a = append(a, "")
			}
			eo := "even"
			if lineno%2 == 1 {
				eo = "odd"
			}
			if lineno == 1 {
				eo += " first"
			}
			if lineno == nerr {
				eo += " last"
			}
			// The label goes into a query string, so it is URL-encoded
			// (as for the paging links below); HTML-escaping alone breaks
			// labels that contain '&', '#', '+' or '%'.
			fmt.Fprintf(q.w, "<tr class=\"%s\"><td class=\"odd first\"><b><a href=\"browserr?db=%s&amp;s=%s\" target=\"_blank\">%s</a></b><td class=\"even\">%s&nbsp;|&nbsp;%s<td class=\"odd\">%s",
				eo,
				id, html.EscapeString(urlencode(a[0])), html.EscapeString(a[0]),
				html.EscapeString(a[1]), a[2], html.EscapeString(a[3]))
		}
		fmt.Fprint(q.w, "</table>\n<p>\n")
	}

	// HTML output: the form
	fmt.Fprintf(q.w, `
<form action="browse" method="get" accept-charset="utf-8">
<input type="hidden" name="id" value="%s">
Label: <input type="text" name="lbl" size="20" value="%s">
<input type="submit" value="Zoeken">
<input type="button" value="Wissen" onClick="javascript:formclear(form)">
</form>
`, id, html.EscapeString(first(q.r, "lbl")))

	// Fetch at most 2*ZINMAX matching xml files

	// A negative offset would produce an invalid LIMIT clause, so only
	// positive values are accepted.
	offset := 0
	if o, e := strconv.Atoi(first(q.r, "offset")); e == nil && o > 0 {
		offset = o
	}

	// The label pattern comes straight from the request: pass it as a bound
	// parameter instead of splicing it into the statement.
	lbl := first(q.r, "lbl")
	where := ""
	var args []interface{}
	if lbl != "" {
		where = "WHERE `lbl` LIKE ?"
		args = append(args, lbl)
	}

	rows, err := q.db.Query(
		fmt.Sprintf(
			"SELECT `arch`,`file`,`sent`,`lbl` FROM `%s_c_%s_sent` %s LIMIT %d,%d",
			Cfg.Prefix,
			id,
			where,
			offset,
			2*ZINMAX),
		args...)
	if doErr(q, err) {
		return
	}
	defer rows.Close()

	zinnen := make([]ZinArchFile, 0, 2*ZINMAX)

	nzin := 0
	for rows.Next() {
		nzin++
		var arch, file int
		var sent, lbl string
		err := rows.Scan(&arch, &file, &sent, &lbl)
		if doErr(q, err) {
			return
		}
		zinnen = append(zinnen, ZinArchFile{zin: sent, arch: arch, file: file})
	}
	if doErr(q, rows.Err()) {
		return
	}

	// Look up the label that is shown for each sentence.
	for i, zin := range zinnen {
		lblRows, err := q.db.Query(
			fmt.Sprintf(
				"SELECT `lbl` FROM `%s_c_%s_sent` WHERE `file` = %d AND `arch` = %d", Cfg.Prefix, id, zin.file, zin.arch))
		if err == nil && lblRows.Next() {
			var lbl string
			lblRows.Scan(&lbl)
			lblRows.Close()
			zinnen[i].lbl = lbl
		} else {
			doErr(q, fmt.Errorf("Database error"))
		}
	}

	fmt.Fprintln(q.w, "<p>\n<dl>")
	for _, zin := range zinnen {
		fmt.Fprintf(q.w, "<dt><a href=\"tree?db=%s&amp;arch=%d&amp;file=%d\">%s</a>\n<dd>%s\n",
			id, zin.arch, zin.file,
			html.EscapeString(zin.lbl),
			html.EscapeString(zin.zin))
		if !q.hasmeta[id] {
			continue
		}
		metaRows, err := q.db.Query(fmt.Sprintf(
			"SELECT `idx`,`type`,`name`,`tval`,`ival`,`fval`,`dval` FROM `%s_c_%s_meta` JOIN `%s_c_%s_midx` USING (`id`) "+
				"WHERE `file` = %d AND `arch` = %d ORDER BY `name`,`idx`",
			Cfg.Prefix, id,
			Cfg.Prefix, id,
			zin.file, zin.arch))
		if err != nil {
			doErr(q, fmt.Errorf("Database error"))
			continue
		}
		pre := "<p>"
		for metaRows.Next() {
			var v, mtype, name, tval string
			var idx, ival int
			var fval float32
			var dval time.Time
			metaRows.Scan(&idx, &mtype, &name, &tval, &ival, &fval, &dval)
			// rows with the maximum 32-bit idx are not displayed
			if idx == 2147483647 {
				continue
			}
			name = unHigh(name)
			switch mtype {
			case "TEXT":
				v = unHigh(tval)
			case "INT":
				v = iformat(ival)
			case "FLOAT":
				v = fmt.Sprintf("%g", fval)
			case "DATE":
				v = printDate(dval, false)
			case "DATETIME":
				v = printDate(dval, true)
			}
			fmt.Fprintf(q.w, "%s&nbsp; %s: %s\n", pre, html.EscapeString(name), html.EscapeString(v))
			pre = "<br>"
		}
	}
	fmt.Fprint(q.w, "</dl>\n<p>\n")

	// Links to the next and previous pages of results
	qs := fmt.Sprintf("id=%s&amp;lbl=%s", urlencode(id), urlencode(lbl))
	if offset > 0 || nzin == 2*ZINMAX {
		if offset > 0 {
			// never link to a negative offset
			prev := offset - 2*ZINMAX
			if prev < 0 {
				prev = 0
			}
			fmt.Fprintf(q.w, "<a href=\"browse?%s&amp;offset=%d\">vorige</a>", qs, prev)
		} else {
			fmt.Fprint(q.w, "vorige")
		}
		fmt.Fprint(q.w, " | ")
		if nzin == 2*ZINMAX {
			fmt.Fprintf(q.w, "<a href=\"browse?%s&amp;offset=%d\">volgende</a>", qs, offset+2*ZINMAX)
		} else {
			fmt.Fprint(q.w, "volgende")
		}
	}

	fmt.Fprint(q.w, "</body>\n</html>\n")

}
Exemplo n.º 3
0
// run starts cmd in its own process group and waits for it to finish, but
// interrupts it when chKill or chGlobalExit is closed.
//
// When chPipe is non-nil, the standard output of the command is sent to
// chPipe line by line, and chPipe is closed when the output has ended or the
// command could not be started.
//
// On interruption the process group is sent SIGTERM and, if the command has
// not finished within two seconds, SIGKILL.
//
// The return value is the error from starting the command, from reading its
// output, or from cmd.Wait, in that order of precedence.
func run(cmd *exec.Cmd, chKill chan bool, chPipe chan string) error {
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

	chRet := make(chan error, 2)
	// this goroutine writes to chRet exactly twice: nil once the start
	// attempt is over, then the final result
	go func() {
		if chPipe == nil {
			if err := cmd.Start(); err != nil {
				chRet <- nil
				chRet <- err
				return
			}
			chRet <- nil
			chRet <- cmd.Wait()
			return
		}

		pipe, err := cmd.StdoutPipe()
		if err == nil {
			err = cmd.Start()
		}
		if err != nil {
			// no output will ever arrive: release whoever reads chPipe
			close(chPipe)
			chRet <- nil
			chRet <- err
			return
		}
		chRet <- nil

		var errRead error
		rd := util.NewReader(pipe)
		for {
			line, e := rd.ReadLineString()
			if e != nil {
				if e != io.EOF {
					errRead = e
				}
				break
			}
			chPipe <- line
		}
		close(chPipe)

		errWait := cmd.Wait()

		if errRead != nil {
			chRet <- errRead
		} else {
			chRet <- errWait
		}
	}()

	<-chRet // the start attempt is over

	if cmd.Process == nil {
		// The command never started, so there is nothing to wait for or
		// to kill; the goroutine has already queued the reason.
		return <-chRet
	}

	pgid, err := syscall.Getpgid(cmd.Process.Pid)
	if err != nil {
		// an error may just mean that the process has already finished
		logf("BIG TROUBLE: syscall.Getpgid(cmd.Process.Pid) error: %v", err)
		pgid = 0
	}

	select {
	case err = <-chRet:
		return err
	case <-chGlobalExit:
	case <-chKill:
	}

	for _, sig := range []syscall.Signal{syscall.SIGTERM, syscall.SIGKILL} {
		if pgid > 0 {
			if e := syscall.Kill(-pgid, sig); e != nil {
				logf("syscall.Kill(-pgid, %d) error: %v", int(sig), e)
			}
		} else if e := cmd.Process.Signal(sig); e != nil {
			// Without a known group id only the process itself is
			// signalled: kill(0, sig) would hit our own process group.
			logf("cmd.Process.Signal(%d) error: %v", int(sig), e)
		}
		if sig == syscall.SIGKILL {
			break
		}
		// give the command two seconds to stop before escalating
		select {
		case err = <-chRet:
			return err
		case <-time.After(2 * time.Second):
		}
	}
	return <-chRet
}
Exemplo n.º 4
0
// dowork processes one queued corpus (task), from the uploaded data file to
// a built corpus. It returns the owner (user) and description (title) read
// from the <prefix>_info table, and the error that stopped processing, if any.
//
// Steps, in the order the code performs them:
//  1. update the work counter under processLock and set the corpus status
//     to WORKING
//  2. read description, owner and params from the <prefix>_info table
//  3. decompress the data file in place when it starts with the gzip magic
//     bytes
//  4. when NewArchReader accepts the data file: use params "xmlzip" if the
//     first member contains "<alpino_ds", otherwise concatenate all members
//     into data.unzip (FoLiA/TEI members are first passed through folia/tei)
//  5. resolve params "auto" with invoersoort
//  6. produce data.lines and the xml directory: unpackDact, unpackXml, or
//     the text pipeline (pqtexter, label clean-up, tokenizer) followed by
//     pqalpino; do_quotum is called with the token and line counts
//     (presumably a quota check -- verify against its definition)
//  7. write summary.txt from data.lines and the "Q#" lines in stderr.txt
//  8. merge every .meta file into its .xml file, run pqbuild, and call
//     makeDact when Cfg.Dact is set and the input was not dact
//
// A deferred function runs at every return after step 5. Unless
// chGlobalExit or task.chKill is closed (err then becomes errGlobalExit or
// errKilled), it removes temporary files and passes the result files and the
// xml files to gz.
func dowork(db *sql.DB, task *Process) (user string, title string, err error) {
	logf("WORKING: " + task.id)

	processLock.Lock()
	if task.nr > taskWorkNr {
		taskWorkNr = task.nr
		if taskWaitNr == taskWorkNr {
			// queue is empty: reset the counters to prevent overflow (eventually)
			taskWaitNr = 0
			taskWorkNr = 0
		}
	}
	processLock.Unlock()

	_, err = db.Exec(fmt.Sprintf("UPDATE `%s_info` SET `status` = \"WORKING\", `nword` = 0 WHERE `id` = %q",
		Cfg.Prefix, task.id))
	if err != nil {
		return
	}

	params := "unknown"
	isArch := false
	var rows *sql.Rows
	rows, err = db.Query(fmt.Sprintf("SELECT `description`,`owner`,`params` FROM `%s_info` WHERE `id` = %q",
		Cfg.Prefix, task.id))
	if err != nil {
		return
	}
	if rows.Next() {
		err = rows.Scan(&title, &user, &params)
		rows.Close()
		if err != nil {
			return
		}
		// a stored "-arch" marker is stripped from params and remembered in isArch
		if strings.Contains(params, "-arch") {
			params = strings.Replace(params, "-arch", "", 1)
			isArch = true
		}
	}

	dirname := filepath.Join(paqudir, "data", task.id)
	data := filepath.Join(dirname, "data")
	xml := filepath.Join(dirname, "xml")
	dact := filepath.Join(dirname, "data.dact")
	stdout := filepath.Join(dirname, "stdout.txt")
	stderr := filepath.Join(dirname, "stderr.txt")
	summary := filepath.Join(dirname, "summary.txt")

	// gzip: look at the first two bytes of the data file
	var fp *os.File
	fp, err = os.Open(data)
	if err != nil {
		return
	}
	b := make([]byte, 2)
	io.ReadFull(fp, b)
	fp.Close()
	if string(b) == "\x1F\x8B" {
		// gzip: decompress into data.tmp, then replace data with it
		// NOTE(review): the errors of os.Open and os.Create below are discarded
		fpin, _ := os.Open(data)
		r, e := gzip.NewReader(fpin)
		if e != nil {
			fpin.Close()
			err = e
			return
		}
		fpout, _ := os.Create(data + ".tmp")
		_, err = io.Copy(fpout, r)
		fpout.Close()
		r.Close()
		fpin.Close()
		if err != nil {
			return
		}
		os.Rename(data+".tmp", data)
	}

	// an error from NewArchReader is not fatal: the archive handling is
	// skipped and processing continues with the data file as it is
	var ar *arch
	ar, err = NewArchReader(data)
	if err == nil {

		//    if the first file is alpino-xml
		//       params is xmlzip
		//    otherwise unpack and concatenate everything into data, with a newline at the end of each part

		e := ar.Next()
		if e == io.EOF {
			ar.Close()
			err = LeegArchief
			return
		}
		if e != nil {
			ar.Close()
			err = e
			return
		}

		var b []byte
		b, err = ar.ReadN(1000)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			ar.Close()
			return
		}

		if strings.Contains(string(b), "<alpino_ds") {
			if !strings.HasPrefix(params, "xmlzip") {
				params = "xmlzip"
			}
			setinvoer(db, params, task.id, false)
			ar.Close()
		} else {
			if strings.Contains(string(b), "<FoLiA") {
				params = "folia-arch"
			} else if strings.Contains(string(b), "<TEI") {
				params = "tei-arch"
			}
			isArch = true
			ar.Close()
			var fp *os.File
			fp, err = os.Create(data + ".unzip")
			if err != nil {
				return
			}
			// reopen the archive to read it from the first member again
			// NOTE(review): err from this NewArchReader call is not checked before ar is used
			ar, err = NewArchReader(data)
			for {
				e := ar.Next()
				if e == io.EOF {
					break
				}
				if e != nil {
					ar.Close()
					fp.Close()
					err = e
					return
				}
				// FoLiA/TEI: member name is written hex-encoded; otherwise as plain text
				if params == "folia-arch" || params == "tei-arch" {
					fmt.Fprintf(fp, "##PAQUFILE %s\n", hex.EncodeToString([]byte(ar.Name())))
				} else {
					fmt.Fprintf(fp, "\n##PAQUFILE %s\n", ar.Name())
				}
				// FoLiA/TEI members are buffered for conversion; other members are copied straight to the output
				var buf bytes.Buffer
				if params == "folia-arch" || params == "tei-arch" {
					err = ar.Copy(&buf)
				} else {
					err = ar.Copy(fp)
				}
				if err != nil {
					fp.Close()
					ar.Close()
					return
				}
				// NOTE(review): the return values of folia and tei are ignored here
				if params == "folia-arch" {
					folia(ar.Name()+".", &buf, fp)
				} else if params == "tei-arch" {
					tei(ar.Name()+".", &buf, fp)
				} else {
					fmt.Fprintln(fp)
				}
			}
			fp.Close()
			ar.Close()
			if params == "folia-arch" || params == "tei-arch" {
				os.Rename(data+".unzip", data+".tmp")
			}
		}
	}

	if params == "auto" {
		if isArch {
			params, err = invoersoort(db, data+".unzip", task.id)
		} else {
			params, err = invoersoort(db, data, task.id)
		}
		if err != nil {
			return
		}
		if params == "dact" {
			os.Rename(data, dact)
		}
	}

	if isArch {
		setinvoer(db, params, task.id, true)
	}

	// Cleanup that runs at every return from here on. It is skipped, and err
	// is replaced, when the task or the whole program is being stopped.
	defer func() {
		select {
		case <-chGlobalExit:
			err = errGlobalExit
			return
		case <-task.chKill:
			err = errKilled
			return
		default:
		}
		os.Remove(data + ".lines.tmp")
		os.Remove(data + ".tmp")
		os.Remove(data + ".tmp2")
		for _, f := range []string{data, data + ".lines", stdout, stderr, summary} {
			// for dact and xmlzip the data file itself is not passed to gz
			if f == data && (params == "dact" || strings.HasPrefix(params, "xmlzip")) {
				continue
			}
			logerr(gz(f))
		}
		fnames, e := filenames2(xml, false)
		if logerr(e) {
			return
		}
		for _, fname := range fnames {
			select {
			case <-chGlobalExit:
				err = errGlobalExit
				return
			case <-task.chKill:
				err = errKilled
				return
			default:
			}
			logerr(gz(filepath.Join(xml, fname)))
		}
		if params == "dact" && !Cfg.Dact {
			os.Remove(dact)
		}
		if strings.HasPrefix(params, "xmlzip") {
			os.Remove(data)
		}
	}()

	select {
	case <-chGlobalExit:
		err = errGlobalExit
		return
	case <-task.chKill:
		err = errKilled
		return
	default:
	}

	if params == "dact" {
		var tokens, nlines int
		tokens, nlines, err = unpackDact(data, xml, dact, stderr, task.chKill)
		if err != nil {
			return
		}
		err = do_quotum(db, task.id, user, tokens, nlines)
		if err != nil {
			os.Remove(dact)
			os.Remove(dact + "x")
			os.Remove(data + ".lines")
			os.Remove(stderr)
			os.RemoveAll(xml)
			return
		}
	} else if strings.HasPrefix(params, "xmlzip") {
		var tokens, nlines int
		tokens, nlines, err = unpackXml(data, xml, stderr, task.chKill)
		if err != nil {
			return
		}
		err = do_quotum(db, task.id, user, tokens, nlines)
		if err != nil {
			os.Remove(data + ".lines")
			os.Remove(stderr)
			os.RemoveAll(xml)
			return
		}
	} else { // if params != (dact || xmlzip)
		reuse := false
		reuse_more := false
		// files already present in the xml directory: reuse them and only
		// parse the lines whose xml file is missing or incomplete
		if files, e := filenames2(xml, false); e == nil && len(files) > 0 {
			reuse = true
			if isArch {
				os.Remove(data + ".unzip")
			}
			// an xml file counts as done when it contains the closing tag
			done := make(map[string]bool)
			for _, f := range files {
				b, e := ioutil.ReadFile(filepath.Join(xml, f))
				if e != nil {
					err = e
					return
				}
				if strings.Index(string(b), "</alpino_ds>") > 0 {
					done[f] = true
				}
			}
			fpin, e := os.Open(data + ".lines")
			if e != nil {
				err = e
				return
			}
			fpout, e := os.Create(data + ".lines.tmp")
			if e != nil {
				fpin.Close()
				err = e
				return
			}
			nword := 0
			r := util.NewReader(fpin)
			for {
				line, e := r.ReadLineString()
				if e != nil {
					fpout.Close()
					fpin.Close()
					if e != io.EOF {
						err = e
						return
					}
					break
				}
				nword += len(strings.Fields(line))
				// NOTE(review): strings.Index returns -1 for a line without "|", which makes this slice expression panic
				key := line[:strings.Index(line, "|")]
				if !done[key+".xml"] {
					fmt.Fprintln(fpout, line)
					reuse_more = true
				}
			}

			_, err = db.Exec(fmt.Sprintf("UPDATE `%s_info` SET `nword` = %d WHERE `id` = %q",
				Cfg.Prefix, nword, task.id))
			if err != nil {
				return
			}

		} else { // if !reuse
			// derive the properties of the input from params
			var has_tok, has_lbl, is_xml, is_arch bool
			if strings.Contains(params, "-arch") {
				is_arch = true
			}
			if strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") {
				is_xml = true
				has_lbl = true
				has_tok = true
			}
			if strings.Contains(params, "-lbl") {
				has_lbl = true
			}
			if strings.Contains(params, "-tok") {
				has_tok = true
			}

			if is_xml {
				// a single FoLiA/TEI file is converted here; for archives
				// data.tmp was already produced while unpacking
				if !is_arch {
					var fpin, fpout *os.File
					fpin, err = os.Open(data)
					if err != nil {
						return
					}
					fpout, err = os.Create(data + ".tmp")
					if err != nil {
						fpin.Close()
						return
					}
					if params == "folia" {
						err = folia("", fpin, fpout)
					} else {
						err = tei("", fpin, fpout)
					}
					fpout.Close()
					fpin.Close()
					if err != nil {
						return
					}
				}
			} else {
				// pqtexter
				var pqtexter, unzip string
				if params == "run" {
					pqtexter = "-r"
				} else if has_lbl {
					pqtexter = "-l"
				}
				if isArch {
					unzip = ".unzip"
				}
				err = shell("pqtexter %s %s%s > %s.tmp 2>> %s", pqtexter, data, unzip, data, stderr).Run()
				if err != nil {
					return
				}
				if isArch {
					os.Remove(data + ".unzip")
				}
			}

			// remove comments and labels without text
			if params == "run" || strings.HasPrefix(params, "line") {
				var fpin, fpout *os.File
				fpin, err = os.Open(data + ".tmp")
				if err != nil {
					return
				}
				fpout, err = os.Create(data + ".tmp2")
				if err != nil {
					fpin.Close()
					return
				}
				rd := util.NewReader(fpin)
				for {
					line, e := rd.ReadLineString()
					if e != nil {
						break
					}
					if strings.TrimSpace(line) == "" || line[0] == '%' || reRunLabel.MatchString(line) {
						continue
					}
					fmt.Fprintln(fpout, line)
				}
				fpout.Close()
				fpin.Close()
				os.Remove(data + ".tmp")
				os.Rename(data+".tmp2", data+".tmp")
			}

			// de-duplicate labels
			if strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") || strings.Contains(params, "-lbl") {
				var fpin, fpout *os.File
				var dubbel bool
				fpin, err = os.Open(data + ".tmp")
				if err != nil {
					return
				}
				fpout, err = os.Create(data + ".tmp2")
				if err != nil {
					fpin.Close()
					return
				}
				rd := util.NewReader(fpin)
				seen := make(map[string]bool)
				for {
					line, e := rd.ReadLineString()
					if e != nil {
						break
					}
					if strings.HasPrefix(line, "##PAQULBL") {
						var b []byte
						// NOTE(review): strings.Fields(line)[1] panics on a "##PAQULBL" line without a value
						b, err = hex.DecodeString(strings.Fields(line)[1])
						if err != nil {
							fpout.Close()
							fpin.Close()
							return
						}
						lbl := string(b)
						if seen[lbl] {
							dubbel = true
							// find the first unused "<label>.dup.<i>"
							for i := 1; true; i++ {
								lbl2 := fmt.Sprintf("%s.dup.%d", lbl, i)
								if !seen[lbl2] {
									lbl = lbl2
									break
								}
							}
							line = "##PAQULBL " + hex.EncodeToString([]byte(lbl))
						}
						seen[lbl] = true
					}
					fmt.Fprintln(fpout, line)
				}
				fpout.Close()
				fpin.Close()
				// the rewritten file only replaces the original when a duplicate was found
				if dubbel {
					os.Remove(data + ".tmp")
					os.Rename(data+".tmp2", data+".tmp")
				} else {
					os.Remove(data + ".tmp2")
				}
			}

			// tokenizer
			if !has_tok {
				var tok string
				if params == "run" {
					tok = "tokenize.sh"
				} else {
					tok = "tokenize_no_breaks.sh"
				}
				err = shell("$ALPINO_HOME/Tokenization/%s < %s.tmp > %s.tmp2 2>> %s", tok, data, data, stderr).Run()
				if err != nil {
					return
				}
				os.Rename(data+".tmp2", data+".tmp")
			}

			// convert data.tmp into data.lines ("<name>|<sentence>" per line)
			// and write a .meta file next to each sentence that has metadata
			var fp, fpin *os.File
			fpin, err = os.Open(data + ".tmp")
			if err != nil {
				return
			}
			fp, err = os.Create(data + ".lines")
			if err != nil {
				fpin.Close()
				return
			}

			metalines := make([]string, 0)
			inmeta := false

			rd := util.NewReader(fpin)
			var filename, lbl string
			var tokens, nlines, i int
			var metaseen map[string]bool
			for {
				line, e := rd.ReadLineString()
				if e == io.EOF {
					break
				}
				if e != nil {
					err = e
					fp.Close()
					fpin.Close()
					return
				}
				if line == "" {
					continue
				}
				if strings.HasPrefix(line, "##PAQU") {
					// "##PAQUFILE <hex>" or "##PAQULBL <hex>": decode the value
					a := strings.Fields(line)
					var val string
					if len(a) == 2 {
						b, e := hex.DecodeString(a[1])
						if e != nil {
							err = e
							fp.Close()
							fpin.Close()
							return
						}
						val = strings.TrimSpace(string(b))
					}
					if a[0] == "##PAQUFILE" {
						filename = val
						if strings.HasSuffix(filename, ".txt") || strings.HasSuffix(filename, ".doc") {
							filename = filename[:len(filename)-4]
						}
						// new file: restart the per-file counter and the
						// metadata, seeded with one entry per path component
						i = 0
						metalines = metalines[0:0]
						pp := strings.Split(filename, "/")
						for i, p := range pp {
							metalines = append(metalines, fmt.Sprintf("text\tpaqu.path%d\t%s", i+1, p))
						}
						metaseen = make(map[string]bool)
						inmeta = true
					} else if a[0] == "##PAQULBL" {
						lbl = val
					}
				} else if strings.HasPrefix(line, "##META") {
					// first ##META line of a new run of metadata lines
					if !inmeta {
						metaseen = make(map[string]bool)
						inmeta = true
					}
					a := strings.Fields(line)
					if len(a) == 2 {
						b, e := hex.DecodeString(a[1])
						if e != nil {
							err = e
							fp.Close()
							fpin.Close()
							return
						}
						// decoded form: <type> <name> = [<value>...]
						f := strings.Fields(string(b))
						if len(f) > 2 && f[2] == "=" {
							// if this is the first occurrence, discard all older ones with the same name
							if !metaseen[f[1]] {
								metaseen[f[1]] = true
								for i := 0; i < len(metalines); i++ {
									if i == 0 && filename != "" {
										continue
									}
									if a := strings.Split(metalines[i], "\t"); a[1] == f[1] {
										metalines = append(metalines[:i], metalines[i+1:]...)
										i--
									}
								}
							}
							if len(f) > 3 {
								metalines = append(metalines, fmt.Sprintf("%s\t%s\t%s", f[0], f[1], strings.Join(f[3:], " ")))
							}
						}
					}
				} else {
					// a sentence: build its file name from the running line
					// number plus the pending label or the current file name
					inmeta = false
					tokens += len(strings.Fields(line))
					var fname string
					if lbl == "" {
						if filename != "" {
							i++
							fname = fmt.Sprintf("%04d/%04d-%s.%d", nlines/10000, nlines%10000, encode_filename(filename), i)
						} else {
							fname = fmt.Sprintf("%04d/%04d", nlines/10000, nlines%10000)
						}
					} else {
						fname = fmt.Sprintf("%04d/%04d-%s", nlines/10000, nlines%10000, encode_filename(lbl))
						lbl = ""
					}
					fmt.Fprintf(fp, "%s|%s\n", fname, strings.TrimSpace(line))
					nlines++
					if len(metalines) > 0 {
						// NOTE(review): nothing in this function creates the sub-directory of xml used in fname -- confirm who does
						fpm, e := os.Create(filepath.Join(xml, fname+".meta"))
						if e != nil {
							err = e
							fp.Close()
							fpin.Close()
							return
						}
						for _, m := range metalines {
							fmt.Fprintln(fpm, m)
						}
						fpm.Close()
					}
				}
			}
			fp.Close()
			fpin.Close()
			os.Remove(data + ".tmp")

			err = do_quotum(db, task.id, user, tokens, nlines)
			if err != nil {
				os.Remove(data)
				os.Remove(data + ".lines")
				os.Remove(stderr)
				os.RemoveAll(xml)
				return
			}

			// NOTE(review): this check is redundant, err is always nil here
			if err != nil {
				return
			}

		} // end if !reuse

		// parse with pqalpino: everything on a fresh run, or only the
		// remaining lines (data.lines.tmp) on a restart
		if !reuse || reuse_more {

			var ext string
			if reuse {
				ext = ".tmp"
			}

			var server, timeout string
			if Cfg.Alpinoserver != "" {
				server = "-s " + Cfg.Alpinoserver
			}
			if Cfg.Timeout > 0 {
				timeout = fmt.Sprint("-t ", Cfg.Timeout)
			}
			cmd := shell(
				`pqalpino -e half -l -T -q -n %d -d %s %s %s %s.lines%s >> %s 2>> %s`,
				Cfg.Maxtokens, xml, server, timeout, data, ext, stdout, stderr)
			err = run(cmd, task.chKill, nil)
			if err != nil {
				return
			}
		}
	} // end if params != (dact || xmlzip)

	// count the lines in data.lines for the summary
	nlines := 0
	fp, e := os.Open(data + ".lines")
	if e != nil {
		err = e
		return
	}
	rd := util.NewReader(fp)
	for {
		_, e := rd.ReadLine()
		if e != nil {
			fp.Close()
			if e == io.EOF {
				break
			} else {
				err = e
				return
			}
		}
		nlines++
	}

	// collect one summary line per distinct failed sentence from the "Q#"
	// lines in stderr.txt
	errlines := make([]string, 0)
	fp, e = os.Open(stderr)
	if e != nil {
		err = e
		return
	}
	rd = util.NewReader(fp)
	errseen := make(map[string]bool)
	for {
		line, e := rd.ReadLineString()
		if e != nil {
			fp.Close()
			if e == io.EOF {
				break
			} else {
				err = e
				return
			}
		}
		if strings.HasPrefix(line, "Q#") {
			a := strings.Split(line, "|")
			ln := len(a)
			// NOTE(review): a[ln-3] panics when the line has fewer than three "|"-separated fields;
			// err declared here shadows the named result err
			n, err := strconv.Atoi(a[ln-3])
			// a positive number in the third field from the end: not reported as a failure
			if err == nil && n > 0 {
				continue
			}

			// the sentence itself may contain "|": join those parts again
			if ln > 5 {
				a[1] = strings.Join(a[1:ln-3], "|")
			}
			fname := a[0][2:]
			if params == "run" || strings.HasPrefix(params, "line") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") {
				fname = decode_filename(a[0][2:])
			}
			if strings.Contains(params, "-lbl") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") {
				fname = fname[1+strings.Index(fname, "-"):]
			}
			// after a restart, failed sentences are parsed again, and then fail again
			errline := fname + "\t" + a[ln-3] + "\t" + a[ln-2] + "\t" + a[1] + "\n"
			if !errseen[errline] {
				errlines = append(errlines, errline)
				errseen[errline] = true
			}
		}
	}
	// summary.txt: header line "<lines> TAB <errors> TAB <source>", then the error lines
	fp, err = os.Create(summary)
	if err != nil {
		return
	}
	fmt.Fprintf(fp, "%d\t%d\t%s\n", nlines, len(errlines), invoertabel[params])
	for _, line := range errlines {
		fmt.Fprint(fp, line)
	}
	fp.Close()

	// p and d become the -p and -d arguments of pqbuild below: p is a
	// regular expression that starts with the quoted xml directory
	p := regexp.QuoteMeta(xml + "/")
	d := ""
	if strings.Contains(params, "-lbl") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") || isArch {
		p += "[0-9]+/[0-9]+-"
		d = "-d"
	} else if params == "dact" || strings.HasPrefix(params, "xmlzip") {
		p += "[0-9]+/"
		d = "-d"
	}

	// merge each .meta file into its .xml file as a <metadata> element
	// placed directly before the first "<node"
	// NOTE(review): this assumes filenames2(xml, true) returns the names of the .meta files -- confirm
	filenames, e := filenames2(xml, true)
	if e != nil {
		err = e
		return
	}
	for _, filename := range filenames {
		m := filepath.Join(xml, filename)
		x := m[:len(m)-4] + "xml"
		var xb, mb []byte
		xb, err = ioutil.ReadFile(x)
		if err != nil {
			// no xml file for this sentence: drop the metadata
			os.Remove(m)
			continue
		}
		mb, err = ioutil.ReadFile(m)
		if err != nil {
			return
		}
		fp, err = os.Create(x + ".tmp")
		if err != nil {
			return
		}
		xt := string(xb)
		mt := strings.Split(strings.TrimSpace(string(mb)), "\n")
		// NOTE(review): i is -1 when the xml contains no "<node", which makes xt[:i] panic
		i := strings.Index(xt, "<node")
		fmt.Fprint(fp, xt[:i], "<metadata>\n")
		for _, m := range mt {
			mm := strings.Split(m, "\t")
			fmt.Fprintf(fp,
				"    <meta type=%q name=%q value=%q/>\n",
				html.EscapeString(strings.ToLower(mm[0])),
				html.EscapeString(mm[1]),
				html.EscapeString(mm[2]))
		}
		fmt.Fprint(fp, "  </metadata>\n  ", xt[i:])
		fp.Close()
		os.Rename(x+".tmp", x)
		os.Remove(m)
	}

	cmd := shell(
		// option -w because of recover()
		`find %s -name '*.xml' | sort | pqbuild -w -p %s %s -s %s %s %s 0 >> %s 2>> %s`,
		dirname,
		quote(p), d,
		filepath.Base(dirname), quote(title), quote(user), stdout, stderr)
	err = run(cmd, task.chKill, nil)
	if err != nil {
		return
	}

	if Cfg.Dact && params != "dact" {
		p := ""
		if strings.Contains(params, "-lbl") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") || isArch {
			p = "-"
		} else if strings.HasPrefix(params, "xmlzip") {
			p = "/"
		}
		err = makeDact(dact, xml, p, task.chKill)
		if err != nil {
			return
		}
	}

	return
}
Exemplo n.º 5
0
// download sends one of the result files of a corpus to the client.
//
// Request parameters (read with first):
//   - id: corpus id. The corpus must be in q.myprefixes, or be listed in the
//     <prefix>_info table with q.user as owner (a corpus that failed)
//   - dl: what to send:
//     "summary", "stdout", "stderr": the gunzipped text file of that name
//     "zinnen": data.lines; for labelled, archive, FoLiA and TEI input the
//     generated file-name prefix of every line is replaced by the
//     decoded label
//     "dact":   data.dact as an attachment
//     "xml":    a zip file with all gunzipped xml files
//
// "xml" and "dact" are refused when q.protected[id] is set.
func download(q *Context) {
	id := first(q.r, "id")
	dl := first(q.r, "dl")

	params := q.params[id]
	if !q.myprefixes[id] {
		// perhaps a corpus that failed
		// id comes from the request, so it is passed as a bound parameter
		rows, err := q.db.Query(
			fmt.Sprintf("SELECT `params` FROM `%s_info` WHERE `id` = ? AND `owner` = ?",
				Cfg.Prefix),
			id,
			q.user)
		if err != nil {
			http.Error(q.w, err.Error(), http.StatusInternalServerError)
			logerr(err)
			return
		}
		if !rows.Next() {
			rows.Close()
			http.Error(q.w, "Dat is niet je corpus", http.StatusUnauthorized)
			return
		}
		err = rows.Scan(&params)
		rows.Close()
		if err != nil {
			http.Error(q.w, err.Error(), http.StatusInternalServerError)
			logerr(err)
			return
		}
	}

	if q.protected[id] && (dl == "xml" || dl == "dact") {
		http.Error(q.w, "Dat is afgeleid van een corpus dat niet van jou is", http.StatusUnauthorized)
		return
	}

	datadir := filepath.Join(paqudir, "data", id)
	var filename string
	switch dl {
	case "summary":
		filename = "summary.txt"
	case "stdout":
		filename = "stdout.txt"
	case "stderr":
		filename = "stderr.txt"
	case "zinnen":
		// data.lines can be sent unchanged only when the names in it were
		// not derived from labels or file names
		if !strings.Contains(params, "-lbl") && !strings.Contains(params, "-arch") && !strings.HasPrefix(params, "folia") && !strings.HasPrefix(params, "tei") {
			filename = "data.lines"
		}
	case "dact":
	case "xml":
	default:
		http.Error(q.w, "Ongeldige selectie: "+dl, http.StatusUnauthorized)
		return
	}

	// plain text file, stored gzipped
	if filename != "" {
		fp, err := os.Open(filepath.Join(datadir, filename+".gz"))
		if err != nil {
			http.Error(q.w, err.Error(), http.StatusInternalServerError)
			logerr(err)
			return
		}
		defer fp.Close()
		r, err := gzip.NewReader(fp)
		if err != nil {
			http.Error(q.w, err.Error(), http.StatusInternalServerError)
			logerr(err)
			return
		}
		defer r.Close()
		q.w.Header().Set("Content-Type", "text/plain; charset=utf-8")
		// a copy error means the client went away; nothing can be reported
		io.Copy(q.w, r)
		return
	}

	// dact
	if dl == "dact" {
		fp, err := os.Open(filepath.Join(datadir, "data.dact"))
		if err != nil {
			http.Error(q.w, err.Error(), http.StatusInternalServerError)
			logerr(err)
			return
		}
		defer fp.Close()
		q.w.Header().Set("Content-Type", "application/octet-stream")
		q.w.Header().Set("Content-Disposition", "attachment; filename="+id+".dact")
		io.Copy(q.w, fp)
		return
	}

	// data.lines with the wrong labels
	if dl == "zinnen" {
		fp, err := os.Open(filepath.Join(datadir, "data.lines.gz"))
		if err != nil {
			http.Error(q.w, err.Error(), http.StatusInternalServerError)
			logerr(err)
			return
		}
		defer fp.Close()
		r, err := gzip.NewReader(fp)
		if err != nil {
			http.Error(q.w, err.Error(), http.StatusInternalServerError)
			logerr(err)
			return
		}
		defer r.Close()

		q.w.Header().Set("Content-Type", "text/plain; charset=utf-8")
		rd := util.NewReader(r)
		for {
			line, err := rd.ReadLineString()
			if err != nil {
				break
			}
			a := strings.SplitN(line, "|", 2)
			if len(a) < 2 {
				// no "|" separator: pass the line through unchanged
				// rather than panicking on a[1]
				fmt.Fprintln(q.w, line)
				continue
			}
			// drop everything up to and including the first "-" of the
			// generated name, then decode what remains into the label
			lbl := decode_filename(a[0][1+strings.Index(a[0], "-"):])
			fmt.Fprintf(q.w, "%s|%s\n", lbl, a[1])
		}
		return
	}

	// xml
	datadir = filepath.Join(datadir, "xml")
	files, err := filenames2(datadir, false)
	if err != nil {
		http.Error(q.w, err.Error(), http.StatusInternalServerError)
		logerr(err)
		return
	}

	q.w.Header().Set("Content-Type", "application/zip")
	q.w.Header().Set("Content-Disposition", "attachment; filename="+id+".zip")

	// Once the zip stream has started, an error can only be logged; the
	// zip is then left unfinished.
	w := zip.NewWriter(q.w)
	for _, gzname := range files {
		fullgzname := filepath.Join(datadir, gzname)
		file, err := os.Stat(fullgzname)
		if err != nil {
			logerr(err)
			return
		}

		// name inside the zip: without ".gz", decoded, and without the
		// generated numeric prefix
		name := decode_filename(strings.TrimSuffix(gzname, ".gz"))
		if params == "dact" || strings.HasPrefix(params, "xmlzip") {
			name = name[1+strings.Index(name, "/"):]
		} else if strings.Contains(params, "-lbl") || strings.Contains(params, "-arch") || strings.HasPrefix(params, "folia") || strings.HasPrefix(params, "tei") {
			name = name[1+strings.Index(name, "-"):]
		}

		fh, err := zip.FileInfoHeader(file)
		if err != nil {
			logerr(err)
			return
		}
		fh.Name = filepath.Join(id, name)
		f, err := w.CreateHeader(fh)
		if err != nil {
			logerr(err)
			return
		}

		fp, err := os.Open(fullgzname)
		if err != nil {
			logerr(err)
			return
		}
		r, err := gzip.NewReader(fp)
		if err != nil {
			fp.Close()
			logerr(err)
			return
		}

		io.Copy(f, r)
		r.Close()
		fp.Close()
	}
	err = w.Close()
	if err != nil {
		logerr(err)
		return
	}
}
Exemplo n.º 6
0
// invoersoort guesses the input type of the file data, stores the result
// for corpus id with setinvoer, and returns it.
//
// Possible results:
//   - "dact":  one of the known magic numbers at byte offset 0 or 12
//   - "folia": "<FoLiA" occurs in the first 200 bytes
//   - "tei":   "<TEI" occurs in the first 200 bytes
//   - "run":   running text
//   - "line", optionally followed by "-lbl" and/or "-tok": one sentence per
//     line; "-lbl" when every sampled line contains "|", "-tok" when more
//     than three quarters of the sampled lines end in " .", " !" or " ?"
//
// The text heuristics look at the first 20 lines that are not blank, not a
// "##PAQU"/"##META" directive, not a "%" comment, and do not match
// reRunLabel.
//
// The returned error is that of opening or rewinding the file, or of
// setinvoer.
func invoersoort(db *sql.DB, data, id string) (string, error) {

	set := func(soort string) (string, error) {
		return soort, setinvoer(db, soort, id, false)
	}

	fp, err := os.Open(data)
	if err != nil {
		return "", err
	}
	defer fp.Close()

	// Sniff at most 200 bytes. The error of ReadFull is ignored on purpose:
	// a short file is not an error, n says how much was read.
	b := make([]byte, 200)
	n, _ := io.ReadFull(fp, b)
	if _, err := fp.Seek(0, 0); err != nil {
		// the line-based checks below need to start at the beginning
		return "", err
	}
	head := string(b[:n])

	// the comparison needs four bytes, so require that four were read
	if n >= 4 {
		switch head[:4] {
		case "\x00\x06\x15\x61", "\x61\x15\x06\x00",
			"\x00\x05\x31\x62", "\x62\x31\x05\x00":
			return set("dact")
		}
	}

	if n >= 16 {
		switch head[12:16] {
		case "\x00\x06\x15\x61", "\x61\x15\x06\x00",
			"\x00\x05\x31\x62", "\x62\x31\x05\x00",
			"\x00\x04\x22\x53", "\x53\x22\x04\x00":
			return set("dact")
		}
	}

	if strings.Contains(head, "<FoLiA") {
		return set("folia")
	}

	if strings.Contains(head, "<TEI") {
		return set("tei")
	}

	// sample the first 20 lines that carry text
	lines := make([]string, 0, 20)
	rd := util.NewReader(fp)
	for len(lines) < 20 {
		line, e := rd.ReadLineString()
		if e != nil {
			break
		}
		line = strings.ToUpper(line)
		line = strings.Replace(line, "\000", "", -1) // utf-16, utf-32, crude method
		if strings.TrimSpace(line) == "" ||
			strings.HasPrefix(line, "##PAQU") ||
			strings.HasPrefix(line, "##META") ||
			line[0] == '%' ||
			reRunLabel.MatchString(line) {
			continue
		}
		lines = append(lines, line)
	}
	ln := len(lines)
	if ln < 2 {
		return set("run")
	}

	endletter := 0 // lines that do not match reEndPoint
	midpoint := 0  // total number of reMidPoint matches
	nlabel := 0    // lines that contain "|"
	for _, line := range lines {
		if !reEndPoint.MatchString(line) {
			endletter++
		}
		midpoint += len(reMidPoint.FindAllString(line, -1))
		if strings.Contains(line, "|") {
			nlabel++
		}
	}
	if nlabel < ln && (endletter > ln/3 || midpoint > endletter/2) {
		return set("run")
	}

	soort := "line"

	ntok := 0 // lines that end in a separate punctuation token
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if strings.HasSuffix(line, " .") || strings.HasSuffix(line, " !") || strings.HasSuffix(line, " ?") {
			ntok++
		}
	}
	if nlabel == ln {
		soort += "-lbl"
	}
	if ntok > (3*ln)/4 {
		soort += "-tok"
	}

	return set(soort)
}
Exemplo n.º 7
0
// main is the entry point of the command-line language classifier.
//
// It parses the flags, sets up a textcat classifier (extra profiles from -p,
// enabled and disabled languages from -z, -r, -b and -i), and then does one
// of three things:
//   - with -a: print the active languages, one per line
//   - with -l: classify the input line by line, printing the comma-separated
//     result (or the error), a tab, and the line itself
//   - otherwise: classify the input as a whole and print one language per
//     line, or the error
//
// Input is the file named by -f, else the command-line arguments joined with
// spaces, else standard input.
func main() {
	flag.Parse()

	// no file, no arguments, nothing piped in and no -a: show usage
	noInput := *opt_f == "" && flag.NArg() == 0
	if noInput && util.IsTerminal(os.Stdin) && !*opt_a {
		fmt.Fprintf(os.Stderr, "\nUsage: %s [args] [text]\n\nargs with default values are:\n\n", os.Args[0])
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nIf both -f and text are missing, read from stdin\n\n")
		return
	}

	classifier := textcat.NewTextCat()

	// every comma-separated item of -p is added as a language named after
	// the part of its base name before the first dot
	var added []string
	if *opt_p != "" {
		for _, profile := range strings.Split(*opt_p, ",") {
			lang := strings.Split(path.Base(profile), ".")[0]
			added = append(added, lang)
			util.CheckErr(classifier.AddLanguage(lang, profile))
		}
	}

	useRaw := *opt_r || *opt_b
	useUtf8 := *opt_b || !*opt_r
	if !*opt_z {
		if useRaw {
			classifier.EnableAllRawLanguages()
		}
		if useUtf8 {
			classifier.EnableAllUtf8Languages()
		}
	} else {
		// -z: enable only the languages that were added with -p
		if useRaw {
			for _, lang := range added {
				classifier.EnableLanguages(lang + ".raw")
			}
		}
		if useUtf8 {
			for _, lang := range added {
				classifier.EnableLanguages(lang + ".utf8")
			}
		}
	}
	if *opt_i != "" {
		classifier.DisableLanguages(strings.Split(*opt_i, ",")...)
	}

	if *opt_a {
		for _, lang := range classifier.ActiveLanguages() {
			fmt.Println(lang)
		}
		return
	}

	if *opt_l {
		// line mode: pick the input source, then classify each line
		var rd *util.Reader
		switch {
		case *opt_f != "":
			fp, err := os.Open(*opt_f)
			util.CheckErr(err)
			defer fp.Close()
			rd = util.NewReader(fp)
		case flag.NArg() > 0:
			rd = util.NewReader(bytes.NewBufferString(strings.Join(flag.Args(), " ")))
		default:
			rd = util.NewReader(os.Stdin)
		}
		for {
			line, err := rd.ReadLineString()
			if err == io.EOF {
				break
			}
			util.CheckErr(err)
			if langs, err := classifier.Classify(line); err == nil {
				fmt.Print(strings.Join(langs, ","))
			} else {
				fmt.Print(err)
			}
			fmt.Println("\t" + line)
		}
		return
	}

	// whole-text mode
	var text string
	switch {
	case *opt_f != "":
		content, err := ioutil.ReadFile(*opt_f)
		util.CheckErr(err)
		text = string(content)
	case flag.NArg() > 0:
		text = strings.Join(flag.Args(), " ")
	default:
		content, err := ioutil.ReadAll(os.Stdin)
		util.CheckErr(err)
		text = string(content)
	}

	langs, err := classifier.Classify(text)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(strings.Join(langs, "\n"))
}