Beispiel #1
0
// saveOpenDact looks up the archive file name stored under id arch in the
// arch table of corpus prefix and opens that file with dbxml.Open.
//
// On success it returns the opened database and the file name. Every error
// is handed to doErr; when doErr reports an error, when no row matched, or
// when the stored name is empty, the result is (nil, "").
func saveOpenDact(q *Context, prefix string, arch int) (interface{}, string) {
	stmt := fmt.Sprintf("SELECT `arch` FROM `%s_c_%s_arch` WHERE `id` = %d", Cfg.Prefix, prefix, arch)
	result, queryErr := q.db.Query(stmt)
	if doErr(q, queryErr) {
		return nil, ""
	}

	// Should more than one row match, the value scanned last is the one kept.
	var path string
	for result.Next() {
		if doErr(q, result.Scan(&path)) {
			result.Close()
			return nil, ""
		}
	}
	if doErr(q, result.Err()) || path == "" {
		return nil, ""
	}

	handle, openErr := dbxml.Open(path)
	if doErr(q, openErr) {
		return nil, ""
	}
	return handle, path
}
Beispiel #2
0
// do_dact opens the dact file filename, prints its name, and passes the name
// and content of every document in it to do_data. Afterwards it calls
// showmemstats. All errors, including an error that ends the document
// iteration early, are handed to util.CheckErr.
func do_dact(filename string) {
	reader, err := dbxml.Open(filename)
	util.CheckErr(err)
	// Deferred so the database is also released when do_data panics.
	defer reader.Close()

	fmt.Println(">>>", filename)
	docs, err := reader.All()
	util.CheckErr(err)
	for docs.Next() {
		do_data(filename, docs.Name(), []byte(docs.Content()))
	}
	// Next returning false can also mean the iteration failed; do not treat
	// a partially processed archive as a success.
	util.CheckErr(docs.Error())
	showmemstats()
}
Beispiel #3
0
// get_dact returns the content of the document named filename stored in the
// dact file archive.
//
// When the archive cannot be opened or the document cannot be fetched, an
// empty (non-nil) byte slice is returned together with the error.
func get_dact(archive, filename string) ([]byte, error) {
	db, openErr := dbxml.Open(archive)
	if openErr != nil {
		return []byte{}, openErr
	}
	defer db.Close()

	doc, getErr := db.Get(filename)
	if getErr != nil {
		return []byte{}, getErr
	}

	content := []byte(doc)
	return content, nil
}
Beispiel #4
0
// main copies every document from the dact file named by the first argument
// into the file named by the second argument. Each document is parsed as an
// Alpino_ds; when expand reports that it changed the parse, the document is
// re-serialised (dropping empty metadata/comments elements) before it is
// stored. Finally the output file is given the same mode as the input file.
//
// Every error is handed to x.
func main() {
	if len(os.Args) != 3 {
		fmt.Printf(`
Syntax: %s infile.dact outfile.xdact

`, os.Args[0])
		return
	}
	db1, err := dbxml.Open(os.Args[1])
	x(err)
	db2, err := dbxml.Open(os.Args[2])
	x(err)
	docs, err := db1.All()
	x(err)
	for docs.Next() {
		name := docs.Name()
		fmt.Fprintln(os.Stderr, name)
		content := docs.Content()
		alpino := Alpino_ds{}
		err = xml.Unmarshal([]byte(content), &alpino)
		x(err)
		if expand(&alpino) {
			b, err := xml.MarshalIndent(&alpino, "", "  ")
			x(err)
			content = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
				strings.Replace(
					strings.Replace(string(b), "  <metadata></metadata>\n", "", 1),
					"  <comments></comments>\n", "", 1) + "\n"
		}
		x(db2.PutXml(name, content, false))
	}
	// Next returning false may mean the iteration failed half-way; without
	// this check the output file would be silently incomplete.
	x(docs.Error())
	db2.Close()
	db1.Close()
	st, err := os.Stat(os.Args[1])
	x(err)
	x(os.Chmod(os.Args[2], st.Mode()))
}
Beispiel #5
0
// main checks whether the query given as second argument can be prepared
// against the dact file given as first argument. It prints "OK" on success
// and the error otherwise.
func main() {
	// Both arguments are indexed below; without them the program would
	// panic with an index-out-of-range error instead of explaining itself.
	if len(os.Args) < 3 {
		fmt.Printf("Syntax: %s file.dact query\n", os.Args[0])
		return
	}

	db, err := dbxml.Open(os.Args[1])
	if err != nil {
		fmt.Println(err)
		return
	}
	defer db.Close()

	if _, err = db.Prepare(os.Args[2]); err != nil {
		fmt.Println(err)
		return
	}

	fmt.Println("OK")
}
Beispiel #6
0
// xpathstats runs the XPath query from request parameter "xpath" against
// every dact file of the selected corpus and counts how often each
// combination of the requested attributes ("attr1" .. "attr5") occurs among
// the matching nodes.
//
// With request parameter "d" set, the output is a plain-text download.
// Otherwise an HTML page is written whose scripts call back into
// window.parent._fn; while the query is running, xpathout is called roughly
// every two seconds, and once more (with its last argument true) at the end.
//
// An attribute name starting with ':' refers to document metadata rather
// than to a node attribute. A query may consist of several parts separated
// by "+|+": the first part selects documents, the middle parts must each
// match within the document, and the matches of the last part are counted.
//
// Counting stops once 100000 distinct combinations have been collected.
func xpathstats(q *Context) {

	download := false
	if first(q.r, "d") != "" {
		download = true
	}

	methode := first(q.r, "mt")
	if methode != "dx" {
		methode = "std"
	}

	if download {
		contentType(q, "text/plain; charset=utf-8")
		q.w.Header().Set("Content-Disposition", "attachment; filename=telling.txt")
		cache(q)
	} else {
		contentType(q, "text/html; charset=utf-8")
		cache(q)
		fmt.Fprint(q.w, `<!DOCTYPE html>
<html>
<head>
<title></title>
<script type="text/javascript"><!--
function f(s) {
    window.parent._fn.update(s);
}
function e(s) {
    window.parent._fn.error(s);
}
function init(s) {
    window.parent._fn.init(s);
}
//--></script>
</head>
<body>
`)
	}

	prefix := getprefix(q)
	if !q.prefixes[prefix] {
		if download {
			// Fprint, not Fprintf: prefix comes from the request and must not
			// be interpreted as a format string.
			fmt.Fprint(q.w, "Invalid corpus: "+prefix)
		} else {
			updateJsonErr(q, "Invalid corpus: "+html.EscapeString(prefix))
			fmt.Fprintln(q.w, "</body>\n</html>")
		}
		return
	}

	// chClose signals that the client went away. Without CloseNotifier
	// support a channel is used that never delivers a value.
	var chClose <-chan bool
	if f, ok := q.w.(http.CloseNotifier); ok {
		chClose = f.CloseNotify()
	} else {
		chClose = make(<-chan bool)
	}

	// Collect the requested attributes and move the non-empty ones to the
	// front, so that attr[0:nAttr] holds them without gaps.
	attr := make([]string, 5)
	attr[0], attr[1], attr[2], attr[3], attr[4] =
		first(q.r, "attr1"), first(q.r, "attr2"), first(q.r, "attr3"), first(q.r, "attr4"), first(q.r, "attr5")
	wantRel := false
	j := 0
	for i := 0; i < 5; i++ {
		if attr[i] != "" {
			attr[j] = attr[i]
			j++
			if attr[i] == "rel" {
				wantRel = true
			}
		}
	}
	for ; j < 5; j++ {
		attr[j] = ""
	}
	if attr[0] == "" {
		if download {
			fmt.Fprintln(q.w, "Geen attribuut gekozen")
		} else {
			updateJsonErr(q, "Geen attribuut gekozen")
			fmt.Fprintln(q.w, "</body>\n</html>")
		}
		return
	}

	nAttr := 0
	for i := 0; i < 5; i++ {
		if attr[i] != "" {
			nAttr = i + 1
		}
	}

	// For every metadata attribute, look up its type and, for numeric and
	// date types, the range of stored values.
	var isMeta [5]bool
	var isInt [5]bool
	var isFloat [5]bool
	var isDate [5]bool
	var isDateTime [5]bool
	var iranges [5]*irange
	var franges [5]*frange
	var dranges [5]*drange
	var isidx [5]bool
	var aligns [5]string
	for i := 0; i < 5; i++ {
		aligns[i] = "left"
		if attr[i] != "" && attr[i][0] == ':' {
			isMeta[i] = true
			name := attr[i][1:]
			// name is taken from the request, so it is passed as a bound
			// parameter instead of being spliced into the SQL text.
			rows, err := q.db.Query(fmt.Sprintf("SELECT `type` FROM `%s_c_%s_midx` WHERE `name` = ?",
				Cfg.Prefix, prefix), name)
			if err != nil {
				updateError(q, err, !download)
				logerr(err)
				return
			}
			var t string
			for rows.Next() {
				if err := rows.Scan(&t); err != nil {
					updateError(q, err, !download)
					logerr(err)
					rows.Close()
					return
				}
			}
			if t == "INT" {
				rows, err := q.db.Query(fmt.Sprintf(
					"SELECT MIN(`ival`), MAX(`ival`), COUNT(DISTINCT `ival`) FROM `%s_c_%s_meta` JOIN `%s_c_%s_midx` USING (`id`) WHERE `name` = ? AND `idx` != 2147483647",
					Cfg.Prefix, prefix,
					Cfg.Prefix, prefix),
					name)
				if err != nil {
					updateError(q, err, !download)
					logerr(err)
					return
				}
				var min, max, count int
				for rows.Next() {
					if rows.Scan(&min, &max, &count) == nil {
						iranges[i] = newIrange(min, max, count)
						isInt[i] = true
						aligns[i] = "right"
						isidx[i] = true
					}
				}
			} else if t == "FLOAT" {
				rows, err := q.db.Query(fmt.Sprintf(
					"SELECT MIN(`fval`), MAX(`fval`) FROM `%s_c_%s_meta` JOIN `%s_c_%s_midx` USING (`id`) WHERE `name` = ? AND `idx` != 2147483647",
					Cfg.Prefix, prefix,
					Cfg.Prefix, prefix),
					name)
				if err != nil {
					updateError(q, err, !download)
					logerr(err)
					return
				}
				var min, max float64
				for rows.Next() {
					if rows.Scan(&min, &max) == nil {
						franges[i] = newFrange(min, max)
						isFloat[i] = true
						aligns[i] = "right"
						isidx[i] = true
					}
				}
			} else if t == "DATE" || t == "DATETIME" {
				rows, err := q.db.Query(fmt.Sprintf(
					"SELECT MIN(`dval`), MAX(`dval`) FROM `%s_c_%s_meta` JOIN `%s_c_%s_midx` USING (`id`) WHERE `name` = ? AND `idx` != 2147483647",
					Cfg.Prefix, prefix,
					Cfg.Prefix, prefix),
					name)
				if err != nil {
					updateError(q, err, !download)
					logerr(err)
					return
				}
				var min, max time.Time
				for rows.Next() {
					if rows.Scan(&min, &max) == nil {
						aligns[i] = "right"
						isidx[i] = true
						if t == "DATE" {
							dranges[i] = newDrange(min, max, 0, false)
							isDate[i] = true
						} else {
							dranges[i] = newDrange(min, max, 0, true)
							isDateTime[i] = true
						}
					}
				}
			}
		}
	}

	query := first(q.r, "xpath")

	if query == "" {
		if download {
			fmt.Fprintln(q.w, "Query ontbreekt")
		} else {
			updateJsonErr(q, "Query ontbreekt")
			fmt.Fprintln(q.w, "</body>\n</html>")
		}
		return
	}

	// Expand macros: every match of macroKY is replaced by the rule stored
	// under the match with its first and last character stripped.
	if strings.Contains(query, "%") {
		rules := getMacrosRules(q)
		query = macroKY.ReplaceAllStringFunc(query, func(s string) string {
			return rules[s[1:len(s)-1]]
		})
	}

	var owner string
	var nlines uint64
	rows, err := q.db.Query(fmt.Sprintf("SELECT `owner`,`nline` FROM `%s_info` WHERE `id` = %q", Cfg.Prefix, prefix))
	if err != nil {
		updateError(q, err, !download)
		logerr(err)
		return
	}
	for rows.Next() {
		if err := rows.Scan(&owner, &nlines); err != nil {
			updateError(q, err, !download)
			logerr(err)
			rows.Close()
			return
		}
	}
	if err := rows.Err(); err != nil {
		updateError(q, err, !download)
		logerr(err)
		return
	}

	// A corpus whose owner contains '@' has a single data.dact file in its
	// data directory; any other corpus lists its dact files in the arch table.
	dactfiles := make([]string, 0)
	if strings.Contains(owner, "@") {
		dactfiles = append(dactfiles, filepath.Join(paqudir, "data", prefix, "data.dact"))
	} else {
		rows, err := q.db.Query(fmt.Sprintf("SELECT `arch` FROM `%s_c_%s_arch` ORDER BY `id`", Cfg.Prefix, prefix))
		if err != nil {
			updateError(q, err, !download)
			logerr(err)
			return
		}
		for rows.Next() {
			var s string
			err := rows.Scan(&s)
			if err != nil {
				updateError(q, err, !download)
				logerr(err)
				rows.Close()
				return
			}
			if strings.HasSuffix(s, ".dact") {
				dactfiles = append(dactfiles, s)
			}
		}
		if err := rows.Err(); err != nil {
			updateError(q, err, !download)
			logerr(err)
			return
		}
	}

	if len(dactfiles) == 0 {
		fmt.Fprintln(q.w, "Er zijn geen dact-bestanden voor dit corpus")
		return
	}

	if !download {
		// Hand the table layout (download link, alignment, labels, index
		// flags) to the parent page; the first column is always the count.
		fmt.Fprintf(q.w, `<script type="text/javascript">
init({
"download": %q,
"aligns": ["right"`,
			strings.Replace(q.r.URL.RawQuery, "&", "&amp;", -1)+"&amp;d=1")
		for i := 0; i < nAttr; i++ {
			fmt.Fprint(q.w, `,"`, aligns[i], `"`)
		}
		fmt.Fprint(q.w, "],\n\"labels\": [\"aantal\"")
		for i := 0; i < nAttr; i++ {
			a := attr[i]
			if a[0] == ':' {
				a = a[1:]
			}
			fmt.Fprint(q.w, `,"`, html.EscapeString(a), `"`)
		}
		fmt.Fprint(q.w, "],\n\"isidx\": [true")
		for i := 0; i < nAttr; i++ {
			fmt.Fprintf(q.w, ",%v", isidx[i])
		}
		fmt.Fprintln(q.w, "]});\n</script>")
		if ff, ok := q.w.(http.Flusher); ok {
			ff.Flush()
		}
	}

	// now is the start time handed to xpathout; now2 is the time of the most
	// recent intermediate update.
	now := time.Now()
	now2 := time.Now()

	queryparts := strings.Split(query, "+|+")

	sums := make(map[[5]StructIS]int)
	count := 0
	tooMany := false
	var seen uint64
	for _, dactfile := range dactfiles {
		if !download && time.Now().Sub(now2) > 2*time.Second {
			xpathout(q, sums, attr, isidx, count, tooMany, now, download, seen, nlines, false)
			now2 = time.Now()
		}
		if tooMany {
			break
		}
		select {
		case <-chClose:
			logerr(errConnectionClosed)
			return
		default:
		}
		if Cfg.Dactx && methode == "dx" {
			dactfile += "x"
		}
		db, err := dbxml.Open(dactfile)
		if err != nil {
			updateError(q, err, !download)
			logerr(err)
			return
		}

		qu, err := db.Prepare(queryparts[0])
		if err != nil {
			updateError(q, err, !download)
			db.Close()
			return
		}
		// Cancel the running query when the client disconnects. The goroutine
		// ends as soon as either chClose or done delivers a value, so every
		// return below must signal done; otherwise the goroutine would linger
		// and could call qu.Cancel after db has been closed.
		done := make(chan bool, 1)
		interrupted := make(chan bool, 1)
		go func() {
			select {
			case <-chClose:
				interrupted <- true
				logerr(errConnectionClosed)
				qu.Cancel()
			case <-done:
			}
		}()

		docs, err := qu.Run()
		if err != nil {
			updateError(q, err, !download)
			done <- true
			db.Close()
			return
		}
		filename := ""
		// seenId holds the node ids already counted for the current document.
		var seenId map[string]bool
	NEXTDOC:
		for docs.Next() {
			if !download && time.Now().Sub(now2) > 2*time.Second {
				xpathout(q, sums, attr, isidx, count, tooMany, now, download, seen, nlines, false)
				now2 = time.Now()
			}

			matches := make([]string, 0)
			if len(queryparts) == 1 {
				matches = append(matches, docs.Match())
				name := docs.Name()
				if name != filename {
					filename = name
					seenId = make(map[string]bool)
				}
			} else {
				// Multi-part query: handle each document once, require every
				// middle part to match, and collect the matches of the last part.
				name := docs.Name()
				if name == filename {
					continue
				}
				filename = name
				seenId = make(map[string]bool)
				doctxt := fmt.Sprintf("[dbxml:metadata('dbxml:name')=%q]", name)
				for i := 1; i < len(queryparts)-1; i++ {
					docs2, err := db.Query(doctxt + queryparts[i])
					if err != nil {
						updateError(q, err, !download)
						logerr(err)
						done <- true
						docs.Close()
						db.Close()
						return
					}
					if !docs2.Next() {
						continue NEXTDOC
					}
					docs2.Close()
				}

				docs2, err := db.Query(doctxt + queryparts[len(queryparts)-1])
				if err != nil {
					updateError(q, err, !download)
					logerr(err)
					done <- true
					docs.Close()
					db.Close()
					return
				}
				for docs2.Next() {
					matches = append(matches, docs2.Match())
				}
			}

			if len(matches) == 0 {
				continue
			}

			alpino := Alpino_ds{}
			err := xml.Unmarshal([]byte(docs.Content()), &alpino)
			if err != nil {
				updateError(q, err, !download)
				logerr(err)
				done <- true
				docs.Close()
				db.Close()
				return
			}

			// mm[i] collects the values of metadata attribute i found in this
			// document. A value that cannot be parsed is recorded with index
			// math.MinInt32 and the parse error as its text.
			var mm [5][]StructIS
			for i := 0; i < 5; i++ {
				mm[i] = make([]StructIS, 0, 4)
			}
			for i := 0; i < 5; i++ {
				if isMeta[i] {
					name := attr[i][1:]
					for _, m := range alpino.Meta {
						if m.Name == name {
							if isInt[i] {
								v, err := strconv.Atoi(m.Value)
								if err == nil {
									vv, idx := iranges[i].value(v)
									mm[i] = append(mm[i], StructIS{idx, vv})
								} else {
									mm[i] = append(mm[i], StructIS{math.MinInt32, err.Error()})
								}
							} else if isFloat[i] {
								v, err := strconv.ParseFloat(m.Value, 64)
								if err == nil {
									vv, idx := franges[i].value(v)
									mm[i] = append(mm[i], StructIS{idx, vv})
								} else {
									mm[i] = append(mm[i], StructIS{math.MinInt32, err.Error()})
								}
							} else if isDate[i] {
								v, err := time.Parse("2006-01-02", m.Value)
								if err == nil {
									vv, idx := dranges[i].value(v)
									mm[i] = append(mm[i], StructIS{idx, vv})
								} else {
									mm[i] = append(mm[i], StructIS{math.MinInt32, err.Error()})
								}
							} else if isDateTime[i] {
								v, err := time.Parse("2006-01-02 15:04", m.Value)
								if err == nil {
									vv, idx := dranges[i].value(v)
									mm[i] = append(mm[i], StructIS{idx, vv})
								} else {
									mm[i] = append(mm[i], StructIS{math.MinInt32, err.Error()})
								}
							} else {
								mm[i] = append(mm[i], StructIS{0, m.Value})
							}
						}
					}
				}
			}
			// Give every slot exactly one placeholder entry when it has no
			// value, so the nested loops below always execute.
			for i := 0; i < 5; i++ {
				if len(mm[i]) == 0 {
					mm[i] = append(mm[i], StructIS{2147483647, ""})
				}
			}

			var at [5]StructIS
			for _, match := range matches {
				alp := Alpino_ds{}
				err = xml.Unmarshal([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<alpino_ds version="1.3">
`+match+`
</alpino_ds>`), &alp)
				if err != nil {
					updateError(q, err, !download)
					logerr(err)
					done <- true
					docs.Close()
					db.Close()
					return
				}
				// Count each node only once per document; when "rel" is one of
				// the requested attributes, the relation is part of the key.
				sid := ""
				if alp.Node0 != nil {
					sid = alp.Node0.Id
					if alp.Node0.OtherId != "" {
						sid = alp.Node0.OtherId
					}
					if wantRel {
						sid = sid + " " + alp.Node0.Rel
					}
				}
				if seenId[sid] {
					continue
				}
				seenId[sid] = true
				// Count every combination of metadata values; slots that hold a
				// node attribute are overwritten with that attribute's value.
				for _, at[0] = range mm[0] {
					for _, at[1] = range mm[1] {
						for _, at[2] = range mm[2] {
							for _, at[3] = range mm[3] {
								for _, at[4] = range mm[4] {
									if nAttr > 0 && attr[0][0] != ':' {
										at[0] = StructIS{0, getFullAttr(attr[0], alp.Node0, alpino.Node0)}
									}
									if nAttr > 1 && attr[1][0] != ':' {
										at[1] = StructIS{0, getFullAttr(attr[1], alp.Node0, alpino.Node0)}
									}
									if nAttr > 2 && attr[2][0] != ':' {
										at[2] = StructIS{0, getFullAttr(attr[2], alp.Node0, alpino.Node0)}
									}
									if nAttr > 3 && attr[3][0] != ':' {
										at[3] = StructIS{0, getFullAttr(attr[3], alp.Node0, alpino.Node0)}
									}
									if nAttr > 4 && attr[4][0] != ':' {
										at[4] = StructIS{0, getFullAttr(attr[4], alp.Node0, alpino.Node0)}
									}
									sums[at]++
									count++
									if len(sums) >= 100000 {
										tooMany = true
										docs.Close()
									}
								}
							}
						}
					}
				}
			}
		}
		if err := docs.Error(); err != nil {
			logerr(err)
		}
		if n, err := db.Size(); err == nil {
			seen += n
		}
		db.Close()
		done <- true
		select {
		case <-interrupted:
			return
		default:
		}
	}

	xpathout(q, sums, attr, isidx, count, tooMany, now, download, 0, 0, true)

	if !download {
		fmt.Fprintln(q.w, "</body>\n</html>")
	}

}
Beispiel #7
0
// xstatsmeta runs the XPath query from request parameter "xpath" against
// every dact file of the selected corpus and, for each metadata field of the
// corpus, counts per metadata value the number of matching items and the
// number of documents containing a match.
//
// With request parameter "d" set, the result is written as plain text.
// Otherwise an HTML page is written whose scripts call back into
// window.parent._fn; while the query is running, updateCount is called
// roughly every two seconds.
//
// A query may consist of several parts separated by "+|+": the first part
// selects documents, the middle parts must each match within the document,
// and the matches of the last part are counted.
//
// Clean-up and error reporting happen in one deferred function: it closes
// whatever docs2, docs and db still refer to, reports errval when set, calls
// completedmeta, and closes the HTML page.
func xstatsmeta(q *Context) {

	var errval error
	var download bool
	var db *dbxml.Db
	var docs, docs2 *dbxml.Docs
	defer func() {
		if docs2 != nil {
			docs2.Close()
		}
		if docs != nil {
			docs.Close()
		}
		if db != nil {
			db.Close()
		}
		if errval != nil {
			updateError(q, errval, !download)
		}
		completedmeta(q, download)
		if !download {
			fmt.Fprintln(q.w, "</body>\n</html>")
		}
	}()

	var rows *sql.Rows

	// now is the start time used for the elapsed-time line at the end; now2
	// is the time of the most recent intermediate count update.
	now := time.Now()
	now2 := time.Now()

	if first(q.r, "d") != "" {
		download = true
	}

	itemselect := first(q.r, "item")

	methode := first(q.r, "mt")
	if methode != "dx" {
		methode = "std"
	}

	if download {
		contentType(q, "text/plain; charset=utf-8")
		q.w.Header().Set("Content-Disposition", "attachment; filename=telling.txt")
		cache(q)
	} else {
		contentType(q, "text/html; charset=utf-8")
		cache(q)
		fmt.Fprint(q.w, `<!DOCTYPE html>
<html>
<head>
<title></title>
<script type="text/javascript"><!--
function setvalue(n) {
    window.parent._fn.setmetaval(n);
}
function setmetavars(idx, lbl, fl, max, ac, bc) {
    window.parent._fn.setmetavars(idx, lbl, fl, max, ac, bc);
}
function setmetalines(idx, a, b) {
    window.parent._fn.setmetalines(idx, a, b);
}
function makemetatable(idx) {
    window.parent._fn.makemetatable(idx);
}
function f(s) {
    window.parent._fn.updatemeta(s);
}
function f1(s) {
    window.parent._fn.updatemetatop(s);
}
function c(i, j) {
    window.parent._fn.countmeta(i, j);
}
//--></script>
</head>
<body>
<script type="text/javascript">
window.parent._fn.startedmeta();
c("0", "0");
</script>
`)
		if ff, ok := q.w.(http.Flusher); ok {
			ff.Flush()
		}
	}

	prefix := getprefix(q)
	if !q.prefixes[prefix] {
		if download {
			// Fprint, not Fprintf: prefix comes from the request and must not
			// be interpreted as a format string.
			fmt.Fprint(q.w, "Invalid corpus: "+prefix)
		} else {
			updateText(q, "Invalid corpus: "+html.EscapeString(prefix))
		}
		return
	}

	query := first(q.r, "xpath")

	if query == "" {
		if download {
			fmt.Fprintln(q.w, "Query ontbreekt")
		} else {
			updateText(q, "Query ontbreekt")
		}
		return
	}

	// Expand macros: every match of macroKY is replaced by the rule stored
	// under the match with its first and last character stripped.
	if strings.Contains(query, "%") {
		rules := getMacrosRules(q)
		query = macroKY.ReplaceAllStringFunc(query, func(s string) string {
			return rules[s[1:len(s)-1]]
		})
	}

	var owner string
	var nlines uint64
	rows, errval = q.db.Query(fmt.Sprintf("SELECT `owner`,`nline` FROM `%s_info` WHERE `id` = %q", Cfg.Prefix, prefix))
	if logerr(errval) {
		return
	}
	for rows.Next() {
		errval = rows.Scan(&owner, &nlines)
		if logerr(errval) {
			rows.Close()
			return
		}
	}
	errval = rows.Err()
	if logerr(errval) {
		return
	}

	// A corpus whose owner contains '@' has a single data.dact file in its
	// data directory; any other corpus lists its dact files in the arch table.
	dactfiles := make([]string, 0)
	if strings.Contains(owner, "@") {
		dactfiles = append(dactfiles, filepath.Join(paqudir, "data", prefix, "data.dact"))
	} else {
		rows, errval = q.db.Query(fmt.Sprintf("SELECT `arch` FROM `%s_c_%s_arch` ORDER BY `id`", Cfg.Prefix, prefix))
		if logerr(errval) {
			return
		}
		for rows.Next() {
			var s string
			errval = rows.Scan(&s)
			if logerr(errval) {
				rows.Close()
				return
			}
			if strings.HasSuffix(s, ".dact") {
				dactfiles = append(dactfiles, s)
			}
		}
		errval = rows.Err()
		if logerr(errval) {
			return
		}
	}

	if len(dactfiles) == 0 {
		if download {
			fmt.Fprintln(q.w, "Er zijn geen dact-bestanden voor dit corpus")
		} else {
			updateText(q, "Er zijn geen dact-bestanden voor dit corpus")
		}
		return
	}

	if !q.hasmeta[prefix] {
		if download {
			fmt.Fprintln(q.w, "Geen metadata voor dit corpus")
		} else {
			updateText(q, "Geen metadata voor dit corpus")
		}
		return
	}

	// Per metadata field: its type (metat), its id (metai), and the data
	// needed to map a raw value to a display value and sort index
	// (tranges for TEXT, iranges/franges/dranges for the other types).
	metas := getMeta(q, prefix)
	metat := make(map[string]string)
	metai := make(map[string]int)
	tranges := make(map[string]map[string]int)
	dranges := make(map[string]*drange)
	franges := make(map[string]*frange)
	iranges := make(map[string]*irange)
	for _, m := range metas {
		metat[m.name] = m.mtype
		metai[m.name] = m.id
		if m.mtype == "TEXT" {
			tranges[m.name] = make(map[string]int)
			rows, errval = q.db.Query(fmt.Sprintf(
				"SELECT `idx`,`text` FROM `%s_c_%s_mval` WHERE `id` = %d",
				Cfg.Prefix, prefix, m.id))
			if logerr(errval) {
				return
			}
			for rows.Next() {
				var t string
				var i int
				errval = rows.Scan(&i, &t)
				if logerr(errval) {
					rows.Close()
					return
				}
				tranges[m.name][t] = i
			}
			errval = rows.Err()
			if logerr(errval) {
				return
			}
			continue
		}
		var indexed bool
		var size, dtype, imin, istep int
		var dmin, dmax time.Time
		var fmin, fstep float64
		row := q.db.QueryRow(fmt.Sprintf(
			"SELECT `indexed`, `size`, `dmin`, `dmax`, `dtype`, `fmin`, `fstep`, `imin`, `istep` FROM `%s_c_%s_minf` WHERE `id` = %d",
			Cfg.Prefix, prefix, m.id))
		errval = row.Scan(&indexed, &size, &dmin, &dmax, &dtype, &fmin, &fstep, &imin, &istep)
		if logerr(errval) {
			return
		}
		switch m.mtype {
		case "INT":
			iranges[m.name] = oldIrange(imin, istep, size, indexed)
		case "FLOAT":
			franges[m.name] = oldFrange(fmin, fstep, size)
		case "DATE", "DATETIME":
			dranges[m.name] = oldDrange(dmin, dmax, dtype, indexed)
		}
	} // for _, m := range metas

	queryparts := strings.Split(query, "+|+")

	// telling[field][value] = {sort index, number of items, number of documents}
	telling := make(map[string]map[string][3]int)
	for _, m := range metas {
		telling[m.name] = make(map[string][3]int)
	}

	// seen records which documents (dact file + document name) had a match.
	seen := make(map[string]bool)

	// chClose signals that the client went away. Without CloseNotifier
	// support a channel is used that never delivers a value.
	var chClose <-chan bool
	if f, ok := q.w.(http.CloseNotifier); ok {
		chClose = f.CloseNotify()
	} else {
		chClose = make(<-chan bool)
	}

	counter := 0
	for _, dactfile := range dactfiles {
		if !download && time.Now().Sub(now2) > 2*time.Second {
			updateCount(q, counter, len(seen))
			now2 = time.Now()
		}
		select {
		case <-chClose:
			logerr(errConnectionClosed)
			return
		default:
		}
		if Cfg.Dactx && methode == "dx" {
			dactfile += "x"
		}
		db, errval = dbxml.Open(dactfile)
		if logerr(errval) {
			return
		}

		var qu *dbxml.Query
		qu, errval = db.Prepare(queryparts[0])
		if logerr(errval) {
			return
		}
		// Cancel the running query when the client disconnects. The goroutine
		// ends as soon as either chClose or done delivers a value, so every
		// return below must signal done; otherwise the goroutine would linger
		// and could call qu.Cancel after db has been closed.
		done := make(chan bool, 1)
		interrupted := make(chan bool, 1)
		go func() {
			select {
			case <-chClose:
				interrupted <- true
				logerr(errConnectionClosed)
				qu.Cancel()
			case <-done:
			}
		}()

		docs, errval = qu.Run()
		if logerr(errval) {
			done <- true
			return
		}
		filename := ""
		// seenId holds the node ids already counted for the current document.
		var seenId map[string]bool
	NEXTDOC:
		for docs.Next() {
			if !download && time.Now().Sub(now2) > 2*time.Second {
				updateCount(q, counter, len(seen))
				now2 = time.Now()
			}
			matches := 0
			if len(queryparts) == 1 {
				name := docs.Name()
				if name != filename {
					filename = name
					seenId = make(map[string]bool)
				}
				alp := Alpino_ds{}
				errval = xml.Unmarshal([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<alpino_ds version="1.3">
`+docs.Match()+`
</alpino_ds>`), &alp)
				if logerr(errval) {
					done <- true
					return
				}
				sid := ""
				if alp.Node0 != nil {
					sid = alp.Node0.Id
					if alp.Node0.OtherId != "" {
						sid = alp.Node0.OtherId
					}
				}
				if !seenId[sid] {
					matches = 1
					seenId[sid] = true
				}
			} else {
				// Multi-part query: handle each document once, require every
				// middle part to match, and count the matches of the last part.
				name := docs.Name()
				if name == filename {
					continue
				}
				filename = name
				seenId = make(map[string]bool)
				doctxt := fmt.Sprintf("[dbxml:metadata('dbxml:name')=%q]", filename)
				for i := 1; i < len(queryparts)-1; i++ {
					docs2, errval = db.Query(doctxt + queryparts[i])
					if logerr(errval) {
						done <- true
						return
					}
					if !docs2.Next() {
						docs2 = nil
						continue NEXTDOC
					}
					docs2.Close()
					docs2 = nil
				}

				docs2, errval = db.Query(doctxt + queryparts[len(queryparts)-1])
				if logerr(errval) {
					done <- true
					return
				}
				for docs2.Next() {
					alp := Alpino_ds{}
					errval = xml.Unmarshal([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<alpino_ds version="1.3">
`+docs2.Match()+`
</alpino_ds>`), &alp)
					if logerr(errval) {
						done <- true
						return
					}
					sid := ""
					if alp.Node0 != nil {
						sid = alp.Node0.Id
						if alp.Node0.OtherId != "" {
							sid = alp.Node0.OtherId
						}
					}
					if !seenId[sid] {
						seenId[sid] = true
						matches++
					}
				}
				docs2 = nil
			}

			if matches == 0 {
				continue
			}

			counter += matches

			alpino := Alpino_ds_meta{}
			errval = xml.Unmarshal([]byte(docs.Content()), &alpino)
			if logerr(errval) {
				done <- true
				return
			}

			// Collect the document's metadata values per field; a field
			// without any value gets a single empty value.
			values := make(map[string][]string)
			for _, m := range metas {
				values[m.name] = make([]string, 0)
			}
			for _, m := range alpino.Meta {
				values[m.Name] = append(values[m.Name], m.Value)
			}
			for _, m := range metas {
				if len(values[m.name]) == 0 {
					values[m.name] = append(values[m.name], "")
				}
			}
			// c is 1 the first time this document contributes, 0 afterwards,
			// so the per-document count goes up only once per document.
			f := dactfile + "\t" + docs.Name()
			c := 0
			if !seen[f] {
				seen[f] = true
				c = 1
			}
			for name := range values {
				for _, value := range values[name] {
					var idx int
					if value == "" {
						idx = 2147483647
					} else {
						switch metat[name] {
						case "TEXT":
							idx = tranges[name][value]
						case "INT":
							v, _ := strconv.Atoi(value)
							value, idx = iranges[name].value(v)
						case "FLOAT":
							v, _ := strconv.ParseFloat(value, 32) // 32 is the same precision as used by MySQL
							value, idx = franges[name].value(v)
						case "DATE":
							v, _ := time.Parse("2006-01-02", value)
							value, idx = dranges[name].value(v)
						case "DATETIME":
							v, _ := time.Parse("2006-01-02 15:04", value)
							value, idx = dranges[name].value(v)
						}
					}
					telling[name][value] = [3]int{idx, telling[name][value][1] + matches, telling[name][value][2] + c}
				}
			}

		} // for docs.Next()
		errval = docs.Error()
		docs = nil
		if logerr(errval) {
			done <- true
			return
		}
		db.Close()
		db = nil
		done <- true
		select {
		case <-interrupted:
			return
		default:
		}
	} // for _, dactfile := range dactfiles
	if !download {
		updateCount(q, counter, len(seen))
	}

	var buf bytes.Buffer

	// pow10 is the power of ten nearest (on a log scale) to the corpus line
	// count, but at least 10; the per-document counts are scaled by it.
	pow10 := math.Pow10(int(math.Log10(float64(q.lines[prefix])) + .5))
	if pow10 < 10 {
		pow10 = 10
	}

	if !download {
		fmt.Fprintf(q.w, `<script type="text/javascript">
setvalue(%d);
</script>
`, int(pow10))
	} else {
		fmt.Fprintf(q.w, "# items: %d\n# zinnen: %d\n# n = %d\n", counter, len(seen), int(pow10))
	}

	for number, meta := range metas {
		items := make([]*MetaItem, 0, len(telling[meta.name]))
		for name := range telling[meta.name] {
			items = append(items, &MetaItem{
				text:  name,
				idx:   telling[meta.name][name][0],
				count: [2]int{telling[meta.name][name][1], telling[meta.name][name][2]},
			})
		}
		// Load all stored values of this field: nn maps a value index to its
		// stored count n, values keeps index and text in index order.
		rows, errval = q.db.Query(fmt.Sprintf(
			"SELECT `idx`, `text`, `n` FROM `%s_c_%s_mval` WHERE `id`=%d ORDER BY `idx`",
			Cfg.Prefix, prefix, metai[meta.name]))
		if logerr(errval) {
			return
		}
		nn := make(map[int]int)
		values := make([]StructIS, 0)
		for rows.Next() {
			var idx, n int
			var txt string
			errval = rows.Scan(&idx, &txt, &n)
			if logerr(errval) {
				rows.Close()
				return
			}
			nn[idx] = n
			values = append(values, StructIS{idx, txt})
		}
		errval = rows.Err()
		if logerr(errval) {
			return
		}

		if !download {
			var hide string
			if itemselect != meta.name {
				hide = " hide"
			}
			// The element id is built from the field name written as four hex
			// digits per character.
			var hex string
			for _, c := range meta.name {
				hex += fmt.Sprintf("%04x", uint16(c))
			}
			fmt.Fprintf(&buf, `
<div class="metasub%s" id="meta%s">
<p>
<b>%s</b> &mdash; <a href="javascript:void(0)" onclick="javascript:metahelp()">toelichting bij tabel</a>
<p>
<table>
  <tr>
   <td>per item:
     <table class="right" id="meta%da">
     </table>
   <td class="next">per zin:
     <table class="right" id="meta%db">
     </table>
</table>
</div>
`, hide, hex, html.EscapeString(meta.name), number, number)
			updateText(q, buf.String())
			buf.Reset()

			fl := "right"
			max := 99999
			ac := 1
			bc := 2
			if meta.mtype == "TEXT" {
				fl = "left"
				max = METAMAX
				ac = 0
				bc = 0
			}
			fmt.Fprintf(q.w, `<script type="text/javascript">
setmetavars(%d,"%s","%s",%d,%d,%d);
setmetalines(%d`, number, meta.value, fl, max, ac, bc, number)
		}

		if metat[meta.name] != "TEXT" {
			sort.Sort(MetaItems(items))
		}
		// run 0 produces the per-item table, run 1 the per-document table.
		for run := 0; run < 2; run++ {
			if !download {
				fmt.Fprint(q.w, ",[")
			}
			if metat[meta.name] == "TEXT" {
				if run == 0 {
					sort.Sort(MetaItems0(items))
				} else {
					sort.Sort(MetaItems1(items))
				}
			}

			lines := make([]Statline, 0)

			if download {
				if run == 0 {
					fmt.Fprintln(q.w, "# "+meta.name+" per item\t")
				} else {
					fmt.Fprintln(q.w, "# "+meta.name+" per zin\t")
				}
			}
			select {
			case <-chClose:
				logerr(errConnectionClosed)
				return
			default:
			}

			// This seen shadows the document map above: here it maps a value
			// index to its line.
			seen := make(map[int]*Statline)
			for _, item := range items {
				lines = append(lines, Statline{item.text, item.count[run], nn[item.idx], item.idx})
				seen[item.idx] = &lines[len(lines)-1]
			}
			if download || (meta.mtype != "TEXT" && len(seen)*NEEDALL > len(values)) {
				// add missing values (count==0)
				if meta.mtype == "TEXT" {
					for _, v := range values {
						if _, ok := seen[v.i]; !ok {
							lines = append(lines, Statline{v.s, 0, 1, v.i})
						}
					}
				} else {
					lines2 := make([]Statline, len(values))
					for i, v := range values {
						if s, ok := seen[v.i]; ok {
							lines2[i] = *s
						} else {
							lines2[i] = Statline{v.s, 0, 1, v.i}
						}
					}
					lines = lines2
				}
			}
			p := "\n"
			for _, line := range lines {
				if download {
					if run == 1 {
						v := int(.5 + pow10*float64(line.i)/float64(line.n))
						fmt.Fprintf(q.w, "%d\t%d\t%s\n", line.i, v, line.s)
					} else {
						fmt.Fprintf(q.w, "%d\t%s\n", line.i, line.s)
					}
				} else {
					fmt.Fprintf(q.w, "%s[%d,", p, line.i)
					p = ",\n"
					if run == 1 {
						v := int(.5 + pow10*float64(line.i)/float64(line.n))
						fmt.Fprintf(q.w, "%d,", v)
					}
					fmt.Fprintf(q.w, "%d,\"%s\"]", line.idx, line.s)
				}
			} // for _, line := range lines
			if !download {
				fmt.Fprintln(q.w, "]")
			}
		} // for run := 0; run < 2; run++

		if !download {
			fmt.Fprintf(q.w, `);
			   makemetatable(%d);
			   //--></script>
			   `, number)
		}
	} // for number, meta := range metas

	if !download {
		fmt.Fprintf(&buf,
			"<hr>tijd: %s\n<p>\n<a href=\"xstatsmeta?%s&amp;d=1\">download</a>\n",
			tijd(time.Now().Sub(now)),
			strings.Replace(q.r.URL.RawQuery, "&", "&amp;", -1))
		updateText(q, buf.String())
		buf.Reset()
	}

}
Beispiel #8
0
// TAB: xpath
func xpath(q *Context) {

	prefix := getprefix(q)
	if !q.prefixes[prefix] {
		http.Error(q.w, "Invalid corpus: "+prefix, http.StatusPreconditionFailed)
		return
	}

	xpathmax := getxpathmax(q)

	methode := first(q.r, "mt")
	if methode != "dx" {
		methode = "std"
	}

	var errval error
	var db *dbxml.Db
	var docs *dbxml.Docs
	var loading bool
	defer func() {
		if docs != nil {
			docs.Close()
		}
		if db != nil {
			db.Close()
		}
		if loading {
			clearLoading(q.w)
		}
		if errval != nil {
			fmt.Fprintf(q.w, "<div class=\"error\">FOUT: %s</div>\n", html.EscapeString(errval.Error()))
		}
		html_footer(q)
	}()

	// HTML-uitvoer van begin van de pagina
	writeHead(q, "", 2)
	html_xpath_header(q)

	// HTML-uitvoer van het formulier
	// Returnwaarde is true als er een query was gedefinieerd
	has_query := html_xpath_form(q, xpathmax)

	// Als er geen query is gedefinieerd, HTML-uitvoer van korte helptekst, pagina-einde, en exit
	if !has_query {
		html_xpath_uitleg(q)
		return
	}

	var chClose <-chan bool
	if f, ok := q.w.(http.CloseNotifier); ok {
		chClose = f.CloseNotify()
	} else {
		chClose = make(<-chan bool)
	}

	_, errval = q.db.Exec(fmt.Sprintf("UPDATE `%s_info` SET `active` = NOW() WHERE `id` = %q", Cfg.Prefix, prefix))
	if logerr(errval) {
		return
	}

	offset := 0
	o, err := strconv.Atoi(first(q.r, "offset"))
	if err == nil {
		offset = o
	}
	if offset < 0 {
		offset = 0
	}

	fmt.Fprintln(q.w, "<hr>")

	now := time.Now()

	var owner string
	var nlines uint64
	var rows *sql.Rows
	rows, errval = q.db.Query(fmt.Sprintf("SELECT `owner`,`nline` FROM `%s_info` WHERE `id` = %q", Cfg.Prefix, prefix))
	if logerr(errval) {
		return
	}
	for rows.Next() {
		errval = rows.Scan(&owner, &nlines)
		if logerr(errval) {
			rows.Close()
			return
		}
	}
	errval = rows.Err()
	if logerr(errval) {
		return
	}

	dactfiles := make([]string, 0)
	global := false
	if strings.Contains(owner, "@") {
		dactfiles = append(dactfiles, filepath.Join(paqudir, "data", prefix, "data.dact"))
	} else {
		global = true
		rows, errval = q.db.Query(fmt.Sprintf("SELECT `arch` FROM `%s_c_%s_arch` ORDER BY `id`", Cfg.Prefix, prefix))
		if logerr(errval) {
			return
		}
		for rows.Next() {
			var s string
			errval = rows.Scan(&s)
			if logerr(errval) {
				rows.Close()
				return
			}
			if strings.HasSuffix(s, ".dact") {
				dactfiles = append(dactfiles, s)
			}
		}
		errval = rows.Err()
		if logerr(errval) {
			return
		}
	}

	if len(dactfiles) == 0 {
		fmt.Fprintln(q.w, "Er zijn geen dact-bestanden voor dit corpus")
		return
	}

	fmt.Fprintf(q.w, "<ol start=\"%d\" id=\"ol\" class=\"xpath\">\n</ol>\n", offset+1)

	fmt.Fprintln(q.w, "<div id=\"loading\"><img src=\"busy.gif\" alt=\"[bezig]\"> <span></span></div>")
	if ff, ok := q.w.(http.Flusher); ok {
		ff.Flush()
	}
	loading = true

	found := false
	curno := 0
	filename := ""
	curdac := ""
	xmlall := ""
	xmlparts := make([]string, 0)
	query := first(q.r, "xpath")
	fullquery := query
	if strings.Contains(query, "%") {
		rules := getMacrosRules(q)
		fullquery = macroKY.ReplaceAllStringFunc(query, func(s string) string {
			return rules[s[1:len(s)-1]]
		})
	}

	queryparts := strings.Split(fullquery, "+|+")

	var seen uint64
	for i, dactfile := range dactfiles {
		select {
		case <-chClose:
			logerr(errConnectionClosed)
			return
		default:
		}

		if Cfg.Dactx && methode == "dx" {
			if i == 0 {
				if _, err := os.Stat(dactfile + "x"); err != nil {
					methode = "std"
					fmt.Fprintln(q.w, `<script type="text/javascript"><!--
$('#ol').before('<div class="warning">Geen ge&euml;xpandeerde indexnodes beschikbaar voor dit corpus.<br>De standaardmethode wordt gebruikt.</div>');
//--></script>`)
				}
			}
			if methode == "dx" {
				dactfile += "x"
			}
		}

		if curno > offset+xpathmax {
			break
		}

		if seen > 0 {
			fmt.Fprintf(q.w, `<script type="text/javascript"><!--
$('#loading span').html('%.1f%%');
//--></script>
`, float64(seen)*100/float64(nlines))
			if ff, ok := q.w.(http.Flusher); ok {
				ff.Flush()
			}
		}

		errval = bugtest(dactfile, queryparts[0])
		if errval != nil {
			return
		}

		db, errval = dbxml.Open(dactfile)
		if logerr(errval) {
			return
		}

		var qu *dbxml.Query
		qu, errval = db.Prepare(queryparts[0])
		if logerr(errval) {
			return
		}
		done := make(chan bool, 1)
		interrupted := make(chan bool, 1)
		go func() {
			select {
			case <-chClose:
				interrupted <- true
				logerr(errConnectionClosed)
				qu.Cancel()
			case <-done:
			}
		}()

		docs, errval = qu.Run()
		if logerr(errval) {
			done <- true
			return
		}
		filename = ""
	NEXTDOC:
		for docs.Next() {
			name := docs.Name()
			newdoc := false
			if name != filename {
				if found && curno > offset && curno <= offset+xpathmax {
					found = false
					xpath_result(q, curno, curdac, filename, xmlall, xmlparts, prefix, global)
					xmlparts = xmlparts[0:0]
				}
				if len(queryparts) == 1 {
					curno++
					if curno > offset+xpathmax {
						docs.Close()
						continue
					}
				}
				curdac = dactfile
				filename = name
				newdoc = true
			}
			if len(queryparts) == 1 {
				found = true
				if curno > offset+xpathmax {
					docs.Close()
				} else {
					if curno > offset && curno <= offset+xpathmax {
						xmlall = docs.Content()
						xmlparts = append(xmlparts, docs.Match())
					}
				}
			} else if newdoc {
				newdoc = false
				doctxt := fmt.Sprintf("[dbxml:metadata('dbxml:name')=%q]", name)
				var docs2 *dbxml.Docs
				for i := 1; i < len(queryparts)-1; i++ {
					docs2, errval = db.Query(doctxt + queryparts[i])
					if logerr(errval) {
						done <- true
						return
					}
					if !docs2.Next() {
						continue NEXTDOC
					}
					docs2.Close()
				}
				docs2, errval = db.Query(doctxt + queryparts[len(queryparts)-1])
				if logerr(errval) {
					done <- true
					return
				}
				found = false
				for docs2.Next() {
					if !found {
						found = true
						curno++
						if curno > offset+xpathmax {
							docs.Close()
						}
					}
					if curno > offset && curno <= offset+xpathmax {
						xmlall = docs2.Content()
						xmlparts = append(xmlparts, docs2.Match())
					} else {
						docs2.Close()
					}
				}
			}
		} // for docs.Next()
		errval = docs.Error()
		docs = nil
		if logerr(errval) {
			done <- true
			return
		}

		if len(dactfiles) > 1 {
			if n, err := db.Size(); err == nil {
				seen += n
			}
		}
		db.Close()
		db = nil
		done <- true
		select {
		case <-interrupted:
			return
		default:
		}

		if found && curno > offset && curno <= offset+xpathmax {
			found = false
			xpath_result(q, curno, curdac, filename, xmlall, xmlparts, prefix, global)
			xmlparts = xmlparts[0:0]
		}
		if curno > offset+xpathmax {
			break
		}
	} // for _, dactfile := range dactfiles

	clearLoading(q.w)
	loading = false

	if curno == 0 {
		fmt.Fprintf(q.w, "geen match gevonden")
	}

	// Links naar volgende en vorige pagina's met resultaten
	qs := "xpath=" + urlencode(query) + "&amp;mt=" + methode
	if offset > 0 || curno > offset+xpathmax {
		if offset > 0 {
			fmt.Fprintf(q.w, "<a href=\"xpath?%s&amp;offset=%d\">vorige</a>", qs, offset-xpathmax)
		} else {
			fmt.Fprint(q.w, "vorige")
		}
		fmt.Fprint(q.w, " | ")
		if curno > offset+xpathmax {
			fmt.Fprintf(q.w, "<a href=\"xpath?%s&amp;offset=%d\">volgende</a>", qs, offset+xpathmax)
		} else {
			fmt.Fprint(q.w, "volgende")
		}
	}

	if q.auth && curno > 0 {
		fmt.Fprintf(q.w, `<p>
<form action="xsavez" method="POST" accept-charset="UTF-8" enctype="multipart/form-data">
<input type="hidden" name="xpath" value="%s">
<input type="hidden" name="db" value="%s">
<input type="hidden" name="mt" value="%s">
<input type="submit" value="nieuw corpus maken op basis van deze zoekopdracht">
</form>
`,
			html.EscapeString(first(q.r, "xpath")),
			html.EscapeString(prefix),
			methode)
	}

	fmt.Fprintln(q.w, "<hr><small>tijd:", tijd(time.Now().Sub(now)), "</small>")

	if curno == 0 {
		return
	}

	fmt.Fprintln(q.w, "<hr>")

	var metas []MetaType
	if q.hasmeta[prefix] {
		metas = getMeta(q, prefix)
	}

	// Links naar statistieken
	fmt.Fprintf(q.w, `
        <p>
		<div id="xstats">
		<form action="javascript:$.fn.xpathstats()" name="xstatsform">
		<input type="hidden" name="xpath" value="%s">
		<input type="hidden" name="db" value="%s">
		<input type="hidden" name="mt" value="%s">
		Selecteer &eacute;&eacute;n tot vijf attributen:
        <p>
`, html.EscapeString(query), html.EscapeString(prefix), methode)

	for i := 1; i <= 5; i++ {

		fmt.Fprintf(q.w, "<select name=\"attr%d\">\n<option value=\"\">--</option>\n", i)
		if q.hasmeta[prefix] {
			fmt.Fprintln(q.w, "<optgroup label=\"&mdash; metadata &mdash;\">")
			for _, m := range metas {
				fmt.Fprintf(q.w, "<option value=\":%s\">%s</option>\n", html.EscapeString(m.name), html.EscapeString(m.name))
			}
			fmt.Fprintln(q.w, "</optgroup>")
			fmt.Fprintln(q.w, "<optgroup label=\"&mdash; attributen &mdash;\">")
		}
		for _, s := range NodeTags {
			fmt.Fprintf(q.w, "<option>%s</option>\n", s)
		}
		if q.hasmeta[prefix] {
			fmt.Fprintln(q.w, "</optgroup>")
		}
		fmt.Fprintln(q.w, "</select>")
	}

	fmt.Fprint(q.w, `
		<p>
		<input type="submit" value="doe telling">
		</form>
		<p>
        <iframe src="leeg.html" id="xframe" class="hide"></iframe>
        <div id="result" class="hide"></div>
`)
	if q.hasmeta[prefix] {
		metahelp(q)
		fmt.Fprintln(q.w, `<p>
            <div id="statsmeta" class="hide">
            <div id="innermetatop"></div>
            <div id="metacount" class="hide">
            <table>
            <tr><td>items:<td class="right" id="metacount1">
            <tr><td>zinnen:<td class="right" id="metacount2">
            </table>
            </div>
            <div id="innermeta"></div>
            <img src="busy.gif" id="busymeta" class="hide" alt="aan het werk..." style="margin-top:1em">
            </div>`)
	}
	fmt.Fprintln(q.w, "</div>")

}
Beispiel #9
0
// xsavez2 creates a new corpus from the documents that match an XPath query
// in one or more existing corpora.
//
// Form values read from q.form:
//   - "mt":     search method; anything other than "dx" is treated as "std"
//   - "db":     one or more corpus prefixes to search
//   - "xpath":  the query; parts separated by "+|+" are applied in sequence,
//     each following part being evaluated per matching document
//   - "title":  title of the new corpus
//   - "maxdup": maximum number of documents to copy (clamped to Cfg.Maxdup)
//
// Every matching document is written as an entry of a zip file named "data"
// in the directory of the new corpus, after which newCorpus is called.
// Errors are reported through hErr / writeHtml / http.Error; on any failure
// the deferred cleanup removes the partially built corpus again.
func xsavez2(q *Context) {

	// NOTE: gz and fpgz are never assigned in this function; they only take
	// part in the deferred cleanup below.
	var fpz, fpgz *os.File
	var z *zip.Writer
	var gz *gzip.Reader
	var dact *dbxml.Db
	var docs *dbxml.Docs
	var dirname, fulldirname string
	var okall bool

	// Release whatever is still open, and undo the new corpus unless the
	// function ran to completion (okall).
	defer func() {
		if z != nil {
			z.Close()
		}
		if fpz != nil {
			fpz.Close()
		}
		if gz != nil {
			gz.Close()
		}
		if fpgz != nil {
			fpgz.Close()
		}
		if docs != nil {
			docs.Close()
		}
		if dact != nil {
			dact.Close()
		}
		if !okall {
			os.RemoveAll(fulldirname)
			// Only touch the database when a corpus id was actually handed
			// out; early returns before that point have nothing to delete.
			if dirname != "" {
				q.db.Exec(fmt.Sprintf("DELETE FROM `%s_info` WHERE `id` = %q", Cfg.Prefix, dirname))
			}
		}
	}()

	protected := 0

	if !q.auth {
		http.Error(q.w, "Je bent niet ingelogd", http.StatusUnauthorized)
		return
	}

	methode := firstf(q.form, "mt")
	if methode != "dx" {
		methode = "std"
	}

	// Collect the non-blank corpus names and check access to each of them.
	corpora := make([]string, 0, len(q.form.Value["db"]))
	for _, c := range q.form.Value["db"] {
		if s := strings.TrimSpace(c); s != "" {
			corpora = append(corpora, s)
		}
	}
	for _, corpus := range corpora {
		if !q.prefixes[corpus] {
			http.Error(q.w, "Geen toegang tot corpus", http.StatusUnauthorized)
			return
		}
		// The new corpus is marked protected as soon as one source corpus is
		// protected or is not one of the user's own.
		if q.protected[corpus] || !q.myprefixes[corpus] {
			protected = 1
		}
	}

	if len(corpora) == 0 {
		writeHtml(q, "Fout", "Geen corpora gekozen")
		return
	}

	xpath := firstf(q.form, "xpath")
	if xpath == "" {
		writeHtml(q, "Fout", "Zoekterm ontbreekt")
		return
	}

	title := maxtitlelen(firstf(q.form, "title"))
	if title == "" {
		writeHtml(q, "Fout", "Titel ontbreekt")
		return
	}

	// A missing or unparsable "maxdup" yields 0 and falls back to the
	// configured maximum.
	maxdup, _ := strconv.Atoi(firstf(q.form, "maxdup"))
	if maxdup < 1 || maxdup > Cfg.Maxdup {
		maxdup = Cfg.Maxdup
	}

	// dirname and fulldirname are the variables declared at the top (same
	// scope), so the deferred cleanup sees the values assigned here.
	dirname, fulldirname, ok := beginNewCorpus(q, q.db, title, hErr)
	if !ok {
		return
	}

	fpz, err := os.Create(fulldirname + "/data")
	if hErr(q, err) {
		fpz = nil
		return
	}
	z = zip.NewWriter(fpz)

	linecount := 0
	for _, prefix := range corpora {
		if linecount == maxdup && maxdup > 0 {
			break
		}

		global, ok := isGlobal(q, prefix)
		if !ok {
			return
		}
		pathlen, ok := getPathLen(q, prefix, global, true)
		if !ok {
			return
		}

		// A non-global corpus has a single data.dact; a global corpus lists
		// its archives in the database.
		dactfiles := make([]string, 0)
		if !global {
			dactfiles = append(dactfiles, fmt.Sprintf("%s/data/%s/data.dact", paqudir, prefix))
		} else {
			rows, err := q.db.Query(fmt.Sprintf("SELECT `arch` FROM `%s_c_%s_arch` ORDER BY `id`", Cfg.Prefix, prefix))
			if hErr(q, err) {
				return
			}
			for rows.Next() {
				var s string
				if hErr(q, rows.Scan(&s)) {
					rows.Close()
					return
				}
				if strings.HasSuffix(s, ".dact") {
					dactfiles = append(dactfiles, s)
				}
			}
			if hErr(q, rows.Err()) {
				return
			}
		}

		// Expand macros in the query, then split it into its "+|+" parts.
		fullquery := xpath
		if strings.Contains(xpath, "%") {
			rules := getMacrosRules(q)
			fullquery = macroKY.ReplaceAllStringFunc(xpath, func(s string) string {
				return rules[s[1:len(s)-1]]
			})
		}
		queryparts := strings.Split(fullquery, "+|+")

		for _, dactfile := range dactfiles {
			if linecount == maxdup && maxdup > 0 {
				break
			}
			// Keep the original ".dact" name for labelling zip entries: once
			// "x" is appended, cutting five characters off the end would
			// leave a stray "." in the entry name.
			archname := dactfile
			if Cfg.Dactx && methode == "dx" {
				dactfile += "x"
			}
			var data []byte
			dact, err = dbxml.Open(dactfile)
			if hErr(q, err) {
				dact = nil
				return
			}

			qu, err := dact.Prepare(queryparts[0])
			if hErr(q, err) {
				return
			}
			docs, err = qu.Run()
			if hErr(q, err) {
				docs = nil
				return
			}
			// The first query part can match a document more than once;
			// each document is copied only once.
			seen := make(map[string]bool)
		NEXTDOC:
			for docs.Next() {
				if linecount == maxdup && maxdup > 0 {
					// Leaving the loop early: close the iterator explicitly,
					// because docs is set to nil below and would otherwise
					// never be closed.
					docs.Close()
					break
				}
				filename := docs.Name()
				if seen[filename] {
					continue
				}
				seen[filename] = true
				found := false
				if len(queryparts) == 1 {
					found = true
					data = []byte(docs.Content())
				} else {
					// Restrict the remaining query parts to this document.
					doctxt := fmt.Sprintf("[dbxml:metadata('dbxml:name')=%q]", filename)
					// All intermediate parts must match at least once.
					for i := 1; i < len(queryparts)-1; i++ {
						docs2, err := dact.Query(doctxt + queryparts[i])
						if hErr(q, err) {
							return
						}
						if !docs2.Next() {
							continue NEXTDOC
						}
						docs2.Close()
					}
					// The final part decides whether the document is kept.
					docs2, err := dact.Query(doctxt + queryparts[len(queryparts)-1])
					if hErr(q, err) {
						return
					}
					found = false
					if docs2.Next() {
						found = true
						data = []byte(docs2.Content())
						docs2.Close()
					}

				}
				if !found {
					continue
				}

				// Name of the entry in the zip file: for a global corpus the
				// archive name (without common path prefix and ".dact") is
				// prepended; with several source corpora the corpus prefix
				// becomes a directory.
				newfile := filename
				if global {
					newfile = archname[pathlen:len(archname)-5] + "::" + filename
				}
				if len(corpora) > 1 {
					newfile = prefix + "/" + newfile
					data = xmlSetSource(data, prefix)
				}
				f, err := z.Create(newfile)
				if hErr(q, err) {
					return
				}
				if methode == "dx" {
					data, err = unexpandDact(data)
					if hErr(q, err) {
						return
					}
				}
				_, err = f.Write(data)
				if hErr(q, err) {
					return
				}
				linecount++
			} // for docs.Next()
			err = docs.Error()
			docs = nil
			if hErr(q, err) {
				return
			}
			dact.Close()
			dact = nil
		} // for range dactfiles
	} // for range corpora

	err = z.Close()
	z = nil
	if hErr(q, err) {
		return
	}
	// A failed close can mean the zip file was not written completely.
	err = fpz.Close()
	fpz = nil
	if hErr(q, err) {
		return
	}

	s := "xmlzip-d"
	if protected != 0 {
		s = "xmlzip-p"
	}
	newCorpus(q, q.db, dirname, title, s, protected, hErr, true)
	okall = true
}
Beispiel #10
0
// unpackDact extracts every document of the dact file at path dact into
// separate XML files below xmldir.
//
// For each document it:
//   - writes the XML to xmldir/NNNN/<encoded name>.xml, starting a new
//     four-digit subdirectory every 10000 documents; a trailing ".xml"
//     (any case) is dropped from the document name first
//   - appends "name|sentence" to the file data + ".lines"
//   - appends "Q#name|text" to the file stderr for the first comment that
//     starts with "Q#" and contains a "|"
//   - when Cfg.Dactx is set, stores the result of dactExpand (or the
//     original XML if that result is empty) in a freshly created dact + "x"
//
// The work is abandoned with errGlobalExit or errKilled as soon as
// chGlobalExit or chKill delivers.
//
// It returns the total number of whitespace-separated tokens in all
// sentences and the number of documents processed. On error both counts
// are 0.
func unpackDact(data, xmldir, dact, stderr string, chKill chan bool) (tokens, nline int, err error) {

	// Best effort: the directory may already exist.
	os.Mkdir(xmldir, 0777)

	dc, err := dbxml.Open(dact)
	if err != nil {
		return 0, 0, fmt.Errorf(
			"Openen mislukt. PaQu kan geen dact-bestanden lezen die gemaakt zijn met DbXml nieuwer dan versie %d",
			dbxml_version_major())
	}
	defer dc.Close()

	var dbx *dbxml.Db
	if Cfg.Dactx {
		os.Remove(dact + "x")
		dbx, err = dbxml.Open(dact + "x")
		if err != nil {
			return 0, 0, err
		}
		defer dbx.Close()
	}

	docs, err := dc.All()
	if err != nil {
		return 0, 0, fmt.Errorf("Lezen dact-bestand: %s", err)
	}
	defer docs.Close()

	fperr, err := os.Create(stderr)
	if err != nil {
		return 0, 0, err
	}
	defer fperr.Close()

	fplines, err := os.Create(data + ".lines")
	if err != nil {
		return 0, 0, err
	}
	defer fplines.Close()

	tokens = 0
	nline = 0
	nd := -1
	sdir := ""
	for docs.Next() {

		select {
		case <-chGlobalExit:
			return 0, 0, errGlobalExit
		case <-chKill:
			return 0, 0, errKilled
		default:
		}

		nline++

		// Documents 1, 10001, 20001, ... each open a new subdirectory.
		if nline%10000 == 1 {
			nd++
			sdir = fmt.Sprintf("%04d", nd)
			os.Mkdir(filepath.Join(xmldir, sdir), 0777)
		}

		name := docs.Name()
		if strings.HasSuffix(strings.ToLower(name), ".xml") {
			name = name[:len(name)-4]
		}
		encname := encode_filename(name)

		raw := []byte(docs.Content())

		fp, err := os.Create(filepath.Join(xmldir, sdir, encname+".xml"))
		if err != nil {
			return 0, 0, err
		}
		_, err = fp.Write(raw)
		// Always close the file, and report a close failure if the write
		// itself succeeded.
		if cerr := fp.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			return 0, 0, err
		}

		alpino := Alpino_ds_no_node{}
		err = xml.Unmarshal(raw, &alpino)
		if err != nil {
			return 0, 0, fmt.Errorf("Parsen van %q uit dact-bestand: %s", docs.Name(), err)
		}
		tokens += len(strings.Fields(alpino.Sentence))
		fmt.Fprintf(fplines, "%s|%s\n", name, strings.TrimSpace(alpino.Sentence))
		for _, c := range alpino.Comments {
			if strings.HasPrefix(c.Comment, "Q#") {
				a := strings.SplitN(c.Comment, "|", 2)
				if len(a) == 2 {
					fmt.Fprintf(fperr, "Q#%s|%s\n", name, strings.TrimSpace(a[1]))
					break
				}
			}
		}
		if Cfg.Dactx {
			content, err := dactExpand(raw)
			if err != nil {
				return 0, 0, err
			}
			if content == "" {
				content = string(raw)
			}
			// The expanded archive keeps the original document name,
			// including any ".xml" suffix.
			err = dbx.PutXml(docs.Name(), content, false)
			if err != nil {
				return 0, 0, err
			}
		}
	}
	// Next returning false can also mean the iteration failed; without this
	// check a partially read archive would be reported as success.
	if err := docs.Error(); err != nil {
		return 0, 0, fmt.Errorf("Lezen dact-bestand: %s", err)
	}

	return tokens, nline, nil
}
Beispiel #11
0
// makeDact stores the files listed by filenames2 for directory xml as
// documents in a new dact file at path dact.
//
// Any existing file at dact is removed first. Each document is named after
// its decoded file name; if stripchar is non-empty, everything up to and
// including the first occurrence of stripchar is cut from that name (the
// name is left alone when stripchar does not occur). When Cfg.Dactx is set,
// a second file dact + "x" is built alongside, holding the output of
// dactExpand for each document, or the original content when that output
// is empty.
//
// The loop stops with errGlobalExit or errKilled as soon as chGlobalExit or
// chKill delivers. Any other error is returned unchanged.
func makeDact(dact, xml string, stripchar string, chKill chan bool) error {
	entries, err := filenames2(xml, false)
	if err != nil {
		return err
	}

	os.Remove(dact)
	plain, err := dbxml.Open(dact)
	if err != nil {
		return err
	}
	defer plain.Close()

	expandedPath := dact + "x"
	var expanded *dbxml.Db
	if Cfg.Dactx {
		os.Remove(expandedPath)
		if expanded, err = dbxml.Open(expandedPath); err != nil {
			return err
		}
		defer expanded.Close()
	}

	for _, entry := range entries {

		// Non-blocking check for a shutdown or kill request.
		select {
		case <-chGlobalExit:
			return errGlobalExit
		case <-chKill:
			return errKilled
		default:
		}

		raw, err := ioutil.ReadFile(filepath.Join(xml, entry))
		if err != nil {
			return err
		}

		docname := decode_filename(entry)
		if stripchar != "" {
			// Index yields -1 when stripchar is absent, so the slice then
			// starts at 0 and the name stays as it is.
			docname = docname[strings.Index(docname, stripchar)+1:]
		}
		if err = plain.PutXml(docname, string(raw), false); err != nil {
			return err
		}

		if !Cfg.Dactx {
			continue
		}

		text, err := dactExpand(raw)
		if err != nil {
			return err
		}
		if text == "" {
			text = string(raw)
		}
		if err = expanded.PutXml(docname, text, false); err != nil {
			return err
		}
	}
	return nil
}