Example #1
// Reads a single JSON request from the TCP connection, performs the search and
// sends results back over the TCP connection as they appear.
func streamingQuery(conn net.Conn) {
	defer conn.Close()
	connMu := new(sync.Mutex)
	logprefix := fmt.Sprintf("[%s]", conn.RemoteAddr().String())

	type sourceRequest struct {
		Query string
		// Rewritten URL (after RewriteQuery()) with all the parameters that
		// are relevant for ranking.
		URL string
	}

	var r sourceRequest
	if err := json.NewDecoder(conn).Decode(&r); err != nil {
		log.Printf("%s Could not parse JSON request: %v\n", logprefix, err)
		return
	}

	logprefix = fmt.Sprintf("%s [%q]", logprefix, r.Query)

	// Ask the local index backend for all the filenames.
	filenames, err := queryIndexBackend(r.Query)
	if err != nil {
		log.Printf("%s Error querying index backend for query %q: %v\n", logprefix, r.Query, err)
		return
	}

	// Parse the (rewritten) URL to extract all ranking options/keywords.
	rewritten, err := url.Parse(r.URL)
	if err != nil {
		log.Fatal(err)
	}
	rankingopts := ranking.RankingOptsFromQuery(rewritten.Query())

	// Rank all the paths.
	files := make(ranking.ResultPaths, 0, len(filenames))
	for _, filename := range filenames {
		result := ranking.ResultPath{Path: filename}
		result.Rank(&rankingopts)
		if result.Ranking > -1 {
			files = append(files, result)
		}
	}

	// Filter out all files that should be excluded.
	files = filterByKeywords(rewritten, files)

	// While not strictly necessary, this will lead to better results being
	// discovered (and returned!) earlier, so let’s spend a few cycles on
	// sorting the list of potential files first.
	sort.Sort(files)

	re, err := regexp.Compile(r.Query)
	if err != nil {
		log.Printf("%s Could not compile regexp: %v\n", logprefix, err)
		return
	}

	log.Printf("%s regexp = %q, %d possible files\n", logprefix, re, len(files))

	// Send the first progress update so that clients know how many files are
	// going to be searched.
	if _, err := sendProgressUpdate(conn, connMu, 0, len(files)); err != nil {
		log.Printf("%s %v\n", logprefix, err)
		return
	}

	// The tricky part here is “flow control”: if we just start grepping like
	// crazy, we will eventually run out of memory because all our writes are
	// blocked on the connection (and the goroutines need to keep the write
	// buffer in memory until the write is done).
	//
	// So instead, we start 1000 worker goroutines and feed them work through a
	// single channel. Due to these goroutines being blocked on writing,
	// the grepping will naturally become slower.
	work := make(chan ranking.ResultPath)
	progress := make(chan int)

	var wg sync.WaitGroup
	// We add an additional 1 for the progress updater goroutine. It, too,
	// needs to finish before we can return; otherwise it would try to use the
	// (already closed) network connection, which is a fatal error.
	wg.Add(len(files) + 1)

	go func() {
		for _, file := range files {
			work <- file
		}
		close(work)
	}()

	go func() {
		cnt := 0
		errorShown := false
		var lastProgressUpdate time.Time
		progressInterval := 2*time.Second + time.Duration(rand.Int63n(int64(500*time.Millisecond)))
		for cnt < len(files) {
			add := <-progress
			cnt += add

			if time.Since(lastProgressUpdate) > progressInterval {
				if _, err := sendProgressUpdate(conn, connMu, cnt, len(files)); err != nil {
					if !errorShown {
						log.Printf("%s %v\n", logprefix, err)
						// We need to read the 'progress' channel, so we cannot
						// just exit the loop here. Instead, we suppress all
						// error messages after the first one.
						errorShown = true
					}
				}
				lastProgressUpdate = time.Now()
			}
		}

		if _, err := sendProgressUpdate(conn, connMu, len(files), len(files)); err != nil {
			log.Printf("%s %v\n", logprefix, err)
		}
		close(progress)

		wg.Done()
	}()

	querystr := ranking.NewQueryStr(r.Query)

	numWorkers := 1000
	if len(files) < 1000 {
		numWorkers = len(files)
	}
	for i := 0; i < numWorkers; i++ {
		go func() {
			re, err := regexp.Compile(r.Query)
			if err != nil {
				log.Printf("%s\n", err)
				return
			}

			grep := regexp.Grep{
				Regexp: re,
				Stdout: os.Stdout,
				Stderr: os.Stderr,
			}

			for file := range work {
				sourcePkgName := file.Path[file.SourcePkgIdx[0]:file.SourcePkgIdx[1]]
				if rankingopts.Pathmatch {
					file.Ranking += querystr.Match(&file.Path)
				}
				if rankingopts.Sourcepkgmatch {
					file.Ranking += querystr.Match(&sourcePkgName)
				}
				if rankingopts.Weighted {
					file.Ranking += 0.1460 * querystr.Match(&file.Path)
					file.Ranking += 0.0008 * querystr.Match(&sourcePkgName)
				}

				// TODO: figure out how to safely clone a dcs/regexp
				matches := grep.File(path.Join(*unpackedPath, file.Path))
				for _, match := range matches {
					match.Ranking = ranking.PostRank(rankingopts, &match, &querystr)
					match.PathRank = file.Ranking
					//match.Path = match.Path[len(*unpackedPath):]
					// NB: populating match.Ranking happens in
					// cmd/dcs-web/querymanager because it depends on at least
					// one other result.

					// TODO: ideally, we’d get capn buffers from grep.File(), let’s do that after profiling the decoding performance
					seg := capn.NewBuffer(nil)
					z := proto.NewRootZ(seg)
					m := proto.NewMatch(seg)
					m.SetPath(match.Path[len(*unpackedPath):])
					m.SetLine(uint32(match.Line))
					m.SetPackage(m.Path()[:strings.Index(m.Path(), "/")])
					m.SetCtxp2(match.Ctxp2)
					m.SetCtxp1(match.Ctxp1)
					m.SetContext(match.Context)
					m.SetCtxn1(match.Ctxn1)
					m.SetCtxn2(match.Ctxn2)
					m.SetPathrank(match.PathRank)
					m.SetRanking(match.Ranking)
					z.SetMatch(m)

					connMu.Lock()
					if _, err := seg.WriteToPacked(conn); err != nil {
						connMu.Unlock()
						log.Printf("%s %v\n", logprefix, err)
						// Drain the work channel, but without doing any work.
						// This effectively exits the worker goroutine(s)
						// cleanly.
						for range work {
						}
						break
					}
					connMu.Unlock()
				}

				progress <- 1

				wg.Done()
			}
		}()
	}

	wg.Wait()

	log.Printf("%s Sent all results.\n", logprefix)
}
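The example above only shows the per-connection handler. A minimal sketch of how it might be wired up, assuming a listener address and a serveQueries helper that are not part of the original code: each accepted TCP connection is handed to streamingQuery in its own goroutine, and the client sends a single JSON object matching sourceRequest ({"Query": ..., "URL": ...}) before reading progress updates and packed Cap'n Proto matches back over the same connection.

// Hypothetical wiring for streamingQuery; the address and the serveQueries
// helper are assumptions for illustration, not part of dcs itself.
func serveQueries(addr string) error {
	ln, err := net.Listen("tcp", addr)
	if err != nil {
		return err
	}
	for {
		conn, err := ln.Accept()
		if err != nil {
			return err
		}
		// streamingQuery decodes one JSON request from the connection,
		// streams results back and closes the connection when it is done.
		go streamingQuery(conn)
	}
}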
Example #2
func Source(w http.ResponseWriter, r *http.Request) {
	r.ParseForm()
	textQuery := r.Form.Get("q")
	limit, err := strconv.ParseInt(r.Form.Get("limit"), 10, 0)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}
	filenames := r.Form["filename"]
	re, err := regexp.Compile(textQuery)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}

	log.Printf("query: text = %s, regexp = %s\n", textQuery, re)

	rankingopts := ranking.RankingOptsFromQuery(r.URL.Query())

	querystr := ranking.NewQueryStr(textQuery)

	// Create one goroutine per filename, which means all IO will be done in
	// parallel.
	// TODO: implement a more clever way of scheduling IO. When enough results
	// are gathered, we don’t need to grep any other files, so currently we may
	// do unnecessary work.
	output := make(chan []regexp.Match)
	for _, filename := range filenames {
		go func(filename string) {
			// TODO: figure out how to safely clone a dcs/regexp
			re, err := regexp.Compile(textQuery)
			if err != nil {
				log.Printf("%s\n", err)
				return
			}

			grep := regexp.Grep{
				Regexp: re,
				Stdout: os.Stdout,
				Stderr: os.Stderr,
			}

			output <- grep.File(path.Join(*unpackedPath, filename))
		}(filename)
	}

	fmt.Printf("done, now getting the results\n")

	// TODO: also limit the number of matches per source-package, not only per file
	var reply SourceReply
	for idx, filename := range filenames {
		fmt.Printf("…in %s\n", filename)
		matches := <-output
		for idx, match := range matches {
			if limit > 0 && idx == 5 {
				// TODO: we somehow need to signal that there are more results
				// (if there are more), so that the user can expand this.
				break
			}
			fmt.Printf("match: %s\n", match)
			match.Ranking = ranking.PostRank(rankingopts, &match, &querystr)
			match.Path = match.Path[len(*unpackedPath):]
			reply.AllMatches = append(reply.AllMatches, match)
		}
		if limit > 0 && int64(len(reply.AllMatches)) >= limit {
			reply.LastUsedFilename = idx
			break
		}
	}
	jsonFiles, err := json.Marshal(&reply)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}
	_, err = w.Write(jsonFiles)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}

	// Read the remaining outputs in the background.
	if reply.LastUsedFilename > 0 {
		go func(stopped, max int) {
			for i := stopped + 1; i < max; i++ {
				<-output
			}
		}(reply.LastUsedFilename, len(filenames))
	}
}
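A minimal usage sketch for the Source handler, assuming it lives in a package where the *unpackedPath flag is already defined; the route, port, and query values below are illustrative assumptions. The handler expects q (a regular expression), limit, and one or more filename parameters, and returns at most limit matches overall (and, as the code above shows, at most five matches per file when a limit is set).

// Hypothetical registration of the Source handler; the route, port and
// example query values are assumptions for illustration only.
func registerSourceHandler() {
	http.HandleFunc("/source", Source)
	// Example request (values are made up):
	//   /source?q=fmt%5C.Println&limit=10&filename=hello-1.0/main.go
	log.Fatal(http.ListenAndServe(":8080", nil))
}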
Example #3
File: search.go Project: krzsas/dcs
func Search(w http.ResponseWriter, r *http.Request) {
	var tinit, t0, t1, t2, t3, t4 time.Time

	tinit = time.Now()

	// Rewrite the query to extract words like "lang:c" from the querystring
	// and place them in parameters.
	rewritten := RewriteQuery(*r.URL)

	query := rewritten.Query()
	// The "package:" keyword, if specified.
	pkg := rewritten.Query().Get("package")
	// The "-package:" keyword, if specified.
	npkgs := rewritten.Query()["npackage"]
	// The "path:" keyword, if specified.
	paths := rewritten.Query()["path"]

	// Usage of this flag should be restricted to local IP addresses or
	// something like that (it causes a lot of load, but it makes analyzing the
	// search engine’s ranking easier).
	allResults := query.Get("all") == "1"

	// Users can configure which ranking factors (and with what weight) they
	// want to use. rankingopts stores these values, extracted from the query
	// parameters.
	rankingopts := ranking.RankingOptsFromQuery(query)

	querystr := ranking.NewQueryStr(query.Get("q"))

	if len(query.Get("q")) < 3 {
		err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
			"q":        r.URL.Query().Get("q"),
			"errormsg": "Your search term is too short. You need at least 3 characters.",
		})
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}

		return
	}

	_, err := regexp.Compile(query.Get("q"))
	if err != nil {
		err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
			"q":          r.URL.Query().Get("q"),
			"errormsg":   fmt.Sprintf(`%v`, err),
			"suggestion": template.HTML(`See <a href="http://codesearch.debian.net/faq#regexp">http://codesearch.debian.net/faq#regexp</a> for help on regular expressions.`),
		})
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}

		return
	}

	// Number of files to skip when searching. Used for pagination.
	skip64, _ := strconv.ParseInt(query.Get("skip"), 10, 0)
	skip := int(skip64)

	log.Printf("Search query: term %q, URL %q", query.Get("q"), rewritten.String())
	fmt.Printf("opts: %v\n", rankingopts)

	fmt.Printf("Query parsed after %v\n", time.Now().Sub(tinit))

	// TODO: compile the regular expression right here so that we don’t do it N
	// times and can properly error out.

	// Send the query to all index backends (our index is sharded into multiple
	// pieces).
	backends := strings.Split(*indexBackends, ",")
	done := make(chan int)
	indexResults := make(chan ranking.ResultPath, 10)
	t0 = time.Now()
	for _, backend := range backends {
		fmt.Printf("Sending query to " + backend)
		go sendIndexQuery(rewritten, backend, indexResults, done, rankingopts)
	}

	// Close the result channel when all index queries are done so that we can
	// use range on the result channel.
	go func() {
		for i := 0; i < len(backends); i++ {
			<-done
		}
		close(indexResults)
	}()

	var files ranking.ResultPaths
	// We also keep the files in a map with their path as the key so that we
	// can correlate a match to a (ranked!) filename later on.
	fileMap := make(map[string]ranking.ResultPath)
	for result := range indexResults {
		// Time to the first result (≈ time to query the regexp index in
		// case len(backends) == 1)
		if t1.IsZero() {
			t1 = time.Now()
		}
		files = append(files, result)
	}

	// Time to receive and rank the results
	t2 = time.Now()
	log.Printf("All %d index backend results after %v\n", len(files), t2.Sub(t0))

	// Filter the filenames if the "package:" keyword was specified.
	if pkg != "" {
		fmt.Printf("Filtering for package %q\n", pkg)
		filtered := make(ranking.ResultPaths, 0, len(files))
		for _, file := range files {
			// XXX: Do we want this to be a regular expression match, too?
			if file.Path[file.SourcePkgIdx[0]:file.SourcePkgIdx[1]] != pkg {
				continue
			}

			filtered = append(filtered, file)
		}

		files = filtered
	}
	// Filter the filenames if the "-package:" keyword was specified.
	for _, npkg := range npkgs {
		fmt.Printf("Excluding matches for package %q\n", npkg)
		filtered := make(ranking.ResultPaths, 0, len(files))
		for _, file := range files {
			// XXX: Do we want this to be a regular expression match, too?
			if file.Path[file.SourcePkgIdx[0]:file.SourcePkgIdx[1]] == npkg {
				continue
			}

			filtered = append(filtered, file)
		}

		files = filtered
	}

	for _, path := range paths {
		fmt.Printf("Filtering for path %q\n", path)
		pathRegexp, err := regexp.Compile(path)
		if err != nil {
			err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
				"q":          r.URL.Query().Get("q"),
				"errormsg":   fmt.Sprintf(`%v`, err),
				"suggestion": template.HTML(`See <a href="http://codesearch.debian.net/faq#regexp">http://codesearch.debian.net/faq#regexp</a> for help on regular expressions.`),
			})
			if err != nil {
				http.Error(w, err.Error(), http.StatusInternalServerError)
			}

			return
		}

		filtered := make(ranking.ResultPaths, 0, len(files))
		for _, file := range files {
			if pathRegexp.MatchString(file.Path, true, true) == -1 {
				continue
			}

			filtered = append(filtered, file)
		}

		files = filtered
	}

	sort.Sort(files)

	// Time to sort the results
	t3 = time.Now()

	// Now we set up a goroutine which grabs 1000 filenames, ranks them and
	// sends them to sendSourceQuery until sendSourceQuery tells it to stop.
	// For most queries, the first batch will be enough, but for queries with a
	// high false-positive rate (that is, file does not contain the searched
	// word, but all trigrams), we need multiple iterations.
	values := make(chan ranking.ResultPaths)
	cont := make(chan bool)

	go func() {
		start := 0

		for start < len(files) {
			fmt.Printf("ranking 1000 starting from %d\n", start)
			batch := ranking.ResultPaths(resultWindow(files, start, 1000))

			for idx, result := range batch {
				sourcePkgName := result.Path[result.SourcePkgIdx[0]:result.SourcePkgIdx[1]]
				if rankingopts.Pathmatch {
					batch[idx].Ranking += querystr.Match(&result.Path)
				}
				if rankingopts.Sourcepkgmatch {
					batch[idx].Ranking += querystr.Match(&sourcePkgName)
				}
				if rankingopts.Weighted {
					batch[idx].Ranking += 0.1460 * querystr.Match(&result.Path)
					batch[idx].Ranking += 0.0008 * querystr.Match(&sourcePkgName)
				}
				fileMap[result.Path] = batch[idx]
			}

			sort.Sort(batch)
			values <- batch
			if !<-cont {
				fmt.Printf("ranking goroutine exits\n")
				return
			}

			start += 1000
		}

		// Close the channel to signal that there are no more values available
		close(values)

		// Read one value from the cont channel to avoid a blocking write in
		// sendSourceQuery (which would effectively leak that goroutine).
		<-cont
	}()

	tBeforeSource := time.Now()

	// NB: At this point we could implement some kind of scheduler in the
	// future to split the load between multiple source servers (that might
	// even be multiple instances on the same machine just serving from
	// different disks).
	matches := make(chan Match)
	go sendSourceQuery(rewritten, values, cont, matches, done, allResults, skip)

	var results SearchResults
	var lastUsedFilename int
	maxPathRanking := float32(0)
	for i := 0; i < 1; {
		select {
		case match := <-matches:
			// Time to the first index result
			if t4.IsZero() {
				t4 = time.Now()
			}
			match.Prettify()
			fileResult, ok := fileMap[match.Path]
			if !ok {
				log.Printf("Could not find %s in fileMap?!\n", match.Path)
			} else {
				match.PathRanking = fileResult.Ranking
			}
			if match.PathRanking > maxPathRanking {
				maxPathRanking = match.PathRanking
			}
			results = append(results, match)
		case lastUsedFilename = <-done:
			i++
		}
	}

	fmt.Printf("All source backend results after %v\n", time.Now().Sub(tBeforeSource))

	// Now store the combined ranking of PathRanking (pre) and Ranking (post).
	// We add the values because they are both percentages.
	// To make the Ranking (post) less significant, we multiply it by
	// 1/10 * maxPathRanking.
	for idx, match := range results {
		results[idx].FinalRanking = match.PathRanking + ((maxPathRanking * 0.1) * match.Ranking)
	}

	sort.Sort(results)

	// People seem to be distracted by large negative numbers, so we’d rather
	// show a 0 in case there were no source results :-).
	if t4.IsZero() {
		t4 = t3
	}

	// Add our measurements as HTTP headers so that we can log them in nginx.
	outHeader := w.Header()
	// time to first regexp result
	outHeader.Add("dcs-t0", fmt.Sprintf("%.2fms", float32(t1.Sub(t0).Nanoseconds())/1000/1000))
	// time to receive and rank
	outHeader.Add("dcs-t1", fmt.Sprintf("%.2fms", float32(t2.Sub(t1).Nanoseconds())/1000/1000))
	// time to sort
	outHeader.Add("dcs-t2", fmt.Sprintf("%.2fms", float32(t3.Sub(t2).Nanoseconds())/1000/1000))
	// time to first index result
	outHeader.Add("dcs-t3", fmt.Sprintf("%.2fms", float32(t4.Sub(t3).Nanoseconds())/1000/1000))
	// amount of regexp results
	outHeader.Add("dcs-numfiles", fmt.Sprintf("%.d", len(files)))
	// amount of source results
	outHeader.Add("dcs-numresults", fmt.Sprintf("%.d", len(results)))

	// Show a helpful message when there are no search results instead of just
	// an empty list.
	if len(results) == 0 {
		err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
			"q":          r.URL.Query().Get("q"),
			"errormsg":   "No search results!",
			"suggestion": template.HTML(`Debian Code Search is case-sensitive. Also, search queries are interpreted as <a href="http://codesearch.debian.net/faq#regexp">regular expressions</a>.`),
		})
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}

		return

	}

	// NB: We send the template output to a buffer because that is faster. We
	// also just use the template for the header of the page and then print the
	// results directly from Go, which saves ≈ 10 ms (!).
	outputBuffer := new(bytes.Buffer)
	err = common.Templates.ExecuteTemplate(outputBuffer, "results.html", map[string]interface{}{
		//"results": results,
		"t0":         t1.Sub(t0),
		"t1":         t2.Sub(t1),
		"t2":         t3.Sub(t2),
		"t3":         t4.Sub(t3),
		"numfiles":   len(files),
		"numresults": len(results),
		"timing":     (rewritten.Query().Get("notiming") != "1"),
		"q":          r.URL.Query().Get("q"),
	})
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
	}
	outputBuffer.WriteTo(w)

	context := make([]string, 5)
	for _, result := range results {
		ctx := context[:0]
		if val := strings.TrimSpace(result.Ctxp2); val != "" {
			ctx = append(ctx, result.Ctxp2)
		}
		if val := strings.TrimSpace(result.Ctxp1); val != "" {
			ctx = append(ctx, result.Ctxp1)
		}
		ctx = append(ctx, "<strong>"+result.Context+"</strong>")
		if val := strings.TrimSpace(result.Ctxn1); val != "" {
			ctx = append(ctx, result.Ctxn1)
		}
		if val := strings.TrimSpace(result.Ctxn2); val != "" {
			ctx = append(ctx, result.Ctxn2)
		}
		fmt.Fprintf(w, `<li><a href="/show?file=%s&amp;line=%d&amp;numfiles=%d#L%d"><code><strong>%s</strong>%s:%d</code></a><br><pre>%s</pre>
<small>PathRank: %g, Rank: %g, Final: %g</small></li>`+"\n",
			url.QueryEscape(result.SourcePackage+result.RelativePath),
			result.Line,
			len(files),
			result.Line,
			result.SourcePackage,
			result.RelativePath,
			result.Line,
			strings.Replace(strings.Join(ctx, "<br>"), "\t", "    ", -1),
			result.PathRanking,
			result.Ranking,
			result.FinalRanking)
	}
	fmt.Fprintf(w, "</ul>")

	fmt.Fprintf(w, `<div id="pagination">`)
	if skip > 0 {
		urlCopy := *r.URL
		queryCopy := urlCopy.Query()
		// Pop one value from the dot-separated "prev" parameter.
		prev := strings.Split(queryCopy.Get("prev"), ".")
		// strings.Split always returns at least one element, but let’s make
		// sure; it’s user input, after all.
		if len(prev) > 0 {
			queryCopy.Set("skip", prev[len(prev)-1])
			queryCopy.Set("prev", strings.Join(prev[:len(prev)-1], "."))
			urlCopy.RawQuery = queryCopy.Encode()
			fmt.Fprintf(w, `<a href="%s">Previous page</a><div style="display: inline-block; width: 100px">&nbsp;</div>`, urlCopy.RequestURI())
		}
	}

	if skip != lastUsedFilename {
		urlCopy := *r.URL
		queryCopy := urlCopy.Query()
		queryCopy.Set("skip", fmt.Sprintf("%d", lastUsedFilename))
		nextPrev := queryCopy.Get("prev")
		if nextPrev == "" {
			nextPrev = "0"
		} else {
			// We use dot as a separator because it doesn’t get url-encoded
			// (see RFC 3986 section 2.3).
			nextPrev = fmt.Sprintf("%s.%d", nextPrev, skip)
		}
		queryCopy.Set("prev", nextPrev)
		urlCopy.RawQuery = queryCopy.Encode()
		fmt.Fprintf(w, `<a href="%s">Next page</a>`, urlCopy.RequestURI())
	}

	err = common.Templates.ExecuteTemplate(w, "footer.html", map[string]interface{}{
		"version": common.Version,
	})
	if err != nil {
		log.Printf("template error: %v\n", err.Error())
		// We cannot use http.Error since it sends headers and we already did that.
		//http.Error(w, err.Error(), http.StatusInternalServerError)
	}

	if len(*timingTotalPath) > 0 {
		fmt.Fprintf(tTotal, "%d\t%d\n", requestCounter, time.Now().Sub(t0).Nanoseconds()/1000/1000)
	}

	if len(*timingFirstRegexp) > 0 {
		fmt.Fprintf(tFirstRegexp, "%d\t%d\n", requestCounter, t1.Sub(t0).Nanoseconds()/1000/1000)
	}

	if len(*timingFirstIndex) > 0 {
		fmt.Fprintf(tFirstIndex, "%d\t%d\n", requestCounter, t4.Sub(t3).Nanoseconds()/1000/1000)
	}

	if len(*timingReceiveRank) > 0 {
		fmt.Fprintf(tReceiveRank, "%d\t%d\n", requestCounter, t2.Sub(t1).Nanoseconds()/1000/1000)
	}

	if len(*timingSort) > 0 {
		fmt.Fprintf(tSort, "%d\t%d\n", requestCounter, t3.Sub(t2).Nanoseconds()/1000/1000)
	}

	requestCounter++
}
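sendSourceQuery itself is not shown in this example. Below is a minimal sketch of the consumer side of the values/cont handshake driven by the ranking goroutine above, under the assumption that the consumer decides after each batch whether it has enough results; it illustrates the channel protocol only, not the original implementation.

// Hypothetical consumer of the ranking goroutine above: receive one ranked
// batch at a time and answer on cont whether another batch is wanted. This
// sketches the protocol only, not the real sendSourceQuery.
func consumeBatches(values <-chan ranking.ResultPaths, cont chan<- bool, enough func(ranking.ResultPaths) bool) {
	for batch := range values {
		if enough(batch) {
			// The ranking goroutine reads cont after every batch and exits
			// when it receives false.
			cont <- false
			return
		}
		cont <- true
	}
	// values was closed: every file has been ranked. The ranking goroutine
	// still performs one final read from cont, so send a value to unblock it.
	cont <- false
}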