// Reads a single JSON request from the TCP connection, performs the search and
// sends results back over the TCP connection as they appear.
func streamingQuery(conn net.Conn) {
	defer conn.Close()
	connMu := new(sync.Mutex)
	logprefix := fmt.Sprintf("[%s]", conn.RemoteAddr().String())

	type sourceRequest struct {
		Query string
		// Rewritten URL (after RewriteQuery()) with all the parameters that
		// are relevant for ranking.
		URL string
	}

	var r sourceRequest
	if err := json.NewDecoder(conn).Decode(&r); err != nil {
		log.Printf("%s Could not parse JSON request: %v\n", logprefix, err)
		return
	}

	logprefix = fmt.Sprintf("%s [%q]", logprefix, r.Query)

	// Ask the local index backend for all the filenames.
	filenames, err := queryIndexBackend(r.Query)
	if err != nil {
		log.Printf("%s Error querying index backend for query %q: %v\n", logprefix, r.Query, err)
		return
	}

	// Parse the (rewritten) URL to extract all ranking options/keywords.
	rewritten, err := url.Parse(r.URL)
	if err != nil {
		log.Fatal(err)
	}
	rankingopts := ranking.RankingOptsFromQuery(rewritten.Query())

	// Rank all the paths.
	files := make(ranking.ResultPaths, 0, len(filenames))
	for _, filename := range filenames {
		result := ranking.ResultPath{Path: filename}
		result.Rank(&rankingopts)
		if result.Ranking > -1 {
			files = append(files, result)
		}
	}

	// Filter all files that should be excluded.
	files = filterByKeywords(rewritten, files)

	// While not strictly necessary, this will lead to better results being
	// discovered (and returned!) earlier, so let’s spend a few cycles on
	// sorting the list of potential files first.
	sort.Sort(files)

	re, err := regexp.Compile(r.Query)
	if err != nil {
		log.Printf("%s Could not compile regexp: %v\n", logprefix, err)
		return
	}

	log.Printf("%s regexp = %q, %d possible files\n", logprefix, re, len(files))

	// Send the first progress update so that clients know how many files are
	// going to be searched.
	if _, err := sendProgressUpdate(conn, connMu, 0, len(files)); err != nil {
		log.Printf("%s %v\n", logprefix, err)
		return
	}

	// The tricky part here is “flow control”: if we just start grepping like
	// crazy, we will eventually run out of memory because all our writes are
	// blocked on the connection (and the goroutines need to keep the write
	// buffer in memory until the write is done).
	//
	// So instead, we start 1000 worker goroutines and feed them work through a
	// single channel. Due to these goroutines being blocked on writing, the
	// grepping will naturally become slower.
	work := make(chan ranking.ResultPath)
	progress := make(chan int)

	var wg sync.WaitGroup
	// We add the additional 1 for the progress updater goroutine. It also
	// needs to be done before we can return, otherwise it will try to use the
	// (already closed) network connection, which is a fatal error.
	wg.Add(len(files) + 1)

	go func() {
		for _, file := range files {
			work <- file
		}
		close(work)
	}()

	go func() {
		cnt := 0
		errorShown := false
		var lastProgressUpdate time.Time
		progressInterval := 2*time.Second + time.Duration(rand.Int63n(int64(500*time.Millisecond)))
		for cnt < len(files) {
			add := <-progress
			cnt += add

			if time.Since(lastProgressUpdate) > progressInterval {
				if _, err := sendProgressUpdate(conn, connMu, cnt, len(files)); err != nil {
					if !errorShown {
						log.Printf("%s %v\n", logprefix, err)
						// We need to read the 'progress' channel, so we cannot
						// just exit the loop here. Instead, we suppress all
						// error messages after the first one.
						errorShown = true
					}
				}
				lastProgressUpdate = time.Now()
			}
		}

		if _, err := sendProgressUpdate(conn, connMu, len(files), len(files)); err != nil {
			log.Printf("%s %v\n", logprefix, err)
		}
		close(progress)

		wg.Done()
	}()

	querystr := ranking.NewQueryStr(r.Query)

	numWorkers := 1000
	if len(files) < 1000 {
		numWorkers = len(files)
	}
	for i := 0; i < numWorkers; i++ {
		go func() {
			re, err := regexp.Compile(r.Query)
			if err != nil {
				log.Printf("%s\n", err)
				return
			}

			grep := regexp.Grep{
				Regexp: re,
				Stdout: os.Stdout,
				Stderr: os.Stderr,
			}

			for file := range work {
				sourcePkgName := file.Path[file.SourcePkgIdx[0]:file.SourcePkgIdx[1]]
				if rankingopts.Pathmatch {
					file.Ranking += querystr.Match(&file.Path)
				}
				if rankingopts.Sourcepkgmatch {
					file.Ranking += querystr.Match(&sourcePkgName)
				}
				if rankingopts.Weighted {
					file.Ranking += 0.1460 * querystr.Match(&file.Path)
					file.Ranking += 0.0008 * querystr.Match(&sourcePkgName)
				}

				// TODO: figure out how to safely clone a dcs/regexp
				matches := grep.File(path.Join(*unpackedPath, file.Path))
				for _, match := range matches {
					match.Ranking = ranking.PostRank(rankingopts, &match, &querystr)
					match.PathRank = file.Ranking
					//match.Path = match.Path[len(*unpackedPath):]
					// NB: populating match.Ranking happens in
					// cmd/dcs-web/querymanager because it depends on at least
					// one other result.

					// TODO: ideally, we’d get capn buffers from grep.File(),
					// let’s do that after profiling the decoding performance.
					seg := capn.NewBuffer(nil)
					z := proto.NewRootZ(seg)
					m := proto.NewMatch(seg)
					m.SetPath(match.Path[len(*unpackedPath):])
					m.SetLine(uint32(match.Line))
					m.SetPackage(m.Path()[:strings.Index(m.Path(), "/")])
					m.SetCtxp2(match.Ctxp2)
					m.SetCtxp1(match.Ctxp1)
					m.SetContext(match.Context)
					m.SetCtxn1(match.Ctxn1)
					m.SetCtxn2(match.Ctxn2)
					m.SetPathrank(match.PathRank)
					m.SetRanking(match.Ranking)
					z.SetMatch(m)

					connMu.Lock()
					if _, err := seg.WriteToPacked(conn); err != nil {
						connMu.Unlock()
						log.Printf("%s %v\n", logprefix, err)
						// Drain the work channel, but without doing any work.
						// This effectively exits the worker goroutine(s)
						// cleanly.
						for _ = range work {
						}
						break
					}
					connMu.Unlock()
				}

				progress <- 1

				wg.Done()
			}
		}()
	}

	wg.Wait()

	log.Printf("%s Sent all results.\n", logprefix)
}
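
// Source is the HTTP handler for non-streaming source queries: it greps each
// file given in the "filename" form values for the regular expression in "q"
// (one goroutine per file), ranks the matches and replies with a JSON-encoded
// SourceReply, stopping early once "limit" matches have been collected.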
func Source(w http.ResponseWriter, r *http.Request) {
	r.ParseForm()
	textQuery := r.Form.Get("q")
	limit, err := strconv.ParseInt(r.Form.Get("limit"), 10, 0)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}
	filenames := r.Form["filename"]
	re, err := regexp.Compile(textQuery)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}

	log.Printf("query: text = %s, regexp = %s\n", textQuery, re)

	rankingopts := ranking.RankingOptsFromQuery(r.URL.Query())

	querystr := ranking.NewQueryStr(textQuery)

	// Create one goroutine per filename, which means all IO will be done in
	// parallel.
	// TODO: implement a more clever way of scheduling IO. When enough results
	// are gathered, we don’t need to grep any other files, so currently we may
	// do unnecessary work.
	output := make(chan []regexp.Match)
	for _, filename := range filenames {
		go func(filename string) {
			// TODO: figure out how to safely clone a dcs/regexp
			re, err := regexp.Compile(textQuery)
			if err != nil {
				log.Printf("%s\n", err)
				return
			}

			grep := regexp.Grep{
				Regexp: re,
				Stdout: os.Stdout,
				Stderr: os.Stderr,
			}

			output <- grep.File(path.Join(*unpackedPath, filename))
		}(filename)
	}

	fmt.Printf("done, now getting the results\n")

	// TODO: also limit the number of matches per source-package, not only per file
	var reply SourceReply
	for idx, filename := range filenames {
		fmt.Printf("…in %s\n", filename)
		matches := <-output
		for idx, match := range matches {
			if limit > 0 && idx == 5 {
				// TODO: we somehow need to signal that there are more results
				// (if there are more), so that the user can expand this.
				break
			}
			fmt.Printf("match: %s\n", match)
			match.Ranking = ranking.PostRank(rankingopts, &match, &querystr)
			match.Path = match.Path[len(*unpackedPath):]
			reply.AllMatches = append(reply.AllMatches, match)
		}
		if limit > 0 && int64(len(reply.AllMatches)) >= limit {
			reply.LastUsedFilename = idx
			break
		}
	}

	jsonFiles, err := json.Marshal(&reply)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}

	_, err = w.Write(jsonFiles)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}

	// Read the remaining outputs in the background.
	if reply.LastUsedFilename > 0 {
		go func(stopped, max int) {
			for i := stopped + 1; i < max; i++ {
				<-output
			}
		}(reply.LastUsedFilename, len(filenames))
	}
}
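
// Search handles an interactive search request: it rewrites the query,
// collects candidate filenames from all index backends, filters them according
// to the package:/-package:/path: keywords, ranks and sorts them, then streams
// batches of candidates to the source backends via sendSourceQuery and renders
// the matches as HTML, including pagination links and timing measurements.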
func Search(w http.ResponseWriter, r *http.Request) {
	var tinit, t0, t1, t2, t3, t4 time.Time

	tinit = time.Now()

	// Rewrite the query to extract words like "lang:c" from the querystring
	// and place them in parameters.
	rewritten := RewriteQuery(*r.URL)

	query := rewritten.Query()

	// The "package:" keyword, if specified.
	pkg := rewritten.Query().Get("package")

	// The "-package:" keyword, if specified.
	npkgs := rewritten.Query()["npackage"]

	// The "path:" keyword, if specified.
	paths := rewritten.Query()["path"]

	// Usage of this flag should be restricted to local IP addresses or
	// something like that (it causes a lot of load, but it makes analyzing the
	// search engine’s ranking easier).
	allResults := query.Get("all") == "1"

	// Users can configure which ranking factors (with what weight) they want
	// to use. rankingopts stores these values, extracted from the query
	// parameters.
	rankingopts := ranking.RankingOptsFromQuery(query)

	querystr := ranking.NewQueryStr(query.Get("q"))

	if len(query.Get("q")) < 3 {
		err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
			"q":        r.URL.Query().Get("q"),
			"errormsg": "Your search term is too short. You need at least 3 characters.",
		})
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
		return
	}

	_, err := regexp.Compile(query.Get("q"))
	if err != nil {
		err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
			"q":          r.URL.Query().Get("q"),
			"errormsg":   fmt.Sprintf(`%v`, err),
			"suggestion": template.HTML(`See <a href="http://codesearch.debian.net/faq#regexp">http://codesearch.debian.net/faq#regexp</a> for help on regular expressions.`),
		})
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
		return
	}

	// Number of files to skip when searching. Used for pagination.
	skip64, _ := strconv.ParseInt(query.Get("skip"), 10, 0)
	skip := int(skip64)

	log.Printf("Search query: term %q, URL %q", query.Get("q"), rewritten.String())
	fmt.Printf("opts: %v\n", rankingopts)

	fmt.Printf("Query parsed after %v\n", time.Now().Sub(tinit))

	// TODO: compile the regular expression right here so that we don’t do it N
	// times and can properly error out.

	// Send the query to all index backends (our index is sharded into multiple
	// pieces).
	backends := strings.Split(*indexBackends, ",")
	done := make(chan int)
	indexResults := make(chan ranking.ResultPath, 10)
	t0 = time.Now()
	for _, backend := range backends {
		fmt.Printf("Sending query to " + backend)
		go sendIndexQuery(rewritten, backend, indexResults, done, rankingopts)
	}

	// Close the result channel when all index queries are done so that we can
	// use range on the result channel.
	go func() {
		for i := 0; i < len(backends); i++ {
			<-done
		}
		close(indexResults)
	}()

	var files ranking.ResultPaths
	// We also keep the files in a map with their path as the key so that we
	// can correlate a match to a (ranked!) filename later on.
	fileMap := make(map[string]ranking.ResultPath)
	for result := range indexResults {
		// Time to the first result (≈ time to query the regexp index in
		// case len(backends) == 1)
		if t1.IsZero() {
			t1 = time.Now()
		}

		files = append(files, result)
	}

	// Time to receive and rank the results
	t2 = time.Now()
	log.Printf("All %d index backend results after %v\n", len(files), t2.Sub(t0))

	// Filter the filenames if the "package:" keyword was specified.
	if pkg != "" {
		fmt.Printf(`Filtering for package "%s"\n`, pkg)
		filtered := make(ranking.ResultPaths, 0, len(files))
		for _, file := range files {
			// XXX: Do we want this to be a regular expression match, too?
			if file.Path[file.SourcePkgIdx[0]:file.SourcePkgIdx[1]] != pkg {
				continue
			}

			filtered = append(filtered, file)
		}

		files = filtered
	}

	// Filter the filenames if the "-package:" keyword was specified.
	for _, npkg := range npkgs {
		fmt.Printf(`Excluding matches for package "%s"\n`, npkg)
		filtered := make(ranking.ResultPaths, 0, len(files))
		for _, file := range files {
			// XXX: Do we want this to be a regular expression match, too?
			if file.Path[file.SourcePkgIdx[0]:file.SourcePkgIdx[1]] == npkg {
				continue
			}

			filtered = append(filtered, file)
		}

		files = filtered
	}

	for _, path := range paths {
		fmt.Printf(`Filtering for path "%s"\n`, path)
		pathRegexp, err := regexp.Compile(path)
		if err != nil {
			err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
				"q":          r.URL.Query().Get("q"),
				"errormsg":   fmt.Sprintf(`%v`, err),
				"suggestion": template.HTML(`See <a href="http://codesearch.debian.net/faq#regexp">http://codesearch.debian.net/faq#regexp</a> for help on regular expressions.`),
			})
			if err != nil {
				http.Error(w, err.Error(), http.StatusInternalServerError)
			}
			return
		}

		filtered := make(ranking.ResultPaths, 0, len(files))
		for _, file := range files {
			if pathRegexp.MatchString(file.Path, true, true) == -1 {
				continue
			}

			filtered = append(filtered, file)
		}

		files = filtered
	}

	sort.Sort(files)

	// Time to sort the results
	t3 = time.Now()

	// Now we set up a goroutine which grabs 1000 filenames, ranks them and
	// sends them to sendSourceQuery until sendSourceQuery tells it to stop.
	// For most queries, the first batch will be enough, but for queries with a
	// high false-positive rate (that is, the file does not contain the
	// searched word, but does contain all its trigrams), we need multiple
	// iterations.
	values := make(chan ranking.ResultPaths)
	cont := make(chan bool)

	go func() {
		start := 0
		for start < len(files) {
			fmt.Printf("ranking 1000 starting from %d\n", start)
			batch := ranking.ResultPaths(resultWindow(files, start, 1000))

			for idx, result := range batch {
				sourcePkgName := result.Path[result.SourcePkgIdx[0]:result.SourcePkgIdx[1]]
				if rankingopts.Pathmatch {
					batch[idx].Ranking += querystr.Match(&result.Path)
				}
				if rankingopts.Sourcepkgmatch {
					batch[idx].Ranking += querystr.Match(&sourcePkgName)
				}
				if rankingopts.Weighted {
					batch[idx].Ranking += 0.1460 * querystr.Match(&result.Path)
					batch[idx].Ranking += 0.0008 * querystr.Match(&sourcePkgName)
				}
				fileMap[result.Path] = batch[idx]
			}

			sort.Sort(batch)
			values <- batch
			if !<-cont {
				fmt.Printf("ranking goroutine exits\n")
				return
			}
			start += 1000
		}

		// Close the channel to signal that there are no more values available.
		close(values)

		// Read one more value from cont to avoid a blocking write in
		// sendSourceQuery (effectively leading to goroutine leaks).
		<-cont
	}()

	tBeforeSource := time.Now()

	// NB: At this point we could implement some kind of scheduler in the
	// future to split the load between multiple source servers (that might
	// even be multiple instances on the same machine just serving from
	// different disks).
	matches := make(chan Match)
	go sendSourceQuery(rewritten, values, cont, matches, done, allResults, skip)

	var results SearchResults
	var lastUsedFilename int
	maxPathRanking := float32(0)
	for i := 0; i < 1; {
		select {
		case match := <-matches:
			// Time to the first index result
			if t4.IsZero() {
				t4 = time.Now()
			}
			match.Prettify()
			fileResult, ok := fileMap[match.Path]
			if !ok {
				log.Printf("Could not find %s in fileMap?!\n", match.Path)
			} else {
				match.PathRanking = fileResult.Ranking
			}
			if match.PathRanking > maxPathRanking {
				maxPathRanking = match.PathRanking
			}
			results = append(results, match)
		case lastUsedFilename = <-done:
			i++
		}
	}

	fmt.Printf("All source backend results after %v\n", time.Now().Sub(tBeforeSource))

	// Now store the combined ranking of PathRanking (pre) and Ranking (post).
	// We add the values because they are both percentages.
	// To make the Ranking (post) less significant, we multiply it with
	// 1/10 * maxPathRanking.
	for idx, match := range results {
		results[idx].FinalRanking = match.PathRanking + ((maxPathRanking * 0.1) * match.Ranking)
	}

	sort.Sort(results)

	// People seem to be distracted by large negative numbers, so we rather
	// show a 0 in case there were no source results :-).
	if t4.IsZero() {
		t4 = t3
	}

	// Add our measurements as HTTP headers so that we can log them in nginx.
	outHeader := w.Header()
	// time to first regexp result
	outHeader.Add("dcs-t0", fmt.Sprintf("%.2fms", float32(t1.Sub(t0).Nanoseconds())/1000/1000))
	// time to receive and rank
	outHeader.Add("dcs-t1", fmt.Sprintf("%.2fms", float32(t2.Sub(t1).Nanoseconds())/1000/1000))
	// time to sort
	outHeader.Add("dcs-t2", fmt.Sprintf("%.2fms", float32(t3.Sub(t2).Nanoseconds())/1000/1000))
	// time to first index result
	outHeader.Add("dcs-t3", fmt.Sprintf("%.2fms", float32(t4.Sub(t3).Nanoseconds())/1000/1000))
	// number of regexp results
	outHeader.Add("dcs-numfiles", fmt.Sprintf("%.d", len(files)))
	// number of source results
	outHeader.Add("dcs-numresults", fmt.Sprintf("%.d", len(results)))

	// Show a helpful message when there are no search results instead of just
	// an empty list.
	if len(results) == 0 {
		err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
			"q":          r.URL.Query().Get("q"),
			"errormsg":   "No search results!",
			"suggestion": template.HTML(`Debian Code Search is case-sensitive. Also, search queries are interpreted as <a href="http://codesearch.debian.net/faq#regexp">regular expressions</a>.`),
		})
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
		return
	}

	// NB: We send the template output to a buffer because that is faster. We
	// also just use the template for the header of the page and then print the
	// results directly from Go, which saves ≈ 10 ms (!).
	outputBuffer := new(bytes.Buffer)
	err = common.Templates.ExecuteTemplate(outputBuffer, "results.html", map[string]interface{}{
		//"results": results,
		"t0":         t1.Sub(t0),
		"t1":         t2.Sub(t1),
		"t2":         t3.Sub(t2),
		"t3":         t4.Sub(t3),
		"numfiles":   len(files),
		"numresults": len(results),
		"timing":     (rewritten.Query().Get("notiming") != "1"),
		"q":          r.URL.Query().Get("q"),
	})
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
	}
	outputBuffer.WriteTo(w)

	context := make([]string, 5)
	for _, result := range results {
		ctx := context[:0]
		if val := strings.TrimSpace(result.Ctxp2); val != "" {
			ctx = append(ctx, result.Ctxp2)
		}
		if val := strings.TrimSpace(result.Ctxp1); val != "" {
			ctx = append(ctx, result.Ctxp1)
		}
		ctx = append(ctx, "<strong>"+result.Context+"</strong>")
		if val := strings.TrimSpace(result.Ctxn1); val != "" {
			ctx = append(ctx, result.Ctxn1)
		}
		if val := strings.TrimSpace(result.Ctxn2); val != "" {
			ctx = append(ctx, result.Ctxn2)
		}
		fmt.Fprintf(w, `<li><a href="/show?file=%s&line=%d&numfiles=%d#L%d"><code><strong>%s</strong>%s:%d</code></a><br><pre>%s</pre> <small>PathRank: %g, Rank: %g, Final: %g</small></li>`+"\n",
			url.QueryEscape(result.SourcePackage+result.RelativePath),
			result.Line,
			len(files),
			result.Line,
			result.SourcePackage,
			result.RelativePath,
			result.Line,
			strings.Replace(strings.Join(ctx, "<br>"), "\t", " ", -1),
			result.PathRanking,
			result.Ranking,
			result.FinalRanking)
	}
	fmt.Fprintf(w, "</ul>")

	fmt.Fprintf(w, `<div id="pagination">`)

	if skip > 0 {
		urlCopy := *r.URL
		queryCopy := urlCopy.Query()
		// Pop one value from nextPrev.
		prev := strings.Split(queryCopy.Get("prev"), ".")
		// We always have at least one element, but let’s make sure: it’s
		// user input, after all.
		if len(prev) > 0 {
			queryCopy.Set("skip", prev[len(prev)-1])
			queryCopy.Set("prev", strings.Join(prev[:len(prev)-1], "."))
			urlCopy.RawQuery = queryCopy.Encode()
			fmt.Fprintf(w, `<a href="%s">Previous page</a><div style="display: inline-block; width: 100px"> </div>`, urlCopy.RequestURI())
		}
	}

	if skip != lastUsedFilename {
		urlCopy := *r.URL
		queryCopy := urlCopy.Query()
		queryCopy.Set("skip", fmt.Sprintf("%d", lastUsedFilename))
		nextPrev := queryCopy.Get("prev")
		if nextPrev == "" {
			nextPrev = "0"
		} else {
			// We use dot as a separator because it doesn’t get url-encoded
			// (see RFC 3986 section 2.3).
			nextPrev = fmt.Sprintf("%s.%d", nextPrev, skip)
		}
		queryCopy.Set("prev", nextPrev)
		urlCopy.RawQuery = queryCopy.Encode()
		fmt.Fprintf(w, `<a href="%s">Next page</a>`, urlCopy.RequestURI())
	}

	err = common.Templates.ExecuteTemplate(w, "footer.html", map[string]interface{}{
		"version": common.Version,
	})
	if err != nil {
		log.Printf("template error: %v\n", err.Error())
		// We cannot use http.Error since it sends headers and we already did that.
		//http.Error(w, err.Error(), http.StatusInternalServerError)
	}

	if len(*timingTotalPath) > 0 {
		fmt.Fprintf(tTotal, "%d\t%d\n", requestCounter, time.Now().Sub(t0).Nanoseconds()/1000/1000)
	}

	if len(*timingFirstRegexp) > 0 {
		fmt.Fprintf(tFirstRegexp, "%d\t%d\n", requestCounter, t1.Sub(t0).Nanoseconds()/1000/1000)
	}

	if len(*timingFirstIndex) > 0 {
		fmt.Fprintf(tFirstIndex, "%d\t%d\n", requestCounter, t4.Sub(t3).Nanoseconds()/1000/1000)
	}

	if len(*timingReceiveRank) > 0 {
		fmt.Fprintf(tReceiveRank, "%d\t%d\n", requestCounter, t2.Sub(t1).Nanoseconds()/1000/1000)
	}

	if len(*timingSort) > 0 {
		fmt.Fprintf(tSort, "%d\t%d\n", requestCounter, t3.Sub(t2).Nanoseconds()/1000/1000)
	}

	requestCounter++
}