func (r *Run) fileWorker(files chan string) error {
	compiledInstructions, err := r.compileInstruction()
	if err != nil {
		return fmt.Errorf("failed to compile instructions: %s", err)
	}

	var fileFilterRe, fileIgnoreRe *regexp.Regexp
	if r.FileFilter != "" {
		fileFilterRe, err = regexp.Compile(r.FileFilter)
		if err != nil {
			return fmt.Errorf("failed to compile file filter regexp: %s", err)
		}
	}
	if r.FileIgnore != "" {
		fileIgnoreRe, err = regexp.Compile(r.FileIgnore)
		if err != nil {
			return fmt.Errorf("failed to compile file ignore regexp: %s", err)
		}
	}

	for f := range files {
		// codesearch/regexp’s MatchString returns the end offset of a
		// match, or -1 when there is none.
		if fileFilterRe != nil && fileFilterRe.MatchString(f, true, true) < 0 {
			continue // file does not match the filter
		}
		if fileIgnoreRe != nil && fileIgnoreRe.MatchString(f, true, true) > -1 {
			continue // file matches the ignore pattern
		}
		if err = r.processFile(f, compiledInstructions); err != nil {
			return fmt.Errorf("failed to process file %s: %s", f, err)
		}
	}
	return nil
}
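// A minimal sketch of the match convention used in fileWorker above. It
// assumes the regexp package in scope is codesearch/regexp, whose
// MatchString(s, beginText, endText) returns an offset (or -1 when there is
// no match) rather than the standard library’s bool. The function name and
// patterns here are made up for illustration:
func matchConventionExample() {
	re, err := regexp.Compile(`\.go$`)
	if err != nil {
		log.Fatal(err)
	}
	for _, name := range []string{"main.go", "README.md"} {
		if re.MatchString(name, true, true) < 0 {
			fmt.Printf("%s: no match (-1)\n", name)
		} else {
			fmt.Printf("%s: match\n", name)
		}
	}
}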
func (r *Run) Run() error {
	var wg sync.WaitGroup
	files := make(chan string, r.numWorkers()*2)
	// Assigning to a shared err variable from every worker goroutine would
	// be a data race; collect the workers’ errors in a buffered channel
	// instead and inspect them after wg.Wait().
	errs := make(chan error, r.numWorkers())
	for i := 0; i < r.numWorkers(); i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			errs <- r.fileWorker(files)
		}()
	}

	// Combine all instruction regexps into a single alternation:
	// (?m)(re1|re2|…|reN)
	combined := bytes.NewBufferString("(?m)(")
	last := len(r.Instruction) - 1
	for i, instr := range r.Instruction {
		combined.WriteString(instr.MatchRegexpString())
		if i != last {
			combined.WriteString("|")
		}
	}
	combined.WriteString(")")

	re, err := regexp.Compile(combined.String())
	if err != nil {
		return fmt.Errorf("failed to parse combined regexp %s: %s",
			combined.String(), err)
	}

	q := index.RegexpQuery(re.Syntax)
	post := r.Index.PostingQuery(q)
	for _, fileid := range post {
		files <- r.Index.Name(fileid)
	}
	close(files)

	wg.Wait()
	close(errs)
	for werr := range errs {
		if werr != nil {
			return werr
		}
	}
	return nil
}
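// The alternation Run builds — (?m)(re1|re2|…|reN) — keeps the index lookup
// to a single PostingQuery covering all instructions at once. It could
// equivalently be written with strings.Join; a sketch (the helper name is
// hypothetical):
func combineInstructions(patterns []string) string {
	// (?m) makes ^ and $ match at line boundaries, as in Run above.
	return "(?m)(" + strings.Join(patterns, "|") + ")"
}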
// Handles requests to /index by compiling the q= parameter into a regular
// expression (codesearch/regexp), searching the index for it and returning
// the list of matching filenames in a JSON array.
// TODO: This doesn’t handle file name regular expressions at all yet.
// TODO: errors aren’t properly signaled to the requester
func Index(w http.ResponseWriter, r *http.Request) {
	if *cpuProfile != "" {
		f, err := os.Create(*cpuProfile)
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	r.ParseForm()
	textQuery := r.Form.Get("q")
	re, err := regexp.Compile(textQuery)
	if err != nil {
		log.Printf("regexp.Compile: %s\n", err)
		return
	}
	query := index.RegexpQuery(re.Syntax)
	log.Printf("[%s] query: text = %s, regexp = %s\n", id, textQuery, query)

	files := doPostingQuery(query)

	t2 := time.Now()
	if err := json.NewEncoder(w).Encode(files); err != nil {
		log.Printf("%s\n", err)
		return
	}
	t3 := time.Now()
	fmt.Printf("[%s] written in %v\n", id, t3.Sub(t2))
}
func (si *SearchIndex) CodeResults(query string) []string {
	pat := "(?m)" + query
	re, err := csregexp.Compile(pat)
	if err != nil {
		log.Fatal(err)
	}

	codesearchQuery := index.RegexpQuery(re.Syntax)
	post := si.CodeIndex.PostingQuery(codesearchQuery)
	matchStrings := make([]string, len(post))
	for pos, fileid := range post {
		matchStrings[pos] = si.CodeIndex.Name(fileid)
	}
	return matchStrings
}
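// Hypothetical usage sketch for CodeResults. It assumes CodeIndex is a
// codesearch *index.Index loaded via index.Open, that no other SearchIndex
// fields are needed for this call, and the index path is made up:
func codeResultsExample() {
	si := &SearchIndex{CodeIndex: index.Open("/tmp/csearchindex")}
	for _, name := range si.CodeResults(`func\s+main`) {
		fmt.Println(name)
	}
}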
// Handles requests to /index by compiling the q= parameter into a regular
// expression (codesearch/regexp), searching the index for it and returning
// the list of matching filenames in a JSON array.
// TODO: This doesn’t handle file name regular expressions at all yet.
// TODO: errors aren’t properly signaled to the requester
func Index(w http.ResponseWriter, r *http.Request) {
	if *cpuProfile != "" {
		f, err := os.Create(*cpuProfile)
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	r.ParseForm()
	textQuery := r.Form.Get("q")
	re, err := regexp.Compile(textQuery)
	if err != nil {
		log.Printf("regexp.Compile: %s\n", err)
		return
	}
	query := index.RegexpQuery(re.Syntax)
	log.Printf("[%s] query: text = %s, regexp = %s\n", id, textQuery, query)

	t0 := time.Now()
	post := ix.PostingQuery(query)
	t1 := time.Now()
	fmt.Printf("[%s] postingquery done in %v, %d results\n", id, t1.Sub(t0), len(post))

	files := make([]string, len(post))
	for idx, fileid := range post {
		files[idx] = ix.Name(fileid)
	}
	t2 := time.Now()
	fmt.Printf("[%s] filenames collected in %v\n", id, t2.Sub(t1))

	jsonFiles, err := json.Marshal(files)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}
	t3 := time.Now()
	fmt.Printf("[%s] marshaling done in %v\n", id, t3.Sub(t2))

	_, err = w.Write(jsonFiles)
	if err != nil {
		log.Printf("%s\n", err)
		return
	}
	t4 := time.Now()
	fmt.Printf("[%s] written in %v\n", id, t4.Sub(t3))
}
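// To see what kind of index query RegexpQuery derives from a pattern, one
// can simply print it — a small sketch (assumes the codesearch regexp and
// index packages as above; the String form is an AND/OR tree over
// trigrams):
func trigramQueryExample() {
	re, err := regexp.Compile("hello world")
	if err != nil {
		log.Fatal(err)
	}
	q := index.RegexpQuery(re.Syntax)
	// Prints a conjunction of trigrams such as "hel", "ell", "llo", …;
	// only files whose trigram sets satisfy it need to be grepped.
	log.Printf("query: %s\n", q)
}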
func main() {
	var g regexp.Grep
	g.AddFlags()
	g.Stdout = os.Stdout
	g.Stderr = os.Stderr
	flag.Usage = usage
	flag.Parse()
	args := flag.Args()
	if len(args) == 0 {
		flag.Usage()
	}

	if *cpuProfile != "" {
		f, err := os.Create(*cpuProfile)
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	pat := "(?m)" + args[0]
	if *iflag {
		pat = "(?i)" + pat
	}
	re, err := regexp.Compile(pat)
	if err != nil {
		log.Fatal(err)
	}
	g.Regexp = re

	if len(args) == 1 {
		g.Reader(os.Stdin, "<standard input>")
	} else {
		for _, arg := range args[1:] {
			g.File(arg)
		}
	}
	if !g.Match {
		os.Exit(1)
	}
}
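// A minimal programmatic use of the same Grep type (a sketch, assuming the
// codesearch regexp package as above; Grep.Reader greps a stream under the
// configured Regexp, exactly as main does with stdin). The input string and
// pattern are made up:
func grepReaderExample() {
	var g regexp.Grep
	g.Stdout = os.Stdout
	g.Stderr = os.Stderr
	re, err := regexp.Compile("(?m)TODO")
	if err != nil {
		log.Fatal(err)
	}
	g.Regexp = re
	g.Reader(strings.NewReader("x\nTODO: fix\ny\n"), "<buffer>")
	if !g.Match {
		log.Println("no matches")
	}
}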
func query(patterns []string, fFlag string, iFlag bool, out io.Writer,
	limit int, timelimit time.Duration) (lines int) {
	var fre *regexp.Regexp
	var err error
	if fFlag != "" {
		fre, err = regexp.Compile(fFlag)
		if err != nil {
			return
		}
	}

	outchan := make(chan string)  // all output is collected here
	matchchan := make(chan bool)  // greps report whether they found something
	stopchan := make(chan bool)   // greps listen here to be stopped
	timeout := make(chan bool, 1) // delivers a timeout for this function
	// (buffered, so the timer goroutine cannot leak if we return first)
	go func() {
		time.Sleep(timelimit)
		timeout <- true
	}()

	g := make([]*Grep, 0, len(patterns))
	for _, v := range patterns {
		pat := "(?m)" + v
		if iFlag {
			pat = "(?i)" + pat
		}
		re, err := regexp.Compile(pat)
		if err != nil {
			continue
		}
		log.Printf("Grepping for %s\n", re)
		g = append(g, &Grep{
			Regexp:  re,
			Stdout:  outchan,
			Matched: matchchan,
			Stop:    stopchan,
			Stderr:  os.Stderr,
		})
	}
	if len(g) == 0 {
		return
	}

	q := index.RegexpQuery(g[0].Regexp.Syntax)
	for _, v := range g[1:] {
		q = q.And(index.RegexpQuery(v.Regexp.Syntax))
	}
	if *verboseFlag {
		log.Printf("query: %s\n", q)
	}

	ix := index.Open(index.File())
	ix.Verbose = *verboseFlag
	var post []uint32
	if *bruteFlag {
		post = ix.PostingQuery(&index.Query{Op: index.QAll})
	} else {
		post = ix.PostingQuery(q)
	}
	if *verboseFlag {
		log.Printf("post query identified %d possible files\n", len(post))
	}

	if fre != nil {
		fnames := make([]uint32, 0, len(post))
		for _, fileid := range post {
			name := ix.Name(fileid)
			if fre.MatchString(name, true, true) < 0 {
				continue
			}
			fnames = append(fnames, fileid)
		}
		if *verboseFlag {
			log.Printf("filename regexp matched %d files\n", len(fnames))
		}
		post = fnames
	}

	output := make([]string, 0, 10)
	lines = 0
	timeoutFlag := false
	for _, fileid := range post {
		output = output[:0]
		name := ix.Name(fileid)
		for _, grep := range g {
			go grep.File(name)
		}
		runningcount := len(g)
		// Counting is critical here. Read once from matchchan and write
		// once to stopchan for each grep — or everything will deadlock.
		matched := true
		for runningcount > 0 {
			select {
			case s := <-outchan:
				output = append(output, s)
			case match := <-matchchan:
				runningcount--
				if !match {
					matched = false
					runningcount = 0
				}
			case <-timeout:
				runningcount = 0
				timeoutFlag = true
			}
		}

		//log.Println("Stopping all greps")
		stopcount := len(g)
		for stopcount > 0 {
			select {
			case stopchan <- true:
				stopcount--
			case <-outchan:
			case <-matchchan:
			}
		}
		//log.Println("All greps stopped")

		if matched {
			if *verboseFlag {
				log.Printf("writing %d lines of output from %s\n", len(output), name)
			}
			for _, s := range output {
				fmt.Fprint(out, s)
				lines++
				limit--
				if limit == 0 {
					fmt.Fprint(out, "... :0: Even More.\n")
					return
				}
			}
		}
		if timeoutFlag {
			fmt.Fprintf(out, "... :0: Timeout: %dms.\n", timelimit/time.Millisecond)
			break
		}
	}
	return
}
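// The read-once/write-once counting protocol above is easy to get wrong, so
// here is a self-contained toy version of the same idea (entirely
// hypothetical code; n workers stand in for the greps). Each worker sends
// some output, reports exactly one match result, then blocks until
// released:
func stopProtocolExample() {
	const n = 3
	outchan := make(chan string)
	matchchan := make(chan bool)
	stopchan := make(chan bool)
	for i := 0; i < n; i++ {
		go func(id int) {
			outchan <- fmt.Sprintf("worker %d: some output\n", id)
			matchchan <- true // exactly one result per worker
			<-stopchan        // wait to be released
		}(i)
	}
	// Phase 1: consume exactly one matchchan value per worker, gathering
	// output along the way.
	for running := n; running > 0; {
		select {
		case s := <-outchan:
			fmt.Print(s)
		case <-matchchan:
			running--
		}
	}
	// Phase 2: release every worker, draining stray sends so that neither
	// side can deadlock — the same pattern as the stopcount loop above.
	for stopped := 0; stopped < n; {
		select {
		case stopchan <- true:
			stopped++
		case <-outchan:
		case <-matchchan:
		}
	}
}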
func Main() {
	g := regexp.Grep{
		Stdout: os.Stdout,
		Stderr: os.Stderr,
	}
	g.AddFlags()
	flag.Usage = usage
	flag.Parse()
	args := flag.Args()
	if len(args) != 1 {
		usage()
	}

	if *cpuProfile != "" {
		f, err := os.Create(*cpuProfile)
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	pat := "(?m)" + args[0]
	if *iFlag {
		pat = "(?i)" + pat
	}
	re, err := regexp.Compile(pat)
	if err != nil {
		log.Fatal(err)
	}
	g.Regexp = re

	var fre *regexp.Regexp
	if *fFlag != "" {
		fre, err = regexp.Compile(*fFlag)
		if err != nil {
			log.Fatal(err)
		}
	}

	q := index.RegexpQuery(re.Syntax)
	if *verboseFlag {
		log.Printf("query: %s\n", q)
	}

	ix := index.Open(index.File())
	ix.Verbose = *verboseFlag
	var post []uint32
	if *bruteFlag {
		post = ix.PostingQuery(&index.Query{Op: index.QAll})
	} else {
		post = ix.PostingQuery(q)
	}
	if *verboseFlag {
		log.Printf("post query identified %d possible files\n", len(post))
	}

	if fre != nil {
		fnames := make([]uint32, 0, len(post))
		for _, fileid := range post {
			name := ix.Name(fileid)
			if fre.MatchString(name, true, true) < 0 {
				continue
			}
			fnames = append(fnames, fileid)
		}
		if *verboseFlag {
			log.Printf("filename regexp matched %d files\n", len(fnames))
		}
		post = fnames
	}

	for _, fileid := range post {
		name := ix.Name(fileid)
		g.File(name)
	}

	matches = g.Match
}
func Search(w http.ResponseWriter, r *http.Request) {
	var tinit, t0, t1, t2, t3, t4 time.Time
	tinit = time.Now()

	// Rewrite the query to extract words like "lang:c" from the querystring
	// and place them in parameters.
	rewritten := RewriteQuery(*r.URL)
	query := rewritten.Query()

	// The "package:" keyword, if specified.
	pkg := rewritten.Query().Get("package")
	// The "-package:" keyword, if specified.
	npkgs := rewritten.Query()["npackage"]
	// The "path:" keyword, if specified.
	paths := rewritten.Query()["path"]

	// Usage of this flag should be restricted to local IP addresses or
	// something like that (it causes a lot of load, but it makes analyzing
	// the search engine’s ranking easier).
	allResults := query.Get("all") == "1"

	// Users can configure which ranking factors (with what weight) they
	// want to use. rankingopts stores these values, extracted from the
	// query parameters.
	rankingopts := ranking.RankingOptsFromQuery(query)

	querystr := ranking.NewQueryStr(query.Get("q"))

	if len(query.Get("q")) < 3 {
		err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
			"q":        r.URL.Query().Get("q"),
			"errormsg": "Your search term is too short. You need at least 3 characters.",
		})
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
		return
	}

	_, err := regexp.Compile(query.Get("q"))
	if err != nil {
		err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
			"q":          r.URL.Query().Get("q"),
			"errormsg":   fmt.Sprintf(`%v`, err),
			"suggestion": template.HTML(`See <a href="http://codesearch.debian.net/faq#regexp">http://codesearch.debian.net/faq#regexp</a> for help on regular expressions.`),
		})
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
		return
	}

	// Number of files to skip when searching. Used for pagination.
	skip64, _ := strconv.ParseInt(query.Get("skip"), 10, 0)
	skip := int(skip64)

	log.Printf("Search query: term %q, URL %q", query.Get("q"), rewritten.String())
	fmt.Printf("opts: %v\n", rankingopts)
	fmt.Printf("Query parsed after %v\n", time.Now().Sub(tinit))

	// TODO: compile the regular expression right here so that we don’t do
	// it N times and can properly error out.

	// Send the query to all index backends (our index is sharded into
	// multiple pieces).
	backends := strings.Split(*indexBackends, ",")
	done := make(chan int)
	indexResults := make(chan ranking.ResultPath, 10)
	t0 = time.Now()
	for _, backend := range backends {
		fmt.Printf("Sending query to %s\n", backend)
		go sendIndexQuery(rewritten, backend, indexResults, done, rankingopts)
	}

	// Close the result channel when all index queries are done so that we
	// can use range on the result channel.
	go func() {
		for i := 0; i < len(backends); i++ {
			<-done
		}
		close(indexResults)
	}()

	var files ranking.ResultPaths
	// We also keep the files in a map with their path as the key so that we
	// can correlate a match to a (ranked!) filename later on.
	fileMap := make(map[string]ranking.ResultPath)
	for result := range indexResults {
		// Time to the first result (≈ time to query the regexp index in
		// case len(backends) == 1)
		if t1.IsZero() {
			t1 = time.Now()
		}
		files = append(files, result)
	}

	// Time to receive and rank the results
	t2 = time.Now()
	log.Printf("All %d index backend results after %v\n", len(files), t2.Sub(t0))

	// Filter the filenames if the "package:" keyword was specified.
if pkg != "" { fmt.Printf(`Filtering for package "%s"\n`, pkg) filtered := make(ranking.ResultPaths, 0, len(files)) for _, file := range files { // XXX: Do we want this to be a regular expression match, too? if file.Path[file.SourcePkgIdx[0]:file.SourcePkgIdx[1]] != pkg { continue } filtered = append(filtered, file) } files = filtered } // Filter the filenames if the "-package:" keyword was specified. for _, npkg := range npkgs { fmt.Printf(`Excluding matches for package "%s"\n`, npkg) filtered := make(ranking.ResultPaths, 0, len(files)) for _, file := range files { // XXX: Do we want this to be a regular expression match, too? if file.Path[file.SourcePkgIdx[0]:file.SourcePkgIdx[1]] == npkg { continue } filtered = append(filtered, file) } files = filtered } for _, path := range paths { fmt.Printf(`Filtering for path "%s"\n`, path) pathRegexp, err := regexp.Compile(path) if err != nil { err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{ "q": r.URL.Query().Get("q"), "errormsg": fmt.Sprintf(`%v`, err), "suggestion": template.HTML(`See <a href="http://codesearch.debian.net/faq#regexp">http://codesearch.debian.net/faq#regexp</a> for help on regular expressions.`), }) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) } return } filtered := make(ranking.ResultPaths, 0, len(files)) for _, file := range files { if pathRegexp.MatchString(file.Path, true, true) == -1 { continue } filtered = append(filtered, file) } files = filtered } sort.Sort(files) // Time to sort the results t3 = time.Now() // Now we set up a goroutine which grabs 1000 filenames, ranks them and // sends them to sendSourceQuery until sendSourceQuery tells it to stop. // For most queries, the first batch will be enough, but for queries with a // high false-positive rate (that is, file does not contain the searched // word, but all trigrams), we need multiple iterations. values := make(chan ranking.ResultPaths) cont := make(chan bool) go func() { start := 0 for start < len(files) { fmt.Printf("ranking 1000 starting from %d\n", start) batch := ranking.ResultPaths(resultWindow(files, start, 1000)) for idx, result := range batch { sourcePkgName := result.Path[result.SourcePkgIdx[0]:result.SourcePkgIdx[1]] if rankingopts.Pathmatch { batch[idx].Ranking += querystr.Match(&result.Path) } if rankingopts.Sourcepkgmatch { batch[idx].Ranking += querystr.Match(&sourcePkgName) } if rankingopts.Weighted { batch[idx].Ranking += 0.1460 * querystr.Match(&result.Path) batch[idx].Ranking += 0.0008 * querystr.Match(&sourcePkgName) } fileMap[result.Path] = batch[idx] } sort.Sort(batch) values <- batch if !<-cont { fmt.Printf("ranking goroutine exits\n") return } start += 1000 } // Close the channel to signal that there are no more values available close(values) // Read value from cont goroutine to avoid a blocking write in // sendSourceQuery (effectively leading to goroutine leaks). <-cont }() tBeforeSource := time.Now() // NB: At this point we could implement some kind of scheduler in the // future to split the load between multiple source servers (that might // even be multiple instances on the same machine just serving from // different disks). 
	matches := make(chan Match)
	go sendSourceQuery(rewritten, values, cont, matches, done, allResults, skip)

	var results SearchResults
	var lastUsedFilename int
	maxPathRanking := float32(0)
	for i := 0; i < 1; {
		select {
		case match := <-matches:
			// Time to the first index result
			if t4.IsZero() {
				t4 = time.Now()
			}
			match.Prettify()
			fileResult, ok := fileMap[match.Path]
			if !ok {
				log.Printf("Could not find %s in fileMap?!\n", match.Path)
			} else {
				match.PathRanking = fileResult.Ranking
			}
			if match.PathRanking > maxPathRanking {
				maxPathRanking = match.PathRanking
			}
			results = append(results, match)
		case lastUsedFilename = <-done:
			i++
		}
	}
	fmt.Printf("All source backend results after %v\n", time.Now().Sub(tBeforeSource))

	// Now store the combined ranking of PathRanking (pre) and Ranking
	// (post). We add the values because they are both percentages. To make
	// the Ranking (post) less significant, we multiply it with
	// 1/10 * maxPathRanking.
	for idx, match := range results {
		results[idx].FinalRanking = match.PathRanking +
			((maxPathRanking * 0.1) * match.Ranking)
	}

	sort.Sort(results)

	// People seem to be distracted by large negative numbers, so we rather
	// show a 0 in case there were no source results :-).
	if t4.IsZero() {
		t4 = t3
	}

	// Add our measurements as HTTP headers so that we can log them in
	// nginx.
	outHeader := w.Header()
	// time to first regexp result
	outHeader.Add("dcs-t0", fmt.Sprintf("%.2fms", float32(t1.Sub(t0).Nanoseconds())/1000/1000))
	// time to receive and rank
	outHeader.Add("dcs-t1", fmt.Sprintf("%.2fms", float32(t2.Sub(t1).Nanoseconds())/1000/1000))
	// time to sort
	outHeader.Add("dcs-t2", fmt.Sprintf("%.2fms", float32(t3.Sub(t2).Nanoseconds())/1000/1000))
	// time to first index result
	outHeader.Add("dcs-t3", fmt.Sprintf("%.2fms", float32(t4.Sub(t3).Nanoseconds())/1000/1000))
	// amount of regexp results
	outHeader.Add("dcs-numfiles", fmt.Sprintf("%d", len(files)))
	// amount of source results
	outHeader.Add("dcs-numresults", fmt.Sprintf("%d", len(results)))

	// Show a helpful message when there are no search results instead of
	// just an empty list.
	if len(results) == 0 {
		err := common.Templates.ExecuteTemplate(w, "error.html", map[string]interface{}{
			"q":          r.URL.Query().Get("q"),
			"errormsg":   "No search results!",
			"suggestion": template.HTML(`Debian Code Search is case-sensitive. Also, search queries are interpreted as <a href="http://codesearch.debian.net/faq#regexp">regular expressions</a>.`),
		})
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
		return
	}

	// NB: We send the template output to a buffer because that is faster.
	// We also just use the template for the header of the page and then
	// print the results directly from Go, which saves ≈ 10 ms (!).
	outputBuffer := new(bytes.Buffer)
	err = common.Templates.ExecuteTemplate(outputBuffer, "results.html", map[string]interface{}{
		//"results": results,
		"t0":         t1.Sub(t0),
		"t1":         t2.Sub(t1),
		"t2":         t3.Sub(t2),
		"t3":         t4.Sub(t3),
		"numfiles":   len(files),
		"numresults": len(results),
		"timing":     (rewritten.Query().Get("notiming") != "1"),
		"q":          r.URL.Query().Get("q"),
	})
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
	}
	outputBuffer.WriteTo(w)

	context := make([]string, 5)
	for _, result := range results {
		ctx := context[:0]
		if val := strings.TrimSpace(result.Ctxp2); val != "" {
			ctx = append(ctx, result.Ctxp2)
		}
		if val := strings.TrimSpace(result.Ctxp1); val != "" {
			ctx = append(ctx, result.Ctxp1)
		}
		ctx = append(ctx, "<strong>"+result.Context+"</strong>")
		if val := strings.TrimSpace(result.Ctxn1); val != "" {
			ctx = append(ctx, result.Ctxn1)
		}
		if val := strings.TrimSpace(result.Ctxn2); val != "" {
			ctx = append(ctx, result.Ctxn2)
		}
		fmt.Fprintf(w, `<li><a href="/show?file=%s&line=%d&numfiles=%d#L%d"><code><strong>%s</strong>%s:%d</code></a><br><pre>%s</pre> <small>PathRank: %g, Rank: %g, Final: %g</small></li>`+"\n",
			url.QueryEscape(result.SourcePackage+result.RelativePath),
			result.Line,
			len(files),
			result.Line,
			result.SourcePackage,
			result.RelativePath,
			result.Line,
			strings.Replace(strings.Join(ctx, "<br>"), "\t", " ", -1),
			result.PathRanking,
			result.Ranking,
			result.FinalRanking)
	}
	fmt.Fprintf(w, "</ul>")

	fmt.Fprintf(w, `<div id="pagination">`)
	if skip > 0 {
		urlCopy := *r.URL
		queryCopy := urlCopy.Query()
		// Pop one value from nextPrev
		prev := strings.Split(queryCopy.Get("prev"), ".")
		// We always have one element, but let’s make sure — it’s user-input
		// after all.
		if len(prev) > 0 {
			queryCopy.Set("skip", prev[len(prev)-1])
			queryCopy.Set("prev", strings.Join(prev[:len(prev)-1], "."))
			urlCopy.RawQuery = queryCopy.Encode()
			fmt.Fprintf(w, `<a href="%s">Previous page</a><div style="display: inline-block; width: 100px"> </div>`, urlCopy.RequestURI())
		}
	}

	if skip != lastUsedFilename {
		urlCopy := *r.URL
		queryCopy := urlCopy.Query()
		queryCopy.Set("skip", fmt.Sprintf("%d", lastUsedFilename))
		nextPrev := queryCopy.Get("prev")
		if nextPrev == "" {
			nextPrev = "0"
		} else {
			// We use a dot as a separator because it doesn’t get
			// url-encoded (see RFC 3986 section 2.3).
			nextPrev = fmt.Sprintf("%s.%d", nextPrev, skip)
		}
		queryCopy.Set("prev", nextPrev)
		urlCopy.RawQuery = queryCopy.Encode()
		fmt.Fprintf(w, `<a href="%s">Next page</a>`, urlCopy.RequestURI())
	}

	err = common.Templates.ExecuteTemplate(w, "footer.html", map[string]interface{}{
		"version": common.Version,
	})
	if err != nil {
		log.Printf("template error: %v\n", err.Error())
		// We cannot use http.Error since it sends headers and we already
		// did that.
		//http.Error(w, err.Error(), http.StatusInternalServerError)
	}

	if len(*timingTotalPath) > 0 {
		fmt.Fprintf(tTotal, "%d\t%d\n", requestCounter, time.Now().Sub(t0).Nanoseconds()/1000/1000)
	}
	if len(*timingFirstRegexp) > 0 {
		fmt.Fprintf(tFirstRegexp, "%d\t%d\n", requestCounter, t1.Sub(t0).Nanoseconds()/1000/1000)
	}
	if len(*timingFirstIndex) > 0 {
		fmt.Fprintf(tFirstIndex, "%d\t%d\n", requestCounter, t4.Sub(t3).Nanoseconds()/1000/1000)
	}
	if len(*timingReceiveRank) > 0 {
		fmt.Fprintf(tReceiveRank, "%d\t%d\n", requestCounter, t2.Sub(t1).Nanoseconds()/1000/1000)
	}
	if len(*timingSort) > 0 {
		fmt.Fprintf(tSort, "%d\t%d\n", requestCounter, t3.Sub(t2).Nanoseconds()/1000/1000)
	}
	requestCounter++
}
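// The pagination above threads a stack of earlier "skip" offsets through
// the URL as a dot-separated "prev" parameter (dots survive URL encoding,
// see RFC 3986 section 2.3). A sketch of that scheme with hypothetical
// helper names:
func pushPrev(prev string, skip int) string {
	if prev == "" {
		return "0"
	}
	return fmt.Sprintf("%s.%d", prev, skip)
}

func popPrev(prev string) (top, rest string) {
	parts := strings.Split(prev, ".")
	return parts[len(parts)-1], strings.Join(parts[:len(parts)-1], ".")
}

// For example: pushPrev("0", 1000) == "0.1000", and
// popPrev("0.1000") == ("1000", "0").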