예제 #1
0
파일: search.go 프로젝트: krzsas/dcs
func sendIndexQuery(query url.URL, backend string, indexResults chan ranking.ResultPath, done chan int, rankingopts ranking.RankingOpts) {
	t0 := time.Now()
	query.Scheme = "http"
	query.Host = backend
	query.Path = "/index"
	log.Printf("asking %s\n", query.String())
	resp, err := http.Get(query.String())
	if err != nil {
		done <- 1
		return
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		done <- 1
		return
	}

	var files []string
	if err := json.Unmarshal(body, &files); err != nil {
		// TODO: Better error message
		log.Printf("Invalid result from backend " + backend)
		done <- 1
		return
	}

	t1 := time.Now()

	log.Printf("[%s] %d results in %v\n", backend, len(files), t1.Sub(t0))

	var result ranking.ResultPath
	for _, filename := range files {
		result.Path = filename
		result.Rank(&rankingopts)
		if result.Ranking > -1 {
			indexResults <- result
		}
	}
	done <- 1
}
예제 #2
0
// Reads a single JSON request from the TCP connection, performs the search and
// sends results back over the TCP connection as they appear.
func streamingQuery(conn net.Conn) {
	defer conn.Close()
	connMu := new(sync.Mutex)
	logprefix := fmt.Sprintf("[%s]", conn.RemoteAddr().String())

	type sourceRequest struct {
		Query string
		// Rewritten URL (after RewriteQuery()) with all the parameters that
		// are relevant for ranking.
		URL string
	}

	var r sourceRequest
	if err := json.NewDecoder(conn).Decode(&r); err != nil {
		log.Printf("%s Could not parse JSON request: %v\n", logprefix, err)
		return
	}

	logprefix = fmt.Sprintf("%s [%q]", logprefix, r.Query)

	// Ask the local index backend for all the filenames.
	filenames, err := queryIndexBackend(r.Query)
	if err != nil {
		log.Printf("%s Error querying index backend for query %q: %v\n", logprefix, r.Query, err)
		return
	}

	// Parse the (rewritten) URL to extract all ranking options/keywords.
	rewritten, err := url.Parse(r.URL)
	if err != nil {
		log.Fatal(err)
	}
	rankingopts := ranking.RankingOptsFromQuery(rewritten.Query())

	// Rank all the paths.
	files := make(ranking.ResultPaths, 0, len(filenames))
	for _, filename := range filenames {
		result := ranking.ResultPath{Path: filename}
		result.Rank(&rankingopts)
		if result.Ranking > -1 {
			files = append(files, result)
		}
	}

	// Filter all files that should be excluded.
	files = filterByKeywords(rewritten, files)

	// While not strictly necessary, this will lead to better results being
	// discovered (and returned!) earlier, so let’s spend a few cycles on
	// sorting the list of potential files first.
	sort.Sort(files)

	re, err := regexp.Compile(r.Query)
	if err != nil {
		log.Printf("%s Could not compile regexp: %v\n", logprefix, err)
		return
	}

	log.Printf("%s regexp = %q, %d possible files\n", logprefix, re, len(files))

	// Send the first progress update so that clients know how many files are
	// going to be searched.
	if _, err := sendProgressUpdate(conn, connMu, 0, len(files)); err != nil {
		log.Printf("%s %v\n", logprefix, err)
		return
	}

	// The tricky part here is “flow control”: if we just start grepping like
	// crazy, we will eventually run out of memory because all our writes are
	// blocked on the connection (and the goroutines need to keep the write
	// buffer in memory until the write is done).
	//
	// So instead, we start 1000 worker goroutines and feed them work through a
	// single channel. Due to these these goroutines being blocked on writing,
	// the grepping will naturally become slower.
	work := make(chan ranking.ResultPath)
	progress := make(chan int)

	var wg sync.WaitGroup
	// We add the additional 1 for the progress updater goroutine. It also
	// needs to be done before we can return, otherwise it will try to use the
	// (already closed) network connection, which is a fatal error.
	wg.Add(len(files) + 1)

	go func() {
		for _, file := range files {
			work <- file
		}
		close(work)
	}()

	go func() {
		cnt := 0
		errorShown := false
		var lastProgressUpdate time.Time
		progressInterval := 2*time.Second + time.Duration(rand.Int63n(int64(500*time.Millisecond)))
		for cnt < len(files) {
			add := <-progress
			cnt += add

			if time.Since(lastProgressUpdate) > progressInterval {
				if _, err := sendProgressUpdate(conn, connMu, cnt, len(files)); err != nil {
					if !errorShown {
						log.Printf("%s %v\n", logprefix, err)
						// We need to read the 'progress' channel, so we cannot
						// just exit the loop here. Instead, we suppress all
						// error messages after the first one.
						errorShown = true
					}
				}
				lastProgressUpdate = time.Now()
			}
		}

		if _, err := sendProgressUpdate(conn, connMu, len(files), len(files)); err != nil {
			log.Printf("%s %v\n", logprefix, err)
		}
		close(progress)

		wg.Done()
	}()

	querystr := ranking.NewQueryStr(r.Query)

	numWorkers := 1000
	if len(files) < 1000 {
		numWorkers = len(files)
	}
	for i := 0; i < numWorkers; i++ {
		go func() {
			re, err := regexp.Compile(r.Query)
			if err != nil {
				log.Printf("%s\n", err)
				return
			}

			grep := regexp.Grep{
				Regexp: re,
				Stdout: os.Stdout,
				Stderr: os.Stderr,
			}

			for file := range work {
				sourcePkgName := file.Path[file.SourcePkgIdx[0]:file.SourcePkgIdx[1]]
				if rankingopts.Pathmatch {
					file.Ranking += querystr.Match(&file.Path)
				}
				if rankingopts.Sourcepkgmatch {
					file.Ranking += querystr.Match(&sourcePkgName)
				}
				if rankingopts.Weighted {
					file.Ranking += 0.1460 * querystr.Match(&file.Path)
					file.Ranking += 0.0008 * querystr.Match(&sourcePkgName)
				}

				// TODO: figure out how to safely clone a dcs/regexp
				matches := grep.File(path.Join(*unpackedPath, file.Path))
				for _, match := range matches {
					match.Ranking = ranking.PostRank(rankingopts, &match, &querystr)
					match.PathRank = file.Ranking
					//match.Path = match.Path[len(*unpackedPath):]
					// NB: populating match.Ranking happens in
					// cmd/dcs-web/querymanager because it depends on at least
					// one other result.

					// TODO: ideally, we’d get capn buffers from grep.File(), let’s do that after profiling the decoding performance
					seg := capn.NewBuffer(nil)
					z := proto.NewRootZ(seg)
					m := proto.NewMatch(seg)
					m.SetPath(match.Path[len(*unpackedPath):])
					m.SetLine(uint32(match.Line))
					m.SetPackage(m.Path()[:strings.Index(m.Path(), "/")])
					m.SetCtxp2(match.Ctxp2)
					m.SetCtxp1(match.Ctxp1)
					m.SetContext(match.Context)
					m.SetCtxn1(match.Ctxn1)
					m.SetCtxn2(match.Ctxn2)
					m.SetPathrank(match.PathRank)
					m.SetRanking(match.Ranking)
					z.SetMatch(m)

					connMu.Lock()
					if _, err := seg.WriteToPacked(conn); err != nil {
						connMu.Unlock()
						log.Printf("%s %v\n", logprefix, err)
						// Drain the work channel, but without doing any work.
						// This effectively exits the worker goroutine(s)
						// cleanly.
						for _ = range work {
						}
						break
					}
					connMu.Unlock()
				}

				progress <- 1

				wg.Done()
			}
		}()
	}

	wg.Wait()

	log.Printf("%s Sent all results.\n", logprefix)
}