Example #1
// BeforeAction logs request and response data and times the handler
// h's execution.
func BeforeAction(h func(http.ResponseWriter, *http.Request), contentType string) func(http.ResponseWriter, *http.Request) {

	return func(rw http.ResponseWriter, req *http.Request) {
		// TimerBegin runs now; the deferred TimerEnd fires when the handler returns
		defer logs.TimerEnd(logs.TimerBegin(fmt.Sprintf("%s '%s'", req.Method, req.URL.Path)))

		// log request headers
		logs.Log(fmt.Sprintf("Request headers: %s", req.Header))

		// parse params
		err := req.ParseForm()
		if logs.CheckErr(err) {
			http.Error(rw, err.Error(), http.StatusInternalServerError)
			return
		}

		// new recorder for logging/middleware
		rec := httptest.NewRecorder()
		// set content-type
		if contentType != "" {
			rec.Header().Set("Content-Type", contentType)
		}

		h(rec, req)

		// log response
		logs.Log(fmt.Sprintf("Response status: %d", rec.Code))
		logs.Log(fmt.Sprintf("Response headers: %s", rec.Header()))

		// copy to actual ResponseWriter
		copyResponse(rw, rec)
	}
}
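copyResponse is referenced above but is not part of the excerpt. A minimal sketch of what it might do, assuming it simply replays the recorded status, headers, and body onto the real http.ResponseWriter:

func copyResponse(rw http.ResponseWriter, rec *httptest.ResponseRecorder) {
	// copy headers set by the wrapped handler (plus the Content-Type set above)
	for key, values := range rec.Header() {
		for _, v := range values {
			rw.Header().Add(key, v)
		}
	}
	// the status code has to go out after the headers and before the body
	rw.WriteHeader(rec.Code)
	// replay the buffered body
	rw.Write(rec.Body.Bytes())
}

Recording into httptest.NewRecorder first is what lets the middleware inspect the status and headers before anything reaches the client.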
Example #2
func getLinks(u *url.URL) []*url.URL {

	resp, err := http.Get(u.String())
	if err != nil {
		logs.Log(fmt.Sprintf("Couldn't crawl %s", u))
	}
	defer resp.Body.Close()

	links := make([]*url.URL, 0)
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		tokenType := tokenizer.Next()
		switch tokenType {
		case html.ErrorToken:
			return links
		case html.StartTagToken, html.SelfClosingTagToken:
			token := tokenizer.Token()
			if link, ok := getURL(u, token); ok {
				links = append(links, link)
			}
		}
	}
}
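getURL is also outside the excerpt. A plausible sketch, assuming it pulls href attributes out of <a> tags, resolves them against the page URL, and keeps the crawl on the original host (those last two points are assumptions):

func getURL(base *url.URL, token html.Token) (*url.URL, bool) {
	if token.Data != "a" {
		return nil, false
	}
	for _, attr := range token.Attr {
		if attr.Key != "href" {
			continue
		}
		// resolve relative links against the page they were found on
		link, err := base.Parse(attr.Val)
		if err != nil {
			return nil, false
		}
		// assumed: stay on the host being mapped
		if link.Host != base.Host {
			return nil, false
		}
		// drop fragments so /page and /page#section aren't counted twice
		link.Fragment = ""
		return link, true
	}
	return nil, false
}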
Example #3
func Crawl(u *url.URL) []map[string]interface{} {

	sitemap := &Sitemap{Host: u.Host, Nodes: make(map[string]*Node), Ordered: make([]string, 0)}
	sitemap.Nodes[u.String()] = &Node{URL: u, Neighbors: make(map[string]bool)}
	sitemap.Ordered = append(sitemap.Ordered, u.String())

	// create incoming and outgoing channels for workers
	urls := make(chan *url.URL, 1000)
	results := make(chan *result, 100)

	// create worker pool
	nWorkers := runtime.NumCPU()
	runtime.GOMAXPROCS(nWorkers) // redundant on Go 1.5+, where this is already the default
	for i := 0; i < nWorkers; i++ {
		go worker(urls, results)
	}

	// add initial url and track outstanding jobs to know when to terminate
	urls <- u
	outstanding := 1

	for count := 1; ; count++ {
		res := <-results
		newLinks := sitemap.update(res)
		outstanding += len(newLinks) - 1
		for _, link := range newLinks {
			urls <- link
		}

		if outstanding == 0 {
			close(urls)
			logs.Log(fmt.Sprintf("Crawled %d urls total", count))
			break
		}

		if count%chunkSize == 0 {
			logs.Log(fmt.Sprintf("Crawled %d urls so far", count))
			logs.Log(fmt.Sprintf("%d urls pending", outstanding))
		}
	}

	return sitemap.simplify()
}
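The result type, the worker function, and the Sitemap methods are defined elsewhere. A sketch of the worker side under two assumptions: a result carries the source URL plus the links found on that page, and Sitemap.update returns only links it has not seen before, which is what keeps the outstanding counter accurate:

// result is what a worker reports back for one crawled page (assumed shape).
type result struct {
	url   *url.URL
	links []*url.URL
}

// worker pulls URLs off the queue, crawls them, and reports the links found.
// It exits when Crawl closes the urls channel.
func worker(urls <-chan *url.URL, results chan<- *result) {
	for u := range urls {
		results <- &result{url: u, links: getLinks(u)}
	}
}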
Example #4
func main() {
	r := mux.NewRouter()
	r.HandleFunc("/",
		handlers.BeforeAction(handlers.Index, "text/html")).Methods("GET")
	r.HandleFunc("/crawl",
		handlers.BeforeAction(handlers.Crawl, "application/json")).Methods("GET")
	r.PathPrefix("/assets/").Handler(
		http.StripPrefix("/assets/", http.FileServer(http.Dir("./assets/"))))

	logs.Log("Starting server on port 8000")
	logs.CheckFatal(http.ListenAndServe(":8000", r))
}
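The logs package used throughout these examples is not shown either. A minimal sketch of the helpers the excerpts rely on (the names come from the calls above; the bodies are assumptions):

package logs

import (
	"fmt"
	"log"
	"time"
)

// Log prints a message through the standard logger.
func Log(msg string) {
	log.Println(msg)
}

// CheckErr logs err and reports whether it was non-nil.
func CheckErr(err error) bool {
	if err != nil {
		log.Println(err)
		return true
	}
	return false
}

// CheckFatal logs err and exits if it is non-nil.
func CheckFatal(err error) {
	if err != nil {
		log.Fatal(err)
	}
}

// timer pairs a label with a start time so TimerEnd can report elapsed time.
type timer struct {
	label string
	start time.Time
}

// TimerBegin starts timing the labelled operation; its return value is meant
// to be handed straight to TimerEnd, typically via defer.
func TimerBegin(label string) *timer {
	return &timer{label: label, start: time.Now()}
}

// TimerEnd logs how long the labelled operation took.
func TimerEnd(t *timer) {
	Log(fmt.Sprintf("%s took %s", t.label, time.Since(t.start)))
}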