// BeforeAction logs request and response data and times the handler
// h's execution.
func BeforeAction(h func(http.ResponseWriter, *http.Request), contentType string) func(http.ResponseWriter, *http.Request) {
	return func(rw http.ResponseWriter, req *http.Request) {
		defer logs.TimerEnd(logs.TimerBegin(fmt.Sprintf("%s '%s'", req.Method, req.URL.Path)))

		// log request headers
		logs.Log(fmt.Sprintf("Request headers: %s", req.Header))

		// parse params
		err := req.ParseForm()
		if logs.CheckErr(err) {
			http.Error(rw, err.Error(), http.StatusInternalServerError)
			return
		}

		// new recorder for logging/middleware
		rec := httptest.NewRecorder()

		// set content-type
		if contentType != "" {
			rec.Header().Set("Content-Type", contentType)
		}

		h(rec, req)

		// log response
		logs.Log(fmt.Sprintf("Response status: %d", rec.Code))
		logs.Log(fmt.Sprintf("Response headers: %s", rec.Header()))

		// copy to actual ResponseWriter
		copyResponse(rw, rec)
	}
}
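BeforeAction relies on a copyResponse helper that is not shown in this listing; a minimal sketch, assuming it simply replays the recorded headers, status code, and body onto the real writer:

// copyResponse (sketch, not the original implementation): copy the recorded
// headers first, then the status code, then the body, since headers written
// after WriteHeader are ignored.
func copyResponse(rw http.ResponseWriter, rec *httptest.ResponseRecorder) {
	for key, values := range rec.Header() {
		for _, value := range values {
			rw.Header().Add(key, value)
		}
	}
	rw.WriteHeader(rec.Code)
	rw.Write(rec.Body.Bytes())
}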
func getLinks(u *url.URL) []*url.URL {
	resp, err := http.Get(u.String())
	if err != nil {
		// bail out early: dereferencing resp.Body after a failed request
		// would panic on a nil response
		logs.Log(fmt.Sprintf("Couldn't crawl %s", u))
		return nil
	}
	defer resp.Body.Close()

	links := make([]*url.URL, 0)
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		tokenType := tokenizer.Next()
		switch tokenType {
		case html.ErrorToken:
			// ErrorToken covers io.EOF as well as real parse errors; either
			// way we are done with this page
			return links
		case html.StartTagToken, html.SelfClosingTagToken:
			token := tokenizer.Token()
			if link, ok := getURL(u, token); ok {
				links = append(links, link)
			}
		}
	}
}
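getLinks leans on a getURL helper that is not part of this listing; a rough sketch under the assumption that it extracts hrefs from anchor tags, resolves them against the page URL, and keeps only same-host links:

// getURL (sketch, assumed behavior): turn an <a href="..."> token into an
// absolute same-host URL, or report ok=false if the token is not a usable link.
func getURL(base *url.URL, token html.Token) (*url.URL, bool) {
	if token.Data != "a" {
		return nil, false
	}
	for _, attr := range token.Attr {
		if attr.Key != "href" {
			continue
		}
		ref, err := url.Parse(attr.Val)
		if err != nil {
			return nil, false
		}
		link := base.ResolveReference(ref)
		if link.Host != base.Host {
			return nil, false
		}
		link.Fragment = "" // treat page#section the same as page
		return link, true
	}
	return nil, false
}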
func Crawl(u *url.URL) []map[string]interface{} {
	sitemap := &Sitemap{Host: u.Host, Nodes: make(map[string]*Node), Ordered: make([]string, 0)}
	sitemap.Nodes[u.String()] = &Node{URL: u, Neighbors: make(map[string]bool)}
	sitemap.Ordered = append(sitemap.Ordered, u.String())

	// create incoming and outgoing channels for workers
	urls := make(chan *url.URL, 1000)
	results := make(chan *result, 100)

	// create worker pool
	nWorkers := runtime.NumCPU()
	runtime.GOMAXPROCS(nWorkers)
	for i := 0; i < nWorkers; i++ {
		go worker(urls, results)
	}

	// add initial url and track outstanding jobs to know when to terminate
	urls <- u
	outstanding := 1
	for count := 1; ; count++ {
		res := <-results
		newLinks := sitemap.update(res)
		outstanding += len(newLinks) - 1
		for _, link := range newLinks {
			urls <- link
		}

		if outstanding == 0 {
			close(urls)
			logs.Log(fmt.Sprintf("Crawled %d urls total", count))
			break
		}
		if count%chunkSize == 0 {
			logs.Log(fmt.Sprintf("Crawled %d urls so far", count))
			logs.Log(fmt.Sprintf("%d urls pending", outstanding))
		}
	}

	return sitemap.simplify()
}
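Crawl fans work out to a pool of workers, but neither worker nor result is defined in this listing; a minimal sketch, assuming each worker fetches one URL at a time via getLinks and reports what it found back to Crawl:

// result and worker (sketch, assumed shape): a result pairs the crawled URL
// with the links discovered on that page; each worker loops over the urls
// channel until Crawl closes it.
type result struct {
	url   *url.URL
	links []*url.URL
}

func worker(urls <-chan *url.URL, results chan<- *result) {
	for u := range urls {
		results <- &result{url: u, links: getLinks(u)}
	}
}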
func main() {
	r := mux.NewRouter()
	r.HandleFunc("/", handlers.BeforeAction(handlers.Index, "text/html")).Methods("GET")
	r.HandleFunc("/crawl", handlers.BeforeAction(handlers.Crawl, "application/json")).Methods("GET")
	r.PathPrefix("/assets/").Handler(
		http.StripPrefix("/assets/", http.FileServer(http.Dir("./assets/"))))

	logs.Log("Starting server on port 8000")
	logs.CheckFatal(http.ListenAndServe(":8000", r))
}
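handlers.Crawl is registered above but its body is not shown; a speculative sketch, assuming it reads the target from a "url" form value (hypothetical parameter name), invokes the crawler (here assumed to live in a package named crawler), and writes the result as JSON:

// Crawl (sketch, assumed handler in package handlers): BeforeAction has
// already called ParseForm, so FormValue is safe to use here.
func Crawl(rw http.ResponseWriter, req *http.Request) {
	target, err := url.Parse(req.FormValue("url"))
	if err != nil {
		http.Error(rw, err.Error(), http.StatusBadRequest)
		return
	}
	sitemap := crawler.Crawl(target)
	if err := json.NewEncoder(rw).Encode(sitemap); err != nil {
		http.Error(rw, err.Error(), http.StatusInternalServerError)
	}
}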