func (m *response) fill(s *gryffin.Scan) { /* {"response":{"headers":{"Date":["Thu, 30 Jul 2015 00:13:43 GMT"],"Set-Cookie":["B=82j3nrdarir1n&b=3&s=23; expires=Sun, 30-Jul-2017 00:13:43 GMT; path=/; domain=.yahoo.com"] */ resp := &http.Response{ Request: s.Request, StatusCode: m.Status, Status: strconv.FormatInt(int64(m.Status), 10), Proto: "HTTP/1.1", ProtoMajor: 1, ProtoMinor: 1, Header: m.Headers, Body: noCloseReader{strings.NewReader(m.Body)}, } s.Response = resp s.ReadResponseBody() }
func (r *NoScriptRenderer) Do(s *gryffin.Scan) { r.chanResponse = make(chan *gryffin.Scan, 10) r.chanLinks = make(chan *gryffin.Scan, 10) crawl := func() { defer close(r.chanResponse) defer close(r.chanLinks) client := &http.Client{} client.Timeout = time.Duration(3) * time.Second if response, err := client.Do(s.Request); err == nil { s.Response = response } else { s.Logm("NoScriptRenderer", fmt.Sprintf("error in building request: %s", err)) return } s.ReadResponseBody() if s.IsDuplicatedPage() { return } tokenizer := html.NewTokenizer(strings.NewReader(s.ResponseBody)) r.chanResponse <- s for { t := tokenizer.Next() switch t { case html.ErrorToken: return case html.StartTagToken: token := tokenizer.Token() if token.DataAtom.String() == "a" { for _, attr := range token.Attr { if attr.Key == "href" { link := s.Spawn() // TODO - we drop relative URL as it would drop "#". // Yet, how about real relative URLs? if req, err := http.NewRequest("GET", attr.Val, nil); err == nil { if true { // || req.URL.IsAbs() { link.MergeRequest(req) if link.IsScanAllowed() { r.chanLinks <- link } } else { // ignore relative URL. TOFIX. } } else { log.Printf("error in building request: %s", err) } } } } } } // parse and find links. } go crawl() return }