// parseDetails walks the links reported by render.js, converts each one to
// a new scan, and forwards those that pass the whitelist check. It factors
// out the link handling shared by the response and DOM message paths in
// extract below.
func (r *PhantomJSRenderer) parseDetails(d *details, s *gryffin.Scan) {
	for _, link := range d.Links {
		if newScan := link.toScan(s); newScan != nil && newScan.IsScanAllowed() {
			r.chanLinks <- newScan
		}
	}
}
// extract streams JSON messages from the PhantomJS stdout pipe, fills the
// scan from response messages, and forwards any discovered links. It
// signals completion by closing r.done.
func (r *PhantomJSRenderer) extract(stdout io.ReadCloser, s *gryffin.Scan) {
	defer close(r.done)

	dec := json.NewDecoder(stdout)
	for {
		var m message
		if err := dec.Decode(&m); err != nil {
			// io.EOF means render.js closed its stdout; treat any other
			// decode error the same way instead of spinning on it.
			return
		}

		if m.responseMessage != nil {
			m.Response.fill(s)
			if s.IsDuplicatedPage() {
				return
			}
			r.chanResponse <- s
			r.parseDetails(&m.Response.Details, s)
		}

		if m.details != nil {
			r.parseDetails(m.details, s)
		}
	}
}
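// decodeStream is an illustrative sketch (not part of gryffin) of the
// streaming pattern extract relies on: render.js emits one JSON object per
// message, and json.Decoder consumes exactly one value per Decode call, so
// each message can be processed as soon as it arrives on the pipe. The msg
// payload below is hypothetical; the real message type is richer.
func decodeStream(rd io.Reader) ([]string, error) {
	type msg struct {
		MsgType string `json:"msgType"`
	}
	var seen []string
	dec := json.NewDecoder(rd)
	for {
		var m msg
		err := dec.Decode(&m)
		if err == io.EOF {
			return seen, nil
		}
		if err != nil {
			return seen, err
		}
		seen = append(seen, m.MsgType)
	}
}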
func (r *PhantomJSRenderer) Do(s *gryffin.Scan) {
	r.chanResponse = make(chan *gryffin.Scan, 10)
	r.chanLinks = make(chan *gryffin.Scan, 10)

	// Construct the command.
	// render.js http(s)://<host>[:port][/path] [{"method":"post", "data":"a=1&b=2"}]
	url := s.Request.URL.String()
	cookies := make([]string, 0)
	// ua := s.Request.UserAgent()
	ua := "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"

	for _, c := range s.Cookies {
		cookies = append(cookies, c.String())
	}

	arg := input{
		Method: s.Request.Method,
		Headers: inputHeaders{
			UserAgent: ua,
			Cookie:    strings.Join(cookies, ";"),
		},
	}
	opt, err := json.Marshal(arg)
	if err != nil {
		s.Error("PhantomjsRenderer.Do", err)
		return
	}

	// s.Logmf("PhantomjsRenderer.Do", "Running: render.js %s '%s'", url, string(opt))
	s.Logmf("PhantomjsRenderer.Do", "Running: render.js")
	cmd := exec.Command(
		os.Getenv("GOPATH")+"/src/github.com/yahoo/gryffin/renderer/resource/render.js",
		url,
		string(opt))

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		s.Error("PhantomjsRenderer.Do", err)
		return
	}

	if err := cmd.Start(); err != nil {
		s.Error("PhantomjsRenderer.Do", err)
		return
	}

	kill := func(reason string) {
		if err := cmd.Process.Kill(); err != nil {
			// TODO - forgive "os: process already finished"
			s.Error("PhantomjsRenderer.Do", err)
		} else {
			s.Logmf("PhantomjsRenderer.Do", "[%s] Terminating the crawl process.", reason)
		}
	}

	// Kill the crawl process when the timeout elapses.
	if r.Timeout != 0 {
		go func() {
			<-time.After(time.Duration(r.Timeout) * time.Second)
			kill("Timeout")
		}()
	}

	crawl := func() {
		defer close(r.chanResponse)
		defer close(r.chanLinks)

		dec := json.NewDecoder(stdout)
		for {
			var m message
			if err := dec.Decode(&m); err != nil {
				// io.EOF means the crawler exited; break (rather than
				// return) so cmd.Wait below can reap the process. Any
				// other decode error also ends the crawl.
				break
			}

			if m.responseMessage != nil {
				m.Response.fill(s)
				if s.IsDuplicatedPage() {
					kill("Duplicated")
					break
				}
				s.Logm("PhantomjsRenderer.Do.UniqueCrawl", m.MsgType)
				r.chanResponse <- s
				for _, link := range m.Response.Details.Links {
					if newScan := link.toScan(s); newScan != nil && newScan.IsScanAllowed() {
						r.chanLinks <- newScan
					}
				}
			} else if m.domMessage != nil {
				for _, link := range m.domMessage.Links {
					if newScan := link.toScan(s); newScan != nil && newScan.IsScanAllowed() {
						r.chanLinks <- newScan
					}
				}
			}
		}
		cmd.Wait()
	}

	go crawl()
}
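// examplePhantomJSUsage is a hypothetical in-package sketch (not part of
// gryffin) of driving the renderer for a prepared scan and draining both
// output channels. The links channel is consumed concurrently so neither
// channel can block crawl's sends once its 10-slot buffer fills.
func examplePhantomJSUsage(s *gryffin.Scan) {
	r := &PhantomJSRenderer{Timeout: 30}
	r.Do(s)
	go func() {
		for link := range r.chanLinks {
			_ = link // e.g. enqueue for further crawling
		}
	}()
	for resp := range r.chanResponse {
		_ = resp // e.g. hand off to the fuzzers
	}
}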
func (r *NoScriptRenderer) Do(s *gryffin.Scan) {
	r.chanResponse = make(chan *gryffin.Scan, 10)
	r.chanLinks = make(chan *gryffin.Scan, 10)

	crawl := func() {
		defer close(r.chanResponse)
		defer close(r.chanLinks)

		client := &http.Client{Timeout: 3 * time.Second}

		if response, err := client.Do(s.Request); err == nil {
			s.Response = response
		} else {
			s.Logm("NoScriptRenderer", fmt.Sprintf("error in making request: %s", err))
			return
		}

		s.ReadResponseBody()

		if s.IsDuplicatedPage() {
			return
		}

		tokenizer := html.NewTokenizer(strings.NewReader(s.ResponseBody))

		r.chanResponse <- s

		// Parse the response body and extract links from anchor tags.
		for {
			t := tokenizer.Next()

			switch t {
			case html.ErrorToken:
				// End of document (or a parse error): stop crawling.
				return

			case html.StartTagToken:
				token := tokenizer.Token()
				if token.DataAtom.String() != "a" {
					continue
				}
				for _, attr := range token.Attr {
					if attr.Key != "href" {
						continue
					}
					link := s.Spawn()
					// TODO - relative URLs are currently accepted as-is;
					// resolving them against the page URL would also drop
					// fragment-only links ("#...").
					if req, err := http.NewRequest("GET", attr.Val, nil); err == nil {
						link.MergeRequest(req)
						if link.IsScanAllowed() {
							r.chanLinks <- link
						}
					} else {
						log.Printf("error in building request: %s", err)
					}
				}
			}
		}
	}

	go crawl()
}
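// resolveHref is an illustrative sketch (not used above) of one way to
// address the relative-URL TODO: resolve each href against the page URL
// before building the request, which normalizes relative links while
// dropping only the fragment. Assumes a net/url import.
func resolveHref(base *url.URL, href string) (*http.Request, error) {
	u, err := url.Parse(href)
	if err != nil {
		return nil, err
	}
	u = base.ResolveReference(u)
	u.Fragment = "" // fragment-only links collapse back to the page itself
	return http.NewRequest("GET", u.String(), nil)
}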