Example #1
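// fill converts the rendered response into an *http.Response, attaches it
// to the scan, and reads the response body.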
func (m *response) fill(s *gryffin.Scan) {

	/*
	   Sample of the rendered response JSON this method consumes (truncated):
	   {"response":{"headers":{"Date":["Thu, 30 Jul 2015 00:13:43 GMT"],"Set-Cookie":["B=82j3nrdarir1n&b=3&s=23; expires=Sun, 30-Jul-2017 00:13:43 GMT; path=/; domain=.yahoo.com"]
	*/
	resp := &http.Response{
		Request:    s.Request,
		StatusCode: m.Status,
		Status:     strconv.Itoa(m.Status),
		Proto:      "HTTP/1.1",
		ProtoMajor: 1,
		ProtoMinor: 1,
		Header:     m.Headers,
		Body:       noCloseReader{strings.NewReader(m.Body)},
	}

	s.Response = resp
	s.ReadResponseBody()

}
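http.Response.Body requires an io.ReadCloser, while strings.NewReader only yields an io.Reader, so the snippet leans on a noCloseReader adapter defined elsewhere in the package. A minimal sketch of what such an adapter could look like (an assumption; the package's real definition may differ):

// noCloseReader wraps an io.Reader to satisfy io.ReadCloser with a
// no-op Close, so an in-memory string can serve as a response body.
// (Sketch only; assumes `import "io"`.)
type noCloseReader struct {
	io.Reader
}

func (noCloseReader) Close() error { return nil }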
Example #2
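// Do fetches the scan's request without JavaScript rendering, publishes the
// fetched page on chanResponse, and emits a spawned scan on chanLinks for
// every allowed <a href>.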
func (r *NoScriptRenderer) Do(s *gryffin.Scan) {
	r.chanResponse = make(chan *gryffin.Scan, 10)
	r.chanLinks = make(chan *gryffin.Scan, 10)

	crawl := func() {

		defer close(r.chanResponse)
		defer close(r.chanLinks)

		client := &http.Client{Timeout: 3 * time.Second}

		if response, err := client.Do(s.Request); err == nil {
			s.Response = response
		} else {
			s.Logm("NoScriptRenderer", fmt.Sprintf("error in sending request: %s", err))
			return
		}

		s.ReadResponseBody()

		// Skip pages whose body has already been seen.
		if s.IsDuplicatedPage() {
			return
		}

		// Parse the body and find links; the response itself is published
		// before link extraction begins.
		tokenizer := html.NewTokenizer(strings.NewReader(s.ResponseBody))

		r.chanResponse <- s

		for {
			t := tokenizer.Next()

			switch t {

			case html.ErrorToken:
				return

			case html.StartTagToken:
				token := tokenizer.Token()
				if token.DataAtom.String() == "a" {
					for _, attr := range token.Attr {
						if attr.Key == "href" {
							link := s.Spawn()
							// TODO - restricting to absolute URLs (req.URL.IsAbs())
							// would drop "#" fragments, but it would also drop
							// real relative links, so for now every href is followed.
							if req, err := http.NewRequest("GET", attr.Val, nil); err == nil {
								link.MergeRequest(req)
								if link.IsScanAllowed() {
									r.chanLinks <- link
								}
							} else {
								log.Printf("error in building request: %s", err)
							}
						}
					}
				}
			}
		}

	}

	go crawl()
}
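Since crawl closes both channels only when it returns, and each channel buffers just ten scans, a consumer should drain chanResponse and chanLinks concurrently; draining one after the other can deadlock on a page with more than ten allowed links. A minimal sketch of such a consumer (the consume function and its wiring are assumptions, not part of the source):

// consume runs one scan through the renderer and drains both channels in
// parallel. Sketch only; it must live in the renderer's package, since
// chanResponse and chanLinks are unexported. Assumes `import "sync"` and
// the gryffin package.
func consume(s *gryffin.Scan) {
	r := &NoScriptRenderer{}
	r.Do(s) // crawl() runs in its own goroutine.

	var wg sync.WaitGroup
	wg.Add(2)

	go func() {
		defer wg.Done()
		for page := range r.chanResponse {
			_ = page // e.g. hand the fetched page to a fuzzer.
		}
	}()

	go func() {
		defer wg.Done()
		for link := range r.chanLinks {
			_ = link // e.g. enqueue the spawned scan for further crawling.
		}
	}()

	wg.Wait() // both channels are closed when crawl() returns.
}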