Example #1
0
// absoluteURLs scans s for absolute URL literals (as matched by
// urlRegexp) and resolves each one against base. Matches that fail to
// parse are silently dropped.
func absoluteURLs(base *url.URL, s string) (result []*url.URL) {
	for _, raw := range urlRegexp.FindAllString(s, -1) {
		if u, err := urlx.ParseRef(base, raw); err == nil {
			result = append(result, u)
		}
	}
	return
}
Example #2
0
// windowLocation extracts URLs assigned to window.location in script
// text s and resolves them against base. Matches whose URL part fails
// to parse are silently dropped.
func windowLocation(base *url.URL, s string) (result []*url.URL) {
	for _, m := range windowLocationRegexp.FindAllStringSubmatch(s, -1) {
		// Submatch index 3 carries the URL string of the assignment.
		if u, err := urlx.ParseRef(base, m[3]); err == nil {
			result = append(result, u)
		}
	}
	return
}
Example #3
0
// scanLocation resolves the Content-Location and Refresh headers of the
// response against its effective URL (NewURL when set, otherwise URL)
// and stores the results on r. Parse failures leave the fields nil/zero.
func (r *Response) scanLocation() {
	base := r.NewURL
	if base == nil {
		base = r.URL
	}
	if loc := r.Header.Get("Content-Location"); loc != "" {
		r.ContentLocation, _ = urlx.ParseRef(base, loc)
	}
	if refresh := r.Header.Get("Refresh"); refresh != "" {
		r.Refresh.Seconds, r.Refresh.URL = parseRefresh(refresh, base)
	}
}
Example #4
0
// ExtractHref tokenizes the HTML read from reader and sends the
// resolved URL of every <a href=...> on ch. A <base href=...> tag seen
// in the document replaces the resolution base for subsequent links.
// It returns the first tokenizer error other than io.EOF, or nil when
// the whole document was consumed. ch is never closed here; the caller
// owns the channel.
func ExtractHref(base *url.URL, reader io.Reader, ch chan<- *url.URL) error {
	z := html.NewTokenizer(reader)
	// hrefURL scans the current tag's attributes for href and resolves
	// it against base; it returns nil when href is absent or unparsable.
	hrefURL := func(z *html.Tokenizer, base *url.URL) *url.URL {
		for {
			key, val, more := z.TagAttr()
			if bytes.Equal(key, []byte("href")) {
				if u, err := urlx.ParseRef(base, string(val)); err == nil {
					return u
				}
				break
			}
			if !more {
				break
			}
		}
		return nil
	}
LOOP:
	for {
		switch z.Next() {
		case html.ErrorToken:
			if err := z.Err(); err != io.EOF {
				return err
			}
			break LOOP
		case html.StartTagToken, html.SelfClosingTagToken:
			// <base> is a void element, so it may be written either as
			// "<base href=...>" (StartTagToken) or "<base href=.../>"
			// (SelfClosingTagToken). The original only rebased on the
			// self-closing form and so missed the common unclosed form;
			// handling both token types here fixes that.
			tn, hasAttr := z.TagName()
			if !hasAttr {
				continue
			}
			switch {
			case len(tn) == 1 && tn[0] == 'a':
				if u := hrefURL(z, base); u != nil {
					ch <- u
				}
			case bytes.Equal(tn, []byte("base")):
				if u := hrefURL(z, base); u != nil {
					base = u
				}
			}
		}
	}
	return nil
}
Example #5
0
// parseRefresh parses the value of an HTTP Refresh header, e.g.
// "5; url=/next". It returns the delay in seconds and, when a target is
// present, that target resolved against u. Malformed input yields zero
// values: a bad delay returns (0, nil); a missing or unparsable URL
// returns the delay with uu == nil.
func parseRefresh(s string, u *url.URL) (second int, uu *url.URL) {
	const blank = " \t\n\f\r"
	i := strings.IndexAny(s, ";,")
	if i == -1 {
		// No separator: the whole value is just the delay.
		second, _ = strconv.Atoi(strings.TrimRight(s, blank))
		return
	}
	var err error
	if second, err = strconv.Atoi(strings.TrimRight(s[:i], blank)); err != nil {
		return
	}
	s = strings.TrimLeft(s[i+1:], blank)
	// The "url" keyword is matched case-insensitively: real headers
	// commonly say "URL=..." (the original matched lowercase only).
	if i = strings.Index(strings.ToLower(s), "url"); i == -1 {
		return
	}
	s = strings.TrimLeft(s[i+len("url"):], blank)
	if !strings.HasPrefix(s, "=") {
		return
	}
	// Strip optional surrounding quotes, as browsers do for
	// url='...' / url="..." forms.
	s = strings.Trim(strings.TrimLeft(s[1:], blank), `'"`)
	uu, _ = urlx.ParseRef(u, s)
	return
}
Example #6
0
// tokenLoop walks the HTML token stream of body, extracting URLs from
// the tag/attribute positions configured in e.Pos (default: <a href>)
// plus any text sniffing enabled via e.SniffFlags, normalizes each URL,
// and sends the survivors on ch. The first non-EOF tokenizer error is
// sent on chErr. Both channels are closed before returning, so the
// caller can range over ch and then read chErr.
func (e *Extractor) tokenLoop(
	r *crawler.Response, body io.Reader, ch chan<- *url.URL, chErr chan<- error,
) {
	defer close(chErr)
	defer close(ch)

	z := html.NewTokenizer(body)
	// Copy the response URL so <base> tags can rebase locally without
	// mutating r.NewURL.
	base := *r.NewURL
	normalize := e.Normalize
	dest := e.Pos
	if normalize == nil {
		normalize = urlx.Normalize
	}
	if len(dest) == 0 {
		dest = []struct{ Tag, Attr string }{{"a", "href"}}
	}

	// prev tracks the previous token so text sniffing can tell whether
	// the current text sits inside a <script> element.
	var prev html.Token
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			// io.EOF means the document ended normally; anything else
			// is reported to the caller before shutting down.
			if err := z.Err(); err != io.EOF {
				chErr <- err
			}
			return
		case html.StartTagToken, html.SelfClosingTagToken:
			token := z.Token()
			prev = token
			if len(token.Attr) == 0 {
				continue
			}
			var (
				v    string
				u    *url.URL
				ok   bool
				err  error
				name = string(token.Data)
			)
			for _, d := range dest {
				if name != d.Tag {
					continue
				} else if v, ok = get(&token, d.Attr); !ok || v == "" {
					continue
				} else if u, err = urlx.ParseRef(
					&base, v,
				); err != nil {
					continue
				}
				// A <base> tag rebases all subsequent relative URLs —
				// only reachable when "base" is listed in dest.
				if name == "base" {
					base = *u
				}
				if err = normalize(u); err != nil {
					continue
				}
				ch <- u
			}
		case html.TextToken:
			token := z.Token()
			var urls []*url.URL
			// NOTE(review): the switch gives window.location sniffing
			// priority — when both flags are set, absolute-URL sniffing
			// never runs; presumably intentional, verify with callers.
			switch {
			case e.SniffFlags&SniffWindowLocation != 0:
				if prev.Type == html.StartTagToken && prev.Data == "script" {
					urls = windowLocation(&base, token.Data)
				}
			case e.SniffFlags&SniffAbsoluteURLs != 0:
				urls = absoluteURLs(&base, token.Data)
			}
			for _, u := range urls {
				if err := normalize(u); err != nil {
					continue
				}
				ch <- u
			}
			prev = token
		default:
			// Other token types (end tags, comments, doctype) clear
			// prev so stale <script> context is not carried forward.
			prev = html.Token{}
		}
	}
}