// absoluteURLs scans s for absolute URLs matched by urlRegexp and resolves
// each match against base; matches that fail to parse are skipped.
func absoluteURLs(base *url.URL, s string) (result []*url.URL) {
	matches := urlRegexp.FindAllString(s, -1)
	for _, match := range matches {
		u, err := urlx.ParseRef(base, match)
		if err != nil {
			continue
		}
		result = append(result, u)
	}
	return
}
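// urlRegexp is declared elsewhere in this package; absoluteURLs only requires
// that it match bare absolute URLs embedded in text. A purely illustrative
// (not authoritative) pattern would be along the lines of:
//
//	var urlRegexp = regexp.MustCompile(`https?://[^\s"'<>]+`)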
// windowLocation scans s for window.location assignments matched by
// windowLocationRegexp and resolves the captured URL (the third submatch)
// against base; matches that fail to parse are skipped.
func windowLocation(base *url.URL, s string) (result []*url.URL) {
	matches := windowLocationRegexp.FindAllStringSubmatch(s, -1)
	for _, match := range matches {
		u, err := urlx.ParseRef(base, match[3])
		if err != nil {
			continue
		}
		result = append(result, u)
	}
	return
}
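// windowLocationRegexp is likewise declared elsewhere; the match[3] index
// above assumes the target URL is the pattern's third capture group. A sketch
// of a compatible, purely illustrative pattern:
//
//	var windowLocationRegexp = regexp.MustCompile(
//		`(window\.location)(\.href)?\s*=\s*['"]([^'"]+)['"]`)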
// scanLocation parses the Content-Location and Refresh response headers,
// resolving any URLs against the redirected URL (NewURL) when present, or
// against the original request URL otherwise.
func (r *Response) scanLocation() {
	var baseurl *url.URL
	if baseurl = r.NewURL; baseurl == nil {
		baseurl = r.URL
	}
	if loc := r.Header.Get("Content-Location"); loc != "" {
		r.ContentLocation, _ = urlx.ParseRef(baseurl, loc)
	}
	if s := r.Header.Get("Refresh"); s != "" {
		r.Refresh.Seconds, r.Refresh.URL = parseRefresh(s, baseurl)
	}
}
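// For illustration, scanLocation handles response headers such as:
//
//	Content-Location: /index.en.html
//	Refresh: 5; url=/new/location.html
//
// Relative URLs in either header are resolved against the response URL.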
// ExtractHref walks the HTML token stream from reader, resolving the href
// attribute of each <a> tag against base and sending the result on ch. A
// <base> tag updates the resolution base for subsequent links. ExtractHref
// does not close ch; that is left to the caller.
func ExtractHref(base *url.URL, reader io.Reader, ch chan<- *url.URL) error {
	z := html.NewTokenizer(reader)
	// href returns the parsed value of the current tag's href attribute, if any.
	href := func(z *html.Tokenizer, base *url.URL) *url.URL {
		for {
			key, val, more := z.TagAttr()
			if bytes.Equal(key, []byte("href")) {
				if u, err := urlx.ParseRef(base, string(val)); err == nil {
					return u
				}
				break
			}
			if !more {
				break
			}
		}
		return nil
	}
	for {
		switch z.Next() {
		case html.ErrorToken:
			if err := z.Err(); err != io.EOF {
				return err
			}
			return nil
		// <base> is a void element and may be written with or without the
		// trailing slash, so both token types must be inspected.
		case html.StartTagToken, html.SelfClosingTagToken:
			tn, hasAttr := z.TagName()
			if !hasAttr {
				continue
			}
			switch {
			case len(tn) == 1 && tn[0] == 'a':
				if u := href(z, base); u != nil {
					ch <- u
				}
			case bytes.Equal(tn, []byte("base")):
				if u := href(z, base); u != nil {
					base = u
				}
			}
		}
	}
}
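// A minimal driver for ExtractHref (a sketch; collectHrefs is not part of
// this package): the channel must be drained concurrently while the tokenizer
// runs, since ExtractHref blocks on each unbuffered send.
func collectHrefs(base *url.URL, r io.Reader) ([]*url.URL, error) {
	ch := make(chan *url.URL)
	done := make(chan struct{})
	var urls []*url.URL
	go func() {
		for u := range ch {
			urls = append(urls, u)
		}
		close(done)
	}()
	err := ExtractHref(base, r, ch)
	close(ch) // ExtractHref leaves closing to the caller
	<-done
	return urls, err
}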
// parseRefresh parses a Refresh header value of the form "seconds" or
// "seconds; url=target", returning the delay and the target resolved
// against u.
func parseRefresh(s string, u *url.URL) (second int, uu *url.URL) {
	const blank = " \t\n\f\r"
	var i int
	var err error
	// A bare number of seconds, with no follow-up URL.
	if i = strings.IndexAny(s, ";,"); i == -1 {
		second, _ = strconv.Atoi(strings.TrimRight(s, blank))
		return
	}
	if second, err = strconv.Atoi(strings.TrimRight(s[:i], blank)); err != nil {
		return
	}
	s = strings.TrimLeft(s[i+1:], blank)
	// The "url" keyword is ASCII case-insensitive.
	if i = strings.Index(strings.ToLower(s), "url"); i == -1 {
		return
	}
	s = strings.TrimLeft(s[i+len("url"):], blank)
	if !strings.HasPrefix(s, "=") {
		return
	}
	s = strings.TrimLeft(s[1:], blank)
	uu, _ = urlx.ParseRef(u, s)
	return
}
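// Illustrative inputs and results for parseRefresh:
//
//	parseRefresh("5", base)            // second=5, uu=nil
//	parseRefresh("0; url=/next", base) // second=0, uu="/next" resolved against base
//	parseRefresh("abc", base)          // second=0, uu=nil (Atoi failure ignored)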
// tokenLoop drives the HTML tokenizer over body, sending every URL extracted
// from the configured tag/attribute pairs on ch. Text nodes are optionally
// sniffed for URLs according to e.SniffFlags. Both channels are closed when
// the loop returns; at most one error is sent on chErr.
func (e *Extractor) tokenLoop(
	r *crawler.Response, body io.Reader,
	ch chan<- *url.URL, chErr chan<- error,
) {
	defer close(chErr)
	defer close(ch)
	z := html.NewTokenizer(body)
	base := *r.NewURL
	normalize := e.Normalize
	dest := e.Pos
	if normalize == nil {
		normalize = urlx.Normalize
	}
	if len(dest) == 0 {
		// By default, only <a href="..."> is extracted.
		dest = []struct{ Tag, Attr string }{{"a", "href"}}
	}
	var prev html.Token
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			if err := z.Err(); err != io.EOF {
				chErr <- err
			}
			return
		case html.StartTagToken, html.SelfClosingTagToken:
			token := z.Token()
			prev = token
			if len(token.Attr) == 0 {
				continue
			}
			var (
				v    string
				u    *url.URL
				ok   bool
				err  error
				name = token.Data
			)
			for _, d := range dest {
				if name != d.Tag {
					continue
				} else if v, ok = get(&token, d.Attr); !ok || v == "" {
					continue
				} else if u, err = urlx.ParseRef(&base, v); err != nil {
					continue
				}
				// A <base> tag changes the resolution base for the
				// rest of the document.
				if name == "base" {
					base = *u
				}
				if err = normalize(u); err != nil {
					continue
				}
				ch <- u
			}
		case html.TextToken:
			token := z.Token()
			var urls []*url.URL
			switch {
			case e.SniffFlags&SniffWindowLocation != 0:
				// Only sniff window.location assignments inside
				// <script> elements.
				if prev.Type == html.StartTagToken && prev.Data == "script" {
					urls = windowLocation(&base, token.Data)
				}
			case e.SniffFlags&SniffAbsoluteURLs != 0:
				urls = absoluteURLs(&base, token.Data)
			}
			for _, u := range urls {
				if err := normalize(u); err != nil {
					continue
				}
				ch <- u
			}
			prev = token
		default:
			prev = html.Token{}
		}
	}
}
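// A minimal consumer for tokenLoop (a sketch; extractLinks is hypothetical):
// run the loop in its own goroutine, range over the URL channel until it
// closes, then collect at most one error. chErr is buffered so the producer
// never blocks on the error send while the consumer is still draining ch.
func extractLinks(e *Extractor, r *crawler.Response, body io.Reader) ([]*url.URL, error) {
	ch := make(chan *url.URL)
	chErr := make(chan error, 1)
	go e.tokenLoop(r, body, ch, chErr)
	var urls []*url.URL
	for u := range ch {
		urls = append(urls, u)
	}
	return urls, <-chErr // nil if tokenLoop closed chErr without sending
}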