func fetchAllComics() (chan Comic, error) { urlchan := make(chan string, 2000) cc := make(chan Comic) for i := 0; i < 4; i++ { go fetchComic(urlchan, cc) } // Fetch the Archive page resp, err := http.Get("https://xkcd.com/archive/") if err != nil { return nil, err } defer resp.Body.Close() // Parse HTML tok := html.NewTokenizer(resp.Body) for { next := tok.Next() switch next { case html.ErrorToken: return cc, tok.Err() case html.StartTagToken: t := tok.Token() if t.DataAtom == atom.A { for _, attr := range t.Attr { if attr.Key == "href" { isComic, _ := regexp.Match("/[0-9]+/", []byte(attr.Val)) if isComic { urlchan <- attr.Val } } } } } } }
func fetchComic(urlchan chan string, comicchan chan<- Comic) { for { url, more := <-urlchan if !more { break } c := Comic{ Page: "https://xkcd.com" + url, } if ComicInDatabase(c) { continue } // Fetch the Comic page resp, err := http.Get("https://xkcd.com" + url) if err != nil { log.Println("Could not fetch " + url) continue } // Parse HTML tok := html.NewTokenizer(resp.Body) foundImg := false parseloop: for { next := tok.Next() switch next { case html.ErrorToken: resp.Body.Close() break parseloop case html.SelfClosingTagToken: t := tok.Token() if t.DataAtom == atom.Img { for _, attr := range t.Attr { switch attr.Key { case "src": foundImg = strings.HasPrefix(attr.Val, "//imgs.xkcd.com/comics/") if foundImg { c.Image = attr.Val } case "title": if foundImg { c.Title = attr.Val } case "alt": if foundImg { c.Alt = attr.Val } } } if foundImg { comicchan <- c foundImg = false } } } } time.Sleep(10 * time.Second) } close(comicchan) }
func prescan(content []byte) (e encoding.Encoding, name string) { z := html.NewTokenizer(bytes.NewReader(content)) for { switch z.Next() { case html.ErrorToken: return nil, "" case html.StartTagToken, html.SelfClosingTagToken: tagName, hasAttr := z.TagName() if !bytes.Equal(tagName, []byte("meta")) { continue } attrList := make(map[string]bool) gotPragma := false const ( dontKnow = iota doNeedPragma doNotNeedPragma ) needPragma := dontKnow name = "" e = nil for hasAttr { var key, val []byte key, val, hasAttr = z.TagAttr() ks := string(key) if attrList[ks] { continue } attrList[ks] = true for i, c := range val { if 'A' <= c && c <= 'Z' { val[i] = c + 0x20 } } switch ks { case "http-equiv": if bytes.Equal(val, []byte("content-type")) { gotPragma = true } case "content": if e == nil { name = fromMetaElement(string(val)) if name != "" { e, name = Lookup(name) if e != nil { needPragma = doNeedPragma } } } case "charset": e, name = Lookup(string(val)) needPragma = doNotNeedPragma } } if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma { continue } if strings.HasPrefix(name, "utf-16") { name = "utf-8" e = encoding.Nop } if e != nil { return e, name } } } }