Example #1
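// fetchAllComics walks the xkcd archive page, queues every comic URL it finds
// on urlchan for the worker goroutines, and returns the channel on which those
// workers deliver the scraped comics.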
func fetchAllComics() (chan Comic, error) {
	urlchan := make(chan string, 2000)

	cc := make(chan Comic)
	for i := 0; i < 4; i++ {
		go fetchComic(urlchan, cc)
	}

	// Fetch the Archive page
	resp, err := http.Get("https://xkcd.com/archive/")
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Parse HTML
	tok := html.NewTokenizer(resp.Body)

	for {
		next := tok.Next()

		switch next {
		case html.ErrorToken:
			// io.EOF simply marks the end of the archive page; anything else
			// is a real read/parse error.
			if err := tok.Err(); err != io.EOF {
				return cc, err
			}
			return cc, nil

		case html.StartTagToken:
			t := tok.Token()
			if t.DataAtom == atom.A {
				for _, attr := range t.Attr {
					if attr.Key == "href" {
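						// Links to individual comics on the archive page have hrefs of the form "/1234/".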
						isComic, _ := regexp.Match("/[0-9]+/", []byte(attr.Val))
						if isComic {
							urlchan <- attr.Val
						}
					}
				}
			}
		}
	}
}
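The function above depends on Comic, fetchComic and a live connection to xkcd.com. Below is a minimal, self-contained sketch of the same tokenizer pattern run against an inline HTML snippet instead of the real archive page; the sample markup and the printed output are illustrative only.

package main

import (
	"fmt"
	"regexp"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

func main() {
	const page = `<a href="/about/">About</a> <a href="/2286/">Comic 2286</a>`
	comicLink := regexp.MustCompile("/[0-9]+/")

	tok := html.NewTokenizer(strings.NewReader(page))
	for {
		tt := tok.Next()
		if tt == html.ErrorToken {
			return // io.EOF once the snippet is exhausted
		}
		if tt != html.StartTagToken {
			continue
		}
		t := tok.Token()
		if t.DataAtom != atom.A {
			continue
		}
		// Print every href that looks like a link to an individual comic.
		for _, attr := range t.Attr {
			if attr.Key == "href" && comicLink.MatchString(attr.Val) {
				fmt.Println("comic link:", attr.Val)
			}
		}
	}
}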
Example #2
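// fetchComic receives comic URLs from urlchan, scrapes each comic page for its
// image, title and alt text, and sends the result on comicchan; it closes
// comicchan and returns once urlchan is closed and drained.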
func fetchComic(urlchan chan string, comicchan chan<- Comic) {
	for url := range urlchan {

		c := Comic{
			Page: "https://xkcd.com" + url,
		}

		if ComicInDatabase(c) {
			continue
		}

		// Fetch the comic page
		resp, err := http.Get("https://xkcd.com" + url)
		if err != nil {
			log.Printf("could not fetch %s: %v", url, err)
			continue
		}

		// Parse HTML
		tok := html.NewTokenizer(resp.Body)

		foundImg := false

	parseloop:
		for {
			next := tok.Next()

			switch next {
			case html.ErrorToken:
				resp.Body.Close()
				break parseloop

			case html.SelfClosingTagToken:
				t := tok.Token()
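				// The comic is delivered as a self-closing <img/> whose src lives under
				// //imgs.xkcd.com/comics/; title and alt are only captured when src
				// appears before them in the attribute list.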
				if t.DataAtom == atom.Img {
					for _, attr := range t.Attr {
						switch attr.Key {
						case "src":
							foundImg = strings.HasPrefix(attr.Val, "//imgs.xkcd.com/comics/")
							if foundImg {
								c.Image = attr.Val
							}

						case "title":
							if foundImg {
								c.Title = attr.Val
							}

						case "alt":
							if foundImg {
								c.Alt = attr.Val
							}
						}
					}

					if foundImg {
						comicchan <- c
						foundImg = false
					}
				}
			}
		}

		// Rate-limit: pause before moving on to the next comic page.
		time.Sleep(10 * time.Second)
	}

	close(comicchan)
}
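As written, nothing ever closes urlchan, so the workers never exit and the close(comicchan) above is never reached; with four workers sharing comicchan it would also be unsafe for each of them to close it. The sketch below shows the usual fan-out/fan-in shape under the assumption that the producer closes the input channel and a sync.WaitGroup guards a single close of the output channel; Comic is copied from above and the worker body is a placeholder for the real scraping.

package main

import (
	"fmt"
	"sync"
)

type Comic struct {
	Page, Image, Title, Alt string
}

func main() {
	urls := make(chan string, 2000)
	comics := make(chan Comic)

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Stand-in for fetchComic: range exits once urls is closed and drained.
			for u := range urls {
				comics <- Comic{Page: "https://xkcd.com" + u}
			}
		}()
	}

	// Close the shared output channel exactly once, after every worker has returned.
	go func() {
		wg.Wait()
		close(comics)
	}()

	// Stand-in for the archive-page tokenizer loop in fetchAllComics.
	for _, u := range []string{"/1/", "/2/", "/3/"} {
		urls <- u
	}
	close(urls)

	// The consumer can now range to completion instead of blocking forever.
	for c := range comics {
		fmt.Println(c.Page)
	}
}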
Example #3
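// prescan implements the HTML encoding-sniffing prescan: it scans the byte
// stream for a <meta> tag that declares a charset, either directly or via an
// http-equiv="content-type" pragma, and returns the corresponding encoding,
// or nil if none is found before the tokenizer runs out of input.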
func prescan(content []byte) (e encoding.Encoding, name string) {
	z := html.NewTokenizer(bytes.NewReader(content))
	for {
		switch z.Next() {
		case html.ErrorToken:
			return nil, ""

		case html.StartTagToken, html.SelfClosingTagToken:
			tagName, hasAttr := z.TagName()
			if !bytes.Equal(tagName, []byte("meta")) {
				continue
			}
			attrList := make(map[string]bool)
			gotPragma := false

			const (
				dontKnow = iota
				doNeedPragma
				doNotNeedPragma
			)
			needPragma := dontKnow

			name = ""
			e = nil
			for hasAttr {
				var key, val []byte
				key, val, hasAttr = z.TagAttr()
				ks := string(key)
				if attrList[ks] {
					continue
				}
				attrList[ks] = true
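				// ASCII-lowercase the attribute value in place before comparing it.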
				for i, c := range val {
					if 'A' <= c && c <= 'Z' {
						val[i] = c + 0x20
					}
				}

				switch ks {
				case "http-equiv":
					if bytes.Equal(val, []byte("content-type")) {
						gotPragma = true
					}

				case "content":
					if e == nil {
						name = fromMetaElement(string(val))
						if name != "" {
							e, name = Lookup(name)
							if e != nil {
								needPragma = doNeedPragma
							}
						}
					}

				case "charset":
					e, name = Lookup(string(val))
					needPragma = doNotNeedPragma
				}
			}

			// An encoding taken from a content="...; charset=..." attribute only
			// counts if an http-equiv="content-type" pragma was also present.
			if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
				continue
			}

			// Per the HTML specification, a declared utf-16 encoding is treated as utf-8.
			if strings.HasPrefix(name, "utf-16") {
				name = "utf-8"
				e = encoding.Nop
			}

			if e != nil {
				return e, name
			}
		}
	}
}
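This prescan mirrors the <meta>-sniffing step of golang.org/x/net/html/charset, whose helpers Lookup and fromMetaElement it calls. Assuming that package is the one in use, its exported functions wrap the same detection; the sketch below is a usage illustration with made-up sample markup.

package main

import (
	"bytes"
	"fmt"
	"io"

	"golang.org/x/net/html/charset"
)

func main() {
	page := []byte(`<html><head><meta charset="windows-1251"></head><body>hello</body></html>`)

	// DetermineEncoding checks a BOM, then the Content-Type value, and finally
	// runs the <meta> prescan shown above.
	e, name, certain := charset.DetermineEncoding(page, "text/html")
	fmt.Println(name, certain, e != nil)

	// NewReader uses the same detection to wrap a body so it reads as UTF-8.
	r, err := charset.NewReader(bytes.NewReader(page), "text/html")
	if err != nil {
		panic(err)
	}
	utf8Body, _ := io.ReadAll(r)
	fmt.Printf("%d bytes after decoding\n", len(utf8Body))
}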