func flushTagToken(htmlBuf *[]byte, tz *html.Tokenizer, url string) string { *htmlBuf = append(*htmlBuf, '<') tagName, hasAttr := tz.TagName() *htmlBuf = append(*htmlBuf, tagName...) if hasAttr { for { attrKey, attrValue, hasMore := tz.TagAttr() *htmlBuf = append(*htmlBuf, ' ') *htmlBuf = append(*htmlBuf, attrKey...) *htmlBuf = append(*htmlBuf, '=', '"') if tagAttrToProxy[string(tagName)][string(attrKey)] { urlInAttr := string(attrValue) *htmlBuf = append(*htmlBuf, []byte(GetProxiedUrl(urlInAttr, url))...) } else { *htmlBuf = append(*htmlBuf, attrValue...) } *htmlBuf = append(*htmlBuf, '"') if !hasMore { break } } } *htmlBuf = append(*htmlBuf, '>') if string(tagName) == "head" { *htmlBuf = append(*htmlBuf, []byte(getJsHookTag())...) } return string(tagName) }
func (item *AnimeConventionItem) readResgiterNowurl(t *html.Tokenizer) { t.Next() if _, hasmore := t.TagName(); hasmore { if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") { item.registerNowURL = string(val) } } }
func (item *AnimeConventionItem) readCountry(t *html.Tokenizer, hasmore bool) { if hasmore { _, val, _ := t.TagAttr() item.country = string(val) return } t.Next() item.country = string(t.Text()) }
func readAttributes(z *html.Tokenizer, buff []html.Attribute) []html.Attribute { buff = buff[:0] for { key, value, more := z.TagAttr() if key != nil { buff = append(buff, html.Attribute{Key: string(key), Val: string(value)}) } if !more { return buff } } }
func (item *AnimeConventionItem) readNameAndLink(t *html.Tokenizer) { if label := t.Next(); label == html.StartTagToken { _, hasmore := t.TagName() if hasmore { if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") { item.siteURL = string(val) } } } if label := t.Next(); label == html.TextToken { item.name = string(t.Text()) } }
// AttrMap parses the attributes of the current element into a friendly map. // It only makes sense to call this while processing a start or self closing tag token. func AttrMap(hasAttr bool, z *html.Tokenizer) map[string]string { attrs := make(map[string]string) if !hasAttr { return attrs } for { k, v, more := z.TagAttr() attrs[string(k)] = string(v) if !more { break } } return attrs }
func (item *AnimeConventionItem) Parse(t *html.Tokenizer) { for { label := t.Next() switch label { case html.ErrorToken: fmt.Errorf("%v\n", t.Err()) return case html.TextToken: switch string(t.Text()) { case "Advance Rates:": //fmt.Println("rate") item.readadvanceRate(t) case "At-Door Rates:": item.readatDoorRate(t) } case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken: tag, hasmore := t.TagName() if strings.EqualFold(string(tag), "big") { item.readResgiterNowurl(t) } else if hasmore { key, val, hasmore := t.TagAttr() if strings.EqualFold(string(key), "itemprop") { //fmt.Println(string(val)) switch string(val) { case "description": item.readDescription(t) case "latitude": item.readLatitude(t) case "longitude": item.readLongitude(t) case "startDate": item.readStartDate(t) case "endDate": item.readEndDate(t) case "location": item.readLocation(t) case "addressLocality": item.readCity(t) case "addressRegion": item.readState(t) case "addressCountry": item.readCountry(t, hasmore) case "name": item.readNameAndLink(t) } } } } } }
func (item *AnimeConventionItem) readLocation(t *html.Tokenizer) { for { if label := t.Next(); label == html.StartTagToken { _, hasmore := t.TagName() if hasmore { if _, val, _ := t.TagAttr(); strings.EqualFold(string(val), "name") { break } } } } if label := t.Next(); label == html.TextToken { item.location = string(t.Text()) } }
func parseDivX86(z *html.Tokenizer, in *Intrinsic) *Intrinsic { more := true var k, v []byte for more { k, v, more = z.TagAttr() // fmt.Println("attr:", string(k)) switch string(k) { case "class": val := string(v) if strings.Contains(val, "intrinsic") { in.FinishX86() return NewIntrinsic() } switch val { case "cpuid": in.CpuID = getText(z) case "instruction": in.Instruction = strings.ToUpper(getText(z)) case "rettype": in.RetType = fixTypeX86(getText(z)) case "param_type": in.cParam = &Param{Type: fixTypeX86(getText(z))} case "param_name": in.cParam.Name = getText(z) if !in.Params.HasParam(in.cParam.Name) { in.Params = append(in.Params, *in.cParam) } in.cParam = nil case "description": in.Description = strings.TrimSpace(getTextR(z)) case "name": in.OrgName = getText(z) in.Name = fixFuncNameX86(in.OrgName) case "operation": in.Operation = strings.TrimSpace(getText(z)) default: //fmt.Println("unparsed class:", string(v)) } } } return in }
func (list *AnimeConventionList) Parse(t *html.Tokenizer) { for { next := t.Next() switch next { case html.ErrorToken: return case html.TextToken: //fmt.Println(string(t.Text())) case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken: _, hasmore := t.TagName() if hasmore { key, val, _ := t.TagAttr() if strings.EqualFold(string(key), "href") && strings.HasPrefix(string(val), "/events/info.shtml") { var item = &AnimeConventionItem{} item.url = strings.Join([]string{"http://animecons.com", string(val)}, "") list.taskNum++ go item.crawlInformation(&list.ConventionList) //time.Sleep(100 * time.Millisecond) } } } } }
func buildTokenArray(tokenizer *xhtml.Tokenizer) []tagInfo { tokens := []tagInfo{} for tt := tokenizer.Next(); tt != xhtml.ErrorToken; tt = tokenizer.Next() { switch tt { case xhtml.TextToken: txt := string(tokenizer.Text()) if len(tokens) == 0 { info := tagInfo{ raw: txt, } tokens = append(tokens, info) } tn, _ := tokenizer.TagName() key, val, _ := tokenizer.TagAttr() info := tagInfo{ tag: string(tn), key: string(key), val: string(val), txt: txt, } tokens = append(tokens, info) case xhtml.StartTagToken: tn, _ := tokenizer.TagName() key, val, _ := tokenizer.TagAttr() info := tagInfo{ tag: string(tn), key: string(key), val: string(val), } tokens = append(tokens, info) case xhtml.SelfClosingTagToken, xhtml.EndTagToken: tn, _ := tokenizer.TagName() key, val, _ := tokenizer.TagAttr() info := tagInfo{ tag: string(tn), key: string(key), val: string(val), closingTag: true, } tokens = append(tokens, info) } } return tokens }
func (item *AnimeConventionItem) readEndDate(t *html.Tokenizer) { _, val, _ := t.TagAttr() item.endDate = string(val) }
func (item *AnimeConventionItem) readLongitude(t *html.Tokenizer) { _, val, _ := t.TagAttr() item.longitude = string(val) }
func pullNode(tokens *html.Tokenizer, root *Markup) { var node *Markup for { token := tokens.Next() switch token { case html.ErrorToken: return case html.TextToken, html.CommentToken, html.DoctypeToken: text := strings.TrimSpace(string(tokens.Text())) if text == "" { continue } if token == html.CommentToken { text = "<!--" + text + "-->" } if node != nil { NewText(text).Apply(node) continue } NewText(text).Apply(root) continue case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken: if token == html.EndTagToken { node = nil return } tagName, hasAttr := tokens.TagName() node = NewMarkup(string(tagName), token == html.SelfClosingTagToken) node.Apply(root) if hasAttr { attrLoop: for { key, val, more := tokens.TagAttr() NewAttr(string(key), string(val)).Apply(node) if !more { break attrLoop } } } if token == html.SelfClosingTagToken { continue } pullNode(tokens, node) } } }