// getLinks parses the response for links, doing its best with bad HTML.
func getLinks(contents []byte) ([]*URL, error) {
	utf8Reader, err := charset.NewReader(bytes.NewReader(contents), "text/html")
	if err != nil {
		return nil, err
	}
	tokenizer := html.NewTokenizer(utf8Reader)

	var links []*URL
	tags := getIncludedTags()

	for {
		tokenType := tokenizer.Next()
		switch tokenType {
		case html.ErrorToken:
			// TODO: use tokenizer.Err() to distinguish io.EOF
			// (meaning success) from an actual error
			return links, nil
		case html.StartTagToken:
			tagName, hasAttrs := tokenizer.TagName()
			if hasAttrs && tags[string(tagName)] {
				links = parseAnchorAttrs(tokenizer, links)
			}
		}
	}
}
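// getIncludedTags and parseAnchorAttrs are referenced above but not shown.
// A minimal sketch of what they might look like, assuming links are drawn
// from href/src attributes and that ParseURL is a hypothetical helper that
// builds a *URL from a string:
func getIncludedTags() map[string]bool {
	// tags whose attributes may carry links (an assumed set)
	return map[string]bool{"a": true, "link": true, "img": true}
}

func parseAnchorAttrs(tokenizer *html.Tokenizer, links []*URL) []*URL {
	for {
		key, val, moreAttr := tokenizer.TagAttr()
		if string(key) == "href" || string(key) == "src" {
			// ParseURL is hypothetical; any string -> *URL constructor fits here
			if u, err := ParseURL(string(val)); err == nil {
				links = append(links, u)
			}
		}
		if !moreAttr {
			return links
		}
	}
}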
// pup
func main() {
	cmds := ProcessFlags(os.Args[1:])

	cr, err := charset.NewReader(inputStream, "")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(2)
	}
	root, err := html.Parse(cr)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(2)
	}
	inputStream.Close()

	if len(cmds) == 0 {
		PrintNode(root, 0)
		os.Exit(0)
	}

	selectors := make([]selector.Selector, len(cmds))
	for i, cmd := range cmds {
		// if this is the last command, check for a display function like
		// text{} or attr{}
		if i+1 == len(cmds) {
			d, err := funcs.NewDisplayFunc(cmd)
			if err == nil {
				displayer = d
				selectors = selectors[0 : len(cmds)-1]
				break
			}
		}
		selectors[i], err = selector.NewSelector(cmd)
		if err != nil {
			Fatal("Selector parse error: %s", err)
		}
	}

	currNodes := []*html.Node{root}
	var selected []*html.Node
	for _, sel := range selectors {
		selected = []*html.Node{}
		for _, node := range currNodes {
			selected = append(selected, sel.FindAllChildren(node)...)
		}
		currNodes = selected
	}

	if displayer != nil {
		displayer.Display(currNodes)
	} else if printNumber {
		fmt.Println(len(currNodes))
	} else {
		for _, s := range currNodes {
			// PrintNode is defined in printing.go
			PrintNode(s, 0)
		}
	}
}
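// displayer, printNumber, and inputStream are used above but not declared
// in this snippet; presumably package-level state set up by ProcessFlags.
// Plausible declarations, inferred from how they are used (the Display
// method signature is the only hard requirement):
var (
	displayer   interface{ Display([]*html.Node) }
	printNumber bool
	inputStream io.ReadCloser
)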
// pup
func main() {
	cmds := ProcessFlags(os.Args[1:])

	cr, err := charset.NewReader(inputStream, "")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(2)
	}
	root, err := html.Parse(cr)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(2)
	}
	inputStream.Close()

	if len(cmds) == 0 {
		PrintNode(root, 0)
		os.Exit(0)
	}

	selectors := make([]selector.Selector, len(cmds))
	for i, cmd := range cmds {
		// if this is the last element, check for a function like
		// text{} or attr{}
		if i+1 == len(cmds) {
			d, err := funcs.NewDisplayFunc(cmd)
			if err == nil {
				displayer = d
				selectors = selectors[0 : len(cmds)-1]
				break
			}
		}
		selectors[i], err = selector.NewSelector(cmd)
		if err != nil {
			Fatal("Selector parse error: %s", err)
		}
	}

	currNodes := []*html.Node{root}
	for _, sel := range selectors {
		currNodes = sel.Select(currNodes)
	}

	if displayer != nil {
		displayer.Display(currNodes)
	} else if printNumber {
		fmt.Println(len(currNodes))
	} else {
		for _, s := range currNodes {
			// PrintNode is defined in printing.go
			PrintNode(s, 0)
		}
	}
}
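// This second version of pup's main folds the per-node fan-out loop of the
// first version into a Select method on the selector. A sketch of what such
// a method might look like, assuming a concrete selector type (basicSelector
// is a hypothetical name) that still provides FindAllChildren:
func (s basicSelector) Select(nodes []*html.Node) []*html.Node {
	selected := []*html.Node{}
	for _, node := range nodes {
		selected = append(selected, s.FindAllChildren(node)...)
	}
	return selected
}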
func parseContent(r io.Reader, contentType string) (htmlBody, textBody string, err error) {
	media, params, _ := mime.ParseMediaType(contentType)
	switch {
	case media == textHTML, media == textPlain:
		r, err = charset.NewReader(r, params["charset"])
		if err != nil {
			return "", "", err
		}
		if media == textHTML {
			r, err = sanitized(r)
			if err != nil {
				return "", "", err
			}
		}
		body, err := ioutil.ReadAll(r)
		s := string(body)
		if media == textHTML {
			return s, "", err
		}
		return "", s, err
	case strings.HasPrefix(media, "multipart"):
		mp := multipart.NewReader(r, params["boundary"])
		for {
			part, err := mp.NextPart()
			if err == io.EOF {
				break
			}
			if err != nil {
				return "", "", err
			}
			if tmpHTML, tmpText, err := parseContent(part, part.Header.Get("Content-Type")); err == nil {
				if tmpHTML != "" {
					htmlBody = tmpHTML
				}
				if tmpText != "" {
					textBody = tmpText
				}
				if htmlBody != "" && textBody != "" {
					break
				}
			}
		}
	}
	return htmlBody, textBody, nil
}
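// A minimal caller sketch for parseContent, assuming the textHTML and
// textPlain constants are "text/html" and "text/plain"; the message body
// and boundary here are illustrative:
func exampleParseContent() {
	raw := "--b1\r\n" +
		"Content-Type: text/plain; charset=utf-8\r\n" +
		"\r\n" +
		"plain body\r\n" +
		"--b1\r\n" +
		"Content-Type: text/html; charset=utf-8\r\n" +
		"\r\n" +
		"<p>html body</p>\r\n" +
		"--b1--\r\n"
	htmlBody, textBody, err := parseContent(strings.NewReader(raw),
		`multipart/alternative; boundary="b1"`)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("html: %q text: %q\n", htmlBody, textBody)
}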
func main() {
	charset.CharsetDir = build.Default.GOPATH + "/src/code.google.com/p/go-charset/datafiles"

	for i := 1; i < len(os.Args); i++ {
		parse_arg(os.Args[i])
	}

	res, err := http.Get(uri + strings.Join(params, "&"))
	if err != nil {
		log.Fatal(err)
	}
	// defer only after the error check: res is nil when err != nil
	defer res.Body.Close()

	// Header.Get returns "" for a missing header instead of panicking
	// the way indexing res.Header["Content-Type"][0] would
	r, err := hcs.NewReader(res.Body, res.Header.Get("Content-Type"))
	if err != nil {
		log.Fatal(err)
	}

	doc, err := html.Parse(r)
	if err != nil {
		log.Fatal(err)
	}

	find_table(doc)
}
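// charset.CharsetDir above serves the old code.google.com/p/go-charset
// package, which loads encoding tables from data files at runtime. The
// newer golang.org/x/net/html/charset package compiles its encodings in,
// so no data directory is required; a minimal equivalent reader setup:
import (
	"io"
	"net/http"

	"golang.org/x/net/html/charset"
)

func newBodyReader(res *http.Response) (io.Reader, error) {
	return charset.NewReader(res.Body, res.Header.Get("Content-Type"))
}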
func GetContent(fullUrl string) (*Content, string, error) {
	resp, err := http.Get(fullUrl)
	if err != nil {
		return nil, "", fmt.Errorf("sorry, an error occurred while fetching the page for the given URL: %s", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, "", fmt.Errorf("sorry, the page responded improperly; the status code received was %d", resp.StatusCode)
	}

	reader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
	if err != nil {
		return nil, "", fmt.Errorf("error decoding the page's charset: %s", err)
	}

	content := &Content{}
	imageUrl := ""

	// Create a Tokenizer over the decoded (UTF-8) stream
	z := html.NewTokenizer(reader)
	for {
		tokenType := z.Next()
		if tokenType == html.ErrorToken {
			if z.Err() == io.EOF {
				// EOF means tokenizing finished successfully
				break
			}
			// Anything other than EOF is a real error
			return nil, "", fmt.Errorf("sorry, an error occurred while extracting the HTML tags from the page: %s", z.Err())
		}
		switch tokenType {
		case html.StartTagToken, html.SelfClosingTagToken:
			token := z.Token()
			if token.Data == "title" {
				// An opening <title> tag: its text is the next token
				nextTokenType := z.Next()
				if nextTokenType == html.TextToken {
					nextToken := z.Token()
					content.Title = strings.TrimSpace(nextToken.Data)
				}
			} else if token.Data == "meta" {
				key := ""
				value := ""
				// Extract this meta tag's property/name and content
				for _, attr := range token.Attr {
					switch attr.Key {
					case "property", "name":
						key = attr.Val
					case "content":
						value = attr.Val
					}
				}
				switch key {
				case "title", "og:title", "twitter:title":
					if strings.TrimSpace(value) != "" {
						content.Title = strings.TrimSpace(value)
					}
				case "og:site_name", "twitter:domain":
					if strings.TrimSpace(value) != "" {
						// content.SiteName = strings.TrimSpace(value)
					}
				case "description", "og:description", "twitter:description":
					if strings.TrimSpace(value) != "" {
						content.Description = strings.TrimSpace(value)
					}
				case "og:image", "twitter:image", "twitter:image:src":
					if strings.TrimSpace(value) != "" {
						imageUrl = strings.TrimSpace(value)
					}
				case "og:url", "twitter:url":
					if strings.TrimSpace(value) != "" {
						// Not used: the author may be behind a redirect service
						// fullUrl = strings.TrimSpace(value)
					}
				}
			}
		}
	}

	// Limit the size of Title and Description to 250 characters
	if len(content.Title) > 250 {
		content.Title = content.Title[0:250]
	}
	if len(content.Description) > 250 {
		content.Description = content.Description[0:250]
	}
	// If the description is empty, fill it with a placeholder
	if len(content.Description) == 0 {
		content.Description = "See the full content..."
	}
	// Record the host this content came from
	content.Host = resp.Request.URL.Host

	log.Printf("Title: %s\n description: %s\n host:%s\n imageUrl:%s\n",
		content.Title, content.Description, content.Host, imageUrl)

	return content, imageUrl, nil
}
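// Note that content.Title[0:250] above slices bytes, not characters, so it
// can cut a multi-byte UTF-8 sequence in half. A rune-safe truncation
// sketch (truncateRunes is a hypothetical helper, not part of the snippet):
func truncateRunes(s string, n int) string {
	r := []rune(s)
	if len(r) > n {
		return string(r[:n])
	}
	return s
}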