Esempio n. 1
0
// getLinks tokenizes contents as HTML and gathers the links found on the
// start tags selected by getIncludedTags, doing its best with bad HTML.
//
// contents is wrapped by charset.NewReader (declared as "text/html") before
// tokenizing; an error from that step is returned unchanged. Once the
// tokenizer reports an ErrorToken, the links collected so far are returned
// with a nil error.
func getLinks(contents []byte) ([]*URL, error) {
	decoded, err := charset.NewReader(bytes.NewReader(contents), "text/html")
	if err != nil {
		return nil, err
	}

	z := html.NewTokenizer(decoded)
	wanted := getIncludedTags()

	var found []*URL
	for {
		kind := z.Next()
		if kind == html.ErrorToken {
			// TODO: should use z.Err() to see if this is io.EOF
			//		(meaning success) or an actual error
			return found, nil
		}
		if kind != html.StartTagToken {
			continue
		}

		// Only start tags that carry attributes and are in the wanted set
		// are handed to parseAnchorAttrs.
		name, hasAttrs := z.TagName()
		if !hasAttrs || !wanted[string(name)] {
			continue
		}
		found = parseAnchorAttrs(z, found)
	}
}
Esempio n. 2
0
File: main.go Progetto: eywu/pup
// main is the entry point of pup.
//
// It processes the command-line flags, decodes inputStream through
// charset.NewReader, parses it as HTML and then:
//   - with no commands, prints the whole document and exits with status 0;
//   - otherwise turns each command into a selector (the last command may
//     instead be a display function) and applies the selectors in order,
//     each one searching the nodes produced by the previous one.
//
// The final node set is handed to the display function if one was given,
// counted if printNumber is set, or printed node by node otherwise.
// A decoding or parsing failure is written to stderr and the process exits
// with status 2.
func main() {
	cmds := ProcessFlags(os.Args[1:])
	cr, err := charset.NewReader(inputStream, "")
	if err != nil {
		// Print the error as a value, never as a format string: a '%' in
		// the message would otherwise be interpreted as a formatting verb.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(2)
	}
	root, err := html.Parse(cr)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(2)
	}
	inputStream.Close()
	if len(cmds) == 0 {
		PrintNode(root, 0)
		os.Exit(0)
	}
	selectors := make([]selector.Selector, len(cmds))
	for i, cmd := range cmds {
		// The last command may be a display function rather than a
		// selector; if it parses as one, drop its slot from selectors.
		if i+1 == len(cmds) {
			d, err := funcs.NewDisplayFunc(cmd)
			if err == nil {
				displayer = d
				selectors = selectors[0 : len(cmds)-1]
				break
			}
		}
		selectors[i], err = selector.NewSelector(cmd)
		if err != nil {
			Fatal("Selector parse error: %s", err)
		}
	}
	currNodes := []*html.Node{root}
	// sel (not "selector") so the loop variable does not shadow the
	// selector package.
	for _, sel := range selectors {
		selected := []*html.Node{}
		for _, node := range currNodes {
			selected = append(selected, sel.FindAllChildren(node)...)
		}
		currNodes = selected
	}
	switch {
	case displayer != nil:
		displayer.Display(currNodes)
	case printNumber:
		fmt.Println(len(currNodes))
	default:
		for _, s := range currNodes {
			// defined in `printing.go`
			PrintNode(s, 0)
		}
	}
}
Esempio n. 3
0
// main is the entry point of pup.
//
// It processes the command-line flags, decodes inputStream through
// charset.NewReader, parses it as HTML and then:
//   - with no commands, prints the whole document and exits with status 0;
//   - otherwise turns each command into a selector (the last command may
//     instead be a display function such as text{} or attr{}) and applies
//     the selectors in order via Select.
//
// The final node set is handed to the display function if one was given,
// counted if printNumber is set, or printed node by node otherwise.
// A decoding or parsing failure is written to stderr and the process exits
// with status 2.
func main() {
	cmds := ProcessFlags(os.Args[1:])
	cr, err := charset.NewReader(inputStream, "")
	if err != nil {
		// Print the error as a value, never as a format string: a '%' in
		// the message would otherwise be interpreted as a formatting verb.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(2)
	}
	root, err := html.Parse(cr)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(2)
	}
	inputStream.Close()
	if len(cmds) == 0 {
		PrintNode(root, 0)
		os.Exit(0)
	}
	selectors := make([]selector.Selector, len(cmds))
	for i, cmd := range cmds {
		// if this is the last element, check for a function like
		// text{} or attr{}
		if i+1 == len(cmds) {
			d, err := funcs.NewDisplayFunc(cmd)
			if err == nil {
				displayer = d
				selectors = selectors[0 : len(cmds)-1]
				break
			}
		}
		selectors[i], err = selector.NewSelector(cmd)
		if err != nil {
			Fatal("Selector parse error: %s", err)
		}
	}
	currNodes := []*html.Node{root}
	// sel (not "selector") so the loop variable does not shadow the
	// selector package.
	for _, sel := range selectors {
		currNodes = sel.Select(currNodes)
	}
	switch {
	case displayer != nil:
		displayer.Display(currNodes)
	case printNumber:
		fmt.Println(len(currNodes))
	default:
		for _, s := range currNodes {
			// defined in `printing.go`
			PrintNode(s, 0)
		}
	}
}
Esempio n. 4
0
// parseContent extracts an HTML body and/or a plain-text body from r,
// interpreting r according to contentType.
//
// When the media type equals textHTML or textPlain, r is decoded through
// charset.NewReader (HTML is additionally passed through sanitized) and read
// in full; the result is returned as htmlBody or textBody respectively.
// When the media type starts with "multipart", each part is parsed
// recursively; the last non-empty HTML and text bodies seen are kept, and
// iteration stops as soon as both are available. Parts whose own parsing
// fails are skipped. Any other media type yields two empty strings and a nil
// error.
//
// A non-nil error is returned when decoding, sanitizing or reading a
// single-part body fails, or when the multipart stream itself cannot be
// advanced.
func parseContent(r io.Reader, contentType string) (htmlBody, textBody string, err error) {
	// The parse error is ignored on purpose: a malformed Content-Type leaves
	// media empty, which matches no case below and yields empty bodies.
	media, params, _ := mime.ParseMediaType(contentType)
	switch {
	case media == textHTML, media == textPlain:
		// Pass the whole Content-Type value, not just the charset parameter.
		// NOTE(review): this assumes charset is golang.org/x/net/html/charset,
		// whose NewReader(r, contentType) parses a full Content-Type header;
		// a bare charset name such as "iso-8859-1" carries no "charset="
		// parameter there and the declared encoding would be ignored.
		// Confirm against the file's imports.
		r, err = charset.NewReader(r, contentType)
		if err != nil {
			return "", "", err
		}
		if media == textHTML {
			r, err = sanitized(r)
			if err != nil {
				return "", "", err
			}
		}
		body, readErr := ioutil.ReadAll(r)
		s := string(body)
		if media == textHTML {
			return s, "", readErr
		}
		return "", s, readErr
	case strings.HasPrefix(media, "multipart"):
		mp := multipart.NewReader(r, params["boundary"])
		for {
			part, err := mp.NextPart()
			if err == io.EOF {
				break
			}
			if err != nil {
				return "", "", err
			}
			// Best effort: a part that fails to parse is skipped rather
			// than aborting the whole message.
			tmpHTML, tmpText, partErr := parseContent(part, part.Header.Get("Content-Type"))
			if partErr != nil {
				continue
			}
			if tmpHTML != "" {
				htmlBody = tmpHTML
			}
			if tmpText != "" {
				textBody = tmpText
			}
			if htmlBody != "" && textBody != "" {
				// Both bodies found; stop reading further parts.
				break
			}
		}
	}
	return htmlBody, textBody, nil
}
Esempio n. 5
0
// main points charset.CharsetDir at the go-charset data files under GOPATH,
// feeds every command-line argument to parse_arg, fetches
// uri + params joined by "&" over HTTP, decodes the response body according
// to its Content-Type header, parses it as HTML and hands the document to
// find_table. Any failure is reported through log.Fatal.
func main() {
	charset.CharsetDir = build.Default.GOPATH + "/src/code.google.com/p/go-charset/datafiles"
	for i := 1; i < len(os.Args); i++ {
		parse_arg(os.Args[i])
	}

	res, err := http.Get(uri + strings.Join(params, "&"))
	if err != nil {
		log.Fatal(err)
	}
	// Register the close only after the error check: on failure res is nil
	// and touching res.Body would panic.
	defer res.Body.Close()

	// Header.Get returns "" when the header is absent, whereas indexing
	// res.Header["Content-Type"][0] would panic on a missing header.
	r, err := hcs.NewReader(res.Body, res.Header.Get("Content-Type"))
	if err != nil {
		log.Fatal(err)
	}
	doc, err := html.Parse(r)
	if err != nil {
		log.Fatal(err)
	}

	find_table(doc)
}
Esempio n. 6
0
// GetContent downloads the page at fullUrl and extracts preview metadata
// from its HTML.
//
// The title comes from the <title> element and from "title", "og:title" or
// "twitter:title" meta tags; the description from "description",
// "og:description" or "twitter:description"; the image URL from "og:image",
// "twitter:image" or "twitter:image:src". When several sources are present,
// the one appearing last in the document wins. Title and description are
// capped at 250 bytes without splitting a UTF-8 character, and an empty
// description is replaced with a default text. Content.Host is set to the
// host of the request URL recorded on the response.
//
// It returns the populated Content, the image URL (possibly empty) and a nil
// error. A non-nil error is returned when the request fails, the status code
// is not 200, the charset reader cannot be created, or the tokenizer stops
// on anything other than io.EOF.
func GetContent(fullUrl string) (*Content, string, error) {
	// Upper bound, in bytes, for Title and Description.
	const maxFieldLen = 250

	resp, err := http.Get(fullUrl)
	if err != nil {
		return nil, "", errors.New(
			fmt.Sprintf("Desculpe, ocorreu ao tentar recuperar a pagina referente a URL passada. %s.", err))
	}

	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, "", errors.New(
			fmt.Sprintf("Desculpe, mas a pagina passada respondeu indevidamente. O Status Code recebido foi: %d.", resp.StatusCode))
	}

	reader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
	if err != nil {
		return nil, "", errors.New(
			fmt.Sprintf("Erro ao decodificar o charset da pagina. %s.", err))
	}

	content := &Content{}
	imageUrl := ""

	// Tokenize the decoded stream.
	z := html.NewTokenizer(reader)
	for {
		tokenType := z.Next()

		if tokenType == html.ErrorToken {
			if z.Err() == io.EOF { // reached the end of the document
				break
			}
			// Anything other than EOF is a real tokenizing failure.
			return nil, "", errors.New(
				fmt.Sprintf("Desculpe, mas ocorreu um erro ao extrair as tags HTML da pagina passada. %s.", z.Err()))
		}

		if tokenType != html.StartTagToken && tokenType != html.SelfClosingTagToken {
			continue
		}

		token := z.Token()
		switch token.Data {
		case "title":
			// The title text, if any, is the token right after the tag.
			if z.Next() == html.TextToken {
				content.Title = strings.TrimSpace(z.Token().Data)
			}

		case "meta":
			key := ""
			value := ""

			// Pick the identifying attribute and the content attribute.
			for _, attr := range token.Attr {
				switch attr.Key {
				case "property", "name":
					key = attr.Val
				case "content":
					value = attr.Val
				}
			}

			value = strings.TrimSpace(value)
			if value == "" {
				continue
			}

			// "og:site_name"/"twitter:domain" and "og:url"/"twitter:url"
			// are deliberately ignored; the URL in particular is not used
			// because the user could be going through a redirect service.
			switch key {
			case "title", "og:title", "twitter:title":
				content.Title = value
			case "description", "og:description", "twitter:description":
				content.Description = value
			case "og:image", "twitter:image", "twitter:image:src":
				imageUrl = value
			}
		}
	}

	// Limit the size of Title and Description without cutting a multi-byte
	// character in half (a plain byte slice could leave invalid UTF-8).
	content.Title = truncateAtRuneBoundary(content.Title, maxFieldLen)
	content.Description = truncateAtRuneBoundary(content.Description, maxFieldLen)

	// If the description is empty, fill it with a default text.
	if len(content.Description) == 0 {
		content.Description = "Veja o conteudo completo..."
	}

	// Record the host of this content.
	content.Host = resp.Request.URL.Host

	log.Printf("Title: %s\n description: %s\n host:%s\n imageUrl:%s\n",
		content.Title, content.Description, content.Host, imageUrl)

	return content, imageUrl, nil
}

// truncateAtRuneBoundary returns s unchanged when it is at most limit bytes
// long; otherwise it returns the longest prefix of s that is at most limit
// bytes and ends where a character starts.
func truncateAtRuneBoundary(s string, limit int) string {
	if len(s) <= limit {
		return s
	}
	cut := 0
	// Ranging over a string yields the byte offset at which each character
	// begins; keep the last offset that does not exceed limit.
	for i := range s {
		if i > limit {
			break
		}
		cut = i
	}
	return s[:cut]
}