Skip to content

ericchiang/scrape

Folders and files

Name
Last commit message
Last commit date

Latest commit

 

History

1 Commit
 
 
 
 
 
 

Repository files navigation

scrape

A simple, higher-level interface for Go web scraping.

GoDoc

Sample

Scrape defines traversal functions like Find and FindAll while attempting to be generic. It also defines convenience functions such as Attr and Text.

// Parse the response body into an HTML node tree
root, err := html.Parse(resp.Body)
if err != nil {
    // handle error
}
// Search the tree for the <title> element; ok is checked before title is used
title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
if ok {
    // Print the text of the title node
    fmt.Println(scrape.Text(title))
}

A full example: Scraping Hacker News

package main

import (
	"fmt"
	"net/http"

	"github.com/ericchiang/scrape"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// main fetches the Hacker News front page, parses it as HTML, and prints
// the text and href of every link whose grandparent node has the class
// "athing". Any failure along the way aborts the program with a panic.
func main() {
	// request and parse the front page
	resp, err := http.Get("https://news.ycombinator.com/")
	if err != nil {
		panic(err)
	}
	// The body must be closed once we are done with it, otherwise the
	// underlying connection is leaked. Deferred calls still run on panic.
	defer resp.Body.Close()

	// Refuse to parse an error page as if it were the front page.
	if resp.StatusCode != http.StatusOK {
		panic(fmt.Sprintf("unexpected status: %s", resp.Status))
	}

	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	// define a matcher: an <a> element whose grandparent carries the
	// class "athing"
	matcher := func(n *html.Node) bool {
		// the parent and grandparent must be checked for nil before they
		// are dereferenced
		if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
			return scrape.Attr(n.Parent.Parent, "class") == "athing"
		}
		return false
	}

	// grab all articles and print them
	articles := scrape.FindAll(root, matcher)
	for i, article := range articles {
		fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
	}
}

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages