Пример #1
// main fetches the Hacker News front page, parses it as HTML and prints the
// index, text and href of every node selected by the matcher below.
// NOTE(review): this excerpt has no closing braces and the error checks have
// no bodies; lines were presumably lost during extraction — confirm against
// the original source before compiling.
func main() {
	// request and parse the front page
	resp, err := http.Get("https://news.ycombinator.com/")
	if err != nil {
	root, err := html.Parse(resp.Body)
	if err != nil {

	// define a matcher: an <a> whose grandparent carries class "athing"
	matcher := func(n *html.Node) bool {
		// must check for nil values before walking up two levels
		if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
			return scrape.Attr(n.Parent.Parent, "class") == "athing"
		return false
	// grab all articles and print them
	articles := scrape.FindAll(root, matcher)
	for i, article := range articles {
		fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
Пример #2
// parseBroadcastsFromNode builds one broadcast per <div class="time"> found
// under root. The start time is read from the div's "HH:MM" text, the title,
// subject and description come from <a> elements inside a <div class="descr">
// below the time div's parent. Each broadcast's DtEnd is the start of the next
// one; the last one ends at hour 24 of the day.
// NOTE(review): closing braces and the body of the length check are missing
// from this excerpt — confirm against the original source.
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
	nodes := scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "time" == scrape.Attr(n, "class") })
	ret = make([]*r.Broadcast, len(nodes))
	for index, tim := range nodes {
		// prepare response
		bc := r.Broadcast{
			BroadcastURL: r.BroadcastURL{
				TimeURL: r.TimeURL(*day),
		// some defaults
		bc.Language = &lang_de
		bc.Publisher = &publisher
		// set start time
			div_t := strings.TrimSpace(scrape.Text(tim))
			// the text is sliced as [0:2] and [3:5] below, hence the length-5 expectation
			if 5 != len(div_t) {
			hour := r.MustParseInt(div_t[0:2])
			minute := r.MustParseInt(div_t[3:5])
			bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
			if index > 0 {
				// the previous broadcast ends when this one starts; DtEnd points at this bc's Time field
				ret[index-1].DtEnd = &bc.Time
		for _, tit := range scrape.FindAll(tim.Parent, func(n *html.Node) bool {
			return atom.A == n.DataAtom && atom.Div == n.Parent.DataAtom && "descr" == scrape.Attr(n.Parent, "class")
		}) {
			// Title
			bc.Title = strings.TrimSpace(scrape.Text(tit))
			href := scrape.Attr(tit, "href")
			if "" != href {
				// the url.Parse error is discarded here
				u, _ := url.Parse(href)
				bc.Subject = day.Source.ResolveReference(u)

			desc_node := tit.Parent
			description := r.TextWithBrFromNodeSet([]*html.Node{desc_node})
			bc.Description = &description
			// fmt.Fprintf(os.Stderr, "\n")
		ret[index] = &bc
	// fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String())
	if len(nodes) > 0 {
		// the last broadcast of the day ends at hour 24 of that day
		midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
		ret[len(nodes)-1].DtEnd = &midnight
Пример #3
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
	client := urlfetch.Client(ctx)
	resp, err := client.Get("" + url)
	if err != nil {
		ctx.Errorf("%s", err)
	ctx.Debugf("Craigslist request came back with status: %s", resp.Status)
	if err != nil {
		ctx.Errorf("%s", err)
		return nil, errors.New("Get listing failed")
	root, err := html.Parse(resp.Body)
	if err != nil {
		ctx.Errorf("%s", "Parsing Error")
		return nil, errors.New("Parse body failed")

	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		ctx.Errorf("%s", "Error getting title")
		return nil, errors.New("Get title failed")
	price, ok := scrape.Find(root, scrape.ByClass("price"))
	if !ok {
		ctx.Errorf("%s", "Error getting price")
		return nil, errors.New("Get price failed")
	intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
	if err != nil {
		ctx.Errorf("Error casting price: %s", scrape.Text(price))
		return nil, err
	images := scrape.FindAll(root, scrape.ByTag(atom.Img))
	imageUrl := ""
	for _, image := range images {
		if scrape.Attr(image, "title") == "image 1" {
			imageUrl = scrape.Attr(image, "src")

	ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))

	return &Listing{
		Url:      url,
		Title:    scrape.Text(title),
		Price:    intPrice,
		ImageUrl: imageUrl,
	}, nil
Пример #4
func findOpenGraphTitle(doc *html.Node) string {
	el, found := scrape.Find(doc, func(n *html.Node) bool {
		if n.DataAtom == atom.Meta {
			return scrape.Attr(n, "property") == "og:title" && scrape.Attr(n, "content") != ""

		return false

	if !found {
		return ""

	return scrape.Attr(el, "content")
Пример #5
func findTwitterTitle(doc *html.Node) string {
	el, found := scrape.Find(doc, func(n *html.Node) bool {
		if n.DataAtom == atom.Meta {
			return scrape.Attr(n, "name") == "twitter:title" && scrape.Attr(n, "content") != ""

		return false

	if !found {
		return ""

	return scrape.Attr(el, "content")
Пример #6
// parseBroadcastURLsNode walks every <h4> under root, derives a date from its
// text via timeForH4, and appends one broadcastURL per <a> (whose parent is a
// <dt>) found below the h4's parent. The link text must match
// hourMinuteTitleRegExp (hour, minute, title); otherwise the function panics.
// NOTE(review): closing braces and the body of the timeForH4 error check are
// missing from this excerpt — confirm against the original source.
func (day *timeURL) parseBroadcastURLsNode(root *html.Node) (ret []*broadcastURL, err error) {
	// hours below this value are booked onto the following calendar day
	const closeDownHour int = 5
	for _, h4 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H4 == n.DataAtom }) {
		year, month, day_, err := timeForH4(scrape.Text(h4), &day.Time)
		if nil != err {
		// fmt.Printf("%d-%d-%d %s\n", year, month, day, err)
		for _, a := range scrape.FindAll(h4.Parent, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Dt == n.Parent.DataAtom }) {
			m := hourMinuteTitleRegExp.FindStringSubmatch(scrape.Text(a))
			if nil == m {
				panic(errors.New("Couldn't parse <a>"))
			// the url.Parse error is discarded here
			ur, _ := url.Parse(scrape.Attr(a, "href"))
			hour := r.MustParseInt(m[1])
			dayOffset := 0
			if hour < closeDownHour {
				dayOffset = 1
			// fmt.Printf("%s %s\n", b.r.TimeURL.String(), b.Title)
			bcu := broadcastURL(r.BroadcastURL{
				TimeURL: r.TimeURL{
					Time:    time.Date(year, month, day_+dayOffset, hour, r.MustParseInt(m[2]), 0, 0, localLoc),
					Source:  *day.Source.ResolveReference(ur),
					Station: day.Station,
				Title: strings.TrimSpace(m[3]),
			ret = append(ret, &bcu)
Пример #7
// Scrape downloads q.SiteURL, parses it as HTML and returns one match per
// element with class "description". Title and link are read from the second
// child of that element, the price from its second following sibling; Matched
// is initialised to false. A failed request or parse ends the program through
// log.Fatal.
// NOTE(review): the sibling/child pointers are dereferenced without nil
// checks, and no resp.Body.Close is visible; closing braces are missing from
// this excerpt — confirm against the original source.
func (q *query) Scrape() []*match {

	// Request the URL
	resp, err := http.Get(q.SiteURL)
	if err != nil {
		log.Fatal("Couldn't GET ", q.SiteURL)

	// Parse the contents of the URL
	root, err := html.Parse(resp.Body)
	if err != nil {
		log.Fatal("Unable to parse response")

	// Grab all the posts and convert each into a match
	posts := scrape.FindAll(root, scrape.ByClass("description"))
	matches := make([]*match, len(posts))
	for i, post := range posts {
		matches[i] = &match{
			Title:       scrape.Text(post.FirstChild.NextSibling),
			Description: scrape.Text(post),
			Link:        "http://kijiji.ca" + scrape.Attr(post.FirstChild.NextSibling, "href"),
			Price:       scrape.Text(post.NextSibling.NextSibling),
			Matched:     false,

	return matches
Пример #8
// main fetches the reddit front page, locates the <div id="siteTable">, finds
// every <div data-type="link"> inside it and hands each one to parsepost in
// its own goroutine, collecting the results in posts.
// NOTE(review): the goroutines append to the shared posts slice and no
// synchronisation or wait is visible in this excerpt; closing braces, error
// check bodies and the final loop body are missing — confirm against the
// original source.
func main() {

	resp, err := http.Get("https://www.reddit.com")
	if err != nil {
	root, err := html.Parse(resp.Body)
	if err != nil {

	// first matcher: the <div> with id "siteTable"
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "id") == "siteTable"
		return false
	table, ok := scrape.Find(root, matcher)
	if !ok {
	// second matcher: every <div> with data-type "link"
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "data-type") == "link"
		return false

	articles := scrape.FindAll(table, matcher)
	var posts []Post

	for i := 0; i < len(articles); i++ {
		// one goroutine per article
		go func(n *html.Node) {
			post := parsepost(n)
			posts = append(posts, post)


	for i := 0; i < len(posts); i++ {

Пример #9
// parsepost extracts title, subreddit, comments URL and author from the node
// of a single reddit post and returns them as a Post.
// NOTE(review): every scrape.Find result is used without checking its ok
// flag, so a missing element leads to a nil node being dereferenced; closing
// braces are missing from this excerpt — confirm against the original source.
func parsepost(n *html.Node) Post {
	post := Post{}

	// get the title. uses a scrape inbuilt matcher
	title_scrape, _ := scrape.Find(n, scrape.ByClass("title"))
	title := scrape.Text(title_scrape.FirstChild)

	// get the subreddit. This requires a custom matcher.
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.A && n.Parent != nil {
			return scrape.Attr(n, "class") == "subreddit hover may-blank"
		return false
	sub, _ := scrape.Find(n, matcher)
	subreddit := scrape.Text(sub)

	// get the url to the comments. requires custom matcher.
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Ul && n.FirstChild != nil {
			return scrape.Attr(n, "class") == "flat-list buttons" && scrape.Attr(n.FirstChild, "class") == "first"
		return false
	ul, _ := scrape.Find(n, matcher)          // ul is a list of two buttons: one that links to a post's comments page, one a "share" function
	li := ul.FirstChild                       // the first list item of ul -- this will always be the comments page link.
	url := scrape.Attr(li.FirstChild, "href") // finally, the url found in the list item.

	// get the author: an <a> directly under a <p> whose href contains "/user/".
	matcher = func(n *html.Node) bool {
		// n.Parent is dereferenced without a nil check here, unlike the matchers above
		if n.DataAtom == atom.A && n.Parent.DataAtom == atom.P {
			return strings.Contains(scrape.Attr(n, "href"), "/user/")
		return false
	author_scrape, _ := scrape.Find(n, matcher)
	author := scrape.Text(author_scrape)

	post.title = title
	post.subreddit = subreddit
	post.url = url
	post.author = author

	return post
Пример #10
func getLink(r *html.Node) (s string) {
	buttons := scrape.FindAll(r, scrape.ByClass("downloadbtn"))
	for _, button := range buttons {
		windowLocation := scrape.Attr(button, "onclick")
		link := strings.Split(windowLocation, "=")[1]
		s := strings.Trim(link, "'")
		return s
Пример #11
// parseBroadcastSeedNode fills a new broadcastURL from a calendar item and an
// HTML snippet: Station and Time are copied from item, Source is resolved from
// <a> elements whose href starts with "/programm/radio/ausstrahlung-" and ends
// with ".html", and Image from <img> src attributes. Both are resolved against
// item.Station.ProgramURL; url.Parse errors are discarded.
// NOTE(review): closing braces (and possibly loop exits) are missing from this
// excerpt, so whether the first or the last match wins cannot be told from
// here — confirm against the original source.
func (item *calendarItem) parseBroadcastSeedNode(root *html.Node) (bc *broadcastURL, err error) {
	bc = &broadcastURL{}
	bc.Station = *item.Station
	bc.Time = time.Time(item.DateTime)
	for _, a := range scrape.FindAll(root, func(n *html.Node) bool {
		if atom.A != n.DataAtom {
			return false
		href := scrape.Attr(n, "href")
		return strings.HasPrefix(href, "/programm/radio/ausstrahlung-") && strings.HasSuffix(href, ".html")
	}) {
		ru, _ := url.Parse(scrape.Attr(a, "href"))
		bc.Source = *item.Station.ProgramURL.ResolveReference(ru)
	for _, img := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
		ru, _ := url.Parse(scrape.Attr(img, "src"))
		bc.Image = item.Station.ProgramURL.ResolveReference(ru)
Пример #12
// parseBroadcastFromHtmlNode enriches bc from a broadcast detail page and
// appends a copy of it (converted to r.Broadcast) to ret. It reads the author
// from <meta name="Author">, and episode title, subject link and description
// from the <div class="epg-content-right">; more than one such div sets err.
// NOTE(review): closing braces and several statements (for example whatever
// the "purge some cruft" loop did with nn) are missing from this excerpt —
// confirm against the original source.
func (bc *broadcast) parseBroadcastFromHtmlNode(root *html.Node) (ret []*r.Broadcast, err error) {
		// Author
		meta, _ := scrape.Find(root, func(n *html.Node) bool {
			return atom.Meta == n.DataAtom && "Author" == scrape.Attr(n, "name")
		if nil != meta {
			content := scrape.Attr(meta, "content")
			bc.Author = &content
	for idx, epg := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Div == n.DataAtom && "epg-content-right" == scrape.Attr(n, "class")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <div class='epg-content-right'/>")
			// TitleEpisode: a text node inside <h3> that is followed by a <br>
			txt, _ := scrape.Find(epg, func(n *html.Node) bool {
				// n.Parent and n.NextSibling are dereferenced without nil checks
				return html.TextNode == n.Type && atom.H3 == n.Parent.DataAtom && atom.Br == n.NextSibling.DataAtom
			if nil != txt {
				t := strings.TrimSpace(r.NormaliseWhiteSpace(txt.Data))
				bc.TitleEpisode = &t
			// Subject: the <a> inside <div class="sendungsLink">
			a, _ := scrape.Find(epg, func(n *html.Node) bool {
				return atom.Div == n.Parent.DataAtom && "sendungsLink" == scrape.Attr(n.Parent, "class") && atom.A == n.DataAtom
			if nil != a {
				u, _ := url.Parse(scrape.Attr(a, "href"))
				bc.Subject = bc.Source.ResolveReference(u)
		// purge some cruft
		for _, nn := range scrape.FindAll(epg, func(n *html.Node) bool {
			clz := scrape.Attr(n, "class")
			return atom.H2 == n.DataAtom ||
				"mod modSharing" == clz ||
				"modGalery" == clz ||
				"sendungsLink" == clz ||
				"tabs-container" == clz
		}) {
			// Description: text of the direct children of epg
			description := r.TextWithBrFromNodeSet(scrape.FindAll(epg, func(n *html.Node) bool { return epg == n.Parent }))
			bc.Description = &description
	bc_ := r.Broadcast(*bc)
	ret = append(ret, &bc_)
Пример #13
// TweetsToUser searches SearchURL for "to:<ScreenName>" with f=tweets, parses
// the result page and returns one tweet.Tweet per <div> whose class starts
// with "tweet original-tweet". Author comes from the first <span> with a class
// starting with "username", Text from the <p> whose class ends in "tweet-text".
// NOTE(review): neither err value is checked and no res.Body.Close is visible;
// closing braces are missing from this excerpt — confirm against the original
// source.
func TweetsToUser(u user.User) []tweet.Tweet {
	reqURL := SearchURL
	_url.SetQueryParams(&reqURL, map[string]string{
		"q": "to:" + u.ScreenName,
		"f": "tweets",

	res, err := http.Get(reqURL.String())
	root, err := html.Parse(res.Body)

	tweetsMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Div && strings.HasPrefix(scrape.Attr(n, "class"), "tweet original-tweet")
	tweetScreenNameMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Span && strings.HasPrefix(scrape.Attr(n, "class"), "username")
	tweetTextMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.P && strings.HasSuffix(scrape.Attr(n, "class"), "tweet-text")

	tweetNodes := scrape.FindAll(root, tweetsMatcher)
	tweets := make([]tweet.Tweet, len(tweetNodes))
	for i, n := range tweetNodes {
		t := tweet.Tweet{
			// NOTE(review): the tweet ID is filled from "data-user-id" — verify this is intended
			ID: scrape.Attr(n, "data-user-id"),
		if child, ok := scrape.Find(n, tweetScreenNameMatcher); ok {
			t.Author = *user.NewUser(scrape.Text(child))
		if child, ok := scrape.Find(n, tweetTextMatcher); ok {
			t.Text = scrape.Text(child)
		tweets[i] = t

	return tweets
Пример #14
// parseVideoInfo reads a YoutubeVideoInfo out of one video list element: the
// ID from "data-context-item-id", the thumbnail URL from the <img> inside
// class "yt-thumb-simple", the length from class "video-time", title and
// author from the first link inside "yt-lockup-title" / "yt-lockup-byline",
// and the description from "yt-lockup-description". Fields whose element is
// not found keep their zero value; URL and duration parse errors are dropped.
// NOTE(review): closing braces are missing from this excerpt — confirm against
// the original source.
func parseVideoInfo(element *html.Node) *YoutubeVideoInfo {
	var info YoutubeVideoInfo

	info.ID = scrape.Attr(element, "data-context-item-id")

	thumbnailContainer, ok := scrape.Find(element, scrape.ByClass("yt-thumb-simple"))
	if ok {
		thumbnailImage, ok := scrape.Find(thumbnailContainer, scrape.ByTag(atom.Img))
		if ok {
			info.ThumbnailURL, _ = url.Parse(scrape.Attr(thumbnailImage, "src"))

	videoTimeElement, ok := scrape.Find(element, scrape.ByClass("video-time"))
	if ok {
		durationStr := strings.TrimSpace(scrape.Text(videoTimeElement))
		info.Length, _ = parseVideoDuration(durationStr)

	// the two slices are parallel: class i fills the string that pointer i refers to
	linkFieldClasses := []string{"yt-lockup-title", "yt-lockup-byline"}
	linkFieldPtrs := []*string{&info.Title, &info.Author}
	for i, class := range linkFieldClasses {
		linkContainer, ok := scrape.Find(element, scrape.ByClass(class))
		if ok {
			link, ok := scrape.Find(linkContainer, scrape.ByTag(atom.A))
			if ok {
				*linkFieldPtrs[i] = strings.TrimSpace(scrape.Text(link))

	descBox, ok := scrape.Find(element, scrape.ByClass("yt-lockup-description"))
	if ok {
		info.Description = strings.TrimSpace(scrape.Text(descBox))

	return &info
Пример #15
// eventDetailsToStrArr converts the detail nodes of an event into a slice of
// strings.
// NOTE(review): only one element of the returned literal survives in this
// excerpt (the href of the first child of eventDetails[5]); the remaining
// elements, the use of eventID and the closing braces are missing — confirm
// against the original source.
func eventDetailsToStrArr(eventDetails []*html.Node, eventID int) []string {
	return []string{
			scrape.Attr(eventDetails[5].FirstChild, "href"),
Пример #16
// History asynchronously fetches the user's
// video viewing history.
// You may provide a cancel channel which you
// can close to cancel the fetch mid-way.
//
// It returns a channel of parsed videos and an error channel with a buffer of
// one; both are closed when the background goroutine ends. Pages are followed
// through the "yt-uix-load-more" button's "data-uix-load-more-href".
// NOTE(review): the error from y.s.Do is overwritten by html.Parse before it
// is checked, and the return/break statements and closing braces are missing
// from this excerpt — confirm against the original source.
func (y *Youtube) History(cancel <-chan struct{}) (<-chan *YoutubeVideoInfo, <-chan error) {
	videoChan := make(chan *YoutubeVideoInfo)
	errChan := make(chan error, 1)

	go func() {
		defer close(videoChan)
		defer close(errChan)

		// the error from http.NewRequest is discarded here
		historyReq, _ := http.NewRequest("GET", "https://www.youtube.com/feed/history", nil)
		historyReq.Header.Set("User-Agent", spoofedUserAgent)
		resp, err := y.s.Do(historyReq)
		rootNode, err := html.Parse(resp.Body)
		if err != nil {
			errChan <- err

		// on the first page both the items and the load-more button live in the same document
		loadMoreHTML := rootNode
		contentHTML := rootNode
		for {
			items := parseHistoryItems(contentHTML)
			for _, item := range items {
				// either deliver the item or notice cancellation
				select {
				case videoChan <- item:
				case <-cancel:

			if loadMoreHTML == nil {

			loadButton, ok := scrape.Find(loadMoreHTML, scrape.ByClass("yt-uix-load-more"))
			if ok {
				morePath := scrape.Attr(loadButton, "data-uix-load-more-href")
				loadMoreHTML, contentHTML, err = y.fetchMoreHistory(morePath)
				if err != nil {
					errChan <- err

	return videoChan, errChan
Пример #17
// parseDayURLsNode collects day schedule URLs from every <a> whose parent is a
// <td> under root. Each href is turned into a time URL via s.newTimeURL and a
// running counter selects which links are appended to ret.
// NOTE(review): the bodies of both if statements (presumably continue/skip
// logic) and the closing braces are missing from this excerpt — confirm
// against the original source.
func (s *station) parseDayURLsNode(root *html.Node) (ret []timeURL, err error) {
	i := 0
	for _, a := range scrape.FindAll(root, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Td == n.Parent.DataAtom }) {
		rel := scrape.Attr(a, "href")
		// note: this err shadows the named result err inside the loop
		d, err := s.newTimeURL(rel)
		if nil != err {
		// use only every 3rd day schedule url because each one contains 3 days
		i += 1
		if 2 != i%3 {
		// fmt.Printf("ok %s\n", d.String())
		ret = append(ret, timeURL(d))
Пример #18
Файл: main.go Проект: anykao/p
func ParseName(n *html.Node) (string, string, string) {
	matcher := func(n *html.Node) bool {
		// must check for nil values
		if n.DataAtom == atom.A && n.Parent.DataAtom == atom.Td {
			return true
		return false

	var name, magnet, desc string

	if detName, ok := scrape.Find(n, scrape.ByClass("detName")); ok {
		name = scrape.Text(detName)
	if anchor, ok := scrape.Find(n, matcher); ok {
		magnet = scrape.Attr(anchor, "href")
	if detDesc, ok := scrape.Find(n, scrape.ByClass("detDesc")); ok {
		desc = scrape.Text(detDesc)
	return name, magnet, desc
Пример #19
// parseBroadcastsFromNode builds one broadcast per <a> that sits in a
// <td class="time"> inside a <tr>. The anchor's name attribute supplies the
// URL fragment and the start time (characters 0-1 hour, 2-3 minute); title,
// subject and description are taken from the <h3> in the row's
// <td class="description">. Each broadcast's DtEnd is the start of the next
// one; the last ends at hour 24 of the day.
// NOTE(review): closing braces and the bodies of several if/for statements
// (presumably continue statements and the removal of the "psradio" link) are
// missing from this excerpt — confirm against the original source.
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
	// fmt.Fprintf(os.Stderr, "%s\n", day.Source.String())
	index := 0
	for _, at := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.A == n.DataAtom &&
			atom.Td == n.Parent.DataAtom &&
			atom.Tr == n.Parent.Parent.DataAtom &&
			"time" == scrape.Attr(n.Parent, "class")
	}) {
		// prepare response
		bc := r.Broadcast{
			BroadcastURL: r.BroadcastURL{
				TimeURL: r.TimeURL(*day),

		// some defaults
		bc.Language = &lang_de
			// the publisher depends on the station identifier
			publisher := "http://www.deutschlandfunk.de/"
			if "drk" == day.Station.Identifier {
				publisher = "http://www.deutschlandradiokultur.de/"
			bc.Publisher = &publisher
		// set start time
			a_id := scrape.Attr(at, "name")
			if "" == a_id {
			bc.Source.Fragment = a_id
			hour := r.MustParseInt(a_id[0:2])
			minute := r.MustParseInt(a_id[2:4])
			if 24 < hour || 60 < minute {
			bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
			if index > 0 {
				// the previous broadcast ends when this one starts
				ret[index-1].DtEnd = &bc.Time
		// Title
		for idx, h3 := range scrape.FindAll(at.Parent.Parent, func(n *html.Node) bool {
			return atom.H3 == n.DataAtom &&
				atom.Td == n.Parent.DataAtom &&
				atom.Tr == n.Parent.Parent.DataAtom &&
				"description" == scrape.Attr(n.Parent, "class")
		}) {
			if idx != 0 {
				err = errors.New("There was more than 1 <tr><td class='description'><h3>")
			// purge 'aufnehmen' ("record") link:
			for _, chi := range scrape.FindAll(h3, func(n *html.Node) bool {
				return atom.A == n.DataAtom &&
					"psradio" == scrape.Attr(n, "class")
			}) {
			// fmt.Fprintf(os.Stderr, " '%s'\n", scrape.Text(h3))

			for idx, h3_a := range scrape.FindAll(h3, func(n *html.Node) bool {
				return atom.A == n.DataAtom
			}) {
				if idx != 0 {
					err = errors.New("There was more than 1 <tr><td class='description'><h3><a>")
				bc.Title = scrape.Text(h3_a)
				u, _ := url.Parse(scrape.Attr(h3_a, "href"))
				bc.Subject = day.Source.ResolveReference(u)
			bc.Title = strings.TrimSpace(bc.Title)
			if "" == bc.Title {
				// fall back to the h3's own text when no link supplied a title
				bc.Title = r.TextChildrenNoClimb(h3)
			// fmt.Fprintf(os.Stderr, " '%s'", bc.Title)
				description := r.TextWithBrFromNodeSet(scrape.FindAll(h3.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom }))
				bc.Description = &description
		// fmt.Fprintf(os.Stderr, "\n")
		ret = append(ret, &bc)
		index += 1
	// fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String())
	if index > 0 {
		// the last broadcast of the day ends at hour 24 of that day
		midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
		ret[index-1].DtEnd = &midnight
Пример #20
Файл: co.go Проект: gozes/co
func jobCaptChaUrl(n *html.Node) string {
	img, _ := scrape.Find(n, captchaImageMatcher)
	return baseUrl + scrape.Attr(img, "src")
Пример #21
Файл: co.go Проект: gozes/co
func jobUrl(n *html.Node) string {
	return baseUrl + scrape.Attr(n, "href")
Пример #22
Файл: co.go Проект: gozes/co
	r := <-respCh
	return r


func fetchNextPage(keyword string) *html.Node {
	url := jobsKeywordUrl + keyword + jobsNextPageOffset + strconv.Itoa(pager)
	pager += 50
	urlCh <- url
	r := <-respCh
	return r


// nextPageMatcher inspects a node for the "/UDClasMedia/Arte/Proximos50.gif"
// image.
// NOTE(review): as written it returns false for that image and true for every
// other node, which looks inverted for a matcher of this name; the closing
// braces are missing from this excerpt — confirm against the original source.
var nextPageMatcher = func(n *html.Node) bool {
	if n.DataAtom == atom.Img && scrape.Attr(n, "src") == "/UDClasMedia/Arte/Proximos50.gif" {
		return false
	return true

var allJobMatcher = func(n *html.Node) bool {
	if n.DataAtom == atom.A && n.Parent.DataAtom == atom.Font && scrape.Attr(n.Parent, "class") == "Ver14nounder" {
		return scrape.Attr(n, "class") == "Ver14nounder"

	return false

var descriptionMatcher = func(n *html.Node) bool {
	if n.DataAtom == atom.P && scrape.Attr(n, "class") == "Ver14nounder" {
Пример #23
// parseBroadcastNode re-scrapes a broadcast detail page into bc. Station,
// Source, Time and Image are copied from bcu; subject, titles and description
// come from the <div class="br-main-text">, the end time from
// <p class="br-time">, and Modified / Author from <meta> elements. It panics
// when the station identifier is empty.
// Original note: completely re-scrape everything and verify consistency at
// least of Time, possibly Title.
// NOTE(review): closing braces and probably some statements are missing from
// this excerpt — confirm against the original source.
func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bc r.Broadcast, err error) {
	bc.Station = bcu.Station
	if "" == bc.Station.Identifier {
		panic("How can the identifier miss?")
	bc.Source = bcu.Source
	bc.Time = bcu.Time
	bc.Image = bcu.Image
		s := "de"
		bc.Language = &s

	for i, main := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "br-main-text" == scrape.Attr(n, "class") }) {
		// NOTE(review): "1 < i" is first true for the third match, although the message speaks of the 2nd
		if 1 < i {
			err = errors.New("unexpected 2nd <div class='br-main-text'> ")

		// Subject
		for idx, h3 := range scrape.FindAll(root, func(n *html.Node) bool {
			return atom.H3 == n.DataAtom && "Weitere Informationen" == scrape.Text(n)
		}) {
			// fmt.Fprintf(os.Stderr, "GET %s\n", "uhu")
			if idx != 0 {
				err = errors.New("There was more than 1 <h3>Weitere Informationen")
			for _, a := range scrape.FindAll(h3.Parent, func(n *html.Node) bool {
				return atom.A == n.DataAtom
			}) {
				u, _ := url.Parse(scrape.Attr(a, "href"))
				bc.Subject = bc.Source.ResolveReference(u)


		for i1, h2 := range scrape.FindAll(main, func(n *html.Node) bool { return atom.H2 == n.DataAtom }) {
			if 1 < i1 {
				err = errors.New("unexpected 2nd <h2> ")
			for i4, em := range scrape.FindAll(h2, func(n *html.Node) bool { return atom.Em == n.DataAtom }) {
				if 1 < i4 {
					err = errors.New("unexpected 2nd <em> ")
				bc.Title = scrape.Text(em)
			s := scrape.Text(h2)
			bc.TitleSeries = &s

			for i2, h3 := range scrape.FindAll(main, func(n *html.Node) bool { return atom.H3 == n.DataAtom }) {
				if 1 < i2 {
					err = errors.New("unexpected 2nd <h3> ")
				s := scrape.Text(h3)
				bc.TitleEpisode = &s

			// the container three levels above the <h2>
			inner := h2.Parent.Parent.Parent

			for ch := inner.FirstChild; ch != nil; ch = ch.NextSibling {
				if atom.Div == ch.DataAtom {
					inner.RemoveChild(ch) // once removed NextSibling returns nil

			// Description
			description := r.TextWithBrFromNodeSet(scrape.FindAll(inner, func(n *html.Node) bool { return atom.P == n.DataAtom || atom.Div == n.DataAtom }))
			bc.Description = &description

	// DtEnd
	for _, p := range scrape.FindAll(root, func(n *html.Node) bool { return atom.P == n.DataAtom && "br-time" == scrape.Attr(n, "class") }) {
		m := bcDateRegExp.FindStringSubmatch(scrape.Text(p))
		if nil == m {
			err = errors.New("There was no date match")
		// short alias for the integer parser used on the regexp groups
		i := r.MustParseInt
		// bc.Time = time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[4]), i(m[5]), 0, 0, localLoc)
		t := time.Date(bc.Time.Year(), bc.Time.Month(), bc.Time.Day(), i(m[3]), i(m[4]), 0, 0, localLoc)
		if bc.Time.Hour() > t.Hour() || (bc.Time.Hour() == t.Hour() && bc.Time.Minute() > t.Minute()) { // after midnight
			t = t.AddDate(0, 0, 1)
		bc.DtEnd = &t

	// Modified
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta property='og:article:modified_time'/>")
		// the time.Parse error is discarded; v is the zero time on failure
		v, _ := time.Parse(time.RFC3339, scrape.Attr(meta, "content"))
		bc.Modified = &v

	// Author
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta name='author'/>")
		s := scrape.Attr(meta, "content")
		bc.Author = &s

Пример #24
// LivescoreParser parses a livescore page. It looks up the element with class
// classContentTag, selects the descendants that carry class classRowGray and
// whose parent carries classContentTag, and turns each such row into a Match
// (time, score, home and away team) in its own goroutine. The results are
// gathered from an unbuffered channel, one receive per row, so their order
// follows goroutine completion rather than row order.
// NOTE(review): closing braces are missing from this excerpt — confirm against
// the original source.
func LivescoreParser(root *html.Node) []Match {
	var matches []Match

	contentElmt, contentOK := scrape.Find(root, scrape.ByClass(classContentTag))
	if contentOK {
		//find all row-gray
		rowGrayMatcher := func(n *html.Node) bool {
			classes := strings.Fields(scrape.Attr(n, "class"))
			for _, c := range classes {
				if c == classRowGray {
					parentClasses := strings.Fields(scrape.Attr(n.Parent, "class"))
					for _, pc := range parentClasses {
						if pc == classContentTag {
							return true
			return false
		rows := scrape.FindAll(contentElmt, rowGrayMatcher)

		matchChann := make(chan Match)
		for _, rowElmt := range rows {
			// rowElmt is passed as a parameter so each goroutine works on its own row
			go func(rowElmt *html.Node) {
				var time string
				var homeTeam string
				var awayTeam string
				var score string

				timeElmt, timeElmtOK := scrape.Find(rowElmt, scrape.ByClass(classMinElmt))
				if timeElmtOK {
					time = scrape.Text(timeElmt)

				scoreElmt, scoreElmtOK := scrape.Find(rowElmt, scrape.ByClass(classScoreLink))
				if scoreElmtOK {
					score = scrape.Text(scoreElmt)

				// even-indexed team elements are the home side, odd-indexed the away side
				teamElmts := scrape.FindAll(rowElmt, scrape.ByClass(classPlyElmt))
				for i := 0; i < len(teamElmts); i++ {
					teamElmt := teamElmts[i]
					if i%2 == 0 {
						homeTeam = scrape.Text(teamElmt)
					} else {
						awayTeam = scrape.Text(teamElmt)
				match := Match{
					HomeTeam: homeTeam,
					AwayTeam: awayTeam,
					Score:    score,
					Time:     time,

				matchChann <- match

		// receive exactly one Match per started goroutine
		for i := 0; i < len(rows); i++ {
			select {
			case m := <-matchChann:
				matches = append(matches, m)
	return matches
Пример #25
// doScrape downloads the page at urlString and fills an AppData from it:
// PackageName from the "id" query parameter, Name from class "id-app-title",
// one category from itemprop "genre", Version from itemprop "softwareVersion".
// When an element with itemprop "image" exists, its src is downloaded and
// written to "output/<PackageName>.png".
// NOTE(review): the bodies of all error checks and the closing braces are
// missing from this excerpt, and no Close of the first response body is
// visible — confirm against the original source.
func doScrape(urlString string) AppData {

	u, err := url.Parse(urlString)
	if err != nil {

	appData := AppData{}
	appData.PackageName = u.Query().Get("id")

	resp, err := http.Get(urlString)
	if err != nil {
	root, err := html.Parse(resp.Body)
	if err != nil {

	// matchers keyed on the itemprop attribute
	genreMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "genre"
	iconMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "image"
	softwareVersionMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "softwareVersion"

	name, ok := scrape.Find(root, scrape.ByClass("id-app-title"))
	if ok {
		appData.Name = scrape.Text(name)
	genre, ok := scrape.Find(root, genreMatcher)
	if ok {
		appData.Categories = append(appData.Categories, scrape.Text(genre))
	icon, ok := scrape.Find(root, iconMatcher)
	if ok {
		iconSrc := scrape.Attr(icon, "src")
		iconUrl, err := url.Parse(iconSrc)
		if err != nil {
		// a src without a scheme gets "https:" prepended
		if iconUrl.Scheme == "" {
			iconSrc = "https:" + iconSrc

		resp, err = http.Get(iconSrc)
		if err != nil {
		defer resp.Body.Close()

		outputFile, err := os.Create("output/" + appData.PackageName + ".png")
		if err != nil {
		defer outputFile.Close()

		_, err = io.Copy(outputFile, resp.Body)
		if err != nil {
	version, ok := scrape.Find(root, softwareVersionMatcher)
	if ok {
		appData.Version = strings.TrimSpace(scrape.Text(version))

	return appData
Пример #26
// parseBroadcastsFromNode builds one broadcast per
// <div class="si_dayList_starttime"> under root. The div's text supplies the
// start time (characters 0-1 hour, 3-4 minute); title and subject link come
// from the <div class="si_dayList_description"> below the same parent. The
// description defaults to an empty string. Each broadcast's DtEnd is the start
// of the next one; the last ends at hour 24 of the day.
// NOTE(review): closing braces and the body of the hour/minute range check are
// missing from this excerpt — confirm against the original source.
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
	// fmt.Fprintf(os.Stderr, "%s\n", day.Source.String())
	index := 0
	for _, at := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Div == n.DataAtom && "si_dayList_starttime" == scrape.Attr(n, "class")
	}) {
		// prepare response
		bc := r.Broadcast{
			BroadcastURL: r.BroadcastURL{
				TimeURL: r.TimeURL(*day),
		// some defaults
		bc.Language = &lang_de
		bc.Publisher = &publisher
		empty_str := ""
		bc.Description = &empty_str
		// set start time
			hhmm := scrape.Text(at)
			// fmt.Fprintf(os.Stderr, "  a_id=%s\n", a_id)
			// hhmm is sliced without a length check
			hour := r.MustParseInt(hhmm[0:2])
			minute := r.MustParseInt(hhmm[3:5])
			if 24 < hour || 60 < minute {
			bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
			if index > 0 {
				// the previous broadcast ends when this one starts
				ret[index-1].DtEnd = &bc.Time
		// Title
		for idx, div := range scrape.FindAll(at.Parent, func(n *html.Node) bool {
			return atom.Div == n.DataAtom && "si_dayList_description" == scrape.Attr(n, "class")
		}) {
			if idx != 0 {
				err = errors.New("There was more than 1 <div class='si_dayList_description'>")
			bc.Title = scrape.Text(div)
			//				u, _ := url.Parse(scrape.Attr(h3_a, "href"))
			//			bc.Subject = day.Source.ResolveReference(u)

			bc.Title = strings.TrimSpace(bc.Title)
			// Subject: the link inside the description div
			for idx1, a := range scrape.FindAll(div, func(n *html.Node) bool {
				return atom.A == n.DataAtom
			}) {
				if idx1 != 0 {
					err = errors.New("There was more than 1 <a>")
				u, _ := url.Parse(scrape.Attr(a, "href"))
				bc.Subject = day.Source.ResolveReference(u)
		// fmt.Fprintf(os.Stderr, "\n")
		ret = append(ret, &bc)
		index += 1
	// fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String())
	if index > 0 {
		// the last broadcast of the day ends at hour 24 of that day
		midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
		ret[index-1].DtEnd = &midnight
Пример #27
// parseBroadcastNode re-scrapes a broadcast detail page and appends the
// resulting r.Broadcast to bcs. Station and Source are copied from bcu; titles
// and description come from <h1 class="bcast_headline">, the image from a
// teaser or "picturebox" div, start and end time from <p class="bcast_date">,
// and language, subject, modification time and author from further elements.
// It panics when the station identifier is empty.
// Original note: completely re-scrape everything and verify consistency at
// least of Time, possibly Title.
// NOTE(review): closing braces, the switch's default label and the
// FoundImage0 / FoundImage1 label declarations are missing from this excerpt —
// confirm against the original source.
func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bcs []r.Broadcast, err error) {
	var bc r.Broadcast
	bc.Station = bcu.Station
	bc.Source = bcu.Source
		// default language, may be overridden by og:locale below
		s := "de"
		bc.Language = &s
	// Title, TitleSeries, TitleEpisode
	for i, h1 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H1 == n.DataAtom && "bcast_headline" == scrape.Attr(n, "class") }) {
		if i != 0 {
			err = errors.New("There was more than 1 <h1 class='bcast_headline'>")
		bc.Title = r.TextChildrenNoClimb(h1)
		for _, span := range scrape.FindAll(h1, func(n *html.Node) bool { return atom.Span == n.DataAtom }) {
			switch scrape.Attr(span, "class") {
			case "bcast_overline":
				s := scrape.Text(span)
				bc.TitleSeries = &s
			case "bcast_subtitle":
				s := scrape.Text(span)
				bc.TitleEpisode = &s
				err = errors.New("unexpected <span> inside <h1>")
			bc.Title = r.TextChildrenNoClimb(h1)
			description := r.TextWithBrFromNodeSet(scrape.FindAll(h1.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom && "copytext" == scrape.Attr(n, "class") }))
			bc.Description = &description
		// first image candidate: an <img> inside the embedded-media teaser
		if nil == bc.Image {
			for _, di := range scrape.FindAll(h1.Parent, func(n *html.Node) bool {
				return atom.Div == n.DataAtom && "teaser media_video embeddedMedia" == scrape.Attr(n, "class")
			}) {
				for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
					u, _ := url.Parse(scrape.Attr(img, "src"))
					bc.Image = bcu.Source.ResolveReference(u)
					break FoundImage0
		if nil == bc.Image {
			// test some candidates:
			for _, no := range []*html.Node{h1.Parent, root} {
				for _, di := range scrape.FindAll(no, func(n *html.Node) bool { return atom.Div == n.DataAtom && "picturebox" == scrape.Attr(n, "class") }) {
					for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
						u, _ := url.Parse(scrape.Attr(img, "src"))
						bc.Image = bcu.Source.ResolveReference(u)
						break FoundImage1

	// Time, DtEnd
	for idx, p := range scrape.FindAll(root, func(n *html.Node) bool { return atom.P == n.DataAtom && "bcast_date" == scrape.Attr(n, "class") }) {
		if idx != 0 {
			err = errors.New("There was more than 1 <p class='bcast_date'>")
		m := bcDateRegExp.FindStringSubmatch(scrape.Text(p))
		if nil == m {
			err = errors.New("There was no date match")
		// short alias for the integer parser used on the regexp groups
		i := r.MustParseInt
		// groups 1-3 are used as day, month, year; 4-5 as start and 6-7 as end hour:minute
		bc.Time = time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[4]), i(m[5]), 0, 0, localLoc)
		t := time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[6]), i(m[7]), 0, 0, localLoc)
		if bc.Time.Hour() > t.Hour() || (bc.Time.Hour() == t.Hour() && bc.Time.Minute() > t.Minute()) { // after midnight
			t = t.AddDate(0, 0, 1)
		bc.DtEnd = &t

	// Language
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:locale" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta property='og:locale'/>")
		// only the first two characters of the locale are kept; a shorter content would panic here
		v := scrape.Attr(meta, "content")[0:2]
		bc.Language = &v

	// Subject
	for idx, a := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.A == n.DataAtom && strings.HasPrefix(scrape.Attr(n, "class"), "link_broadcast media_broadcastSeries")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <a class='link_broadcast media_broadcastSeries'/>")
		u, _ := url.Parse(scrape.Attr(a, "href"))
		bc.Subject = bc.Source.ResolveReference(u)

	// Modified
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta property='og:article:modified_time'/>")
		// the time.Parse error is discarded; v is the zero time on failure
		v, _ := time.Parse(time.RFC3339, scrape.Attr(meta, "content"))
		bc.Modified = &v

	// Author
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta name='author'/>")
		s := scrape.Attr(meta, "content")
		bc.Author = &s

	if "" == bc.Station.Identifier {
		panic("How can the identifier miss?")
	bcs = append(bcs, bc)
Пример #28
// Parse for posts in html from hackernews, input html is an io.Reader and returns recognized posts in a psout slice of posts.
// Errors which affect only a single post are stored in their post.Err
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {

	root, err := html.Parse(body)
	if err != nil {
		err = errors.New("Failed to html.Parse: " + err.Error())

	// define a matcher
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Tr && n.Parent != nil && n.Parent.DataAtom == atom.Tbody {
			matched := scrape.Attr(n, "class") == "athing"
			return matched
		return false

	// grab all articles and loop over them
	articles := scrape.FindAll(root, matcher)
	for _, article := range articles {
		var ok bool
		// Get one post entry
		var titlenode *html.Node

		titlenode, ok = scrape.Find(article,
			func(n *html.Node) bool {
				if n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "title" {
					return true
				return false
		if !ok {
		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "hackernews"

		post.Title = scrape.Text(titlenode)
		post.Url = scrape.Attr(titlenode, "href")
		if strings.HasPrefix(post.Url, "item?id=") {
			post.Url = "https://news.ycombinator.com/" + post.Url

		ps = append(ps, &post)

		// Get additional info for this post
		scorenode := article.NextSibling
		if scorenode == nil {
			post.Err = errors.New("Did not find score for: %s\n" + scrape.Text(article))

		// Get the subtext containing scores, user and date
		subtext, ok := scrape.Find(scorenode,
			func(n *html.Node) bool {
				if scrape.Attr(n, "class") == "subtext" {
					return true
				return false
		if !ok {
			post.Err = errors.New(fmt.Sprintf("Did not find siblings for subtext %s\n", scorenode.Data))

		subs := scrape.FindAll(subtext,
			func(n *html.Node) bool {
				// Get the PostId and Score
				// span class="score" id="score_9643579">92 points</span>
				if n.DataAtom == atom.Span && scrape.Attr(n, "class") == "score" && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {

					// Get score
					var scoreid int
					scorestr := strings.Split(scrape.Text(n), " ")[0]
					scoreid, err = strconv.Atoi(scorestr)
					if err != nil {
						fmt.Printf("Failed to convert score %s to int: %s\n", scorestr, err.Error())
						return false
					post.Score = scoreid

					// Get PostId
					postidstr := scrape.Attr(n, "id")
					if len(strings.Split(postidstr, "_")) > 1 {
						post.WebPostId = strings.Split(postidstr, "_")[1]
						return true
				// Get the Username and Creation Date for this post
				if scrape.Attr(n.Parent, "class") == "subtext" && n.DataAtom == atom.A && n.Parent != nil {
					href := strings.ToLower(scrape.Attr(n, "href"))
					if href != "" {
						s := strings.Split(href, "?")
						if s[0] == "user" && len(s) > 1 {
							// Username
							u := strings.Split(s[1], "=")
							if len(u) > 1 {
								post.User = u[1]
								return true
						} else {
							if s[0] == "item" && len(s) > 1 {
								// Created date
								createdago := scrape.Text(n)
								if strings.Contains(createdago, "ago") {
									var postDate time.Time

									postDate, err = GetDateFromCreatedAgo(createdago)
									if err != nil {
										err = errors.New(fmt.Sprintf("Failed to convert to date: %V\n", createdago))
										return false
									post.PostDate = postDate

									return true
				} // end "class" == "subtext"
				return false

		if len(subs) == 0 {
			var w bytes.Buffer
			if rerr := html.Render(&w, subtext); rerr != nil {
				fmt.Printf("Render error: %s\n", rerr)
			post.Err = errors.New(fmt.Sprintf("Unable to parse score,user,date from %s:\n %s\n", post.Title, w.String()))

	return ps, err
Пример #29
func main() {

	router := gin.Default()
	router.GET("/movie/amazon/:amazon_id", func(c *gin.Context) {

		id, valid := validateAndFormatAmazonID(c.Param("amazon_id"))

		if !valid {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": "invalid amazon id",
				"id":    id,

		resp, err := http.Get("http://www.amazon.de/gp/product/" + id)

		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": err,

		//item does not exist in amazon.de
		if resp.StatusCode == http.StatusNotFound {
			c.JSON(http.StatusNotFound, gin.H{
				"error": "product not available",

		root, err := html.Parse(resp.Body)

		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": err,

		actorsMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Dd && n.Parent != nil &&
				n.PrevSibling != nil && n.PrevSibling.PrevSibling != nil {
				return scrape.Attr(n.Parent, "class") == "dv-meta-info size-small" &&
					scrape.Text(n.PrevSibling.PrevSibling) == "Darsteller:"
			return false

		posterMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Img && n.Parent != nil {
				return scrape.Attr(n.Parent, "class") == "dp-meta-icon-container"
			return false

		//NOTE: Since this is a demo, I assume matchers will always hit a result

		movie := &Movie{}

		titleNode, _ := scrape.Find(root, scrape.ById("aiv-content-title"))
		movie.Title = scrape.Text(titleNode.FirstChild)

		releaseYearNode, _ := scrape.Find(root, scrape.ByClass("release-year"))
		year, _ := strconv.Atoi(scrape.Text(releaseYearNode))
		movie.ReleaseYear = year

		actorsNode, _ := scrape.Find(root, actorsMatcher)
		movie.Actors = strings.Split(scrape.Text(actorsNode), ",")

		posterNode, _ := scrape.Find(root, posterMatcher)
		movie.Poster = scrape.Attr(posterNode, "src")

		movieNodes := scrape.FindAll(root, scrape.ByClass("downloadable_movie"))
		ids := make([]string, len(movieNodes))
		for i, movieNode := range movieNodes {
			ids[i] = scrape.Attr(movieNode, "data-asin")
		movie.SimilarIDs = ids

		c.JSON(http.StatusOK, movie)

Пример #30
func Search(url string) (string, bool) {
	resp, err := http.Get("https://www.reddit.com/search?q=url%3A" + url + "&sort=new&t=all")
	if err != nil {
		return "", false
	root, err := html.Parse(resp.Body)
	if err != nil {
		return "", false

	matcher := func(n *html.Node) bool {
		return scrape.Attr(n, "class") == "search-title may-blank"
	m_comments := func(n *html.Node) bool {
		if n == nil {
			return false
		return scrape.Attr(n, "class") == "search-comments may-blank"
	m_subreddit := func(n *html.Node) bool {
		if n == nil {
			return false
		return scrape.Attr(n, "class") == "search-subreddit-link may-blank"
	m_time := func(n *html.Node) bool {
		if n == nil {
			return false
		return scrape.Attr(n, "datetime") != ""

	post, err_ := scrape.Find(root, matcher)
	if post == nil {
		return "", false
	if post.Parent == nil {
		return "", false
	if post.Parent.Parent == nil {
		return "", false
	main := post.Parent.Parent
	s_comments := "%error%"
	s_time := "%error%"
	s_subreddit := "%error%"
	title := scrape.Text(post)
	href := scrape.Attr(post, "href")

	comments, err_ := scrape.Find(main, m_comments)
	if err_ == true {
		s_comments = scrape.Text(comments)
	time, err_ := scrape.Find(main, m_time)
	if err_ == true {
		s_time = scrape.Text(time)
	subreddit, err_ := scrape.Find(main, m_subreddit)
	if err_ == true {
		s_subreddit = scrape.Text(subreddit)

	re := regexp.MustCompile("comments/([[:alnum:]]+)/")
	match := re.FindStringSubmatch(href)
	s_url := "https://redd.it/" + match[1]
	s_final := fmt.Sprintf("[Reddit %s] %s (%s) - %s [%s]\n", s_subreddit, title, s_url, s_comments, s_time)
	return s_final, true