func main() {
	// request and parse the front page
	resp, err := http.Get("https://news.ycombinator.com/")
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	// define a matcher
	matcher := func(n *html.Node) bool {
		// must check for nil values
		if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
			return scrape.Attr(n.Parent.Parent, "class") == "athing"
		}
		return false
	}
	// grab all articles and print them
	articles := scrape.FindAll(root, matcher)
	for i, article := range articles {
		fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
	}
}
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) { nodes := scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "time" == scrape.Attr(n, "class") }) ret = make([]*r.Broadcast, len(nodes)) for index, tim := range nodes { // prepare response bc := r.Broadcast{ BroadcastURL: r.BroadcastURL{ TimeURL: r.TimeURL(*day), }, } // some defaults bc.Language = &lang_de bc.Publisher = &publisher // set start time { div_t := strings.TrimSpace(scrape.Text(tim)) if 5 != len(div_t) { continue } hour := r.MustParseInt(div_t[0:2]) minute := r.MustParseInt(div_t[3:5]) bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone) if index > 0 { ret[index-1].DtEnd = &bc.Time } } for _, tit := range scrape.FindAll(tim.Parent, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Div == n.Parent.DataAtom && "descr" == scrape.Attr(n.Parent, "class") }) { // Title bc.Title = strings.TrimSpace(scrape.Text(tit)) href := scrape.Attr(tit, "href") if "" != href { u, _ := url.Parse(href) bc.Subject = day.Source.ResolveReference(u) } desc_node := tit.Parent desc_node.RemoveChild(tit) description := r.TextWithBrFromNodeSet([]*html.Node{desc_node}) bc.Description = &description // fmt.Fprintf(os.Stderr, "\n") } ret[index] = &bc } // fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String()) if len(nodes) > 0 { midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone) ret[len(nodes)-1].DtEnd = &midnight } return }
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
	client := urlfetch.Client(ctx)
	resp, err := client.Get("http://167.88.16.61:2138/" + url)
	if err != nil {
		// resp is nil when the request fails, so bail out before using it
		ctx.Errorf("%s", err)
		return nil, errors.New("Get listing failed")
	}
	ctx.Debugf("Craigslist request came back with status: %s", resp.Status)
	root, err := html.Parse(resp.Body)
	if err != nil {
		ctx.Errorf("%s", "Parsing Error")
		return nil, errors.New("Parse body failed")
	}
	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		ctx.Errorf("%s", "Error getting title")
		return nil, errors.New("Get title failed")
	}
	price, ok := scrape.Find(root, scrape.ByClass("price"))
	if !ok {
		ctx.Errorf("%s", "Error getting price")
		return nil, errors.New("Get price failed")
	}
	intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
	if err != nil {
		ctx.Errorf("Error casting price: %s", scrape.Text(price))
		return nil, err
	}
	images := scrape.FindAll(root, scrape.ByTag(atom.Img))
	imageUrl := ""
	for _, image := range images {
		if scrape.Attr(image, "title") == "image 1" {
			imageUrl = scrape.Attr(image, "src")
		}
	}
	ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))
	return &Listing{
		Url:      url,
		Title:    scrape.Text(title),
		Price:    intPrice,
		ImageUrl: imageUrl,
	}, nil
}
func findOpenGraphTitle(doc *html.Node) string {
	el, found := scrape.Find(doc, func(n *html.Node) bool {
		if n.DataAtom == atom.Meta {
			return scrape.Attr(n, "property") == "og:title" && scrape.Attr(n, "content") != ""
		}
		return false
	})
	if !found {
		return ""
	}
	return scrape.Attr(el, "content")
}
func findTwitterTitle(doc *html.Node) string {
	el, found := scrape.Find(doc, func(n *html.Node) bool {
		if n.DataAtom == atom.Meta {
			return scrape.Attr(n, "name") == "twitter:title" && scrape.Attr(n, "content") != ""
		}
		return false
	})
	if !found {
		return ""
	}
	return scrape.Attr(el, "content")
}
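// A minimal sketch (not part of the original source) showing how the two
// finders above could be chained, falling back to the document <title>.
// The name findAnyTitle is hypothetical; it only reuses scrape.Find,
// scrape.ByTag and scrape.Text as used elsewhere in these examples.
func findAnyTitle(doc *html.Node) string {
	if t := findOpenGraphTitle(doc); t != "" {
		return t
	}
	if t := findTwitterTitle(doc); t != "" {
		return t
	}
	if el, found := scrape.Find(doc, scrape.ByTag(atom.Title)); found {
		return strings.TrimSpace(scrape.Text(el))
	}
	return ""
}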
func (day *timeURL) parseBroadcastURLsNode(root *html.Node) (ret []*broadcastURL, err error) { const closeDownHour int = 5 for _, h4 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H4 == n.DataAtom }) { year, month, day_, err := timeForH4(scrape.Text(h4), &day.Time) if nil != err { panic(err) } // fmt.Printf("%d-%d-%d %s\n", year, month, day, err) for _, a := range scrape.FindAll(h4.Parent, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Dt == n.Parent.DataAtom }) { m := hourMinuteTitleRegExp.FindStringSubmatch(scrape.Text(a)) if nil == m { panic(errors.New("Couldn't parse <a>")) } ur, _ := url.Parse(scrape.Attr(a, "href")) hour := r.MustParseInt(m[1]) dayOffset := 0 if hour < closeDownHour { dayOffset = 1 } // fmt.Printf("%s %s\n", b.r.TimeURL.String(), b.Title) bcu := broadcastURL(r.BroadcastURL{ TimeURL: r.TimeURL{ Time: time.Date(year, month, day_+dayOffset, hour, r.MustParseInt(m[2]), 0, 0, localLoc), Source: *day.Source.ResolveReference(ur), Station: day.Station, }, Title: strings.TrimSpace(m[3]), }) ret = append(ret, &bcu) } } return }
// Scrape scrapes a site for a keyword
func (q *query) Scrape() []*match {
	// Request the URL
	resp, err := http.Get(q.SiteURL)
	if err != nil {
		log.Fatal("Couldn't GET ", q.SiteURL, ": ", err)
	}
	// Parse the contents of the URL
	root, err := html.Parse(resp.Body)
	if err != nil {
		log.Fatal("Unable to parse response: ", err)
	}
	// Grab all the posts and print them
	posts := scrape.FindAll(root, scrape.ByClass("description"))
	matches := make([]*match, len(posts))
	for i, post := range posts {
		matches[i] = &match{
			Title:       scrape.Text(post.FirstChild.NextSibling),
			Description: scrape.Text(post),
			Link:        "http://kijiji.ca" + scrape.Attr(post.FirstChild.NextSibling, "href"),
			Price:       scrape.Text(post.NextSibling.NextSibling),
			Matched:     false,
		}
	}
	return matches
}
func main() {
	resp, err := http.Get("https://www.reddit.com")
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "id") == "siteTable"
		}
		return false
	}
	table, ok := scrape.Find(root, matcher)
	if !ok {
		panic("could not find the siteTable div")
	}
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "data-type") == "link"
		}
		return false
	}
	articles := scrape.FindAll(table, matcher)
	// parse each article concurrently; every goroutine writes to its own
	// slice index, so no mutex is needed
	posts := make([]Post, len(articles))
	for i := 0; i < len(articles); i++ {
		wg.Add(1)
		go func(i int, n *html.Node) {
			posts[i] = parsepost(n)
			wg.Done()
		}(i, articles[i])
	}
	wg.Wait()
	for i := 0; i < len(posts); i++ {
		printpost(posts[i])
	}
}
func parsepost(n *html.Node) Post { post := Post{} // get the title. uses a scrape inbuilt matcher title_scrape, _ := scrape.Find(n, scrape.ByClass("title")) title := scrape.Text(title_scrape.FirstChild) // get the subreddit. This requires a custom matcher. matcher := func(n *html.Node) bool { if n.DataAtom == atom.A && n.Parent != nil { return scrape.Attr(n, "class") == "subreddit hover may-blank" } return false } sub, _ := scrape.Find(n, matcher) subreddit := scrape.Text(sub) // get the url to the comments. requires custom matcher. matcher = func(n *html.Node) bool { if n.DataAtom == atom.Ul && n.FirstChild != nil { return scrape.Attr(n, "class") == "flat-list buttons" && scrape.Attr(n.FirstChild, "class") == "first" } return false } ul, _ := scrape.Find(n, matcher) // ul is a list of two buttons: one that links to a post's comments page, one a "share" function li := ul.FirstChild // the first list item of ul -- this will always be the comments page link. url := scrape.Attr(li.FirstChild, "href") // finally, the url found in the list item. // get the author. Uses custom matcher and magic. matcher = func(n *html.Node) bool { if n.DataAtom == atom.A && n.Parent.DataAtom == atom.P { return strings.Contains(scrape.Attr(n, "href"), "/user/") } return false } author_scrape, _ := scrape.Find(n, matcher) author := scrape.Text(author_scrape) post.title = title post.subreddit = subreddit post.url = url post.author = author return post }
func getLink(r *html.Node) (s string) {
	buttons := scrape.FindAll(r, scrape.ByClass("downloadbtn"))
	for _, button := range buttons {
		windowLocation := scrape.Attr(button, "onclick")
		link := strings.Split(windowLocation, "=")[1]
		// return the first download link found
		return strings.Trim(link, "'")
	}
	return
}
// Get Time, Source and Image from json html snippet
func (item *calendarItem) parseBroadcastSeedNode(root *html.Node) (bc *broadcastURL, err error) {
	bc = &broadcastURL{}
	bc.Station = *item.Station
	bc.Time = time.Time(item.DateTime)
	for _, a := range scrape.FindAll(root, func(n *html.Node) bool {
		if atom.A != n.DataAtom {
			return false
		}
		href := scrape.Attr(n, "href")
		return strings.HasPrefix(href, "/programm/radio/ausstrahlung-") && strings.HasSuffix(href, ".html")
	}) {
		ru, _ := url.Parse(scrape.Attr(a, "href"))
		bc.Source = *item.Station.ProgramURL.ResolveReference(ru)
	}
	for _, img := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
		ru, _ := url.Parse(scrape.Attr(img, "src"))
		bc.Image = item.Station.ProgramURL.ResolveReference(ru)
	}
	return
}
func (bc *broadcast) parseBroadcastFromHtmlNode(root *html.Node) (ret []*r.Broadcast, err error) { { // Author meta, _ := scrape.Find(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "Author" == scrape.Attr(n, "name") }) if nil != meta { content := scrape.Attr(meta, "content") bc.Author = &content } } for idx, epg := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "epg-content-right" == scrape.Attr(n, "class") }) { if idx != 0 { err = errors.New("There was more than 1 <div class='epg-content-right'/>") return } { // TitleEpisode txt, _ := scrape.Find(epg, func(n *html.Node) bool { return html.TextNode == n.Type && atom.H3 == n.Parent.DataAtom && atom.Br == n.NextSibling.DataAtom }) if nil != txt { t := strings.TrimSpace(r.NormaliseWhiteSpace(txt.Data)) bc.TitleEpisode = &t txt.Parent.RemoveChild(txt.NextSibling) txt.Parent.RemoveChild(txt) } } { // Subject a, _ := scrape.Find(epg, func(n *html.Node) bool { return atom.Div == n.Parent.DataAtom && "sendungsLink" == scrape.Attr(n.Parent, "class") && atom.A == n.DataAtom }) if nil != a { u, _ := url.Parse(scrape.Attr(a, "href")) bc.Subject = bc.Source.ResolveReference(u) } } // purge some cruft for _, nn := range scrape.FindAll(epg, func(n *html.Node) bool { clz := scrape.Attr(n, "class") return atom.H2 == n.DataAtom || "mod modSharing" == clz || "modGalery" == clz || "sendungsLink" == clz || "tabs-container" == clz }) { nn.Parent.RemoveChild(nn) } { description := r.TextWithBrFromNodeSet(scrape.FindAll(epg, func(n *html.Node) bool { return epg == n.Parent })) bc.Description = &description } } bc_ := r.Broadcast(*bc) ret = append(ret, &bc_) return }
func TweetsToUser(u user.User) []tweet.Tweet { reqURL := SearchURL _url.SetQueryParams(&reqURL, map[string]string{ "q": "to:" + u.ScreenName, "f": "tweets", }) res, err := http.Get(reqURL.String()) PanicIf(err) root, err := html.Parse(res.Body) PanicIf(err) tweetsMatcher := func(n *html.Node) bool { return n.DataAtom == atom.Div && strings.HasPrefix(scrape.Attr(n, "class"), "tweet original-tweet") } tweetScreenNameMatcher := func(n *html.Node) bool { return n.DataAtom == atom.Span && strings.HasPrefix(scrape.Attr(n, "class"), "username") } tweetTextMatcher := func(n *html.Node) bool { return n.DataAtom == atom.P && strings.HasSuffix(scrape.Attr(n, "class"), "tweet-text") } tweetNodes := scrape.FindAll(root, tweetsMatcher) tweets := make([]tweet.Tweet, len(tweetNodes)) for i, n := range tweetNodes { t := tweet.Tweet{ ID: scrape.Attr(n, "data-user-id"), } if child, ok := scrape.Find(n, tweetScreenNameMatcher); ok { t.Author = *user.NewUser(scrape.Text(child)) } if child, ok := scrape.Find(n, tweetTextMatcher); ok { t.Text = scrape.Text(child) } tweets[i] = t } return tweets }
func parseVideoInfo(element *html.Node) *YoutubeVideoInfo { var info YoutubeVideoInfo info.ID = scrape.Attr(element, "data-context-item-id") thumbnailContainer, ok := scrape.Find(element, scrape.ByClass("yt-thumb-simple")) if ok { thumbnailImage, ok := scrape.Find(thumbnailContainer, scrape.ByTag(atom.Img)) if ok { info.ThumbnailURL, _ = url.Parse(scrape.Attr(thumbnailImage, "src")) } } videoTimeElement, ok := scrape.Find(element, scrape.ByClass("video-time")) if ok { durationStr := strings.TrimSpace(scrape.Text(videoTimeElement)) info.Length, _ = parseVideoDuration(durationStr) } linkFieldClasses := []string{"yt-lockup-title", "yt-lockup-byline"} linkFieldPtrs := []*string{&info.Title, &info.Author} for i, class := range linkFieldClasses { linkContainer, ok := scrape.Find(element, scrape.ByClass(class)) if ok { link, ok := scrape.Find(linkContainer, scrape.ByTag(atom.A)) if ok { *linkFieldPtrs[i] = strings.TrimSpace(scrape.Text(link)) } } } descBox, ok := scrape.Find(element, scrape.ByClass("yt-lockup-description")) if ok { info.Description = strings.TrimSpace(scrape.Text(descBox)) } return &info }
func eventDetailsToStrArr(eventDetails []*html.Node, eventID int) []string {
	return []string{
		strconv.Itoa(eventID),
		scrape.Text(eventDetails[0]),
		scrape.Text(eventDetails[1]),
		scrape.Text(eventDetails[2]),
		scrape.Text(eventDetails[3]),
		scrape.Text(eventDetails[4]),
		scrape.Text(eventDetails[5]),
		strings.TrimPrefix(
			scrape.Attr(eventDetails[5].FirstChild, "href"), "mailto:"),
	}
}
// History asynchronously fetches the user's
// video viewing history.
// You may provide a cancel channel which you
// can close to cancel the fetch mid-way.
func (y *Youtube) History(cancel <-chan struct{}) (<-chan *YoutubeVideoInfo, <-chan error) {
	videoChan := make(chan *YoutubeVideoInfo)
	errChan := make(chan error, 1)
	go func() {
		defer close(videoChan)
		defer close(errChan)
		historyReq, _ := http.NewRequest("GET", "https://www.youtube.com/feed/history", nil)
		historyReq.Header.Set("User-Agent", spoofedUserAgent)
		resp, err := y.s.Do(historyReq)
		if err != nil {
			errChan <- err
			return
		}
		rootNode, err := html.Parse(resp.Body)
		resp.Body.Close()
		if err != nil {
			errChan <- err
			return
		}
		loadMoreHTML := rootNode
		contentHTML := rootNode
		for {
			items := parseHistoryItems(contentHTML)
			for _, item := range items {
				select {
				case videoChan <- item:
				case <-cancel:
					return
				}
			}
			if loadMoreHTML == nil {
				break
			}
			loadButton, ok := scrape.Find(loadMoreHTML, scrape.ByClass("yt-uix-load-more"))
			if !ok {
				// no "load more" button means the end of the history was reached
				break
			}
			morePath := scrape.Attr(loadButton, "data-uix-load-more-href")
			loadMoreHTML, contentHTML, err = y.fetchMoreHistory(morePath)
			if err != nil {
				errChan <- err
				return
			}
		}
	}()
	return videoChan, errChan
}
func (s *station) parseDayURLsNode(root *html.Node) (ret []timeURL, err error) {
	i := 0
	for _, a := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.A == n.DataAtom && atom.Td == n.Parent.DataAtom
	}) {
		rel := scrape.Attr(a, "href")
		d, err := s.newTimeURL(rel)
		if nil != err {
			continue
		}
		// use only every 3rd day schedule url because each one contains 3 days
		i += 1
		if 2 != i%3 {
			continue
		}
		// fmt.Printf("ok %s\n", d.String())
		ret = append(ret, timeURL(d))
	}
	return
}
func ParseName(n *html.Node) (string, string, string) {
	matcher := func(n *html.Node) bool {
		// must check for nil values
		return n.DataAtom == atom.A && n.Parent != nil && n.Parent.DataAtom == atom.Td
	}
	var name, magnet, desc string
	if detName, ok := scrape.Find(n, scrape.ByClass("detName")); ok {
		name = scrape.Text(detName)
	}
	if anchor, ok := scrape.Find(n, matcher); ok {
		magnet = scrape.Attr(anchor, "href")
	}
	if detDesc, ok := scrape.Find(n, scrape.ByClass("detDesc")); ok {
		desc = scrape.Text(detDesc)
	}
	return name, magnet, desc
}
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) { // fmt.Fprintf(os.Stderr, "%s\n", day.Source.String()) index := 0 for _, at := range scrape.FindAll(root, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Td == n.Parent.DataAtom && atom.Tr == n.Parent.Parent.DataAtom && "time" == scrape.Attr(n.Parent, "class") }) { // prepare response bc := r.Broadcast{ BroadcastURL: r.BroadcastURL{ TimeURL: r.TimeURL(*day), }, } // some defaults bc.Language = &lang_de { publisher := "http://www.deutschlandfunk.de/" if "drk" == day.Station.Identifier { publisher = "http://www.deutschlandradiokultur.de/" } bc.Publisher = &publisher } // set start time { a_id := scrape.Attr(at, "name") if "" == a_id { continue } bc.Source.Fragment = a_id hour := r.MustParseInt(a_id[0:2]) minute := r.MustParseInt(a_id[2:4]) if 24 < hour || 60 < minute { continue } bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone) if index > 0 { ret[index-1].DtEnd = &bc.Time } } // Title for idx, h3 := range scrape.FindAll(at.Parent.Parent, func(n *html.Node) bool { return atom.H3 == n.DataAtom && atom.Td == n.Parent.DataAtom && atom.Tr == n.Parent.Parent.DataAtom && "description" == scrape.Attr(n.Parent, "class") }) { if idx != 0 { err = errors.New("There was more than 1 <tr><td class='description'><h3>") return } // purge 'aufnehmen' link: for _, chi := range scrape.FindAll(h3, func(n *html.Node) bool { return atom.A == n.DataAtom && "psradio" == scrape.Attr(n, "class") }) { h3.RemoveChild(chi) } // fmt.Fprintf(os.Stderr, " '%s'\n", scrape.Text(h3)) for idx, h3_a := range scrape.FindAll(h3, func(n *html.Node) bool { return atom.A == n.DataAtom }) { if idx != 0 { err = errors.New("There was more than 1 <tr><td class='description'><h3><a>") return } bc.Title = scrape.Text(h3_a) u, _ := url.Parse(scrape.Attr(h3_a, "href")) bc.Subject = day.Source.ResolveReference(u) } bc.Title = strings.TrimSpace(bc.Title) if "" == bc.Title { bc.Title = r.TextChildrenNoClimb(h3) } // fmt.Fprintf(os.Stderr, " '%s'", bc.Title) { description := r.TextWithBrFromNodeSet(scrape.FindAll(h3.Parent, func(n *html.Node) bool { return atom.P == n.DataAtom })) bc.Description = &description } } // fmt.Fprintf(os.Stderr, "\n") ret = append(ret, &bc) index += 1 } // fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String()) if index > 0 { midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone) ret[index-1].DtEnd = &midnight } return }
func jobCaptChaUrl(n *html.Node) string {
	img, _ := scrape.Find(n, captchaImageMatcher)
	return baseUrl + scrape.Attr(img, "src")
}

func jobUrl(n *html.Node) string {
	return baseUrl + scrape.Attr(n, "href")
}
	r := <-respCh
	return r
}

func fetchNextPage(keyword string) *html.Node {
	url := jobsKeywordUrl + keyword + jobsNextPageOffset + strconv.Itoa(pager)
	pager += 50
	urlCh <- url
	r := <-respCh
	return r
}

var nextPageMatcher = func(n *html.Node) bool {
	// match the "next 50 results" image
	if n.DataAtom == atom.Img && scrape.Attr(n, "src") == "/UDClasMedia/Arte/Proximos50.gif" {
		return true
	}
	return false
}

var allJobMatcher = func(n *html.Node) bool {
	if n.DataAtom == atom.A && n.Parent.DataAtom == atom.Font && scrape.Attr(n.Parent, "class") == "Ver14nounder" {
		return scrape.Attr(n, "class") == "Ver14nounder"
	}
	return false
}

var descriptionMatcher = func(n *html.Node) bool {
	if n.DataAtom == atom.P && scrape.Attr(n, "class") == "Ver14nounder" {
// Completely re-scrape everything and verify consistence at least of Time, evtl. Title func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bc r.Broadcast, err error) { bc.Station = bcu.Station if "" == bc.Station.Identifier { panic("How can the identifier miss?") } bc.Source = bcu.Source bc.Time = bcu.Time bc.Image = bcu.Image { s := "de" bc.Language = &s } for i, main := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "br-main-text" == scrape.Attr(n, "class") }) { if 1 < i { err = errors.New("unexpected 2nd <div class='br-main-text'> ") return } // Subject for idx, h3 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H3 == n.DataAtom && "Weitere Informationen" == scrape.Text(n) }) { // fmt.Fprintf(os.Stderr, "GET %s\n", "uhu") if idx != 0 { err = errors.New("There was more than 1 <h3>Weitere Informationen") return } for _, a := range scrape.FindAll(h3.Parent, func(n *html.Node) bool { return atom.A == n.DataAtom }) { u, _ := url.Parse(scrape.Attr(a, "href")) bc.Subject = bc.Source.ResolveReference(u) } h3.Parent.Parent.RemoveChild(h3.Parent) } for i1, h2 := range scrape.FindAll(main, func(n *html.Node) bool { return atom.H2 == n.DataAtom }) { if 1 < i1 { err = errors.New("unexpected 2nd <h2> ") return } for i4, em := range scrape.FindAll(h2, func(n *html.Node) bool { return atom.Em == n.DataAtom }) { if 1 < i4 { err = errors.New("unexpected 2nd <em> ") return } bc.Title = scrape.Text(em) em.Parent.RemoveChild(em) } s := scrape.Text(h2) bc.TitleSeries = &s for i2, h3 := range scrape.FindAll(main, func(n *html.Node) bool { return atom.H3 == n.DataAtom }) { if 1 < i2 { err = errors.New("unexpected 2nd <h3> ") return } s := scrape.Text(h3) bc.TitleEpisode = &s h3.Parent.RemoveChild(h3) } inner := h2.Parent.Parent.Parent h2.Parent.RemoveChild(h2) for ch := inner.FirstChild; ch != nil; ch = ch.NextSibling { if atom.Div == ch.DataAtom { inner.RemoveChild(ch) // once removed NextSibling returns nil } } // Description description := r.TextWithBrFromNodeSet(scrape.FindAll(inner, func(n *html.Node) bool { return atom.P == n.DataAtom || atom.Div == n.DataAtom })) bc.Description = &description } } // DtEnd for _, p := range scrape.FindAll(root, func(n *html.Node) bool { return atom.P == n.DataAtom && "br-time" == scrape.Attr(n, "class") }) { m := bcDateRegExp.FindStringSubmatch(scrape.Text(p)) if nil == m { err = errors.New("There was no date match") return } i := r.MustParseInt // bc.Time = time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[4]), i(m[5]), 0, 0, localLoc) t := time.Date(bc.Time.Year(), bc.Time.Month(), bc.Time.Day(), i(m[3]), i(m[4]), 0, 0, localLoc) if bc.Time.Hour() > t.Hour() || (bc.Time.Hour() == t.Hour() && bc.Time.Minute() > t.Minute()) { // after midnight t = t.AddDate(0, 0, 1) } bc.DtEnd = &t } // Modified for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property") }) { if idx != 0 { err = errors.New("There was more than 1 <meta property='og:article:modified_time'/>") return } v, _ := time.Parse(time.RFC3339, scrape.Attr(meta, "content")) bc.Modified = &v } // Author for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name") }) { if idx != 0 { err = errors.New("There was more than 1 <meta name='author'/>") return } s := scrape.Attr(meta, "content") bc.Author = &s } return }
//LivescoreParser parse livescore func LivescoreParser(root *html.Node) []Match { var matches []Match contentElmt, contentOK := scrape.Find(root, scrape.ByClass(classContentTag)) if contentOK { //find all row-gray rowGrayMatcher := func(n *html.Node) bool { classes := strings.Fields(scrape.Attr(n, "class")) for _, c := range classes { if c == classRowGray { parentClasses := strings.Fields(scrape.Attr(n.Parent, "class")) for _, pc := range parentClasses { if pc == classContentTag { return true } } } } return false } rows := scrape.FindAll(contentElmt, rowGrayMatcher) matchChann := make(chan Match) for _, rowElmt := range rows { go func(rowElmt *html.Node) { var time string var homeTeam string var awayTeam string var score string timeElmt, timeElmtOK := scrape.Find(rowElmt, scrape.ByClass(classMinElmt)) if timeElmtOK { time = scrape.Text(timeElmt) } scoreElmt, scoreElmtOK := scrape.Find(rowElmt, scrape.ByClass(classScoreLink)) if scoreElmtOK { score = scrape.Text(scoreElmt) } teamElmts := scrape.FindAll(rowElmt, scrape.ByClass(classPlyElmt)) for i := 0; i < len(teamElmts); i++ { teamElmt := teamElmts[i] if i%2 == 0 { homeTeam = scrape.Text(teamElmt) } else { awayTeam = scrape.Text(teamElmt) } } match := Match{ HomeTeam: homeTeam, AwayTeam: awayTeam, Score: score, Time: time, } matchChann <- match }(rowElmt) } for i := 0; i < len(rows); i++ { select { case m := <-matchChann: matches = append(matches, m) } } close(matchChann) } return matches }
func doScrape(urlString string) AppData { fmt.Println(urlString) u, err := url.Parse(urlString) if err != nil { panic(err) } appData := AppData{} appData.PackageName = u.Query().Get("id") resp, err := http.Get(urlString) if err != nil { panic(err) } root, err := html.Parse(resp.Body) if err != nil { panic(err) } genreMatcher := func(n *html.Node) bool { return scrape.Attr(n, "itemprop") == "genre" } iconMatcher := func(n *html.Node) bool { return scrape.Attr(n, "itemprop") == "image" } softwareVersionMatcher := func(n *html.Node) bool { return scrape.Attr(n, "itemprop") == "softwareVersion" } name, ok := scrape.Find(root, scrape.ByClass("id-app-title")) if ok { appData.Name = scrape.Text(name) } genre, ok := scrape.Find(root, genreMatcher) if ok { appData.Categories = append(appData.Categories, scrape.Text(genre)) } icon, ok := scrape.Find(root, iconMatcher) if ok { iconSrc := scrape.Attr(icon, "src") iconUrl, err := url.Parse(iconSrc) if err != nil { panic(err) } if iconUrl.Scheme == "" { iconSrc = "https:" + iconSrc } resp, err = http.Get(iconSrc) if err != nil { panic(err) } defer resp.Body.Close() outputFile, err := os.Create("output/" + appData.PackageName + ".png") if err != nil { panic(err) } defer outputFile.Close() _, err = io.Copy(outputFile, resp.Body) if err != nil { panic(err) } } version, ok := scrape.Find(root, softwareVersionMatcher) if ok { appData.Version = strings.TrimSpace(scrape.Text(version)) } return appData }
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) { // fmt.Fprintf(os.Stderr, "%s\n", day.Source.String()) index := 0 for _, at := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "si_dayList_starttime" == scrape.Attr(n, "class") }) { // prepare response bc := r.Broadcast{ BroadcastURL: r.BroadcastURL{ TimeURL: r.TimeURL(*day), }, } // some defaults bc.Language = &lang_de bc.Publisher = &publisher empty_str := "" bc.Description = &empty_str // set start time { hhmm := scrape.Text(at) // fmt.Fprintf(os.Stderr, " a_id=%s\n", a_id) hour := r.MustParseInt(hhmm[0:2]) minute := r.MustParseInt(hhmm[3:5]) if 24 < hour || 60 < minute { continue } bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone) if index > 0 { ret[index-1].DtEnd = &bc.Time } } // Title for idx, div := range scrape.FindAll(at.Parent, func(n *html.Node) bool { return atom.Div == n.DataAtom && "si_dayList_description" == scrape.Attr(n, "class") }) { if idx != 0 { err = errors.New("There was more than 1 <div class='si_dayList_description'>") return } bc.Title = scrape.Text(div) // u, _ := url.Parse(scrape.Attr(h3_a, "href")) // bc.Subject = day.Source.ResolveReference(u) bc.Title = strings.TrimSpace(bc.Title) for idx1, a := range scrape.FindAll(div, func(n *html.Node) bool { return atom.A == n.DataAtom }) { if idx1 != 0 { err = errors.New("There was more than 1 <a>") return } u, _ := url.Parse(scrape.Attr(a, "href")) bc.Subject = day.Source.ResolveReference(u) } } // fmt.Fprintf(os.Stderr, "\n") ret = append(ret, &bc) index += 1 } // fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String()) if index > 0 { midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone) ret[index-1].DtEnd = &midnight } return }
// Completely re-scrape everything and verify consistence at least of Time, evtl. Title
func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bcs []r.Broadcast, err error) {
	var bc r.Broadcast
	bc.Station = bcu.Station
	bc.Source = bcu.Source
	{
		s := "de"
		bc.Language = &s
	}
	// Title, TitleSeries, TitleEpisode
	for i, h1 := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.H1 == n.DataAtom && "bcast_headline" == scrape.Attr(n, "class")
	}) {
		if i != 0 {
			err = errors.New("There was more than 1 <h1 class='bcast_headline'>")
			return
		}
		bc.Title = r.TextChildrenNoClimb(h1)
		for _, span := range scrape.FindAll(h1, func(n *html.Node) bool { return atom.Span == n.DataAtom }) {
			switch scrape.Attr(span, "class") {
			case "bcast_overline":
				s := scrape.Text(span)
				bc.TitleSeries = &s
			case "bcast_subtitle":
				s := scrape.Text(span)
				bc.TitleEpisode = &s
			default:
				err = errors.New("unexpected <span> inside <h1>")
				return
			}
			bc.Title = r.TextChildrenNoClimb(h1)
		}
		{
			description := r.TextWithBrFromNodeSet(scrape.FindAll(h1.Parent, func(n *html.Node) bool {
				return atom.P == n.DataAtom && "copytext" == scrape.Attr(n, "class")
			}))
			bc.Description = &description
		}
		if nil == bc.Image {
		FoundImage0:
			for _, di := range scrape.FindAll(h1.Parent, func(n *html.Node) bool {
				return atom.Div == n.DataAtom && "teaser media_video embeddedMedia" == scrape.Attr(n, "class")
			}) {
				for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
					u, _ := url.Parse(scrape.Attr(img, "src"))
					bc.Image = bcu.Source.ResolveReference(u)
					break FoundImage0
				}
			}
		}
		if nil == bc.Image {
		FoundImage1:
			// test some candidates:
			for _, no := range []*html.Node{h1.Parent, root} {
				for _, di := range scrape.FindAll(no, func(n *html.Node) bool {
					return atom.Div == n.DataAtom && "picturebox" == scrape.Attr(n, "class")
				}) {
					for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
						u, _ := url.Parse(scrape.Attr(img, "src"))
						bc.Image = bcu.Source.ResolveReference(u)
						break FoundImage1
					}
				}
			}
		}
	}
	// Time, DtEnd
	for idx, p := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.P == n.DataAtom && "bcast_date" == scrape.Attr(n, "class")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <p class='bcast_date'>")
			return
		}
		m := bcDateRegExp.FindStringSubmatch(scrape.Text(p))
		if nil == m {
			err = errors.New("There was no date match")
			return
		}
		i := r.MustParseInt
		bc.Time = time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[4]), i(m[5]), 0, 0, localLoc)
		t := time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[6]), i(m[7]), 0, 0, localLoc)
		if bc.Time.Hour() > t.Hour() || (bc.Time.Hour() == t.Hour() && bc.Time.Minute() > t.Minute()) {
			// after midnight
			t = t.AddDate(0, 0, 1)
		}
		bc.DtEnd = &t
	}
	// Language
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:locale" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta property='og:locale'/>")
			return
		}
		v := scrape.Attr(meta, "content")[0:2]
		bc.Language = &v
	}
	// Subject
	for idx, a := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.A == n.DataAtom && strings.HasPrefix(scrape.Attr(n, "class"), "link_broadcast media_broadcastSeries")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <a class='link_broadcast media_broadcastSeries'/>")
			return
		}
		u, _ := url.Parse(scrape.Attr(a, "href"))
		bc.Subject = bc.Source.ResolveReference(u)
	}
	// Modified
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta property='og:article:modified_time'/>")
			return
		}
		v, _ := time.Parse(time.RFC3339, scrape.Attr(meta, "content"))
		bc.Modified = &v
	}
	// Author
	for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <meta name='author'/>")
			return
		}
		s := scrape.Attr(meta, "content")
		bc.Author = &s
	}
	if "" == bc.Station.Identifier {
		panic("How can the identifier miss?")
	}
	bcs = append(bcs, bc)
	return
}
// Parse for posts in html from hackernews, input html is an io.Reader and returns recognized posts in a psout slice of posts.
// Errors which affect only a single post are stored in their post.Err
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {
	root, err := html.Parse(body)
	if err != nil {
		err = errors.New("Failed to html.Parse: " + err.Error())
		return
	}
	// define a matcher
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Tr && n.Parent != nil && n.Parent.DataAtom == atom.Tbody {
			matched := scrape.Attr(n, "class") == "athing"
			return matched
		}
		return false
	}
	// grab all articles and loop over them
	articles := scrape.FindAll(root, matcher)
	for _, article := range articles {
		var ok bool
		// Get one post entry
		var titlenode *html.Node
		titlenode, ok = scrape.Find(article, func(n *html.Node) bool {
			if n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "title" {
				return true
			}
			return false
		})
		if !ok {
			continue
		}
		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "hackernews"
		post.Title = scrape.Text(titlenode)
		post.Url = scrape.Attr(titlenode, "href")
		if strings.HasPrefix(post.Url, "item?id=") {
			post.Url = "https://news.ycombinator.com/" + post.Url
		}
		ps = append(ps, &post)
		// Get additional info for this post
		scorenode := article.NextSibling
		if scorenode == nil {
			post.Err = fmt.Errorf("Did not find score for: %s\n", scrape.Text(article))
			continue
		}
		// Get the subtext containing scores, user and date
		subtext, ok := scrape.Find(scorenode, func(n *html.Node) bool {
			if scrape.Attr(n, "class") == "subtext" {
				return true
			}
			return false
		})
		if !ok {
			post.Err = fmt.Errorf("Did not find siblings for subtext %s\n", scorenode.Data)
			continue
		}
		subs := scrape.FindAll(subtext, func(n *html.Node) bool {
			// Get the PostId and Score
			// <span class="score" id="score_9643579">92 points</span>
			if n.DataAtom == atom.Span && scrape.Attr(n, "class") == "score" && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {
				// Get score
				var scoreid int
				scorestr := strings.Split(scrape.Text(n), " ")[0]
				scoreid, err = strconv.Atoi(scorestr)
				if err != nil {
					fmt.Printf("Failed to convert score %s to int: %s\n", scorestr, err.Error())
					return false
				}
				post.Score = scoreid
				// Get PostId
				postidstr := scrape.Attr(n, "id")
				if len(strings.Split(postidstr, "_")) > 1 {
					post.WebPostId = strings.Split(postidstr, "_")[1]
					return true
				}
			}
			// Get the Username and Creation Date for this post
			if scrape.Attr(n.Parent, "class") == "subtext" && n.DataAtom == atom.A && n.Parent != nil {
				href := strings.ToLower(scrape.Attr(n, "href"))
				if href != "" {
					s := strings.Split(href, "?")
					if s[0] == "user" && len(s) > 1 {
						// Username
						u := strings.Split(s[1], "=")
						if len(u) > 1 {
							post.User = u[1]
							return true
						}
					} else {
						if s[0] == "item" && len(s) > 1 {
							// Created date
							createdago := scrape.Text(n)
							if strings.Contains(createdago, "ago") {
								var postDate time.Time
								postDate, err = GetDateFromCreatedAgo(createdago)
								if err != nil {
									err = fmt.Errorf("Failed to convert to date: %v\n", createdago)
									return false
								}
								post.PostDate = postDate
								return true
							}
						}
					}
				}
			} // end "class" == "subtext"
			return false
		})
		if len(subs) == 0 {
			var w bytes.Buffer
			if rerr := html.Render(&w, subtext); rerr != nil {
				fmt.Printf("Render error: %s\n", rerr)
			}
			post.Err = fmt.Errorf("Unable to parse score,user,date from %s:\n %s\n", post.Title, w.String())
		}
	}
	return ps, err
}
func main() { router := gin.Default() router.GET("/movie/amazon/:amazon_id", func(c *gin.Context) { id, valid := validateAndFormatAmazonID(c.Param("amazon_id")) if !valid { c.JSON(http.StatusInternalServerError, gin.H{ "error": "invalid amazon id", "id": id, }) return } resp, err := http.Get("http://www.amazon.de/gp/product/" + id) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{ "error": err, }) return } //item does not exist in amazon.de if resp.StatusCode == http.StatusNotFound { c.JSON(http.StatusNotFound, gin.H{ "error": "product not available", }) return } root, err := html.Parse(resp.Body) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{ "error": err, }) return } actorsMatcher := func(n *html.Node) bool { if n.DataAtom == atom.Dd && n.Parent != nil && n.PrevSibling != nil && n.PrevSibling.PrevSibling != nil { return scrape.Attr(n.Parent, "class") == "dv-meta-info size-small" && scrape.Text(n.PrevSibling.PrevSibling) == "Darsteller:" } return false } posterMatcher := func(n *html.Node) bool { if n.DataAtom == atom.Img && n.Parent != nil { return scrape.Attr(n.Parent, "class") == "dp-meta-icon-container" } return false } //NOTE: Since this is a demo, I assume matchers will always hit a result movie := &Movie{} titleNode, _ := scrape.Find(root, scrape.ById("aiv-content-title")) movie.Title = scrape.Text(titleNode.FirstChild) releaseYearNode, _ := scrape.Find(root, scrape.ByClass("release-year")) year, _ := strconv.Atoi(scrape.Text(releaseYearNode)) movie.ReleaseYear = year actorsNode, _ := scrape.Find(root, actorsMatcher) movie.Actors = strings.Split(scrape.Text(actorsNode), ",") posterNode, _ := scrape.Find(root, posterMatcher) movie.Poster = scrape.Attr(posterNode, "src") movieNodes := scrape.FindAll(root, scrape.ByClass("downloadable_movie")) ids := make([]string, len(movieNodes)) for i, movieNode := range movieNodes { ids[i] = scrape.Attr(movieNode, "data-asin") } movie.SimilarIDs = ids c.JSON(http.StatusOK, movie) }) router.Run(":8080") }
func Search(url string) (string, bool) {
	resp, err := http.Get("https://www.reddit.com/search?q=url%3A" + url + "&sort=new&t=all")
	if err != nil {
		return "", false
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		return "", false
	}
	matcher := func(n *html.Node) bool {
		return scrape.Attr(n, "class") == "search-title may-blank"
	}
	m_comments := func(n *html.Node) bool {
		if n == nil {
			return false
		}
		return scrape.Attr(n, "class") == "search-comments may-blank"
	}
	m_subreddit := func(n *html.Node) bool {
		if n == nil {
			return false
		}
		return scrape.Attr(n, "class") == "search-subreddit-link may-blank"
	}
	m_time := func(n *html.Node) bool {
		if n == nil {
			return false
		}
		return scrape.Attr(n, "datetime") != ""
	}
	post, found := scrape.Find(root, matcher)
	if !found || post == nil || post.Parent == nil || post.Parent.Parent == nil {
		return "", false
	}
	main := post.Parent.Parent
	s_comments := "%error%"
	s_time := "%error%"
	s_subreddit := "%error%"
	title := scrape.Text(post)
	href := scrape.Attr(post, "href")
	if comments, ok := scrape.Find(main, m_comments); ok {
		s_comments = scrape.Text(comments)
	}
	if time, ok := scrape.Find(main, m_time); ok {
		s_time = scrape.Text(time)
	}
	if subreddit, ok := scrape.Find(main, m_subreddit); ok {
		s_subreddit = scrape.Text(subreddit)
	}
	re := regexp.MustCompile("comments/([[:alnum:]]+)/")
	match := re.FindStringSubmatch(href)
	if len(match) < 2 {
		return "", false
	}
	s_url := "https://redd.it/" + match[1]
	s_final := fmt.Sprintf("[Reddit %s] %s (%s) - %s [%s]\n", s_subreddit, title, s_url, s_comments, s_time)
	return s_final, true
}