func main() { // request and parse the front page resp, err := http.Get("https://torguard.net/downloads.php") if err != nil { panic(err) } root, err := html.Parse(resp.Body) if err != nil { panic(err) } // define a matcher matcher := func(n *html.Node) bool { // must check for nil values // if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil { if n.DataAtom == atom.Tr { return true } return false } // grab all articles and print them articles := scrape.FindAll(root, matcher) for _, article := range articles { if strings.Contains(scrape.Text(article), "DEBIAN x64Bit") { fmt.Printf("%s\n", scrape.Text(article)) } //fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href")) } }
// Scrape scrapes a site for a keyword
func (q *query) Scrape() []*match {
    // Request the URL
    resp, err := http.Get(q.SiteURL)
    if err != nil {
        log.Fatal("Couldn't GET ", q.SiteURL, ": ", err)
    }

    // Parse the contents of the URL
    root, err := html.Parse(resp.Body)
    if err != nil {
        log.Fatal("Unable to parse response: ", err)
    }

    // Grab all the posts and collect them
    posts := scrape.FindAll(root, scrape.ByClass("description"))
    matches := make([]*match, len(posts))
    for i, post := range posts {
        matches[i] = &match{
            Title:       scrape.Text(post.FirstChild.NextSibling),
            Description: scrape.Text(post),
            Link:        "http://kijiji.ca" + scrape.Attr(post.FirstChild.NextSibling, "href"),
            Price:       scrape.Text(post.NextSibling.NextSibling),
            Matched:     false,
        }
    }
    return matches
}
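// A minimal usage sketch for the Scrape method above, not part of the original
// code. It assumes the query and match types carry the fields referenced in
// Scrape (SiteURL on query; Title, Price, Link on match), and the kijiji.ca
// search URL is only a placeholder.
func exampleScrape() {
    q := &query{SiteURL: "http://www.kijiji.ca/b-buy-sell/canada/bicycle/k0c10l0"}
    for _, m := range q.Scrape() {
        fmt.Printf("%s (%s) - %s\n", m.Title, m.Price, m.Link)
    }
}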
func (day *timeURL) parseBroadcastURLsNode(root *html.Node) (ret []*broadcastURL, err error) {
    const closeDownHour int = 5
    for _, h4 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H4 == n.DataAtom }) {
        year, month, day_, err := timeForH4(scrape.Text(h4), &day.Time)
        if nil != err {
            panic(err)
        }
        // fmt.Printf("%d-%d-%d %s\n", year, month, day, err)
        for _, a := range scrape.FindAll(h4.Parent, func(n *html.Node) bool {
            return atom.A == n.DataAtom && atom.Dt == n.Parent.DataAtom
        }) {
            m := hourMinuteTitleRegExp.FindStringSubmatch(scrape.Text(a))
            if nil == m {
                panic(errors.New("Couldn't parse <a>"))
            }
            ur, _ := url.Parse(scrape.Attr(a, "href"))
            hour := r.MustParseInt(m[1])
            dayOffset := 0
            if hour < closeDownHour {
                dayOffset = 1
            }
            // fmt.Printf("%s %s\n", b.r.TimeURL.String(), b.Title)
            bcu := broadcastURL(r.BroadcastURL{
                TimeURL: r.TimeURL{
                    Time:    time.Date(year, month, day_+dayOffset, hour, r.MustParseInt(m[2]), 0, 0, localLoc),
                    Source:  *day.Source.ResolveReference(ur),
                    Station: day.Station,
                },
                Title: strings.TrimSpace(m[3]),
            })
            ret = append(ret, &bcu)
        }
    }
    return
}
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
    nodes := scrape.FindAll(root, func(n *html.Node) bool {
        return atom.Div == n.DataAtom && "time" == scrape.Attr(n, "class")
    })
    ret = make([]*r.Broadcast, len(nodes))
    for index, tim := range nodes {
        // prepare response
        bc := r.Broadcast{
            BroadcastURL: r.BroadcastURL{
                TimeURL: r.TimeURL(*day),
            },
        }
        // some defaults
        bc.Language = &lang_de
        bc.Publisher = &publisher
        // set start time
        {
            div_t := strings.TrimSpace(scrape.Text(tim))
            if 5 != len(div_t) {
                continue
            }
            hour := r.MustParseInt(div_t[0:2])
            minute := r.MustParseInt(div_t[3:5])
            bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
            if index > 0 {
                ret[index-1].DtEnd = &bc.Time
            }
        }
        for _, tit := range scrape.FindAll(tim.Parent, func(n *html.Node) bool {
            return atom.A == n.DataAtom &&
                atom.Div == n.Parent.DataAtom &&
                "descr" == scrape.Attr(n.Parent, "class")
        }) {
            // Title
            bc.Title = strings.TrimSpace(scrape.Text(tit))
            href := scrape.Attr(tit, "href")
            if "" != href {
                u, _ := url.Parse(href)
                bc.Subject = day.Source.ResolveReference(u)
            }

            desc_node := tit.Parent
            desc_node.RemoveChild(tit)
            description := r.TextWithBrFromNodeSet([]*html.Node{desc_node})
            bc.Description = &description
            // fmt.Fprintf(os.Stderr, "\n")
        }
        ret[index] = &bc
    }
    // fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String())
    if len(nodes) > 0 {
        midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
        ret[len(nodes)-1].DtEnd = &midnight
    }
    return
}
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
    client := urlfetch.Client(ctx)
    resp, err := client.Get("http://167.88.16.61:2138/" + url)
    if err != nil {
        ctx.Errorf("%s", err)
        return nil, errors.New("Get listing failed")
    }
    ctx.Debugf("Craigslist request came back with status: %s", resp.Status)
    root, err := html.Parse(resp.Body)
    if err != nil {
        ctx.Errorf("%s", "Parsing Error")
        return nil, errors.New("Parse body failed")
    }
    title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
    if !ok {
        ctx.Errorf("%s", "Error getting title")
        return nil, errors.New("Get title failed")
    }
    price, ok := scrape.Find(root, scrape.ByClass("price"))
    if !ok {
        ctx.Errorf("%s", "Error getting price")
        return nil, errors.New("Get price failed")
    }
    intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
    if err != nil {
        ctx.Errorf("Error casting price: %s", scrape.Text(price))
        return nil, err
    }
    images := scrape.FindAll(root, scrape.ByTag(atom.Img))
    imageUrl := ""
    for _, image := range images {
        if scrape.Attr(image, "title") == "image 1" {
            imageUrl = scrape.Attr(image, "src")
        }
    }
    ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))
    return &Listing{
        Url:      url,
        Title:    scrape.Text(title),
        Price:    intPrice,
        ImageUrl: imageUrl,
    }, nil
}
func ParseRecord(n *html.Node) Torrent {
    tds := scrape.FindAll(n, scrape.ByTag(atom.Td))
    var size, uptime, uploader string
    if len(tds) == 4 {
        cat := scrape.Text(tds[0])[0:3]
        name, magnet, desc := ParseName(tds[1])
        matches := re.FindStringSubmatch(desc)
        uptime, size, uploader = matches[1], matches[2], matches[3]
        seed := scrape.Text(tds[2])
        leech := scrape.Text(tds[3])
        return Torrent{cat, name, magnet, size, uptime, uploader, seed, leech}
    }
    fmt.Println("Error: unexpected row format")
    return Torrent{}
}
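// A minimal sketch, not part of the original code, of how ParseRecord above
// (together with ParseName, defined further below) might be driven over a
// parsed results page that lists one torrent per <tr>. Feeding every row to
// ParseRecord is an assumption; ParseRecord itself returns a zero Torrent for
// rows that do not have exactly four cells.
func parseAllRecords(root *html.Node) []Torrent {
    var torrents []Torrent
    for _, row := range scrape.FindAll(root, scrape.ByTag(atom.Tr)) {
        torrents = append(torrents, ParseRecord(row))
    }
    return torrents
}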
func main() { // request and parse the front page resp, err := http.Get("https://news.ycombinator.com/") if err != nil { panic(err) } root, err := html.Parse(resp.Body) if err != nil { panic(err) } // define a matcher matcher := func(n *html.Node) bool { // must check for nil values if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil { return scrape.Attr(n.Parent.Parent, "class") == "athing" } return false } // grab all articles and print them articles := scrape.FindAll(root, matcher) for i, article := range articles { fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href")) } }
func fillJobStruct(n *html.Node) *Job {
    job := new(Job)
    job.Title = scrape.Text(n)
    job.RetriveOn = time.Now().Format(time.RFC822Z)
    job.url = jobUrl(n)
    fmt.Println(job.url)
    job.ID = jobID(job.url)
    job.EmailFormLink = jobEmailFromUrl + job.ID

    jp := fetchByID(job.ID)
    job.jobPage = jp

    desc, _ := scrape.Find(job.jobPage, descriptionMatcher)
    job.Description = scrape.Text(desc)

    req, _ := scrape.Find(job.jobPage, requiermentMatcher)
    job.Requierments = scrape.Text(req)
    return job
}
func parsepost(n *html.Node) Post {
    post := Post{}

    // get the title; uses one of scrape's built-in matchers
    title_scrape, _ := scrape.Find(n, scrape.ByClass("title"))
    title := scrape.Text(title_scrape.FirstChild)

    // get the subreddit. This requires a custom matcher.
    matcher := func(n *html.Node) bool {
        if n.DataAtom == atom.A && n.Parent != nil {
            return scrape.Attr(n, "class") == "subreddit hover may-blank"
        }
        return false
    }
    sub, _ := scrape.Find(n, matcher)
    subreddit := scrape.Text(sub)

    // get the url to the comments. requires a custom matcher.
    matcher = func(n *html.Node) bool {
        if n.DataAtom == atom.Ul && n.FirstChild != nil {
            return scrape.Attr(n, "class") == "flat-list buttons" &&
                scrape.Attr(n.FirstChild, "class") == "first"
        }
        return false
    }
    ul, _ := scrape.Find(n, matcher)          // ul holds two buttons: a link to the post's comments page and a "share" control
    li := ul.FirstChild                       // the first list item of ul -- this is always the comments page link
    url := scrape.Attr(li.FirstChild, "href") // finally, the url found in that list item

    // get the author. Uses a custom matcher and magic.
    matcher = func(n *html.Node) bool {
        if n.DataAtom == atom.A && n.Parent != nil && n.Parent.DataAtom == atom.P {
            return strings.Contains(scrape.Attr(n, "href"), "/user/")
        }
        return false
    }
    author_scrape, _ := scrape.Find(n, matcher)
    author := scrape.Text(author_scrape)

    post.title = title
    post.subreddit = subreddit
    post.url = url
    post.author = author
    return post
}
func findHTMLTitle(doc *html.Node) string {
    el, found := scrape.Find(doc, scrape.ByTag(atom.Title))
    if !found {
        return ""
    }
    return scrape.Text(el)
}
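// A minimal sketch, not part of the original code, showing how findHTMLTitle
// might be called on a parsed document. Parsing from a string literal here is
// only for illustration; any io.Reader (for example an http.Response body)
// works the same way.
func exampleFindHTMLTitle() {
    doc, err := html.Parse(strings.NewReader("<html><head><title>Hello</title></head><body></body></html>"))
    if err != nil {
        panic(err)
    }
    fmt.Println(findHTMLTitle(doc)) // prints "Hello"
}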
func main() { // request and parse the front page resp, err := http.Get("https://torguard.net/downloads.php") if err != nil { panic(err) } root, err := html.Parse(resp.Body) if err != nil { panic(err) } rows := scrape.FindAll(root, scrape.ByTag(atom.Tr)) for _, row := range rows { if strings.Contains(scrape.Text(row), "DEBIAN x64") { l := getLink(row) fmt.Printf("%s \n %s \n", scrape.Text(row), l) } } }
// Get returns the set of arrival times for the buses at the given stop.
// Callers must check that no error is returned.
func Get(parada int) (TiemposParada, error) {
    resp, err := http.Get("http://www.auvasa.es/paradamb.asp?codigo=" + strconv.Itoa(parada))
    if err != nil {
        return TiemposParada{}, errors.New("Error al conectar con el servidor de AUVASA.")
    }
    rInUTF8 := transform.NewReader(resp.Body, charmap.Windows1252.NewDecoder())
    root, err := html.Parse(rInUTF8)
    if err != nil {
        return TiemposParada{}, errors.New("Error en la respuesta de AUVASA.")
    }

    headers := scrape.FindAll(root, scrape.ByTag(atom.H1))
    if len(headers) < 2 {
        return TiemposParada{}, errors.New("La parada indicada parece errónea.")
    }

    lineasTiempos := scrape.FindAll(root, scrape.ByClass("style36"))
    resultados := make([]ProximoBus, len(lineasTiempos))
    for i, item := range lineasTiempos {
        valores := scrape.FindAll(item, scrape.ByClass("style38"))
        resultados[i] = ProximoBus{
            Linea:   scrape.Text(valores[0]),
            Destino: scrape.Text(valores[2]),
            Minutos: scrape.Text(valores[3]),
        }
    }

    if len(resultados) == 0 {
        return TiemposParada{}, errors.New("No hay tiempos para la parada especificada. Puede que sea errónea o que ya no haya buses.")
    }

    return TiemposParada{
        Nombre:  scrape.Text(headers[1]),
        Tiempos: resultados,
        Momento: time.Now(),
        Codigo:  parada,
    }, nil
}
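// A minimal usage sketch for Get above, not part of the original code. The
// stop code 811 is only an example value; Nombre, Tiempos, Linea, Destino and
// Minutos are the fields Get fills in.
func exampleGet() {
    tiempos, err := Get(811)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Println(tiempos.Nombre)
    for _, bus := range tiempos.Tiempos {
        fmt.Printf("line %s to %s in %s minutes\n", bus.Linea, bus.Destino, bus.Minutos)
    }
}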
func TweetsToUser(u user.User) []tweet.Tweet {
    reqURL := SearchURL
    _url.SetQueryParams(&reqURL, map[string]string{
        "q": "to:" + u.ScreenName,
        "f": "tweets",
    })
    res, err := http.Get(reqURL.String())
    PanicIf(err)
    root, err := html.Parse(res.Body)
    PanicIf(err)

    tweetsMatcher := func(n *html.Node) bool {
        return n.DataAtom == atom.Div && strings.HasPrefix(scrape.Attr(n, "class"), "tweet original-tweet")
    }
    tweetScreenNameMatcher := func(n *html.Node) bool {
        return n.DataAtom == atom.Span && strings.HasPrefix(scrape.Attr(n, "class"), "username")
    }
    tweetTextMatcher := func(n *html.Node) bool {
        return n.DataAtom == atom.P && strings.HasSuffix(scrape.Attr(n, "class"), "tweet-text")
    }

    tweetNodes := scrape.FindAll(root, tweetsMatcher)
    tweets := make([]tweet.Tweet, len(tweetNodes))
    for i, n := range tweetNodes {
        t := tweet.Tweet{
            ID: scrape.Attr(n, "data-user-id"),
        }
        if child, ok := scrape.Find(n, tweetScreenNameMatcher); ok {
            t.Author = *user.NewUser(scrape.Text(child))
        }
        if child, ok := scrape.Find(n, tweetTextMatcher); ok {
            t.Text = scrape.Text(child)
        }
        tweets[i] = t
    }

    return tweets
}
func ParseName(n *html.Node) (string, string, string) {
    matcher := func(n *html.Node) bool {
        // must check for nil values
        return n.DataAtom == atom.A && n.Parent != nil && n.Parent.DataAtom == atom.Td
    }
    var name, magnet, desc string
    if detName, ok := scrape.Find(n, scrape.ByClass("detName")); ok {
        name = scrape.Text(detName)
    }
    if anchor, ok := scrape.Find(n, matcher); ok {
        magnet = scrape.Attr(anchor, "href")
    }
    if detDesc, ok := scrape.Find(n, scrape.ByClass("detDesc")); ok {
        desc = scrape.Text(detDesc)
    }
    return name, magnet, desc
}
func parseVideoInfo(element *html.Node) *YoutubeVideoInfo {
    var info YoutubeVideoInfo

    info.ID = scrape.Attr(element, "data-context-item-id")

    thumbnailContainer, ok := scrape.Find(element, scrape.ByClass("yt-thumb-simple"))
    if ok {
        thumbnailImage, ok := scrape.Find(thumbnailContainer, scrape.ByTag(atom.Img))
        if ok {
            info.ThumbnailURL, _ = url.Parse(scrape.Attr(thumbnailImage, "src"))
        }
    }

    videoTimeElement, ok := scrape.Find(element, scrape.ByClass("video-time"))
    if ok {
        durationStr := strings.TrimSpace(scrape.Text(videoTimeElement))
        info.Length, _ = parseVideoDuration(durationStr)
    }

    linkFieldClasses := []string{"yt-lockup-title", "yt-lockup-byline"}
    linkFieldPtrs := []*string{&info.Title, &info.Author}
    for i, class := range linkFieldClasses {
        linkContainer, ok := scrape.Find(element, scrape.ByClass(class))
        if ok {
            link, ok := scrape.Find(linkContainer, scrape.ByTag(atom.A))
            if ok {
                *linkFieldPtrs[i] = strings.TrimSpace(scrape.Text(link))
            }
        }
    }

    descBox, ok := scrape.Find(element, scrape.ByClass("yt-lockup-description"))
    if ok {
        info.Description = strings.TrimSpace(scrape.Text(descBox))
    }

    return &info
}
func resolveUrl(website string) string {
    site := getURL(website)

    contents, err := html.Parse(site.Body)
    if err != nil {
        fmt.Printf("%s", err)
        os.Exit(1)
    }

    title, _ := scrape.Find(contents, scrape.ByTag(atom.Title))
    return scrape.Text(title)
}
func queryWikipedia(word string) string {
    word = strings.TrimSpace(word)
    website := "http://en.wikipedia.com/wiki/" + word
    site := getURL(website)

    contents, err := html.Parse(site.Body)
    if err != nil {
        fmt.Printf("%s", err)
        os.Exit(1)
    }

    intro, _ := scrape.Find(contents, scrape.ByTag(atom.P))
    return scrape.Text(intro)
}
// parseServerStatus returns a slice of strings containing only server stats.
func parseServerStatus(root *html.Node) []string {
    var apacheStats []string
    // Lines with stats start with a number.
    var validStats = regexp.MustCompile(`^[0-9]`)
    // Grab all the table rows.
    rows := scrape.FindAll(root, scrape.ByTag(atom.Tr))
    // If a row matches, add it to the stats lines.
    for _, row := range rows {
        content := scrape.Text(row)
        if validStats.MatchString(content) {
            apacheStats = append(apacheStats, content)
        }
    }
    Log(fmt.Sprintf("parseServerStatus apacheStats='%d'", len(apacheStats)), "debug")
    return apacheStats
}
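// A minimal sketch, not part of the original code, of feeding parseServerStatus
// an Apache mod_status page. The URL is a placeholder; any server exposing
// /server-status would do, and the Log helper used above is assumed to exist
// elsewhere in the package.
func exampleServerStatus() {
    resp, err := http.Get("http://localhost/server-status")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    root, err := html.Parse(resp.Body)
    if err != nil {
        panic(err)
    }
    for _, line := range parseServerStatus(root) {
        fmt.Println(line)
    }
}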
func eventDetailsToStrArr(eventDetails []*html.Node, eventID int) []string {
    return []string{
        strconv.Itoa(eventID),
        scrape.Text(eventDetails[0]),
        scrape.Text(eventDetails[1]),
        scrape.Text(eventDetails[2]),
        scrape.Text(eventDetails[3]),
        scrape.Text(eventDetails[4]),
        scrape.Text(eventDetails[5]),
        strings.TrimPrefix(
            scrape.Attr(eventDetails[5].FirstChild, "href"),
            "mailto:"),
    }
}
func scraper() {
    fd, err := os.Open("/mnt/hgfs/Downloads/wiki.html")
    if err != nil {
        panic(err)
    }
    defer fd.Close()

    root, err := html.Parse(fd)
    if err != nil {
        panic(err)
    }

    // find the table by its id, then print every row it contains
    tableMatcher := scrape.ById(tableID)
    for _, table := range scrape.FindAll(root, tableMatcher) {
        rows := scrape.FindAll(table, scrape.ByTag(atom.Tr))
        for _, row := range rows {
            fmt.Printf("%s\n", scrape.Text(row))
        }
    }
}
func login() { client := &http.Client{} // POST /ajax/login.html HTTP/1.1 // Host: learn.infiniteskills.com // Connection: close // Content-Length: 64 // Cache-Control: max-age=0 // Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 // Origin: https://learn.infiniteskills.com // Upgrade-Insecure-Requests: 1 // User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 // Content-Type: application/x-www-form-urlencoded // Referer: https://learn.infiniteskills.com/login.html // Accept-Encoding: gzip, deflate // Accept-Language: en-US,en;q=0.8 // Cookie: is_learn=YmVhdWdhbGJyYWl0aA%3D%3D; iskillslearn=14532496193884 // // username=beaugalbraith&password=+divxfactory-btcob5&remember=yes req, err := http.NewRequest("POST", "https://learn.infiniteskills.com/ajax/login.html", nil) cookie := http.Cookie{ Name: "username", Value: "beaugalbraith", Name: "password", Value: " divxfactory-btcob5", } req.AddCookie(&cookie) req.Header.Set("User-Agent", ua) resp, err := client.Do(req) if err != nil { panic(err) } root, err := html.Parse(resp.Body) if err != nil { panic(err) } fmt.Printf("%s", resp.Header) defer resp.Body.Close() fmt.Printf("%s", scrape.Text(root)) }
func getTitle(url string) string {
    resp, err := http.Get(url)
    if err != nil {
        fmt.Println("error:", err)
        return "error"
    }
    root, err := html.Parse(resp.Body)
    if err != nil {
        fmt.Println("error:", err)
        return "error"
    }
    title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
    if ok {
        return scrape.Text(title)
    }
    return "unknown"
}
func main() { client := &http.Client{} req, err := http.NewRequest("GET", "http://whatsmyuseragent.com/", nil) if err != nil { panic(err) } req.Header.Set("User-Agent", ua) resp, err := client.Do(req) // resp, err := http.Get("http://whatsmyuseragent.com/") if err != nil { panic(err) } root, err := html.Parse(resp.Body) if err != nil { panic(err) } defer resp.Body.Close() info := scrape.ByClass("info") data := scrape.FindAll(root, info) for _, v := range data { fmt.Printf("%s\n", scrape.Text(v)) } }
var (
    eventStringsToMatch = []string{
        eventNameToMatch,
        eventDescriptionToMatch,
        eventDateToMatch,
        eventTimeToMatch,
        eventLocationToMatch,
        eventContactPersonToMatch,
    }

    eventMatcher = func(n *html.Node, textToMatch string) bool {
        if n.DataAtom == atom.Font && n.Parent != nil && n.Parent.Parent != nil {
            parentSibling := n.Parent.PrevSibling
            if parentSibling != nil && parentSibling.FirstChild != nil {
                return strings.Contains(scrape.Text(parentSibling.FirstChild), textToMatch)
            }
        }
        return false
    }

    eventNameMatcher = func(n *html.Node) bool {
        return eventMatcher(n, eventNameToMatch)
    }
    eventDescriptionMatcher = func(n *html.Node) bool {
        return eventMatcher(n, eventDescriptionToMatch)
    }
    eventDateMatcher = func(n *html.Node) bool {
        return eventMatcher(n, eventDateToMatch)
    }
)
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
    // fmt.Fprintf(os.Stderr, "%s\n", day.Source.String())
    index := 0
    for _, at := range scrape.FindAll(root, func(n *html.Node) bool {
        return atom.Div == n.DataAtom && "si_dayList_starttime" == scrape.Attr(n, "class")
    }) {
        // prepare response
        bc := r.Broadcast{
            BroadcastURL: r.BroadcastURL{
                TimeURL: r.TimeURL(*day),
            },
        }
        // some defaults
        bc.Language = &lang_de
        bc.Publisher = &publisher
        empty_str := ""
        bc.Description = &empty_str
        // set start time
        {
            hhmm := scrape.Text(at)
            // fmt.Fprintf(os.Stderr, "  a_id=%s\n", a_id)
            hour := r.MustParseInt(hhmm[0:2])
            minute := r.MustParseInt(hhmm[3:5])
            if 24 < hour || 60 < minute {
                continue
            }
            bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
            if index > 0 {
                ret[index-1].DtEnd = &bc.Time
            }
        }
        // Title
        for idx, div := range scrape.FindAll(at.Parent, func(n *html.Node) bool {
            return atom.Div == n.DataAtom && "si_dayList_description" == scrape.Attr(n, "class")
        }) {
            if idx != 0 {
                err = errors.New("There was more than 1 <div class='si_dayList_description'>")
                return
            }
            bc.Title = scrape.Text(div)
            // u, _ := url.Parse(scrape.Attr(h3_a, "href"))
            // bc.Subject = day.Source.ResolveReference(u)
            bc.Title = strings.TrimSpace(bc.Title)
            for idx1, a := range scrape.FindAll(div, func(n *html.Node) bool { return atom.A == n.DataAtom }) {
                if idx1 != 0 {
                    err = errors.New("There was more than 1 <a>")
                    return
                }
                u, _ := url.Parse(scrape.Attr(a, "href"))
                bc.Subject = day.Source.ResolveReference(u)
            }
        }
        // fmt.Fprintf(os.Stderr, "\n")
        ret = append(ret, &bc)
        index += 1
    }
    // fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String())
    if index > 0 {
        midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
        ret[index-1].DtEnd = &midnight
    }
    return
}
// LivescoreParser parses a livescore page into matches.
func LivescoreParser(root *html.Node) []Match {
    var matches []Match

    contentElmt, contentOK := scrape.Find(root, scrape.ByClass(classContentTag))
    if contentOK {
        // find all row-gray rows whose parent is the content element
        rowGrayMatcher := func(n *html.Node) bool {
            classes := strings.Fields(scrape.Attr(n, "class"))
            for _, c := range classes {
                if c == classRowGray {
                    parentClasses := strings.Fields(scrape.Attr(n.Parent, "class"))
                    for _, pc := range parentClasses {
                        if pc == classContentTag {
                            return true
                        }
                    }
                }
            }
            return false
        }

        rows := scrape.FindAll(contentElmt, rowGrayMatcher)

        matchChann := make(chan Match)
        for _, rowElmt := range rows {
            go func(rowElmt *html.Node) {
                var time string
                var homeTeam string
                var awayTeam string
                var score string

                timeElmt, timeElmtOK := scrape.Find(rowElmt, scrape.ByClass(classMinElmt))
                if timeElmtOK {
                    time = scrape.Text(timeElmt)
                }

                scoreElmt, scoreElmtOK := scrape.Find(rowElmt, scrape.ByClass(classScoreLink))
                if scoreElmtOK {
                    score = scrape.Text(scoreElmt)
                }

                teamElmts := scrape.FindAll(rowElmt, scrape.ByClass(classPlyElmt))
                for i := 0; i < len(teamElmts); i++ {
                    teamElmt := teamElmts[i]
                    if i%2 == 0 {
                        homeTeam = scrape.Text(teamElmt)
                    } else {
                        awayTeam = scrape.Text(teamElmt)
                    }
                }

                match := Match{
                    HomeTeam: homeTeam,
                    AwayTeam: awayTeam,
                    Score:    score,
                    Time:     time,
                }
                matchChann <- match
            }(rowElmt)
        }
        for i := 0; i < len(rows); i++ {
            select {
            case m := <-matchChann:
                matches = append(matches, m)
            }
        }
        close(matchChann)
    }

    return matches
}
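// A minimal usage sketch for LivescoreParser, not part of the original code.
// The URL is a placeholder for whatever livescore page the class constants
// above were written against, and the Match fields printed here are the ones
// LivescoreParser fills in.
func exampleLivescore() {
    resp, err := http.Get("http://www.livescore.com/")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    root, err := html.Parse(resp.Body)
    if err != nil {
        panic(err)
    }
    for _, m := range LivescoreParser(root) {
        fmt.Printf("%s %s - %s (%s)\n", m.Time, m.HomeTeam, m.AwayTeam, m.Score)
    }
}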
// ParseHtmlHackerNews parses the Hacker News front page HTML read from body
// and appends every recognized post to ps, returning the result as psout.
// Errors that affect only a single post are stored in that post's Err field.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {

    root, err := html.Parse(body)
    if err != nil {
        err = errors.New("Failed to html.Parse: " + err.Error())
        return
    }

    // define a matcher
    matcher := func(n *html.Node) bool {
        if n.DataAtom == atom.Tr && n.Parent != nil && n.Parent.DataAtom == atom.Tbody {
            matched := scrape.Attr(n, "class") == "athing"
            return matched
        }
        return false
    }

    // grab all articles and loop over them
    articles := scrape.FindAll(root, matcher)
    for _, article := range articles {
        var ok bool

        // Get one post entry
        var titlenode *html.Node
        titlenode, ok = scrape.Find(article, func(n *html.Node) bool {
            return n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "title"
        })
        if !ok {
            continue
        }

        // Create a new post struct - if the crawling fails the post will have an Err attached
        // but will be added to the outgoing (psout) slice nevertheless
        post := post.NewPost()
        post.Site = "hackernews"
        post.Title = scrape.Text(titlenode)
        post.Url = scrape.Attr(titlenode, "href")
        if strings.HasPrefix(post.Url, "item?id=") {
            post.Url = "https://news.ycombinator.com/" + post.Url
        }
        ps = append(ps, &post)

        // Get additional info for this post
        scorenode := article.NextSibling
        if scorenode == nil {
            post.Err = fmt.Errorf("Did not find score for: %s", scrape.Text(article))
            continue
        }

        // Get the subtext containing scores, user and date
        subtext, ok := scrape.Find(scorenode, func(n *html.Node) bool {
            return scrape.Attr(n, "class") == "subtext"
        })
        if !ok {
            post.Err = fmt.Errorf("Did not find siblings for subtext %s", scorenode.Data)
            continue
        }

        subs := scrape.FindAll(subtext, func(n *html.Node) bool {
            // Get the PostId and Score
            // <span class="score" id="score_9643579">92 points</span>
            if n.DataAtom == atom.Span && scrape.Attr(n, "class") == "score" &&
                n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {

                // Get score
                var scoreid int
                scorestr := strings.Split(scrape.Text(n), " ")[0]
                scoreid, err = strconv.Atoi(scorestr)
                if err != nil {
                    fmt.Printf("Failed to convert score %s to int: %s\n", scorestr, err.Error())
                    return false
                }
                post.Score = scoreid

                // Get PostId
                postidstr := scrape.Attr(n, "id")
                if len(strings.Split(postidstr, "_")) > 1 {
                    post.WebPostId = strings.Split(postidstr, "_")[1]
                    return true
                }
            }

            // Get the Username and Creation Date for this post
            if n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {
                href := strings.ToLower(scrape.Attr(n, "href"))
                if href != "" {
                    s := strings.Split(href, "?")
                    if s[0] == "user" && len(s) > 1 {
                        // Username
                        u := strings.Split(s[1], "=")
                        if len(u) > 1 {
                            post.User = u[1]
                            return true
                        }
                    } else if s[0] == "item" && len(s) > 1 {
                        // Created date
                        createdago := scrape.Text(n)
                        if strings.Contains(createdago, "ago") {
                            var postDate time.Time
                            postDate, err = GetDateFromCreatedAgo(createdago)
                            if err != nil {
                                err = fmt.Errorf("Failed to convert to date: %v", createdago)
                                return false
                            }
                            post.PostDate = postDate
                            return true
                        }
                    }
                }
            } // end "class" == "subtext"
            return false
        })

        if len(subs) == 0 {
            var w bytes.Buffer
            if rerr := html.Render(&w, subtext); rerr != nil {
                fmt.Printf("Render error: %s\n", rerr)
            }
            post.Err = fmt.Errorf("Unable to parse score,user,date from %s:\n %s", post.Title, w.String())
        }
    }
    return ps, err
}
// Completely re-scrape everything and verify consistency at least of Time, and possibly Title
func (bcu *broadcastURL) parseBroadcastNode(root *html.Node) (bcs []r.Broadcast, err error) {
    var bc r.Broadcast
    bc.Station = bcu.Station
    bc.Source = bcu.Source
    {
        s := "de"
        bc.Language = &s
    }
    // Title, TitleSeries, TitleEpisode
    for i, h1 := range scrape.FindAll(root, func(n *html.Node) bool {
        return atom.H1 == n.DataAtom && "bcast_headline" == scrape.Attr(n, "class")
    }) {
        if i != 0 {
            err = errors.New("There was more than 1 <h1 class='bcast_headline'>")
            return
        }
        bc.Title = r.TextChildrenNoClimb(h1)
        for _, span := range scrape.FindAll(h1, func(n *html.Node) bool { return atom.Span == n.DataAtom }) {
            switch scrape.Attr(span, "class") {
            case "bcast_overline":
                s := scrape.Text(span)
                bc.TitleSeries = &s
            case "bcast_subtitle":
                s := scrape.Text(span)
                bc.TitleEpisode = &s
            default:
                err = errors.New("unexpected <span> inside <h1>")
                return
            }
            bc.Title = r.TextChildrenNoClimb(h1)
        }
        {
            description := r.TextWithBrFromNodeSet(scrape.FindAll(h1.Parent, func(n *html.Node) bool {
                return atom.P == n.DataAtom && "copytext" == scrape.Attr(n, "class")
            }))
            bc.Description = &description
        }
        if nil == bc.Image {
        FoundImage0:
            for _, di := range scrape.FindAll(h1.Parent, func(n *html.Node) bool {
                return atom.Div == n.DataAtom && "teaser media_video embeddedMedia" == scrape.Attr(n, "class")
            }) {
                for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
                    u, _ := url.Parse(scrape.Attr(img, "src"))
                    bc.Image = bcu.Source.ResolveReference(u)
                    break FoundImage0
                }
            }
        }
        if nil == bc.Image {
        FoundImage1:
            // test some candidates:
            for _, no := range []*html.Node{h1.Parent, root} {
                for _, di := range scrape.FindAll(no, func(n *html.Node) bool {
                    return atom.Div == n.DataAtom && "picturebox" == scrape.Attr(n, "class")
                }) {
                    for _, img := range scrape.FindAll(di, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
                        u, _ := url.Parse(scrape.Attr(img, "src"))
                        bc.Image = bcu.Source.ResolveReference(u)
                        break FoundImage1
                    }
                }
            }
        }
    }
    // Time, DtEnd
    for idx, p := range scrape.FindAll(root, func(n *html.Node) bool {
        return atom.P == n.DataAtom && "bcast_date" == scrape.Attr(n, "class")
    }) {
        if idx != 0 {
            err = errors.New("There was more than 1 <p class='bcast_date'>")
            return
        }
        m := bcDateRegExp.FindStringSubmatch(scrape.Text(p))
        if nil == m {
            err = errors.New("There was no date match")
            return
        }
        i := r.MustParseInt
        bc.Time = time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[4]), i(m[5]), 0, 0, localLoc)
        t := time.Date(i(m[3]), time.Month(i(m[2])), i(m[1]), i(m[6]), i(m[7]), 0, 0, localLoc)
        if bc.Time.Hour() > t.Hour() || (bc.Time.Hour() == t.Hour() && bc.Time.Minute() > t.Minute()) {
            // after midnight
            t = t.AddDate(0, 0, 1)
        }
        bc.DtEnd = &t
    }
    // Language
    for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
        return atom.Meta == n.DataAtom && "og:locale" == scrape.Attr(n, "property")
    }) {
        if idx != 0 {
            err = errors.New("There was more than 1 <meta property='og:locale'/>")
            return
        }
        v := scrape.Attr(meta, "content")[0:2]
        bc.Language = &v
    }
    // Subject
    for idx, a := range scrape.FindAll(root, func(n *html.Node) bool {
        return atom.A == n.DataAtom && strings.HasPrefix(scrape.Attr(n, "class"), "link_broadcast media_broadcastSeries")
    }) {
        if idx != 0 {
            err = errors.New("There was more than 1 <a class='link_broadcast media_broadcastSeries'/>")
            return
        }
        u, _ := url.Parse(scrape.Attr(a, "href"))
        bc.Subject = bc.Source.ResolveReference(u)
    }
    // Modified
    for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
        return atom.Meta == n.DataAtom && "og:article:modified_time" == scrape.Attr(n, "property")
    }) {
        if idx != 0 {
            err = errors.New("There was more than 1 <meta property='og:article:modified_time'/>")
            return
        }
        v, _ := time.Parse(time.RFC3339, scrape.Attr(meta, "content"))
        bc.Modified = &v
    }
    // Author
    for idx, meta := range scrape.FindAll(root, func(n *html.Node) bool {
        return atom.Meta == n.DataAtom && "author" == scrape.Attr(n, "name")
    }) {
        if idx != 0 {
            err = errors.New("There was more than 1 <meta name='author'/>")
            return
        }
        s := scrape.Attr(meta, "content")
        bc.Author = &s
    }
    if "" == bc.Station.Identifier {
        panic("How can the identifier miss?")
    }
    bcs = append(bcs, bc)
    return
}
func main() {
    router := gin.Default()
    router.GET("/movie/amazon/:amazon_id", func(c *gin.Context) {
        id, valid := validateAndFormatAmazonID(c.Param("amazon_id"))
        if !valid {
            c.JSON(http.StatusInternalServerError, gin.H{
                "error": "invalid amazon id",
                "id":    id,
            })
            return
        }
        resp, err := http.Get("http://www.amazon.de/gp/product/" + id)
        if err != nil {
            c.JSON(http.StatusInternalServerError, gin.H{
                "error": err,
            })
            return
        }
        // item does not exist in amazon.de
        if resp.StatusCode == http.StatusNotFound {
            c.JSON(http.StatusNotFound, gin.H{
                "error": "product not available",
            })
            return
        }
        root, err := html.Parse(resp.Body)
        if err != nil {
            c.JSON(http.StatusInternalServerError, gin.H{
                "error": err,
            })
            return
        }

        actorsMatcher := func(n *html.Node) bool {
            if n.DataAtom == atom.Dd && n.Parent != nil &&
                n.PrevSibling != nil && n.PrevSibling.PrevSibling != nil {
                return scrape.Attr(n.Parent, "class") == "dv-meta-info size-small" &&
                    scrape.Text(n.PrevSibling.PrevSibling) == "Darsteller:"
            }
            return false
        }
        posterMatcher := func(n *html.Node) bool {
            if n.DataAtom == atom.Img && n.Parent != nil {
                return scrape.Attr(n.Parent, "class") == "dp-meta-icon-container"
            }
            return false
        }

        // NOTE: Since this is a demo, I assume matchers will always hit a result
        movie := &Movie{}

        titleNode, _ := scrape.Find(root, scrape.ById("aiv-content-title"))
        movie.Title = scrape.Text(titleNode.FirstChild)

        releaseYearNode, _ := scrape.Find(root, scrape.ByClass("release-year"))
        year, _ := strconv.Atoi(scrape.Text(releaseYearNode))
        movie.ReleaseYear = year

        actorsNode, _ := scrape.Find(root, actorsMatcher)
        movie.Actors = strings.Split(scrape.Text(actorsNode), ",")

        posterNode, _ := scrape.Find(root, posterMatcher)
        movie.Poster = scrape.Attr(posterNode, "src")

        movieNodes := scrape.FindAll(root, scrape.ByClass("downloadable_movie"))
        ids := make([]string, len(movieNodes))
        for i, movieNode := range movieNodes {
            ids[i] = scrape.Attr(movieNode, "data-asin")
        }
        movie.SimilarIDs = ids

        c.JSON(http.StatusOK, movie)
    })
    router.Run(":8080")
}
func Search(url string) (string, bool) {
    resp, err := http.Get("https://www.reddit.com/search?q=url%3A" + url + "&sort=new&t=all")
    if err != nil {
        return "", false
    }
    root, err := html.Parse(resp.Body)
    if err != nil {
        return "", false
    }

    matcher := func(n *html.Node) bool {
        return scrape.Attr(n, "class") == "search-title may-blank"
    }
    m_comments := func(n *html.Node) bool {
        if n == nil {
            return false
        }
        return scrape.Attr(n, "class") == "search-comments may-blank"
    }
    m_subreddit := func(n *html.Node) bool {
        if n == nil {
            return false
        }
        return scrape.Attr(n, "class") == "search-subreddit-link may-blank"
    }
    m_time := func(n *html.Node) bool {
        if n == nil {
            return false
        }
        return scrape.Attr(n, "datetime") != ""
    }

    // scrape.Find reports whether it found a node via its second (bool) result
    post, ok := scrape.Find(root, matcher)
    if !ok || post == nil {
        return "", false
    }
    if post.Parent == nil {
        return "", false
    }
    if post.Parent.Parent == nil {
        return "", false
    }
    main := post.Parent.Parent

    s_comments := "%error%"
    s_time := "%error%"
    s_subreddit := "%error%"

    title := scrape.Text(post)
    href := scrape.Attr(post, "href")

    if comments, ok := scrape.Find(main, m_comments); ok {
        s_comments = scrape.Text(comments)
    }
    if postTime, ok := scrape.Find(main, m_time); ok {
        s_time = scrape.Text(postTime)
    }
    if subreddit, ok := scrape.Find(main, m_subreddit); ok {
        s_subreddit = scrape.Text(subreddit)
    }

    re := regexp.MustCompile("comments/([[:alnum:]]+)/")
    match := re.FindStringSubmatch(href)
    if match == nil {
        return "", false
    }
    s_url := "https://redd.it/" + match[1]

    s_final := fmt.Sprintf("[Reddit %s] %s (%s) - %s [%s]\n", s_subreddit, title, s_url, s_comments, s_time)
    return s_final, true
}