func (bc *broadcast) parseBroadcastFromHtmlNode(root *html.Node) (ret []*r.Broadcast, err error) { { // Author meta, _ := scrape.Find(root, func(n *html.Node) bool { return atom.Meta == n.DataAtom && "Author" == scrape.Attr(n, "name") }) if nil != meta { content := scrape.Attr(meta, "content") bc.Author = &content } } for idx, epg := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "epg-content-right" == scrape.Attr(n, "class") }) { if idx != 0 { err = errors.New("There was more than 1 <div class='epg-content-right'/>") return } { // TitleEpisode txt, _ := scrape.Find(epg, func(n *html.Node) bool { return html.TextNode == n.Type && atom.H3 == n.Parent.DataAtom && atom.Br == n.NextSibling.DataAtom }) if nil != txt { t := strings.TrimSpace(r.NormaliseWhiteSpace(txt.Data)) bc.TitleEpisode = &t txt.Parent.RemoveChild(txt.NextSibling) txt.Parent.RemoveChild(txt) } } { // Subject a, _ := scrape.Find(epg, func(n *html.Node) bool { return atom.Div == n.Parent.DataAtom && "sendungsLink" == scrape.Attr(n.Parent, "class") && atom.A == n.DataAtom }) if nil != a { u, _ := url.Parse(scrape.Attr(a, "href")) bc.Subject = bc.Source.ResolveReference(u) } } // purge some cruft for _, nn := range scrape.FindAll(epg, func(n *html.Node) bool { clz := scrape.Attr(n, "class") return atom.H2 == n.DataAtom || "mod modSharing" == clz || "modGalery" == clz || "sendungsLink" == clz || "tabs-container" == clz }) { nn.Parent.RemoveChild(nn) } { description := r.TextWithBrFromNodeSet(scrape.FindAll(epg, func(n *html.Node) bool { return epg == n.Parent })) bc.Description = &description } } bc_ := r.Broadcast(*bc) ret = append(ret, &bc_) return }
// fetchExtraScheduleInfo gets more information about each component. // // The rootNode argument should be the parsed schedule list view. func fetchExtraScheduleInfo(client *http.Client, courses []Course, rootNode *html.Node) error { psForm, ok := scrape.Find(rootNode, scrape.ByClass("PSForm")) if !ok { return errors.New("could not find PSForm") } icsid, ok := scrape.Find(psForm, scrape.ById("ICSID")) if !ok { return errors.New("could not find ICSID") } formAction := getNodeAttribute(psForm, "action") sid := getNodeAttribute(icsid, "value") // TODO: figure out if there's a way to make this more robust or to load it lazily. sectionIndex := 0 for courseIndex := range courses { course := &courses[courseIndex] for componentIndex := range course.Components { component := &course.Components[componentIndex] postData := generateClassDetailForm(sid, sectionIndex) res, reqErr := client.PostForm(formAction, postData) if res != nil { defer res.Body.Close() } if reqErr != nil { return reqErr } courseOpen, parseErr := parseExtraComponentInfo(res.Body, component) if parseErr != nil { return parseErr } course.Open = &courseOpen postData = generateClassDetailBackForm(sid, sectionIndex) res, reqErr = client.PostForm(formAction, postData) if res != nil { defer res.Body.Close() } if reqErr != nil { return reqErr } sectionIndex++ } } return nil }
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
	client := urlfetch.Client(ctx)
	resp, err := client.Get("http://167.88.16.61:2138/" + url)
	if err != nil {
		// Return before touching resp: it is nil when the request fails.
		ctx.Errorf("%s", err)
		return nil, errors.New("Get listing failed")
	}
	ctx.Debugf("Craigslist request came back with status: %s", resp.Status)
	root, err := html.Parse(resp.Body)
	if err != nil {
		ctx.Errorf("%s", "Parsing Error")
		return nil, errors.New("Parse body failed")
	}
	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		ctx.Errorf("%s", "Error getting title")
		return nil, errors.New("Get title failed")
	}
	price, ok := scrape.Find(root, scrape.ByClass("price"))
	if !ok {
		ctx.Errorf("%s", "Error getting price")
		return nil, errors.New("Get price failed")
	}
	intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
	if err != nil {
		ctx.Errorf("Error casting price: %s", scrape.Text(price))
		return nil, err
	}
	images := scrape.FindAll(root, scrape.ByTag(atom.Img))
	imageUrl := ""
	for _, image := range images {
		if scrape.Attr(image, "title") == "image 1" {
			imageUrl = scrape.Attr(image, "src")
		}
	}
	ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))
	return &Listing{
		Url:      url,
		Title:    scrape.Text(title),
		Price:    intPrice,
		ImageUrl: imageUrl,
	}, nil
}
// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
func (s *Session) Auth(serviceURL, email, password string) error {
	resp, err := s.Get(serviceURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	parsed, err := html.ParseFragment(resp.Body, nil)
	if err != nil {
		return err
	}
	if len(parsed) == 0 {
		// Report this explicitly instead of silently returning a nil error.
		return errors.New("failed to parse login page")
	}
	root := parsed[0]
	form, ok := scrape.Find(root, scrape.ById("gaia_loginform"))
	if !ok {
		return errors.New("failed to process login page")
	}
	submission := url.Values{}
	for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) {
		submission.Add(getAttribute(input, "name"), getAttribute(input, "value"))
	}
	submission["Email"] = []string{email}
	submission["Passwd"] = []string{password}
	postResp, err := s.PostForm(resp.Request.URL.String(), submission)
	if err != nil {
		return err
	}
	postResp.Body.Close()
	if postResp.Request.Method == "POST" {
		return errors.New("login incorrect")
	}
	return nil
}
func TorrentList(url string) ([]Torrent, error) { // request and parse the front page resp, err := http.Get(url) if err != nil { return make([]Torrent, 0), err } root, err := html.Parse(resp.Body) if err != nil { return make([]Torrent, 0), err } var torrents []Torrent if content, ok := scrape.Find(root, scrape.ById("searchResult")); ok { // define a matcher matcher := func(n *html.Node) bool { // must check for nil values if n.DataAtom == atom.Tr && n.Parent.DataAtom == atom.Tbody { return true } return false } // grab all articles and print them trs := scrape.FindAll(content, matcher) for _, tr := range trs { torrents = append(torrents, ParseRecord(tr)) } } resp.Body.Close() return torrents, nil }
func indexPage(page string) (ind map[string]int, branches []string, err error) { resp, err := http.Get(page) if err != nil { return } root, err := html.Parse(resp.Body) resp.Body.Close() if err != nil { return } content, ok := scrape.Find(root, scrape.ById("bodyContent")) if !ok { return nil, nil, errors.New("no bodyContent element") } paragraphs := scrape.FindAll(content, scrape.ByTag(atom.P)) pageText := "" for _, p := range paragraphs { pageText += elementInnerText(p) + " " } words := strings.Fields(strings.ToLower(pageText)) ind = map[string]int{} for _, word := range words { ind[word] = ind[word] + 1 } links := findWikiLinks(content) branches = make([]string, len(links)) for i, link := range links { branches[i] = "https://en.wikipedia.org" + link } return }
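// indexPage above calls elementInnerText and findWikiLinks, which are not part
// of this collection. The sketches below are assumptions about their behaviour:
// the first concatenates the text nodes under an element (the same text-node
// matching used elsewhere in these snippets), the second collects the href
// values of in-article wiki links ("/wiki/...").
func elementInnerText(el *html.Node) string {
	var text string
	for _, t := range scrape.FindAll(el, func(n *html.Node) bool {
		return n.Type == html.TextNode
	}) {
		text += t.Data
	}
	return text
}

func findWikiLinks(root *html.Node) []string {
	var links []string
	for _, a := range scrape.FindAll(root, scrape.ByTag(atom.A)) {
		href := scrape.Attr(a, "href")
		// Skipping namespaced pages (e.g. "File:", "Category:") is an assumption.
		if strings.HasPrefix(href, "/wiki/") && !strings.Contains(href, ":") {
			links = append(links, href)
		}
	}
	return links
}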
func fillJobStruct(n *html.Node) *Job { job := new(Job) job.Title = scrape.Text(n) job.RetriveOn = time.Now().Format(time.RFC822Z) job.url = jobUrl(n) fmt.Println(job.url) job.ID = jobID(job.url) job.EmailFormLink = jobEmailFromUrl + job.ID jp := fetchByID(job.ID) job.jobPage = jp desc, _ := scrape.Find(job.jobPage, descriptionMatcher) job.Description = scrape.Text(desc) req, _ := scrape.Find(job.jobPage, requiermentMatcher) job.Requierments = scrape.Text(req) return job }
// parseGenericLoginForm takes a login page and parses the first form it finds, treating it as the // login form. func parseGenericLoginForm(res *http.Response) (result *loginFormInfo, err error) { parsed, err := html.ParseFragment(res.Body, nil) if err != nil { return } else if len(parsed) != 1 { return nil, errors.New("wrong number of root elements") } root := parsed[0] var form loginFormInfo htmlForm, ok := scrape.Find(root, scrape.ByTag(atom.Form)) if !ok { return nil, errors.New("no form element found") } if actionStr := getNodeAttribute(htmlForm, "action"); actionStr == "" { form.action = res.Request.URL.String() } else { actionURL, err := url.Parse(actionStr) if err != nil { return nil, err } if actionURL.Host == "" { actionURL.Host = res.Request.URL.Host } if actionURL.Scheme == "" { actionURL.Scheme = res.Request.URL.Scheme } if !path.IsAbs(actionURL.Path) { actionURL.Path = path.Join(res.Request.URL.Path, actionURL.Path) } form.action = actionURL.String() } inputs := scrape.FindAll(root, scrape.ByTag(atom.Input)) form.otherFields = url.Values{} for _, input := range inputs { inputName := getNodeAttribute(input, "name") switch getNodeAttribute(input, "type") { case "text": form.usernameField = inputName case "password": form.passwordField = inputName default: form.otherFields.Add(inputName, getNodeAttribute(input, "value")) } } if form.usernameField == "" { return nil, errors.New("no username field found") } else if form.passwordField == "" { return nil, errors.New("no password field found") } return &form, nil }
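// getNodeAttribute (and the similar getAttribute used elsewhere in this
// collection) is not shown here. A minimal sketch, assuming it simply returns
// the value of the named attribute or "" when the attribute is absent, i.e.
// the same behaviour as scrape.Attr:
func getNodeAttribute(node *html.Node, name string) string {
	for _, attr := range node.Attr {
		if attr.Key == name {
			return attr.Val
		}
	}
	return ""
}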
func parsepost(n *html.Node) Post { post := Post{} // get the title. uses a scrape inbuilt matcher title_scrape, _ := scrape.Find(n, scrape.ByClass("title")) title := scrape.Text(title_scrape.FirstChild) // get the subreddit. This requires a custom matcher. matcher := func(n *html.Node) bool { if n.DataAtom == atom.A && n.Parent != nil { return scrape.Attr(n, "class") == "subreddit hover may-blank" } return false } sub, _ := scrape.Find(n, matcher) subreddit := scrape.Text(sub) // get the url to the comments. requires custom matcher. matcher = func(n *html.Node) bool { if n.DataAtom == atom.Ul && n.FirstChild != nil { return scrape.Attr(n, "class") == "flat-list buttons" && scrape.Attr(n.FirstChild, "class") == "first" } return false } ul, _ := scrape.Find(n, matcher) // ul is a list of two buttons: one that links to a post's comments page, one a "share" function li := ul.FirstChild // the first list item of ul -- this will always be the comments page link. url := scrape.Attr(li.FirstChild, "href") // finally, the url found in the list item. // get the author. Uses custom matcher and magic. matcher = func(n *html.Node) bool { if n.DataAtom == atom.A && n.Parent.DataAtom == atom.P { return strings.Contains(scrape.Attr(n, "href"), "/user/") } return false } author_scrape, _ := scrape.Find(n, matcher) author := scrape.Text(author_scrape) post.title = title post.subreddit = subreddit post.url = url post.author = author return post }
func findHTMLTitle(doc *html.Node) string { el, found := scrape.Find(doc, scrape.ByTag(atom.Title)) if !found { return "" } return scrape.Text(el) }
func ParseName(n *html.Node) (string, string, string) { matcher := func(n *html.Node) bool { // must check for nil values if n.DataAtom == atom.A && n.Parent.DataAtom == atom.Td { return true } return false } var name, magnet, desc string if detName, ok := scrape.Find(n, scrape.ByClass("detName")); ok { name = scrape.Text(detName) } if anchor, ok := scrape.Find(n, matcher); ok { magnet = scrape.Attr(anchor, "href") } if detDesc, ok := scrape.Find(n, scrape.ByClass("detDesc")); ok { desc = scrape.Text(detDesc) } return name, magnet, desc }
func TweetsToUser(u user.User) []tweet.Tweet { reqURL := SearchURL _url.SetQueryParams(&reqURL, map[string]string{ "q": "to:" + u.ScreenName, "f": "tweets", }) res, err := http.Get(reqURL.String()) PanicIf(err) root, err := html.Parse(res.Body) PanicIf(err) tweetsMatcher := func(n *html.Node) bool { return n.DataAtom == atom.Div && strings.HasPrefix(scrape.Attr(n, "class"), "tweet original-tweet") } tweetScreenNameMatcher := func(n *html.Node) bool { return n.DataAtom == atom.Span && strings.HasPrefix(scrape.Attr(n, "class"), "username") } tweetTextMatcher := func(n *html.Node) bool { return n.DataAtom == atom.P && strings.HasSuffix(scrape.Attr(n, "class"), "tweet-text") } tweetNodes := scrape.FindAll(root, tweetsMatcher) tweets := make([]tweet.Tweet, len(tweetNodes)) for i, n := range tweetNodes { t := tweet.Tweet{ ID: scrape.Attr(n, "data-user-id"), } if child, ok := scrape.Find(n, tweetScreenNameMatcher); ok { t.Author = *user.NewUser(scrape.Text(child)) } if child, ok := scrape.Find(n, tweetTextMatcher); ok { t.Text = scrape.Text(child) } tweets[i] = t } return tweets }
func resolveUrl(website string) string {
	site := getURL(website)
	contents, err := html.Parse(site.Body)
	if err != nil {
		fmt.Printf("%s", err)
		os.Exit(1)
	}
	title, _ := scrape.Find(contents, scrape.ByTag(atom.Title))
	return scrape.Text(title)
}
func findOpenGraphTitle(doc *html.Node) string { el, found := scrape.Find(doc, func(n *html.Node) bool { if n.DataAtom == atom.Meta { return scrape.Attr(n, "property") == "og:title" && scrape.Attr(n, "content") != "" } return false }) if !found { return "" } return scrape.Attr(el, "content") }
func findTwitterTitle(doc *html.Node) string { el, found := scrape.Find(doc, func(n *html.Node) bool { if n.DataAtom == atom.Meta { return scrape.Attr(n, "name") == "twitter:title" && scrape.Attr(n, "content") != "" } return false }) if !found { return "" } return scrape.Attr(el, "content") }
func queryWikipedia(word string) string {
	word = strings.TrimSpace(word)
	website := "http://en.wikipedia.com/wiki/" + word
	site := getURL(website)
	contents, err := html.Parse(site.Body)
	if err != nil {
		fmt.Printf("%s", err)
		panic(err)
	}
	intro, _ := scrape.Find(contents, scrape.ByTag(atom.P))
	return scrape.Text(intro)
}
// History asynchronously fetches the user's
// video viewing history.
// You may provide a cancel channel which you
// can close to cancel the fetch mid-way.
func (y *Youtube) History(cancel <-chan struct{}) (<-chan *YoutubeVideoInfo, <-chan error) {
	videoChan := make(chan *YoutubeVideoInfo)
	errChan := make(chan error, 1)
	go func() {
		defer close(videoChan)
		defer close(errChan)
		historyReq, _ := http.NewRequest("GET", "https://www.youtube.com/feed/history", nil)
		historyReq.Header.Set("User-Agent", spoofedUserAgent)
		resp, err := y.s.Do(historyReq)
		if err != nil {
			// Check the request error before touching resp; resp is nil on failure.
			errChan <- err
			return
		}
		rootNode, err := html.Parse(resp.Body)
		resp.Body.Close()
		if err != nil {
			errChan <- err
			return
		}
		loadMoreHTML := rootNode
		contentHTML := rootNode
		for {
			items := parseHistoryItems(contentHTML)
			for _, item := range items {
				select {
				case videoChan <- item:
				case <-cancel:
					return
				}
			}
			if loadMoreHTML == nil {
				break
			}
			loadButton, ok := scrape.Find(loadMoreHTML, scrape.ByClass("yt-uix-load-more"))
			if !ok {
				// Without this break the loop would spin forever once the
				// "load more" button disappears.
				break
			}
			morePath := scrape.Attr(loadButton, "data-uix-load-more-href")
			loadMoreHTML, contentHTML, err = y.fetchMoreHistory(morePath)
			if err != nil {
				errChan <- err
				return
			}
		}
	}()
	return videoChan, errChan
}
func main() {
	resp, err := http.Get("https://www.reddit.com")
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "id") == "siteTable"
		}
		return false
	}
	table, ok := scrape.Find(root, matcher)
	if !ok {
		panic("could not find siteTable")
	}
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "data-type") == "link"
		}
		return false
	}
	articles := scrape.FindAll(table, matcher)
	var posts []Post
	var mu sync.Mutex // protects posts; the goroutines below append concurrently
	for i := 0; i < len(articles); i++ {
		wg.Add(1)
		go func(n *html.Node) {
			post := parsepost(n)
			mu.Lock()
			posts = append(posts, post)
			mu.Unlock()
			wg.Done()
		}(articles[i])
	}
	wg.Wait()
	for i := 0; i < len(posts); i++ {
		printpost(posts[i])
	}
}
// parseSchedule parses the courses from the schedule list view page.
//
// Extra per-component information is not fetched here; see
// fetchExtraScheduleInfo for that.
func parseSchedule(rootNode *html.Node) ([]Course, error) {
	courseTables := scrape.FindAll(rootNode, scrape.ByClass("PSGROUPBOXWBO"))
	result := make([]Course, 0, len(courseTables))
	for _, classTable := range courseTables {
		titleElement, ok := scrape.Find(classTable, scrape.ByClass("PAGROUPDIVIDER"))
		if !ok {
			// This will occur at least once, since the filter options are a PSGROUPBOXWBO.
			continue
		}
		infoTables := scrape.FindAll(classTable, scrape.ByClass("PSLEVEL3GRIDNBO"))
		if len(infoTables) != 2 {
			return nil, errors.New("expected exactly 2 info tables but found " + strconv.Itoa(len(infoTables)))
		}
		courseInfoTable := infoTables[0]
		course, err := parseCourseInfoTable(courseInfoTable)
		if err != nil {
			return nil, err
		}
		// NOTE: there isn't really a standard way to parse the department/number.
		course.Name = nodeInnerText(titleElement)
		componentsInfoTable := infoTables[1]
		componentMaps, err := tableEntriesAsMaps(componentsInfoTable)
		if err != nil {
			return nil, err
		}
		course.Components = make([]Component, len(componentMaps))
		for i, componentMap := range componentMaps {
			course.Components[i], err = parseComponentInfoMap(componentMap)
			if err != nil {
				return nil, err
			}
		}
		result = append(result, course)
	}
	return result, nil
}
func getTitle(url string) string { resp, err := http.Get(url) if err != nil { fmt.Println("error:", err) return "error" } root, err := html.Parse(resp.Body) if err != nil { fmt.Println("error:", err) return "error" } title, ok := scrape.Find(root, scrape.ByTag(atom.Title)) if ok { return scrape.Text(title) } return "unknown" }
func parseVideoInfo(element *html.Node) *YoutubeVideoInfo { var info YoutubeVideoInfo info.ID = scrape.Attr(element, "data-context-item-id") thumbnailContainer, ok := scrape.Find(element, scrape.ByClass("yt-thumb-simple")) if ok { thumbnailImage, ok := scrape.Find(thumbnailContainer, scrape.ByTag(atom.Img)) if ok { info.ThumbnailURL, _ = url.Parse(scrape.Attr(thumbnailImage, "src")) } } videoTimeElement, ok := scrape.Find(element, scrape.ByClass("video-time")) if ok { durationStr := strings.TrimSpace(scrape.Text(videoTimeElement)) info.Length, _ = parseVideoDuration(durationStr) } linkFieldClasses := []string{"yt-lockup-title", "yt-lockup-byline"} linkFieldPtrs := []*string{&info.Title, &info.Author} for i, class := range linkFieldClasses { linkContainer, ok := scrape.Find(element, scrape.ByClass(class)) if ok { link, ok := scrape.Find(linkContainer, scrape.ByTag(atom.A)) if ok { *linkFieldPtrs[i] = strings.TrimSpace(scrape.Text(link)) } } } descBox, ok := scrape.Find(element, scrape.ByClass("yt-lockup-description")) if ok { info.Description = strings.TrimSpace(scrape.Text(descBox)) } return &info }
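// parseVideoDuration is referenced above but not included in this collection.
// A sketch under the assumption that the video-time badge is a colon-separated
// duration such as "4:13" or "1:02:45":
func parseVideoDuration(s string) (time.Duration, error) {
	parts := strings.Split(s, ":")
	var seconds int
	for _, p := range parts {
		n, err := strconv.Atoi(strings.TrimSpace(p))
		if err != nil {
			return 0, err
		}
		// Each colon shifts the accumulated value by one base-60 place.
		seconds = seconds*60 + n
	}
	return time.Duration(seconds) * time.Second, nil
}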
func jobCaptChaUrl(n *html.Node) string { img, _ := scrape.Find(n, captchaImageMatcher) return baseUrl + scrape.Attr(img, "src") }
func doScrape(urlString string) AppData { fmt.Println(urlString) u, err := url.Parse(urlString) if err != nil { panic(err) } appData := AppData{} appData.PackageName = u.Query().Get("id") resp, err := http.Get(urlString) if err != nil { panic(err) } root, err := html.Parse(resp.Body) if err != nil { panic(err) } genreMatcher := func(n *html.Node) bool { return scrape.Attr(n, "itemprop") == "genre" } iconMatcher := func(n *html.Node) bool { return scrape.Attr(n, "itemprop") == "image" } softwareVersionMatcher := func(n *html.Node) bool { return scrape.Attr(n, "itemprop") == "softwareVersion" } name, ok := scrape.Find(root, scrape.ByClass("id-app-title")) if ok { appData.Name = scrape.Text(name) } genre, ok := scrape.Find(root, genreMatcher) if ok { appData.Categories = append(appData.Categories, scrape.Text(genre)) } icon, ok := scrape.Find(root, iconMatcher) if ok { iconSrc := scrape.Attr(icon, "src") iconUrl, err := url.Parse(iconSrc) if err != nil { panic(err) } if iconUrl.Scheme == "" { iconSrc = "https:" + iconSrc } resp, err = http.Get(iconSrc) if err != nil { panic(err) } defer resp.Body.Close() outputFile, err := os.Create("output/" + appData.PackageName + ".png") if err != nil { panic(err) } defer outputFile.Close() _, err = io.Copy(outputFile, resp.Body) if err != nil { panic(err) } } version, ok := scrape.Find(root, softwareVersionMatcher) if ok { appData.Version = strings.TrimSpace(scrape.Text(version)) } return appData }
//LivescoreParser parse livescore func LivescoreParser(root *html.Node) []Match { var matches []Match contentElmt, contentOK := scrape.Find(root, scrape.ByClass(classContentTag)) if contentOK { //find all row-gray rowGrayMatcher := func(n *html.Node) bool { classes := strings.Fields(scrape.Attr(n, "class")) for _, c := range classes { if c == classRowGray { parentClasses := strings.Fields(scrape.Attr(n.Parent, "class")) for _, pc := range parentClasses { if pc == classContentTag { return true } } } } return false } rows := scrape.FindAll(contentElmt, rowGrayMatcher) matchChann := make(chan Match) for _, rowElmt := range rows { go func(rowElmt *html.Node) { var time string var homeTeam string var awayTeam string var score string timeElmt, timeElmtOK := scrape.Find(rowElmt, scrape.ByClass(classMinElmt)) if timeElmtOK { time = scrape.Text(timeElmt) } scoreElmt, scoreElmtOK := scrape.Find(rowElmt, scrape.ByClass(classScoreLink)) if scoreElmtOK { score = scrape.Text(scoreElmt) } teamElmts := scrape.FindAll(rowElmt, scrape.ByClass(classPlyElmt)) for i := 0; i < len(teamElmts); i++ { teamElmt := teamElmts[i] if i%2 == 0 { homeTeam = scrape.Text(teamElmt) } else { awayTeam = scrape.Text(teamElmt) } } match := Match{ HomeTeam: homeTeam, AwayTeam: awayTeam, Score: score, Time: time, } matchChann <- match }(rowElmt) } for i := 0; i < len(rows); i++ { select { case m := <-matchChann: matches = append(matches, m) } } close(matchChann) } return matches }
func Search(url string) (string, bool) {
	resp, err := http.Get("https://www.reddit.com/search?q=url%3A" + url + "&sort=new&t=all")
	if err != nil {
		return "", false
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		return "", false
	}
	matcher := func(n *html.Node) bool {
		return scrape.Attr(n, "class") == "search-title may-blank"
	}
	m_comments := func(n *html.Node) bool {
		if n == nil {
			return false
		}
		return scrape.Attr(n, "class") == "search-comments may-blank"
	}
	m_subreddit := func(n *html.Node) bool {
		if n == nil {
			return false
		}
		return scrape.Attr(n, "class") == "search-subreddit-link may-blank"
	}
	m_time := func(n *html.Node) bool {
		if n == nil {
			return false
		}
		return scrape.Attr(n, "datetime") != ""
	}
	post, ok := scrape.Find(root, matcher)
	if !ok || post.Parent == nil || post.Parent.Parent == nil {
		return "", false
	}
	main := post.Parent.Parent
	s_comments := "%error%"
	s_time := "%error%"
	s_subreddit := "%error%"
	title := scrape.Text(post)
	href := scrape.Attr(post, "href")
	if comments, ok := scrape.Find(main, m_comments); ok {
		s_comments = scrape.Text(comments)
	}
	if time, ok := scrape.Find(main, m_time); ok {
		s_time = scrape.Text(time)
	}
	if subreddit, ok := scrape.Find(main, m_subreddit); ok {
		s_subreddit = scrape.Text(subreddit)
	}
	re := regexp.MustCompile("comments/([[:alnum:]]+)/")
	match := re.FindStringSubmatch(href)
	if len(match) < 2 {
		// Guard against hrefs that do not contain a comments link.
		return "", false
	}
	s_url := "https://redd.it/" + match[1]
	s_final := fmt.Sprintf("[Reddit %s] %s (%s) - %s [%s]\n", s_subreddit, title, s_url, s_comments, s_time)
	return s_final, true
}
// ParseHtmlHackerNews parses posts from Hacker News HTML. The input is an io.Reader
// and recognized posts are returned in the psout slice.
// Errors which affect only a single post are stored in that post's Err field.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {
	root, err := html.Parse(body)
	if err != nil {
		err = errors.New("Failed to html.Parse: " + err.Error())
		return
	}
	// define a matcher
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Tr && n.Parent != nil && n.Parent.DataAtom == atom.Tbody {
			return scrape.Attr(n, "class") == "athing"
		}
		return false
	}
	// grab all articles and loop over them
	articles := scrape.FindAll(root, matcher)
	for _, article := range articles {
		var ok bool
		// Get one post entry
		var titlenode *html.Node
		titlenode, ok = scrape.Find(article, func(n *html.Node) bool {
			return n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "title"
		})
		if !ok {
			continue
		}
		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "hackernews"
		post.Title = scrape.Text(titlenode)
		post.Url = scrape.Attr(titlenode, "href")
		if strings.HasPrefix(post.Url, "item?id=") {
			post.Url = "https://news.ycombinator.com/" + post.Url
		}
		ps = append(ps, &post)
		// Get additional info for this post
		scorenode := article.NextSibling
		if scorenode == nil {
			post.Err = errors.New("Did not find score for: " + scrape.Text(article))
			continue
		}
		// Get the subtext containing scores, user and date
		subtext, ok := scrape.Find(scorenode, func(n *html.Node) bool {
			return scrape.Attr(n, "class") == "subtext"
		})
		if !ok {
			post.Err = errors.New(fmt.Sprintf("Did not find siblings for subtext %s\n", scorenode.Data))
			continue
		}
		subs := scrape.FindAll(subtext, func(n *html.Node) bool {
			// Get the PostId and Score
			// <span class="score" id="score_9643579">92 points</span>
			if n.DataAtom == atom.Span && scrape.Attr(n, "class") == "score" && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {
				// Get score
				var scoreid int
				scorestr := strings.Split(scrape.Text(n), " ")[0]
				scoreid, err = strconv.Atoi(scorestr)
				if err != nil {
					fmt.Printf("Failed to convert score %s to int: %s\n", scorestr, err.Error())
					return false
				}
				post.Score = scoreid
				// Get PostId
				postidstr := scrape.Attr(n, "id")
				if len(strings.Split(postidstr, "_")) > 1 {
					post.WebPostId = strings.Split(postidstr, "_")[1]
					return true
				}
			}
			// Get the Username and Creation Date for this post
			if n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {
				href := strings.ToLower(scrape.Attr(n, "href"))
				if href != "" {
					s := strings.Split(href, "?")
					if s[0] == "user" && len(s) > 1 {
						// Username
						u := strings.Split(s[1], "=")
						if len(u) > 1 {
							post.User = u[1]
							return true
						}
					} else if s[0] == "item" && len(s) > 1 {
						// Created date
						createdago := scrape.Text(n)
						if strings.Contains(createdago, "ago") {
							var postDate time.Time
							postDate, err = GetDateFromCreatedAgo(createdago)
							if err != nil {
								err = errors.New(fmt.Sprintf("Failed to convert to date: %v\n", createdago))
								return false
							}
							post.PostDate = postDate
							return true
						}
					}
				}
			} // end "class" == "subtext"
			return false
		})
		if len(subs) == 0 {
			var w bytes.Buffer
			if rerr := html.Render(&w, subtext); rerr != nil {
				fmt.Printf("Render error: %s\n", rerr)
			}
			post.Err = errors.New(fmt.Sprintf("Unable to parse score,user,date from %s:\n %s\n", post.Title, w.String()))
		}
	}
	return ps, err
}
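// GetDateFromCreatedAgo is called above but not included in this collection.
// A minimal sketch (assumption): it turns Hacker News age strings such as
// "3 hours ago" into an absolute time relative to now. The real helper may
// handle more formats.
func GetDateFromCreatedAgo(created string) (time.Time, error) {
	fields := strings.Fields(strings.TrimSpace(created)) // e.g. ["3", "hours", "ago"]
	if len(fields) < 2 {
		return time.Time{}, fmt.Errorf("unrecognised age string: %q", created)
	}
	n, err := strconv.Atoi(fields[0])
	if err != nil {
		return time.Time{}, err
	}
	var d time.Duration
	switch {
	case strings.HasPrefix(fields[1], "minute"):
		d = time.Duration(n) * time.Minute
	case strings.HasPrefix(fields[1], "hour"):
		d = time.Duration(n) * time.Hour
	case strings.HasPrefix(fields[1], "day"):
		d = time.Duration(n) * 24 * time.Hour
	default:
		return time.Time{}, fmt.Errorf("unrecognised unit in %q", created)
	}
	return time.Now().Add(-d), nil
}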
// parseExtraComponentInfo parses the "Class Detail" page for a component. func parseExtraComponentInfo(body io.Reader, component *Component) (courseOpen bool, err error) { nodes, err := html.ParseFragment(body, nil) if err != nil { return } if len(nodes) != 1 { return false, errors.New("invalid number of root elements") } openStatus, ok := scrape.Find(nodes[0], scrape.ById("SSR_CLS_DTL_WRK_SSR_DESCRSHORT")) if !ok { return false, errors.New("open status not found") } courseOpen = (nodeInnerText(openStatus) == "Open") availTable, ok := scrape.Find(nodes[0], scrape.ById("ACE_SSR_CLS_DTL_WRK_GROUP3")) if !ok { return courseOpen, errors.New("could not find availability info") } rows := scrape.FindAll(availTable, scrape.ByTag(atom.Tr)) if len(rows) != 7 { return courseOpen, errors.New("invalid number of rows in availability table") } var availability ClassAvailability cols := nodesWithAlignAttribute(scrape.FindAll(rows[2], scrape.ByTag(atom.Td))) if len(cols) != 2 { return courseOpen, errors.New("expected 2 aligned columns in row 2") } availability.Capacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0]))) if err != nil { return } availability.WaitListCapacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1]))) if err != nil { return } cols = nodesWithAlignAttribute(scrape.FindAll(rows[4], scrape.ByTag(atom.Td))) if len(cols) != 2 { return courseOpen, errors.New("expected 2 aligned columns in row 4") } availability.EnrollmentTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0]))) if err != nil { return } availability.WaitListTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1]))) if err != nil { return } cols = nodesWithAlignAttribute(scrape.FindAll(rows[6], scrape.ByTag(atom.Td))) if len(cols) != 1 { return courseOpen, errors.New("expected 1 aligned column in row 6") } availability.AvailableSeats, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0]))) if err != nil { return } component.ClassAvailability = &availability return }
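// parseSchedule and parseExtraComponentInfo above rely on two helpers that are
// not part of this collection. The sketches below are assumptions about their
// behaviour: nodeInnerText flattens all descendant text nodes into one string,
// and nodesWithAlignAttribute keeps only nodes that carry an align attribute.
func nodeInnerText(node *html.Node) string {
	if node.Type == html.TextNode {
		return node.Data
	}
	var res string
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		res += nodeInnerText(child)
	}
	return res
}

func nodesWithAlignAttribute(nodes []*html.Node) []*html.Node {
	var filtered []*html.Node
	for _, node := range nodes {
		// Treating an empty align value as "absent" is an assumption.
		if scrape.Attr(node, "align") != "" {
			filtered = append(filtered, node)
		}
	}
	return filtered
}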
func checkNextPage(s JobSearch) bool { _, ok := scrape.Find(s.root, nextPageMatcher) fmt.Println(ok) return ok }
func main() { router := gin.Default() router.GET("/movie/amazon/:amazon_id", func(c *gin.Context) { id, valid := validateAndFormatAmazonID(c.Param("amazon_id")) if !valid { c.JSON(http.StatusInternalServerError, gin.H{ "error": "invalid amazon id", "id": id, }) return } resp, err := http.Get("http://www.amazon.de/gp/product/" + id) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{ "error": err, }) return } //item does not exist in amazon.de if resp.StatusCode == http.StatusNotFound { c.JSON(http.StatusNotFound, gin.H{ "error": "product not available", }) return } root, err := html.Parse(resp.Body) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{ "error": err, }) return } actorsMatcher := func(n *html.Node) bool { if n.DataAtom == atom.Dd && n.Parent != nil && n.PrevSibling != nil && n.PrevSibling.PrevSibling != nil { return scrape.Attr(n.Parent, "class") == "dv-meta-info size-small" && scrape.Text(n.PrevSibling.PrevSibling) == "Darsteller:" } return false } posterMatcher := func(n *html.Node) bool { if n.DataAtom == atom.Img && n.Parent != nil { return scrape.Attr(n.Parent, "class") == "dp-meta-icon-container" } return false } //NOTE: Since this is a demo, I assume matchers will always hit a result movie := &Movie{} titleNode, _ := scrape.Find(root, scrape.ById("aiv-content-title")) movie.Title = scrape.Text(titleNode.FirstChild) releaseYearNode, _ := scrape.Find(root, scrape.ByClass("release-year")) year, _ := strconv.Atoi(scrape.Text(releaseYearNode)) movie.ReleaseYear = year actorsNode, _ := scrape.Find(root, actorsMatcher) movie.Actors = strings.Split(scrape.Text(actorsNode), ",") posterNode, _ := scrape.Find(root, posterMatcher) movie.Poster = scrape.Attr(posterNode, "src") movieNodes := scrape.FindAll(root, scrape.ByClass("downloadable_movie")) ids := make([]string, len(movieNodes)) for i, movieNode := range movieNodes { ids[i] = scrape.Attr(movieNode, "data-asin") } movie.SimilarIDs = ids c.JSON(http.StatusOK, movie) }) router.Run(":8080") }
func (s structure) getSpaces() ([]space, error) { spaces := []space{ space{Name: "WSU Permit"}, space{Name: "Student OneCard"}, space{Name: "Visitor"}, } re := map[string]*regexp.Regexp{ "avail": regexp.MustCompile(`([0-9]+|NONE)`), "status": regexp.MustCompile(`(OPEN|CLOSED|FULL)`), "updated": regexp.MustCompile(`(?P<a>^.+: )(?P<b>.+)`), } // Request client := &http.Client{ Timeout: time.Second * 10, } req, err := http.NewRequest("GET", "http://m.wayne.edu/parking.php?location="+s.URLCode, nil) if err != nil { return spaces, errors.New("Request failed") } req.Header.Set("User-Agent", "Apple-iPhone6C1/") // Response resp, err := client.Do(req) if err != nil { return spaces, errors.New("Response failed") } defer resp.Body.Close() body, err := html.Parse(resp.Body) if err != nil { return spaces, errors.New("Error parsing response body") } // Parse relevant response data dataString, ok := scrape.Find(body, scrape.ByClass("available")) if !ok { return spaces, errors.New("Error: Line 105 - scrape.Find (available) -- not finding scrape info") } lastUpdated, ok := scrape.Find(body, scrape.ByClass("last_updated")) if !ok { return spaces, errors.New("Error: Line 109 - scrape.Find (last_updated) -- not finding scrape info") } avail := re["avail"].FindAllString(scrape.Text(dataString), -1) if len(avail) == 0 { avail = []string{"0", "0", "0"} } status := re["status"].FindAllString(scrape.Text(dataString), -1) if len(status) != 3 { return spaces, errors.New("Error: Line 118 - FindAllString (status) not returning 3 matches") } updated := re["updated"].FindStringSubmatch(scrape.Text(lastUpdated)) if len(updated) == 0 { return spaces, errors.New("Error: Line 122 - FindAllStringSubmatch (updated) not finding a match") } for key := range spaces { spaces[key].Available = avail[key] spaces[key].Status = status[key] spaces[key].Updated = updated[2] } return spaces, nil }