func TorrentList(url string) ([]Torrent, error) { // request and parse the front page resp, err := http.Get(url) if err != nil { return make([]Torrent, 0), err } root, err := html.Parse(resp.Body) if err != nil { return make([]Torrent, 0), err } var torrents []Torrent if content, ok := scrape.Find(root, scrape.ById("searchResult")); ok { // define a matcher matcher := func(n *html.Node) bool { // must check for nil values if n.DataAtom == atom.Tr && n.Parent.DataAtom == atom.Tbody { return true } return false } // grab all articles and print them trs := scrape.FindAll(content, matcher) for _, tr := range trs { torrents = append(torrents, ParseRecord(tr)) } } resp.Body.Close() return torrents, nil }
// Auth attempts to access a given URL, then enters the given // credentials when the URL redirects to a login page. func (s *Session) Auth(serviceURL, email, password string) error { resp, err := s.Get(serviceURL) if err != nil { return err } defer resp.Body.Close() parsed, err := html.ParseFragment(resp.Body, nil) if err != nil || len(parsed) == 0 { return err } root := parsed[0] form, ok := scrape.Find(root, scrape.ById("gaia_loginform")) if !ok { return errors.New("failed to process login page") } submission := url.Values{} for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) { submission.Add(getAttribute(input, "name"), getAttribute(input, "value")) } submission["Email"] = []string{email} submission["Passwd"] = []string{password} postResp, err := s.PostForm(resp.Request.URL.String(), submission) if err != nil { return err } postResp.Body.Close() if postResp.Request.Method == "POST" { return errors.New("login incorrect") } return nil }
func indexPage(page string) (ind map[string]int, branches []string, err error) { resp, err := http.Get(page) if err != nil { return } root, err := html.Parse(resp.Body) resp.Body.Close() if err != nil { return } content, ok := scrape.Find(root, scrape.ById("bodyContent")) if !ok { return nil, nil, errors.New("no bodyContent element") } paragraphs := scrape.FindAll(content, scrape.ByTag(atom.P)) pageText := "" for _, p := range paragraphs { pageText += elementInnerText(p) + " " } words := strings.Fields(strings.ToLower(pageText)) ind = map[string]int{} for _, word := range words { ind[word] = ind[word] + 1 } links := findWikiLinks(content) branches = make([]string, len(links)) for i, link := range links { branches[i] = "https://en.wikipedia.org" + link } return }
// fetchExtraScheduleInfo gets more information about each component. // // The rootNode argument should be the parsed schedule list view. func fetchExtraScheduleInfo(client *http.Client, courses []Course, rootNode *html.Node) error { psForm, ok := scrape.Find(rootNode, scrape.ByClass("PSForm")) if !ok { return errors.New("could not find PSForm") } icsid, ok := scrape.Find(psForm, scrape.ById("ICSID")) if !ok { return errors.New("could not find ICSID") } formAction := getNodeAttribute(psForm, "action") sid := getNodeAttribute(icsid, "value") // TODO: figure out if there's a way to make this more robust or to load it lazily. sectionIndex := 0 for courseIndex := range courses { course := &courses[courseIndex] for componentIndex := range course.Components { component := &course.Components[componentIndex] postData := generateClassDetailForm(sid, sectionIndex) res, reqErr := client.PostForm(formAction, postData) if res != nil { defer res.Body.Close() } if reqErr != nil { return reqErr } courseOpen, parseErr := parseExtraComponentInfo(res.Body, component) if parseErr != nil { return parseErr } course.Open = &courseOpen postData = generateClassDetailBackForm(sid, sectionIndex) res, reqErr = client.PostForm(formAction, postData) if res != nil { defer res.Body.Close() } if reqErr != nil { return reqErr } sectionIndex++ } } return nil }
func scraper() { fd, err := os.Open("/mnt/hgfs/Downloads/wiki.html") if err != nil { panic(err) } defer fd.Close() root, err := html.Parse(fd) if err != nil { panic(err) } t := html.NewTokenizer(root) // matcher := func(n *html.Node) bool { // if n.DataAtom == atom.Table { // return true // } // return false // } // rowMatcher := func(n *html.Node) bool { // if n.DataAtom == atom.Tr { // return true // } // return false // } tableMatcher := scrape.ById(tableID) table := scrape.FindAll(root, tableMatcher) for _, v := range table { if t.Token().Data == "tr" { fmt.Printf("%s\n", scrape.Text(v)) } else { t.Next() } } // for , v := range table { // fmt.Printf("%s\n", scrape.Text(v)) // } }
func main() { router := gin.Default() router.GET("/movie/amazon/:amazon_id", func(c *gin.Context) { id, valid := validateAndFormatAmazonID(c.Param("amazon_id")) if !valid { c.JSON(http.StatusInternalServerError, gin.H{ "error": "invalid amazon id", "id": id, }) return } resp, err := http.Get("http://www.amazon.de/gp/product/" + id) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{ "error": err, }) return } //item does not exist in amazon.de if resp.StatusCode == http.StatusNotFound { c.JSON(http.StatusNotFound, gin.H{ "error": "product not available", }) return } root, err := html.Parse(resp.Body) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{ "error": err, }) return } actorsMatcher := func(n *html.Node) bool { if n.DataAtom == atom.Dd && n.Parent != nil && n.PrevSibling != nil && n.PrevSibling.PrevSibling != nil { return scrape.Attr(n.Parent, "class") == "dv-meta-info size-small" && scrape.Text(n.PrevSibling.PrevSibling) == "Darsteller:" } return false } posterMatcher := func(n *html.Node) bool { if n.DataAtom == atom.Img && n.Parent != nil { return scrape.Attr(n.Parent, "class") == "dp-meta-icon-container" } return false } //NOTE: Since this is a demo, I assume matchers will always hit a result movie := &Movie{} titleNode, _ := scrape.Find(root, scrape.ById("aiv-content-title")) movie.Title = scrape.Text(titleNode.FirstChild) releaseYearNode, _ := scrape.Find(root, scrape.ByClass("release-year")) year, _ := strconv.Atoi(scrape.Text(releaseYearNode)) movie.ReleaseYear = year actorsNode, _ := scrape.Find(root, actorsMatcher) movie.Actors = strings.Split(scrape.Text(actorsNode), ",") posterNode, _ := scrape.Find(root, posterMatcher) movie.Poster = scrape.Attr(posterNode, "src") movieNodes := scrape.FindAll(root, scrape.ByClass("downloadable_movie")) ids := make([]string, len(movieNodes)) for i, movieNode := range movieNodes { ids[i] = scrape.Attr(movieNode, "data-asin") } movie.SimilarIDs = ids c.JSON(http.StatusOK, movie) }) router.Run(":8080") }
// parseExtraComponentInfo parses the "Class Detail" page for a component. func parseExtraComponentInfo(body io.Reader, component *Component) (courseOpen bool, err error) { nodes, err := html.ParseFragment(body, nil) if err != nil { return } if len(nodes) != 1 { return false, errors.New("invalid number of root elements") } openStatus, ok := scrape.Find(nodes[0], scrape.ById("SSR_CLS_DTL_WRK_SSR_DESCRSHORT")) if !ok { return false, errors.New("open status not found") } courseOpen = (nodeInnerText(openStatus) == "Open") availTable, ok := scrape.Find(nodes[0], scrape.ById("ACE_SSR_CLS_DTL_WRK_GROUP3")) if !ok { return courseOpen, errors.New("could not find availability info") } rows := scrape.FindAll(availTable, scrape.ByTag(atom.Tr)) if len(rows) != 7 { return courseOpen, errors.New("invalid number of rows in availability table") } var availability ClassAvailability cols := nodesWithAlignAttribute(scrape.FindAll(rows[2], scrape.ByTag(atom.Td))) if len(cols) != 2 { return courseOpen, errors.New("expected 2 aligned columns in row 2") } availability.Capacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0]))) if err != nil { return } availability.WaitListCapacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1]))) if err != nil { return } cols = nodesWithAlignAttribute(scrape.FindAll(rows[4], scrape.ByTag(atom.Td))) if len(cols) != 2 { return courseOpen, errors.New("expected 2 aligned columns in row 4") } availability.EnrollmentTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0]))) if err != nil { return } availability.WaitListTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1]))) if err != nil { return } cols = nodesWithAlignAttribute(scrape.FindAll(rows[6], scrape.ByTag(atom.Td))) if len(cols) != 1 { return courseOpen, errors.New("expected 1 aligned column in row 6") } availability.AvailableSeats, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0]))) if err != nil { return } component.ClassAvailability = &availability return }