// Scrape scrapes a site for a keyword.
func (q *query) Scrape() []*match {
	// Request the URL.
	resp, err := http.Get(q.SiteURL)
	if err != nil {
		// The original had an unreachable log.Fatal after a panic; one
		// fatal exit is enough.
		log.Fatal("Couldn't GET ", q.SiteURL)
	}
	defer resp.Body.Close()

	// Parse the contents of the URL.
	root, err := html.Parse(resp.Body)
	if err != nil {
		log.Fatal("Unable to parse response")
	}

	// Grab all the posts and collect them as matches.
	posts := scrape.FindAll(root, scrape.ByClass("description"))
	matches := make([]*match, len(posts))
	for i, post := range posts {
		matches[i] = &match{
			Title:       scrape.Text(post.FirstChild.NextSibling),
			Description: scrape.Text(post),
			Link:        "http://kijiji.ca" + scrape.Attr(post.FirstChild.NextSibling, "href"),
			Price:       scrape.Text(post.NextSibling.NextSibling),
			Matched:     false,
		}
	}
	return matches
}
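// Scrape assumes a query type with a SiteURL field and a match struct with
// the fields used above. A minimal sketch of those definitions (the exact
// originals are not shown, so these are assumptions):
type query struct {
	SiteURL string
	Keyword string
}

type match struct {
	Title       string
	Description string
	Link        string
	Price       string
	Matched     bool
}

// Example usage, given the hypothetical types above:
//
//	q := &query{SiteURL: "http://kijiji.ca/b-some-city/some-keyword"}
//	for _, m := range q.Scrape() {
//		fmt.Println(m.Title, m.Price, m.Link)
//	}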
// parseSchedule parses the courses from the schedule list view page.
func parseSchedule(rootNode *html.Node) ([]Course, error) {
	courseTables := scrape.FindAll(rootNode, scrape.ByClass("PSGROUPBOXWBO"))
	result := make([]Course, 0, len(courseTables))
	for _, classTable := range courseTables {
		titleElement, ok := scrape.Find(classTable, scrape.ByClass("PAGROUPDIVIDER"))
		if !ok {
			// This will occur at least once, since the filter options are a PSGROUPBOXWBO.
			continue
		}
		infoTables := scrape.FindAll(classTable, scrape.ByClass("PSLEVEL3GRIDNBO"))
		if len(infoTables) != 2 {
			return nil, errors.New("expected exactly 2 info tables but found " +
				strconv.Itoa(len(infoTables)))
		}

		courseInfoTable := infoTables[0]
		course, err := parseCourseInfoTable(courseInfoTable)
		if err != nil {
			return nil, err
		}

		// NOTE: there isn't really a standard way to parse the department/number.
		course.Name = nodeInnerText(titleElement)

		componentsInfoTable := infoTables[1]
		componentMaps, err := tableEntriesAsMaps(componentsInfoTable)
		if err != nil {
			return nil, err
		}
		course.Components = make([]Component, len(componentMaps))
		for i, componentMap := range componentMaps {
			course.Components[i], err = parseComponentInfoMap(componentMap)
			if err != nil {
				return nil, err
			}
		}

		result = append(result, course)
	}
	return result, nil
}
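// parseSchedule relies on a nodeInnerText helper that is not shown. A minimal
// sketch of one way to implement it, concatenating every text node under n;
// this is an assumption about its behavior, not the original implementation:
func nodeInnerText(n *html.Node) string {
	if n == nil {
		return ""
	}
	if n.Type == html.TextNode {
		return n.Data
	}
	var res string
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		res += nodeInnerText(child)
	}
	return res
}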
func getLink(r *html.Node) (s string) {
	buttons := scrape.FindAll(r, scrape.ByClass("downloadbtn"))
	for _, button := range buttons {
		// The button's onclick handler looks like: window.location='<link>'.
		windowLocation := scrape.Attr(button, "onclick")
		parts := strings.Split(windowLocation, "=")
		if len(parts) < 2 {
			continue
		}
		// Assign to the named return rather than shadowing it with :=.
		s = strings.Trim(parts[1], "'")
		return s
	}
	return
}
func parseHistoryItems(rootNode *html.Node) []*YoutubeVideoInfo {
	videoElements := scrape.FindAll(rootNode, scrape.ByClass("yt-lockup-video"))
	res := make([]*YoutubeVideoInfo, len(videoElements))
	for i, element := range videoElements {
		res[i] = parseVideoInfo(element)
	}
	return res
}
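// The YoutubeVideoInfo type is defined elsewhere; its fields can be inferred
// from the assignments in parseVideoInfo further down in this collection.
// A minimal sketch with assumed field types:
type YoutubeVideoInfo struct {
	ID           string
	Title        string
	Author       string
	Description  string
	Length       time.Duration
	ThumbnailURL *url.URL
}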
// Get returns the set of arrival times for the buses at the given stop.
// Callers must check that no error is returned.
func Get(parada int) (TiemposParada, error) {
	resp, err := http.Get("http://www.auvasa.es/paradamb.asp?codigo=" + strconv.Itoa(parada))
	if err != nil {
		return TiemposParada{}, errors.New("Error al conectar con el servidor de AUVASA.")
	}
	defer resp.Body.Close()

	// The site is served as Windows-1252; convert it to UTF-8 before parsing.
	rInUTF8 := transform.NewReader(resp.Body, charmap.Windows1252.NewDecoder())
	root, err := html.Parse(rInUTF8)
	if err != nil {
		return TiemposParada{}, errors.New("Error en la respuesta de AUVASA.")
	}

	headers := scrape.FindAll(root, scrape.ByTag(atom.H1))
	if len(headers) < 2 {
		return TiemposParada{}, errors.New("La parada indicada parece errónea.")
	}

	lineasTiempos := scrape.FindAll(root, scrape.ByClass("style36"))
	resultados := make([]ProximoBus, 0, len(lineasTiempos))
	for _, item := range lineasTiempos {
		valores := scrape.FindAll(item, scrape.ByClass("style38"))
		// Guard against rows with fewer cells than the indexes used below.
		if len(valores) < 4 {
			continue
		}
		resultados = append(resultados, ProximoBus{
			Linea:   scrape.Text(valores[0]),
			Destino: scrape.Text(valores[2]),
			Minutos: scrape.Text(valores[3]),
		})
	}

	if len(resultados) == 0 {
		return TiemposParada{}, errors.New("No hay tiempos para la parada especificada. Puede que sea errónea o que ya no haya buses.")
	}

	return TiemposParada{
		Nombre:  scrape.Text(headers[1]),
		Tiempos: resultados,
		Momento: time.Now(),
		Codigo:  parada,
	}, nil
}
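// Example usage of Get, assuming the package's TiemposParada and ProximoBus
// types; the stop code 811 is an arbitrary illustrative value:
//
//	tiempos, err := Get(811)
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println(tiempos.Nombre)
//	for _, bus := range tiempos.Tiempos {
//		fmt.Printf("línea %s -> %s en %s min\n", bus.Linea, bus.Destino, bus.Minutos)
//	}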
func ParseName(n *html.Node) (string, string, string) {
	matcher := func(n *html.Node) bool {
		// Must check for nil values: n.Parent is nil at the document root.
		return n.DataAtom == atom.A && n.Parent != nil && n.Parent.DataAtom == atom.Td
	}
	var name, magnet, desc string
	if detName, ok := scrape.Find(n, scrape.ByClass("detName")); ok {
		name = scrape.Text(detName)
	}
	if anchor, ok := scrape.Find(n, matcher); ok {
		magnet = scrape.Attr(anchor, "href")
	}
	if detDesc, ok := scrape.Find(n, scrape.ByClass("detDesc")); ok {
		desc = scrape.Text(detDesc)
	}
	return name, magnet, desc
}
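// A minimal sketch of how ParseName might be driven over a search results
// page, assuming each result lives in a <tr>; the row matcher and function
// name here are illustrative assumptions, not part of the original:
func parseSearchPage(root *html.Node) {
	rows := scrape.FindAll(root, scrape.ByTag(atom.Tr))
	for _, row := range rows {
		name, magnet, desc := ParseName(row)
		if name == "" {
			continue // not a result row
		}
		fmt.Println(name, magnet, desc)
	}
}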
func parseVideoInfo(element *html.Node) *YoutubeVideoInfo {
	var info YoutubeVideoInfo

	info.ID = scrape.Attr(element, "data-context-item-id")

	thumbnailContainer, ok := scrape.Find(element, scrape.ByClass("yt-thumb-simple"))
	if ok {
		thumbnailImage, ok := scrape.Find(thumbnailContainer, scrape.ByTag(atom.Img))
		if ok {
			info.ThumbnailURL, _ = url.Parse(scrape.Attr(thumbnailImage, "src"))
		}
	}

	videoTimeElement, ok := scrape.Find(element, scrape.ByClass("video-time"))
	if ok {
		durationStr := strings.TrimSpace(scrape.Text(videoTimeElement))
		info.Length, _ = parseVideoDuration(durationStr)
	}

	// The title and author links share the same markup, so extract both with
	// one loop over parallel slices of classes and destination fields.
	linkFieldClasses := []string{"yt-lockup-title", "yt-lockup-byline"}
	linkFieldPtrs := []*string{&info.Title, &info.Author}
	for i, class := range linkFieldClasses {
		linkContainer, ok := scrape.Find(element, scrape.ByClass(class))
		if ok {
			link, ok := scrape.Find(linkContainer, scrape.ByTag(atom.A))
			if ok {
				*linkFieldPtrs[i] = strings.TrimSpace(scrape.Text(link))
			}
		}
	}

	descBox, ok := scrape.Find(element, scrape.ByClass("yt-lockup-description"))
	if ok {
		info.Description = strings.TrimSpace(scrape.Text(descBox))
	}

	return &info
}
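// parseVideoInfo calls a parseVideoDuration helper that is not shown. A
// minimal sketch of one plausible implementation, turning "mm:ss" or
// "hh:mm:ss" into a time.Duration; this is an assumption, not the original:
func parseVideoDuration(s string) (time.Duration, error) {
	var seconds int
	for _, part := range strings.Split(s, ":") {
		n, err := strconv.Atoi(part)
		if err != nil {
			return 0, err
		}
		seconds = seconds*60 + n
	}
	return time.Duration(seconds) * time.Second, nil
}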
// fetchExtraScheduleInfo gets more information about each component.
//
// The rootNode argument should be the parsed schedule list view.
func fetchExtraScheduleInfo(client *http.Client, courses []Course, rootNode *html.Node) error {
	psForm, ok := scrape.Find(rootNode, scrape.ByClass("PSForm"))
	if !ok {
		return errors.New("could not find PSForm")
	}
	icsid, ok := scrape.Find(psForm, scrape.ById("ICSID"))
	if !ok {
		return errors.New("could not find ICSID")
	}

	formAction := getNodeAttribute(psForm, "action")
	sid := getNodeAttribute(icsid, "value")

	// TODO: figure out if there's a way to make this more robust or to load it lazily.
	sectionIndex := 0

	for courseIndex := range courses {
		course := &courses[courseIndex]
		for componentIndex := range course.Components {
			component := &course.Components[componentIndex]

			postData := generateClassDetailForm(sid, sectionIndex)
			res, reqErr := client.PostForm(formAction, postData)
			if reqErr != nil {
				return reqErr
			}
			// Close each response as soon as it is consumed; a defer here
			// would pile up until the whole function returns.
			courseOpen, parseErr := parseExtraComponentInfo(res.Body, component)
			res.Body.Close()
			if parseErr != nil {
				return parseErr
			}
			course.Open = &courseOpen

			postData = generateClassDetailBackForm(sid, sectionIndex)
			res, reqErr = client.PostForm(formAction, postData)
			if reqErr != nil {
				return reqErr
			}
			res.Body.Close()

			sectionIndex++
		}
	}
	return nil
}
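// fetchExtraScheduleInfo uses a getNodeAttribute helper that is not shown.
// A minimal sketch of the obvious implementation over html.Node's Attr slice
// (assumed, not the original code):
func getNodeAttribute(node *html.Node, attribute string) string {
	for _, attr := range node.Attr {
		if attr.Key == attribute {
			return attr.Val
		}
	}
	return ""
}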
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
	client := urlfetch.Client(ctx)
	resp, err := client.Get("http://167.88.16.61:2138/" + url)
	if err != nil {
		// The original logged the error, read resp.Status (a nil dereference
		// on failure), then checked err again; one check that returns is enough.
		ctx.Errorf("%s", err)
		return nil, errors.New("Get listing failed")
	}
	defer resp.Body.Close()
	ctx.Debugf("Craigslist request came back with status: %s", resp.Status)

	root, err := html.Parse(resp.Body)
	if err != nil {
		ctx.Errorf("%s", "Parsing Error")
		return nil, errors.New("Parse body failed")
	}

	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		ctx.Errorf("%s", "Error getting title")
		return nil, errors.New("Get title failed")
	}
	price, ok := scrape.Find(root, scrape.ByClass("price"))
	if !ok {
		ctx.Errorf("%s", "Error getting price")
		return nil, errors.New("Get price failed")
	}
	// Strip the leading currency symbol before parsing the number.
	intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
	if err != nil {
		ctx.Errorf("Error casting price: %s", scrape.Text(price))
		return nil, err
	}
	images := scrape.FindAll(root, scrape.ByTag(atom.Img))
	imageUrl := ""
	for _, image := range images {
		if scrape.Attr(image, "title") == "image 1" {
			imageUrl = scrape.Attr(image, "src")
		}
	}

	ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))

	return &Listing{
		Url:      url,
		Title:    scrape.Text(title),
		Price:    intPrice,
		ImageUrl: imageUrl,
	}, nil
}
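// The Listing type isn't shown in this snippet; its fields can be inferred
// from the composite literal above. A minimal sketch (assumed):
type Listing struct {
	Url      string
	Title    string
	Price    int
	ImageUrl string
}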
// History asynchronously fetches the user's video viewing history.
// You may provide a cancel channel which you can close to cancel the
// fetch mid-way.
func (y *Youtube) History(cancel <-chan struct{}) (<-chan *YoutubeVideoInfo, <-chan error) {
	videoChan := make(chan *YoutubeVideoInfo)
	errChan := make(chan error, 1)

	go func() {
		defer close(videoChan)
		defer close(errChan)

		historyReq, _ := http.NewRequest("GET", "https://www.youtube.com/feed/history", nil)
		historyReq.Header.Set("User-Agent", spoofedUserAgent)
		resp, err := y.s.Do(historyReq)
		if err != nil {
			// Check the request error before touching resp.Body; the original
			// parsed first and would dereference a nil response on failure.
			errChan <- err
			return
		}
		rootNode, err := html.Parse(resp.Body)
		resp.Body.Close()
		if err != nil {
			errChan <- err
			return
		}

		loadMoreHTML := rootNode
		contentHTML := rootNode
		for {
			items := parseHistoryItems(contentHTML)
			for _, item := range items {
				select {
				case videoChan <- item:
				case <-cancel:
					return
				}
			}

			if loadMoreHTML == nil {
				break
			}
			loadButton, ok := scrape.Find(loadMoreHTML, scrape.ByClass("yt-uix-load-more"))
			if !ok {
				// No more pages; without this break the loop would spin
				// forever re-sending the same items.
				break
			}
			morePath := scrape.Attr(loadButton, "data-uix-load-more-href")
			loadMoreHTML, contentHTML, err = y.fetchMoreHistory(morePath)
			if err != nil {
				errChan <- err
				return
			}
		}
	}()

	return videoChan, errChan
}
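// Example of consuming History with cancellation; the yt variable stands in
// for a constructed Youtube client and the timeout is illustrative:
//
//	cancel := make(chan struct{})
//	videos, errs := yt.History(cancel)
//	go func() {
//		time.Sleep(30 * time.Second)
//		close(cancel) // give up after 30 seconds
//	}()
//	for video := range videos {
//		fmt.Println(video.Title)
//	}
//	if err, ok := <-errs; ok && err != nil {
//		log.Println(err)
//	}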
func parsepost(n *html.Node) Post {
	post := Post{}

	// Get the title, using scrape's built-in class matcher.
	// NOTE: the blank-identifier assignments below assume every post node
	// contains the expected children; a missing match would panic.
	titleNode, _ := scrape.Find(n, scrape.ByClass("title"))
	title := scrape.Text(titleNode.FirstChild)

	// Get the subreddit. This requires a custom matcher.
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.A && n.Parent != nil {
			return scrape.Attr(n, "class") == "subreddit hover may-blank"
		}
		return false
	}
	sub, _ := scrape.Find(n, matcher)
	subreddit := scrape.Text(sub)

	// Get the url to the comments. Requires a custom matcher.
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Ul && n.FirstChild != nil {
			return scrape.Attr(n, "class") == "flat-list buttons" &&
				scrape.Attr(n.FirstChild, "class") == "first"
		}
		return false
	}
	// ul is a list of two buttons: one that links to a post's comments page,
	// one a "share" function.
	ul, _ := scrape.Find(n, matcher)
	// The first list item of ul -- this will always be the comments page link.
	li := ul.FirstChild
	// Finally, the url found in the list item.
	url := scrape.Attr(li.FirstChild, "href")

	// Get the author. Uses a custom matcher with a nil-safe parent check.
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.A && n.Parent != nil && n.Parent.DataAtom == atom.P {
			return strings.Contains(scrape.Attr(n, "href"), "/user/")
		}
		return false
	}
	authorNode, _ := scrape.Find(n, matcher)
	author := scrape.Text(authorNode)

	post.title = title
	post.subreddit = subreddit
	post.url = url
	post.author = author
	return post
}
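// The Post type isn't shown; its unexported fields can be inferred from the
// assignments above. A minimal sketch (assumed):
type Post struct {
	title     string
	subreddit string
	url       string
	author    string
}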
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "http://whatsmyuseragent.com/", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("User-Agent", ua)
	resp, err := client.Do(req)
	// resp, err := http.Get("http://whatsmyuseragent.com/")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	info := scrape.ByClass("info")
	data := scrape.FindAll(root, info)
	for _, v := range data {
		fmt.Printf("%s\n", scrape.Text(v))
	}
}
func main() {
	router := gin.Default()

	router.GET("/movie/amazon/:amazon_id", func(c *gin.Context) {
		id, valid := validateAndFormatAmazonID(c.Param("amazon_id"))
		if !valid {
			// A malformed ID is the client's fault, so reply 400, not 500.
			c.JSON(http.StatusBadRequest, gin.H{
				"error": "invalid amazon id",
				"id":    id,
			})
			return
		}

		resp, err := http.Get("http://www.amazon.de/gp/product/" + id)
		if err != nil {
			// err.Error() instead of err: a bare error value often marshals
			// to an empty JSON object.
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": err.Error(),
			})
			return
		}
		defer resp.Body.Close()

		// Item does not exist in amazon.de.
		if resp.StatusCode == http.StatusNotFound {
			c.JSON(http.StatusNotFound, gin.H{
				"error": "product not available",
			})
			return
		}

		root, err := html.Parse(resp.Body)
		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": err.Error(),
			})
			return
		}

		actorsMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Dd && n.Parent != nil &&
				n.PrevSibling != nil && n.PrevSibling.PrevSibling != nil {
				return scrape.Attr(n.Parent, "class") == "dv-meta-info size-small" &&
					scrape.Text(n.PrevSibling.PrevSibling) == "Darsteller:"
			}
			return false
		}
		posterMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Img && n.Parent != nil {
				return scrape.Attr(n.Parent, "class") == "dp-meta-icon-container"
			}
			return false
		}

		// NOTE: Since this is a demo, I assume matchers will always hit a result.
		movie := &Movie{}

		titleNode, _ := scrape.Find(root, scrape.ById("aiv-content-title"))
		movie.Title = scrape.Text(titleNode.FirstChild)

		releaseYearNode, _ := scrape.Find(root, scrape.ByClass("release-year"))
		year, _ := strconv.Atoi(scrape.Text(releaseYearNode))
		movie.ReleaseYear = year

		actorsNode, _ := scrape.Find(root, actorsMatcher)
		movie.Actors = strings.Split(scrape.Text(actorsNode), ",")

		posterNode, _ := scrape.Find(root, posterMatcher)
		movie.Poster = scrape.Attr(posterNode, "src")

		movieNodes := scrape.FindAll(root, scrape.ByClass("downloadable_movie"))
		ids := make([]string, len(movieNodes))
		for i, movieNode := range movieNodes {
			ids[i] = scrape.Attr(movieNode, "data-asin")
		}
		movie.SimilarIDs = ids

		c.JSON(http.StatusOK, movie)
	})

	router.Run(":8080")
}
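// validateAndFormatAmazonID isn't shown. A plausible sketch, assuming an
// Amazon ASIN is ten alphanumeric characters that the product URL expects in
// upper case; this is an illustrative guess, not the original implementation
// (requires the regexp import):
var asinPattern = regexp.MustCompile(`^[a-zA-Z0-9]{10}$`)

func validateAndFormatAmazonID(id string) (string, bool) {
	if !asinPattern.MatchString(id) {
		return id, false
	}
	return strings.ToUpper(id), true
}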
// LivescoreParser parses livescore matches from the page's content element.
func LivescoreParser(root *html.Node) []Match {
	var matches []Match

	contentElmt, contentOK := scrape.Find(root, scrape.ByClass(classContentTag))
	if contentOK {
		// Find all rows whose class contains classRowGray and whose parent
		// is the content element.
		rowGrayMatcher := func(n *html.Node) bool {
			if n.Parent == nil {
				return false
			}
			classes := strings.Fields(scrape.Attr(n, "class"))
			for _, c := range classes {
				if c == classRowGray {
					parentClasses := strings.Fields(scrape.Attr(n.Parent, "class"))
					for _, pc := range parentClasses {
						if pc == classContentTag {
							return true
						}
					}
				}
			}
			return false
		}
		rows := scrape.FindAll(contentElmt, rowGrayMatcher)

		// Scrape each row in its own goroutine and collect the results.
		matchChann := make(chan Match)
		for _, rowElmt := range rows {
			go func(rowElmt *html.Node) {
				var matchTime string
				var homeTeam string
				var awayTeam string
				var score string

				timeElmt, timeElmtOK := scrape.Find(rowElmt, scrape.ByClass(classMinElmt))
				if timeElmtOK {
					matchTime = scrape.Text(timeElmt)
				}

				scoreElmt, scoreElmtOK := scrape.Find(rowElmt, scrape.ByClass(classScoreLink))
				if scoreElmtOK {
					score = scrape.Text(scoreElmt)
				}

				// Team cells alternate home/away within the row.
				teamElmts := scrape.FindAll(rowElmt, scrape.ByClass(classPlyElmt))
				for i := 0; i < len(teamElmts); i++ {
					if i%2 == 0 {
						homeTeam = scrape.Text(teamElmts[i])
					} else {
						awayTeam = scrape.Text(teamElmts[i])
					}
				}

				matchChann <- Match{
					HomeTeam: homeTeam,
					AwayTeam: awayTeam,
					Score:    score,
					Time:     matchTime,
				}
			}(rowElmt)
		}
		// Receive exactly one result per row; a bare receive replaces the
		// single-case select from the original.
		for i := 0; i < len(rows); i++ {
			matches = append(matches, <-matchChann)
		}
		close(matchChann)
	}

	return matches
}
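// Because each row is scraped in its own goroutine, matches arrive in
// completion order, not page order. If page order matters, one option is to
// write into a preallocated slice by index and wait with a sync.WaitGroup;
// a sketch under that assumption (the helper name is illustrative):
func collectOrdered(rows []*html.Node, scrapeRow func(*html.Node) Match) []Match {
	ordered := make([]Match, len(rows))
	var wg sync.WaitGroup
	for i, row := range rows {
		wg.Add(1)
		go func(i int, row *html.Node) {
			defer wg.Done()
			ordered[i] = scrapeRow(row) // each goroutine owns one slot
		}(i, row)
	}
	wg.Wait()
	return ordered
}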
func doScrape(urlString string) AppData {
	fmt.Println(urlString)
	u, err := url.Parse(urlString)
	if err != nil {
		panic(err)
	}
	appData := AppData{}
	appData.PackageName = u.Query().Get("id")

	resp, err := http.Get(urlString)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	genreMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "genre"
	}
	iconMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "image"
	}
	softwareVersionMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "softwareVersion"
	}

	name, ok := scrape.Find(root, scrape.ByClass("id-app-title"))
	if ok {
		appData.Name = scrape.Text(name)
	}
	genre, ok := scrape.Find(root, genreMatcher)
	if ok {
		appData.Categories = append(appData.Categories, scrape.Text(genre))
	}
	icon, ok := scrape.Find(root, iconMatcher)
	if ok {
		iconSrc := scrape.Attr(icon, "src")
		iconUrl, err := url.Parse(iconSrc)
		if err != nil {
			panic(err)
		}
		// Protocol-relative URLs ("//...") need an explicit scheme.
		if iconUrl.Scheme == "" {
			iconSrc = "https:" + iconSrc
		}
		// Use a separate response variable so the page response above is
		// not orphaned before its deferred Close runs.
		iconResp, err := http.Get(iconSrc)
		if err != nil {
			panic(err)
		}
		defer iconResp.Body.Close()

		outputFile, err := os.Create("output/" + appData.PackageName + ".png")
		if err != nil {
			panic(err)
		}
		defer outputFile.Close()

		_, err = io.Copy(outputFile, iconResp.Body)
		if err != nil {
			panic(err)
		}
	}
	version, ok := scrape.Find(root, softwareVersionMatcher)
	if ok {
		appData.Version = strings.TrimSpace(scrape.Text(version))
	}
	return appData
}
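// The AppData type isn't shown; its fields can be inferred from the
// assignments above. A minimal sketch (assumed):
type AppData struct {
	PackageName string
	Name        string
	Categories  []string
	Version     string
}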
func (s structure) getSpaces() ([]space, error) {
	spaces := []space{
		space{Name: "WSU Permit"},
		space{Name: "Student OneCard"},
		space{Name: "Visitor"},
	}

	re := map[string]*regexp.Regexp{
		"avail":   regexp.MustCompile(`([0-9]+|NONE)`),
		"status":  regexp.MustCompile(`(OPEN|CLOSED|FULL)`),
		"updated": regexp.MustCompile(`(?P<a>^.+: )(?P<b>.+)`),
	}

	// Request
	client := &http.Client{
		Timeout: time.Second * 10,
	}
	req, err := http.NewRequest("GET", "http://m.wayne.edu/parking.php?location="+s.URLCode, nil)
	if err != nil {
		return spaces, errors.New("Request failed")
	}
	req.Header.Set("User-Agent", "Apple-iPhone6C1/")

	// Response
	resp, err := client.Do(req)
	if err != nil {
		return spaces, errors.New("Response failed")
	}
	defer resp.Body.Close()
	body, err := html.Parse(resp.Body)
	if err != nil {
		return spaces, errors.New("Error parsing response body")
	}

	// Parse relevant response data.
	dataString, ok := scrape.Find(body, scrape.ByClass("available"))
	if !ok {
		return spaces, errors.New(`scrape.Find ("available") found no scrape info`)
	}
	lastUpdated, ok := scrape.Find(body, scrape.ByClass("last_updated"))
	if !ok {
		return spaces, errors.New(`scrape.Find ("last_updated") found no scrape info`)
	}

	avail := re["avail"].FindAllString(scrape.Text(dataString), -1)
	// One count per lot is expected; fall back to zeros rather than risk an
	// out-of-range index in the loop below.
	if len(avail) != 3 {
		avail = []string{"0", "0", "0"}
	}
	status := re["status"].FindAllString(scrape.Text(dataString), -1)
	if len(status) != 3 {
		return spaces, errors.New("FindAllString (status) did not return 3 matches")
	}
	updated := re["updated"].FindStringSubmatch(scrape.Text(lastUpdated))
	if len(updated) == 0 {
		return spaces, errors.New("FindStringSubmatch (updated) found no match")
	}

	for key := range spaces {
		spaces[key].Available = avail[key]
		spaces[key].Status = status[key]
		spaces[key].Updated = updated[2]
	}

	return spaces, nil
}
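// The structure and space types aren't shown; their fields can be inferred
// from the method above. A minimal sketch (assumed):
type structure struct {
	Name    string
	URLCode string
}

type space struct {
	Name      string
	Available string
	Status    string
	Updated   string
}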