Code example #1
File: main.go — project: anykao/p
// TorrentList fetches the page at url and parses its "searchResult"
// table into a slice of Torrents. It returns a nil slice together with
// a non-nil error when the request or the HTML parse fails.
func TorrentList(url string) ([]Torrent, error) {
	// request and parse the front page
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	// Close on every path — the original leaked the body when
	// html.Parse returned an error.
	defer resp.Body.Close()

	root, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}

	var torrents []Torrent
	if content, ok := scrape.Find(root, scrape.ById("searchResult")); ok {
		// Match only <tr> elements that sit directly under a <tbody>.
		// The nil check on Parent guards against the document root,
		// whose Parent is nil (the original comment promised this
		// check but the code omitted it).
		matcher := func(n *html.Node) bool {
			return n.DataAtom == atom.Tr && n.Parent != nil && n.Parent.DataAtom == atom.Tbody
		}
		for _, tr := range scrape.FindAll(content, matcher) {
			torrents = append(torrents, ParseRecord(tr))
		}
	}
	return torrents, nil
}
Code example #2
File: auth.go — project: unixpickle/gscrape
// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
func (s *Session) Auth(serviceURL, email, password string) error {
	resp, err := s.Get(serviceURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	parsed, err := html.ParseFragment(resp.Body, nil)
	if err != nil {
		return err
	}
	if len(parsed) == 0 {
		// The original returned a nil error here, silently reporting
		// success for an empty document.
		return errors.New("login page has no root element")
	}
	root := parsed[0]

	form, ok := scrape.Find(root, scrape.ById("gaia_loginform"))
	if !ok {
		return errors.New("failed to process login page")
	}

	// Replay every input the form carries (hidden tokens included),
	// then override the credential fields.
	submission := url.Values{}
	for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) {
		submission.Add(getAttribute(input, "name"), getAttribute(input, "value"))
	}
	submission.Set("Email", email)
	submission.Set("Passwd", password)

	// Post back to the URL we actually landed on (after redirects).
	postResp, err := s.PostForm(resp.Request.URL.String(), submission)
	if err != nil {
		return err
	}
	postResp.Body.Close()

	// A successful login ends on a redirect (final method GET); still
	// being on a POST means the login form was re-served.
	if postResp.Request.Method == "POST" {
		return errors.New("login incorrect")
	}

	return nil
}
Code example #3
File: index.go — project: unixpickle/weakai
// indexPage downloads a wiki page, builds a word-frequency index over
// the text of its paragraphs, and collects an absolute URL for every
// wiki link found in the page body.
func indexPage(page string) (ind map[string]int, branches []string, err error) {
	resp, err := http.Get(page)
	if err != nil {
		return
	}
	root, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		return
	}

	content, ok := scrape.Find(root, scrape.ById("bodyContent"))
	if !ok {
		err = errors.New("no bodyContent element")
		return
	}

	// Gather the visible text of every paragraph into one buffer,
	// space-separated so words never run together across paragraphs.
	var text strings.Builder
	for _, p := range scrape.FindAll(content, scrape.ByTag(atom.P)) {
		text.WriteString(elementInnerText(p))
		text.WriteByte(' ')
	}

	// Count occurrences of each lower-cased word.
	ind = make(map[string]int)
	for _, w := range strings.Fields(strings.ToLower(text.String())) {
		ind[w]++
	}

	// Turn relative wiki links into absolute URLs.
	links := findWikiLinks(content)
	branches = make([]string, 0, len(links))
	for _, link := range links {
		branches = append(branches, "https://en.wikipedia.org"+link)
	}
	return
}
Code example #4
// fetchExtraScheduleInfo gets more information about each component.
//
// The rootNode argument should be the parsed schedule list view.
// fetchExtraScheduleInfo gets more information about each component.
//
// The rootNode argument should be the parsed schedule list view.
func fetchExtraScheduleInfo(client *http.Client, courses []Course, rootNode *html.Node) error {
	psForm, ok := scrape.Find(rootNode, scrape.ByClass("PSForm"))
	if !ok {
		return errors.New("could not find PSForm")
	}
	icsid, ok := scrape.Find(psForm, scrape.ById("ICSID"))
	if !ok {
		return errors.New("could not find ICSID")
	}

	formAction := getNodeAttribute(psForm, "action")
	sid := getNodeAttribute(icsid, "value")

	// TODO: figure out if there's a way to make this more robust or to load it lazily.
	sectionIndex := 0
	for courseIndex := range courses {
		course := &courses[courseIndex]
		for componentIndex := range course.Components {
			component := &course.Components[componentIndex]

			// Request the detail page for this section.
			postData := generateClassDetailForm(sid, sectionIndex)
			res, reqErr := client.PostForm(formAction, postData)
			if reqErr != nil {
				if res != nil {
					res.Body.Close()
				}
				return reqErr
			}

			courseOpen, parseErr := parseExtraComponentInfo(res.Body, component)
			// Close each body as soon as it is consumed. The original
			// used defer inside the loop, which kept every response
			// open until the function returned.
			res.Body.Close()
			if parseErr != nil {
				return parseErr
			}
			course.Open = &courseOpen

			// Post the "back" form so the server-side state is
			// positioned for the next section's detail request.
			postData = generateClassDetailBackForm(sid, sectionIndex)
			res, reqErr = client.PostForm(formAction, postData)
			if res != nil {
				res.Body.Close()
			}
			if reqErr != nil {
				return reqErr
			}

			sectionIndex++
		}
	}

	return nil
}
Code example #5
File: main.go — project: jmonmane/golang
// scraper loads a saved wiki HTML page from disk and prints the text
// of every element whose id matches the package-level tableID.
func scraper() {
	fd, err := os.Open("/mnt/hgfs/Downloads/wiki.html")
	if err != nil {
		panic(err)
	}
	defer fd.Close()

	root, err := html.Parse(fd)
	if err != nil {
		panic(err)
	}

	// NOTE(review): the original mixed the node-based scrape API with
	// an html.Tokenizer constructed from a *html.Node — which is not
	// an io.Reader, so that code did not compile. The node API alone
	// (matching the commented-out intent in the original) suffices.
	tableMatcher := scrape.ById(tableID)
	for _, table := range scrape.FindAll(root, tableMatcher) {
		fmt.Printf("%s\n", scrape.Text(table))
	}
}
Code example #6
File: main.go — project: ezeql/gin-scrape-example
// main starts a gin server exposing GET /movie/amazon/:amazon_id,
// which scrapes the amazon.de product page for that id and returns
// the movie metadata as JSON.
func main() {

	router := gin.Default()
	router.GET("/movie/amazon/:amazon_id", func(c *gin.Context) {

		id, valid := validateAndFormatAmazonID(c.Param("amazon_id"))

		if !valid {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": "invalid amazon id",
				"id":    id,
			})
			return
		}

		resp, err := http.Get("http://www.amazon.de/gp/product/" + id)

		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{
				// err.Error(): a bare error value marshals to "{}" in JSON.
				"error": err.Error(),
			})
			return
		}
		// Close the body on every path — the original leaked the
		// connection on each request.
		defer resp.Body.Close()

		//item does not exist in amazon.de
		if resp.StatusCode == http.StatusNotFound {
			c.JSON(http.StatusNotFound, gin.H{
				"error": "product not available",
			})
			return
		}

		root, err := html.Parse(resp.Body)

		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": err.Error(),
			})
			return
		}

		// Matches the <dd> holding the cast list: its parent carries the
		// meta-info class and the element two siblings back is the
		// "Darsteller:" (cast) label.
		actorsMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Dd && n.Parent != nil &&
				n.PrevSibling != nil && n.PrevSibling.PrevSibling != nil {
				return scrape.Attr(n.Parent, "class") == "dv-meta-info size-small" &&
					scrape.Text(n.PrevSibling.PrevSibling) == "Darsteller:"
			}
			return false
		}

		// Matches the poster <img> inside the meta-icon container.
		posterMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Img && n.Parent != nil {
				return scrape.Attr(n.Parent, "class") == "dp-meta-icon-container"
			}
			return false
		}

		//NOTE: Since this is a demo, I assume matchers will always hit a result

		movie := &Movie{}

		titleNode, _ := scrape.Find(root, scrape.ById("aiv-content-title"))
		movie.Title = scrape.Text(titleNode.FirstChild)

		releaseYearNode, _ := scrape.Find(root, scrape.ByClass("release-year"))
		year, _ := strconv.Atoi(scrape.Text(releaseYearNode))
		movie.ReleaseYear = year

		actorsNode, _ := scrape.Find(root, actorsMatcher)
		movie.Actors = strings.Split(scrape.Text(actorsNode), ",")

		posterNode, _ := scrape.Find(root, posterMatcher)
		movie.Poster = scrape.Attr(posterNode, "src")

		// Collect the ASINs of similar downloadable movies.
		movieNodes := scrape.FindAll(root, scrape.ByClass("downloadable_movie"))
		ids := make([]string, len(movieNodes))
		for i, movieNode := range movieNodes {
			ids[i] = scrape.Attr(movieNode, "data-asin")
		}
		movie.SimilarIDs = ids

		c.JSON(http.StatusOK, movie)
	})

	router.Run(":8080")
}
Code example #7
// parseExtraComponentInfo parses the "Class Detail" page for a component.
func parseExtraComponentInfo(body io.Reader, component *Component) (courseOpen bool, err error) {
	nodes, err := html.ParseFragment(body, nil)
	if err != nil {
		return
	}
	if len(nodes) != 1 {
		return false, errors.New("invalid number of root elements")
	}

	openStatus, ok := scrape.Find(nodes[0], scrape.ById("SSR_CLS_DTL_WRK_SSR_DESCRSHORT"))
	if !ok {
		return false, errors.New("open status not found")
	}
	courseOpen = (nodeInnerText(openStatus) == "Open")

	availTable, ok := scrape.Find(nodes[0], scrape.ById("ACE_SSR_CLS_DTL_WRK_GROUP3"))
	if !ok {
		return courseOpen, errors.New("could not find availability info")
	}

	rows := scrape.FindAll(availTable, scrape.ByTag(atom.Tr))
	if len(rows) != 7 {
		return courseOpen, errors.New("invalid number of rows in availability table")
	}

	var availability ClassAvailability

	cols := nodesWithAlignAttribute(scrape.FindAll(rows[2], scrape.ByTag(atom.Td)))
	if len(cols) != 2 {
		return courseOpen, errors.New("expected 2 aligned columns in row 2")
	}
	availability.Capacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}
	availability.WaitListCapacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
	if err != nil {
		return
	}

	cols = nodesWithAlignAttribute(scrape.FindAll(rows[4], scrape.ByTag(atom.Td)))
	if len(cols) != 2 {
		return courseOpen, errors.New("expected 2 aligned columns in row 4")
	}
	availability.EnrollmentTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}
	availability.WaitListTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
	if err != nil {
		return
	}

	cols = nodesWithAlignAttribute(scrape.FindAll(rows[6], scrape.ByTag(atom.Td)))
	if len(cols) != 1 {
		return courseOpen, errors.New("expected 1 aligned column in row 6")
	}
	availability.AvailableSeats, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}

	component.ClassAvailability = &availability

	return
}