Golang Find示例，github.com/yhat/scrape.Find Golang示例

示例#1

0

显示文件

文件： wdr.go 项目： mro/internet-radio-recorder

func (bc *broadcast) parseBroadcastFromHtmlNode(root *html.Node) (ret []*r.Broadcast, err error) {
	{
		// Author
		meta, _ := scrape.Find(root, func(n *html.Node) bool {
			return atom.Meta == n.DataAtom && "Author" == scrape.Attr(n, "name")
		})
		if nil != meta {
			content := scrape.Attr(meta, "content")
			bc.Author = &content
		}
	}
	for idx, epg := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Div == n.DataAtom && "epg-content-right" == scrape.Attr(n, "class")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <div class='epg-content-right'/>")
			return
		}
		{
			// TitleEpisode
			txt, _ := scrape.Find(epg, func(n *html.Node) bool {
				return html.TextNode == n.Type && atom.H3 == n.Parent.DataAtom && atom.Br == n.NextSibling.DataAtom
			})
			if nil != txt {
				t := strings.TrimSpace(r.NormaliseWhiteSpace(txt.Data))
				bc.TitleEpisode = &t
				txt.Parent.RemoveChild(txt.NextSibling)
				txt.Parent.RemoveChild(txt)
			}
		}
		{
			// Subject
			a, _ := scrape.Find(epg, func(n *html.Node) bool {
				return atom.Div == n.Parent.DataAtom && "sendungsLink" == scrape.Attr(n.Parent, "class") && atom.A == n.DataAtom
			})
			if nil != a {
				u, _ := url.Parse(scrape.Attr(a, "href"))
				bc.Subject = bc.Source.ResolveReference(u)
			}
		}
		// purge some cruft
		for _, nn := range scrape.FindAll(epg, func(n *html.Node) bool {
			clz := scrape.Attr(n, "class")
			return atom.H2 == n.DataAtom ||
				"mod modSharing" == clz ||
				"modGalery" == clz ||
				"sendungsLink" == clz ||
				"tabs-container" == clz
		}) {
			nn.Parent.RemoveChild(nn)
		}
		{
			description := r.TextWithBrFromNodeSet(scrape.FindAll(epg, func(n *html.Node) bool { return epg == n.Parent }))
			bc.Description = &description
		}
	}
	bc_ := r.Broadcast(*bc)
	ret = append(ret, &bc_)
	return
}

示例#2

0

显示文件

文件： schedule.go 项目： unixpickle/better-student-center

// fetchExtraScheduleInfo gets more information about each component.
//
// The rootNode argument should be the parsed schedule list view.
func fetchExtraScheduleInfo(client *http.Client, courses []Course, rootNode *html.Node) error {
	psForm, ok := scrape.Find(rootNode, scrape.ByClass("PSForm"))
	if !ok {
		return errors.New("could not find PSForm")
	}
	icsid, ok := scrape.Find(psForm, scrape.ById("ICSID"))
	if !ok {
		return errors.New("could not find ICSID")
	}

	formAction := getNodeAttribute(psForm, "action")
	sid := getNodeAttribute(icsid, "value")

	// TODO: figure out if there's a way to make this more robust or to load it lazily.
	sectionIndex := 0
	for courseIndex := range courses {
		course := &courses[courseIndex]
		for componentIndex := range course.Components {
			component := &course.Components[componentIndex]

			postData := generateClassDetailForm(sid, sectionIndex)
			res, reqErr := client.PostForm(formAction, postData)
			if res != nil {
				defer res.Body.Close()
			}
			if reqErr != nil {
				return reqErr
			}

			courseOpen, parseErr := parseExtraComponentInfo(res.Body, component)
			if parseErr != nil {
				return parseErr
			}
			course.Open = &courseOpen

			postData = generateClassDetailBackForm(sid, sectionIndex)
			res, reqErr = client.PostForm(formAction, postData)
			if res != nil {
				defer res.Body.Close()
			}
			if reqErr != nil {
				return reqErr
			}

			sectionIndex++
		}
	}

	return nil
}

示例#3

0

显示文件

文件： craigslist.go 项目： matthewdu/powerplug

func NewListing(ctx appengine.Context, url string) (*Listing, error) {
	client := urlfetch.Client(ctx)
	resp, err := client.Get("http://167.88.16.61:2138/" + url)
	if err != nil {
		ctx.Errorf("%s", err)
	}
	ctx.Debugf("Craigslist request came back with status: %s", resp.Status)
	if err != nil {
		ctx.Errorf("%s", err)
		return nil, errors.New("Get listing failed")
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		ctx.Errorf("%s", "Parsing Error")
		return nil, errors.New("Parse body failed")
	}

	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		ctx.Errorf("%s", "Error getting title")
		return nil, errors.New("Get title failed")
	}
	price, ok := scrape.Find(root, scrape.ByClass("price"))
	if !ok {
		ctx.Errorf("%s", "Error getting price")
		return nil, errors.New("Get price failed")
	}
	intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
	if err != nil {
		ctx.Errorf("Error casting price: %s", scrape.Text(price))
		return nil, err
	}
	images := scrape.FindAll(root, scrape.ByTag(atom.Img))
	imageUrl := ""
	for _, image := range images {
		if scrape.Attr(image, "title") == "image 1" {
			imageUrl = scrape.Attr(image, "src")
		}
	}

	ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))

	return &Listing{
		Url:      url,
		Title:    scrape.Text(title),
		Price:    intPrice,
		ImageUrl: imageUrl,
	}, nil
}

示例#4

0

显示文件

文件： auth.go 项目： unixpickle/gscrape

// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
func (s *Session) Auth(serviceURL, email, password string) error {
	resp, err := s.Get(serviceURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	parsed, err := html.ParseFragment(resp.Body, nil)
	if err != nil || len(parsed) == 0 {
		return err
	}
	root := parsed[0]
	form, ok := scrape.Find(root, scrape.ById("gaia_loginform"))
	if !ok {
		return errors.New("failed to process login page")
	}
	submission := url.Values{}
	for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) {
		submission.Add(getAttribute(input, "name"), getAttribute(input, "value"))
	}
	submission["Email"] = []string{email}
	submission["Passwd"] = []string{password}

	postResp, err := s.PostForm(resp.Request.URL.String(), submission)
	if err != nil {
		return err
	}
	postResp.Body.Close()

	if postResp.Request.Method == "POST" {
		return errors.New("login incorrect")
	}

	return nil
}

示例#5

0

显示文件

文件： main.go 项目： anykao/p

func TorrentList(url string) ([]Torrent, error) {
	// request and parse the front page
	resp, err := http.Get(url)
	if err != nil {
		return make([]Torrent, 0), err
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		return make([]Torrent, 0), err
	}
	var torrents []Torrent
	if content, ok := scrape.Find(root, scrape.ById("searchResult")); ok {
		// define a matcher
		matcher := func(n *html.Node) bool {
			// must check for nil values
			if n.DataAtom == atom.Tr && n.Parent.DataAtom == atom.Tbody {
				return true
			}
			return false
		}
		// grab all articles and print them
		trs := scrape.FindAll(content, matcher)
		for _, tr := range trs {
			torrents = append(torrents, ParseRecord(tr))
		}
	}
	resp.Body.Close()
	return torrents, nil
}

示例#6

0

显示文件

文件： index.go 项目： unixpickle/weakai

func indexPage(page string) (ind map[string]int, branches []string, err error) {
	resp, err := http.Get(page)
	if err != nil {
		return
	}
	root, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		return
	}

	content, ok := scrape.Find(root, scrape.ById("bodyContent"))
	if !ok {
		return nil, nil, errors.New("no bodyContent element")
	}

	paragraphs := scrape.FindAll(content, scrape.ByTag(atom.P))
	pageText := ""
	for _, p := range paragraphs {
		pageText += elementInnerText(p) + " "
	}
	words := strings.Fields(strings.ToLower(pageText))

	ind = map[string]int{}
	for _, word := range words {
		ind[word] = ind[word] + 1
	}

	links := findWikiLinks(content)
	branches = make([]string, len(links))
	for i, link := range links {
		branches[i] = "https://en.wikipedia.org" + link
	}
	return
}

示例#7

0

显示文件

文件： co.go 项目： gozes/co

func fillJobStruct(n *html.Node) *Job {
	job := new(Job)
	job.Title = scrape.Text(n)
	job.RetriveOn = time.Now().Format(time.RFC822Z)
	job.url = jobUrl(n)
	fmt.Println(job.url)
	job.ID = jobID(job.url)
	job.EmailFormLink = jobEmailFromUrl + job.ID
	jp := fetchByID(job.ID)
	job.jobPage = jp
	desc, _ := scrape.Find(job.jobPage, descriptionMatcher)
	job.Description = scrape.Text(desc)
	req, _ := scrape.Find(job.jobPage, requiermentMatcher)
	job.Requierments = scrape.Text(req)

	return job
}

示例#8

0

显示文件

文件： html.go 项目： unixpickle/better-student-center

// parseGenericLoginForm takes a login page and parses the first form it finds, treating it as the
// login form.
func parseGenericLoginForm(res *http.Response) (result *loginFormInfo, err error) {
	parsed, err := html.ParseFragment(res.Body, nil)
	if err != nil {
		return
	} else if len(parsed) != 1 {
		return nil, errors.New("wrong number of root elements")
	}

	root := parsed[0]

	var form loginFormInfo

	htmlForm, ok := scrape.Find(root, scrape.ByTag(atom.Form))
	if !ok {
		return nil, errors.New("no form element found")
	}

	if actionStr := getNodeAttribute(htmlForm, "action"); actionStr == "" {
		form.action = res.Request.URL.String()
	} else {
		actionURL, err := url.Parse(actionStr)
		if err != nil {
			return nil, err
		}
		if actionURL.Host == "" {
			actionURL.Host = res.Request.URL.Host
		}
		if actionURL.Scheme == "" {
			actionURL.Scheme = res.Request.URL.Scheme
		}
		if !path.IsAbs(actionURL.Path) {
			actionURL.Path = path.Join(res.Request.URL.Path, actionURL.Path)
		}
		form.action = actionURL.String()
	}

	inputs := scrape.FindAll(root, scrape.ByTag(atom.Input))
	form.otherFields = url.Values{}
	for _, input := range inputs {
		inputName := getNodeAttribute(input, "name")
		switch getNodeAttribute(input, "type") {
		case "text":
			form.usernameField = inputName
		case "password":
			form.passwordField = inputName
		default:
			form.otherFields.Add(inputName, getNodeAttribute(input, "value"))
		}
	}

	if form.usernameField == "" {
		return nil, errors.New("no username field found")
	} else if form.passwordField == "" {
		return nil, errors.New("no password field found")
	}

	return &form, nil
}

示例#9

0

显示文件

文件： reddit_scraper.go 项目： jalavosus/redditscraper

func parsepost(n *html.Node) Post {
	post := Post{}

	// get the title. uses a scrape inbuilt matcher
	title_scrape, _ := scrape.Find(n, scrape.ByClass("title"))
	title := scrape.Text(title_scrape.FirstChild)

	// get the subreddit. This requires a custom matcher.
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.A && n.Parent != nil {
			return scrape.Attr(n, "class") == "subreddit hover may-blank"
		}
		return false
	}
	sub, _ := scrape.Find(n, matcher)
	subreddit := scrape.Text(sub)

	// get the url to the comments. requires custom matcher.
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Ul && n.FirstChild != nil {
			return scrape.Attr(n, "class") == "flat-list buttons" && scrape.Attr(n.FirstChild, "class") == "first"
		}
		return false
	}
	ul, _ := scrape.Find(n, matcher)          // ul is a list of two buttons: one that links to a post's comments page, one a "share" function
	li := ul.FirstChild                       // the first list item of ul -- this will always be the comments page link.
	url := scrape.Attr(li.FirstChild, "href") // finally, the url found in the list item.

	// get the author. Uses custom matcher and magic.
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.A && n.Parent.DataAtom == atom.P {
			return strings.Contains(scrape.Attr(n, "href"), "/user/")
		}
		return false
	}
	author_scrape, _ := scrape.Find(n, matcher)
	author := scrape.Text(author_scrape)

	post.title = title
	post.subreddit = subreddit
	post.url = url
	post.author = author

	return post
}

示例#10

0

显示文件

文件： title.go 项目： mcmillan/socialite

func findHTMLTitle(doc *html.Node) string {
	el, found := scrape.Find(doc, scrape.ByTag(atom.Title))

	if !found {
		return ""
	}

	return scrape.Text(el)
}

示例#11

0

显示文件

文件： main.go 项目： anykao/p

func ParseName(n *html.Node) (string, string, string) {
	matcher := func(n *html.Node) bool {
		// must check for nil values
		if n.DataAtom == atom.A && n.Parent.DataAtom == atom.Td {
			return true
		}
		return false
	}

	var name, magnet, desc string

	if detName, ok := scrape.Find(n, scrape.ByClass("detName")); ok {
		name = scrape.Text(detName)
	}
	if anchor, ok := scrape.Find(n, matcher); ok {
		magnet = scrape.Attr(anchor, "href")
	}
	if detDesc, ok := scrape.Find(n, scrape.ByClass("detDesc")); ok {
		desc = scrape.Text(detDesc)
	}
	return name, magnet, desc
}

示例#12

0

显示文件

文件： search.go 项目： mrap/twitterget

func TweetsToUser(u user.User) []tweet.Tweet {
	reqURL := SearchURL
	_url.SetQueryParams(&reqURL, map[string]string{
		"q": "to:" + u.ScreenName,
		"f": "tweets",
	})

	res, err := http.Get(reqURL.String())
	PanicIf(err)
	root, err := html.Parse(res.Body)
	PanicIf(err)

	tweetsMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Div && strings.HasPrefix(scrape.Attr(n, "class"), "tweet original-tweet")
	}
	tweetScreenNameMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Span && strings.HasPrefix(scrape.Attr(n, "class"), "username")
	}
	tweetTextMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.P && strings.HasSuffix(scrape.Attr(n, "class"), "tweet-text")
	}

	tweetNodes := scrape.FindAll(root, tweetsMatcher)
	tweets := make([]tweet.Tweet, len(tweetNodes))
	for i, n := range tweetNodes {
		t := tweet.Tweet{
			ID: scrape.Attr(n, "data-user-id"),
		}
		if child, ok := scrape.Find(n, tweetScreenNameMatcher); ok {
			t.Author = *user.NewUser(scrape.Text(child))
		}
		if child, ok := scrape.Find(n, tweetTextMatcher); ok {
			t.Text = scrape.Text(child)
		}
		tweets[i] = t
	}

	return tweets
}

示例#13

0

显示文件

文件： gobot.go 项目： ChrisFernandez/GoBot

func resolveUrl(website string) string {
	site := getURL(website)

	contents, err := html.Parse(site.Body)
	if err != nil {
		fmt.Printf("%s", err)
		os.Exit(1)
		panic(err)
	}
	title, _ := scrape.Find(contents, scrape.ByTag(atom.Title))
	var titulo string = scrape.Text(title)
	return titulo

}

示例#14

0

显示文件

文件： title.go 项目： mcmillan/socialite

func findOpenGraphTitle(doc *html.Node) string {
	el, found := scrape.Find(doc, func(n *html.Node) bool {
		if n.DataAtom == atom.Meta {
			return scrape.Attr(n, "property") == "og:title" && scrape.Attr(n, "content") != ""
		}

		return false
	})

	if !found {
		return ""
	}

	return scrape.Attr(el, "content")
}

示例#15

0

显示文件

文件： title.go 项目： mcmillan/socialite

func findTwitterTitle(doc *html.Node) string {
	el, found := scrape.Find(doc, func(n *html.Node) bool {
		if n.DataAtom == atom.Meta {
			return scrape.Attr(n, "name") == "twitter:title" && scrape.Attr(n, "content") != ""
		}

		return false
	})

	if !found {
		return ""
	}

	return scrape.Attr(el, "content")
}

示例#16

0

显示文件

文件： gobot.go 项目： ChrisFernandez/GoBot

func queryWikipedia(word string) string {
	word = strings.TrimSpace(word)
	website := "http://en.wikipedia.com/wiki/" + word
	site := getURL(website)
	contents, err := html.Parse(site.Body)

	if err != nil {
		fmt.Print("%s", err)
		panic(err)
		os.Exit(1)
	}
	intro, _ := scrape.Find(contents, scrape.ByTag(atom.P))
	var resp string = scrape.Text(intro)
	return resp
}

示例#17

0

显示文件

文件： youtube.go 项目： unixpickle/gscrape

// History asynchronously fetches the user's
// video viewing history.
// You may provide a cancel channel which you
// can close to cancel the fetch mid-way.
func (y *Youtube) History(cancel <-chan struct{}) (<-chan *YoutubeVideoInfo, <-chan error) {
	videoChan := make(chan *YoutubeVideoInfo)
	errChan := make(chan error, 1)

	go func() {
		defer close(videoChan)
		defer close(errChan)

		historyReq, _ := http.NewRequest("GET", "https://www.youtube.com/feed/history", nil)
		historyReq.Header.Set("User-Agent", spoofedUserAgent)
		resp, err := y.s.Do(historyReq)
		rootNode, err := html.Parse(resp.Body)
		resp.Body.Close()
		if err != nil {
			errChan <- err
			return
		}

		loadMoreHTML := rootNode
		contentHTML := rootNode
		for {
			items := parseHistoryItems(contentHTML)
			for _, item := range items {
				select {
				case videoChan <- item:
				case <-cancel:
					return
				}
			}

			if loadMoreHTML == nil {
				break
			}

			loadButton, ok := scrape.Find(loadMoreHTML, scrape.ByClass("yt-uix-load-more"))
			if ok {
				morePath := scrape.Attr(loadButton, "data-uix-load-more-href")
				loadMoreHTML, contentHTML, err = y.fetchMoreHistory(morePath)
				if err != nil {
					errChan <- err
					return
				}
			}
		}
	}()

	return videoChan, errChan
}

示例#18

0

显示文件

文件： reddit_scraper.go 项目： jalavosus/redditscraper

func main() {

	resp, err := http.Get("https://www.reddit.com")
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "id") == "siteTable"
		}
		return false
	}
	table, ok := scrape.Find(root, matcher)
	if !ok {
		panic(ok)
	}
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "data-type") == "link"
		}
		return false
	}

	articles := scrape.FindAll(table, matcher)
	var posts []Post

	for i := 0; i < len(articles); i++ {
		wg.Add(1)
		go func(n *html.Node) {
			post := parsepost(n)
			posts = append(posts, post)
			wg.Done()
		}(articles[i])
	}

	wg.Wait()

	for i := 0; i < len(posts); i++ {
		printpost(posts[i])
	}

}

示例#19

0

显示文件

文件： schedule.go 项目： unixpickle/better-student-center

// parseCurrentSchedule parses the courses from the schedule list view page.
//
// If fetchMoreInfo is true, this will perform a request for each component to find out information
// about it.
func parseSchedule(rootNode *html.Node) ([]Course, error) {
	courseTables := scrape.FindAll(rootNode, scrape.ByClass("PSGROUPBOXWBO"))
	result := make([]Course, 0, len(courseTables))
	for _, classTable := range courseTables {
		println("found course")

		titleElement, ok := scrape.Find(classTable, scrape.ByClass("PAGROUPDIVIDER"))
		if !ok {
			// This will occur at least once, since the filter options are a PSGROUPBOXWBO.
			continue
		}

		infoTables := scrape.FindAll(classTable, scrape.ByClass("PSLEVEL3GRIDNBO"))
		if len(infoTables) != 2 {
			return nil, errors.New("expected exactly 2 info tables but found " +
				strconv.Itoa(len(infoTables)))
		}

		courseInfoTable := infoTables[0]
		course, err := parseCourseInfoTable(courseInfoTable)
		if err != nil {
			return nil, err
		}

		// NOTE: there isn't really a standard way to parse the department/number.
		course.Name = nodeInnerText(titleElement)

		componentsInfoTable := infoTables[1]
		componentMaps, err := tableEntriesAsMaps(componentsInfoTable)
		if err != nil {
			return nil, err
		}
		course.Components = make([]Component, len(componentMaps))
		for i, componentMap := range componentMaps {
			course.Components[i], err = parseComponentInfoMap(componentMap)
			if err != nil {
				return nil, err
			}
		}

		result = append(result, course)
	}
	return result, nil
}

示例#20

0

显示文件

文件： main.go 项目： jasonthomas/teddy-go

func getTitle(url string) string {

	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("error:", err)
		return "error"
	}

	root, err := html.Parse(resp.Body)
	if err != nil {
		fmt.Println("error:", err)
		return "error"
	}

	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))

	if ok {
		return scrape.Text(title)
	}

	return "unknown"
}

示例#21

0

显示文件

文件： youtube.go 项目： unixpickle/gscrape

func parseVideoInfo(element *html.Node) *YoutubeVideoInfo {
	var info YoutubeVideoInfo

	info.ID = scrape.Attr(element, "data-context-item-id")

	thumbnailContainer, ok := scrape.Find(element, scrape.ByClass("yt-thumb-simple"))
	if ok {
		thumbnailImage, ok := scrape.Find(thumbnailContainer, scrape.ByTag(atom.Img))
		if ok {
			info.ThumbnailURL, _ = url.Parse(scrape.Attr(thumbnailImage, "src"))
		}
	}

	videoTimeElement, ok := scrape.Find(element, scrape.ByClass("video-time"))
	if ok {
		durationStr := strings.TrimSpace(scrape.Text(videoTimeElement))
		info.Length, _ = parseVideoDuration(durationStr)
	}

	linkFieldClasses := []string{"yt-lockup-title", "yt-lockup-byline"}
	linkFieldPtrs := []*string{&info.Title, &info.Author}
	for i, class := range linkFieldClasses {
		linkContainer, ok := scrape.Find(element, scrape.ByClass(class))
		if ok {
			link, ok := scrape.Find(linkContainer, scrape.ByTag(atom.A))
			if ok {
				*linkFieldPtrs[i] = strings.TrimSpace(scrape.Text(link))
			}
		}
	}

	descBox, ok := scrape.Find(element, scrape.ByClass("yt-lockup-description"))
	if ok {
		info.Description = strings.TrimSpace(scrape.Text(descBox))
	}

	return &info
}

示例#22

0

显示文件

文件： co.go 项目： gozes/co

func jobCaptChaUrl(n *html.Node) string {
	img, _ := scrape.Find(n, captchaImageMatcher)
	return baseUrl + scrape.Attr(img, "src")
}

示例#23

0

显示文件

文件： main.go 项目： ngyewch/gp-scraper

func doScrape(urlString string) AppData {
	fmt.Println(urlString)

	u, err := url.Parse(urlString)
	if err != nil {
		panic(err)
	}

	appData := AppData{}
	appData.PackageName = u.Query().Get("id")

	resp, err := http.Get(urlString)
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	genreMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "genre"
	}
	iconMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "image"
	}
	softwareVersionMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "softwareVersion"
	}

	name, ok := scrape.Find(root, scrape.ByClass("id-app-title"))
	if ok {
		appData.Name = scrape.Text(name)
	}
	genre, ok := scrape.Find(root, genreMatcher)
	if ok {
		appData.Categories = append(appData.Categories, scrape.Text(genre))
	}
	icon, ok := scrape.Find(root, iconMatcher)
	if ok {
		iconSrc := scrape.Attr(icon, "src")
		iconUrl, err := url.Parse(iconSrc)
		if err != nil {
			panic(err)
		}
		if iconUrl.Scheme == "" {
			iconSrc = "https:" + iconSrc
		}

		resp, err = http.Get(iconSrc)
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()

		outputFile, err := os.Create("output/" + appData.PackageName + ".png")
		if err != nil {
			panic(err)
		}
		defer outputFile.Close()

		_, err = io.Copy(outputFile, resp.Body)
		if err != nil {
			panic(err)
		}
	}
	version, ok := scrape.Find(root, softwareVersionMatcher)
	if ok {
		appData.Version = strings.TrimSpace(scrape.Text(version))
	}

	return appData
}

示例#24

0

显示文件

文件： livescore.go 项目： yogyrahmawan/soccer-score

//LivescoreParser parse livescore
func LivescoreParser(root *html.Node) []Match {
	var matches []Match

	contentElmt, contentOK := scrape.Find(root, scrape.ByClass(classContentTag))
	if contentOK {
		//find all row-gray
		rowGrayMatcher := func(n *html.Node) bool {
			classes := strings.Fields(scrape.Attr(n, "class"))
			for _, c := range classes {
				if c == classRowGray {
					parentClasses := strings.Fields(scrape.Attr(n.Parent, "class"))
					for _, pc := range parentClasses {
						if pc == classContentTag {
							return true
						}
					}
				}
			}
			return false
		}
		rows := scrape.FindAll(contentElmt, rowGrayMatcher)

		matchChann := make(chan Match)
		for _, rowElmt := range rows {
			go func(rowElmt *html.Node) {
				var time string
				var homeTeam string
				var awayTeam string
				var score string

				timeElmt, timeElmtOK := scrape.Find(rowElmt, scrape.ByClass(classMinElmt))
				if timeElmtOK {
					time = scrape.Text(timeElmt)
				}

				scoreElmt, scoreElmtOK := scrape.Find(rowElmt, scrape.ByClass(classScoreLink))
				if scoreElmtOK {
					score = scrape.Text(scoreElmt)
				}

				teamElmts := scrape.FindAll(rowElmt, scrape.ByClass(classPlyElmt))
				for i := 0; i < len(teamElmts); i++ {
					teamElmt := teamElmts[i]
					if i%2 == 0 {
						homeTeam = scrape.Text(teamElmt)
					} else {
						awayTeam = scrape.Text(teamElmt)
					}
				}
				match := Match{
					HomeTeam: homeTeam,
					AwayTeam: awayTeam,
					Score:    score,
					Time:     time,
				}

				matchChann <- match
			}(rowElmt)
		}

		for i := 0; i < len(rows); i++ {
			select {
			case m := <-matchChann:
				matches = append(matches, m)
			}
		}
		close(matchChann)
	}
	return matches
}

示例#25

0

显示文件

文件： reddit.go 项目： ghost2238/zax-irc

func Search(url string) (string, bool) {
	resp, err := http.Get("https://www.reddit.com/search?q=url%3A" + url + "&sort=new&t=all")
	if err != nil {
		return "", false
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		return "", false
	}

	matcher := func(n *html.Node) bool {
		return scrape.Attr(n, "class") == "search-title may-blank"
	}
	m_comments := func(n *html.Node) bool {
		if n == nil {
			return false
		}
		return scrape.Attr(n, "class") == "search-comments may-blank"
	}
	m_subreddit := func(n *html.Node) bool {
		if n == nil {
			return false
		}
		return scrape.Attr(n, "class") == "search-subreddit-link may-blank"
	}
	m_time := func(n *html.Node) bool {
		if n == nil {
			return false
		}
		return scrape.Attr(n, "datetime") != ""
	}

	post, err_ := scrape.Find(root, matcher)
	if post == nil {
		return "", false
	}
	if post.Parent == nil {
		return "", false
	}
	if post.Parent.Parent == nil {
		return "", false
	}
	main := post.Parent.Parent
	s_comments := "%error%"
	s_time := "%error%"
	s_subreddit := "%error%"
	title := scrape.Text(post)
	href := scrape.Attr(post, "href")

	comments, err_ := scrape.Find(main, m_comments)
	if err_ == true {
		s_comments = scrape.Text(comments)
	}
	time, err_ := scrape.Find(main, m_time)
	if err_ == true {
		s_time = scrape.Text(time)
	}
	subreddit, err_ := scrape.Find(main, m_subreddit)
	if err_ == true {
		s_subreddit = scrape.Text(subreddit)
	}

	re := regexp.MustCompile("comments/([[:alnum:]]+)/")
	match := re.FindStringSubmatch(href)
	s_url := "https://redd.it/" + match[1]
	s_final := fmt.Sprintf("[Reddit %s] %s (%s) - %s [%s]\n", s_subreddit, title, s_url, s_comments, s_time)
	return s_final, true
}

示例#26

0

显示文件

文件： hackernewsFetchGorp.go 项目： jimmytuc/intogooglego

// Parse for posts in html from hackernews, input html is an io.Reader and returns recognized posts in a psout slice of posts.
// Errors which affect only a single post are stored in their post.Err
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {

	root, err := html.Parse(body)
	if err != nil {
		err = errors.New("Failed to html.Parse: " + err.Error())
		return
	}

	// define a matcher
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Tr && n.Parent != nil && n.Parent.DataAtom == atom.Tbody {
			matched := scrape.Attr(n, "class") == "athing"
			return matched
		}
		return false
	}

	// grab all articles and loop over them
	articles := scrape.FindAll(root, matcher)
	for _, article := range articles {
		var ok bool
		// Get one post entry
		var titlenode *html.Node

		titlenode, ok = scrape.Find(article,
			func(n *html.Node) bool {
				if n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "title" {
					return true
				}
				return false
			})
		if !ok {
			continue
		}
		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "hackernews"

		post.Title = scrape.Text(titlenode)
		post.Url = scrape.Attr(titlenode, "href")
		if strings.HasPrefix(post.Url, "item?id=") {
			post.Url = "https://news.ycombinator.com/" + post.Url
		}

		ps = append(ps, &post)

		// Get additional info for this post
		scorenode := article.NextSibling
		if scorenode == nil {
			post.Err = errors.New("Did not find score for: %s\n" + scrape.Text(article))
			continue
		}

		// Get the subtext containing scores, user and date
		subtext, ok := scrape.Find(scorenode,
			func(n *html.Node) bool {
				if scrape.Attr(n, "class") == "subtext" {
					return true
				}
				return false
			})
		if !ok {
			post.Err = errors.New(fmt.Sprintf("Did not find siblings for subtext %s\n", scorenode.Data))
			continue
		}

		subs := scrape.FindAll(subtext,
			func(n *html.Node) bool {
				// Get the PostId and Score
				// span class="score" id="score_9643579">92 points</span>
				if n.DataAtom == atom.Span && scrape.Attr(n, "class") == "score" && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {

					// Get score
					var scoreid int
					scorestr := strings.Split(scrape.Text(n), " ")[0]
					scoreid, err = strconv.Atoi(scorestr)
					if err != nil {
						fmt.Printf("Failed to convert score %s to int: %s\n", scorestr, err.Error())
						return false
					}
					post.Score = scoreid

					// Get PostId
					postidstr := scrape.Attr(n, "id")
					if len(strings.Split(postidstr, "_")) > 1 {
						post.WebPostId = strings.Split(postidstr, "_")[1]
						return true
					}
				}
				// Get the Username and Creation Date for this post
				if scrape.Attr(n.Parent, "class") == "subtext" && n.DataAtom == atom.A && n.Parent != nil {
					href := strings.ToLower(scrape.Attr(n, "href"))
					if href != "" {
						s := strings.Split(href, "?")
						if s[0] == "user" && len(s) > 1 {
							// Username
							u := strings.Split(s[1], "=")
							if len(u) > 1 {
								post.User = u[1]
								return true
							}
						} else {
							if s[0] == "item" && len(s) > 1 {
								// Created date
								createdago := scrape.Text(n)
								if strings.Contains(createdago, "ago") {
									var postDate time.Time

									postDate, err = GetDateFromCreatedAgo(createdago)
									if err != nil {
										err = errors.New(fmt.Sprintf("Failed to convert to date: %V\n", createdago))
										return false
									}
									post.PostDate = postDate

									return true
								}
							}
						}
					}
				} // end "class" == "subtext"
				return false
			})

		if len(subs) == 0 {
			var w bytes.Buffer
			if rerr := html.Render(&w, subtext); rerr != nil {
				fmt.Printf("Render error: %s\n", rerr)
			}
			post.Err = errors.New(fmt.Sprintf("Unable to parse score,user,date from %s:\n %s\n", post.Title, w.String()))
		}
	}

	return ps, err
}

示例#27

0

显示文件

文件： schedule.go 项目： unixpickle/better-student-center

// parseExtraComponentInfo parses the "Class Detail" page for a component.
func parseExtraComponentInfo(body io.Reader, component *Component) (courseOpen bool, err error) {
	nodes, err := html.ParseFragment(body, nil)
	if err != nil {
		return
	}
	if len(nodes) != 1 {
		return false, errors.New("invalid number of root elements")
	}

	openStatus, ok := scrape.Find(nodes[0], scrape.ById("SSR_CLS_DTL_WRK_SSR_DESCRSHORT"))
	if !ok {
		return false, errors.New("open status not found")
	}
	courseOpen = (nodeInnerText(openStatus) == "Open")

	availTable, ok := scrape.Find(nodes[0], scrape.ById("ACE_SSR_CLS_DTL_WRK_GROUP3"))
	if !ok {
		return courseOpen, errors.New("could not find availability info")
	}

	rows := scrape.FindAll(availTable, scrape.ByTag(atom.Tr))
	if len(rows) != 7 {
		return courseOpen, errors.New("invalid number of rows in availability table")
	}

	var availability ClassAvailability

	cols := nodesWithAlignAttribute(scrape.FindAll(rows[2], scrape.ByTag(atom.Td)))
	if len(cols) != 2 {
		return courseOpen, errors.New("expected 2 aligned columns in row 2")
	}
	availability.Capacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}
	availability.WaitListCapacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
	if err != nil {
		return
	}

	cols = nodesWithAlignAttribute(scrape.FindAll(rows[4], scrape.ByTag(atom.Td)))
	if len(cols) != 2 {
		return courseOpen, errors.New("expected 2 aligned columns in row 4")
	}
	availability.EnrollmentTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}
	availability.WaitListTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
	if err != nil {
		return
	}

	cols = nodesWithAlignAttribute(scrape.FindAll(rows[6], scrape.ByTag(atom.Td)))
	if len(cols) != 1 {
		return courseOpen, errors.New("expected 1 aligned column in row 6")
	}
	availability.AvailableSeats, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
	if err != nil {
		return
	}

	component.ClassAvailability = &availability

	return
}

示例#28

0

显示文件

文件： co.go 项目： gozes/co

func checkNextPage(s JobSearch) bool {
	_, ok := scrape.Find(s.root, nextPageMatcher)
	fmt.Println(ok)

	return ok
}

示例#29

0

显示文件

文件： main.go 项目： ezeql/gin-scrape-example

func main() {

	router := gin.Default()
	router.GET("/movie/amazon/:amazon_id", func(c *gin.Context) {

		id, valid := validateAndFormatAmazonID(c.Param("amazon_id"))

		if !valid {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": "invalid amazon id",
				"id":    id,
			})
			return
		}

		resp, err := http.Get("http://www.amazon.de/gp/product/" + id)

		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": err,
			})
			return
		}

		//item does not exist in amazon.de
		if resp.StatusCode == http.StatusNotFound {
			c.JSON(http.StatusNotFound, gin.H{
				"error": "product not available",
			})
			return
		}

		root, err := html.Parse(resp.Body)

		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": err,
			})
			return
		}

		actorsMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Dd && n.Parent != nil &&
				n.PrevSibling != nil && n.PrevSibling.PrevSibling != nil {
				return scrape.Attr(n.Parent, "class") == "dv-meta-info size-small" &&
					scrape.Text(n.PrevSibling.PrevSibling) == "Darsteller:"
			}
			return false
		}

		posterMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Img && n.Parent != nil {
				return scrape.Attr(n.Parent, "class") == "dp-meta-icon-container"
			}
			return false
		}

		//NOTE: Since this is a demo, I assume matchers will always hit a result

		movie := &Movie{}

		titleNode, _ := scrape.Find(root, scrape.ById("aiv-content-title"))
		movie.Title = scrape.Text(titleNode.FirstChild)

		releaseYearNode, _ := scrape.Find(root, scrape.ByClass("release-year"))
		year, _ := strconv.Atoi(scrape.Text(releaseYearNode))
		movie.ReleaseYear = year

		actorsNode, _ := scrape.Find(root, actorsMatcher)
		movie.Actors = strings.Split(scrape.Text(actorsNode), ",")

		posterNode, _ := scrape.Find(root, posterMatcher)
		movie.Poster = scrape.Attr(posterNode, "src")

		movieNodes := scrape.FindAll(root, scrape.ByClass("downloadable_movie"))
		ids := make([]string, len(movieNodes))
		for i, movieNode := range movieNodes {
			ids[i] = scrape.Attr(movieNode, "data-asin")
		}
		movie.SimilarIDs = ids

		c.JSON(http.StatusOK, movie)
	})

	router.Run(":8080")
}

示例#30

0

显示文件

文件： main.go 项目： dsifford/wsu-parking-data

func (s structure) getSpaces() ([]space, error) {

	spaces := []space{
		space{Name: "WSU Permit"},
		space{Name: "Student OneCard"},
		space{Name: "Visitor"},
	}
	re := map[string]*regexp.Regexp{
		"avail":   regexp.MustCompile(`([0-9]+|NONE)`),
		"status":  regexp.MustCompile(`(OPEN|CLOSED|FULL)`),
		"updated": regexp.MustCompile(`(?P<a>^.+: )(?P<b>.+)`),
	}

	// Request
	client := &http.Client{
		Timeout: time.Second * 10,
	}
	req, err := http.NewRequest("GET", "http://m.wayne.edu/parking.php?location="+s.URLCode, nil)
	if err != nil {
		return spaces, errors.New("Request failed")
	}
	req.Header.Set("User-Agent", "Apple-iPhone6C1/")

	// Response
	resp, err := client.Do(req)
	if err != nil {
		return spaces, errors.New("Response failed")
	}
	defer resp.Body.Close()

	body, err := html.Parse(resp.Body)
	if err != nil {
		return spaces, errors.New("Error parsing response body")
	}

	// Parse relevant response data
	dataString, ok := scrape.Find(body, scrape.ByClass("available"))
	if !ok {
		return spaces, errors.New("Error: Line 105 - scrape.Find (available) -- not finding scrape info")
	}
	lastUpdated, ok := scrape.Find(body, scrape.ByClass("last_updated"))
	if !ok {
		return spaces, errors.New("Error: Line 109 - scrape.Find (last_updated) -- not finding scrape info")
	}

	avail := re["avail"].FindAllString(scrape.Text(dataString), -1)
	if len(avail) == 0 {
		avail = []string{"0", "0", "0"}
	}
	status := re["status"].FindAllString(scrape.Text(dataString), -1)
	if len(status) != 3 {
		return spaces, errors.New("Error: Line 118 - FindAllString (status) not returning 3 matches")
	}
	updated := re["updated"].FindStringSubmatch(scrape.Text(lastUpdated))
	if len(updated) == 0 {
		return spaces, errors.New("Error: Line 122 - FindAllStringSubmatch (updated) not finding a match")
	}

	for key := range spaces {
		spaces[key].Available = avail[key]
		spaces[key].Status = status[key]
		spaces[key].Updated = updated[2]
	}

	return spaces, nil

}