Beispiel #1
0
// Scrape scrapes a site for a keyword
func (q *query) Scrape() []*match {

	// Request the URL
	resp, err := http.Get(q.SiteURL)
	if err != nil {
		panic(err)
		log.Fatal("Couldn't GET ", q.SiteURL)
	}

	// Parse the contents of the URL
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
		log.Fatal("Unable to parse response")
	}

	// Grab all the posts and print them
	posts := scrape.FindAll(root, scrape.ByClass("description"))
	matches := make([]*match, len(posts))
	for i, post := range posts {
		matches[i] = &match{
			Title:       scrape.Text(post.FirstChild.NextSibling),
			Description: scrape.Text(post),
			Link:        "http://kijiji.ca" + scrape.Attr(post.FirstChild.NextSibling, "href"),
			Price:       scrape.Text(post.NextSibling.NextSibling),
			Matched:     false,
		}
	}

	return matches
}
// parseCurrentSchedule parses the courses from the schedule list view page.
//
// If fetchMoreInfo is true, this will perform a request for each component to find out information
// about it.
func parseSchedule(rootNode *html.Node) ([]Course, error) {
	courseTables := scrape.FindAll(rootNode, scrape.ByClass("PSGROUPBOXWBO"))
	result := make([]Course, 0, len(courseTables))
	for _, classTable := range courseTables {
		println("found course")

		titleElement, ok := scrape.Find(classTable, scrape.ByClass("PAGROUPDIVIDER"))
		if !ok {
			// This will occur at least once, since the filter options are a PSGROUPBOXWBO.
			continue
		}

		infoTables := scrape.FindAll(classTable, scrape.ByClass("PSLEVEL3GRIDNBO"))
		if len(infoTables) != 2 {
			return nil, errors.New("expected exactly 2 info tables but found " +
				strconv.Itoa(len(infoTables)))
		}

		courseInfoTable := infoTables[0]
		course, err := parseCourseInfoTable(courseInfoTable)
		if err != nil {
			return nil, err
		}

		// NOTE: there isn't really a standard way to parse the department/number.
		course.Name = nodeInnerText(titleElement)

		componentsInfoTable := infoTables[1]
		componentMaps, err := tableEntriesAsMaps(componentsInfoTable)
		if err != nil {
			return nil, err
		}
		course.Components = make([]Component, len(componentMaps))
		for i, componentMap := range componentMaps {
			course.Components[i], err = parseComponentInfoMap(componentMap)
			if err != nil {
				return nil, err
			}
		}

		result = append(result, course)
	}
	return result, nil
}
Beispiel #3
0
func getLink(r *html.Node) (s string) {
	buttons := scrape.FindAll(r, scrape.ByClass("downloadbtn"))
	for _, button := range buttons {
		windowLocation := scrape.Attr(button, "onclick")
		link := strings.Split(windowLocation, "=")[1]
		s := strings.Trim(link, "'")
		return s
	}
	return
}
Beispiel #4
0
func parseHistoryItems(rootNode *html.Node) []*YoutubeVideoInfo {
	videoElements := scrape.FindAll(rootNode, scrape.ByClass("yt-lockup-video"))

	res := make([]*YoutubeVideoInfo, len(videoElements))
	for i, element := range videoElements {
		res[i] = parseVideoInfo(element)
	}

	return res
}
Beispiel #5
0
// Get devuelve el conjunto de tiempos de llegada para los buses de la parada
// dada. Hay que comprobar que no se devuelve error.
func Get(parada int) (TiemposParada, error) {
	resp, err := http.Get("http://www.auvasa.es/paradamb.asp?codigo=" +
		strconv.Itoa(parada))
	if err != nil {
		return TiemposParada{}, errors.New("Error al conectar con el servidor de AUVASA.")
	}

	rInUTF8 := transform.NewReader(resp.Body, charmap.Windows1252.NewDecoder())
	root, err := html.Parse(rInUTF8)
	if err != nil {
		return TiemposParada{}, errors.New("Error en la respuesta de AUVASA.")
	}

	headers := scrape.FindAll(root, scrape.ByTag(atom.H1))
	if len(headers) < 2 {
		return TiemposParada{}, errors.New("La parada indicada parece errónea.")
	}

	lineasTiempos := scrape.FindAll(root, scrape.ByClass("style36"))
	resultados := make([]ProximoBus, len(lineasTiempos))
	for i, item := range lineasTiempos {
		valores := scrape.FindAll(item, scrape.ByClass("style38"))
		resultados[i] = ProximoBus{
			Linea:   scrape.Text(valores[0]),
			Destino: scrape.Text(valores[2]),
			Minutos: scrape.Text(valores[3]),
		}
	}

	if len(resultados) == 0 {
		return TiemposParada{}, errors.New("No hay tiempos para la parada especificada. Puede que sea errónea o que ya no haya buses.")
	}

	return TiemposParada{
		Nombre:  scrape.Text(headers[1]),
		Tiempos: resultados,
		Momento: time.Now(),
		Codigo:  parada,
	}, nil

}
Beispiel #6
0
Datei: main.go Projekt: anykao/p
func ParseName(n *html.Node) (string, string, string) {
	matcher := func(n *html.Node) bool {
		// must check for nil values
		if n.DataAtom == atom.A && n.Parent.DataAtom == atom.Td {
			return true
		}
		return false
	}

	var name, magnet, desc string

	if detName, ok := scrape.Find(n, scrape.ByClass("detName")); ok {
		name = scrape.Text(detName)
	}
	if anchor, ok := scrape.Find(n, matcher); ok {
		magnet = scrape.Attr(anchor, "href")
	}
	if detDesc, ok := scrape.Find(n, scrape.ByClass("detDesc")); ok {
		desc = scrape.Text(detDesc)
	}
	return name, magnet, desc
}
Beispiel #7
0
func parseVideoInfo(element *html.Node) *YoutubeVideoInfo {
	var info YoutubeVideoInfo

	info.ID = scrape.Attr(element, "data-context-item-id")

	thumbnailContainer, ok := scrape.Find(element, scrape.ByClass("yt-thumb-simple"))
	if ok {
		thumbnailImage, ok := scrape.Find(thumbnailContainer, scrape.ByTag(atom.Img))
		if ok {
			info.ThumbnailURL, _ = url.Parse(scrape.Attr(thumbnailImage, "src"))
		}
	}

	videoTimeElement, ok := scrape.Find(element, scrape.ByClass("video-time"))
	if ok {
		durationStr := strings.TrimSpace(scrape.Text(videoTimeElement))
		info.Length, _ = parseVideoDuration(durationStr)
	}

	linkFieldClasses := []string{"yt-lockup-title", "yt-lockup-byline"}
	linkFieldPtrs := []*string{&info.Title, &info.Author}
	for i, class := range linkFieldClasses {
		linkContainer, ok := scrape.Find(element, scrape.ByClass(class))
		if ok {
			link, ok := scrape.Find(linkContainer, scrape.ByTag(atom.A))
			if ok {
				*linkFieldPtrs[i] = strings.TrimSpace(scrape.Text(link))
			}
		}
	}

	descBox, ok := scrape.Find(element, scrape.ByClass("yt-lockup-description"))
	if ok {
		info.Description = strings.TrimSpace(scrape.Text(descBox))
	}

	return &info
}
// fetchExtraScheduleInfo gets more information about each component.
//
// The rootNode argument should be the parsed schedule list view.
func fetchExtraScheduleInfo(client *http.Client, courses []Course, rootNode *html.Node) error {
	psForm, ok := scrape.Find(rootNode, scrape.ByClass("PSForm"))
	if !ok {
		return errors.New("could not find PSForm")
	}
	icsid, ok := scrape.Find(psForm, scrape.ById("ICSID"))
	if !ok {
		return errors.New("could not find ICSID")
	}

	formAction := getNodeAttribute(psForm, "action")
	sid := getNodeAttribute(icsid, "value")

	// TODO: figure out if there's a way to make this more robust or to load it lazily.
	sectionIndex := 0
	for courseIndex := range courses {
		course := &courses[courseIndex]
		for componentIndex := range course.Components {
			component := &course.Components[componentIndex]

			postData := generateClassDetailForm(sid, sectionIndex)
			res, reqErr := client.PostForm(formAction, postData)
			if res != nil {
				defer res.Body.Close()
			}
			if reqErr != nil {
				return reqErr
			}

			courseOpen, parseErr := parseExtraComponentInfo(res.Body, component)
			if parseErr != nil {
				return parseErr
			}
			course.Open = &courseOpen

			postData = generateClassDetailBackForm(sid, sectionIndex)
			res, reqErr = client.PostForm(formAction, postData)
			if res != nil {
				defer res.Body.Close()
			}
			if reqErr != nil {
				return reqErr
			}

			sectionIndex++
		}
	}

	return nil
}
Beispiel #9
0
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
	client := urlfetch.Client(ctx)
	resp, err := client.Get("http://167.88.16.61:2138/" + url)
	if err != nil {
		ctx.Errorf("%s", err)
	}
	ctx.Debugf("Craigslist request came back with status: %s", resp.Status)
	if err != nil {
		ctx.Errorf("%s", err)
		return nil, errors.New("Get listing failed")
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		ctx.Errorf("%s", "Parsing Error")
		return nil, errors.New("Parse body failed")
	}

	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		ctx.Errorf("%s", "Error getting title")
		return nil, errors.New("Get title failed")
	}
	price, ok := scrape.Find(root, scrape.ByClass("price"))
	if !ok {
		ctx.Errorf("%s", "Error getting price")
		return nil, errors.New("Get price failed")
	}
	intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
	if err != nil {
		ctx.Errorf("Error casting price: %s", scrape.Text(price))
		return nil, err
	}
	images := scrape.FindAll(root, scrape.ByTag(atom.Img))
	imageUrl := ""
	for _, image := range images {
		if scrape.Attr(image, "title") == "image 1" {
			imageUrl = scrape.Attr(image, "src")
		}
	}

	ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))

	return &Listing{
		Url:      url,
		Title:    scrape.Text(title),
		Price:    intPrice,
		ImageUrl: imageUrl,
	}, nil
}
Beispiel #10
0
// History asynchronously fetches the user's
// video viewing history.
// You may provide a cancel channel which you
// can close to cancel the fetch mid-way.
func (y *Youtube) History(cancel <-chan struct{}) (<-chan *YoutubeVideoInfo, <-chan error) {
	videoChan := make(chan *YoutubeVideoInfo)
	errChan := make(chan error, 1)

	go func() {
		defer close(videoChan)
		defer close(errChan)

		historyReq, _ := http.NewRequest("GET", "https://www.youtube.com/feed/history", nil)
		historyReq.Header.Set("User-Agent", spoofedUserAgent)
		resp, err := y.s.Do(historyReq)
		rootNode, err := html.Parse(resp.Body)
		resp.Body.Close()
		if err != nil {
			errChan <- err
			return
		}

		loadMoreHTML := rootNode
		contentHTML := rootNode
		for {
			items := parseHistoryItems(contentHTML)
			for _, item := range items {
				select {
				case videoChan <- item:
				case <-cancel:
					return
				}
			}

			if loadMoreHTML == nil {
				break
			}

			loadButton, ok := scrape.Find(loadMoreHTML, scrape.ByClass("yt-uix-load-more"))
			if ok {
				morePath := scrape.Attr(loadButton, "data-uix-load-more-href")
				loadMoreHTML, contentHTML, err = y.fetchMoreHistory(morePath)
				if err != nil {
					errChan <- err
					return
				}
			}
		}
	}()

	return videoChan, errChan
}
func parsepost(n *html.Node) Post {
	post := Post{}

	// get the title. uses a scrape inbuilt matcher
	title_scrape, _ := scrape.Find(n, scrape.ByClass("title"))
	title := scrape.Text(title_scrape.FirstChild)

	// get the subreddit. This requires a custom matcher.
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.A && n.Parent != nil {
			return scrape.Attr(n, "class") == "subreddit hover may-blank"
		}
		return false
	}
	sub, _ := scrape.Find(n, matcher)
	subreddit := scrape.Text(sub)

	// get the url to the comments. requires custom matcher.
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Ul && n.FirstChild != nil {
			return scrape.Attr(n, "class") == "flat-list buttons" && scrape.Attr(n.FirstChild, "class") == "first"
		}
		return false
	}
	ul, _ := scrape.Find(n, matcher)          // ul is a list of two buttons: one that links to a post's comments page, one a "share" function
	li := ul.FirstChild                       // the first list item of ul -- this will always be the comments page link.
	url := scrape.Attr(li.FirstChild, "href") // finally, the url found in the list item.

	// get the author. Uses custom matcher and magic.
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.A && n.Parent.DataAtom == atom.P {
			return strings.Contains(scrape.Attr(n, "href"), "/user/")
		}
		return false
	}
	author_scrape, _ := scrape.Find(n, matcher)
	author := scrape.Text(author_scrape)

	post.title = title
	post.subreddit = subreddit
	post.url = url
	post.author = author

	return post
}
Beispiel #12
0
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "http://whatsmyuseragent.com/", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("User-Agent", ua)
	resp, err := client.Do(req)
	// resp, err := http.Get("http://whatsmyuseragent.com/")
	if err != nil {
		panic(err)
	}

	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	info := scrape.ByClass("info")
	data := scrape.FindAll(root, info)
	for _, v := range data {
		fmt.Printf("%s\n", scrape.Text(v))
	}
}
Beispiel #13
0
func main() {

	router := gin.Default()
	router.GET("/movie/amazon/:amazon_id", func(c *gin.Context) {

		id, valid := validateAndFormatAmazonID(c.Param("amazon_id"))

		if !valid {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": "invalid amazon id",
				"id":    id,
			})
			return
		}

		resp, err := http.Get("http://www.amazon.de/gp/product/" + id)

		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": err,
			})
			return
		}

		//item does not exist in amazon.de
		if resp.StatusCode == http.StatusNotFound {
			c.JSON(http.StatusNotFound, gin.H{
				"error": "product not available",
			})
			return
		}

		root, err := html.Parse(resp.Body)

		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{
				"error": err,
			})
			return
		}

		actorsMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Dd && n.Parent != nil &&
				n.PrevSibling != nil && n.PrevSibling.PrevSibling != nil {
				return scrape.Attr(n.Parent, "class") == "dv-meta-info size-small" &&
					scrape.Text(n.PrevSibling.PrevSibling) == "Darsteller:"
			}
			return false
		}

		posterMatcher := func(n *html.Node) bool {
			if n.DataAtom == atom.Img && n.Parent != nil {
				return scrape.Attr(n.Parent, "class") == "dp-meta-icon-container"
			}
			return false
		}

		//NOTE: Since this is a demo, I assume matchers will always hit a result

		movie := &Movie{}

		titleNode, _ := scrape.Find(root, scrape.ById("aiv-content-title"))
		movie.Title = scrape.Text(titleNode.FirstChild)

		releaseYearNode, _ := scrape.Find(root, scrape.ByClass("release-year"))
		year, _ := strconv.Atoi(scrape.Text(releaseYearNode))
		movie.ReleaseYear = year

		actorsNode, _ := scrape.Find(root, actorsMatcher)
		movie.Actors = strings.Split(scrape.Text(actorsNode), ",")

		posterNode, _ := scrape.Find(root, posterMatcher)
		movie.Poster = scrape.Attr(posterNode, "src")

		movieNodes := scrape.FindAll(root, scrape.ByClass("downloadable_movie"))
		ids := make([]string, len(movieNodes))
		for i, movieNode := range movieNodes {
			ids[i] = scrape.Attr(movieNode, "data-asin")
		}
		movie.SimilarIDs = ids

		c.JSON(http.StatusOK, movie)
	})

	router.Run(":8080")
}
Beispiel #14
0
//LivescoreParser parse livescore
func LivescoreParser(root *html.Node) []Match {
	var matches []Match

	contentElmt, contentOK := scrape.Find(root, scrape.ByClass(classContentTag))
	if contentOK {
		//find all row-gray
		rowGrayMatcher := func(n *html.Node) bool {
			classes := strings.Fields(scrape.Attr(n, "class"))
			for _, c := range classes {
				if c == classRowGray {
					parentClasses := strings.Fields(scrape.Attr(n.Parent, "class"))
					for _, pc := range parentClasses {
						if pc == classContentTag {
							return true
						}
					}
				}
			}
			return false
		}
		rows := scrape.FindAll(contentElmt, rowGrayMatcher)

		matchChann := make(chan Match)
		for _, rowElmt := range rows {
			go func(rowElmt *html.Node) {
				var time string
				var homeTeam string
				var awayTeam string
				var score string

				timeElmt, timeElmtOK := scrape.Find(rowElmt, scrape.ByClass(classMinElmt))
				if timeElmtOK {
					time = scrape.Text(timeElmt)
				}

				scoreElmt, scoreElmtOK := scrape.Find(rowElmt, scrape.ByClass(classScoreLink))
				if scoreElmtOK {
					score = scrape.Text(scoreElmt)
				}

				teamElmts := scrape.FindAll(rowElmt, scrape.ByClass(classPlyElmt))
				for i := 0; i < len(teamElmts); i++ {
					teamElmt := teamElmts[i]
					if i%2 == 0 {
						homeTeam = scrape.Text(teamElmt)
					} else {
						awayTeam = scrape.Text(teamElmt)
					}
				}
				match := Match{
					HomeTeam: homeTeam,
					AwayTeam: awayTeam,
					Score:    score,
					Time:     time,
				}

				matchChann <- match
			}(rowElmt)
		}

		for i := 0; i < len(rows); i++ {
			select {
			case m := <-matchChann:
				matches = append(matches, m)
			}
		}
		close(matchChann)
	}
	return matches
}
Beispiel #15
0
func doScrape(urlString string) AppData {
	fmt.Println(urlString)

	u, err := url.Parse(urlString)
	if err != nil {
		panic(err)
	}

	appData := AppData{}
	appData.PackageName = u.Query().Get("id")

	resp, err := http.Get(urlString)
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	genreMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "genre"
	}
	iconMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "image"
	}
	softwareVersionMatcher := func(n *html.Node) bool {
		return scrape.Attr(n, "itemprop") == "softwareVersion"
	}

	name, ok := scrape.Find(root, scrape.ByClass("id-app-title"))
	if ok {
		appData.Name = scrape.Text(name)
	}
	genre, ok := scrape.Find(root, genreMatcher)
	if ok {
		appData.Categories = append(appData.Categories, scrape.Text(genre))
	}
	icon, ok := scrape.Find(root, iconMatcher)
	if ok {
		iconSrc := scrape.Attr(icon, "src")
		iconUrl, err := url.Parse(iconSrc)
		if err != nil {
			panic(err)
		}
		if iconUrl.Scheme == "" {
			iconSrc = "https:" + iconSrc
		}

		resp, err = http.Get(iconSrc)
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()

		outputFile, err := os.Create("output/" + appData.PackageName + ".png")
		if err != nil {
			panic(err)
		}
		defer outputFile.Close()

		_, err = io.Copy(outputFile, resp.Body)
		if err != nil {
			panic(err)
		}
	}
	version, ok := scrape.Find(root, softwareVersionMatcher)
	if ok {
		appData.Version = strings.TrimSpace(scrape.Text(version))
	}

	return appData
}
Beispiel #16
0
func (s structure) getSpaces() ([]space, error) {

	spaces := []space{
		space{Name: "WSU Permit"},
		space{Name: "Student OneCard"},
		space{Name: "Visitor"},
	}
	re := map[string]*regexp.Regexp{
		"avail":   regexp.MustCompile(`([0-9]+|NONE)`),
		"status":  regexp.MustCompile(`(OPEN|CLOSED|FULL)`),
		"updated": regexp.MustCompile(`(?P<a>^.+: )(?P<b>.+)`),
	}

	// Request
	client := &http.Client{
		Timeout: time.Second * 10,
	}
	req, err := http.NewRequest("GET", "http://m.wayne.edu/parking.php?location="+s.URLCode, nil)
	if err != nil {
		return spaces, errors.New("Request failed")
	}
	req.Header.Set("User-Agent", "Apple-iPhone6C1/")

	// Response
	resp, err := client.Do(req)
	if err != nil {
		return spaces, errors.New("Response failed")
	}
	defer resp.Body.Close()

	body, err := html.Parse(resp.Body)
	if err != nil {
		return spaces, errors.New("Error parsing response body")
	}

	// Parse relevant response data
	dataString, ok := scrape.Find(body, scrape.ByClass("available"))
	if !ok {
		return spaces, errors.New("Error: Line 105 - scrape.Find (available) -- not finding scrape info")
	}
	lastUpdated, ok := scrape.Find(body, scrape.ByClass("last_updated"))
	if !ok {
		return spaces, errors.New("Error: Line 109 - scrape.Find (last_updated) -- not finding scrape info")
	}

	avail := re["avail"].FindAllString(scrape.Text(dataString), -1)
	if len(avail) == 0 {
		avail = []string{"0", "0", "0"}
	}
	status := re["status"].FindAllString(scrape.Text(dataString), -1)
	if len(status) != 3 {
		return spaces, errors.New("Error: Line 118 - FindAllString (status) not returning 3 matches")
	}
	updated := re["updated"].FindStringSubmatch(scrape.Text(lastUpdated))
	if len(updated) == 0 {
		return spaces, errors.New("Error: Line 122 - FindAllStringSubmatch (updated) not finding a match")
	}

	for key := range spaces {
		spaces[key].Available = avail[key]
		spaces[key].Status = status[key]
		spaces[key].Updated = updated[2]
	}

	return spaces, nil

}