Пример #1
0
// parseBroadcastFromHtmlNode fills bc from a parsed broadcast detail page and
// returns it as a single-element slice. It errors if the page contains more
// than one <div class='epg-content-right'/>.
func (bc *broadcast) parseBroadcastFromHtmlNode(root *html.Node) (ret []*r.Broadcast, err error) {
	{
		// Author: taken from <meta name="Author" content="..."/>, if present.
		meta, _ := scrape.Find(root, func(n *html.Node) bool {
			return atom.Meta == n.DataAtom && "Author" == scrape.Attr(n, "name")
		})
		if nil != meta {
			content := scrape.Attr(meta, "content")
			bc.Author = &content
		}
	}
	for idx, epg := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Div == n.DataAtom && "epg-content-right" == scrape.Attr(n, "class")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <div class='epg-content-right'/>")
			return
		}
		{
			// TitleEpisode: the text node inside an <h3> that is directly
			// followed by a <br/>. Guard Parent/NextSibling against nil — the
			// original dereferenced them unconditionally and panicked on text
			// nodes that are last children.
			txt, _ := scrape.Find(epg, func(n *html.Node) bool {
				return html.TextNode == n.Type &&
					nil != n.Parent && atom.H3 == n.Parent.DataAtom &&
					nil != n.NextSibling && atom.Br == n.NextSibling.DataAtom
			})
			if nil != txt {
				t := strings.TrimSpace(r.NormaliseWhiteSpace(txt.Data))
				bc.TitleEpisode = &t
				// Remove the title text and its <br/> so they don't leak into
				// the Description extracted below.
				txt.Parent.RemoveChild(txt.NextSibling)
				txt.Parent.RemoveChild(txt)
			}
		}
		{
			// Subject: first <a> inside <div class="sendungsLink">, resolved
			// against the broadcast's source URL.
			a, _ := scrape.Find(epg, func(n *html.Node) bool {
				return nil != n.Parent && atom.Div == n.Parent.DataAtom &&
					"sendungsLink" == scrape.Attr(n.Parent, "class") &&
					atom.A == n.DataAtom
			})
			if nil != a {
				u, _ := url.Parse(scrape.Attr(a, "href"))
				bc.Subject = bc.Source.ResolveReference(u)
			}
		}
		// purge some cruft before extracting the description text
		for _, nn := range scrape.FindAll(epg, func(n *html.Node) bool {
			clz := scrape.Attr(n, "class")
			return atom.H2 == n.DataAtom ||
				"mod modSharing" == clz ||
				"modGalery" == clz ||
				"sendungsLink" == clz ||
				"tabs-container" == clz
		}) {
			nn.Parent.RemoveChild(nn)
		}
		{
			// Description: flattened text of the remaining direct children.
			description := r.TextWithBrFromNodeSet(scrape.FindAll(epg, func(n *html.Node) bool { return epg == n.Parent }))
			bc.Description = &description
		}
	}
	bc_ := r.Broadcast(*bc)
	ret = append(ret, &bc_)
	return
}
Пример #2
0
// TextWithBrFromNodeSet flattens a set of HTML nodes to plain text, turning
// <br>/<tr> into single line breaks and <p>/<div> into blank-line paragraph
// separators, then collapsing runs of whitespace.
//
// NOTE(review): this mutates the input nodes (marker text nodes are inserted
// after the matched elements) — callers must not expect the tree untouched.
func TextWithBrFromNodeSet(nodes []*html.Node) string {
	parts := make([]string, len(nodes))
	for i, node := range nodes {
		// Insert a single line-feed marker after each <br> and <tr>.
		for _, tag := range []atom.Atom{atom.Br, atom.Tr} {
			for _, n := range scrape.FindAll(node, func(n *html.Node) bool { return tag == n.DataAtom }) {
				lfn := html.Node{Type: html.TextNode, Data: lineFeedMarker}
				n.Parent.InsertBefore(&lfn, n.NextSibling)
			}
		}
		// Insert a double marker (paragraph break) after each <p> and <div>.
		for _, tag := range []atom.Atom{atom.P, atom.Div} {
			for _, n := range scrape.FindAll(node, func(n *html.Node) bool { return tag == n.DataAtom }) {
				lfn := html.Node{Type: html.TextNode, Data: lineFeedMarker + lineFeedMarker}
				n.Parent.InsertBefore(&lfn, n.NextSibling)
			}
		}
		// Concatenate all text nodes, including the markers just inserted.
		tmp := []string{}
		for _, n := range scrape.FindAll(node, func(n *html.Node) bool { return html.TextNode == n.Type }) {
			tmp = append(tmp, n.Data)
		}
		parts[i] = strings.Join(tmp, "")
	}
	// Join the per-node parts as separate paragraphs, then normalise and
	// replace markers with real linefeeds.
	ret := strings.Join(parts, lineFeedMarker+lineFeedMarker)
	ret = NormaliseWhiteSpace(ret)
	ret = strings.Replace(ret, lineFeedMarker, "\n", -1)
	re := regexp.MustCompile("[ ]*(\\s)[ ]*") // collapse whitespace, keep \n
	ret = re.ReplaceAllString(ret, "$1")      // collapse whitespace (not the \n\n however)
	{
		re := regexp.MustCompile("\\s*\\n\\s*\\n\\s*") // collapse linefeeds
		ret = re.ReplaceAllString(ret, "\n\n")
	}
	return strings.TrimSpace(ret)
}
Пример #3
0
// parseBroadcastURLsNode extracts broadcast URLs (start time + link + title)
// from a day schedule page. Broadcasts starting before closeDownHour are
// attributed to the following calendar day (the station's broadcast day ends
// at 05:00).
//
// The original panicked on malformed input; since the signature already
// declares an error return, failures are now reported through it.
func (day *timeURL) parseBroadcastURLsNode(root *html.Node) (ret []*broadcastURL, err error) {
	const closeDownHour int = 5
	for _, h4 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H4 == n.DataAtom }) {
		year, month, day_, err := timeForH4(scrape.Text(h4), &day.Time)
		if nil != err {
			return nil, err
		}
		for _, a := range scrape.FindAll(h4.Parent, func(n *html.Node) bool { return atom.A == n.DataAtom && atom.Dt == n.Parent.DataAtom }) {
			// Expected link text: "HH:MM Title".
			m := hourMinuteTitleRegExp.FindStringSubmatch(scrape.Text(a))
			if nil == m {
				return nil, errors.New("Couldn't parse <a>")
			}
			ur, _ := url.Parse(scrape.Attr(a, "href"))
			hour := r.MustParseInt(m[1])
			// Early-morning shows belong to the next calendar day.
			dayOffset := 0
			if hour < closeDownHour {
				dayOffset = 1
			}
			bcu := broadcastURL(r.BroadcastURL{
				TimeURL: r.TimeURL{
					Time:    time.Date(year, month, day_+dayOffset, hour, r.MustParseInt(m[2]), 0, 0, localLoc),
					Source:  *day.Source.ResolveReference(ur),
					Station: day.Station,
				},
				Title: strings.TrimSpace(m[3]),
			})
			ret = append(ret, &bcu)
		}
	}
	return
}
Пример #4
0
// parseBroadcastsFromNode parses all broadcasts of one day schedule page.
// Each <div class="time"> starts a broadcast; the start time of broadcast
// n+1 becomes DtEnd of broadcast n, and the last broadcast ends at midnight.
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
	nodes := scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom && "time" == scrape.Attr(n, "class") })
	// Append only successfully parsed broadcasts. The original pre-allocated
	// len(nodes) pointers and left nil holes when a time div was malformed,
	// which could nil-panic later when DtEnd was set on a neighbour.
	ret = make([]*r.Broadcast, 0, len(nodes))
	for _, tim := range nodes {
		// prepare response
		bc := r.Broadcast{
			BroadcastURL: r.BroadcastURL{
				TimeURL: r.TimeURL(*day),
			},
		}
		// some defaults
		bc.Language = &lang_de
		bc.Publisher = &publisher
		// set start time; the div text must look like "HH:MM"
		div_t := strings.TrimSpace(scrape.Text(tim))
		if 5 != len(div_t) {
			continue
		}
		hour := r.MustParseInt(div_t[0:2])
		minute := r.MustParseInt(div_t[3:5])
		bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
		// the previous broadcast ends where this one starts
		if len(ret) > 0 {
			ret[len(ret)-1].DtEnd = &bc.Time
		}
		for _, tit := range scrape.FindAll(tim.Parent, func(n *html.Node) bool {
			return atom.A == n.DataAtom && atom.Div == n.Parent.DataAtom && "descr" == scrape.Attr(n.Parent, "class")
		}) {
			// Title
			bc.Title = strings.TrimSpace(scrape.Text(tit))
			href := scrape.Attr(tit, "href")
			if "" != href {
				u, _ := url.Parse(href)
				bc.Subject = day.Source.ResolveReference(u)
			}

			// Description: remaining text of the descr div, minus the title link.
			desc_node := tit.Parent
			desc_node.RemoveChild(tit)
			description := r.TextWithBrFromNodeSet([]*html.Node{desc_node})
			bc.Description = &description
		}
		ret = append(ret, &bc)
	}
	// the last broadcast of the day ends at midnight
	if len(ret) > 0 {
		midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
		ret[len(ret)-1].DtEnd = &midnight
	}
	return
}
Пример #5
0
// Scrape scrapes a site for a keyword and returns one match per element with
// class "description".
func (q *query) Scrape() []*match {

	// Request the URL. The original followed panic(err) with an unreachable
	// log.Fatal call — dead code, removed.
	resp, err := http.Get(q.SiteURL)
	if err != nil {
		panic(err)
	}
	// Close the body so the connection can be reused (the original leaked it).
	defer resp.Body.Close()

	// Parse the contents of the URL
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	// Grab all the posts and build a match per post.
	posts := scrape.FindAll(root, scrape.ByClass("description"))
	matches := make([]*match, len(posts))
	for i, post := range posts {
		matches[i] = &match{
			// NOTE(review): FirstChild.NextSibling assumes a fixed child layout
			// of the description node — confirm against the live markup.
			Title:       scrape.Text(post.FirstChild.NextSibling),
			Description: scrape.Text(post),
			Link:        "http://kijiji.ca" + scrape.Attr(post.FirstChild.NextSibling, "href"),
			Price:       scrape.Text(post.NextSibling.NextSibling),
			Matched:     false,
		}
	}

	return matches
}
Пример #6
0
// main fetches the TorGuard downloads page and prints every table row that
// mentions the Debian 64-bit build.
func main() {
	// request and parse the front page
	resp, err := http.Get("https://torguard.net/downloads.php")
	if err != nil {
		panic(err)
	}
	// Release the connection once parsing is done (the original leaked it).
	defer resp.Body.Close()
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	// match every table row
	matcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Tr
	}
	// grab all rows and print the ones for the Debian x64 package
	articles := scrape.FindAll(root, matcher)
	for _, article := range articles {
		if strings.Contains(scrape.Text(article), "DEBIAN x64Bit") {
			fmt.Printf("%s\n", scrape.Text(article))
		}
	}
}
Пример #7
0
Файл: co.go Проект: gozes/co
// Search walks the paged job-search results for s.Keyword and returns every
// job it can scrape. Pagination stops when a page yields fewer than 50 jobs.
//
// NOTE(review): urlCh, respCh, pager, allJobMatcher, fetchByKeyword,
// checkNextPage, fetchNextPage and getPage are defined elsewhere in the
// package — presumably the 1000 getPage goroutines feed a fetch pipeline via
// urlCh/respCh; confirm they terminate, otherwise they leak.
func Search(s JobSearch) []*Job {
	jobSlice := []*Job{}
	fmt.Println("before loop in search")

	// Spawn a fixed pool of page fetchers (pool size 1000; rationale not
	// visible here — TODO confirm).
	for i := 0; i < 1000; i++ {
		go getPage(urlCh, respCh)
	}

	// Fetch the first page by keyword, then keep fetching "next" pages while
	// checkNextPage reports another one exists.
	for s.root = fetchByKeyword(s.Keyword); checkNextPage(s) == true; s.root = fetchNextPage(s.Keyword) {
		fmt.Println("in loop in search")
		jobs := scrape.FindAll(s.root, allJobMatcher)
		fmt.Println(len(jobs))

		// Convert each matched node into a *Job.
		for i, job := range jobs {
			fmt.Println(i)
			fmt.Println(job)
			j := fillJobStruct(job)
			jobSlice = append(jobSlice, j)
			fmt.Println(pager)
		}
		fmt.Println("befor if")
		// A short page (fewer than 50 jobs) is the last page.
		if len(jobs) < 50 {
			break
		}

	}

	return jobSlice

}
Пример #8
0
Файл: main.go Проект: anykao/p
// TorrentList fetches url and parses every row of the #searchResult table
// into a Torrent. On network or parse failure an empty slice and the error
// are returned.
func TorrentList(url string) ([]Torrent, error) {
	// request and parse the front page
	resp, err := http.Get(url)
	if err != nil {
		return make([]Torrent, 0), err
	}
	// Close on every path — the original returned before Close when
	// html.Parse failed, leaking the body.
	defer resp.Body.Close()
	root, err := html.Parse(resp.Body)
	if err != nil {
		return make([]Torrent, 0), err
	}
	var torrents []Torrent
	if content, ok := scrape.Find(root, scrape.ById("searchResult")); ok {
		// match only direct rows of the result table body
		matcher := func(n *html.Node) bool {
			return n.DataAtom == atom.Tr && n.Parent.DataAtom == atom.Tbody
		}
		for _, tr := range scrape.FindAll(content, matcher) {
			torrents = append(torrents, ParseRecord(tr))
		}
	}
	return torrents, nil
}
Пример #9
0
// indexPage downloads a wiki page and returns a word-frequency index of its
// paragraph text plus the absolute URLs of all /wiki/ links found in the
// body content.
func indexPage(page string) (ind map[string]int, branches []string, err error) {
	resp, err := http.Get(page)
	if err != nil {
		return
	}
	root, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		return
	}

	content, ok := scrape.Find(root, scrape.ById("bodyContent"))
	if !ok {
		return nil, nil, errors.New("no bodyContent element")
	}

	// Concatenate the inner text of every paragraph.
	var pageText string
	for _, paragraph := range scrape.FindAll(content, scrape.ByTag(atom.P)) {
		pageText += elementInnerText(paragraph) + " "
	}

	// Count case-folded word occurrences.
	ind = map[string]int{}
	for _, word := range strings.Fields(strings.ToLower(pageText)) {
		ind[word]++
	}

	// Resolve wiki links to absolute URLs.
	links := findWikiLinks(content)
	branches = make([]string, 0, len(links))
	for _, link := range links {
		branches = append(branches, "https://en.wikipedia.org"+link)
	}
	return
}
Пример #10
0
// main fetches the Hacker News front page and prints every story link
// (anchors whose grandparent row carries class "athing").
func main() {
	// request and parse the front page
	resp, err := http.Get("https://news.ycombinator.com/")
	if err != nil {
		panic(err)
	}
	// Release the connection when done (the original never closed the body).
	defer resp.Body.Close()
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	// define a matcher
	matcher := func(n *html.Node) bool {
		// must check for nil values
		if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
			return scrape.Attr(n.Parent.Parent, "class") == "athing"
		}
		return false
	}
	// grab all articles and print them
	articles := scrape.FindAll(root, matcher)
	for i, article := range articles {
		fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
	}
}
Пример #11
0
// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
func (s *Session) Auth(serviceURL, email, password string) error {
	resp, err := s.Get(serviceURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	parsed, err := html.ParseFragment(resp.Body, nil)
	if err != nil {
		return err
	}
	if len(parsed) == 0 {
		// The original returned a nil err here, silently reporting success
		// on an empty parse result.
		return errors.New("failed to parse login page")
	}
	root := parsed[0]
	form, ok := scrape.Find(root, scrape.ById("gaia_loginform"))
	if !ok {
		return errors.New("failed to process login page")
	}
	// Carry over every existing form field, then overwrite the credentials.
	submission := url.Values{}
	for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) {
		submission.Add(getAttribute(input, "name"), getAttribute(input, "value"))
	}
	submission["Email"] = []string{email}
	submission["Passwd"] = []string{password}

	// Post back to wherever the login page redirected us.
	postResp, err := s.PostForm(resp.Request.URL.String(), submission)
	if err != nil {
		return err
	}
	postResp.Body.Close()

	// A successful login redirects (method becomes GET); remaining on POST
	// means the credentials were rejected.
	if postResp.Request.Method == "POST" {
		return errors.New("login incorrect")
	}

	return nil
}
Пример #12
0
// parseGenericLoginForm takes a login page and parses the first form it
// finds, treating it as the login form. It returns the form's absolute
// action URL, the detected username/password field names, and all other
// input fields with their default values.
func parseGenericLoginForm(res *http.Response) (result *loginFormInfo, err error) {
	roots, err := html.ParseFragment(res.Body, nil)
	if err != nil {
		return
	}
	if len(roots) != 1 {
		return nil, errors.New("wrong number of root elements")
	}
	pageRoot := roots[0]

	var info loginFormInfo

	formNode, found := scrape.Find(pageRoot, scrape.ByTag(atom.Form))
	if !found {
		return nil, errors.New("no form element found")
	}

	// Resolve the form action: empty means "post back to this page";
	// otherwise complete any missing host/scheme/path pieces from the
	// request URL.
	action := getNodeAttribute(formNode, "action")
	if action == "" {
		info.action = res.Request.URL.String()
	} else {
		target, parseErr := url.Parse(action)
		if parseErr != nil {
			return nil, parseErr
		}
		if target.Host == "" {
			target.Host = res.Request.URL.Host
		}
		if target.Scheme == "" {
			target.Scheme = res.Request.URL.Scheme
		}
		if !path.IsAbs(target.Path) {
			target.Path = path.Join(res.Request.URL.Path, target.Path)
		}
		info.action = target.String()
	}

	// Classify each input: text -> username, password -> password,
	// everything else is carried along verbatim.
	info.otherFields = url.Values{}
	for _, field := range scrape.FindAll(pageRoot, scrape.ByTag(atom.Input)) {
		name := getNodeAttribute(field, "name")
		switch getNodeAttribute(field, "type") {
		case "text":
			info.usernameField = name
		case "password":
			info.passwordField = name
		default:
			info.otherFields.Add(name, getNodeAttribute(field, "value"))
		}
	}

	if info.usernameField == "" {
		return nil, errors.New("no username field found")
	}
	if info.passwordField == "" {
		return nil, errors.New("no password field found")
	}

	return &info, nil
}
Пример #13
0
// parseSchedule parses the courses from the schedule list view page.
//
// Each PSGROUPBOXWBO box holds one course: a PAGROUPDIVIDER title plus two
// PSLEVEL3GRIDNBO tables (course info, then components).
func parseSchedule(rootNode *html.Node) ([]Course, error) {
	courseTables := scrape.FindAll(rootNode, scrape.ByClass("PSGROUPBOXWBO"))
	result := make([]Course, 0, len(courseTables))
	for _, classTable := range courseTables {
		// (Removed a leftover println debug statement here.)
		titleElement, ok := scrape.Find(classTable, scrape.ByClass("PAGROUPDIVIDER"))
		if !ok {
			// This will occur at least once, since the filter options are a PSGROUPBOXWBO.
			continue
		}

		infoTables := scrape.FindAll(classTable, scrape.ByClass("PSLEVEL3GRIDNBO"))
		if len(infoTables) != 2 {
			return nil, errors.New("expected exactly 2 info tables but found " +
				strconv.Itoa(len(infoTables)))
		}

		courseInfoTable := infoTables[0]
		course, err := parseCourseInfoTable(courseInfoTable)
		if err != nil {
			return nil, err
		}

		// NOTE: there isn't really a standard way to parse the department/number.
		course.Name = nodeInnerText(titleElement)

		componentsInfoTable := infoTables[1]
		componentMaps, err := tableEntriesAsMaps(componentsInfoTable)
		if err != nil {
			return nil, err
		}
		course.Components = make([]Component, len(componentMaps))
		for i, componentMap := range componentMaps {
			course.Components[i], err = parseComponentInfoMap(componentMap)
			if err != nil {
				return nil, err
			}
		}

		result = append(result, course)
	}
	return result, nil
}
Пример #14
0
// getLink returns the URL embedded in the onclick handler of the first
// element with class "downloadbtn" (expected shape: "window.location='URL'").
// It returns "" when no button or no URL is found.
func getLink(r *html.Node) (s string) {
	for _, button := range scrape.FindAll(r, scrape.ByClass("downloadbtn")) {
		// The original indexed Split(...)[1] unconditionally, panicking when
		// the onclick attribute contained no "=". It also shadowed the named
		// return with an inner s.
		parts := strings.Split(scrape.Attr(button, "onclick"), "=")
		if len(parts) < 2 {
			continue
		}
		return strings.Trim(parts[1], "'")
	}
	return
}
Пример #15
0
// parseHistoryItems extracts one YoutubeVideoInfo per element with class
// "yt-lockup-video" under rootNode.
func parseHistoryItems(rootNode *html.Node) []*YoutubeVideoInfo {
	elements := scrape.FindAll(rootNode, scrape.ByClass("yt-lockup-video"))
	infos := make([]*YoutubeVideoInfo, len(elements))
	for i, el := range elements {
		infos[i] = parseVideoInfo(el)
	}
	return infos
}
Пример #16
0
// Get Time, Source and Image from json html snippet
func (item *calendarItem) parseBroadcastSeedNode(root *html.Node) (bc *broadcastURL, err error) {
	bc = &broadcastURL{}
	bc.Station = *item.Station
	bc.Time = time.Time(item.DateTime)
	// Source: resolve each programme-detail link; the last match wins.
	isBroadcastLink := func(n *html.Node) bool {
		if atom.A != n.DataAtom {
			return false
		}
		href := scrape.Attr(n, "href")
		return strings.HasPrefix(href, "/programm/radio/ausstrahlung-") && strings.HasSuffix(href, ".html")
	}
	for _, anchor := range scrape.FindAll(root, isBroadcastLink) {
		ru, _ := url.Parse(scrape.Attr(anchor, "href"))
		bc.Source = *item.Station.ProgramURL.ResolveReference(ru)
	}
	// Image: resolve each <img> src; the last one wins.
	for _, img := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
		ru, _ := url.Parse(scrape.Attr(img, "src"))
		bc.Image = item.Station.ProgramURL.ResolveReference(ru)
	}
	return
}
Пример #17
0
// Get returns the set of arrival times for the buses of the given stop.
// Callers must check the returned error.
func Get(parada int) (TiemposParada, error) {
	resp, err := http.Get("http://www.auvasa.es/paradamb.asp?codigo=" +
		strconv.Itoa(parada))
	if err != nil {
		return TiemposParada{}, errors.New("Error al conectar con el servidor de AUVASA.")
	}
	// Close the body on every path (the original never closed it).
	defer resp.Body.Close()

	// The site serves Windows-1252; transcode to UTF-8 before parsing.
	rInUTF8 := transform.NewReader(resp.Body, charmap.Windows1252.NewDecoder())
	root, err := html.Parse(rInUTF8)
	if err != nil {
		return TiemposParada{}, errors.New("Error en la respuesta de AUVASA.")
	}

	// The second <h1> carries the stop name; fewer headers means a bad stop id.
	headers := scrape.FindAll(root, scrape.ByTag(atom.H1))
	if len(headers) < 2 {
		return TiemposParada{}, errors.New("La parada indicada parece errónea.")
	}

	// One "style36" row per upcoming bus; its "style38" cells hold
	// line / (unused) / destination / minutes.
	lineasTiempos := scrape.FindAll(root, scrape.ByClass("style36"))
	resultados := make([]ProximoBus, 0, len(lineasTiempos))
	for _, item := range lineasTiempos {
		valores := scrape.FindAll(item, scrape.ByClass("style38"))
		// Guard against malformed rows — the original indexed valores[3]
		// unconditionally and panicked on short rows.
		if len(valores) < 4 {
			continue
		}
		resultados = append(resultados, ProximoBus{
			Linea:   scrape.Text(valores[0]),
			Destino: scrape.Text(valores[2]),
			Minutos: scrape.Text(valores[3]),
		})
	}

	if len(resultados) == 0 {
		return TiemposParada{}, errors.New("No hay tiempos para la parada especificada. Puede que sea errónea o que ya no haya buses.")
	}

	return TiemposParada{
		Nombre:  scrape.Text(headers[1]),
		Tiempos: resultados,
		Momento: time.Now(),
		Codigo:  parada,
	}, nil

}
Пример #18
0
// TestTextWithBrFromNodeSet_001 checks the <br>/<div> flattening against a
// fixture file.
func TestTextWithBrFromNodeSet_001(t *testing.T) {
	f, err := os.Open("testdata/TextWithBrFromNodeSet_001.html")
	assert.NotNil(t, f, "ouch")
	assert.Nil(t, err, "ouch")
	root, err := html.Parse(f)
	assert.NotNil(t, root, "ouch")

	divs := scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom })
	assert.Equal(t, "foo\n\nbar\nfoo\n\nbar", TextWithBrFromNodeSet(divs), "ouch")
}
Пример #19
0
// extractEventDetails collects the six event attribute nodes (name,
// description, date, time, location, contact) from the page, returning them
// in that fixed order, or nil if any attribute is missing.
func extractEventDetails(root *html.Node) []*html.Node {
	eventNames := scrape.FindAll(root, eventNameMatcher)
	eventDescriptions := scrape.FindAll(root, eventDescriptionMatcher)
	eventDates := scrape.FindAll(root, eventDateMatcher)
	eventTimes := scrape.FindAll(root, eventTimeMatcher)
	eventLocations := scrape.FindAll(root, eventLocationMatcher)
	eventContacts := scrape.FindAll(root, eventContactPersonMatcher)

	// Return nil if any attribute is absent. The original only treated
	// names/dates/contacts as mandatory, then indexed [0] of ALL six slices
	// below — an empty description/time/location slice panicked.
	if len(eventNames) == 0 ||
		len(eventDescriptions) == 0 ||
		len(eventDates) == 0 ||
		len(eventTimes) == 0 ||
		len(eventLocations) == 0 ||
		len(eventContacts) == 0 {
		return nil
	}

	ensureAtMostOneElement(eventNames, eventNameToMatch)
	ensureAtMostOneElement(eventDescriptions, eventDescriptionToMatch)
	ensureAtMostOneElement(eventDates, eventDateToMatch)
	ensureAtMostOneElement(eventTimes, eventTimeToMatch)
	ensureAtMostOneElement(eventLocations, eventLocationToMatch)
	ensureAtMostOneElement(eventContacts, eventContactPersonToMatch)

	return []*html.Node{
		eventNames[0],
		eventDescriptions[0],
		eventDates[0],
		eventTimes[0],
		eventLocations[0],
		eventContacts[0],
	}
}
Пример #20
0
// NewListing fetches a Craigslist listing (via a proxy host) and extracts its
// title, integer price, and first image URL.
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
	client := urlfetch.Client(ctx)
	resp, err := client.Get("http://167.88.16.61:2138/" + url)
	if err != nil {
		// Return immediately: the original logged the error twice and then
		// dereferenced the nil response for resp.Status.
		ctx.Errorf("%s", err)
		return nil, errors.New("Get listing failed")
	}
	defer resp.Body.Close()
	ctx.Debugf("Craigslist request came back with status: %s", resp.Status)
	root, err := html.Parse(resp.Body)
	if err != nil {
		ctx.Errorf("%s", "Parsing Error")
		return nil, errors.New("Parse body failed")
	}

	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		ctx.Errorf("%s", "Error getting title")
		return nil, errors.New("Get title failed")
	}
	price, ok := scrape.Find(root, scrape.ByClass("price"))
	if !ok {
		ctx.Errorf("%s", "Error getting price")
		return nil, errors.New("Get price failed")
	}
	// Price text starts with a currency symbol; strip it before parsing.
	intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
	if err != nil {
		ctx.Errorf("Error casting price: %s", scrape.Text(price))
		return nil, err
	}
	// Use the image titled "image 1" as the listing thumbnail, if present.
	images := scrape.FindAll(root, scrape.ByTag(atom.Img))
	imageUrl := ""
	for _, image := range images {
		if scrape.Attr(image, "title") == "image 1" {
			imageUrl = scrape.Attr(image, "src")
		}
	}

	ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))

	return &Listing{
		Url:      url,
		Title:    scrape.Text(title),
		Price:    intPrice,
		ImageUrl: imageUrl,
	}, nil
}
Пример #21
0
// tableEntriesAsMaps takes a <table> and parses its headers and row entries.
// Often times, an HTML table has one row of headers followed by several rows of data. This method
// uses the headers as map keys. It returns an array of map objects representing the rows of the
// table, with the <th>'s as keys and their corresponding <td>'s as values.
func tableEntriesAsMaps(table *html.Node) ([]map[string]string, error) {
	headings := scrape.FindAll(table, scrape.ByTag(atom.Th))
	cells := scrape.FindAll(table, scrape.ByTag(atom.Td))
	// Guard: the modulo/division below would otherwise divide by zero on a
	// table without <th> elements.
	if len(headings) == 0 {
		return nil, errors.New("table has no headings")
	}
	if len(cells)%len(headings) != 0 {
		return nil, errors.New("number of cells should be divisible by number of headings")
	}

	headingText := make([]string, len(headings))
	for i, heading := range headings {
		headingText[i] = strings.TrimSpace(nodeInnerText(heading))
	}

	// Cells are laid out row-major: row r, column c lives at r*len(headings)+c.
	maps := make([]map[string]string, len(cells)/len(headings))
	for rowIndex := 0; rowIndex < len(maps); rowIndex++ {
		row := map[string]string{}
		maps[rowIndex] = row
		for colIndex := 0; colIndex < len(headings); colIndex++ {
			cellIndex := rowIndex*len(headings) + colIndex
			row[headingText[colIndex]] = strings.TrimSpace(nodeInnerText(cells[cellIndex]))
		}
	}

	return maps, nil
}
Пример #22
0
// parseServerStatus returns a slice of strings containing only server stats.
func parseServerStatus(root *html.Node) []string {
	var apacheStats []string
	// Stats lines are distinguished by a leading digit.
	startsWithDigit := regexp.MustCompile(`^[0-9]`)
	// Walk every table row and keep the ones that look like stats.
	for _, row := range scrape.FindAll(root, scrape.ByTag(atom.Tr)) {
		text := scrape.Text(row)
		if startsWithDigit.MatchString(text) {
			apacheStats = append(apacheStats, text)
		}
	}
	Log(fmt.Sprintf("parseServerStatus apacheStats='%d'", len(apacheStats)), "debug")
	return apacheStats
}
Пример #23
0
Файл: main.go Проект: anykao/p
// ParseRecord converts one result <tr> into a Torrent. A zero Torrent is
// returned (with a diagnostic print) when the row doesn't have the expected
// four-cell shape.
func ParseRecord(n *html.Node) Torrent {
	tds := scrape.FindAll(n, scrape.ByTag(atom.Td))
	if len(tds) != 4 {
		fmt.Println("Error: not expected format")
		return Torrent{}
	}
	// First cell: category, first three characters only. Guard the slice —
	// the original panicked on shorter text.
	catText := scrape.Text(tds[0])
	if len(catText) < 3 {
		fmt.Println("Error: not expected format")
		return Torrent{}
	}
	cat := catText[0:3]
	name, magnet, desc := ParseName(tds[1])
	// Description must match the package regexp to yield uptime/size/uploader.
	// The original indexed a possibly-nil submatch slice.
	matches := re.FindStringSubmatch(desc)
	if len(matches) < 4 {
		fmt.Println("Error: not expected format")
		return Torrent{}
	}
	uptime, size, uploader := matches[1], matches[2], matches[3]
	seed := scrape.Text(tds[2])
	leech := scrape.Text(tds[3])
	return Torrent{cat, name, magnet, size, uptime, uploader, seed, leech}
}
Пример #24
0
// main scrapes the reddit front page (#siteTable), parses each link post
// concurrently, and prints the results in page order.
func main() {

	resp, err := http.Get("https://www.reddit.com")
	if err != nil {
		panic(err)
	}
	// Release the connection once done (the original never closed the body).
	defer resp.Body.Close()
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	// #siteTable holds the list of posts on the front page.
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "id") == "siteTable"
		}
		return false
	}
	table, ok := scrape.Find(root, matcher)
	if !ok {
		panic(ok)
	}
	// Each post is a div with data-type="link".
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "data-type") == "link"
		}
		return false
	}

	articles := scrape.FindAll(table, matcher)
	// One pre-allocated slot per goroutine: each writes only posts[i]. The
	// original appended to a shared slice from many goroutines — a data race
	// that could drop posts.
	posts := make([]Post, len(articles))

	for i := 0; i < len(articles); i++ {
		wg.Add(1)
		go func(i int, n *html.Node) {
			posts[i] = parsepost(n)
			wg.Done()
		}(i, articles[i])
	}

	wg.Wait()

	for i := 0; i < len(posts); i++ {
		printpost(posts[i])
	}

}
Пример #25
0
// findWikiLinks returns the href of every anchor under node that points to a
// /wiki/ page.
func findWikiLinks(node *html.Node) []string {
	anchors := scrape.FindAll(node, scrape.ByTag(atom.A))
	res := make([]string, 0, len(anchors))
	for _, anchor := range anchors {
		// Locate the href attribute, case-insensitively.
		var href string
		for _, attr := range anchor.Attr {
			if strings.ToLower(attr.Key) == "href" {
				href = attr.Val
				break
			}
		}
		if strings.HasPrefix(href, "/wiki/") {
			res = append(res, href)
		}
	}
	return res
}
Пример #26
0
// parseDayURLsNode extracts day-schedule URLs from the calendar table,
// keeping only every third link since each schedule page covers three days.
func (s *station) parseDayURLsNode(root *html.Node) (ret []timeURL, err error) {
	count := 0
	isDayLink := func(n *html.Node) bool {
		return atom.A == n.DataAtom && atom.Td == n.Parent.DataAtom
	}
	for _, anchor := range scrape.FindAll(root, isDayLink) {
		d, e := s.newTimeURL(scrape.Attr(anchor, "href"))
		if e != nil {
			continue
		}
		// use only every 3rd day schedule url because each one contains 3 days
		count++
		if count%3 != 2 {
			continue
		}
		ret = append(ret, timeURL(d))
	}
	return
}
Пример #27
0
// main fetches the TorGuard downloads page and prints the Debian x64 rows
// together with their extracted download links.
func main() {
	// request and parse the front page
	resp, err := http.Get("https://torguard.net/downloads.php")
	if err != nil {
		panic(err)
	}
	// Release the connection when done (the original never closed the body).
	defer resp.Body.Close()
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	rows := scrape.FindAll(root, scrape.ByTag(atom.Tr))
	for _, row := range rows {
		if strings.Contains(scrape.Text(row), "DEBIAN x64") {
			l := getLink(row)
			fmt.Printf("%s \n %s \n", scrape.Text(row), l)
		}
	}
}
Пример #28
0
// scraper reads a saved wiki page from disk and prints rows of the table
// identified by the package-level tableID.
//
// NOTE(review): html.NewTokenizer expects an io.Reader, but root here is a
// *html.Node from html.Parse — this mixes the streaming tokenizer with an
// already-parsed tree and almost certainly does not do what was intended;
// the t.Token()/t.Next() loop below should be reworked to walk the parsed
// tree instead.
func scraper() {
	fd, err := os.Open("/mnt/hgfs/Downloads/wiki.html")
	if err != nil {
		panic(err)
	}
	defer fd.Close()
	root, err := html.Parse(fd)
	if err != nil {
		panic(err)
	}
	// NOTE(review): see function comment — root is not an io.Reader.
	t := html.NewTokenizer(root)

	// matcher := func(n *html.Node) bool {
	// 	if n.DataAtom == atom.Table {
	// 		return true
	// 	}
	// 	return false
	// }

	// rowMatcher := func(n *html.Node) bool {
	// 	if n.DataAtom == atom.Tr {
	// 		return true
	// 	}
	// 	return false
	// }
	// Find the target table(s) by id in the parsed tree.
	tableMatcher := scrape.ById(tableID)
	table := scrape.FindAll(root, tableMatcher)

	// NOTE(review): the tokenizer state is unrelated to the FindAll results,
	// so this condition does not actually select table rows.
	for _, v := range table {
		if t.Token().Data == "tr" {
			fmt.Printf("%s\n", scrape.Text(v))
		} else {
			t.Next()
		}
	}

	// for , v := range table  {
	// 	fmt.Printf("%s\n", scrape.Text(v))
	// }

}
Пример #29
0
// TweetsToUser searches twitter for tweets addressed to the given user and
// scrapes each result into a tweet.Tweet (ID, author, text).
func TweetsToUser(u user.User) []tweet.Tweet {
	reqURL := SearchURL
	_url.SetQueryParams(&reqURL, map[string]string{
		"q": "to:" + u.ScreenName,
		"f": "tweets",
	})

	res, err := http.Get(reqURL.String())
	PanicIf(err)
	// Close the body once parsed (the original leaked the connection).
	defer res.Body.Close()
	root, err := html.Parse(res.Body)
	PanicIf(err)

	// Matchers for a tweet container, its author span, and its text.
	tweetsMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Div && strings.HasPrefix(scrape.Attr(n, "class"), "tweet original-tweet")
	}
	tweetScreenNameMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Span && strings.HasPrefix(scrape.Attr(n, "class"), "username")
	}
	tweetTextMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.P && strings.HasSuffix(scrape.Attr(n, "class"), "tweet-text")
	}

	tweetNodes := scrape.FindAll(root, tweetsMatcher)
	tweets := make([]tweet.Tweet, len(tweetNodes))
	for i, n := range tweetNodes {
		t := tweet.Tweet{
			ID: scrape.Attr(n, "data-user-id"),
		}
		if child, ok := scrape.Find(n, tweetScreenNameMatcher); ok {
			t.Author = *user.NewUser(scrape.Text(child))
		}
		if child, ok := scrape.Find(n, tweetTextMatcher); ok {
			t.Text = scrape.Text(child)
		}
		tweets[i] = t
	}

	return tweets
}
Пример #30
0
// main requests whatsmyuseragent.com with a custom User-Agent header and
// prints every element with class "info".
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "http://whatsmyuseragent.com/", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("User-Agent", ua)
	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}

	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	for _, node := range scrape.FindAll(root, scrape.ByClass("info")) {
		fmt.Printf("%s\n", scrape.Text(node))
	}
}