func (bc *broadcast) parseBroadcastFromHtmlNode(root *html.Node) (ret []*r.Broadcast, err error) {
	{
		// Author
		meta, _ := scrape.Find(root, func(n *html.Node) bool {
			return atom.Meta == n.DataAtom && "Author" == scrape.Attr(n, "name")
		})
		if nil != meta {
			content := scrape.Attr(meta, "content")
			bc.Author = &content
		}
	}
	for idx, epg := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Div == n.DataAtom && "epg-content-right" == scrape.Attr(n, "class")
	}) {
		if idx != 0 {
			err = errors.New("There was more than 1 <div class='epg-content-right'/>")
			return
		}
		{
			// TitleEpisode
			txt, _ := scrape.Find(epg, func(n *html.Node) bool {
				return html.TextNode == n.Type && atom.H3 == n.Parent.DataAtom && atom.Br == n.NextSibling.DataAtom
			})
			if nil != txt {
				t := strings.TrimSpace(r.NormaliseWhiteSpace(txt.Data))
				bc.TitleEpisode = &t
				txt.Parent.RemoveChild(txt.NextSibling)
				txt.Parent.RemoveChild(txt)
			}
		}
		{
			// Subject
			a, _ := scrape.Find(epg, func(n *html.Node) bool {
				return atom.Div == n.Parent.DataAtom && "sendungsLink" == scrape.Attr(n.Parent, "class") && atom.A == n.DataAtom
			})
			if nil != a {
				u, _ := url.Parse(scrape.Attr(a, "href"))
				bc.Subject = bc.Source.ResolveReference(u)
			}
		}
		// purge some cruft
		for _, nn := range scrape.FindAll(epg, func(n *html.Node) bool {
			clz := scrape.Attr(n, "class")
			return atom.H2 == n.DataAtom ||
				"mod modSharing" == clz ||
				"modGalery" == clz ||
				"sendungsLink" == clz ||
				"tabs-container" == clz
		}) {
			nn.Parent.RemoveChild(nn)
		}
		{
			description := r.TextWithBrFromNodeSet(scrape.FindAll(epg, func(n *html.Node) bool { return epg == n.Parent }))
			bc.Description = &description
		}
	}
	bc_ := r.Broadcast(*bc)
	ret = append(ret, &bc_)
	return
}
func TextWithBrFromNodeSet(nodes []*html.Node) string {
	parts := make([]string, len(nodes))
	for i, node := range nodes {
		for _, tag := range []atom.Atom{atom.Br, atom.Tr} {
			for _, n := range scrape.FindAll(node, func(n *html.Node) bool { return tag == n.DataAtom }) {
				lfn := html.Node{Type: html.TextNode, Data: lineFeedMarker}
				n.Parent.InsertBefore(&lfn, n.NextSibling)
			}
		}
		for _, tag := range []atom.Atom{atom.P, atom.Div} {
			for _, n := range scrape.FindAll(node, func(n *html.Node) bool { return tag == n.DataAtom }) {
				lfn := html.Node{Type: html.TextNode, Data: lineFeedMarker + lineFeedMarker}
				n.Parent.InsertBefore(&lfn, n.NextSibling)
			}
		}
		tmp := []string{}
		for _, n := range scrape.FindAll(node, func(n *html.Node) bool { return html.TextNode == n.Type }) {
			tmp = append(tmp, n.Data)
		}
		parts[i] = strings.Join(tmp, "")
	}
	ret := strings.Join(parts, lineFeedMarker+lineFeedMarker)
	ret = NormaliseWhiteSpace(ret)
	ret = strings.Replace(ret, lineFeedMarker, "\n", -1)
	re := regexp.MustCompile("[ ]*(\\s)[ ]*") // collapse whitespace, keep \n
	ret = re.ReplaceAllString(ret, "$1")      // collapse whitespace (not the \n\n however)
	{
		re := regexp.MustCompile("\\s*\\n\\s*\\n\\s*") // collapse linefeeds
		ret = re.ReplaceAllString(ret, "\n\n")
	}
	return strings.TrimSpace(ret)
}
func (day *timeURL) parseBroadcastURLsNode(root *html.Node) (ret []*broadcastURL, err error) {
	const closeDownHour int = 5
	for _, h4 := range scrape.FindAll(root, func(n *html.Node) bool { return atom.H4 == n.DataAtom }) {
		year, month, day_, err := timeForH4(scrape.Text(h4), &day.Time)
		if nil != err {
			panic(err)
		}
		// fmt.Printf("%d-%d-%d %s\n", year, month, day, err)
		for _, a := range scrape.FindAll(h4.Parent, func(n *html.Node) bool {
			return atom.A == n.DataAtom && atom.Dt == n.Parent.DataAtom
		}) {
			m := hourMinuteTitleRegExp.FindStringSubmatch(scrape.Text(a))
			if nil == m {
				panic(errors.New("Couldn't parse <a>"))
			}
			ur, _ := url.Parse(scrape.Attr(a, "href"))
			hour := r.MustParseInt(m[1])
			dayOffset := 0
			if hour < closeDownHour {
				dayOffset = 1
			}
			// fmt.Printf("%s %s\n", b.r.TimeURL.String(), b.Title)
			bcu := broadcastURL(r.BroadcastURL{
				TimeURL: r.TimeURL{
					Time:    time.Date(year, month, day_+dayOffset, hour, r.MustParseInt(m[2]), 0, 0, localLoc),
					Source:  *day.Source.ResolveReference(ur),
					Station: day.Station,
				},
				Title: strings.TrimSpace(m[3]),
			})
			ret = append(ret, &bcu)
		}
	}
	return
}
func (day *timeURL) parseBroadcastsFromNode(root *html.Node) (ret []*r.Broadcast, err error) {
	nodes := scrape.FindAll(root, func(n *html.Node) bool {
		return atom.Div == n.DataAtom && "time" == scrape.Attr(n, "class")
	})
	ret = make([]*r.Broadcast, len(nodes))
	for index, tim := range nodes {
		// prepare response
		bc := r.Broadcast{
			BroadcastURL: r.BroadcastURL{
				TimeURL: r.TimeURL(*day),
			},
		}
		// some defaults
		bc.Language = &lang_de
		bc.Publisher = &publisher
		// set start time
		{
			div_t := strings.TrimSpace(scrape.Text(tim))
			if 5 != len(div_t) {
				continue
			}
			hour := r.MustParseInt(div_t[0:2])
			minute := r.MustParseInt(div_t[3:5])
			bc.Time = time.Date(day.Year(), day.Month(), day.Day(), hour, minute, 0, 0, day.TimeZone)
			if index > 0 {
				ret[index-1].DtEnd = &bc.Time
			}
		}
		for _, tit := range scrape.FindAll(tim.Parent, func(n *html.Node) bool {
			return atom.A == n.DataAtom && atom.Div == n.Parent.DataAtom && "descr" == scrape.Attr(n.Parent, "class")
		}) {
			// Title
			bc.Title = strings.TrimSpace(scrape.Text(tit))
			href := scrape.Attr(tit, "href")
			if "" != href {
				u, _ := url.Parse(href)
				bc.Subject = day.Source.ResolveReference(u)
			}
			desc_node := tit.Parent
			desc_node.RemoveChild(tit)
			description := r.TextWithBrFromNodeSet([]*html.Node{desc_node})
			bc.Description = &description
			// fmt.Fprintf(os.Stderr, "\n")
		}
		ret[index] = &bc
	}
	// fmt.Fprintf(os.Stderr, "len(ret) = %d '%s'\n", len(ret), day.Source.String())
	if len(nodes) > 0 {
		midnight := time.Date(day.Year(), day.Month(), day.Day(), 24, 0, 0, 0, day.TimeZone)
		ret[len(nodes)-1].DtEnd = &midnight
	}
	return
}
// Scrape scrapes a site for a keyword
func (q *query) Scrape() []*match {
	// Request the URL
	resp, err := http.Get(q.SiteURL)
	if err != nil {
		log.Fatal("Couldn't GET ", q.SiteURL)
	}
	// Parse the contents of the URL
	root, err := html.Parse(resp.Body)
	if err != nil {
		log.Fatal("Unable to parse response")
	}
	// Grab all the posts and collect them
	posts := scrape.FindAll(root, scrape.ByClass("description"))
	matches := make([]*match, len(posts))
	for i, post := range posts {
		matches[i] = &match{
			Title:       scrape.Text(post.FirstChild.NextSibling),
			Description: scrape.Text(post),
			Link:        "http://kijiji.ca" + scrape.Attr(post.FirstChild.NextSibling, "href"),
			Price:       scrape.Text(post.NextSibling.NextSibling),
			Matched:     false,
		}
	}
	return matches
}
func main() { // request and parse the front page resp, err := http.Get("https://torguard.net/downloads.php") if err != nil { panic(err) } root, err := html.Parse(resp.Body) if err != nil { panic(err) } // define a matcher matcher := func(n *html.Node) bool { // must check for nil values // if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil { if n.DataAtom == atom.Tr { return true } return false } // grab all articles and print them articles := scrape.FindAll(root, matcher) for _, article := range articles { if strings.Contains(scrape.Text(article), "DEBIAN x64Bit") { fmt.Printf("%s\n", scrape.Text(article)) } //fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href")) } }
func Search(s JobSearch) []*Job {
	jobSlice := []*Job{}
	fmt.Println("before loop in search")
	for i := 0; i < 1000; i++ {
		go getPage(urlCh, respCh)
	}
	for s.root = fetchByKeyword(s.Keyword); checkNextPage(s); s.root = fetchNextPage(s.Keyword) {
		fmt.Println("in loop in search")
		jobs := scrape.FindAll(s.root, allJobMatcher)
		fmt.Println(len(jobs))
		for i, job := range jobs {
			fmt.Println(i)
			fmt.Println(job)
			j := fillJobStruct(job)
			jobSlice = append(jobSlice, j)
			fmt.Println(pager)
		}
		fmt.Println("before if")
		if len(jobs) < 50 {
			break
		}
	}
	return jobSlice
}
func TorrentList(url string) ([]Torrent, error) {
	// request and parse the front page
	resp, err := http.Get(url)
	if err != nil {
		return make([]Torrent, 0), err
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		return make([]Torrent, 0), err
	}
	var torrents []Torrent
	if content, ok := scrape.Find(root, scrape.ById("searchResult")); ok {
		// define a matcher
		matcher := func(n *html.Node) bool {
			// must check for nil values
			if n.DataAtom == atom.Tr && n.Parent.DataAtom == atom.Tbody {
				return true
			}
			return false
		}
		// grab all rows and parse them into records
		trs := scrape.FindAll(content, matcher)
		for _, tr := range trs {
			torrents = append(torrents, ParseRecord(tr))
		}
	}
	resp.Body.Close()
	return torrents, nil
}
func indexPage(page string) (ind map[string]int, branches []string, err error) {
	resp, err := http.Get(page)
	if err != nil {
		return
	}
	root, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		return
	}
	content, ok := scrape.Find(root, scrape.ById("bodyContent"))
	if !ok {
		return nil, nil, errors.New("no bodyContent element")
	}
	paragraphs := scrape.FindAll(content, scrape.ByTag(atom.P))
	pageText := ""
	for _, p := range paragraphs {
		pageText += elementInnerText(p) + " "
	}
	words := strings.Fields(strings.ToLower(pageText))
	ind = map[string]int{}
	for _, word := range words {
		ind[word] = ind[word] + 1
	}
	links := findWikiLinks(content)
	branches = make([]string, len(links))
	for i, link := range links {
		branches[i] = "https://en.wikipedia.org" + link
	}
	return
}
func main() {
	// request and parse the front page
	resp, err := http.Get("https://news.ycombinator.com/")
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	// define a matcher
	matcher := func(n *html.Node) bool {
		// must check for nil values
		if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
			return scrape.Attr(n.Parent.Parent, "class") == "athing"
		}
		return false
	}
	// grab all articles and print them
	articles := scrape.FindAll(root, matcher)
	for i, article := range articles {
		fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
	}
}
// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
func (s *Session) Auth(serviceURL, email, password string) error {
	resp, err := s.Get(serviceURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	parsed, err := html.ParseFragment(resp.Body, nil)
	if err != nil || len(parsed) == 0 {
		return err
	}
	root := parsed[0]
	form, ok := scrape.Find(root, scrape.ById("gaia_loginform"))
	if !ok {
		return errors.New("failed to process login page")
	}
	submission := url.Values{}
	for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) {
		submission.Add(getAttribute(input, "name"), getAttribute(input, "value"))
	}
	submission["Email"] = []string{email}
	submission["Passwd"] = []string{password}
	postResp, err := s.PostForm(resp.Request.URL.String(), submission)
	if err != nil {
		return err
	}
	postResp.Body.Close()
	if postResp.Request.Method == "POST" {
		return errors.New("login incorrect")
	}
	return nil
}
// parseGenericLoginForm takes a login page and parses the first form it finds, treating it as the
// login form.
func parseGenericLoginForm(res *http.Response) (result *loginFormInfo, err error) {
	parsed, err := html.ParseFragment(res.Body, nil)
	if err != nil {
		return
	} else if len(parsed) != 1 {
		return nil, errors.New("wrong number of root elements")
	}
	root := parsed[0]
	var form loginFormInfo
	htmlForm, ok := scrape.Find(root, scrape.ByTag(atom.Form))
	if !ok {
		return nil, errors.New("no form element found")
	}
	if actionStr := getNodeAttribute(htmlForm, "action"); actionStr == "" {
		form.action = res.Request.URL.String()
	} else {
		actionURL, err := url.Parse(actionStr)
		if err != nil {
			return nil, err
		}
		if actionURL.Host == "" {
			actionURL.Host = res.Request.URL.Host
		}
		if actionURL.Scheme == "" {
			actionURL.Scheme = res.Request.URL.Scheme
		}
		if !path.IsAbs(actionURL.Path) {
			actionURL.Path = path.Join(res.Request.URL.Path, actionURL.Path)
		}
		form.action = actionURL.String()
	}
	inputs := scrape.FindAll(root, scrape.ByTag(atom.Input))
	form.otherFields = url.Values{}
	for _, input := range inputs {
		inputName := getNodeAttribute(input, "name")
		switch getNodeAttribute(input, "type") {
		case "text":
			form.usernameField = inputName
		case "password":
			form.passwordField = inputName
		default:
			form.otherFields.Add(inputName, getNodeAttribute(input, "value"))
		}
	}
	if form.usernameField == "" {
		return nil, errors.New("no username field found")
	} else if form.passwordField == "" {
		return nil, errors.New("no password field found")
	}
	return &form, nil
}
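// The following is a hedged usage sketch, not part of the original source: it shows one way the
// loginFormInfo returned by parseGenericLoginForm above might be submitted. The helper name
// loginWithGenericForm and the use of http.Client.PostForm are illustrative assumptions.
func loginWithGenericForm(client *http.Client, loginURL, username, password string) (*http.Response, error) {
	res, err := client.Get(loginURL)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	form, err := parseGenericLoginForm(res)
	if err != nil {
		return nil, err
	}
	// Start from the hidden/other fields the form already carried, then fill in the credentials.
	values := url.Values{}
	for key, vals := range form.otherFields {
		values[key] = append([]string(nil), vals...)
	}
	values.Set(form.usernameField, username)
	values.Set(form.passwordField, password)
	return client.PostForm(form.action, values)
}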
// parseSchedule parses the courses from the schedule list view page.
func parseSchedule(rootNode *html.Node) ([]Course, error) {
	courseTables := scrape.FindAll(rootNode, scrape.ByClass("PSGROUPBOXWBO"))
	result := make([]Course, 0, len(courseTables))
	for _, classTable := range courseTables {
		println("found course")
		titleElement, ok := scrape.Find(classTable, scrape.ByClass("PAGROUPDIVIDER"))
		if !ok {
			// This will occur at least once, since the filter options are a PSGROUPBOXWBO.
			continue
		}
		infoTables := scrape.FindAll(classTable, scrape.ByClass("PSLEVEL3GRIDNBO"))
		if len(infoTables) != 2 {
			return nil, errors.New("expected exactly 2 info tables but found " + strconv.Itoa(len(infoTables)))
		}
		courseInfoTable := infoTables[0]
		course, err := parseCourseInfoTable(courseInfoTable)
		if err != nil {
			return nil, err
		}
		// NOTE: there isn't really a standard way to parse the department/number.
		course.Name = nodeInnerText(titleElement)
		componentsInfoTable := infoTables[1]
		componentMaps, err := tableEntriesAsMaps(componentsInfoTable)
		if err != nil {
			return nil, err
		}
		course.Components = make([]Component, len(componentMaps))
		for i, componentMap := range componentMaps {
			course.Components[i], err = parseComponentInfoMap(componentMap)
			if err != nil {
				return nil, err
			}
		}
		result = append(result, course)
	}
	return result, nil
}
func getLink(r *html.Node) (s string) {
	buttons := scrape.FindAll(r, scrape.ByClass("downloadbtn"))
	for _, button := range buttons {
		windowLocation := scrape.Attr(button, "onclick")
		link := strings.Split(windowLocation, "=")[1]
		s = strings.Trim(link, "'")
		return s
	}
	return
}
func parseHistoryItems(rootNode *html.Node) []*YoutubeVideoInfo {
	videoElements := scrape.FindAll(rootNode, scrape.ByClass("yt-lockup-video"))
	res := make([]*YoutubeVideoInfo, len(videoElements))
	for i, element := range videoElements {
		res[i] = parseVideoInfo(element)
	}
	return res
}
// Get Time, Source and Image from json html snippet
func (item *calendarItem) parseBroadcastSeedNode(root *html.Node) (bc *broadcastURL, err error) {
	bc = &broadcastURL{}
	bc.Station = *item.Station
	bc.Time = time.Time(item.DateTime)
	for _, a := range scrape.FindAll(root, func(n *html.Node) bool {
		if atom.A != n.DataAtom {
			return false
		}
		href := scrape.Attr(n, "href")
		return strings.HasPrefix(href, "/programm/radio/ausstrahlung-") && strings.HasSuffix(href, ".html")
	}) {
		ru, _ := url.Parse(scrape.Attr(a, "href"))
		bc.Source = *item.Station.ProgramURL.ResolveReference(ru)
	}
	for _, img := range scrape.FindAll(root, func(n *html.Node) bool { return atom.Img == n.DataAtom }) {
		ru, _ := url.Parse(scrape.Attr(img, "src"))
		bc.Image = item.Station.ProgramURL.ResolveReference(ru)
	}
	return
}
// Get returns the set of arrival times for the buses at the given stop (parada).
// Callers must check that no error is returned.
func Get(parada int) (TiemposParada, error) {
	resp, err := http.Get("http://www.auvasa.es/paradamb.asp?codigo=" + strconv.Itoa(parada))
	if err != nil {
		return TiemposParada{}, errors.New("Error al conectar con el servidor de AUVASA.")
	}
	rInUTF8 := transform.NewReader(resp.Body, charmap.Windows1252.NewDecoder())
	root, err := html.Parse(rInUTF8)
	if err != nil {
		return TiemposParada{}, errors.New("Error en la respuesta de AUVASA.")
	}
	headers := scrape.FindAll(root, scrape.ByTag(atom.H1))
	if len(headers) < 2 {
		return TiemposParada{}, errors.New("La parada indicada parece errónea.")
	}
	lineasTiempos := scrape.FindAll(root, scrape.ByClass("style36"))
	resultados := make([]ProximoBus, len(lineasTiempos))
	for i, item := range lineasTiempos {
		valores := scrape.FindAll(item, scrape.ByClass("style38"))
		resultados[i] = ProximoBus{
			Linea:   scrape.Text(valores[0]),
			Destino: scrape.Text(valores[2]),
			Minutos: scrape.Text(valores[3]),
		}
	}
	if len(resultados) == 0 {
		return TiemposParada{}, errors.New("No hay tiempos para la parada especificada. Puede que sea errónea o que ya no haya buses.")
	}
	return TiemposParada{
		Nombre:  scrape.Text(headers[1]),
		Tiempos: resultados,
		Momento: time.Now(),
		Codigo:  parada,
	}, nil
}
func TestTextWithBrFromNodeSet_001(t *testing.T) {
	f, err := os.Open("testdata/TextWithBrFromNodeSet_001.html")
	assert.NotNil(t, f, "ouch")
	assert.Nil(t, err, "ouch")
	root, err := html.Parse(f)
	assert.NotNil(t, root, "ouch")
	nodes := scrape.FindAll(root, func(n *html.Node) bool { return atom.Div == n.DataAtom })
	txt := TextWithBrFromNodeSet(nodes)
	assert.Equal(t, "foo\n\nbar\nfoo\n\nbar", txt, "ouch")
}
func extractEventDetails(root *html.Node) []*html.Node {
	eventNames := scrape.FindAll(root, eventNameMatcher)
	eventDescriptions := scrape.FindAll(root, eventDescriptionMatcher)
	eventDates := scrape.FindAll(root, eventDateMatcher)
	eventTimes := scrape.FindAll(root, eventTimeMatcher)
	eventLocations := scrape.FindAll(root, eventLocationMatcher)
	eventContacts := scrape.FindAll(root, eventContactPersonMatcher)

	// return nil if mandatory attributes are not found
	if len(eventNames) == 0 || len(eventDates) == 0 || len(eventContacts) == 0 {
		return nil
	}

	ensureAtMostOneElement(eventNames, eventNameToMatch)
	ensureAtMostOneElement(eventDescriptions, eventDescriptionToMatch)
	ensureAtMostOneElement(eventDates, eventDateToMatch)
	ensureAtMostOneElement(eventTimes, eventTimeToMatch)
	ensureAtMostOneElement(eventLocations, eventLocationToMatch)
	ensureAtMostOneElement(eventContacts, eventContactPersonToMatch)

	return []*html.Node{
		eventNames[0],
		eventDescriptions[0],
		eventDates[0],
		eventTimes[0],
		eventLocations[0],
		eventContacts[0],
	}
}
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
	client := urlfetch.Client(ctx)
	resp, err := client.Get("http://167.88.16.61:2138/" + url)
	if err != nil {
		ctx.Errorf("%s", err)
		return nil, errors.New("Get listing failed")
	}
	ctx.Debugf("Craigslist request came back with status: %s", resp.Status)
	root, err := html.Parse(resp.Body)
	if err != nil {
		ctx.Errorf("%s", "Parsing Error")
		return nil, errors.New("Parse body failed")
	}
	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		ctx.Errorf("%s", "Error getting title")
		return nil, errors.New("Get title failed")
	}
	price, ok := scrape.Find(root, scrape.ByClass("price"))
	if !ok {
		ctx.Errorf("%s", "Error getting price")
		return nil, errors.New("Get price failed")
	}
	intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
	if err != nil {
		ctx.Errorf("Error casting price: %s", scrape.Text(price))
		return nil, err
	}
	images := scrape.FindAll(root, scrape.ByTag(atom.Img))
	imageUrl := ""
	for _, image := range images {
		if scrape.Attr(image, "title") == "image 1" {
			imageUrl = scrape.Attr(image, "src")
		}
	}
	ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))
	return &Listing{
		Url:      url,
		Title:    scrape.Text(title),
		Price:    intPrice,
		ImageUrl: imageUrl,
	}, nil
}
// tableEntriesAsMaps takes a <table> and parses its headers and row entries.
// Often, an HTML table has one row of headers followed by several rows of data. This method
// uses the headers as map keys. It returns an array of map objects representing the rows of the
// table, with the <th>'s as keys and their corresponding <td>'s as values.
func tableEntriesAsMaps(table *html.Node) ([]map[string]string, error) {
	headings := scrape.FindAll(table, scrape.ByTag(atom.Th))
	cells := scrape.FindAll(table, scrape.ByTag(atom.Td))
	if len(cells)%len(headings) != 0 {
		return nil, errors.New("number of cells should be divisible by number of headings")
	}
	headingText := make([]string, len(headings))
	for i, heading := range headings {
		headingText[i] = strings.TrimSpace(nodeInnerText(heading))
	}
	maps := make([]map[string]string, len(cells)/len(headings))
	for rowIndex := 0; rowIndex < len(maps); rowIndex++ {
		row := map[string]string{}
		maps[rowIndex] = row
		for colIndex := 0; colIndex < len(headings); colIndex++ {
			cellIndex := rowIndex*len(headings) + colIndex
			row[headingText[colIndex]] = strings.TrimSpace(nodeInnerText(cells[cellIndex]))
		}
	}
	return maps, nil
}
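// The following is a hedged usage sketch, not part of the original source: it feeds a tiny
// hand-written table through tableEntriesAsMaps above to show the heading→cell mapping.
// It assumes the nodeInnerText helper referenced above is available in the same package.
func exampleTableEntries() error {
	const page = `<table><tr><th>Name</th><th>Age</th></tr><tr><td>Ada</td><td>36</td></tr></table>`
	root, err := html.Parse(strings.NewReader(page))
	if err != nil {
		return err
	}
	table, ok := scrape.Find(root, scrape.ByTag(atom.Table))
	if !ok {
		return errors.New("no table found")
	}
	rows, err := tableEntriesAsMaps(table)
	if err != nil {
		return err
	}
	for _, row := range rows {
		// prints: Ada 36
		fmt.Println(row["Name"], row["Age"])
	}
	return nil
}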
// parseServerStatus returns a slice of strings containing only server stats.
func parseServerStatus(root *html.Node) []string {
	var apacheStats []string
	// Lines with stats start with a number.
	var validStats = regexp.MustCompile(`^[0-9]`)
	// Grab all the table rows.
	rows := scrape.FindAll(root, scrape.ByTag(atom.Tr))
	// If a row matches, add it to the stats lines.
	for _, row := range rows {
		content := scrape.Text(row)
		if validStats.MatchString(content) {
			apacheStats = append(apacheStats, content)
		}
	}
	Log(fmt.Sprintf("parseServerStatus apacheStats='%d'", len(apacheStats)), "debug")
	return apacheStats
}
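// The following is a hedged usage sketch, not part of the original source: it fetches an HTML
// server-status page and prints the stat lines extracted by parseServerStatus above. The URL is
// a placeholder assumption; point it at whatever mod_status endpoint is actually configured.
func printServerStatus() error {
	resp, err := http.Get("http://localhost/server-status")
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	root, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}
	for _, line := range parseServerStatus(root) {
		fmt.Println(line)
	}
	return nil
}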
func ParseRecord(n *html.Node) Torrent {
	tds := scrape.FindAll(n, scrape.ByTag(atom.Td))
	var size, uptime, uploader string
	if len(tds) == 4 {
		cat := scrape.Text(tds[0])[0:3]
		name, magnet, desc := ParseName(tds[1])
		matches := re.FindStringSubmatch(desc)
		uptime, size, uploader = matches[1], matches[2], matches[3]
		seed := scrape.Text(tds[2])
		leech := scrape.Text(tds[3])
		return Torrent{cat, name, magnet, size, uptime, uploader, seed, leech}
	}
	fmt.Println("Error: unexpected record format")
	return Torrent{}
}
func main() {
	resp, err := http.Get("https://www.reddit.com")
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "id") == "siteTable"
		}
		return false
	}
	table, ok := scrape.Find(root, matcher)
	if !ok {
		panic(ok)
	}
	matcher = func(n *html.Node) bool {
		if n.DataAtom == atom.Div && n.Parent != nil {
			return scrape.Attr(n, "data-type") == "link"
		}
		return false
	}
	articles := scrape.FindAll(table, matcher)
	// parse each post concurrently; each goroutine writes to its own index,
	// which avoids a data race on the shared slice
	posts := make([]Post, len(articles))
	for i := 0; i < len(articles); i++ {
		wg.Add(1)
		go func(i int, n *html.Node) {
			posts[i] = parsepost(n)
			wg.Done()
		}(i, articles[i])
	}
	wg.Wait()
	for i := 0; i < len(posts); i++ {
		printpost(posts[i])
	}
}
func findWikiLinks(node *html.Node) []string {
	links := scrape.FindAll(node, scrape.ByTag(atom.A))
	res := make([]string, 0, len(links))
	for _, link := range links {
		var u string
		for _, attr := range link.Attr {
			if strings.ToLower(attr.Key) == "href" {
				u = attr.Val
				break
			}
		}
		if strings.HasPrefix(u, "/wiki/") {
			res = append(res, u)
		}
	}
	return res
}
func (s *station) parseDayURLsNode(root *html.Node) (ret []timeURL, err error) {
	i := 0
	for _, a := range scrape.FindAll(root, func(n *html.Node) bool {
		return atom.A == n.DataAtom && atom.Td == n.Parent.DataAtom
	}) {
		rel := scrape.Attr(a, "href")
		d, err := s.newTimeURL(rel)
		if nil != err {
			continue
		}
		// use only every 3rd day schedule url because each one contains 3 days
		i += 1
		if 2 != i%3 {
			continue
		}
		// fmt.Printf("ok %s\n", d.String())
		ret = append(ret, timeURL(d))
	}
	return
}
func main() {
	// request and parse the front page
	resp, err := http.Get("https://torguard.net/downloads.php")
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	rows := scrape.FindAll(root, scrape.ByTag(atom.Tr))
	for _, row := range rows {
		if strings.Contains(scrape.Text(row), "DEBIAN x64") {
			l := getLink(row)
			fmt.Printf("%s \n %s \n", scrape.Text(row), l)
		}
	}
}
func scraper() {
	fd, err := os.Open("/mnt/hgfs/Downloads/wiki.html")
	if err != nil {
		panic(err)
	}
	defer fd.Close()
	root, err := html.Parse(fd)
	if err != nil {
		panic(err)
	}
	// find the table by its id, then print the text of each of its rows
	tableMatcher := scrape.ById(tableID)
	rowMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Tr
	}
	for _, table := range scrape.FindAll(root, tableMatcher) {
		for _, row := range scrape.FindAll(table, rowMatcher) {
			fmt.Printf("%s\n", scrape.Text(row))
		}
	}
}
func TweetsToUser(u user.User) []tweet.Tweet {
	reqURL := SearchURL
	_url.SetQueryParams(&reqURL, map[string]string{
		"q": "to:" + u.ScreenName,
		"f": "tweets",
	})
	res, err := http.Get(reqURL.String())
	PanicIf(err)
	root, err := html.Parse(res.Body)
	PanicIf(err)
	tweetsMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Div && strings.HasPrefix(scrape.Attr(n, "class"), "tweet original-tweet")
	}
	tweetScreenNameMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.Span && strings.HasPrefix(scrape.Attr(n, "class"), "username")
	}
	tweetTextMatcher := func(n *html.Node) bool {
		return n.DataAtom == atom.P && strings.HasSuffix(scrape.Attr(n, "class"), "tweet-text")
	}
	tweetNodes := scrape.FindAll(root, tweetsMatcher)
	tweets := make([]tweet.Tweet, len(tweetNodes))
	for i, n := range tweetNodes {
		t := tweet.Tweet{
			ID: scrape.Attr(n, "data-user-id"),
		}
		if child, ok := scrape.Find(n, tweetScreenNameMatcher); ok {
			t.Author = *user.NewUser(scrape.Text(child))
		}
		if child, ok := scrape.Find(n, tweetTextMatcher); ok {
			t.Text = scrape.Text(child)
		}
		tweets[i] = t
	}
	return tweets
}
func main() {
	client := &http.Client{}
	req, err := http.NewRequest("GET", "http://whatsmyuseragent.com/", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("User-Agent", ua)
	resp, err := client.Do(req)
	// resp, err := http.Get("http://whatsmyuseragent.com/")
	if err != nil {
		panic(err)
	}
	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	info := scrape.ByClass("info")
	data := scrape.FindAll(root, info)
	for _, v := range data {
		fmt.Printf("%s\n", scrape.Text(v))
	}
}