// parseGenericLoginForm takes a login page and parses the first form it finds, treating it as the
// login form.
func parseGenericLoginForm(res *http.Response) (result *loginFormInfo, err error) {
    parsed, err := html.ParseFragment(res.Body, nil)
    if err != nil {
        return
    } else if len(parsed) != 1 {
        return nil, errors.New("wrong number of root elements")
    }
    root := parsed[0]

    var form loginFormInfo

    htmlForm, ok := scrape.Find(root, scrape.ByTag(atom.Form))
    if !ok {
        return nil, errors.New("no form element found")
    }

    if actionStr := getNodeAttribute(htmlForm, "action"); actionStr == "" {
        form.action = res.Request.URL.String()
    } else {
        actionURL, err := url.Parse(actionStr)
        if err != nil {
            return nil, err
        }
        if actionURL.Host == "" {
            actionURL.Host = res.Request.URL.Host
        }
        if actionURL.Scheme == "" {
            actionURL.Scheme = res.Request.URL.Scheme
        }
        if !path.IsAbs(actionURL.Path) {
            actionURL.Path = path.Join(res.Request.URL.Path, actionURL.Path)
        }
        form.action = actionURL.String()
    }

    inputs := scrape.FindAll(root, scrape.ByTag(atom.Input))
    form.otherFields = url.Values{}
    for _, input := range inputs {
        inputName := getNodeAttribute(input, "name")
        switch getNodeAttribute(input, "type") {
        case "text":
            form.usernameField = inputName
        case "password":
            form.passwordField = inputName
        default:
            form.otherFields.Add(inputName, getNodeAttribute(input, "value"))
        }
    }

    if form.usernameField == "" {
        return nil, errors.New("no username field found")
    } else if form.passwordField == "" {
        return nil, errors.New("no password field found")
    }

    return &form, nil
}
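// Example (a minimal sketch, not part of the original code): fetching a
// hypothetical login page and feeding the response to parseGenericLoginForm.
// The URL is illustrative; loginFormInfo's fields are assumed to be the ones
// used above (action, usernameField, passwordField, otherFields).
func exampleParseGenericLoginForm() error {
    res, err := http.Get("https://example.com/login") // hypothetical URL
    if err != nil {
        return err
    }
    defer res.Body.Close()

    form, err := parseGenericLoginForm(res)
    if err != nil {
        return err
    }
    // The parsed form can now be submitted with the user's credentials.
    fmt.Printf("POST to %s with fields %s and %s\n",
        form.action, form.usernameField, form.passwordField)
    return nil
}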
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
    client := urlfetch.Client(ctx)
    resp, err := client.Get("http://167.88.16.61:2138/" + url)
    // Check the error before touching resp: on failure resp may be nil, so the
    // original order (logging resp.Status first) could dereference a nil pointer.
    if err != nil {
        ctx.Errorf("%s", err)
        return nil, errors.New("Get listing failed")
    }
    ctx.Debugf("Craigslist request came back with status: %s", resp.Status)

    root, err := html.Parse(resp.Body)
    if err != nil {
        ctx.Errorf("%s", "Parsing Error")
        return nil, errors.New("Parse body failed")
    }

    title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
    if !ok {
        ctx.Errorf("%s", "Error getting title")
        return nil, errors.New("Get title failed")
    }
    price, ok := scrape.Find(root, scrape.ByClass("price"))
    if !ok {
        ctx.Errorf("%s", "Error getting price")
        return nil, errors.New("Get price failed")
    }
    // The price text starts with a currency symbol, e.g. "$100"; skip it.
    intPrice, err := strconv.Atoi(scrape.Text(price)[1:])
    if err != nil {
        ctx.Errorf("Error casting price: %s", scrape.Text(price))
        return nil, err
    }

    images := scrape.FindAll(root, scrape.ByTag(atom.Img))
    imageUrl := ""
    for _, image := range images {
        if scrape.Attr(image, "title") == "image 1" {
            imageUrl = scrape.Attr(image, "src")
        }
    }

    ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, scrape.Text(title))

    return &Listing{
        Url:      url,
        Title:    scrape.Text(title),
        Price:    intPrice,
        ImageUrl: imageUrl,
    }, nil
}
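// Example (a sketch under the classic App Engine SDK, not part of the original
// code): NewListing would typically be called from a request handler, where an
// appengine.Context can be derived from the incoming request. The listing path
// is illustrative.
func exampleListingHandler(w http.ResponseWriter, r *http.Request) {
    ctx := appengine.NewContext(r)
    listing, err := NewListing(ctx, "some/listing/path") // hypothetical path
    if err != nil {
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }
    fmt.Fprintf(w, "%s: $%d\n", listing.Title, listing.Price)
}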
// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
func (s *Session) Auth(serviceURL, email, password string) error {
    resp, err := s.Get(serviceURL)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    parsed, err := html.ParseFragment(resp.Body, nil)
    if err != nil {
        return err
    }
    // The original code returned a nil error here when no root elements were
    // parsed; report it explicitly instead.
    if len(parsed) == 0 {
        return errors.New("failed to parse login page")
    }
    root := parsed[0]

    form, ok := scrape.Find(root, scrape.ById("gaia_loginform"))
    if !ok {
        return errors.New("failed to process login page")
    }
    submission := url.Values{}
    for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) {
        submission.Add(getAttribute(input, "name"), getAttribute(input, "value"))
    }
    submission["Email"] = []string{email}
    submission["Passwd"] = []string{password}

    postResp, err := s.PostForm(resp.Request.URL.String(), submission)
    if err != nil {
        return err
    }
    postResp.Body.Close()

    // A successful login redirects (ending in a GET); landing back on a POST
    // means the form was re-served with an error.
    if postResp.Request.Method == "POST" {
        return errors.New("login incorrect")
    }

    return nil
}
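// Example (a sketch, not part of the original code): assuming Session wraps an
// http.Client that keeps a cookie jar, authenticating against a Google service
// might look like this. NewSession is a hypothetical constructor; the URL and
// credentials are illustrative.
func exampleAuth() error {
    s := NewSession() // hypothetical constructor
    return s.Auth("https://mail.google.com", "user@gmail.com", "correct horse battery staple")
}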
func indexPage(page string) (ind map[string]int, branches []string, err error) {
    resp, err := http.Get(page)
    if err != nil {
        return
    }
    root, err := html.Parse(resp.Body)
    resp.Body.Close()
    if err != nil {
        return
    }

    content, ok := scrape.Find(root, scrape.ById("bodyContent"))
    if !ok {
        return nil, nil, errors.New("no bodyContent element")
    }

    paragraphs := scrape.FindAll(content, scrape.ByTag(atom.P))
    pageText := ""
    for _, p := range paragraphs {
        pageText += elementInnerText(p) + " "
    }
    words := strings.Fields(strings.ToLower(pageText))

    ind = map[string]int{}
    for _, word := range words {
        ind[word]++
    }

    links := findWikiLinks(content)
    branches = make([]string, len(links))
    for i, link := range links {
        branches[i] = "https://en.wikipedia.org" + link
    }
    return
}
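// Example (a minimal sketch, not part of the original code): indexing a single
// article and inspecting its word counts and outgoing wiki links. The article
// URL is illustrative.
func exampleIndexPage() {
    ind, branches, err := indexPage("https://en.wikipedia.org/wiki/Web_scraping")
    if err != nil {
        panic(err)
    }
    fmt.Printf("%d occurrences of 'scraping', %d outgoing links\n",
        ind["scraping"], len(branches))
}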
func findHTMLTitle(doc *html.Node) string {
    el, found := scrape.Find(doc, scrape.ByTag(atom.Title))
    if !found {
        return ""
    }
    return scrape.Text(el)
}
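// Example (a minimal sketch, not part of the original code): parsing a small
// document held in memory and extracting its title with findHTMLTitle.
func exampleFindHTMLTitle() {
    doc, err := html.Parse(strings.NewReader("<html><head><title>Hi</title></head></html>"))
    if err != nil {
        panic(err)
    }
    fmt.Println(findHTMLTitle(doc)) // prints "Hi"
}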
func parseVideoInfo(element *html.Node) *YoutubeVideoInfo {
    var info YoutubeVideoInfo

    info.ID = scrape.Attr(element, "data-context-item-id")

    thumbnailContainer, ok := scrape.Find(element, scrape.ByClass("yt-thumb-simple"))
    if ok {
        thumbnailImage, ok := scrape.Find(thumbnailContainer, scrape.ByTag(atom.Img))
        if ok {
            info.ThumbnailURL, _ = url.Parse(scrape.Attr(thumbnailImage, "src"))
        }
    }

    videoTimeElement, ok := scrape.Find(element, scrape.ByClass("video-time"))
    if ok {
        durationStr := strings.TrimSpace(scrape.Text(videoTimeElement))
        info.Length, _ = parseVideoDuration(durationStr)
    }

    linkFieldClasses := []string{"yt-lockup-title", "yt-lockup-byline"}
    linkFieldPtrs := []*string{&info.Title, &info.Author}
    for i, class := range linkFieldClasses {
        linkContainer, ok := scrape.Find(element, scrape.ByClass(class))
        if ok {
            link, ok := scrape.Find(linkContainer, scrape.ByTag(atom.A))
            if ok {
                *linkFieldPtrs[i] = strings.TrimSpace(scrape.Text(link))
            }
        }
    }

    descBox, ok := scrape.Find(element, scrape.ByClass("yt-lockup-description"))
    if ok {
        info.Description = strings.TrimSpace(scrape.Text(descBox))
    }

    return &info
}
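// Example (a sketch, not part of the original code): parseVideoInfo expects a
// single result element from a YouTube listing page. The "yt-lockup-video"
// container class is an assumption about the page markup of that era.
func exampleParseVideoInfos(root *html.Node) []*YoutubeVideoInfo {
    var infos []*YoutubeVideoInfo
    for _, el := range scrape.FindAll(root, scrape.ByClass("yt-lockup-video")) {
        infos = append(infos, parseVideoInfo(el))
    }
    return infos
}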
func resolveUrl(website string) string {
    site := getURL(website)

    contents, err := html.Parse(site.Body)
    if err != nil {
        // The original code also had an unreachable panic(err) after os.Exit.
        fmt.Printf("%s\n", err)
        os.Exit(1)
    }
    // Check the lookup result: scrape.Text on a missing node would panic.
    title, ok := scrape.Find(contents, scrape.ByTag(atom.Title))
    if !ok {
        return ""
    }
    return scrape.Text(title)
}
func queryWikipedia(word string) string {
    word = strings.TrimSpace(word)
    website := "http://en.wikipedia.org/wiki/" + word
    site := getURL(website)

    contents, err := html.Parse(site.Body)
    if err != nil {
        // fmt.Print with a format string was a typo for Printf; the original
        // os.Exit(1) after panic(err) was unreachable.
        fmt.Printf("%s\n", err)
        panic(err)
    }
    // Check the lookup result: scrape.Text on a missing node would panic.
    intro, ok := scrape.Find(contents, scrape.ByTag(atom.P))
    if !ok {
        return ""
    }
    return scrape.Text(intro)
}
// tableEntriesAsMaps takes a <table> and parses its headers and row entries.
// Oftentimes, an HTML table has one row of headers followed by several rows of data. This method
// uses the headers as map keys. It returns an array of map objects representing the rows of the
// table, with the <th>'s as keys and their corresponding <td>'s as values.
func tableEntriesAsMaps(table *html.Node) ([]map[string]string, error) {
    headings := scrape.FindAll(table, scrape.ByTag(atom.Th))
    cells := scrape.FindAll(table, scrape.ByTag(atom.Td))
    // Guard against a headerless table; the modulo below would otherwise
    // divide by zero.
    if len(headings) == 0 {
        return nil, errors.New("table has no headings")
    }
    if len(cells)%len(headings) != 0 {
        return nil, errors.New("number of cells should be divisible by number of headings")
    }

    headingText := make([]string, len(headings))
    for i, heading := range headings {
        headingText[i] = strings.TrimSpace(nodeInnerText(heading))
    }

    maps := make([]map[string]string, len(cells)/len(headings))
    for rowIndex := 0; rowIndex < len(maps); rowIndex++ {
        row := map[string]string{}
        maps[rowIndex] = row
        for colIndex := 0; colIndex < len(headings); colIndex++ {
            cellIndex := rowIndex*len(headings) + colIndex
            row[headingText[colIndex]] = strings.TrimSpace(nodeInnerText(cells[cellIndex]))
        }
    }
    return maps, nil
}
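// Example (a minimal sketch, not part of the original code): for a table like
//
//	<table><tr><th>Name</th><th>Age</th></tr>
//	       <tr><td>Ann</td><td>34</td></tr></table>
//
// tableEntriesAsMaps returns []map[string]string{{"Name": "Ann", "Age": "34"}}.
func exampleTableEntries(root *html.Node) ([]map[string]string, error) {
    table, ok := scrape.Find(root, scrape.ByTag(atom.Table))
    if !ok {
        return nil, errors.New("no table element found")
    }
    return tableEntriesAsMaps(table)
}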
func ParseRecord(n *html.Node) Torrent {
    tds := scrape.FindAll(n, scrape.ByTag(atom.Td))
    if len(tds) != 4 {
        fmt.Println("Error: unexpected row format")
        return Torrent{}
    }
    cat := scrape.Text(tds[0])[0:3]
    name, magnet, desc := ParseName(tds[1])
    matches := re.FindStringSubmatch(desc)
    uptime, size, uploader := matches[1], matches[2], matches[3]
    seed := scrape.Text(tds[2])
    leech := scrape.Text(tds[3])
    return Torrent{cat, name, magnet, size, uptime, uploader, seed, leech}
}
// parseServerStatus returns a slice of strings containing only server stats.
func parseServerStatus(root *html.Node) []string {
    var apacheStats []string

    // Lines with stats start with a number.
    var validStats = regexp.MustCompile(`^[0-9]`)

    // Grab all the table rows.
    rows := scrape.FindAll(root, scrape.ByTag(atom.Tr))

    // If a row matches, add it to the stats lines.
    for _, row := range rows {
        content := scrape.Text(row)
        if validStats.MatchString(content) {
            apacheStats = append(apacheStats, content)
        }
    }

    Log(fmt.Sprintf("parseServerStatus apacheStats='%d'", len(apacheStats)), "debug")
    return apacheStats
}
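// Example (a sketch, not part of the original code): fetching an Apache
// mod_status page and extracting the stat lines. The host is illustrative.
func exampleServerStatus() ([]string, error) {
    resp, err := http.Get("http://localhost/server-status") // hypothetical host
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    root, err := html.Parse(resp.Body)
    if err != nil {
        return nil, err
    }
    return parseServerStatus(root), nil
}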
func findWikiLinks(node *html.Node) []string {
    links := scrape.FindAll(node, scrape.ByTag(atom.A))
    res := make([]string, 0, len(links))
    for _, link := range links {
        var u string
        for _, attr := range link.Attr {
            if strings.ToLower(attr.Key) == "href" {
                u = attr.Val
                break
            }
        }
        if strings.HasPrefix(u, "/wiki/") {
            res = append(res, u)
        }
    }
    return res
}
func main() {
    // request and parse the front page
    resp, err := http.Get("https://torguard.net/downloads.php")
    if err != nil {
        panic(err)
    }
    root, err := html.Parse(resp.Body)
    if err != nil {
        panic(err)
    }

    rows := scrape.FindAll(root, scrape.ByTag(atom.Tr))
    for _, row := range rows {
        if strings.Contains(scrape.Text(row), "DEBIAN x64") {
            l := getLink(row)
            fmt.Printf("%s \n %s \n", scrape.Text(row), l)
        }
    }
}
// Get returns the set of arrival times for the buses at the given stop.
// Callers must check that no error is returned.
func Get(parada int) (TiemposParada, error) {
    resp, err := http.Get("http://www.auvasa.es/paradamb.asp?codigo=" + strconv.Itoa(parada))
    if err != nil {
        return TiemposParada{}, errors.New("could not connect to the AUVASA server")
    }

    // The AUVASA site serves Windows-1252; transcode it to UTF-8 before parsing.
    rInUTF8 := transform.NewReader(resp.Body, charmap.Windows1252.NewDecoder())
    root, err := html.Parse(rInUTF8)
    if err != nil {
        return TiemposParada{}, errors.New("error in the AUVASA response")
    }

    headers := scrape.FindAll(root, scrape.ByTag(atom.H1))
    if len(headers) < 2 {
        return TiemposParada{}, errors.New("the given stop appears to be invalid")
    }

    lineasTiempos := scrape.FindAll(root, scrape.ByClass("style36"))
    resultados := make([]ProximoBus, len(lineasTiempos))
    for i, item := range lineasTiempos {
        valores := scrape.FindAll(item, scrape.ByClass("style38"))
        // Guard against short rows before indexing into valores.
        if len(valores) < 4 {
            return TiemposParada{}, errors.New("unexpected row format in the AUVASA response")
        }
        resultados[i] = ProximoBus{
            Linea:   scrape.Text(valores[0]),
            Destino: scrape.Text(valores[2]),
            Minutos: scrape.Text(valores[3]),
        }
    }

    if len(resultados) == 0 {
        return TiemposParada{}, errors.New("no times for the given stop; it may be invalid or there may be no more buses")
    }

    return TiemposParada{
        Nombre:  scrape.Text(headers[1]),
        Tiempos: resultados,
        Momento: time.Now(),
        Codigo:  parada,
    }, nil
}
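// Example (a minimal sketch, not part of the original code): looking up a stop
// and printing the upcoming buses. The stop number is illustrative.
func exampleGet() {
    tiempos, err := Get(811) // hypothetical stop code
    if err != nil {
        fmt.Println(err)
        return
    }
    for _, bus := range tiempos.Tiempos {
        fmt.Printf("line %s to %s in %s min\n", bus.Linea, bus.Destino, bus.Minutos)
    }
}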
func getTitle(url string) string {
    resp, err := http.Get(url)
    if err != nil {
        fmt.Println("error:", err)
        return "error"
    }
    root, err := html.Parse(resp.Body)
    if err != nil {
        fmt.Println("error:", err)
        return "error"
    }
    title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
    if ok {
        return scrape.Text(title)
    }
    return "unknown"
}
// parseExtraComponentInfo parses the "Class Detail" page for a component.
func parseExtraComponentInfo(body io.Reader, component *Component) (courseOpen bool, err error) {
    nodes, err := html.ParseFragment(body, nil)
    if err != nil {
        return
    }
    if len(nodes) != 1 {
        return false, errors.New("invalid number of root elements")
    }

    openStatus, ok := scrape.Find(nodes[0], scrape.ById("SSR_CLS_DTL_WRK_SSR_DESCRSHORT"))
    if !ok {
        return false, errors.New("open status not found")
    }
    courseOpen = (nodeInnerText(openStatus) == "Open")

    availTable, ok := scrape.Find(nodes[0], scrape.ById("ACE_SSR_CLS_DTL_WRK_GROUP3"))
    if !ok {
        return courseOpen, errors.New("could not find availability info")
    }
    rows := scrape.FindAll(availTable, scrape.ByTag(atom.Tr))
    if len(rows) != 7 {
        return courseOpen, errors.New("invalid number of rows in availability table")
    }

    var availability ClassAvailability

    cols := nodesWithAlignAttribute(scrape.FindAll(rows[2], scrape.ByTag(atom.Td)))
    if len(cols) != 2 {
        return courseOpen, errors.New("expected 2 aligned columns in row 2")
    }
    availability.Capacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
    if err != nil {
        return
    }
    availability.WaitListCapacity, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
    if err != nil {
        return
    }

    cols = nodesWithAlignAttribute(scrape.FindAll(rows[4], scrape.ByTag(atom.Td)))
    if len(cols) != 2 {
        return courseOpen, errors.New("expected 2 aligned columns in row 4")
    }
    availability.EnrollmentTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
    if err != nil {
        return
    }
    availability.WaitListTotal, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[1])))
    if err != nil {
        return
    }

    cols = nodesWithAlignAttribute(scrape.FindAll(rows[6], scrape.ByTag(atom.Td)))
    if len(cols) != 1 {
        return courseOpen, errors.New("expected 1 aligned column in row 6")
    }
    availability.AvailableSeats, err = strconv.Atoi(strings.TrimSpace(nodeInnerText(cols[0])))
    if err != nil {
        return
    }

    component.ClassAvailability = &availability
    return
}
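// Example (a sketch, not part of the original code): feeding a fetched "Class
// Detail" page into parseExtraComponentInfo for an already-known component.
// The URL parameter is illustrative.
func exampleComponentInfo(detailPageURL string, component *Component) (bool, error) {
    resp, err := http.Get(detailPageURL)
    if err != nil {
        return false, err
    }
    defer resp.Body.Close()
    return parseExtraComponentInfo(resp.Body, component)
}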