예제 #1
0
// parseGenericLoginForm takes a login page and parses the first form it finds, treating it as the
// login form.
//
// The form's action attribute is resolved as a URL reference against the URL of the request that
// produced res; an empty action means the form posts back to that URL. Inputs are collected from
// the whole parsed document, not only from inside the form: the last input of type "text" names
// the username field, the last input of type "password" names the password field, and every
// other input is stored in otherFields together with its value attribute.
//
// An error is returned when the body cannot be parsed, when it does not yield exactly one root
// element, when there is no form, when the action is not a valid URL, or when either the
// username or the password field is missing.
func parseGenericLoginForm(res *http.Response) (result *loginFormInfo, err error) {
	parsed, err := html.ParseFragment(res.Body, nil)
	if err != nil {
		return nil, err
	}
	if len(parsed) != 1 {
		return nil, errors.New("wrong number of root elements")
	}
	root := parsed[0]

	htmlForm, ok := scrape.Find(root, scrape.ByTag(atom.Form))
	if !ok {
		return nil, errors.New("no form element found")
	}

	var form loginFormInfo

	if actionStr := getNodeAttribute(htmlForm, "action"); actionStr == "" {
		form.action = res.Request.URL.String()
	} else {
		// URL.Parse resolves actionStr relative to the page URL. Joining the action onto the
		// full page path would turn "/a/login.html" + "do" into "/a/login.html/do" instead of
		// "/a/do".
		actionURL, err := res.Request.URL.Parse(actionStr)
		if err != nil {
			return nil, err
		}
		form.action = actionURL.String()
	}

	form.otherFields = url.Values{}
	for _, input := range scrape.FindAll(root, scrape.ByTag(atom.Input)) {
		inputName := getNodeAttribute(input, "name")
		switch getNodeAttribute(input, "type") {
		case "text":
			form.usernameField = inputName
		case "password":
			form.passwordField = inputName
		default:
			form.otherFields.Add(inputName, getNodeAttribute(input, "value"))
		}
	}

	if form.usernameField == "" {
		return nil, errors.New("no username field found")
	}
	if form.passwordField == "" {
		return nil, errors.New("no password field found")
	}

	return &form, nil
}
예제 #2
0
// NewListing requests "http://167.88.16.61:2138/" + url through the App Engine urlfetch client
// and scrapes a Listing out of the returned HTML.
//
// The title comes from the <title> element, the price from the element with class "price"
// (its first character is dropped before the rest is parsed as an integer), and the image URL
// from the src of the last <img> whose title attribute is "image 1". ImageUrl is left empty
// when no such image exists.
//
// Failures are logged on ctx and reported as an error; the returned *Listing is nil in that case.
func NewListing(ctx appengine.Context, url string) (*Listing, error) {
	client := urlfetch.Client(ctx)
	resp, err := client.Get("http://167.88.16.61:2138/" + url)
	if err != nil {
		// resp is nil when the request fails, so return before touching it.
		ctx.Errorf("%s", err)
		return nil, errors.New("Get listing failed")
	}
	defer resp.Body.Close()
	ctx.Debugf("Craigslist request came back with status: %s", resp.Status)

	root, err := html.Parse(resp.Body)
	if err != nil {
		ctx.Errorf("%s", "Parsing Error")
		return nil, errors.New("Parse body failed")
	}

	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		ctx.Errorf("%s", "Error getting title")
		return nil, errors.New("Get title failed")
	}
	price, ok := scrape.Find(root, scrape.ByClass("price"))
	if !ok {
		ctx.Errorf("%s", "Error getting price")
		return nil, errors.New("Get price failed")
	}

	priceText := scrape.Text(price)
	if priceText == "" {
		// Slicing off the leading character of an empty string would panic.
		ctx.Errorf("Error casting price: %s", priceText)
		return nil, errors.New("Get price failed")
	}
	// The first character is skipped; presumably it is a currency symbol.
	intPrice, err := strconv.Atoi(priceText[1:])
	if err != nil {
		ctx.Errorf("Error casting price: %s", priceText)
		return nil, err
	}

	imageUrl := ""
	for _, image := range scrape.FindAll(root, scrape.ByTag(atom.Img)) {
		if scrape.Attr(image, "title") == "image 1" {
			imageUrl = scrape.Attr(image, "src")
		}
	}

	titleText := scrape.Text(title)
	ctx.Debugf("Craigslist returned listing.Price: %d, listing.Title: %s", intPrice, titleText)

	return &Listing{
		Url:      url,
		Title:    titleText,
		Price:    intPrice,
		ImageUrl: imageUrl,
	}, nil
}
예제 #3
0
파일: auth.go 프로젝트: unixpickle/gscrape
// Auth attempts to access a given URL, then enters the given
// credentials when the URL redirects to a login page.
//
// The page returned for serviceURL must contain an element with id
// "gaia_loginform". Every <input> inside it is copied into the
// submission by name and value, after which the "Email" and "Passwd"
// entries are replaced with the supplied credentials. The submission
// is posted to the URL of the request that produced the login page.
//
// Auth returns an error if either request fails, if the login page
// cannot be parsed or is empty, if the login form is missing, or if
// the request behind the final response is still a POST.
func (s *Session) Auth(serviceURL, email, password string) error {
	resp, err := s.Get(serviceURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	parsed, err := html.ParseFragment(resp.Body, nil)
	if err != nil {
		return err
	}
	if len(parsed) == 0 {
		// Without this check an empty page would be reported as a
		// successful login, because err is nil here.
		return errors.New("failed to process login page")
	}
	root := parsed[0]

	form, ok := scrape.Find(root, scrape.ById("gaia_loginform"))
	if !ok {
		return errors.New("failed to process login page")
	}

	submission := url.Values{}
	for _, input := range scrape.FindAll(form, scrape.ByTag(atom.Input)) {
		submission.Add(getAttribute(input, "name"), getAttribute(input, "value"))
	}
	submission["Email"] = []string{email}
	submission["Passwd"] = []string{password}

	postResp, err := s.PostForm(resp.Request.URL.String(), submission)
	if err != nil {
		return err
	}
	postResp.Body.Close()

	// NOTE(review): presumably a successful login redirects to a GET, so
	// ending on the POST itself means the credentials were rejected.
	if postResp.Request.Method == "POST" {
		return errors.New("login incorrect")
	}

	return nil
}
예제 #4
0
파일: index.go 프로젝트: unixpickle/weakai
// indexPage downloads page and indexes the element with id "bodyContent".
//
// ind maps each lower-cased, whitespace-separated word found in the <p>
// elements of that element to its number of occurrences. branches holds the
// links returned by findWikiLinks for the same element, each prefixed with
// "https://en.wikipedia.org".
//
// An error is returned if the download or the HTML parse fails, or if the
// page has no "bodyContent" element; ind and branches are nil in that case.
func indexPage(page string) (ind map[string]int, branches []string, err error) {
	resp, err := http.Get(page)
	if err != nil {
		return nil, nil, err
	}
	root, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		return nil, nil, err
	}

	content, ok := scrape.Find(root, scrape.ById("bodyContent"))
	if !ok {
		return nil, nil, errors.New("no bodyContent element")
	}

	// Accumulate paragraph text in a builder; repeated string concatenation
	// copies the whole text again for every paragraph.
	var pageText strings.Builder
	for _, p := range scrape.FindAll(content, scrape.ByTag(atom.P)) {
		pageText.WriteString(elementInnerText(p))
		pageText.WriteByte(' ')
	}

	ind = map[string]int{}
	for _, word := range strings.Fields(strings.ToLower(pageText.String())) {
		ind[word]++
	}

	links := findWikiLinks(content)
	branches = make([]string, len(links))
	for i, link := range links {
		branches[i] = "https://en.wikipedia.org" + link
	}
	return ind, branches, nil
}
예제 #5
0
파일: title.go 프로젝트: mcmillan/socialite
// findHTMLTitle returns the text of the <title> element that scrape.Find
// locates under doc, or the empty string when there is none.
func findHTMLTitle(doc *html.Node) string {
	if titleNode, ok := scrape.Find(doc, scrape.ByTag(atom.Title)); ok {
		return scrape.Text(titleNode)
	}
	return ""
}
예제 #6
0
// parseVideoInfo fills a YoutubeVideoInfo from a single result element.
//
// The ID is read from the element's "data-context-item-id" attribute. Every
// other field is optional: it is set only when the matching child element is
// present and is otherwise left at its zero value. Errors from parsing the
// thumbnail URL or the duration are deliberately ignored.
func parseVideoInfo(element *html.Node) *YoutubeVideoInfo {
	info := new(YoutubeVideoInfo)
	info.ID = scrape.Attr(element, "data-context-item-id")

	if thumbBox, found := scrape.Find(element, scrape.ByClass("yt-thumb-simple")); found {
		if img, found := scrape.Find(thumbBox, scrape.ByTag(atom.Img)); found {
			info.ThumbnailURL, _ = url.Parse(scrape.Attr(img, "src"))
		}
	}

	if timeNode, found := scrape.Find(element, scrape.ByClass("video-time")); found {
		info.Length, _ = parseVideoDuration(strings.TrimSpace(scrape.Text(timeNode)))
	}

	// storeLinkText writes the trimmed text of the <a> inside the element with
	// the given class into dst, leaving dst untouched when either is missing.
	storeLinkText := func(class string, dst *string) {
		box, found := scrape.Find(element, scrape.ByClass(class))
		if !found {
			return
		}
		anchor, found := scrape.Find(box, scrape.ByTag(atom.A))
		if !found {
			return
		}
		*dst = strings.TrimSpace(scrape.Text(anchor))
	}
	storeLinkText("yt-lockup-title", &info.Title)
	storeLinkText("yt-lockup-byline", &info.Author)

	if desc, found := scrape.Find(element, scrape.ByClass("yt-lockup-description")); found {
		info.Description = strings.TrimSpace(scrape.Text(desc))
	}

	return info
}
예제 #7
0
// resolveUrl fetches website through getURL and returns the text of the
// page's <title> element, or the empty string when the page has none.
//
// If the response body cannot be parsed as HTML, the error is printed and the
// process exits with status 1.
func resolveUrl(website string) string {
	site := getURL(website)

	contents, err := html.Parse(site.Body)
	if err != nil {
		fmt.Printf("%s", err)
		os.Exit(1)
	}

	// Check the "found" flag: passing a nil node on to scrape.Text would
	// dereference it.
	title, ok := scrape.Find(contents, scrape.ByTag(atom.Title))
	if !ok {
		return ""
	}
	return scrape.Text(title)
}
예제 #8
0
// queryWikipedia fetches "http://en.wikipedia.com/wiki/" + word (with
// surrounding whitespace trimmed from word) through getURL and returns the
// text of the <p> element that scrape.Find locates in the page, or the empty
// string when the page has no <p>.
//
// If the response body cannot be parsed as HTML, the error is printed and the
// function panics with it.
func queryWikipedia(word string) string {
	word = strings.TrimSpace(word)
	site := getURL("http://en.wikipedia.com/wiki/" + word)

	contents, err := html.Parse(site.Body)
	if err != nil {
		// Printf, not Print: the first argument is a format string.
		fmt.Printf("%s", err)
		panic(err)
	}

	// Check the "found" flag: passing a nil node on to scrape.Text would
	// dereference it.
	intro, ok := scrape.Find(contents, scrape.ByTag(atom.P))
	if !ok {
		return ""
	}
	return scrape.Text(intro)
}
예제 #9
0
// tableEntriesAsMaps takes a <table> and parses its headers and row entries.
// Often times, an HTML table has one row of headers followed by several rows of data. This method
// uses the headers as map keys. It returns an array of map objects representing the rows of the
// table, with the <th>'s as keys and their corresponding <td>'s as values.
//
// Cells are assigned to rows purely by position: all <td> elements under table are taken in
// document order and split into consecutive groups of len(headings). Heading and cell text is
// trimmed of surrounding whitespace.
//
// An error is returned if the table has no <th> elements or if the number of <td> elements is
// not a multiple of the number of <th> elements.
func tableEntriesAsMaps(table *html.Node) ([]map[string]string, error) {
	headings := scrape.FindAll(table, scrape.ByTag(atom.Th))
	if len(headings) == 0 {
		// Without headings the modulo and division below would divide by zero.
		return nil, errors.New("table has no headings")
	}
	cells := scrape.FindAll(table, scrape.ByTag(atom.Td))
	if len(cells)%len(headings) != 0 {
		return nil, errors.New("number of cells should be divisible by number of headings")
	}

	headingText := make([]string, len(headings))
	for i, heading := range headings {
		headingText[i] = strings.TrimSpace(nodeInnerText(heading))
	}

	maps := make([]map[string]string, len(cells)/len(headings))
	for rowIndex := range maps {
		row := make(map[string]string, len(headings))
		maps[rowIndex] = row
		for colIndex, key := range headingText {
			cellIndex := rowIndex*len(headings) + colIndex
			row[key] = strings.TrimSpace(nodeInnerText(cells[cellIndex]))
		}
	}

	return maps, nil
}
예제 #10
0
파일: main.go 프로젝트: anykao/p
// ParseRecord converts one table row into a Torrent.
//
// The row must contain exactly four <td> cells: category, name, seeders and
// leechers, in that order. The category is cut to at most its first three
// bytes. The name cell is handed to ParseName, and the description it returns
// must match the package-level regexp re with at least three capture groups,
// which supply the upload time, size and uploader.
//
// When the row does not have that shape, a message is printed and the zero
// Torrent is returned.
func ParseRecord(n *html.Node) Torrent {
	tds := scrape.FindAll(n, scrape.ByTag(atom.Td))
	if len(tds) != 4 {
		fmt.Println("Error: not expected format")
		return Torrent{}
	}

	// Guard the slice: a category shorter than three bytes would panic.
	cat := scrape.Text(tds[0])
	if len(cat) > 3 {
		cat = cat[:3]
	}

	name, magnet, desc := ParseName(tds[1])

	// FindStringSubmatch returns nil when desc does not match, so check the
	// length before indexing the capture groups.
	matches := re.FindStringSubmatch(desc)
	if len(matches) < 4 {
		fmt.Println("Error: not expected format")
		return Torrent{}
	}
	uptime, size, uploader := matches[1], matches[2], matches[3]

	seed := scrape.Text(tds[2])
	leech := scrape.Text(tds[3])
	return Torrent{cat, name, magnet, size, uptime, uploader, seed, leech}
}
예제 #11
0
파일: apache.go 프로젝트: darron/goshe
// parseServerStatus returns a slice of strings containing only server stats.
// A table row under root counts as a stats line when its text begins with a
// digit; the number of lines kept is logged at "debug" level.
func parseServerStatus(root *html.Node) []string {
	startsWithDigit := regexp.MustCompile(`^[0-9]`)

	var stats []string
	for _, tr := range scrape.FindAll(root, scrape.ByTag(atom.Tr)) {
		if text := scrape.Text(tr); startsWithDigit.MatchString(text) {
			stats = append(stats, text)
		}
	}

	Log(fmt.Sprintf("parseServerStatus apacheStats='%d'", len(stats)), "debug")
	return stats
}
예제 #12
0
파일: index.go 프로젝트: unixpickle/weakai
// findWikiLinks returns the href of every <a> under node whose href begins
// with "/wiki/". Only the first href attribute of each anchor is considered,
// and the attribute name is matched case-insensitively.
func findWikiLinks(node *html.Node) []string {
	anchors := scrape.FindAll(node, scrape.ByTag(atom.A))
	wikiLinks := make([]string, 0, len(anchors))
	for _, anchor := range anchors {
		href := ""
		for i := range anchor.Attr {
			if strings.ToLower(anchor.Attr[i].Key) != "href" {
				continue
			}
			href = anchor.Attr[i].Val
			break
		}
		if !strings.HasPrefix(href, "/wiki/") {
			continue
		}
		wikiLinks = append(wikiLinks, href)
	}
	return wikiLinks
}
예제 #13
0
파일: main.go 프로젝트: jmonmane/scrape
// main downloads the TorGuard downloads page and, for every table row whose
// text contains "DEBIAN x64", prints the row text followed by the link that
// getLink extracts from it. Any request or parse failure panics.
func main() {
	// request and parse the downloads page
	resp, err := http.Get("https://torguard.net/downloads.php")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	root, err := html.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	for _, row := range scrape.FindAll(root, scrape.ByTag(atom.Tr)) {
		rowText := scrape.Text(row)
		if strings.Contains(rowText, "DEBIAN x64") {
			fmt.Printf("%s \n %s \n", rowText, getLink(row))
		}
	}
}
예제 #14
0
파일: auvasa.go 프로젝트: adrm/auvasa
// Get returns the set of arrival times for the buses at the given stop.
// Callers must check the returned error.
//
// The stop page is fetched from www.auvasa.es and decoded from Windows-1252
// before parsing. The stop name is taken from the second <h1> on the page.
// Each element with class "style36" yields one ProximoBus, built from the
// first, third and fourth "style38" elements inside it.
//
// An error is returned when the request fails, when the page cannot be
// parsed or a row has fewer than four "style38" elements, when fewer than
// two <h1> elements are present, or when no arrival rows are found.
func Get(parada int) (TiemposParada, error) {
	resp, err := http.Get("http://www.auvasa.es/paradamb.asp?codigo=" +
		strconv.Itoa(parada))
	if err != nil {
		return TiemposParada{}, errors.New("Error al conectar con el servidor de AUVASA.")
	}
	defer resp.Body.Close()

	rInUTF8 := transform.NewReader(resp.Body, charmap.Windows1252.NewDecoder())
	root, err := html.Parse(rInUTF8)
	if err != nil {
		return TiemposParada{}, errors.New("Error en la respuesta de AUVASA.")
	}

	headers := scrape.FindAll(root, scrape.ByTag(atom.H1))
	if len(headers) < 2 {
		return TiemposParada{}, errors.New("La parada indicada parece errónea.")
	}

	lineasTiempos := scrape.FindAll(root, scrape.ByClass("style36"))
	resultados := make([]ProximoBus, len(lineasTiempos))
	for i, item := range lineasTiempos {
		valores := scrape.FindAll(item, scrape.ByClass("style38"))
		if len(valores) < 4 {
			// Indices 0, 2 and 3 are read below; a shorter row means the
			// page layout is not what this parser expects.
			return TiemposParada{}, errors.New("Error en la respuesta de AUVASA.")
		}
		resultados[i] = ProximoBus{
			Linea:   scrape.Text(valores[0]),
			Destino: scrape.Text(valores[2]),
			Minutos: scrape.Text(valores[3]),
		}
	}

	if len(resultados) == 0 {
		return TiemposParada{}, errors.New("No hay tiempos para la parada especificada. Puede que sea errónea o que ya no haya buses.")
	}

	return TiemposParada{
		Nombre:  scrape.Text(headers[1]),
		Tiempos: resultados,
		Momento: time.Now(),
		Codigo:  parada,
	}, nil
}
예제 #15
0
// getTitle downloads url and returns the text of the page's <title> element.
// It returns "error" (after printing the cause) when the request or the HTML
// parse fails, and "unknown" when the page has no <title>.
func getTitle(url string) string {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("error:", err)
		return "error"
	}
	defer resp.Body.Close()

	root, err := html.Parse(resp.Body)
	if err != nil {
		fmt.Println("error:", err)
		return "error"
	}

	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		return "unknown"
	}
	return scrape.Text(title)
}
예제 #16
0
// parseExtraComponentInfo parses the "Class Detail" page for a component.
//
// courseOpen reports whether the text of the element with id
// "SSR_CLS_DTL_WRK_SSR_DESCRSHORT" is exactly "Open". The availability table
// (id "ACE_SSR_CLS_DTL_WRK_GROUP3") must have seven rows; the aligned cells of
// rows 2, 4 and 6 provide, in order, capacity and wait-list capacity,
// enrollment total and wait-list total, and available seats.
//
// component.ClassAvailability is assigned only when every value was parsed.
// Once the open status has been read, later failures still return it
// alongside the error.
func parseExtraComponentInfo(body io.Reader, component *Component) (courseOpen bool, err error) {
	roots, err := html.ParseFragment(body, nil)
	if err != nil {
		return false, err
	}
	if len(roots) != 1 {
		return false, errors.New("invalid number of root elements")
	}
	doc := roots[0]

	statusNode, found := scrape.Find(doc, scrape.ById("SSR_CLS_DTL_WRK_SSR_DESCRSHORT"))
	if !found {
		return false, errors.New("open status not found")
	}
	courseOpen = nodeInnerText(statusNode) == "Open"

	table, found := scrape.Find(doc, scrape.ById("ACE_SSR_CLS_DTL_WRK_GROUP3"))
	if !found {
		return courseOpen, errors.New("could not find availability info")
	}

	rows := scrape.FindAll(table, scrape.ByTag(atom.Tr))
	if len(rows) != 7 {
		return courseOpen, errors.New("invalid number of rows in availability table")
	}

	// alignedInts reads the aligned <td> cells of row, insists on exactly want
	// of them (failing with countErr otherwise) and converts each to an int,
	// stopping at the first cell that is not a number.
	alignedInts := func(row *html.Node, want int, countErr string) ([]int, error) {
		cells := nodesWithAlignAttribute(scrape.FindAll(row, scrape.ByTag(atom.Td)))
		if len(cells) != want {
			return nil, errors.New(countErr)
		}
		values := make([]int, want)
		for i, cell := range cells {
			v, convErr := strconv.Atoi(strings.TrimSpace(nodeInnerText(cell)))
			if convErr != nil {
				return nil, convErr
			}
			values[i] = v
		}
		return values, nil
	}

	capacities, err := alignedInts(rows[2], 2, "expected 2 aligned columns in row 2")
	if err != nil {
		return courseOpen, err
	}
	totals, err := alignedInts(rows[4], 2, "expected 2 aligned columns in row 4")
	if err != nil {
		return courseOpen, err
	}
	seats, err := alignedInts(rows[6], 1, "expected 1 aligned column in row 6")
	if err != nil {
		return courseOpen, err
	}

	component.ClassAvailability = &ClassAvailability{
		Capacity:         capacities[0],
		WaitListCapacity: capacities[1],
		EnrollmentTotal:  totals[0],
		WaitListTotal:    totals[1],
		AvailableSeats:   seats[0],
	}

	return courseOpen, nil
}