コード例 #1
// scrape loads the GitHub trending page for language and appends a Markdown
// section to filename: a "####<language>" heading followed by one bullet per
// "li.repo-leaderboard-list-item" element (title, link, description). The
// owner, repository name and URL of each item are also printed to stdout.
//
// NOTE(review): this excerpt looks truncated — the bodies of the error checks
// and the closing braces are not visible, so the error-handling policy cannot
// be determined from this view.
func scrape(language string, filename string) {
	var doc *goquery.Document
	var e error
	// var w *bufio.Writer

	// Opened append/write-only without os.O_CREATE, so filename must already exist.
	f, err := os.OpenFile(filename, os.O_APPEND|os.O_WRONLY, 0600)
	if err != nil {

	defer f.Close()

	// Section heading for this language.
	if _, err = f.WriteString(fmt.Sprintf("\n####%s\n", language)); err != nil {

	if doc, e = goquery.NewDocument(fmt.Sprintf("https://github.com/trending?l=%s", language)); e != nil {

	doc.Find("li.repo-leaderboard-list-item").Each(func(i int, s *goquery.Selection) {
		title := s.Find("div h2 a").Text()
		owner := s.Find("span.owner-name").Text()
		repoName := s.Find("strong").Text()
		description := s.Find("p.repo-leaderboard-description").Text()
		// The "exists" result of Attr is discarded; a missing href yields "".
		url, _ := s.Find("h2 a").Attr("href")
		// Prefix the host to the href value.
		url = "https://github.com" + url
		fmt.Println("owner: ", owner)
		fmt.Println("repo: ", repoName)
		fmt.Println("URL: ", url)
		// One Markdown bullet per repository: "* [title](url): description".
		if _, err = f.WriteString("* [" + title + "](" + url + "): " + description + "\n"); err != nil {
コード例 #2
ファイル: cleaner.go プロジェクト: ngs/GoOse
// cleanBadTags inspects the descendants of each direct child of <body> that
// carry an "id", "class" or "name" attribute and tests the attribute value
// against REMOVENODES_RE. It returns the same doc it was given.
//
// NOTE(review): this excerpt looks truncated — the statement that acts on a
// matching node, any update of cont, and the closing braces are not visible.
// The debug message suggests matching nodes are removed; confirm against the
// full source.
func (this *cleaner) cleanBadTags(doc *goquery.Document) *goquery.Document {
	body := doc.Find("body")
	children := body.Children()
	selectors := [3]string{"id", "class", "name"}
	for _, selector := range selectors {
		children.Each(func(i int, s *goquery.Selection) {
			// Every descendant of s that has the attribute named by selector.
			naughtyList := s.Find("*[" + selector + "]")
			cont := 0
			naughtyList.Each(func(j int, e *goquery.Selection) {
				attribute, _ := e.Attr(selector)
				if this.matchNodeRegEx(attribute, REMOVENODES_RE) {
					if this.config.debug {

						log.Printf("Cleaning: Removing node with %s: %s\n", selector, this.config.parser.name(selector, e))
			if this.config.debug {
				log.Printf("%d naughty %s elements found", cont, selector)
	return doc
コード例 #3
ファイル: cleaner.go プロジェクト: ngs/GoOse
// cleanDivs looks at every <div> without element children, normalises its
// text (spaces and tabs trimmed from the ends, lower-cased) and groups such
// divs by that text in framesNodes. It then walks the texts whose count in
// frames is greater than one. It returns the same doc it was given.
//
// NOTE(review): this excerpt looks truncated — the statements that update
// frames, add selections to the lists, and act on the repeated selections are
// not visible, nor are the closing braces. Confirm against the full source.
func (this *cleaner) cleanDivs(doc *goquery.Document) *goquery.Document {
	// frames: normalised text -> count; framesNodes: normalised text -> list of selections.
	frames := make(map[string]int)
	framesNodes := make(map[string]*list.List)
	divs := doc.Find("div")
	divs.Each(func(i int, s *goquery.Selection) {
		children := s.Children()
		// Only leaf divs (no element children) are considered.
		if children.Size() == 0 {
			text := s.Text()
			text = strings.Trim(text, " ")
			text = strings.Trim(text, "\t")
			text = strings.ToLower(text)
			// Lazily create the list for a text seen for the first time.
			if framesNodes[text] == nil {
				framesNodes[text] = list.New()
	for text, freq := range frames {
		if freq > 1 {
			selections := framesNodes[text]
			for s := selections.Front(); s != nil; s = s.Next() {
				// List values are expected to be *goquery.Selection; the assertion panics otherwise.
				selection := s.Value.(*goquery.Selection)
	return doc
コード例 #4
// ogtags extracts the og:title, og:image, ... tags from a webpage
func defaultHTML(i *data.Item, sourceURL string, doc *goquery.Document) {
	fmt.Println("Running OG extract.")

	selection := doc.Find("title")
	if len(selection.Nodes) != 0 {
		i.Caption = selection.Nodes[0].FirstChild.Data

	selection = doc.Find("meta[property*='og']")

	for _, e := range selection.Nodes {
		m := htmlAttributeToMap(e.Attr)

		if m["property"] == "og:title" {
			i.Caption = m["content"]
		if m["property"] == "og:image" {
			if !govalidator.IsRequestURL(m["content"]) {
				log.Println("Invalid url in og:image. " + sourceURL)
			i.ImageURL = m["content"]
		if m["property"] == "og:url" {
			if !govalidator.IsRequestURL(m["content"]) {
				log.Println("Invalid url in og:url. " + sourceURL)
			i.URL = m["content"]
		if m["property"] == "og:description" {
			i.Description = m["content"]
コード例 #5
ファイル: order.go プロジェクト: imos/fxtools
// parseOrderListPage extracts orders from an order-list page.
//
// It first checks that the text of the first <div> inside the first
// ".container" element is one of the two expected page titles and returns an
// error otherwise. For links whose href starts with "../otc/C003.html?" it
// reads the "order_id" and "order_method" query parameters into an Order.
//
// Returns the collected orders, a bool that is true when the container holds
// exactly one `a[accesskey="#"]` element, and a nil error on success.
//
// NOTE(review): this excerpt looks truncated — the selector/Each call that
// invokes the callback, the bodies of the skip branches, and the closing
// braces are not visible. The meaning of the bool (presumably "a further page
// exists") should be confirmed against the caller.
func parseOrderListPage(s *goquery.Document) ([]Order, bool, error) {
	c := s.Find(".container").First()
	t := c.Find("div").First().Text()
	if t != ">注文情報(一覧)<" && t != ">注文情報(検索)<" {
		return nil, false, fmt.Errorf("cannot open \"注文情報(一覧)\", but %#v", t)
	// Remove the title row

	results := []Order{}
		func(_ int, s *goquery.Selection) {
			href, ok := s.Attr("href")
			// Only order-detail links are of interest.
			if !ok || !strings.HasPrefix(href, "../otc/C003.html?") {
			u, err := url.Parse(href)
			if err != nil || u.RawQuery == "" {
			v, err := url.ParseQuery(u.RawQuery)
			results = append(results, Order{
				OrderId:     v.Get("order_id"),
				OrderMethod: v.Get("order_method"),

	return results, c.Find("a[accesskey=\"#\"]").Length() == 1, nil
コード例 #6
ファイル: tao.go プロジェクト: qgweb/new
func GetShopName(p *goquery.Document) string {
	name := p.Find(".tb-shop-name").Text()
	if name == "" {
		name = p.Find(".slogo-shopname").Text()
	return strings.TrimSpace(name)
コード例 #7
ファイル: website.go プロジェクト: golibri/website
func feedsFromDoc(doc *goquery.Document, text string) []string {
	sel := "link[type='application/rss+xml']"
	sel += ", link[type='application/atom+xml']"
	matches := doc.Find(sel)

	if matches.Length() > 0 {
		feeds := make([]string, matches.Length())
		matches.Each(func(i int, s *goquery.Selection) {
			url, _ := s.Attr("href")
			feeds[i] = url
		return feeds

	rx := regexp.MustCompile(`href=['"]([^'"]*(rss|atom|feed|xml)[^'"]*)['"]`)
	if rx.FindString(text) != "" {
		matches := rx.FindAllStringSubmatch(text, -1)
		feeds := make([]string, len(matches))
		for i, e := range matches {
			feeds[i] = e[1]
		return feeds

	return make([]string, 0)
コード例 #8
ファイル: matchPage.go プロジェクト: trtstm/zeejongparser
Get the two teams in a match
func getTeamsId(d *goquery.Document) ([2]int, error) {
	var ids [2]int

	url1, ok := d.Find("div.container.left h3 a").Attr("href")
	if !ok {
		return ids, errors.New("could not find team a")

	idA, err := parseTeam(BASE + url1)
	if err != nil {
		return ids, err

	url2, ok := d.Find("div.container.right h3 a").Attr("href")
	if !ok {
		return ids, errors.New("could not find team b")

	idB, err := parseTeam(BASE + url2)
	if err != nil {
		return ids, err

	ids[0] = idA
	ids[1] = idB
	return ids, nil
コード例 #9
// Parse html
func perseHTML(htmldata *goquery.Document) []string {
	var dates []string

	htmldata.Find("a.bt-open").Each(func(_ int, s *goquery.Selection) {
		if jsonData, ok := s.Attr("id"); ok {


			//analyze json object
			var jsonObject map[string]interface{}
			//json.JsonAnalyze(jsonData, &jsonObject)
			json.Unmarshal([]byte(jsonData), &jsonObject)

			//extract date from json object
			//e.g. 2016-02-27 03:30:00
			strDate := jsonObject["field19"].(string)
			if isTimeApplicable(strDate) {
				dates = append(dates, strDate)

	return dates
コード例 #10
ファイル: spider.go プロジェクト: luzh0422/spider-docker
// get friends' friends info
// GetFFInfo reads a user's name and sex from the text of the third "div.c"
// element (index 2) of query and stores them, together with
// crawlUrl.FatherId and crawlUrl.Id, in a local User value.
//
// The name is the part after the first ":" of the text between ":" and
// "认证" inside the span between ":" and "性别"; the sex is the part after
// ":" of the text between "性别" and "地区". Both use index [1] of a
// strings.Split result without a length check, so text lacking the expected
// ":" separator makes this function panic.
//
// NOTE(review): this excerpt looks truncated — whatever consumes user after
// the last assignment, and the closing braces, are not visible. The exact
// behaviour of GetBetweenStr is defined elsewhere.
func (w *SocialWorker) GetFFInfo(query *goquery.Document) {
	var user User
	// var uid string
	var usex string
	// var usersId []string
	// var usersName []string
	// uidString, _ := query.Find("div.c").Eq(1).Find("a").Attr("href")
	// var digitsRegexp = regexp.MustCompile(`(^|&|\?)uid=([^&]*)(&|$)`)
	// NOTE(review): the next line is the remnant of a block comment whose
	// delimiters are not visible here ("get the uid of the follower's follower").
	 ** 获取粉丝的粉丝的uid(str)
	// str := digitsRegexp.FindStringSubmatch(uidString)
	// uid = crawlUrl.Id
	// usersId = append(usersId, uid)
	uStr := query.Find("div.c").Eq(2).Text()
	nameStr_1 := GetBetweenStr(uStr, ":", "性别")
	nameStr_2 := GetBetweenStr(nameStr_1, ":", "认证")
	nameStr_3 := strings.Split(nameStr_2, ":")
	uname := nameStr_3[1]
	sexStr_1 := GetBetweenStr(uStr, "性别", "地区")
	sexStr_2 := strings.Split(sexStr_1, ":")
	// Anything other than "男" (male) is recorded with the else-branch value.
	if sexStr_2[1] == "男" {
		usex = "male"
	} else {
		// NOTE(review): "famale" is spelled this way in the stored value; confirm
		// whether consumers depend on it before correcting.
		usex = "famale"

	user.uid = crawlUrl.FatherId
	user.friendid = crawlUrl.Id
	user.uname = uname
	user.usex = usex
コード例 #11
ファイル: main.go プロジェクト: coolhacks/gohn
func getItems(doc *goquery.Document) (items []item, maxWidth int) {
	doc.Find("td.title a").EachWithBreak(func(i int, s *goquery.Selection) bool {
		if i == maxItems {
			return false

		if s.Text() == "More" {
			return true

		href, _ := s.Attr("href")
		title := s.Text()
		points := s.Parent().Parent().Next().Find("span").Text()
		a, b := len(fmt.Sprintf("%s (%s)", title, points)), len(href)
		maxWidth = max(a, b, maxWidth)

		items = append(items, item{
			title:  title,
			url:    href,
			points: points,

		return true
コード例 #12
ファイル: spider.go プロジェクト: luzh0422/spider-docker
// get friends url
// GetFriendsUrl walks the last-child anchors of the table rows inside
// "div.c" and, for every odd-indexed match, extracts the "uid" query
// parameter from its href, builds "http://weibo.cn/<uid>/info" and appends it
// to Urls. When the href carries no uid, "1" is used in its place. A level is
// appended to UrlsLevel alongside each URL.
//
// NOTE(review): this excerpt looks truncated — the closing braces are not
// visible, and UrlsLevel[i] refers to an i that is not declared in the
// visible code (the callback index is j). The parameter p is only referenced
// from commented-out code here.
func (w *SocialWorker) GetFriendsUrl(query *goquery.Document, p *page.Page) {
	var str_1 string
	// newCrawlUrl := models.CrawlUrl{}
	query.Find("div.c").Find("table").Find("tbody").Find("tr").Find("a:last-child").Each(func(j int, s *goquery.Selection) {
		// Only odd-indexed matches are processed.
		if j%2 != 0 {
			friendsUrlString, _ := s.Attr("href")
			// The pattern is compiled on every callback invocation; group 2 is the uid value.
			var digitsRegexp = regexp.MustCompile(`(^|&|\?)uid=([^&]*)(&|$)`)
			str := digitsRegexp.FindStringSubmatch(friendsUrlString)
			if str == nil {
				str_1 = "1"
			} else {
				str_1 = str[2]
			friendsInfoUrl := "http://weibo.cn/" + str_1 + "/info"
			// newCrawlUrl.Url = "http://weibo.cn/" + str_1 + "/fans"
			// p.AddTargetRequestWithHeaderFile(friendsInfoUrl, "html", "./header.json")
			// newCrawlUrl.Id = str_1
			// newCrawlUrl.Layer = crawlUrl.Layer + 1
			// newCrawlUrl.FatherId = crawlUrl.Id
			// w.SendMessageToSQS(newCrawlUrl)

			Urls = append(Urls, friendsInfoUrl)
			UrlsLevel = append(UrlsLevel, UrlsLevel[i]+1)
コード例 #13
ファイル: reddit.go プロジェクト: studygolang/studygolang
// Parse fetches the resource behind redditUrl and parses it according to the
// rules.
//
// An empty redditUrl defaults to this.domain + this.golang; a URL that does
// not start with "https" gets "https://" prepended. The document is loaded
// with this.newDocumentFromResp, and every "#siteTable .link" element is
// handed to this.dealRedditOneResource, from the last element to the first.
// It returns the load error, or the value of err after the loop.
//
// NOTE(review): this excerpt looks truncated — the closing braces/parenthesis
// and the body of the per-resource error check are not visible. Also note
// that an "http://…" URL does not have the "https" prefix and would therefore
// be prefixed as well; confirm whether that is intended.
func (this *RedditLogic) Parse(redditUrl string) error {
	redditUrl = strings.TrimSpace(redditUrl)
	if redditUrl == "" {
		redditUrl = this.domain + this.golang
	} else if !strings.HasPrefix(redditUrl, "https") {
		redditUrl = "https://" + redditUrl

	var (
		doc *goquery.Document
		err error

	// if doc, err = goquery.NewDocument(redditUrl); err != nil {
	if doc, err = this.newDocumentFromResp(redditUrl); err != nil {
		logger.Errorln("goquery reddit newdocument error:", err)
		return err

	// Handle the entries at the end first so that they are stored first.
	resourcesSelection := doc.Find("#siteTable .link")

	for i := resourcesSelection.Length() - 1; i >= 0; i-- {
		// Each node is wrapped in its own document so the handler receives a Selection rooted at it.
		err = this.dealRedditOneResource(goquery.NewDocumentFromNode(resourcesSelection.Get(i)).Selection)

		if err != nil {

	return err
コード例 #14
ファイル: main.go プロジェクト: danielfireman/phd
// doWork consumes page URLs from links until the channel is closed. Each page
// is fetched with goquery.NewDocument in a retry loop; for every "td.desc"
// cell the text of the following sibling is trimmed of spaces and newlines
// and collected. A non-empty row is joined with *sep and sent on results;
// otherwise a message is written to stderr.
//
// Between attempts it sleeps i * rand.Intn(5) seconds, where i is the attempt
// number, so the delay can be zero.
//
// NOTE(review): this excerpt looks truncated — the loop exits after a
// successful fetch or after maxRetries, the remaining strings.Replace
// arguments, and the closing braces are not visible. If the retry loop can
// end without a document, doc would be nil at doc.Find; confirm against the
// full source.
func doWork(links <-chan string, results chan<- string) {
	for link := range links {
		var doc *goquery.Document
		// Attempts are numbered from 1.
		for i := 1; ; i++ {
			var err error
			doc, err = goquery.NewDocument(link)
			if err == nil {
			fmt.Fprintf(os.Stderr, "[Tentativa %d] Erro tentando processar página de servidor: %s. Erro: %q", i, link, err)
			if i == maxRetries {
				fmt.Fprintf(os.Stderr, "Página não processada: %s", link)
			time.Sleep(time.Duration(i) * time.Duration(rand.Intn(5)) * time.Second)
		var row []string
		doc.Find("td.desc").Each(func(i int, s *goquery.Selection) {
			// The value lives in the element right after each td.desc label cell.
			cell := strings.Replace(
				strings.Trim(s.Next().Text(), " \n"),
			row = append(row, cell)
		if len(row) > 0 {
			results <- strings.Join(row, *sep)
		} else {
			fmt.Fprintf(os.Stderr, "Não achou td.desc: %s\n", link)
コード例 #15
ファイル: scrapers.go プロジェクト: ubuntu-si/arso-api
// ARSOPotresi returs slice of Potres struct
func ARSOPotresi() []Potres {
	var potresi []Potres
	var doc *goquery.Document
	var e error

	if res, found := cacheArso.Get("potresi"); found {
		return res.([]Potres)

	if doc, e = goquery.NewDocument("http://www.arso.gov.si/potresi/obvestila%20o%20potresih/aip/"); e != nil {
		return potresi

	doc.Find("#glavna td.vsebina table tr").Each(func(i int, s *goquery.Selection) {
		magnituda, err := strconv.ParseFloat(s.Find("td:nth-child(4)").Text(), 2)
		if magnituda > 0 && err == nil {
			potres := Potres{}
			potres.Magnituda = magnituda
			potres.Lat, _ = strconv.ParseFloat(s.Find("td:nth-child(2)").Text(), 3)
			potres.Lon, _ = strconv.ParseFloat(s.Find("td:nth-child(3)").Text(), 3)
			potres.Lokacija = s.Find("td:nth-child(6)").Text()
			potres.Datum = s.Find("td:nth-child(1)").Text()
			potresi = append(potresi, potres)
	cacheArso.Set("potresi", potresi, cache.DefaultExpiration)
	return potresi
コード例 #16
ファイル: garfield.go プロジェクト: koffeinsource/notreddit
// garfield is a plugin for www.gocomics.com/garfield pages. It looks up the
// ".strip" element in doc and, when its "src" attribute is a valid request
// URL, sets i.Description to an <img> tag pointing at it. In the branch where
// a ".strip" element exists, i.ImageURL is reset to "".
//
// NOTE(review): this excerpt looks truncated — the body of the sourceURL
// guard (presumably an early return), whatever followed "update title", and
// the closing braces are not visible.
func garfield(i *data.Item, sourceURL string, doc *goquery.Document) {
	if !strings.Contains(sourceURL, "www.gocomics.com/garfield") {

	fmt.Println("Running Garfield plugin.")

	// update title

	selection := doc.Find(".strip")
	if len(selection.Nodes) == 0 {
		fmt.Println("Garfield plugin found no .strip. " + sourceURL)
	} else {
		// More than one match is reported, but only the first node is used.
		if len(selection.Nodes) > 1 {
			fmt.Println("Garfield plugin found >1 .strip. " + sourceURL)
		m := htmlAttributeToMap(selection.Nodes[0].Attr)

		if govalidator.IsRequestURL(m["src"]) {
			i.Description = "<img src =\""
			i.Description += m["src"]
			i.Description += "\" />"
		} else {
			// NOTE(review): the message says "Amazon plugin" inside the Garfield
			// plugin; likely copied from another plugin — confirm before changing.
			fmt.Println("Amazon plugin invalid url. " + m["src"])
		i.ImageURL = ""

コード例 #17
ファイル: tao.go プロジェクト: qgweb/new
// GetAttrbuites collects the product attributes listed in p and returns them
// joined with "##".
//
// Each "#J_AttrUL li" item is split on ":" and appended as "<key>:<value>".
// When that list yields nothing, the raw text of each
// "#attributes .attributes-list li" item is used instead.
//
// NOTE(review): this excerpt looks truncated — closing braces are not
// visible, so it is unclear which statements belong to which branch. Two
// lines look suspicious as shown: the retry split uses the same ":"
// separator as the first split (possibly a different colon character was
// intended), and `as[1] = as[1]` is a self-assignment with no effect.
// Confirm against the full source.
func GetAttrbuites(p *goquery.Document) string {
	attribute := make([]string, 0, 20)
	p.Find("#J_AttrUL li").Each(func(index int, element *goquery.Selection) {
		as := strings.Split(element.Text(), ":")
		if len(as) < 2 {
			as = strings.Split(element.Text(), ":")

		// b holds the value part; it stays empty unless assigned below.
		b := ""

		if len(as) >= 2 && !utf8.ValidString(as[1]) {
			as[1] = as[1]
			b = as[1]

		attribute = append(attribute, as[0]+":"+b)

	// Fallback markup when the primary attribute list produced no entries.
	if len(attribute) == 0 {
		p.Find("#attributes .attributes-list li").Each(func(index int, element *goquery.Selection) {
			attribute = append(attribute, element.Text())

	return strings.Join(attribute, "##")
コード例 #18
ファイル: scrape.go プロジェクト: timchunght/gophers
// scrapeSearch walks every result page of a search. The page count is read
// from the element just before "a.next_page" in document; pages 1..N are then
// fetched via downloadURL(url + "&p=<page>") and each ".user-list-item" is
// parsed into a user (name, email, profile URL, username).
//
// The Atoi error is discarded, so an unparsable page count becomes 0 and no
// page is visited. The username is the profile href without its first byte,
// which panics on an empty href.
//
// NOTE(review): this excerpt looks truncated — what is done with the parsed
// user and the closing braces are not visible.
func scrapeSearch(document *goquery.Document, url string) {
	pagesStr := document.Find("a.next_page").Prev().Text()
	pages, _ := strconv.Atoi(pagesStr)
	page := 1
	for page <= pages {
		pageURL := url + "&p=" + strconv.Itoa(page)
		fmt.Println("Analyzing page: " + pageURL)
		doc := downloadURL(pageURL)
		doc.Find(".user-list-item").Each(func(i int, s *goquery.Selection) {
			email := s.Find("a.email").Text()
			// The second anchor of the item is taken as the profile link.
			profileURL, _ := s.Find("a").Eq(1).Attr("href")
			username := profileURL[1:len(profileURL)]
			profileURL = "http://github.com" + profileURL
			info := s.Find(".user-list-info")
			// Strip the meta list and the links so only the remaining text is left in info.
			_ = info.Find("ul.user-list-meta").Remove()
			_ = info.Find("a").Remove()
			name := strings.TrimSpace(info.Text())
			fmt.Println("Parsed user: " + username)
			user := user{name: name, email: email, url: profileURL, username: username}

		page = page + 1
コード例 #19
ファイル: tao.go プロジェクト: qgweb/new
func GetShopUrl(p *goquery.Document) string {
	href, _ := p.Find(".tb-seller-name").Attr("href")
	if href == "" {
		href, _ = p.Find(".slogo-shopname").Attr("href")
	return strings.TrimSpace("https:" + href)
コード例 #20
ファイル: extractor.go プロジェクト: anyweez/newsflash
func getTerms(doc *goquery.Document) ([]string, error) {
	terms := make([]string, 0)
	doc.Find("p").Each(func(i int, s *goquery.Selection) {
		// Decode any HTML-encoded characters so they can be parsed correctly.
		bdy := html.UnescapeString(s.Text())
		// TODO: condense into a regex?
		bdy = strings.Replace(bdy, "-", " ", -1)
		bdy = strings.Replace(bdy, ",", " ", -1)
		bdy = strings.Replace(bdy, ".", " ", -1)
		bdy = strings.Replace(bdy, ";", " ", -1)
		bdy = strings.Replace(bdy, "\"", " ", -1)
		terms = append(terms, strings.Fields(bdy)...)

	re, err := regexp.Compile("[^A-Za-z0-9]+")
	if err != nil {
		log.Println("Unexpected regex compilation error: " + err.Error())
		return []string{}, err

	for i := 0; i < len(terms); i++ {
		terms[i] = re.ReplaceAllString(terms[i], "")

	return terms, nil
コード例 #21
ファイル: website.go プロジェクト: golibri/website
func descriptionFromDoc(doc *goquery.Document) string {
	sel := "meta[property='og:description']"
	sel += ", meta[name='twitter:description']"
	sel += ", meta[name='description']"
	desc, _ := doc.Find(sel).First().Attr("content")
	return desc
コード例 #22
ファイル: client.go プロジェクト: Kemonozume/nzbcrawler
func (t *TownClient) getSValue() (sValue string) {
	log.WithField("tag", TAG).Info("getting sValue for town login")
	sValue = ""
	var doc *goquery.Document
	var e error
	log.WithField("tag", TAG).Infof("GET %v", ROOT)
	if doc, e = goquery.NewDocument(ROOT); e != nil {
		log.WithField("tag", TAG).Errorf("%s", e.Error())

	doc.Find("input").Each(func(i int, s *goquery.Selection) {
		attr, exists := s.Attr("name")
		if exists == true {
			if attr == "s" {
				bla, exists := s.Attr("value")
				if exists == true {
					sValue = bla

	log.WithField("tag", TAG).Infof("sValue: %v", sValue)
	return sValue
コード例 #23
ファイル: main.go プロジェクト: koansys/isat-smd-missions
// main fetches the NASA science missions page, unpacks every child of the
// ".missions" table body with unpackMission and appends the missions whose
// Phase is "Operating" to missions. It then either marshals missions to JSON
// (when *asJson is true) or iterates over them.
//
// NOTE(review): this excerpt looks truncated — the output statements, the
// handling of the Marshal error and the closing braces are not visible.
func main() {
	var doc *goquery.Document
	var err error

	// log.Fatal terminates the program, so doc is non-nil past this check.
	if doc, err = goquery.NewDocument("http://science.nasa.gov/missions/?group=all"); err != nil {
		log.Fatal("Failed to fetch page")
	doc.Find(".missions").Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
		m := unpackMission(s)
		if m.Phase == "Operating" {
			missions = append(missions, m)

	if *asJson == true {
		b, err := json.Marshal(missions)
		if err != nil {
	} else {
		for _, m := range missions {
コード例 #24
ファイル: subclub.go プロジェクト: riston/subclub-parser
func parse(d *goquery.Document) []Result {

	// Select the tales table
	rowsSel := d.Find("#tale_list > tbody:nth-child(2) > tr")

	// var rows []Result
	rows := make([]Result, rowsSel.Length())

	rowsSel.Each(func(i int, s *goquery.Selection) {

		// Get all the rows children td tags
		tdSel := s.Children()

		rows[i] = Result{
			ID:           getMovieID(tdSel.Eq(1)),
			Name:         getMovieName(tdSel.Eq(1)),
			SubName:      getMovieSubName(tdSel.Eq(1)),
			Views:        getViews(tdSel.Eq(4)),
			Author:       getAuthor(tdSel.Eq(8)),
			FPS:          getFPS(tdSel.Eq(6)),
			SubtitleLink: getSubtitleLink(tdSel.Eq(1)),
			Links:        getMovieLinks(tdSel.Eq(3)),
			Genres:       getGenres(tdSel.Eq(2)),
			Created:      getDate(tdSel.Eq(0)),

	return rows
コード例 #25
ファイル: github.go プロジェクト: hypebeast/gostats
func parseTrendingRepos(doc *goquery.Document) []GithubRepo {
	var repos []GithubRepo
	var regStars = regexp.MustCompile("[0-9]+")

	doc.Find("li.repo-list-item").Each(func(i int, s *goquery.Selection) {
		title := strings.Trim(s.Find("h3.repo-list-name a").Text(), "\n\t ")
		title = strings.Replace(title, " ", "", -1)
		title = strings.Replace(title, "\n", "", -1)
		description := strings.Trim(s.Find("p.repo-list-description").Text(), "\n\t ")
		url, _ := s.Find("h3.repo-list-name a").Attr("href")
		url = "https://github.com" + url
		starsString := s.Find("p.repo-list-meta").Text()
		starsString = strings.Replace(starsString, ",", "", -1)
		starsString = regStars.FindString(starsString)
		if starsString == "" {
			starsString = "0"
		stars, _ := strconv.Atoi(starsString)

		repo := GithubRepo{
			Title:       title,
			Description: description,
			Url:         url,
			Stars:       stars,
			Forks:       0,
			Date:        time.Now().UTC().Unix(),

		repos = append(repos, repo)

	return repos
コード例 #26
ファイル: ulli2rst.go プロジェクト: siongui/siongui.github.io
func HtmlUlLiToRst(doc *goquery.Document) *goquery.Document {
	for ul := doc.Find("ul").First(); ul.Length() != 0; ul = doc.Find("ul").First() {
		processUl(ul, 0)

	return doc
コード例 #27
ファイル: cleaner.go プロジェクト: ngs/GoOse
// cleanCites visits every <cite> element in doc and returns the same doc.
//
// NOTE(review): this excerpt looks truncated — the body of the Each callback
// and the closing braces are not visible, so what is done to each <cite>
// cannot be determined here (the name suggests removal; confirm).
func (this *cleaner) cleanCites(doc *goquery.Document) *goquery.Document {
	cites := doc.Find("cite")
	cites.Each(func(i int, s *goquery.Selection) {
	return doc
コード例 #28
ファイル: fetcher.go プロジェクト: mattheath/kraken
// extractLinks from a document
func (h *HttpFetcher) extractLinks(doc *goquery.Document) ([]*url.URL, error) {

	// Blank slice to hold the links on this page
	urls := make([]*url.URL, 0)

	// Extract all 'a' elements from the document
	sel := doc.Find("a")
	if sel == nil {
		// Assume zero links on failure
		return nil, nil

	// Range over links, and add them to the list if valid
	for _, n := range sel.Nodes {

		// Validate the node is a link, and extract the target URL
		href, err := h.extractValidHref(n)
		if err != nil || href == "" {

		// Normalise the URL and add if valid
		if uri := h.normaliseUrl(doc.Url, href); uri != nil {
			urls = append(urls, uri)

	return h.dedupeUrls(urls), nil
コード例 #29
ファイル: parser.go プロジェクト: korroktheslavemaster/jee
func GetStudent(d *goquery.Document, rollno int) (s Student, ok bool) {
	//sanity on document
	// if v := d.Find(".titlehead").Children().Text(); v != "JEE (Advanced) - 2013 Result" {
	// 	return Student{}, false
	// }
	dtext := strings.Trim(d.Text(), " ")
	dfields := strings.Fields(dtext)
	for _, v := range dfields {
		s.Plaintext += v + " "
	s.Plaintext = strings.Trim(s.Plaintext, " ")
	if isInvalid(dtext) {
		return s, false
	ok = true
	s.Rollno = rollno
	s.Region = s.Rollno / 10000
	if !isSelected(dtext) {
	s.Selected = true
	s.Rank, _ = strconv.Atoi(d.Find(".style7").First().Text())
	text, _ := d.Find(".titlehead").First().Parent().Next().Children().Children().First().Html()
	tokens := strings.Split(text, "<br/>")
	nameToks := strings.Fields(tokens[1])
	nameToks = nameToks[2:len(nameToks)]
	for _, v := range nameToks {
		s.Name += v + " "
	s.Name = strings.Trim(s.Name, " ")
	s.Q = GetQuota(dtext)
コード例 #30
ファイル: download.go プロジェクト: JFMarket/report-cacher
// Determines whether or not the client is currently logged in based on a goquery.Document.
func loginStatus(doc *goquery.Document) bool {
	if doc.Find(`#user-controls`).Length() > 0 {
		return true

	return false