コード例 #1
0
ファイル: contentishe.go プロジェクト: JAremko/bendera
// GetContentishe returns the URL of a randomly chosen image from a randomly
// chosen page of the category at category_url.
//
// The text of the pagination's "current" element on the category page is
// parsed as the page count. When it is greater than one, a random page
// "<category_url>/<n>" with n in [1, pageCount-1] is loaded; otherwise the
// category page itself is used. A random element under ".image" whose src
// ends in .gif, .png, .jpg or .jpeg is then picked, and its src (or, failing
// that, the href of another random pick) is returned with "?.jpg" appended.
//
// An error is returned when a page cannot be fetched, when no matching
// element is found, or when neither src nor href yields a non-empty value.
func GetContentishe(category_url string) (url string, err error) {
	doc, err := goquery.NewDocument(category_url)
	if err != nil {
		return "", err
	}

	pNav := doc.Find("div.pagination_expanded > span.current").First()
	pageCount, convErr := strconv.Atoi(pNav.Text())
	if convErr != nil {
		// No readable pagination: treat the category as a single page.
		pageCount = 0
	}

	rand.Seed(time.Now().Unix())

	// rand.Intn panics for n <= 0, so a random page can only be drawn when
	// at least two pages exist. Otherwise keep using the category page that
	// is already loaded.
	if pageCount > 1 {
		pageIndex := strconv.Itoa(rand.Intn(pageCount-1) + 1)
		doc, err = goquery.NewDocument(category_url + "/" + pageIndex)
		if err != nil {
			return "", err
		}
	}

	contentishe := doc.Find(".image [src$='.gif']," +
		" .image [src$='.png']," +
		" .image [src$='.jpg']," +
		" .image [src$='.jpeg'] ")
	if contentishe.Length() == 0 {
		return "", errors.New("failed to find contentishe")
	}

	imgSrc, exist := contentishe.Eq(rand.Intn(contentishe.Length())).Attr("src")
	if !exist || imgSrc == "" {
		// Second chance: draw again and use the href of that element.
		imgSrc, exist = contentishe.Eq(rand.Intn(contentishe.Length())).Attr("href")
		if !exist || imgSrc == "" {
			return "", errors.New("bad src and href")
		}
	}
	return imgSrc + "?.jpg", nil
}
コード例 #2
0
ファイル: petaboard.go プロジェクト: rajninorp/pbget
// posts returns the posts found under url that are newer than lastModified.
//
// The number of pages is read from the "page_num" query value of the last
// link inside the first <ul class="pagination"> of the landing page. Pages
// are then requested with comment_order=DESC, and every <div class="post-sla">
// is converted with post. Scanning stops at the first post whose postDate is
// not after lastModified: because the listing is requested in descending
// order, neither the rest of that page nor any later page is fetched.
//
// Fetch failures are passed to errNotNilToPanic.
func posts(url string, lastModified time.Time) []Post {
	doc, err := goquery.NewDocument(url)
	errNotNilToPanic(err)

	pageNumRe := regexp.MustCompile(".*page_num=([0-9]+)$")
	lastPage := 0
	doc.Find("ul").EachWithBreak(func(_ int, s *goquery.Selection) bool {
		class, exist := s.Attr("class")
		if !exist || class != "pagination" {
			return true
		}
		if href, exist := s.Find("li").Find("a").Last().Attr("href"); exist {
			if m := pageNumRe.FindStringSubmatch(href); len(m) > 1 {
				// The capture group is digits only; an out-of-range value
				// simply leaves lastPage at whatever Atoi reports.
				lastPage, _ = strconv.Atoi(m[1])
			}
		}
		// Only the first pagination list is considered.
		return false
	})

	pList := []Post{}
	reachedOld := false
	for page := 1; page <= lastPage && !reachedOld; page++ {
		doc, err := goquery.NewDocument(url + "?comment_order=DESC&page_num=" + strconv.Itoa(page))
		errNotNilToPanic(err)
		doc.Find("div").EachWithBreak(func(_ int, s *goquery.Selection) bool {
			if class, exist := s.Attr("class"); exist && class == "post-sla" {
				p := post(s)
				if !lastModified.Before(p.postDate) {
					// Everything from here on is at least as old as the
					// cutoff, so stop this page and the outer page loop.
					reachedOld = true
					return false
				}
				pList = append(pList, p)
			}
			return true
		})
	}
	return pList
}
コード例 #3
0
ファイル: scraper.go プロジェクト: nonsenz/bbcParty
// BroadcastIds returns the broadcast ids listed in the BBC episode guide of
// the programme showId.
//
// Only the first guide page is read unless all is true, in which case the
// number of the last page is taken from the ".pagination__page--last a"
// element and every page up to it is read. For each ".programme__titles a"
// link the third "/"-separated segment of its href is collected; links
// without an href or with fewer segments are skipped.
//
// A fetch failure terminates the program through log.Fatal.
func BroadcastIds(showId string, all bool) []string {

	continueUntilPage := 1
	showUrl := "https://www.bbc.co.uk/programmes/" + showId + "/episodes/guide?page="
	var broadcastIds []string

	showDoc, err := goquery.NewDocument(showUrl + strconv.Itoa(continueUntilPage))
	if err != nil {
		log.Fatal(err)
	}

	if all {
		// A missing or non-numeric page label yields 0 and keeps the single
		// page default.
		maxPage, _ := strconv.Atoi(showDoc.Find(".pagination__page--last a").Text())
		if maxPage > 0 {
			continueUntilPage = maxPage
		}
	}

	for pageCount := 1; pageCount <= continueUntilPage; pageCount++ {
		// Page 1 is already loaded above.
		if pageCount > 1 {
			showDoc, err = goquery.NewDocument(showUrl + strconv.Itoa(pageCount))
			if err != nil {
				log.Fatal(err)
			}
		}

		showDoc.Find(".programme__titles a").Each(func(_ int, s *goquery.Selection) {
			broadcastLink, _ := s.Attr("href")
			segments := strings.Split(broadcastLink, "/")
			if len(segments) < 3 {
				// Indexing segment 2 would panic on a missing or short href.
				return
			}
			broadcastIds = append(broadcastIds, segments[2])
		})
	}

	return broadcastIds
}
コード例 #4
0
ファイル: ptt.go プロジェクト: kkdai/iloveptt
// parsePttBoardIndex lists the articles of one board index page, prints them
// together with a small pager line, and returns their links.
//
// page is the distance from the newest index page: 0 (or less) reads
// EntryAddress itself. For page > 0 the number embedded in the href of the
// "上頁" button on the entry page is read, page is subtracted from it, and
// the resulting "index<N>.html" page is loaded instead.
//
// Each returned link is BasePttAddress followed by the href of the entry's
// ".title a" element. Fetch failures terminate the program via log.Fatal.
func parsePttBoardIndex(page int) (hrefs []string) {
	doc, err := goquery.NewDocument(EntryAddress)
	if err != nil {
		log.Fatal(err)
	}
	hrefs = make([]string, 0)

	// For page <= 0 the entry page fetched above is the page to list, so it
	// is reused instead of being downloaded a second time.
	if page > 0 {
		maxPageNumberString := ""
		doc.Find(".btn-group a").Each(func(i int, s *goquery.Selection) {
			if !strings.Contains(s.Text(), "上頁") {
				return
			}
			href, exist := s.Attr("href")
			if !exist {
				return
			}
			parts := strings.Split(href, "index")
			if len(parts) < 2 {
				// No "index" marker in the link; indexing parts[1] would panic.
				return
			}
			targetString := strings.Split(parts[1], ".html")[0]
			fmt.Println("total page:", targetString)
			maxPageNumberString = targetString
		})
		// A missing or non-numeric page number is treated as 0.
		pageNum, _ := strconv.Atoi(maxPageNumberString)
		pageNum = pageNum - page
		PageWebSide := fmt.Sprintf("https://www.ptt.cc/bbs/Beauty/index%d.html", pageNum)

		doc, err = goquery.NewDocument(PageWebSide)
		if err != nil {
			log.Fatal(err)
		}
	}

	doc.Find(".r-ent").Each(func(i int, s *goquery.Selection) {
		title := strings.TrimSpace(s.Find(".title").Text())
		// Entries without a numeric counter are shown with 0.
		likeCount, _ := strconv.Atoi(s.Find(".nrec span").Text())
		href, _ := s.Find(".title a").Attr("href")
		link := BasePttAddress + href
		hrefs = append(hrefs, link)
		fmt.Printf("%d:[%d★]%s\n", i, likeCount, title)
	})

	// Print pages
	fmt.Printf("Pages: ")
	for i := page - 3; i <= page+2; i++ {
		if i < 0 {
			continue
		}
		if i == page {
			fmt.Printf("[%v] ", i)
		} else {
			fmt.Printf("%v ", i)
		}
	}
	fmt.Printf("(o: open file in fider, s: top page, n:next, p:prev, quit: quit program)\n")
	return hrefs
}
コード例 #5
0
ファイル: ck101.go プロジェクト: kkdai/photomgr
// ParseCK101PageByIndex loads one page of the CK101 board, stores the URL,
// title and star count of every ".cl_box" entry on p, and returns the number
// of stored titles.
//
// page is zero based: 0 loads p.entryAddress, any larger value loads
// "forum-1345-<page+1>.html". For each entry the title and href of its last
// <a> element are kept, and the star count is parsed from the title of an
// <em> element containing "查看". A fetch failure terminates the program via
// log.Fatal.
func (p *CK101) ParseCK101PageByIndex(page int) int {
	urlList := make([]string, 0)
	postList := make([]string, 0)
	starList := make([]int, 0)

	var PageWebSide string
	page = page + 1 //one base
	if page > 1 {
		PageWebSide = fmt.Sprintf("http://ck101.com/forum-1345-%d.html", page)
	} else {
		PageWebSide = p.entryAddress
	}

	// Only the selected page is downloaded; fetching the entry address first
	// and discarding the result cost an extra request for nothing.
	doc, err := goquery.NewDocument(PageWebSide)
	if err != nil {
		log.Fatal(err)
	}

	doc.Find(".cl_box").Each(func(i int, s *goquery.Selection) {
		title := ""
		url := ""
		starInt := 0
		// Every anchor overwrites the previous one, so the last wins.
		s.Find("a").Each(func(i int, tQ *goquery.Selection) {
			title, _ = tQ.Attr("title")
			url, _ = tQ.Attr("href")
		})
		s.Find("em").Each(func(i int, starC *goquery.Selection) {
			starTitle, _ := starC.Attr("title")
			fmt.Println("star_c:", starTitle)
			if !strings.Contains(starTitle, "查看") {
				return
			}
			star := strings.Replace(starTitle, "查看", "", -1)
			fmt.Println("star:", star)
			// A non-numeric remainder leaves the count at 0.
			starInt, _ = strconv.Atoi(strings.TrimSpace(star))
		})
		urlList = append(urlList, url)
		starList = append(starList, starInt)
		postList = append(postList, title)
	})

	p.storedPostURLList = urlList
	p.storedStarList = starList
	p.storedPostTitleList = postList

	return len(p.storedPostTitleList)
}
コード例 #6
0
ファイル: main0.go プロジェクト: sainoba/mifciencias
// getPagina downloads and parses the page at url.
//
// After a failed first attempt the download is retried immediately, up to
// 1000 more times. If the last attempt still fails, mataPrograma is called
// with the URL and the error.
func getPagina(url string) (doc *goquery.Document) {
	const maxIntentos = 1000

	doc, err := goquery.NewDocument(url)
	intentos := 0
	for err != nil && intentos < maxIntentos {
		doc, err = goquery.NewDocument(url)
		intentos++
	}

	if err != nil {
		mataPrograma(">>Error al obtener la url: "+url, err)
	}
	return
}
コード例 #7
0
ファイル: Test1.go プロジェクト: sugeladi/goWeb
// lsyj scrapes the consular safety alert listings and saves them to an
// .xlsx workbook.
//
// It walks "/default.shtml" followed by "/default_1.shtml" through
// "/default_11.shtml" under URL_LSYJ. Every link matching
// ".ct3_m .news_list li  a" is resolved against URL_LSYJ (after removing the
// first "." of its href) and handed to parseLSYJ together with the sheet.
// The workbook file name contains the number of links processed.
//
// A listing page that cannot be fetched terminates the program via
// log.Fatal; failures of parseLSYJ or of saving the file are only printed.
func lsyj() {

	file := xlsx.NewFile()
	sheet := file.AddSheet("领事安全预警")

	totalUrl := 0

	// scrapeList processes every news link of one listing page.
	scrapeList := func(pageURL string) {
		doc, err := goquery.NewDocument(pageURL)
		if err != nil {
			log.Fatal(err)
		}

		doc.Find(".ct3_m .news_list li  a").Each(func(i int, contentSelection *goquery.Selection) {
			name := contentSelection.Text()
			href, exists := contentSelection.Attr("href")
			if !exists {
				return
			}
			href = URL_LSYJ + strings.Replace(href, ".", "", 1)
			if err := parseLSYJ(sheet, href); err != nil {
				fmt.Printf("[read error]第%d个:%s。url:%s。 %v", i+1, name, href, err)
			}
			totalUrl += 1
		})
	}

	scrapeList(URL_LSYJ + "/default.shtml")

	for i := 1; i <= 11; i++ {
		url := fmt.Sprintf("/default_%d.shtml", i)
		fmt.Printf("第%d个URL:%s", i, url)
		scrapeList(URL_LSYJ + url)
	}

	fileName := fmt.Sprintf("领事安全预警(%d个).xlsx", totalUrl)
	if err := file.Save(fileName); err != nil {
		// Print, not Printf: the error text must not be interpreted as a
		// format string.
		fmt.Print(err.Error())
	}
}
コード例 #8
0
ファイル: p2p_rong360bbs.go プロジェクト: joyplus/finprocess
// Rong360bbsCrawler crawls pages 1 to 5 of the rong360 forum board 76 and
// stores every thread as a topic.
//
// For each thread row of "table#threadlisttableid" with a non-empty title
// link, the thread page is downloaded, the first "td.t_f" inside
// "div#postlist" is taken as the body, each image's src is rewritten from
// its "file" attribute to an absolute URL, and the resulting HTML is saved
// through models.TopicDao.InsertOrUpdate.
//
// Pages or threads that cannot be fetched are logged and skipped.
func Rong360bbsCrawler() {
	beego.Info("Process rong360 bbs-yangmao.")

	for i := 1; i < 6; i++ {
		u := "http://bbs.rong360.com/forum-76-" + strconv.Itoa(i) + ".html"
		beego.Info("Process rong360 bbs-yangmao url: " + u)

		document, err := goquery.NewDocument(u)
		if err != nil {
			// A nil document would panic on Find below.
			beego.Error("rong360 bbs-yangmao: fetching " + u + ": " + err.Error())
			continue
		}

		// All threads on the listing page.
		document.Find("table#threadlisttableid").Find("tbody").Each(func(i int, selection *goquery.Selection) {
			t := selection.Find("th").First().Find("a.s.xst")

			title := t.Text()
			if len(title) == 0 {
				return
			}
			titleUrl, f := t.Attr("href")
			if !f {
				return
			}

			topic := &models.Topic{}
			topic.Node_id = 4
			topic.Uid = 1
			topic.Ord = time.Now().Unix()
			topic.Title = title

			// Fetch the thread body.
			c, err := goquery.NewDocument(titleUrl)
			if err != nil {
				beego.Error("rong360 bbs-yangmao: fetching " + titleUrl + ": " + err.Error())
				return
			}

			content := c.Find("div#postlist").First().Find("td.t_f").First()

			// Replace each image's src with the absolute address built from
			// its "file" attribute.
			content.Find("img").Each(func(i int, se *goquery.Selection) {
				if src, exists := se.Attr("file"); exists {
					se.SetAttr("src", "http://bbs.rong360.com/"+src)
				}
			})

			html, err := content.Html()
			if err != nil {
				// Keep storing the topic, as before, but make the problem visible.
				beego.Error("rong360 bbs-yangmao: rendering " + titleUrl + ": " + err.Error())
			}
			topic.Content = html
			topic.Addtime = time.Now().Unix()
			topic.Updatetime = time.Now().Unix()

			(&models.TopicDao{}).InsertOrUpdate(topic)
		})

	}

}
コード例 #9
0
ファイル: kickass.go プロジェクト: willnix/kickass
// GetMovie searches kat.cr for keyword and returns the movie the result page
// resolves to.
//
// It returns ErrMissingArgument for an empty keyword, ErrMovieNotFound when
// the result page has no ".torrentMediaInfo" element, and ErrParsingFailure
// when the title link, its href or the cover image cannot be read. Errors
// from fetching either page are returned unchanged. Magnet links that are
// missing are stored as empty strings in Sources.
func GetMovie(keyword string) (*Movie, error) {
	if len(keyword) == 0 {
		return nil, ErrMissingArgument
	}

	searchPage, err := goquery.NewDocument("https://kat.cr/usearch/" + keyword)
	if err != nil {
		return nil, err
	}
	if searchPage.Find(".torrentMediaInfo").Length() < 1 {
		return nil, ErrMovieNotFound
	}

	link := searchPage.Find("h1 > a.plain")
	title := link.Text()
	if title == "" {
		return nil, ErrParsingFailure
	}
	url, hasURL := link.Attr("href")
	if !hasURL {
		return nil, ErrParsingFailure
	}

	moviePage, err := goquery.NewDocument("https://kat.cr" + url)
	if err != nil {
		return nil, err
	}

	cover, hasCover := moviePage.Find(".movieCover > img").Attr("src")
	if !hasCover {
		return nil, ErrParsingFailure
	}

	// magnetFor yields the href of the parent of the magnet icon matched by
	// selector, or "" when there is none.
	magnetFor := func(selector string) string {
		href, _ := moviePage.Find(selector).Parent().Attr("href")
		return href
	}

	movie := &Movie{
		Title: title,
		URL:   url,
		Cover: cover,
		Sources: map[string]string{
			"1080p": magnetFor("#tab-1080p i.ka-magnet"),
			"720p":  magnetFor("#tab-720p i.ka-magnet"),
			"hdtv":  magnetFor("#tab-HDRiP i.ka-magnet"),
		},
	}
	return movie, nil
}
コード例 #10
0
ファイル: mzs.go プロジェクト: rose312/mzr
// SelfPage records the images of the page at cururl and of the pages it
// links to through its ".link_pages" elements.
//
// Every ".size-full" element with a src that does not end in "grey.jpg" is
// passed to AddSpiderData together with the current title. The title starts
// as the text of ".main-tags" and is replaced, for all following images, by
// the "title" attribute of any image that has one.
//
// A page that cannot be fetched is reported on standard output and skipped.
func SelfPage(cururl string) {

	x, err := goquery.NewDocument(cururl)
	if err != nil {
		// A nil document would panic on Find below.
		fmt.Println(err)
		return
	}

	// Page title.
	title := x.Find(".main-tags").Text()
	fmt.Println("标题:", title)

	// collect records one visible image; it is shared by the start page and
	// the linked pages so that title updates carry over between them.
	collect := func(idx int, s *goquery.Selection) {
		if title2, ok := s.Attr("title"); ok {
			title = title2
		}
		if v, ok := s.Attr("src"); ok && !strings.HasSuffix(v, "grey.jpg") {
			AddSpiderData(v, title)
		}
	}

	// Images visible on the current page.
	x.Find(".size-full").Each(collect)

	// Follow the first link of every pagination element and read the images
	// visible on the page it opens.
	x.Find(".link_pages").Each(func(idx int, s *goquery.Selection) {
		iurl, ok := s.Find("a").Attr("href")
		if !ok {
			return
		}
		z, err := goquery.NewDocument(iurl)
		if err != nil {
			fmt.Println(err)
			return
		}
		z.Find(".size-full").Each(collect)
	})
}
コード例 #11
0
// main prints the rate found on every currency page linked from
// kuaiyilicai.com's currency overview.
//
// For each link under "ul.list-inline > li.itm" whose href contains
// "uprate", the linked page is fetched and every "div.rate" is printed as
// "<rate>;<date digits>". Text of the form "a.b/c.d..." uses the second
// number as the rate; any other text is expected to contain a single decimal
// number, and its reciprocal (four decimals) is printed instead.
//
// Fetch and parse failures are passed to checkError.
func main() {
	urlMain := `http://www.kuaiyilicai.com`
	urlUpayCurrency := `http://www.kuaiyilicai.com/upcurrency.html`

	// The patterns are constant, so compile them once rather than for every
	// link and rate element.
	var (
		reUprate     = regexp.MustCompile(`.*uprate.*`)
		reSpace      = regexp.MustCompile(`\s`)
		rePairFormat = regexp.MustCompile(`\d+\.\d+\/\d+\.\d+.*`)
		rePairParts  = regexp.MustCompile(`\d+\.\d+\/(\d+\.\d+)[^0-9]*(\d+)-(\d+).*`)
		reSingleRate = regexp.MustCompile(`[^0-9]*(\d+\.\d+)[^0-9]*(\d+-\d+).*`)
		reSingleDate = regexp.MustCompile(`[^0-9]*(\d+\.\d+)[^0-9]*(\d+)-(\d+).*`)
		reSwap       = regexp.MustCompile(`(.*);(.*)` /*Date-%4d;Currency-%.4f*/)
	)

	docUpayCurrency, err := goquery.NewDocument(urlUpayCurrency)
	checkError(err)

	// List all sorts of currency.
	docUpayCurrency.Find(`ul.list-inline > li.itm`).Each(
		func(i int, selUpayCcurrency *goquery.Selection) {
			// Visit every link of this currency.
			selUpayCcurrency.Find(`a`).Each(
				func(i int, sel_sort *goquery.Selection) {
					href, _ := sel_sort.Attr(`href`)
					if !reUprate.MatchString(href) {
						return
					}
					href = urlMain + href
					fmt.Println(href + `  |  ` + selUpayCcurrency.Text())

					// Read the rates from the currency's own page.
					docEachCurrency, err := goquery.NewDocument(href)
					checkError(err)
					docEachCurrency.Find(`div.rate`).Each(
						func(i int, selEachCurrency *goquery.Selection) {
							eachCurrency := reSpace.ReplaceAllString(selEachCurrency.Text(), ``)

							if rePairFormat.MatchString(eachCurrency) {
								// "a.b/c.d ... m-d": keep the date digits and
								// the second number.
								eachCurrency = rePairParts.ReplaceAllString(eachCurrency, `$2$3;$1`)
							} else {
								// Single number: print its reciprocal.
								eachReciprocalCurrency := reSingleRate.ReplaceAllString(eachCurrency, `$1`)
								eachCurrencyDate := reSingleDate.ReplaceAllString(eachCurrency, `$2$3`)
								f, err := strconv.ParseFloat(eachReciprocalCurrency, 32)
								checkError(err)
								eachReciprocalCurrency = strconv.FormatFloat(1/f, 'f', 4, 32)
								eachCurrency = eachCurrencyDate + `;` + eachReciprocalCurrency
							}
							// Swap "date;rate" into "rate;date" for output.
							fmt.Println(reSwap.ReplaceAllString(eachCurrency, `$2;$1`))
						})
				})
		})
}
コード例 #12
0
// getBuildStatus fetches the page at src and reports the state of the first
// table cell under "#repo-info-tab > div.repository" whose text is exactly
// "Finished" or "Error".
//
// It returns "passing" for "Finished", "failing" for "Error", and an empty
// string when no such cell exists. The error is non-nil only when the page
// cannot be fetched.
func getBuildStatus(src string) (string, error) {
	doc, err := goquery.NewDocument(src)
	if err != nil {
		return "", err
	}
	cssPath := "#repo-info-tab > div.repository > table > tbody tr > td"
	var status string
	doc.Find(cssPath).EachWithBreak(func(_ int, s *goquery.Selection) bool {
		switch s.Text() {
		case "Finished":
			status = "passing"
		case "Error":
			status = "failing"
		default:
			// Not a status cell; keep looking.
			return true
		}
		// Only the first status cell counts, so stop here.
		return false
	})

	return status, nil
}
コード例 #13
0
ファイル: robot.go プロジェクト: aleSuglia/slideshare_down
// GetSlideList returns, in document order, the "data-full" attribute of
// every ".slide_image" element on the page at presentationURL.
//
// Elements without that attribute are ignored. A TagNotFoundError is
// returned when nothing was collected, and the fetch error when the page
// cannot be loaded.
func GetSlideList(presentationURL string) ([]string, error) {
	const (
		imageSelector = ".slide_image"
		fullSizeAttr  = "data-full"
	)

	page, err := goquery.NewDocument(presentationURL)
	if err != nil {
		return nil, err
	}

	images := page.Find(imageSelector)
	urls := make([]string, 0, 0)
	for idx := 0; idx < images.Length(); idx++ {
		if full, present := images.Eq(idx).Attr(fullSizeAttr); present {
			urls = append(urls, full)
		}
	}

	if len(urls) < 1 {
		return nil, TagNotFoundError{"No slide sections in the HTML page!"}
	}
	return urls, nil
}
コード例 #14
0
ファイル: thepiratebay.go プロジェクト: Ragnis/autousts
// Search queries thepiratebay.org for query and returns one Result per
// result row that has both a magnet link and a non-negative integer in its
// third cell (stored as Seeders). Rows lacking either are dropped.
//
// options is not consulted. The returned slice is empty, not nil, when no
// row qualifies; the error is non-nil only when the page cannot be fetched.
func (tpb *Thepiratebay) Search(query string, options Options) ([]*Result, error) {
	searchURL := url.URL{
		Scheme: "https",
		Host:   "thepiratebay.org",
		Path:   "/search/" + query + "/0/7/0",
	}

	page, err := goquery.NewDocument(searchURL.String())
	if err != nil {
		return nil, err
	}

	results := []*Result{}
	rows := page.Find("#SearchResults table#searchResult > tbody > tr")
	rows.Each(func(_ int, row *goquery.Selection) {
		if magnet, hasMagnet := row.Find("a[href^=magnet]").Attr("href"); hasMagnet {
			count, convErr := strconv.Atoi(row.Find("td:nth-child(3)").Text())
			if convErr == nil && count >= 0 {
				results = append(results, &Result{
					Name:      row.Find(".detName a.detLink").Text(),
					MagnetURL: magnet,
					Seeders:   uint(count),
				})
			}
		}
	})

	return results, nil
}
コード例 #15
0
ファイル: parser.go プロジェクト: podliy16/gogogog
// parseStartLink asks for a URL on standard input, fetches it, and sorts the
// first link of every ".description" element inside a <tbody> into files:
// links matching the exploit-db exploit pattern are appended to temp.txt
// (one per line), links matching the "/docs/......pdf" pattern are appended
// to waste.txt.
//
// All failures are handed to checkerr.
func parseStartLink() {
	fmt.Println("Input url: ")
	fmt.Scanf("%s", &url)
	firstDoc, err := goquery.NewDocument(url)
	checkerr(err)

	// Constant patterns: compile once instead of once per link.
	exploitRe := regexp.MustCompile(`https://www.exploit-db.com/exploits/.....`)
	docsRe := regexp.MustCompile(`/docs/......pdf`)

	// appendTo appends text to the named file, creating it when needed. The
	// file is closed even if checkerr aborts after a failed write.
	appendTo := func(name, text string) {
		file, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0666)
		checkerr(err)
		defer file.Close()
		_, err = file.WriteString(text)
		checkerr(err)
	}

	firstDoc.Find("tbody").Each(func(i int, tbody *goquery.Selection) {
		tbody.Find(".description").Each(func(j int, s *goquery.Selection) {
			link, _ := s.Find("a").Attr("href")
			if exploitRe.MatchString(link) {
				appendTo("temp.txt", link+"\n")
			}
			if docsRe.MatchString(link) {
				// NOTE(review): written without a line break, unlike
				// temp.txt — confirm whether that is intended.
				appendTo("waste.txt", link)
			}
		})
	})
}
コード例 #16
0
ファイル: gopa.go プロジェクト: Hoodps/golangExample
// getDoc downloads and parses the page at url. A failure is fatal: it is
// reported through log.Fatal.
func getDoc(url string) *goquery.Document {
	document, fetchErr := goquery.NewDocument(url)
	if fetchErr == nil {
		return document
	}
	log.Fatal(fetchErr)
	return document
}
コード例 #17
0
ファイル: mailbox.go プロジェクト: inscriptionweb/yogo
// Flush requests deletion of the mail referenced by the first "div.um a.lm"
// link on the first index page of the mailbox.
//
// The mail id is taken from the link's "id=" value, with leading "m"
// characters removed, and inserted into deleteUrl. Nothing is requested when
// no id is found. Failing to load the index page is fatal; a failed delete
// request is only logged.
func (m *Mailbox) Flush() {
	doc, err := goquery.NewDocument(fmt.Sprintf(indexUrl, m.mail, 1))
	if err != nil {
		log.Fatal(err)
	}

	idUrl, _ := doc.Find("div.um a.lm").First().Attr("href")

	re := regexp.MustCompile("mail.php.b=.*?id=(.*)")
	matches := re.FindStringSubmatch(idUrl)
	if len(matches) != 2 || matches[1] == "" {
		return
	}

	resp, err := http.Get(fmt.Sprintf(deleteUrl, m.mail, strings.TrimLeft(matches[1], "m")))
	if err != nil {
		log.Println(err)
		return
	}
	// The body is not needed, but it must be closed so the connection is
	// released.
	resp.Body.Close()
}
コード例 #18
0
ファイル: crawler.go プロジェクト: hypebeast/gostats
func scrapeTrendingRepos(language string, outDir string) {
	var doc *goquery.Document
	var err error

	filename := dateFilename("github_trending_repos", ".json")
	if outDir != "" {
		filename = path.Join(outDir, filename)
	}

	err = createFile(filename)
	if err != nil {
		Error.Println(err)
		return
	}

	f, err := os.OpenFile(filename, os.O_APPEND|os.O_WRONLY, 0600)
	defer f.Close()
	if err != nil {
		Error.Println(err)
		return
	}

	for _, period := range Periods {
		if doc, err = goquery.NewDocument(fmt.Sprintf("https://github.com/trending?l=%s&since=%s", language, period)); err != nil {
			Error.Println(err)
		}

		repos := readTrendingRepos(doc, period)
		err = writeRepos(f, repos)
		if err != nil {
			Error.Println(err)
			return
		}
	}
}
コード例 #19
0
ファイル: iloveck101.go プロジェクト: kkdai/iloveck101
// crawler downloads the images of the thread page at target using workerNum
// concurrent workers.
//
// The images are stored in "<baseDir>/<thread id> - <title>", where the
// thread id is the first capture group of threadId applied to target and the
// title is the text of "h1#thread_subject". The "file" attribute of every
// image inside the article body is sent to the workers; images without one
// are skipped. crawler returns once all workers have finished.
//
// It panics when the page cannot be fetched, when target contains no thread
// id, or when the directory cannot be created.
func crawler(target string, workerNum int) {
	doc, err := goquery.NewDocument(target)
	if err != nil {
		panic(err)
	}

	idMatch := threadId.FindStringSubmatch(target)
	if len(idMatch) < 2 {
		// Fail with a clear message instead of an index-out-of-range panic.
		panic(fmt.Sprintf("crawler: no thread id found in %q", target))
	}

	title := doc.Find("h1#thread_subject").Text()
	dir := fmt.Sprintf("%v/%v - %v", baseDir, idMatch[1], title)

	if err := os.MkdirAll(dir, 0755); err != nil {
		panic(err)
	}

	linkChan := make(chan string)
	wg := new(sync.WaitGroup)
	for i := 0; i < workerNum; i++ {
		wg.Add(1)
		go worker(dir, linkChan, wg)
	}

	doc.Find("div[itemprop=articleBody] img").Each(func(i int, img *goquery.Selection) {
		imgUrl, ok := img.Attr("file")
		if !ok || imgUrl == "" {
			// Nothing to download for this image.
			return
		}
		linkChan <- imgUrl
	})

	// Closing the channel signals that no more links will be sent.
	close(linkChan)
	wg.Wait()
}
コード例 #20
0
// scrapeISO639 reads Wikipedia's "List of ISO 639-1 codes" and returns one
// iso639.Language per row of the page's wikitable tables, filled from table
// cells 2 through 9 of that row.
//
// The error is non-nil only when the page cannot be fetched.
func scrapeISO639() ([]iso639.Language, error) {
	// alpha-1
	page, err := goquery.NewDocument("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes")
	if err != nil {
		return nil, err
	}

	// text returns the combined text of the elements matched by selector
	// inside row.
	text := func(row *goquery.Selection, selector string) string {
		return row.Find(selector).Text()
	}

	languages := []iso639.Language{}
	rows := page.Find("#mw-content-text > table.wikitable > tbody > tr")
	rows.Each(func(_ int, row *goquery.Selection) {
		languages = append(languages, iso639.Language{
			Family:     text(row, "td:nth-of-type(2) > a"),
			Name:       text(row, "td:nth-of-type(3) > a"),
			NativeName: text(row, "td:nth-of-type(4)"),
			Code1:      text(row, "td:nth-of-type(5)"),
			Code2:      text(row, "td:nth-of-type(6)"),
			Code2B:     text(row, "td:nth-of-type(7)"),
			Code3:      text(row, "td:nth-of-type(8)"),
			Code6:      text(row, "td:nth-of-type(9)"),
		})
	})

	// TODO: scrape alpha3

	return languages, nil
}
コード例 #21
0
ファイル: crawler.go プロジェクト: willwhitney/go-experiments
// Crawl fetches url, prints the text of its <title> elements, and collects
// the page's site-relative wiki links as absolute en.wikipedia.org URLs.
//
// A link is kept when its href starts with a single "/", contains "wiki",
// and contains neither "edit" nor "disambiguation". The collected links are
// sent on urls and also returned. The fetch error is passed to errorify.
func Crawl(url string, urls chan []string) (links []string) {
	page, err := goquery.NewDocument(url)
	errorify(err)

	page.Find("title").Each(func(_ int, heading *goquery.Selection) {
		fmt.Println(heading.Text())
	})

	links = make([]string, 0, 100)
	page.Find("a").Each(func(_ int, anchor *goquery.Selection) {
		href, ok := anchor.Attr("href")
		if !ok {
			return
		}
		// Only links to real pages are wanted.
		if strings.Contains(href, "edit") || strings.Contains(href, "disambiguation") {
			return
		}
		if strings.HasPrefix(href, "//") || strings.HasPrefix(href, "#") {
			return
		}
		if strings.HasPrefix(href, "/") && strings.Contains(href, "wiki") {
			links = append(links, "http://en.wikipedia.org"+href)
		}
	})

	urls <- links
	return links
}
コード例 #22
0
ファイル: main.go プロジェクト: koansys/isat-smd-missions
// main fetches NASA's mission list, appends every mission whose Phase is
// "Operating" to missions, and prints them: as a single JSON document when
// the asJson flag is set, otherwise one mission per line.
//
// Failing to fetch the page or to encode the JSON is fatal.
func main() {
	fmt.Println("starting...")
	flag.Parse()

	page, fetchErr := goquery.NewDocument("http://science.nasa.gov/missions/?group=all")
	if fetchErr != nil {
		log.Fatal("Failed to fetch page")
	}

	rows := page.Find(".missions").Find("tbody").Children()
	rows.Each(func(_ int, row *goquery.Selection) {
		if mission := unpackMission(row); mission.Phase == "Operating" {
			missions = append(missions, mission)
		}
	})

	if !*asJson {
		for _, mission := range missions {
			fmt.Println(mission)
		}
		return
	}

	encoded, err := json.Marshal(missions)
	if err != nil {
		log.Fatal(err)
	}
	os.Stdout.Write(encoded)
}
コード例 #23
0
ファイル: kickass.go プロジェクト: LKarel/autousts
// Search queries kat.cr for query, sorted by seeders in descending order,
// and returns one Result per result row that has both a magnet link and a
// non-negative integer in its fifth cell (stored as Seeders). Rows lacking
// either are dropped.
//
// options is not consulted. The returned slice is empty, not nil, when no
// row qualifies; the error is non-nil only when the page cannot be fetched.
func (k *Kickass) Search(query string, options Options) ([]*Result, error) {
	searchURL := url.URL{
		Scheme:   "https",
		Host:     "kat.cr",
		Path:     "/usearch/" + query + "/",
		RawQuery: "field=seeders&sorder=desc",
	}

	page, err := goquery.NewDocument(searchURL.String())
	if err != nil {
		return nil, err
	}

	results := []*Result{}
	rows := page.Find("#mainSearchTable table.data tr[id]")
	rows.Each(func(_ int, row *goquery.Selection) {
		if magnet, hasMagnet := row.Find("a[title='Torrent magnet link']").Attr("href"); hasMagnet {
			count, convErr := strconv.Atoi(row.Find("td:nth-child(5)").Text())
			if convErr == nil && count >= 0 {
				results = append(results, &Result{
					Name:      row.Find(".cellMainLink").Text(),
					MagnetURL: magnet,
					Seeders:   uint(count),
				})
			}
		}
	})

	return results, nil
}
コード例 #24
0
ファイル: crawler.go プロジェクト: hypebeast/gostats
func scrapeMostStarredRepos(language string, outDir string) {
	var doc *goquery.Document
	var err error

	filename := dateFilename("github_most_starred", ".json")
	if outDir != "" {
		filename = path.Join(outDir, filename)
	}

	err = createFile(filename)
	if err != nil {
		Error.Println(err)
		return
	}

	f, err := os.OpenFile(filename, os.O_APPEND|os.O_WRONLY, 0600)
	defer f.Close()
	if err != nil {
		Error.Println(err)
		return
	}

	for i := 1; i <= 5; i++ {
		if doc, err = goquery.NewDocument(fmt.Sprintf("https://github.com/search?q=stars:>1&type=Repositories&l=%s&p=%d", language, i)); err != nil {
			Error.Println(err)
		}
		repos := readMostStarredRepos(doc)
		err = writeRepos(f, repos)
		if err != nil {
			Error.Println(err)
			return
		}
	}
}
コード例 #25
0
ファイル: main.go プロジェクト: ruqqq/nuswherebot
// getLocationInfoNUS looks up query on the NUS map web search and returns
// the name, longitude and latitude of the first search result.
//
// The coordinates are read from the "long=" and "lat=" values in the
// result's onclick attribute. An error is returned when the page cannot be
// fetched, when the onclick attribute does not contain both values, or when
// either value is not a valid number. On success the slice holds exactly
// one element.
func getLocationInfoNUS(query string) ([]LocationInfo, error) {
	url := fmt.Sprintf("http://map.nus.edu.sg/index.php/search/by/%s", query)
	doc, err := goquery.NewDocument(url)
	if err != nil {
		return nil, err
	}

	s := doc.Find("#search_list a[href=\"javascript:void(0)\"]").First()

	// A missing attribute yields "", which simply fails to match below.
	onclick, _ := s.Attr("onclick")
	regex := regexp.MustCompile("long=([0-9\\.]+?)&lat=([0-9\\.]+?)'")
	matches := regex.FindAllStringSubmatch(onclick, -1)

	if len(matches) == 0 || len(matches[0]) != 3 {
		return nil, fmt.Errorf("Can't find lat and lng from query: %s", query)
	}

	// The pattern also accepts strings such as "1.2.3", so the conversions
	// can fail; report that instead of silently returning 0.
	lng, err := strconv.ParseFloat(matches[0][1], 64)
	if err != nil {
		return nil, fmt.Errorf("parsing lng %q for query %s: %v", matches[0][1], query, err)
	}
	lat, err := strconv.ParseFloat(matches[0][2], 64)
	if err != nil {
		return nil, fmt.Errorf("parsing lat %q for query %s: %v", matches[0][2], query, err)
	}

	location := LocationInfo{
		Name: s.Text(),
		Lng:  lng,
		Lat:  lat,
	}

	return []LocationInfo{location}, nil
}
コード例 #26
0
// prepScrapeCinemaMovies prepares the actual URL for movie showtimes at a
// particular cinema, then starts the actual scraping function in a new
// goroutine.
//
// The cinema's location code is the text between "loc=" and the next "&"
// following the first occurrence of "buyTickets2" in the page's HTML. The
// showtimes URL is built from BASE, that code and today's date formatted as
// day-month-year. context, cinemas and movies are passed through unchanged.
//
// The program is terminated via the log package when the page cannot be
// fetched or rendered, or when the location code cannot be located.
func prepScrapeCinemaMovies(url string, context interface{}, cinemas chan<- []*data.Cinema, movies chan<- []*data.Movie) {

	log.Println("Retrieving document for " + url)
	doc, err := gq.NewDocument(url)
	if err != nil {
		log.Fatal(err)
	}

	allText, err := doc.Html()
	if err != nil {
		log.Fatal(err)
	}

	startIdx := strings.Index(allText, "buyTickets2")
	if startIdx < 0 {
		log.Fatalf("No available source URL")
	}

	// Each marker is checked before slicing: a missing "loc=" or "&" would
	// otherwise produce out-of-range slice bounds.
	rest := allText[startIdx:]
	locIdx := strings.Index(rest, "loc=")
	if locIdx < 0 {
		log.Fatalf("No available source URL")
	}
	rest = rest[locIdx:]
	endLoc := strings.Index(rest, "&")
	if endLoc < 0 {
		log.Fatalf("No available source URL")
	}
	loc := rest[len("loc="):endLoc]

	go scrapeCinemaMovies(BASE+"/buyTickets2.jsp?loc="+loc+"&date="+time.Now().Format("02-01-2006"), context, cinemas, movies)

}
コード例 #27
0
ファイル: wiki.go プロジェクト: alexmullins/aeswiki
// main fetches the page at url, stores the text of each <pre> element
// directly under "#mw-content-text" in prees (indexed by position), and
// checks that the S-box parsed from the table form equals the one parsed
// from the byte form, for both the forward and the inverse S-box.
//
// It panics when the page cannot be fetched, when the number of <pre>
// elements differs from PreCount, or when a pair does not match.
func main() {
	page, fetchErr := goquery.NewDocument(url)
	if fetchErr != nil {
		panic(fetchErr)
	}

	blocks := page.Find("#mw-content-text > pre")
	if blocks.Length() != PreCount {
		panic("Did not find enough elements on the page.")
	}
	blocks.Each(func(idx int, pre *goquery.Selection) {
		prees[PreType(idx)] = pre.Text()
	})

	// Forward S-box: table form against byte form.
	fromTable := parseSBoxTable(prees[SBoxTable])
	fromBytes := parseSBoxBytes(prees[SBoxBytes])
	if !bytes.Equal(fromTable, fromBytes) {
		panic("SBoxTable and SBoxBytes do not match")
	}

	// Inverse S-box: same comparison.
	invFromTable := parseSBoxTable(prees[SBoxInvTable])
	invFromBytes := parseSBoxBytes(prees[SBoxInvBytes])
	if !bytes.Equal(invFromTable, invFromBytes) {
		panic("SBoxInvTable and SBoxInvBytes do not match.")
	}

	fmt.Println("Everything matches.")
}
コード例 #28
0
ファイル: index.go プロジェクト: oywc410/amazon
// getAllData fetches link, prints the first capture group of digitsRegexp
// for the first anchor href of every ".s-result-item", and then sends i on
// chanM.
//
// i is sent on every path, including after a failed fetch, so whoever
// receives from chanM sees one value per call. Items without an href print
// "no href"; items whose href does not match are skipped.
func getAllData(link string, i int) {

	fmt.Println(link)
	doc, err := goquery.NewDocument(link)
	if err != nil {
		// Without a document there is nothing to scan; still report that
		// this call is done.
		fmt.Println(err)
		chanM <- i
		return
	}

	doc.Find(".s-result-item").Each(func(_ int, s *goquery.Selection) {
		linkHref, exists := s.Find("a").Attr("href")
		if !exists {
			fmt.Println("no href")
			return
		}

		all := digitsRegexp.FindStringSubmatch(linkHref)
		if len(all) < 2 {
			// No match: indexing the capture group would panic.
			return
		}
		fmt.Println(all[1])
	})

	chanM <- i

}
コード例 #29
0
// scanFolder walks the directory listing at url recursively.
//
// Every link ending in ".zip" is downloaded and, when download returns a
// non-empty file name, processed in its own goroutine; a slot is borrowed
// from pool before the goroutine starts and returned when it ends. Every
// relative link ending in "/" is treated as a sub-folder and scanned in
// turn. Links back to the folder itself or to a parent ("./", "../...") are
// not followed, since following them would never terminate.
//
// A listing that cannot be fetched terminates the program via log.Fatal.
func scanFolder(url string) {
	doc, err := goquery.NewDocument(url)
	if err != nil {
		log.Fatal(err)
	}

	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		// Anchors without an href yield "" and match neither case below.
		href, _ := s.Attr("href")

		if strings.HasSuffix(href, ".zip") {
			c := pool.Borrow()
			go func() {
				defer pool.Return(c)
				filename := download(url, href)
				if filename != "" {
					process(filename)
				}
			}()
		}

		if href == "./" || strings.HasPrefix(href, "../") {
			return
		}

		if !strings.HasPrefix(href, "/") && strings.HasSuffix(href, "/") {
			log.Printf("%+v", href)
			scanFolder(url + href)
		}
	})
}
コード例 #30
0
ファイル: scrapers.go プロジェクト: ubuntu-si/arso-api
// ARSOPotresi returns the earthquakes listed on the ARSO notification page.
//
// A result cached in cacheArso under "potresi" is returned directly.
// Otherwise the page is fetched and every table row whose fourth cell parses
// as a positive number becomes a Potres: cell 1 is the date, cells 2 and 3
// the latitude and longitude, cell 4 the magnitude and cell 6 the location.
// The result is cached with the default expiration before it is returned.
//
// When the page cannot be fetched, nil is returned and nothing is cached.
func ARSOPotresi() []Potres {
	var potresi []Potres
	var doc *goquery.Document
	var e error

	if res, found := cacheArso.Get("potresi"); found {
		return res.([]Potres)
	}

	if doc, e = goquery.NewDocument("http://www.arso.gov.si/potresi/obvestila%20o%20potresih/aip/"); e != nil {
		return potresi
	}

	doc.Find("#glavna td.vsebina table tr").Each(func(i int, s *goquery.Selection) {
		// ParseFloat's bitSize must be 32 or 64; the results are stored as
		// float64.
		magnituda, err := strconv.ParseFloat(s.Find("td:nth-child(4)").Text(), 64)
		if err != nil || magnituda <= 0 {
			// Header rows and rows without a usable magnitude are skipped.
			return
		}
		potres := Potres{}
		potres.Magnituda = magnituda
		// Unparseable coordinates are stored as 0.
		potres.Lat, _ = strconv.ParseFloat(s.Find("td:nth-child(2)").Text(), 64)
		potres.Lon, _ = strconv.ParseFloat(s.Find("td:nth-child(3)").Text(), 64)
		potres.Lokacija = s.Find("td:nth-child(6)").Text()
		potres.Datum = s.Find("td:nth-child(1)").Text()
		potresi = append(potresi, potres)
	})
	cacheArso.Set("potresi", potresi, cache.DefaultExpiration)
	return potresi
}