func crawl(exe_dir string, db *sql.DB) {
	// Fetch the exercise page.
	res, err := http.PostForm("http://shirodanuki.cs.shinshu-u.ac.jp/cgi-bin/olts/sys/exercise.cgi",
		url.Values{
			"name":    {"hoge"},
			"id":      {"hogehoge"},
			"email":   {""},
			"exe_dir": {exe_dir},
			"chapter": {""},
			"url":     {"http://webmizar.cs.shinshu-u.ac.jp/learn/infomath/"},
		},
	)
	if err != nil {
		log.Println(err)
		return
	}
	defer res.Body.Close()

	utf8 := euc2utf8(res.Body)
	doc, err := goquery.NewDocumentFromReader(utf8)
	if err != nil {
		log.Println(err)
		return
	}
	html, _ := doc.Find("blockquote").Html()
	question := strings.TrimSpace(html)
	tmp, _ := doc.Find("input[name=tmp]").Attr("value")

	// Submit an empty answer to obtain the correct one.
	res, err = http.PostForm("http://shirodanuki.cs.shinshu-u.ac.jp/cgi-bin/olts/sys/answer.cgi",
		url.Values{
			"answer":  {""},
			"subject": {""},
			"chapter": {""},
			"url":     {"http://webmizar.cs.shinshu-u.ac.jp/learn/infomath/"},
			"tmp":     {tmp},
		},
	)
	if err != nil {
		log.Println(err)
		return
	}
	defer res.Body.Close()

	utf8 = euc2utf8(res.Body)
	doc, err = goquery.NewDocumentFromReader(utf8)
	if err != nil {
		log.Println(err)
		return
	}
	answer := strings.TrimSpace(doc.Find("blockquote tt b").Text())

	stmt, err := db.Prepare("INSERT INTO `cai` (`exe_dir`, `question`, `answer`) VALUES (?, ?, ?)")
	if err != nil {
		log.Println(err)
		return
	}
	defer stmt.Close()
	if _, err := stmt.Exec(exe_dir, question, answer); err != nil {
		log.Println(err)
	}
}
// ExtractNews will return the proper structures from items
func ExtractNews(newitems []*rss.Item) []NewStruct {
	var newst []NewStruct
	for _, item := range newitems {
		var linkslist []string
		var images []string
		descrip := ""

		// Collect all links.
		if item.Links != nil {
			for _, l := range item.Links {
				linkslist = append(linkslist, l.Href)
			}
		}

		// Prefer the full content over the description when available.
		content := item.Description
		if item.Content != nil {
			content = item.Content.Text
		}

		// Parse the HTML, collecting image sources and the plain text.
		read := strings.NewReader(content)
		doc, err := goquery.NewDocumentFromReader(read)
		if err == nil {
			doc.Find("img").Each(func(i int, s *goquery.Selection) {
				if val, ok := s.Attr("src"); ok {
					images = append(images, val)
				}
			})
			descrip = doc.Text()

			// Some feeds double-encode their HTML, so parse the extracted text once more.
			doc2, err2 := goquery.NewDocumentFromReader(strings.NewReader(descrip))
			if err2 == nil {
				doc2.Find("img").Each(func(i int, s *goquery.Selection) {
					if val, ok := s.Attr("src"); ok {
						images = append(images, val)
					}
				})
				descrip = doc2.Text()
			}
		}

		item.Title, descrip = analyzeTitleDescrip(item.Title, descrip)
		newst = append(newst, NewStruct{"", images, item.Title, descrip, item.PubDate, item.Author.Name, "", linkslist})
	}
	return newst
}
func (c *webCache) load(url string) (*goquery.Document, error) {
	// Serve from the local cache when possible.
	localPath := c.urlToLocal(url)
	if file, err := os.Open(localPath); err == nil {
		defer file.Close()
		return goquery.NewDocumentFromReader(file)
	}

	// Rate-limit network fetches via the ticker.
	<-c.ticker.C

	res, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	var buff bytes.Buffer
	if _, err := buff.ReadFrom(res.Body); err != nil {
		return nil, err
	}
	if err := ioutil.WriteFile(localPath, buff.Bytes(), 0644); err != nil {
		return nil, err
	}
	return goquery.NewDocumentFromReader(&buff)
}
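// A hypothetical constructor sketch for the cache above (not part of the
// original source): it assumes webCache carries the ticker that load() blocks
// on before each network fetch, plus whatever state urlToLocal needs. The
// field names here are illustrative.
func newWebCache(interval time.Duration) *webCache {
	return &webCache{
		ticker: time.NewTicker(interval), // load() waits on ticker.C before each HTTP GET
	}
}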
// Preprocess fetches the HTML page if needed, converts it to UTF-8 and applies
// some text normalisation to guarantee better results when extracting the content.
func (c *Crawler) Preprocess() (*goquery.Document, error) {
	if c.RawHTML == "" {
		c.RawHTML = c.fetchHTML(c.url, c.config.timeout)
	}
	if c.RawHTML == "" {
		return nil, nil
	}

	c.RawHTML = c.addSpacesBetweenTags(c.RawHTML)

	reader := strings.NewReader(c.RawHTML)
	document, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, err
	}

	// The net/html parser and goquery require UTF-8 data.
	cs := c.GetCharset(document)
	if cs != "" && cs != "UTF-8" {
		c.RawHTML = UTF8encode(c.RawHTML, cs)
		reader = strings.NewReader(c.RawHTML)
		document, err = goquery.NewDocumentFromReader(reader)
		if err != nil {
			return nil, err
		}
	}
	return document, nil
}
func TestPostAfterUpdating(t *testing.T) {
	Convey("the post should not be displayed on frontpage", t, func() {
		var recorder = httptest.NewRecorder()
		request, _ := http.NewRequest("GET", "/", nil)
		server.ServeHTTP(recorder, request)
		So(recorder.Code, ShouldEqual, 200)
		doc, _ := goquery.NewDocumentFromReader(recorder.Body)
		sel := doc.Find("article h1").Text()
		So(sel, ShouldBeEmpty)
	})

	Convey("update should return HTTP 200", t, func() {
		var recorder = httptest.NewRecorder()
		request, _ := http.NewRequest("GET", fmt.Sprintf("/api/post/%s/publish", post.Slug), nil)
		cookie := &http.Cookie{Name: "id", Value: sessioncookie}
		request.AddCookie(cookie)
		server.ServeHTTP(recorder, request)
		So(recorder.Body.String(), ShouldEqual, `{"success":"Post published"}`)
		So(recorder.Code, ShouldEqual, 200)
	})

	Convey("after updating, post should be displayed on frontpage", t, func() {
		var recorder = httptest.NewRecorder()
		request, _ := http.NewRequest("GET", "/", nil)
		server.ServeHTTP(recorder, request)
		So(recorder.Code, ShouldEqual, 200)
		doc, _ := goquery.NewDocumentFromReader(recorder.Body)
		sel := doc.Find("article .title").Text()
		So(sel, ShouldEqual, post.Title)
	})

	Convey("the post should be displayed through the API", t, func() {
		var recorder = httptest.NewRecorder()
		request, _ := http.NewRequest("GET", "/api/posts", nil)
		server.ServeHTTP(recorder, request)
		So(recorder.Code, ShouldEqual, 200)
		var posts []Post
		json.Unmarshal(recorder.Body.Bytes(), &posts)
		for i, p := range posts {
			So(i, ShouldEqual, 0)
			So(post.ID, ShouldEqual, p.ID)
			So(post.Title, ShouldEqual, p.Title)
			So(post.Content, ShouldEqual, p.Content)
			So(post.Markdown, ShouldEqual, p.Markdown)
			So(post.Slug, ShouldEqual, p.Slug)
			So(post.Author, ShouldEqual, p.Author)
			So(post.Created, ShouldBeGreaterThan, int64(1400000000))
			if post.Updated != post.Created {
				So(post.Updated, ShouldAlmostEqual, post.Created, 5)
			}
			So(post.Excerpt, ShouldEqual, p.Excerpt)
		}
	})
}
// Login authenticates with ShopKeep.
// Returns a non-nil error value if login fails.
func (d *Downloader) Login() error {
	// Get the login page.
	lp, err := d.client.Get(d.site)
	if err != nil {
		return errors.New("Could not get: " + d.site)
	}
	defer lp.Body.Close()

	// Pull the login page into a goquery.Document.
	loginPage, err := goquery.NewDocumentFromReader(lp.Body)
	if err != nil {
		return errors.New("Failed to login: Could not read response body.")
	}

	// Determine what the authenticity token is.
	at := authToken(loginPage)
	if at == "" {
		return errors.New("Failed to find authenticity_token.")
	}
	d.authenticity_token = at
	log.Println("Found authenticity_token: " + d.authenticity_token)

	// Get the homepage by posting login credentials.
	hp, err := d.client.PostForm(d.site+"/session", url.Values{
		"authenticity_token": {d.authenticity_token},
		"utf8":               {"✓"},
		"login":              {d.username},
		"password":           {d.password},
		"commit":             {"Sign in"},
	})
	if err != nil {
		return errors.New("Failed POSTing login form: " + err.Error())
	}
	defer hp.Body.Close()

	// Pull the homepage response into a goquery.Document.
	homePage, err := goquery.NewDocumentFromReader(hp.Body)
	if err != nil {
		return errors.New("Failed to access homepage: " + err.Error())
	}

	// Check the login status.
	// Can't simply check the response status (ShopKeep returns 200 whether login
	// was successful or not), and the location header is not included in the response.
	if !loginStatus(homePage) {
		return errors.New("Invalid username or password")
	}
	log.Println("Login successful!")
	return nil
}
func (user *User) Etl(links []string) {
	mscntRegexp := regexp.MustCompile(`(\d+)人参加`)       // participant count, e.g. "12人参加"
	dateRegexp := regexp.MustCompile(`0?(\d+)月0?(\d+)日`) // date, e.g. "07月21日"
	for _, link := range links {
		go func(u User, link string) {
			fmt.Println("Etl <-", link)
			response, err := u.RequestWithCookie(link, "GET", nil)
			if err != nil {
				fmt.Println(err)
				return
			}
			defer response.Body.Close()

			rawbody, err := goquery.NewDocumentFromReader(response.Body)
			if err != nil {
				fmt.Printf("error: %s\n", err)
				return
			}

			var mscnt int
			var acdate time.Time
			body := rawbody.Find("div[class='tn-box-content tn-widget-content tn-corner-all']")
			subject := rawbody.Find("h1[class='tn-helper-reset tn-text-heading']").Text()
			body.Find("span[class='tn-action']").Find("a").Each(func(i int, s *goquery.Selection) {
				if mscntContent := mscntRegexp.FindStringSubmatch(s.Text()); len(mscntContent) > 1 {
					cnt, err := strconv.Atoi(mscntContent[1])
					if err != nil {
						panic(err)
					}
					mscnt = cnt
				}
			})
			if datext := body.Find("span[class='tn-date']").Text(); datext != "" {
				ad, _ := time.Parse("2006年01月02日", "2014年"+dateRegexp.FindStringSubmatch(datext)[0])
				acdate = ad
			}

			robberyBody := body.Find("span[class='tn-icon-join tn-icon']").Next()
			robberyText := robberyBody.Text()
			robberyAddr, _ := robberyBody.Attr("href")
			// "我要报名" ("sign me up") means the activity is still open for registration.
			if strings.Contains(robberyText, "我要报名") {
				formResponse, _ := u.RequestWithCookie(domain+robberyAddr, "GET", nil)
				formBody, _ := goquery.NewDocumentFromReader(formResponse.Body)
				if formAddr, formExists := formBody.Find("form").Attr("action"); formExists {
					activity := Activity{subject, acdate, acdate.Weekday(), mscnt, domain + formAddr}
					fmt.Println("Activities <-", activity)
					activities <- activity
				}
			}
		}(*user, link)
	}
}
func parse(s string) []string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(s))
	if err != nil {
		log.Fatalln("parse error", err)
	}
	result := []string{}
	f := func(i int, q *goquery.Selection) {
		q = q.Children()
		if q.Length() != 7 {
			return
		}
		dt := strings.TrimSpace(q.Eq(1).Text())
		name := strings.TrimSpace(q.Eq(2).Text())
		name = strings.Replace(name, "_", "", -1)
		id, _ := q.Eq(2).Find("a").Attr("href")
		id = strings.TrimSpace(id)
		id = strings.Split(id, "=")[1]
		b := strings.TrimSpace(q.Eq(3).Text())
		b = strings.Replace(b, "_", "", -1)
		w := strings.TrimSpace(q.Eq(4).Text())
		w = strings.Replace(w, "_", "", -1)
		result = append(result, fmt.Sprintf("%v_%v_%v_%v_%v", name, dt, b, w, id))
	}
	doc.Find("#table1 tr").Each(f)
	return result
}
func (this *HttpDownloader) downloadHtml(p *page.Page, req *request.Request) *page.Page {
	var err error
	p, destbody := this.downloadFile(p, req)
	if !p.IsSucc() {
		return p
	}

	bodyReader := bytes.NewReader([]byte(destbody))
	var doc *goquery.Document
	if doc, err = goquery.NewDocumentFromReader(bodyReader); err != nil {
		mlog.LogInst().LogError(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	var body string
	if body, err = doc.Html(); err != nil {
		mlog.LogInst().LogError(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "")
	return p
}
func getGbkDoc(client *http.Client, url string) (*goquery.Document, error) {
	retry := 3
get:
	resp, err := client.Get(url)
	if err != nil {
		if retry > 0 {
			retry--
			goto get
		}
		return nil, me(err, "get")
	}
	defer resp.Body.Close()

	// Decode the GBK response into UTF-8 before handing it to goquery.
	r := transform.NewReader(resp.Body, simplifiedchinese.GBK.NewDecoder())
	doc, err := goquery.NewDocumentFromReader(r)
	if err != nil {
		if retry > 0 {
			retry--
			goto get
		}
		return nil, me(err, "new document from response")
	}
	return doc, nil
}
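// A minimal sketch (not from the original source) of the same retry policy
// written as a loop instead of goto: one initial attempt plus up to three
// retries, shared between the fetch and parse steps. It reuses the original's
// me() error wrapper, which is assumed here.
func getGbkDocLoop(client *http.Client, url string) (*goquery.Document, error) {
	var lastErr error
	for attempt := 0; attempt < 4; attempt++ {
		resp, err := client.Get(url)
		if err != nil {
			lastErr = me(err, "get")
			continue
		}
		r := transform.NewReader(resp.Body, simplifiedchinese.GBK.NewDecoder())
		doc, err := goquery.NewDocumentFromReader(r)
		resp.Body.Close()
		if err != nil {
			lastErr = me(err, "new document from response")
			continue
		}
		return doc, nil
	}
	return nil, lastErr
}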
func MakeDoubanSpider() *spiders.Spider {
	spider := &spiders.Spider{}
	spider.Name = "douban_img_spider"
	spider.StartUrls = []string{"http://movie.douban.com/"}
	spider.ParseMap = make(map[string]func(response *http.Response) ([]*http.Request, error))
	// Note: http here is the framework's own request/response wrapper, not net/http.
	spider.ParseMap[spiders.BASE_PARSE_NAME] = func(response *http.Response) ([]*http.Request, error) {
		if response.Request.Depth > 10 {
			return nil, nil
		}
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(response.Body))
		if err != nil {
			return nil, err
		}

		// Find the "next page" link in the pagination block.
		nodes := doc.Find("#page .n").Nodes
		if len(nodes) == 0 {
			return nil, nil
		}
		nextNode := nodes[len(nodes)-1]
		var nextPageLink string
		for _, attr := range nextNode.Attr {
			if attr.Key == "href" {
				nextPageLink = attr.Val
				break
			}
		}

		nextPage := "http://www.baidu.com" + nextPageLink
		request, err := http.NewRequest("GET", nextPage, spider.Name, spiders.BASE_PARSE_NAME, nil, 0)
		if err != nil {
			return nil, err
		}
		return []*http.Request{request}, nil
	}
	return spider
}
func firstURLFromHTML(con *data.Context, body string) ([]string, error) {
	if body == "" {
		return nil, nil
	}
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		return nil, err
	}

	// First() yields at most one anchor, so Each runs at most once.
	var links []string
	doc.Find("a").First().Each(func(i int, s *goquery.Selection) {
		link, exists := s.Attr("href")
		if !exists {
			return
		}
		if strings.Contains(link, "mailto:") {
			return
		}
		links = append(links, link)
		con.Log.Infof("HTML found %v", link)
	})
	return links, nil
}
// Returns the page title or an error. If there is an error, the url is returned as well.
func getPageTitle(url string) (string, error) {
	client := &http.Client{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return url, err
	}
	req.Header.Set("User-Agent", SUFRUserAgent)

	res, err := client.Do(req)
	if err != nil {
		return url, err
	}
	defer res.Body.Close()

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return url, err
	}
	return doc.Find("title").Text(), nil
}
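// Hypothetical usage sketch (not from the original source): because
// getPageTitle returns the url as its first value whenever an error occurs,
// a caller can print the result unconditionally and still show something useful.
func printTitleOrURL(url string) {
	title, err := getPageTitle(url)
	if err != nil {
		log.Printf("falling back to raw URL: %v", err)
	}
	fmt.Println(title)
}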
// Robtex looks up a host at robtex.com.
func Robtex(ip string) (string, Results, error) {
	task := "robtex.com"
	results := Results{}

	resp, err := http.Get("http://www.robtex.com/ip/" + ip + ".html")
	if err != nil {
		return task, results, err
	}
	defer resp.Body.Close()

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return task, results, err
	}

	doc.Selection.Find("#x_summary td:nth-child(1)").Each(func(_ int, s *goquery.Selection) {
		hostname := s.Text()
		// Skip wildcard entries, bare dots and purely numeric cells.
		if strings.Contains(hostname, "*") {
			return
		}
		if hostname == "." {
			return
		}
		if _, err := strconv.Atoi(hostname); err == nil {
			return
		}
		results = append(results, Result{Source: task, IP: ip, Hostname: hostname})
	})
	return task, results, nil
}
// httpRequest uses the given *http.Request to make an HTTP request.
func (bow *Browser) httpRequest(req *http.Request) error {
	bow.preSend()
	resp, err := bow.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	bow.body, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	buff := bytes.NewBuffer(bow.body)
	dom, err := goquery.NewDocumentFromReader(buff)
	if err != nil {
		return err
	}

	bow.history.Push(bow.state)
	bow.state = jar.NewHistoryState(req, resp, dom)
	bow.postSend()
	return nil
}
func TestSanrioNewsReleaseSource(t *testing.T) {
	f, err := os.Open("data/www.sanrio.co.jp/corporate/release/index.html")
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()

	doc, err := goquery.NewDocumentFromReader(f)
	if err != nil {
		t.Fatal(err)
	}
	loc, err := time.LoadLocation("Asia/Tokyo")
	if err != nil {
		t.Fatal(err)
	}

	source := NewSanrioNewsReleaseSource()
	feed, err := source.ScrapeFromDocument(doc)
	if err != nil {
		t.Fatal(err)
	}

	assert.Equal(t, 51, len(feed.Items))
	assert.Equal(t, "ぐでぐでやる気のない「ぐでたま」のイベント九州初上陸! 夏休み企画 「ぐでたま in ふくおか」 7月21日(木)〜 福岡パルコ & sanrio vivitix 天神地下街店にて開催 (PDF)", feed.Items[0].Title)
	assert.Equal(t, "http://www.sanrio.co.jp/wp-content/uploads/2015/05/20160708-1.pdf", feed.Items[0].Link.Href)
	assert.WithinDuration(t, time.Date(2016, 7, 8, 0, 0, 0, 0, loc), feed.Items[0].Created, 0)
	assert.Equal(t, "2016年バレンタイン向けスペシャルギフト「GODIVA &ハローキティ」・「GODIVA &マイメロディ」1月6日(水)よりサンリオ限定販売", feed.Items[50].Title)
	assert.Equal(t, "http://www.sanrio.co.jp/corporate/release/y2016/d0106/", feed.Items[50].Link.Href)
	assert.WithinDuration(t, time.Date(2016, 1, 6, 0, 0, 0, 0, loc), feed.Items[50].Created, 0)
}
// Parse returns the list of StreetInfo for all streets found in the document.
func (parser *WikipediaMoscow) Parse(reader io.Reader) ([]StreetInfo, error) {
	doc, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, err
	}

	// Get links (a tags) inside li tags (li tags must be without id or class).
	liAll := doc.Find("li").FilterFunction(filterLITag).Children().Filter("a")
	parser.result = make([]StreetInfo, 0, liAll.Length())

	// Process every link concurrently and collect the results.
	done := make(chan *StreetInfo, liAll.Length())
	for n := range liAll.Nodes {
		go parser.processLink(n, liAll.Eq(n), done)
	}
	for i := 0; i < liAll.Length(); i++ {
		info := <-done
		if len(info.Name) != 0 {
			parser.result = append(parser.result, *info)
			parser.writer.Print(info)
		}
	}
	return parser.result, nil
}
func TestSubheadRemoval(t *testing.T) {
	html := bytes.NewBufferString(testHTML)
	doc, err := goquery.NewDocumentFromReader(html)
	if err != nil {
		t.Fatal(err)
	}

	extractedBody := ExtractBodyFromDocument(doc, false, false)

	subheads := []string{
		"Depth at forward",
		"New leaders on ‘D’",
		"Go with veterans",
		"Goaltending duties",
	}
	for _, subhead := range subheads {
		if strings.Contains(extractedBody.Text, subhead) {
			t.Fatalf("'%s' is a subhead, which should not appear in text body", subhead)
		}
	}

	actualText := strings.Join(strings.Fields(extractedBody.Text), " ")
	if actualText != testExpectedText {
		t.Fatal("Actual text does not match expected text")
	}
}
func (self *HttpDownloader) downloadHtml(p *context.Response, req *context.Request) *context.Response {
	var err error
	p, destbody := self.downloadFile(p, req)
	if !p.IsSucc() {
		return p
	}

	bodyReader := bytes.NewReader([]byte(destbody))
	var doc *goquery.Document
	if doc, err = goquery.NewDocumentFromReader(bodyReader); err != nil {
		reporter.Log.Println(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	var body string
	if body, err = doc.Html(); err != nil {
		reporter.Log.Println(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "")
	return p
}
func TestManipulatingSettings(t *testing.T) {
	Convey("when manipulating the global Settings variable", t, func() {
		Convey("should save the changes to disk", func() {
			settings = *Settings
			settings.Name = "Juuso's Blog"
			s, err := settings.Update()
			if err != nil {
				panic(err)
			}
			Settings = s
		})

		Convey("frontpage's <title> should now be 'Juuso's Blog'", func() {
			var recorder = httptest.NewRecorder()
			request, _ := http.NewRequest("GET", "/", nil)
			server.ServeHTTP(recorder, request)
			doc, _ := goquery.NewDocumentFromReader(recorder.Body)
			sel := doc.Find("title").Text()
			So(sel, ShouldEqual, Settings.Name)
		})
	})
	TestSettingValues(t)
}
func TestJobsRegionsPaginate(t *testing.T) {
	pg := []struct {
		from, to int
	}{
		{0, 1},
		{0, 2},
		{1, 4},
	}
	for _, page := range pg {
		for _, reg := range regionsSample {
			paginate := fmt.Sprintf("%s/jobs/regions/%s/%d/%d", ts.URL, reg.short, page.from, page.to)
			b, err := com.HttpGetBytes(client, paginate, nil)
			if err != nil {
				t.Errorf("getting regions home page %v", err)
			}

			doc, err := goquery.NewDocumentFromReader(bytes.NewReader(b))
			if err != nil {
				t.Errorf("loading document %v", err)
				continue
			}

			s := doc.Find(".job-item")
			d := page.to - page.from
			if s.Length() != d {
				t.Errorf("expected %d got %d", d, s.Length())
			}
		}
	}
}
func newDocFromString(s string) *goquery.Document {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(s))
	if err != nil {
		log.Fatal(err)
	}
	return doc
}
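// Hypothetical usage sketch (not from the original source): the helper above
// lets callers build a queryable document from an inline HTML literal in one call.
func exampleNewDocFromString() {
	doc := newDocFromString(`<ul><li>a</li><li>b</li></ul>`)
	fmt.Println(doc.Find("li").Length()) // 2
}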
func (p ProxyHTTP) ProcessArticles(ua []content.UserArticle) []content.UserArticle {
	if len(ua) == 0 {
		return ua
	}

	p.logger.Infof("Proxying urls of feed '%d'\n", ua[0].Data().FeedId)

	for i := range ua {
		data := ua[i].Data()

		if d, err := goquery.NewDocumentFromReader(strings.NewReader(data.Description)); err == nil {
			if processor.ProxyArticleLinks(d, p.urlTemplate) {
				if content, err := d.Html(); err == nil {
					// The html parser wraps fragments in html, head and body tags;
					// strip that envelope before storing the description.
					content = content[strings.Index(content, "<body>")+6 : strings.LastIndex(content, "</body>")]
					data.Description = content
					ua[i].Data(data)
				}
			}
		}
	}
	return ua
}
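// An alternative sketch (not from the original source) for stripping the
// html/head/body envelope without index arithmetic: goquery can serialize the
// inner markup of the first <body> element directly.
func innerBodyHTML(fragment string) (string, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(fragment))
	if err != nil {
		return "", err
	}
	// Html() returns only the children of the selected element.
	return doc.Find("body").Html()
}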
func TestActiveHeader(t *testing.T) {
	record := request.TestServer{t, TestHandler}
	testcases := []struct {
		Path string
		Link string
	}{
		{"/about", "About"},
		{"/contact", "Contact"},
		{"/help", "Help"},
		{"/", "Home"},
	}

	It("should highlight the correct header link", func() {
		for _, test := range testcases {
			ctx := record.Get(test.Path)
			reader := strings.NewReader(ctx.ResponseRecorder.Body.String())
			doc, err := goquery.NewDocumentFromReader(reader)
			if err != nil {
				t.Fatal(err)
			}
			active := strings.TrimSpace(doc.Find(".nav-item.active").First().Text())
			assert.Equal(t, test.Link, active)
		}
	})
}
func sfvTask(url string) {
	res, err := http.Get(url)
	if err != nil {
		fmt.Println("got error:", err)
		return
	}
	defer res.Body.Close()

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		fmt.Println("got error:", err)
		return
	}

	doc.Find("div.textblock").Each(func(i int, s *goquery.Selection) {
		alt, ok := s.Find("IMG").Attr("alt")
		if !ok {
			// No alt attribute found; nothing to check in this block.
			return
		}
		fmt.Println("got from html as result:", alt)
		if strings.Contains(alt, NOT_AVAILABLE) {
			fmt.Println(time.Now().Format("2006-01-02 15:04:05"), alt, ", please wait for availability")
		} else {
			notify("Hi Sfver:</br> Some of SFVs are available! please try one via the link below: </br><a href=\"http://www.immigration.govt.nz/migrant/stream/work/silverfern/jobsearch.htm\">http://www.immigration.govt.nz/migrant/stream/work/silverfern/jobsearch.htm</a>")
			fmt.Println(time.Now().Format("2006-01-02 15:04:05"), "Some of SFVs are available! :-)")
		}
	})
}
func (h *HttpClient) FetchDocument(method, urlStr string, query map[string]string) (*goquery.Document, error) {
	buf, err := h.Do(method, urlStr, query)
	if err != nil {
		return nil, err
	}
	return goquery.NewDocumentFromReader(bytes.NewReader(buf))
}
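// Hypothetical usage sketch (not from the original source): FetchDocument
// hides the transport details, so a caller deals only with the parsed
// document. The URL and query keys below are illustrative.
func exampleFetchDocument(h *HttpClient) error {
	doc, err := h.FetchDocument("GET", "http://example.com/list", map[string]string{"page": "1"})
	if err != nil {
		return err
	}
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		if href, ok := s.Attr("href"); ok {
			fmt.Println(href)
		}
	})
	return nil
}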
func (p InsertThumbnailTarget) ProcessArticles(ua []content.UserArticle) []content.UserArticle {
	if len(ua) == 0 {
		return ua
	}

	p.logger.Infof("Inserting thumbnail targets for feed '%d'\n", ua[0].Data().FeedId)

	for i := range ua {
		data := ua[i].Data()
		if data.ThumbnailLink == "" {
			continue
		}

		if d, err := goquery.NewDocumentFromReader(strings.NewReader(data.Description)); err == nil {
			if insertThumbnailTarget(d, data.ThumbnailLink, p.logger) {
				if content, err := d.Html(); err == nil {
					// The html parser wraps fragments in html, head and body tags;
					// strip that envelope before storing the description.
					content = content[strings.Index(content, "<body>")+6 : strings.LastIndex(content, "</body>")]
					data.Description = content
					ua[i].Data(data)
				}
			}
		}
	}
	return ua
}
func (s FromReaderScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error) {
	var wg sync.WaitGroup
	if err := validateSelector(selector); err != nil {
		return "", nil, err
	}

	jobId := "READER" + GenerateStringKey(selector)
	log.Printf("INFO: Scrap [%v] from Reader started\n", jobId)
	items := make(chan ItemResult, bufferItemsSize)

	wg.Add(1)
	go func() {
		// Release the WaitGroup even when parsing fails, otherwise
		// closeItemsChannel would block forever.
		defer wg.Done()
		doc, err := goquery.NewDocumentFromReader(*s.reader)
		if err != nil {
			log.Println("ERROR Scrapping ", selector.Url, " with message", err.Error())
			return
		}
		DocumentScrap(jobId, selector, doc, items)
	}()
	closeItemsChannel(jobId, items, &wg)
	return jobId, items, nil
}
func (notify *NotifyParams) extractHTML() error {
	reader := strings.NewReader(notify.RawHTML)
	document, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return err
	}

	notify.ParsedHTML.Doctype, err = notify.extractDoctype(notify.RawHTML)
	if err != nil {
		return err
	}

	notify.ParsedHTML.Head, err = notify.extractHead(document)
	if err != nil {
		return err
	}

	// Re-serialize the body tag's attributes as a single string.
	bodyAttributes := ""
	for _, attribute := range document.Find("body").Nodes[0].Attr {
		bodyAttributes += " " + attribute.Key + `="` + attribute.Val + `"`
	}
	bodyAttributes = strings.TrimPrefix(bodyAttributes, " ")

	bodyContent, err := document.Find("body").Html()
	if err != nil {
		return err
	}
	if bodyContent != "" {
		notify.ParsedHTML.BodyAttributes = bodyAttributes
		notify.ParsedHTML.BodyContent = bodyContent
	}
	return nil
}
func getListDetail(m string, s []byte) []string {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(s))
	checkError(err)

	arr := []string{}
	dir := "./detail/" + m
	checkError(os.MkdirAll(dir, 0777))

	doc.Find("div.cont dl").Each(func(i int, s *goquery.Selection) {
		title := s.Find("dt a").Text()
		link, _ := s.Find("dt a").Attr("href")
		url := "http://www.safe10000.com" + link
		file := dir + "/" + path.Base(url)
		arr = append(arr, url)
		c := getContent(url, file)
		fmt.Printf("Review %d: %s - %s - %s\n", i, url, title, file)
		parseDetailHtml(m, c, path.Base(url))
	})
	return arr
}