Beispiel #1
2
func crawl(exe_dir string, db *sql.DB) {
	res, _ := http.PostForm("http://shirodanuki.cs.shinshu-u.ac.jp/cgi-bin/olts/sys/exercise.cgi",
		url.Values{
			"name":    {"hoge"},
			"id":      {"hogehoge"},
			"email":   {""},
			"exe_dir": {exe_dir},
			"chapter": {""},
			"url":     {"http://webmizar.cs.shinshu-u.ac.jp/learn/infomath/"},
		},
	)
	defer res.Body.Close()
	utf8 := euc2utf8(res.Body)
	doc, _ := goquery.NewDocumentFromReader(utf8)
	html, _ := doc.Find("blockquote").Html()
	question := strings.TrimSpace(html)
	tmp, _ := doc.Find("input[name=tmp]").Attr("value")
	res, _ = http.PostForm("http://shirodanuki.cs.shinshu-u.ac.jp/cgi-bin/olts/sys/answer.cgi",
		url.Values{
			"answer":  {""},
			"subject": {""},
			"chapter": {""},
			"url":     {"http://webmizar.cs.shinshu-u.ac.jp/learn/infomath/"},
			"tmp":     {tmp},
		},
	)
	defer res.Body.Close()
	utf8 = euc2utf8(res.Body)
	doc, _ = goquery.NewDocumentFromReader(utf8)
	answer := strings.TrimSpace(doc.Find("blockquote tt b").Text())
	stmt, _ := db.Prepare("INSERT INTO `cai` (`exe_dir`, `question`, `answer`) VALUES (?, ?, ?)")
	stmt.Exec(exe_dir, question, answer)
}
Beispiel #2
0
// ExtractNews will return the proper structures from items.
// For each feed item it collects all link hrefs, extracts image sources
// and plain text from the HTML content, and builds a NewStruct.
func ExtractNews(newitems []*rss.Item) []NewStruct {
	var newst []NewStruct
	// Loop variable renamed from "new": shadowing the builtin new() is
	// confusing and trips linters.
	for _, item := range newitems {
		var linkslist []string
		var images []string
		descrip := ""

		// Collect every link attached to the item.
		if item.Links != nil {
			for _, l := range item.Links {
				// l is *rss.Link; no need for the intermediate copy the
				// old code made.
				linkslist = append(linkslist, l.Href)
			}
		}

		// Prefer the full content body over the (usually shorter) description.
		content := item.Description
		if item.Content != nil {
			content = item.Content.Text
		}
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
		if err == nil {
			doc.Find("img").Each(func(i int, s *goquery.Selection) {
				if val, ok := s.Attr("src"); ok {
					images = append(images, val)
				}
			})
			descrip = doc.Text()

			// Some feeds double-encode their HTML; a second pass over the
			// extracted text catches images and markup hidden inside it.
			doc2, err2 := goquery.NewDocumentFromReader(strings.NewReader(descrip))
			if err2 == nil {
				doc2.Find("img").Each(func(i int, s *goquery.Selection) {
					if val, ok := s.Attr("src"); ok {
						images = append(images, val)
					}
				})
				descrip = doc2.Text()
			}
		}

		item.Title, descrip = analyzeTitleDescrip(item.Title, descrip)

		newst = append(newst, NewStruct{"", images, item.Title, descrip, item.PubDate, item.Author.Name, "", linkslist})
	}
	return newst
}
Beispiel #3
0
// load returns the parsed document for url, serving it from the local
// file cache when present. On a cache miss it waits for one tick of
// c.ticker (rate limiting), fetches the page, writes the raw bytes to
// the cache path, and parses them.
func (c *webCache) load(url string) (*goquery.Document, error) {
	localPath := c.urlToLocal(url)

	// Cache hit: parse straight from the cached file.
	if file, err := os.Open(localPath); err == nil {
		defer file.Close()
		return goquery.NewDocumentFromReader(file)
	}

	// Rate-limit outgoing requests to one per ticker interval.
	<-c.ticker.C

	res, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	// Buffer the body so it can be both written to disk and parsed.
	// NOTE(review): the response status is not checked, so error pages
	// (404/500) get cached as if they were valid content — confirm intent.
	var buff bytes.Buffer
	if _, err := buff.ReadFrom(res.Body); err != nil {
		return nil, err
	}

	if err := ioutil.WriteFile(localPath, buff.Bytes(), 0644); err != nil {
		return nil, err
	}

	// WriteFile used buff.Bytes() without consuming the buffer, so the
	// parser can still drain it here.
	return goquery.NewDocumentFromReader(&buff)
}
Beispiel #4
0
// Preprocess fetches the HTML page if needed, converts it to UTF-8 and applies
// some text normalisation to guarantee better results when extracting the content
func (c *Crawler) Preprocess() (*goquery.Document, error) {
	// Fetch the raw page only when it was not supplied up front.
	if c.RawHTML == "" {
		c.RawHTML = c.fetchHTML(c.url, c.config.timeout)
	}
	// Nothing to work with: report neither a document nor an error.
	if c.RawHTML == "" {
		return nil, nil
	}

	// Normalise the markup before handing it to the parser.
	c.RawHTML = c.addSpacesBetweenTags(c.RawHTML)

	document, err := goquery.NewDocumentFromReader(strings.NewReader(c.RawHTML))
	if err != nil {
		return nil, err
	}

	// Re-encode and re-parse when the declared charset is not UTF-8,
	// since the net/html parser and goquery require UTF-8 input.
	if cs := c.GetCharset(document); cs != "" && cs != "UTF-8" {
		c.RawHTML = UTF8encode(c.RawHTML, cs)
		document, err = goquery.NewDocumentFromReader(strings.NewReader(c.RawHTML))
		if err != nil {
			return nil, err
		}
	}

	return document, nil
}
Beispiel #5
0
// TestPostAfterUpdating exercises the publish flow end-to-end: an
// unpublished post must be absent from the frontpage, publishing via the
// API must succeed, and the post must then appear on the frontpage and in
// the API listing with all fields intact.
func TestPostAfterUpdating(t *testing.T) {

	Convey("the post should not be displayed on frontpage", t, func() {
		var recorder = httptest.NewRecorder()
		request, _ := http.NewRequest("GET", "/", nil)
		server.ServeHTTP(recorder, request)
		So(recorder.Code, ShouldEqual, 200)
		// While unpublished, no <article> heading should render at all.
		doc, _ := goquery.NewDocumentFromReader(recorder.Body)
		sel := doc.Find("article h1").Text()
		So(sel, ShouldBeEmpty)
	})

	Convey("update should return HTTP 200", t, func() {
		var recorder = httptest.NewRecorder()
		request, _ := http.NewRequest("GET", fmt.Sprintf("/api/post/%s/publish", post.Slug), nil)
		// Authenticate with the session cookie prepared by earlier tests.
		cookie := &http.Cookie{Name: "id", Value: sessioncookie}
		request.AddCookie(cookie)
		server.ServeHTTP(recorder, request)
		So(recorder.Body.String(), ShouldEqual, `{"success":"Post published"}`)
		So(recorder.Code, ShouldEqual, 200)
	})

	Convey("after updating, post should be displayed on frontpage", t, func() {
		var recorder = httptest.NewRecorder()
		request, _ := http.NewRequest("GET", "/", nil)
		server.ServeHTTP(recorder, request)
		So(recorder.Code, ShouldEqual, 200)
		// Once published, the post title must appear in the article list.
		doc, _ := goquery.NewDocumentFromReader(recorder.Body)
		sel := doc.Find("article .title").Text()
		So(sel, ShouldEqual, post.Title)
	})

	Convey("the post should not be displayed trough API", t, func() {
		var recorder = httptest.NewRecorder()
		request, _ := http.NewRequest("GET", "/api/posts", nil)
		server.ServeHTTP(recorder, request)
		So(recorder.Code, ShouldEqual, 200)
		var posts []Post
		json.Unmarshal(recorder.Body.Bytes(), &posts)
		for i, p := range posts {
			// Exactly one post is expected; every field must match the fixture.
			So(i, ShouldEqual, 0)
			So(post.ID, ShouldEqual, p.ID)
			So(post.Title, ShouldEqual, p.Title)
			So(post.Content, ShouldEqual, p.Content)
			So(post.Markdown, ShouldEqual, p.Markdown)
			So(post.Slug, ShouldEqual, p.Slug)
			So(post.Author, ShouldEqual, p.Author)
			// NOTE(review): the timestamp checks below read fields of the
			// package-level `post`, not the fetched `p` — presumably
			// intentional, but worth confirming.
			So(post.Created, ShouldBeGreaterThan, int64(1400000000))
			if post.Updated != post.Created {
				So(post.Updated, ShouldAlmostEqual, post.Created, 5)
			}
			So(post.Excerpt, ShouldEqual, p.Excerpt)
		}
	})
}
Beispiel #6
0
// Login() authenticates with ShopKeep.
// Returns a non-nil error value if login fails.
func (d *Downloader) Login() error {
	// Fetch the login page so the CSRF token can be scraped from it.
	loginResp, err := d.client.Get(d.site)
	if err != nil {
		return errors.New("Could not get: " + d.site)
	}
	defer loginResp.Body.Close()

	loginDoc, err := goquery.NewDocumentFromReader(loginResp.Body)
	if err != nil {
		return errors.New("Failed to login: Could not read response body.")
	}

	// The session endpoint rejects posts without the authenticity token.
	token := authToken(loginDoc)
	if token == "" {
		return errors.New("Failed to find authenticity_token.")
	}
	d.authenticity_token = token
	log.Println("Found authenticity_token: " + d.authenticity_token)

	// Submit the credentials to establish a session.
	form := url.Values{
		"authenticity_token": {d.authenticity_token},
		"utf8":               {"✓"},
		"login":              {d.username},
		"password":           {d.password},
		"commit":             {"Sign in"},
	}
	homeResp, err := d.client.PostForm(d.site+"/session", form)
	if err != nil {
		return errors.New("Failed POSTing login form: " + err.Error())
	}
	defer homeResp.Body.Close()

	homeDoc, err := goquery.NewDocumentFromReader(homeResp.Body)
	if err != nil {
		return errors.New("Failed to access homepage: " + err.Error())
	}

	// ShopKeep answers 200 for both good and bad credentials and sends no
	// location header, so the page content itself decides the outcome.
	if !loginStatus(homeDoc) {
		return errors.New("Invalid username or password")
	}

	log.Println("Login successful!")

	return nil
}
Beispiel #7
0
// Etl fetches every activity page in links concurrently, extracts the
// subject, member count, date and sign-up form address, and pushes each
// resulting Activity onto the package-level activities channel.
func (user *User) Etl(links []string) {
	mscnt_regexp := regexp.MustCompile(`(\d+)人参加`)
	date_regexp := regexp.MustCompile(`0?(\d+)月0?(\d+)日`)
	for _, link := range links {
		go func(u User, link string) {
			fmt.Println("Etl <-", link)
			response, err := u.RequestWithCookie(link, "GET", nil)
			if err != nil {
				fmt.Println(err)
				return
			}
			defer response.Body.Close()
			rawbody, err := goquery.NewDocumentFromReader(response.Body)
			if err != nil {
				fmt.Printf("error: %s\n", err)
				return
			}
			var mscnt int
			var acdate time.Time
			body := rawbody.Find("div[class='tn-box-content tn-widget-content tn-corner-all']")
			subject := rawbody.Find("h1[class='tn-helper-reset tn-text-heading']").Text()
			body.Find("span[class='tn-action']").Find("a").Each(func(i int, s *goquery.Selection) {
				if mscnt_content := mscnt_regexp.FindStringSubmatch(s.Text()); len(mscnt_content) > 1 {
					// Was panic(err): a malformed count must not kill the
					// whole process from inside a goroutine.
					if cnt, err := strconv.Atoi(mscnt_content[1]); err == nil {
						mscnt = cnt
					} else {
						fmt.Printf("error: %s\n", err)
					}
				}
			})
			// Guard the date match: the old code indexed the submatch
			// slice unconditionally and panicked on unexpected text.
			if datext := body.Find("span[class='tn-date']").Text(); datext != "" {
				if m := date_regexp.FindStringSubmatch(datext); m != nil {
					ad, _ := time.Parse("2006年01月02日", "2014年"+m[0])
					acdate = ad
				}
			}
			robbery_body := body.Find("span[class='tn-icon-join tn-icon']").Next()
			robbery_text := robbery_body.Text()
			robbery_addr, _ := robbery_body.Attr("href")
			if strings.Contains(robbery_text, "我要报名") {
				// The old code ignored this error and leaked the body; a
				// failed request then caused a nil dereference below.
				form_response, err := u.RequestWithCookie(domain+robbery_addr, "GET", nil)
				if err != nil {
					fmt.Println(err)
					return
				}
				defer form_response.Body.Close()
				form_body, err := goquery.NewDocumentFromReader(form_response.Body)
				if err != nil {
					fmt.Printf("error: %s\n", err)
					return
				}
				if form_addr, form_exists := form_body.Find("form").Attr("action"); form_exists {
					activitie := Activity{subject, acdate, acdate.Weekday(), mscnt, domain + form_addr}
					fmt.Println("Activitys <-", activitie)
					activities <- activitie
				}
			}
		}(*user, link)
	}
}
Beispiel #8
0
// parse extracts one formatted record per qualifying row of #table1.
// Each record is "name_date_b_w_id", where id is the value part of the
// row link's query string.
func parse(s string) []string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(s))
	if err != nil {
		// Fixed typo: message previously read "pare error".
		log.Fatalln("parse error", err)
	}

	result := []string{}
	f := func(i int, q *goquery.Selection) {
		q = q.Children()

		// Only rows with exactly 7 cells carry data.
		if q.Length() != 7 {
			return
		}

		dt := strings.TrimSpace(q.Eq(1).Text())

		name := strings.TrimSpace(q.Eq(2).Text())
		name = strings.Replace(name, "_", "", -1)

		// The id is whatever follows "=" in the link's href. Guard the
		// split: the old code indexed [1] unconditionally and panicked
		// when the href had no "=".
		id, _ := q.Eq(2).Find("a").Attr("href")
		id = strings.TrimSpace(id)
		if parts := strings.Split(id, "="); len(parts) > 1 {
			id = parts[1]
		} else {
			id = ""
		}

		b := strings.TrimSpace(q.Eq(3).Text())
		b = strings.Replace(b, "_", "", -1)
		w := strings.TrimSpace(q.Eq(4).Text())
		w = strings.Replace(w, "_", "", -1)

		result = append(result, fmt.Sprintf("%v_%v_%v_%v_%v", name, dt, b, w, id))
	}

	doc.Find("#table1 tr").Each(f)
	return result
}
// downloadHtml downloads the request target, parses the body as HTML and
// stores both the serialised markup and the parsed document on the page.
func (this *HttpDownloader) downloadHtml(p *page.Page, req *request.Request) *page.Page {
	p, destbody := this.downloadFile(p, req)
	if !p.IsSucc() {
		// The download itself failed; the page already carries the error status.
		return p
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(destbody)))
	if err != nil {
		mlog.LogInst().LogError(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	body, err := doc.Html()
	if err != nil {
		mlog.LogInst().LogError(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "")
	return p
}
Beispiel #10
0
// getGbkDoc fetches url with the given client, decodes the GBK-encoded
// body to UTF-8 and parses it into a goquery document. Both the GET and
// the parse are retried; up to 4 attempts are made in total, matching
// the old shared retry budget of 3.
func getGbkDoc(client *http.Client, url string) (*goquery.Document, error) {
	// The original used goto-based retries, which accumulated one
	// deferred Body.Close per attempt until function exit; this loop
	// closes each response promptly instead.
	var lastErr error
	for retry := 3; retry >= 0; retry-- {
		resp, err := client.Get(url)
		if err != nil {
			lastErr = me(err, "get")
			continue
		}
		// Decode GBK on the fly; goquery requires UTF-8 input.
		r := transform.NewReader(resp.Body, simplifiedchinese.GBK.NewDecoder())
		doc, err := goquery.NewDocumentFromReader(r)
		resp.Body.Close()
		if err != nil {
			lastErr = me(err, "new document from response")
			continue
		}
		return doc, nil
	}
	return nil, lastErr
}
Beispiel #11
0
// MakeDoubanSpider builds a spider named "douban_img_spider" that starts
// at movie.douban.com and, on each parsed page, follows the last
// "#page .n" (next-page) link until a crawl depth of 10 is reached.
func MakeDoubanSpider() *spiders.Spider {
	spider := &spiders.Spider{}
	spider.Name = "douban_img_spider"
	spider.StartUrls = []string{"http://movie.douban.com/"}
	spider.ParseMap = make(map[string]func(response *http.Response) ([]*http.Request, error))
	spider.ParseMap[spiders.BASE_PARSE_NAME] = func(response *http.Response) ([]*http.Request, error) {
		// Stop expanding once the crawl is more than 10 levels deep.
		if response.Request.Depth > 10 {
			return nil, nil
		}
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(response.Body))
		if err != nil {
			return nil, err
		}
		// "#page .n" holds the pagination controls; the last node is "next".
		nodes := doc.Find("#page .n").Nodes
		if len(nodes) == 0 {
			// err is nil here, so this is effectively (nil, nil): no more pages.
			return nil, err
		}
		nextNode := nodes[len(nodes)-1]
		attrList := nextNode.Attr
		var nextPageLink string
		for _, attr := range attrList {
			if attr.Key == "href" {
				nextPageLink = attr.Val
				break
			}
		}
		// NOTE(review): the base host is baidu.com although this spider
		// targets douban — looks copy-pasted; confirm the intended host.
		nextPage := "http://www.baidu.com" + nextPageLink
		// NOTE(review): this `http` is a project-local package (7-arg
		// NewRequest); its error result is ignored here.
		request, err := http.NewRequest("GET", nextPage, spider.Name, spiders.BASE_PARSE_NAME, nil, 0)
		requestList := make([]*http.Request, 0)
		requestList = append(requestList, request)
		return requestList, nil
	}
	return spider
}
// firstURLFromHTML returns the first non-mailto href found in body, or
// nil when body is empty or contains no usable link.
func firstURLFromHTML(con *data.Context, body string) ([]string, error) {
	if body == "" {
		return nil, nil
	}
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		return nil, err
	}

	var links []string
	found := false

	// BUG FIX: the previous code called .First() before .Each(), so only
	// the very first anchor was ever inspected and the mailto/found logic
	// below was dead — a leading mailto link made the function return
	// nothing. Iterate all anchors and stop at the first usable one.
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		if found {
			return
		}
		link, exists := s.Attr("href")
		if !exists {
			return
		}
		if strings.Contains(link, "mailto:") {
			return
		}
		links = append(links, link)
		found = true

		con.Log.Infof("HTML found %v", link)
	})

	return links, nil
}
Beispiel #13
0
// Returns the page title or an error. If there is an error, the url is returned as well.
func getPageTitle(url string) (string, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return url, err
	}
	// Identify ourselves with the application's user agent string.
	req.Header.Set("User-Agent", SUFRUserAgent)

	client := &http.Client{}
	res, err := client.Do(req)
	if err != nil {
		return url, err
	}
	defer res.Body.Close()

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return url, err
	}

	return doc.Find("title").Text(), nil
}
Beispiel #14
0
// Robtex looks up a host at robtex.com.
func Robtex(ip string) (string, Results, error) {
	task := "robtex.com"
	results := Results{}

	resp, err := http.Get("http://www.robtex.com/ip/" + ip + ".html")
	if err != nil {
		return task, results, err
	}
	defer resp.Body.Close()

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return task, results, err
	}

	// Walk the first column of the summary table; each usable cell is a hostname.
	doc.Selection.Find("#x_summary td:nth-child(1)").Each(func(_ int, s *goquery.Selection) {
		hostname := s.Text()
		// Skip wildcard entries and the root placeholder.
		if strings.Contains(hostname, "*") || hostname == "." {
			return
		}
		// Skip purely numeric cells — those are counters, not hostnames.
		if _, err := strconv.Atoi(hostname); err == nil {
			return
		}
		results = append(results, Result{Source: task, IP: ip, Hostname: hostname})
	})
	return task, results, nil
}
Beispiel #15
0
// send uses the given *http.Request to make an HTTP request.
func (bow *Browser) httpRequest(req *http.Request) error {
	bow.preSend()
	resp, err := bow.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Keep the raw bytes so the body can be re-read later.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	bow.body = body

	dom, err := goquery.NewDocumentFromReader(bytes.NewBuffer(bow.body))
	if err != nil {
		return err
	}

	// Record the previous state before switching to the new page.
	bow.history.Push(bow.state)
	bow.state = jar.NewHistoryState(req, resp, dom)
	bow.postSend()

	return nil
}
// TestSanrioNewsReleaseSource parses a saved snapshot of the Sanrio news
// release index page and verifies the scraped feed: the item count plus
// the title, link and creation date of the first and last items.
func TestSanrioNewsReleaseSource(t *testing.T) {
	// Fixture: an offline copy of the live index page.
	f, err := os.Open("data/www.sanrio.co.jp/corporate/release/index.html")
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()

	doc, err := goquery.NewDocumentFromReader(f)
	if err != nil {
		t.Fatal(err)
	}

	// Release dates on the page are local to Japan.
	loc, err := time.LoadLocation("Asia/Tokyo")
	if err != nil {
		t.Fatal(err)
	}

	source := NewSanrioNewsReleaseSource()
	feed, err := source.ScrapeFromDocument(doc)
	if err != nil {
		t.Fatal(err)
	}
	assert.Equal(t, 51, len(feed.Items))
	// Newest item (index 0): a PDF release.
	assert.Equal(t, "ぐでぐでやる気のない「ぐでたま」のイベント九州初上陸! 夏休み企画 「ぐでたま in ふくおか」 7月21日(木)〜 福岡パルコ & sanrio vivitix 天神地下街店にて開催 (PDF)", feed.Items[0].Title)
	assert.Equal(t, "http://www.sanrio.co.jp/wp-content/uploads/2015/05/20160708-1.pdf", feed.Items[0].Link.Href)
	assert.WithinDuration(t, time.Date(2016, 7, 8, 0, 0, 0, 0, loc), feed.Items[0].Created, 0)
	// Oldest item (index 50): a regular release page.
	assert.Equal(t, "2016年バレンタイン向けスペシャルギフト「GODIVA &ハローキティ」・「GODIVA &マイメロディ」1月6日(水)よりサンリオ限定販売", feed.Items[50].Title)
	assert.Equal(t, "http://www.sanrio.co.jp/corporate/release/y2016/d0106/", feed.Items[50].Link.Href)
	assert.WithinDuration(t, time.Date(2016, 1, 6, 0, 0, 0, 0, loc), feed.Items[50].Created, 0)
}
Beispiel #17
0
// Parse returns array of the StreetInfo populated by all streets
func (parser *WikipediaMoscow) Parse(reader io.Reader) ([]StreetInfo, error) {
	doc, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, err
	}

	// Get links (a tag) inside li tags (li tags must be without id or class)
	liAll := doc.Find("li").FilterFunction(filterLITag).Children().Filter("a")

	// Single initialisation with the right capacity; the old code also
	// built a zero-capacity slice before parsing, which was a dead store.
	parser.result = make([]StreetInfo, 0, liAll.Length())

	// One buffered slot per link so no sender ever blocks.
	done := make(chan *StreetInfo, liAll.Length())

	for n := range liAll.Nodes {
		go parser.processLink(n, liAll.Eq(n), done)
	}

	// Collect exactly one result per spawned goroutine; entries with an
	// empty name are discarded.
	for i := 0; i < liAll.Length(); i++ {
		info := <-done
		if len(info.Name) != 0 {
			parser.result = append(parser.result, *info)
			parser.writer.Print(info)
		}
	}

	return parser.result, nil
}
Beispiel #18
0
// TestSubheadRemoval verifies that known subheads are stripped from the
// extracted body and that the remaining text matches the expectation.
func TestSubheadRemoval(t *testing.T) {
	html := bytes.NewBufferString(testHTML)
	doc, err := goquery.NewDocumentFromReader(html)
	if err != nil {
		t.Fatal(err)
	}

	extractedBody := ExtractBodyFromDocument(doc, false, false)

	// None of the article's subheads may survive extraction. The old
	// code repeated this check four times with copy-pasted blocks.
	subheads := []string{
		"Depth at forward",
		"New leaders on ‘D’",
		"Go with veterans",
		"Goaltending duties",
	}
	for _, subhead := range subheads {
		if strings.Contains(extractedBody.Text, subhead) {
			t.Fatalf("'%s' is a subhead, which should not appear in text body", subhead)
		}
	}

	// Compare whitespace-normalised text against the expected result.
	actualText := strings.Join(strings.Fields(extractedBody.Text), " ")
	if actualText != testExpectedText {
		// Fixed typo: message previously read "Actiual".
		t.Fatal("Actual text does not match expected text")
	}
}
Beispiel #19
0
// downloadHtml fetches the request body, parses it as HTML, and attaches
// both the serialised markup and the parsed document to the response.
func (self *HttpDownloader) downloadHtml(p *context.Response, req *context.Request) *context.Response {
	p, destbody := self.downloadFile(p, req)
	if !p.IsSucc() {
		// The download itself failed; status is already recorded.
		return p
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(destbody)))
	if err != nil {
		reporter.Log.Println(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	body, err := doc.Html()
	if err != nil {
		reporter.Log.Println(err.Error())
		p.SetStatus(true, err.Error())
		return p
	}

	p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "")
	return p
}
Beispiel #20
0
// TestManipulatingSettings changes the global Settings' blog name,
// persists it, and checks the frontpage <title> reflects the new value.
func TestManipulatingSettings(t *testing.T) {

	Convey("when manipulating the global Settings variable", t, func() {

		Convey("should save the changes to disk", func() {
			// Mutate a copy, persist it, then swap in the saved result.
			settings = *Settings
			settings.Name = "Juuso's Blog"
			saved, err := settings.Update()
			if err != nil {
				panic(err)
			}
			Settings = saved
		})

		Convey("frontpage's <title> should now be 'Juuso's Blog'", func() {
			recorder := httptest.NewRecorder()
			request, _ := http.NewRequest("GET", "/", nil)
			server.ServeHTTP(recorder, request)
			// The rendered <title> must reflect the freshly saved name.
			doc, _ := goquery.NewDocumentFromReader(recorder.Body)
			So(doc.Find("title").Text(), ShouldEqual, Settings.Name)
		})
	})

	TestSettingValues(t)
}
Beispiel #21
0
// TestJobsRegionsPaginate requests region job listings with several
// from/to windows and checks that exactly (to - from) items render.
func TestJobsRegionsPaginate(t *testing.T) {
	windows := []struct {
		from, to int
	}{
		{0, 1},
		{0, 2},
		{1, 4},
	}
	for _, w := range windows {
		for _, reg := range regionsSample {
			target := fmt.Sprintf("%s/jobs/regions/%s/%d/%d", ts.URL, reg.short, w.from, w.to)
			b, err := com.HttpGetBytes(client, target, nil)
			if err != nil {
				t.Errorf("getting regions home page %v", err)
			}
			doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(b)))
			if err != nil {
				t.Errorf("loading document %v", err)
			}
			// The page must contain one .job-item per requested slot.
			want := w.to - w.from
			if got := doc.Find(".job-item").Length(); got != want {
				t.Errorf("expected %d got %d", want, got)
			}
		}
	}
}
Beispiel #22
0
// newDocFromString parses s as HTML; any parse failure aborts the program.
func newDocFromString(s string) *goquery.Document {
	r := strings.NewReader(s)
	document, err := goquery.NewDocumentFromReader(r)
	if err != nil {
		log.Fatal(err)
	}
	return document
}
Beispiel #23
0
// ProcessArticles rewrites the links inside every article description to
// go through the proxy URL template, re-serialising the modified HTML.
func (p ProxyHTTP) ProcessArticles(ua []content.UserArticle) []content.UserArticle {
	if len(ua) == 0 {
		return ua
	}

	p.logger.Infof("Proxying urls of feed '%d'\n", ua[0].Data().FeedId)

	for i := range ua {
		data := ua[i].Data()

		d, err := goquery.NewDocumentFromReader(strings.NewReader(data.Description))
		if err != nil {
			continue
		}
		if !processor.ProxyArticleLinks(d, p.urlTemplate) {
			continue
		}
		content, err := d.Html()
		if err != nil {
			continue
		}

		// net/html tries to provide valid html, adding html, head and body
		// tags; strip them again — but only when both markers are present.
		// The old unguarded slicing panicked if either tag was missing.
		start := strings.Index(content, "<body>")
		end := strings.LastIndex(content, "</body>")
		if start < 0 || end < start+6 {
			continue
		}
		data.Description = content[start+6 : end]
		ua[i].Data(data)
	}

	return ua
}
// TestActiveHeader checks that each page marks its own header link active.
func TestActiveHeader(t *testing.T) {
	record := request.TestServer{t, TestHandler}

	cases := []struct {
		Path string
		Link string
	}{
		{"/about", "About"},
		{"/contact", "Contact"},
		{"/help", "Help"},
		{"/", "Home"},
	}

	It("should highlight the correct header link", func() {
		for _, tc := range cases {
			ctx := record.Get(tc.Path)
			body := ctx.ResponseRecorder.Body.String()
			doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
			if err != nil {
				t.Fatal(err)
			}
			// The active nav item's trimmed text must equal the link label.
			got := strings.TrimSpace(doc.Find(".nav-item.active").First().Text())
			assert.Equal(t, tc.Link, got)
		}
	})
}
Beispiel #25
0
// sfvTask polls the given URL and inspects each div.textblock image's alt
// text to decide whether Silver Fern Visa places are available, sending a
// notification when they are.
func sfvTask(url string) {
	res, err := http.Get(url)
	if err != nil {
		fmt.Println("got error:", err)
		return
	}
	defer res.Body.Close()

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		fmt.Println("got error:", err)
		return
	}

	// Go's reference time is "Mon Jan 2 15:04:05 2006". The old layout
	// "15:04:06" placed the two-digit year where seconds belong, so
	// timestamps rendered as e.g. "... 15:04:16" on any day in 2016.
	const stamp = "2006-01-02 15:04:05"

	doc.Find("div.textblock").Each(func(i int, s *goquery.Selection) {
		alt, ok := s.Find("IMG").Attr("alt")
		if !ok {
			// No alt attribute on this block's image; nothing to check.
			return
		}
		fmt.Println("get from html as result:", alt)
		if strings.Contains(alt, NOT_AVAILABLE) {
			fmt.Println(time.Now().Format(stamp), alt, ", please waiting for available")
		} else {
			notify("Hi Sfver:</br> Some of SFVs are available! please try one via the link below: </br><a href=\"http://www.immigration.govt.nz/migrant/stream/work/silverfern/jobsearch.htm\">http://www.immigration.govt.nz/migrant/stream/work/silverfern/jobsearch.htm</a>")
			fmt.Println(time.Now().Format(stamp), "Some of SFVs are available! :-)")
		}
	})
}
Beispiel #26
0
// FetchDocument performs the request described by method/urlStr/query and
// parses the response body into a goquery document.
func (h *HttpClient) FetchDocument(method, urlStr string, query map[string]string) (*goquery.Document, error) {
	body, err := h.Do(method, urlStr, query)
	if err != nil {
		return nil, err
	}
	reader := bytes.NewReader(body)
	return goquery.NewDocumentFromReader(reader)
}
// ProcessArticles inserts each article's thumbnail link as a target into
// its description HTML, re-serialising the modified markup.
func (p InsertThumbnailTarget) ProcessArticles(ua []content.UserArticle) []content.UserArticle {
	if len(ua) == 0 {
		return ua
	}

	// NOTE(review): this message says "Proxying" although the processor
	// inserts thumbnail targets — looks copy-pasted from ProxyHTTP; left
	// unchanged to keep log output stable.
	p.logger.Infof("Proxying urls of feed '%d'\n", ua[0].Data().FeedId)

	for i := range ua {
		data := ua[i].Data()

		// Nothing to insert without a thumbnail.
		if data.ThumbnailLink == "" {
			continue
		}

		d, err := goquery.NewDocumentFromReader(strings.NewReader(data.Description))
		if err != nil {
			continue
		}
		if !insertThumbnailTarget(d, data.ThumbnailLink, p.logger) {
			continue
		}
		content, err := d.Html()
		if err != nil {
			continue
		}

		// net/html tries to provide valid html, adding html, head and body
		// tags; strip them again — but only when both markers are present.
		// The old unguarded slicing panicked if either tag was missing.
		start := strings.Index(content, "<body>")
		end := strings.LastIndex(content, "</body>")
		if start < 0 || end < start+6 {
			continue
		}
		data.Description = content[start+6 : end]
		ua[i].Data(data)
	}

	return ua
}
Beispiel #28
0
// Scrap parses the scrapper's reader and streams the scraped items on the
// returned channel; the returned string identifies the job.
func (s FromReaderScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error) {
	var wg sync.WaitGroup

	if err := validateSelector(selector); err != nil {
		return "", nil, err
	}

	jobId := "READER" + GenerateStringKey(selector)
	log.Printf("INFO: Scrap [%v] from Reader started\n", jobId)

	items := make(chan ItemResult, bufferItemsSize)
	wg.Add(1)

	go func() {
		// BUG FIX: Done must run on every exit path. The old code skipped
		// it when parsing failed, leaving the WaitGroup — and therefore
		// closeItemsChannel — waiting forever and the channel unclosed.
		defer wg.Done()
		doc, err := goquery.NewDocumentFromReader(*s.reader)
		if err != nil {
			log.Println("ERROR Scrapping ", selector.Url, " with message", err.Error())
			return
		}
		DocumentScrap(jobId, selector, doc, items)
	}()

	closeItemsChannel(jobId, items, &wg)

	return jobId, items, nil
}
Beispiel #29
0
// extractHTML parses RawHTML and fills ParsedHTML with the doctype, head,
// body attributes and body content.
func (notify *NotifyParams) extractHTML() error {
	document, err := goquery.NewDocumentFromReader(strings.NewReader(notify.RawHTML))
	if err != nil {
		return err
	}

	notify.ParsedHTML.Doctype, err = notify.extractDoctype(notify.RawHTML)
	if err != nil {
		return err
	}

	notify.ParsedHTML.Head, err = notify.extractHead(document)
	if err != nil {
		return err
	}

	// Re-assemble the body tag's attributes as space-separated key="value" pairs.
	var attrPairs []string
	for _, attribute := range document.Find("body").Nodes[0].Attr {
		attrPairs = append(attrPairs, attribute.Key+`="`+attribute.Val+`"`)
	}
	bodyAttributes := strings.Join(attrPairs, " ")

	bodyContent, err := document.Find("body").Html()
	if err != nil {
		return err
	}

	// Only record the body when it actually has content.
	if bodyContent != "" {
		notify.ParsedHTML.BodyAttributes = bodyAttributes
		notify.ParsedHTML.BodyContent = bodyContent
	}

	return nil
}
Beispiel #30
0
// getListDetail walks every review entry on the list page, downloads each
// linked detail page under ./detail/<m>/, parses it, and returns the URLs.
func getListDetail(m string, s []byte) []string {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(s))
	checkError(err)

	// Ensure the per-category download directory exists.
	dir := "./detail/" + m
	checkError(os.MkdirAll(dir, 0777))

	arr := []string{}
	// Callback parameter renamed: the original inner "s" shadowed the
	// []byte parameter.
	doc.Find("div.cont dl").Each(func(i int, sel *goquery.Selection) {
		anchor := sel.Find("dt a")
		title := anchor.Text()
		link, _ := anchor.Attr("href")

		url := "http://www.safe10000.com" + link
		file := dir + "/" + path.Base(url)

		arr = append(arr, url)
		c := getContent(url, file)

		fmt.Printf("Review %d: %s - %s - %s\n", i, url, title, file)
		parseDetailHtml(m, c, path.Base(url))
	})
	return arr
}