Example #1
// ScrapeArticle fetches and parses the article.
// article should be provided as a *Article.
func ScrapeArticle(article Article) error {
	cookies := NewCookieJar()
	client := &http.Client{Jar: cookies}

	// build request
	req, err := http.NewRequest("GET", article.GetLink(), nil)
	if err != nil {
		log.Error("could not create article request:", err)
		return err
	}
	err = buildArticleHeader(req)
	if err != nil {
		log.Error("could not build article request:", err)
		return err
	}

	// send request
	resp, err := client.Do(req)
	if err != nil {
		log.Error("error sending article request:", err)
		return err
	}
	defer resp.Body.Close()

	// TODO: check resp.Header to see if X-Article-Template is [full]

	// parse the response body
	parser := html.NewTokenizer(resp.Body)
	err = article.DoParse(parser)
	if err != nil {
		log.Error("error parsing article:", err)
		return err
	}
	return nil
}
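For context, a minimal call-site sketch; NYTArticle and its Link field are borrowed from the client loop in Example #3, and the URL is a placeholder:

// usage sketch: NYTArticle satisfies Article and is passed by pointer,
// as the doc comment above requires; the URL is a placeholder
article := scraper.NYTArticle{}
article.Link = "https://www.nytimes.com/example"
if err := scraper.ScrapeArticle(&article); err != nil {
	log.Error("could not scrape article:", err)
}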
Example #2
// UpdateRSS finds articles currently in the RSS feed.
// It clears old articles out of the RSS feed before fetching new ones.
// rss should be passed as an *RSS.
func UpdateRSS(rss RSS) error {
	// clear out old articles so we don't double add
	rss.GetChannel().ClearArticles()

	// send request
	resp, err := http.Get(rss.GetLink())
	if err != nil {
		// TODO: error checking here
		log.Error("error getting RSS:", err)
		return err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error("error reading RSS body:", err)
		// TODO: error handling
		return err
	}

	err = xml.Unmarshal(body, rss)
	if err != nil {
		log.Error("could not build RSS obj from rss request:", err)
		return err
	}

	return nil
}
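A call-site sketch for UpdateRSS; NYTRSS is a hypothetical concrete implementation standing in for whatever type satisfies RSS:

// usage sketch: NYTRSS is hypothetical; pass any concrete RSS by pointer
rss := &scraper.NYTRSS{}
if err := scraper.UpdateRSS(rss); err != nil {
	log.Error("could not update RSS:", err)
}
for i := 0; i < rss.GetChannel().GetNumArticles(); i++ {
	log.Info("in feed:", rss.GetChannel().GetArticle(i).GetLink())
}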
Example #3
// Run runs the client in an infinite loop
func (c *Client) Run() {
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()
	c.ignoreBad = make(map[string]struct{})
	defer c.printBad()

	for {
		// wait for the next tick
		<-ticker.C

		// ask the server for the next article request
		req, err := Get(c.IP)
		if err != nil {
			log.Error(err)
			continue // go around to the next iteration
		}

		// skip empty requests (the server has no work for us)
		if netScraper.IsEmptyRequest(req) {
			log.Info("got empty request")
			continue
		}
		if _, contains := c.ignoreBad[req.URL]; contains {
			log.Warn("skipping known bad article:", req.URL)
			continue
		}

		log.Info("got article", req.URL)

		// for now only use the NYT
		article := scraper.NYTArticle{}
		article.Link = req.URL

		err = scraper.ScrapeArticle(&article)
		if err != nil {
			c.ignoreBad[req.URL] = struct{}{} // add to ignore
			log.Error("could not scrape article", req.URL, ":", err)
			continue
		}

		if len(article.GetData()) == 0 {
			c.ignoreBad[req.URL] = struct{}{} // add to ignore
			log.Error("bad article body for url:", req.URL)
			continue
		}

		// send article back up
		result := netScraper.Response{URL: req.URL, Data: article.Data, Error: netScraper.ResponseOk}
		err = Post(c.IP, result)
		if err != nil {
			time.Sleep(10 * time.Second) // sleep in case the Wi-Fi is just down, then retry once

			err = Post(c.IP, result)
			if err != nil {
				log.Error(err)
			}
		}
	}
}
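The doubled Post call at the end of Run is a one-shot retry. A sketch of a small helper that generalizes it; postWithRetry is hypothetical, not part of the package:

// postWithRetry retries a failed Post a fixed number of times,
// pausing between attempts (e.g. to wait out a brief network outage)
func postWithRetry(ip string, result netScraper.Response, attempts int, pause time.Duration) error {
	var err error
	for i := 0; i < attempts; i++ {
		if err = Post(ip, result); err == nil {
			return nil
		}
		time.Sleep(pause)
	}
	return err
}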
Example #4
// Get asks the server for an article to scrape.
func Get(target string) (netScraper.Request, error) {
	c := &http.Client{}

	// get next work unit
	resp, err := c.Get(target)
	if err != nil {
		return netScraper.EmptyRequest(), err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Error("did not get OK status:", resp.StatusCode)
	}

	js, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return netScraper.EmptyRequest(), err
	}

	toDo := netScraper.Request{}
	err = json.Unmarshal(js, &toDo)
	if err != nil {
		return netScraper.EmptyRequest(), err
	}
	return toDo, nil
}
Example #5
// handleScrapedArticle validates a scraped article and stores it.
// TODO: think about where this should be
func handleScrapedArticle(article scraper.Article) {
	if err := scraper.CheckFile(article.GetData()); err != nil {
		log.Warn("when checking article", article.GetTitle(), "got err:", err)
	}
	if err := storeArticle(article); err != nil {
		log.Error("failed to write article", article.GetTitle(), ":", err)
		return
	}
}
Example #6
// updateStatus pushes an article status update to the article's open request channel.
func (j Jefe) updateStatus(name string, status int) {
	// note that exposed methods need to protect access to openRequests
	if j.openRequests[name] == nil {
		// no open request for this article; nothing to update
		return
	}

	select {
	case j.openRequests[name] <- status:

	default:
		// the status channel should never block; dropping a status here indicates a bug
		log.Error("could not send status for article", name)
	}
}
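The select with an empty-bodied case and a default is Go's standard non-blocking send; a self-contained sketch of the pattern:

// non-blocking send sketch: the send runs only if the channel can
// accept it immediately; otherwise the default case runs instead
// of blocking the caller
ch := make(chan int, 1)
select {
case ch <- 42:
	// delivered
default:
	// buffer full and no receiver ready: drop the value and report
	log.Error("channel full, dropping value")
}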
Example #7
// Run updates the RSS feed, schedules any new articles, and reschedules itself.
func (task *SchedulableRSS) Run(scheduler *scheduler.Scheduler) {
	err := scraper.UpdateRSS(task.rss)
	if err != nil {
		log.Error("error updating rss stories:", err)
		// requeue
		task.start = time.Now()
		task.rss.GetChannel().ClearArticles()
		go scheduler.Add(task)
		return
	}

	// mark all articles as not in list
	for key := range task.oldArticles {
		task.oldArticles[key] = false
	}

	// schedule any new articles
	// an article is new if it wasn't in the last RSS ping
	delay := 60 // TODO: create legitimate task delays
	for i := 0; i < task.rss.GetChannel().GetNumArticles(); i++ {
		article := task.rss.GetChannel().GetArticle(i)

		if _, inOld := task.oldArticles[article.GetLink()]; !inOld {
			toSchedule := CreateSchedulableArticle(article, delay, task.j)
			delay += 600
			go scheduler.Add(toSchedule)
		}

		// add or update what we found
		task.oldArticles[article.GetLink()] = true
	}

	// remove any articles not in the set
	for key, inList := range task.oldArticles {
		if !inList {
			delete(task.oldArticles, key)
		}
	}

	// reschedule this task
	if task.IsLoopable() && scheduler.IsRunning() {
		task.start = time.Now()
		task.rss.GetChannel().ClearArticles()
		go scheduler.Add(task)
	}
}
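This task and the monitor in Example #8 share the same mark-and-sweep diff over oldArticles. The pattern in isolation; newLinks stands in for the freshly fetched feed and handleNew for whatever should happen to a new article:

// mark-and-sweep diff sketch over a set of previously seen links
for key := range oldSet {
	oldSet[key] = false // mark everything as stale
}
for _, link := range newLinks {
	if _, seen := oldSet[link]; !seen {
		handleNew(link) // hypothetical reaction to a brand-new article
	}
	oldSet[link] = true // present in this fetch
}
for key, present := range oldSet {
	if !present {
		delete(oldSet, key) // sweep links that left the feed
	}
}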
Example #8
// didChange reports whether the RSS feed gained any new articles since the last check.
func (task rssMonitor) didChange() (bool, error) {
	err := scraper.UpdateRSS(task.rss)
	if err != nil {
		log.Error("error reading rss:", err)
		return false, err
	}

	// mark all articles as not in list
	for key := range task.oldArticles {
		task.oldArticles[key] = false
	}

	// an article is new if it wasn't in the last RSS ping
	found := false
	for i := 0; i < task.rss.GetChannel().GetNumArticles(); i++ {
		article := task.rss.GetChannel().GetArticle(i)

		if _, inOld := task.oldArticles[article.GetLink()]; !inOld {
			found = true
		}

		// add or update what we found
		task.oldArticles[article.GetLink()] = true
	}

	// remove any articles not in the set
	for key, inList := range task.oldArticles {
		if !inList {
			delete(task.oldArticles, key)
		}
	}

	if found {
		log.Info("found new article")
	}
	task.rss.GetChannel().ClearArticles()
	return found, nil
}