Example no. 1
func TestInitStd(t *testing.T) {
	log.InitStd()
	log.Info("hello world", "from", "info")
	log.Infof("one is: %d", 1)
	log.Info("two")
	log.Infof("three is: %d", 3)
}
Example no. 2
// Run runs the client in an infinite loop
func (c *Client) Run() {
	ticker := time.NewTicker(10 * time.Second)
	c.ignoreBad = make(map[string]struct{})
	defer c.printBad()

	for {

		// wait for the next tick
		<-ticker.C

		// go get the article
		req, err := Get(c.IP)
		if err != nil {
			log.Error(err)
			continue // go around to the next iteration
		}

		// don't reply to empty requests
		if netScraper.IsEmptyRequest(req) {
			log.Info("got empty request")
			continue
		}
		if _, contains := c.ignoreBad[req.URL]; contains {
			log.Warn("got bad article:", req.URL)
			continue
		}

		log.Info("got article", req.URL)

		// for now only use the NYT
		article := scraper.NYTArticle{}
		article.Link = req.URL

		err = scraper.ScrapeArticle(&article)
		if err != nil {
			c.ignoreBad[req.URL] = struct{}{} // add to ignore
			log.Error("could not scrape article", req.URL, ":", err)
			continue
		}

		if len(article.GetData()) == 0 {
			c.ignoreBad[req.URL] = struct{}{} // add to ignore
			log.Error("bad article body for url:", req.URL)
			continue
		}

		// send article back up
		result := netScraper.Response{URL: req.URL, Data: article.Data, Error: netScraper.ResponseOk}
		err = Post(c.IP, result)
		if err != nil {
			time.Sleep(10 * time.Second) // sleep in case the Wi-Fi is just down

			err = Post(c.IP, result)
			if err != nil {
				log.Error(err)
			}
		}
	}
}
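Run blocks forever, so a caller would typically invoke it straight from main. A minimal sketch of wiring up a client, where the address is hypothetical and Client may well have more fields than the IP used above:

	c := &Client{IP: "http://192.168.0.10:8080"} // hypothetical server address
	c.Run()                                      // blocks, polling every 10 seconds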
Example no. 3
func (task *SchedulableArticle) Run(scheduler *scheduler.Scheduler) {
	// check if the task ran while we were waiting
	select {
	case result := <-task.ran:
		if result == ARTICLE_OK {
			log.Info("scraped article:", task.article.GetLink())
			return
		}
		if result == ARTICLE_BAD {
			log.Warn("bad result for article:", task.article.GetLink(), "requeueing")
		}
	default:
		// nothing read
	}

	task.j.AddArticle(task.article, task.ran)

	// wait for the article to go off to a client
	res := <-task.ran
	if res == ARTICLE_OK {
		log.Info("scraped article", task.article.GetLink())
		return
	}
	if res == ARTICLE_BAD {
		log.Warn("bad result for article:", task.article.GetLink(), "requeueing")
		// re-queue
		task.start = time.Now()
		task.delay = 15 // previous delay was relative, so reset it here
		scheduler.Add(task)
		return
	}

	// once the article is at the client, wait a reasonable amount of time
	// if the article did not come back in the expected time, requeue it
	waitTime := 15 * time.Second

	select {
	case result := <-task.ran:
		if result == ARTICLE_OK {
			log.Info("scraped article", task.article.GetLink())
			return // done
		}

		log.Warn("got result", toString(result), "for article", task.article.GetLink(), "requeueing")
		// else fall through to requeue

	case <-time.After(waitTime):
		// fall through to requeue
		log.Info("timing out for article", task.article.GetLink())
	}

	task.start = time.Now()
	task.delay = 15 // previous delay was relative, so reset it here
	scheduler.Add(task)
}
Example no. 4
// HandleResponse handles a response from a client. It updates
// the article and stores it. Returns an error if there was an
// unexpected issue.
func (j *Jefe) HandleResponse(response netScraper.Response) error {
	j.mutex.Lock()
	defer j.mutex.Unlock()

	log.Info("handling response:", response)

	_, isOpen := j.openRequests[response.URL]
	if isOpen && response.Error == netScraper.ResponseOk {

		// pass this article off
		// TODO: put in some error checking on the article body
		article := j.openArticles[response.URL]
		article.SetData(response.Data)
		go handleScrapedArticle(article)

		// got a good response
		j.updateStatus(response.URL, ARTICLE_OK)

		// close everything up
		close(j.openRequests[response.URL])
		delete(j.openRequests, response.URL)
		delete(j.openArticles, response.URL)

	} else if isOpen {
		// tell the schedulable article that it needs to re-add
		// don't remove it from the openRequests in case the article
		// comes back before it gets added again
		j.updateStatus(response.URL, ARTICLE_BAD)
	}
	// else a response has already come back, fall through

	return nil
}
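The response handled here is the one a client posts back after a scrape (Example no. 2 builds exactly this value). A minimal sketch of the happy path, assuming j is the *Jefe from Example no. 6 and with a hypothetical URL and body:

	resp := netScraper.Response{
		URL:   "https://www.nytimes.com/some-article", // hypothetical
		Data:  "scraped article body",                 // hypothetical
		Error: netScraper.ResponseOk,
	}
	if err := j.HandleResponse(resp); err != nil {
		log.Error(err)
	}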
Example no. 5
// AddArticle adds an article to the ready queue. The article will be scraped by
// a client and then sent back up to the Jefe. The channel signals status
// changes back to the schedulable article.
func (j *Jefe) AddArticle(article scraper.Article, c chan int) {
	j.mutex.Lock()
	defer j.mutex.Unlock()

	log.Info("adding article", article.GetLink(), "to ready queue")
	j.queue = append(j.queue, article)

	j.openRequests[article.GetLink()] = c
	j.openArticles[article.GetLink()] = article
}
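The caller keeps the other end of the channel and blocks on it for status updates, which is exactly what SchedulableArticle does in Example no. 3. A sketch of that handshake, assuming j is the *Jefe from Example no. 6, that *NYTArticle satisfies scraper.Article, and with a hypothetical link:

	article := &scraper.NYTArticle{}
	article.Link = "https://www.nytimes.com/some-article" // hypothetical

	ran := make(chan int)
	j.AddArticle(article, ran)

	// the jefe signals ARTICLE_SENT when a client picks the article up,
	// then ARTICLE_OK or ARTICLE_BAD once the scrape comes back
	if <-ran == ARTICLE_SENT {
		log.Info("article is out with a client:", article.GetLink())
	}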
Example no. 6
func main() {
	infoFile, err := os.OpenFile("scrapeInfoLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}

	errFile, err := os.OpenFile("scrapeErrorLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}

	warnFile, err := os.OpenFile("scrapeWarnLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}
	log.Init(infoFile, errFile, warnFile)

	defer infoFile.Close()
	defer errFile.Close()
	defer warnFile.Close()

	// create the new server
	s := server.NewScrapeServer()
	j := s.GetJefe()

	j.SetCycleTime(1)
	j.Start()

	log.Info("started jefe")

	// make the server scrape the NYT RSS feed
	rss := server.CreateSchedulableRSS(&scraper.NYTRSS{}, 10, j)
	j.AddSchedulable(rss)

	log.Info("going to start server")

	// start up the server
	http.HandleFunc("/", s.Handle())
	if err := http.ListenAndServe(":8080", nil); err != nil {
		panic(err)
	}
}
Example no. 7
// storeArticle writes the provided article to storage as JSON
func storeArticle(article scraper.Article) error {
	jsonStr, err := json.Marshal(article)
	if err != nil {
		return err
	}

	// take all spaces out of title
	// TODO: think about cleaning this up a little more
	fileName := strings.Replace(article.GetTitle(), " ", "", -1)
	path := "opinionatedData/" + fileName + ".json"

	err = ioutil.WriteFile(path, jsonStr, 0644)
	if err != nil {
		return err
	}

	log.Info("wrote article:", article.GetTitle(), "to location:", path)
	return nil
}
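One quiet assumption above is that the opinionatedData/ directory already exists, since ioutil.WriteFile does not create parent directories. A caller could guard against that once at startup with something like this sketch (the permission bits are an assumption):

	// make sure the storage directory exists before any writes
	if err := os.MkdirAll("opinionatedData", 0755); err != nil {
		panic(err)
	}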
Example no. 8
// NextArticle returns the next article to scrape. If there is an article to
// scrape, it returns the article and true, else it returns nil and false. Tells
// the schedulableArticle that the article has been sent
func (j *Jefe) NextArticle() (scraper.Article, bool) {
	j.mutex.Lock()
	defer j.mutex.Unlock()

	// look for the next article that has an open request (the jefe may receive
	// an article after it has been requeued)
	for j.hasNext() {

		next := j.pop()
		if _, ok := j.openRequests[next.GetLink()]; ok {

			j.updateStatus(next.GetLink(), ARTICLE_SENT)
			log.Info("going to send article", next.GetLink())

			return next, true
		}
	}

	return nil, false
}
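On the server side, whatever serves client GETs would drain this per request. The real s.Handle() from Example no. 6 is not shown here, so this is only a sketch of how the (article, bool) contract reads at a call site:

	// hypothetical polling site: hand the next ready article to a client
	if article, ok := j.NextArticle(); ok {
		log.Info("handing out:", article.GetLink())
	} else {
		log.Info("no articles ready; the client will poll again")
	}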
Example no. 9
func main() {
	infoFile, err := os.OpenFile("rateInfoLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}

	errFile, err := os.OpenFile("rateErrorLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}

	defer infoFile.Close()
	defer errFile.Close()

	log.Init(infoFile, nil, errFile)

	feeds := []rssMonitor{newMonitor(&scraper.WSJRSS{}),
		newMonitor(&scraper.NYTRSS{})}

	// get what's currently in the feeds without sending an update signal
	for _, monitor := range feeds {
		monitor.didChange()
	}

	ticker := time.NewTicker(5 * time.Minute)

	// flip makes the averages print only on every other tick (every 10 minutes)
	flip := false

	for {
		// wait for new ticker value
		<-ticker.C

		updateFeeds(feeds)

		if flip {
			for _, monitor := range feeds {
				log.Info("average time is: ", monitor.tracker.getAverage())
			}
		}
		flip = !flip
	}
}
Example no. 10
func (task rssMonitor) didChange() (bool, error) {
	err := scraper.UpdateRSS(task.rss)
	if err != nil {
		log.Error("error reading rss:", err)
		return false, err
	}

	// mark all articles as not in list
	for key := range task.oldArticles {
		task.oldArticles[key] = false
	}

	// an article is new if it wasn't in the last RSS ping
	found := false
	for i := 0; i < task.rss.GetChannel().GetNumArticles(); i++ {
		article := task.rss.GetChannel().GetArticle(i)

		if _, inOld := task.oldArticles[article.GetLink()]; !inOld {
			found = true
		}

		// add or update what we found
		task.oldArticles[article.GetLink()] = true
	}

	// remove any articles that are no longer in the feed
	for key, inList := range task.oldArticles {
		if !inList {
			delete(task.oldArticles, key)
		}
	}

	if found {
		log.Info("found new article")
	}
	task.rss.GetChannel().ClearArticles()
	return found, nil
}
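The mark-and-sweep over oldArticles is the heart of the change detection: mark every known link stale, re-mark whatever the feed still contains (anything unseen is new), then sweep out links that dropped away. The same pattern in isolation, with plain strings standing in for article links:

	old := map[string]bool{"a": true, "b": true}
	feed := []string{"b", "c"}

	for link := range old {
		old[link] = false // assume stale until seen again
	}
	foundNew := false
	for _, link := range feed {
		if _, seen := old[link]; !seen {
			foundNew = true // "c" was not in the previous ping
		}
		old[link] = true
	}
	for link, inFeed := range old {
		if !inFeed {
			delete(old, link) // "a" fell out of the feed
		}
	}
	log.Info("found new article:", foundNew)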
Example no. 11
func TestLine(t *testing.T) {
	log.Info("test")
}