func TestInitStd(t *testing.T) {
	log.InitStd()
	log.Info("hello world", "from", "info")
	log.Infof("one is: %d", 1)
	log.Info("two")
	log.Infof("three is: %d", 3)
}
// Run runs the client in an infinite loop.
func (c *Client) Run() {
	ticker := time.NewTicker(10 * time.Second)
	c.ignoreBad = make(map[string]struct{})
	defer c.printBad()

	for {
		// wait for the next iteration
		<-ticker.C

		// go get the article
		req, err := Get(c.IP)
		if err != nil {
			log.Error(err)
			continue // go around to the next iteration
		}

		// don't reply to empty requests
		if netScraper.IsEmptyRequest(req) {
			log.Info("got empty request")
			continue
		}

		if _, contains := c.ignoreBad[req.URL]; contains {
			log.Warn("got bad article:", req.URL)
			continue
		}
		log.Info("got article", req.URL)

		// for now only use the NYT
		article := scraper.NYTArticle{}
		article.Link = req.URL

		err = scraper.ScrapeArticle(&article)
		if err != nil {
			c.ignoreBad[req.URL] = struct{}{} // add to ignore
			log.Error("could not scrape article", req.URL, ":", err)
			continue
		}
		if len(article.GetData()) == 0 {
			c.ignoreBad[req.URL] = struct{}{} // add to ignore
			log.Error("bad article body for url:", req.URL)
			continue
		}

		// send the article back up
		result := netScraper.Response{URL: req.URL, Data: article.Data, Error: netScraper.ResponseOk}
		err = Post(c.IP, result)
		if err != nil {
			// retry once in case the Wi-Fi is briefly down
			time.Sleep(10 * time.Second)
			err = Post(c.IP, result)
			if err != nil {
				log.Error(err)
			}
		}
	}
}
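// The inline retry above tries exactly twice with a fixed sleep. Below is
// a minimal sketch of a more general helper; the name postWithRetry and
// the attempts/wait parameters are illustrative assumptions, not part of
// the original code. It relies only on the Post(ip, response) signature
// seen above.
func postWithRetry(ip string, r netScraper.Response, attempts int, wait time.Duration) error {
	var err error
	for i := 0; i < attempts; i++ {
		if err = Post(ip, r); err == nil {
			return nil
		}
		time.Sleep(wait) // sleep between tries in case the network is briefly down
	}
	return err
}

// Run could then collapse its nested error check into a single call:
//   if err := postWithRetry(c.IP, result, 2, 10*time.Second); err != nil {
//       log.Error(err)
//   }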
func (task *SchedulableArticle) Run(scheduler *scheduler.Scheduler) {
	// check if the task ran while we were waiting
	select {
	case result := <-task.ran:
		if result == ARTICLE_OK {
			log.Info("scraped article:", task.article.GetLink())
			return
		}
		if result == ARTICLE_BAD {
			log.Warn("bad result for article:", task.article.GetLink(), "requeueing")
		}
	default:
		// nothing read
	}

	task.j.AddArticle(task.article, task.ran)

	// wait for the article to go off to a client
	res := <-task.ran
	if res == ARTICLE_OK {
		log.Info("scraped article", task.article.GetLink())
		return
	}
	if res == ARTICLE_BAD {
		log.Warn("bad result for article:", task.article.GetLink(), "requeueing")
		// re-queue
		task.start = time.Now()
		task.delay = 15 // reset to a fixed delay; the previous delay was relative
		scheduler.Add(task)
		return
	}

	// once the article is at the client, wait a reasonable amount of time;
	// if the article did not come back in the expected time, requeue it
	const waitTime = 15 * time.Second
	select {
	case result := <-task.ran:
		if result == ARTICLE_OK {
			log.Info("scraped article", task.article.GetLink())
			return // finish this
		}
		log.Warn("got result", toString(result), "for article", task.article.GetLink(), "requeueing")
		// else fall through to requeue
	case <-time.After(waitTime):
		// fall through to requeue
		log.Info("timing out for article", task.article.GetLink())
	}

	task.start = time.Now()
	task.delay = 15 // reset to a fixed delay; the previous delay was relative
	scheduler.Add(task)
}
// HandleResponse handles a response from the server. It updates
// the article and stores it. Returns an error if there was an
// unexpected issue.
func (j *Jefe) HandleResponse(response netScraper.Response) error {
	j.mutex.Lock()
	defer j.mutex.Unlock()

	log.Info("handling response:", response)

	_, isOpen := j.openRequests[response.URL]
	if isOpen && response.Error == netScraper.ResponseOk {
		// pass this article off
		// TODO: put in some error checking on the article body
		article := j.openArticles[response.URL]
		article.SetData(response.Data)
		go handleScrapedArticle(article)

		// got a good response
		j.updateStatus(response.URL, ARTICLE_OK)

		// close everything up
		close(j.openRequests[response.URL])
		delete(j.openRequests, response.URL)
		delete(j.openArticles, response.URL)
	} else if isOpen {
		// tell the schedulable article that it needs to re-add;
		// don't remove it from openRequests in case the article
		// comes back before it gets added again
		j.updateStatus(response.URL, ARTICLE_BAD)
	}
	// else a response has already come back, fall through

	return nil
}
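// updateStatus is used here and in NextArticle below but is not shown in
// this section. A hypothetical reconstruction, assuming each openRequests
// channel is buffered so the send cannot stall the Jefe while it holds
// the mutex; the real implementation may differ.
func (j *Jefe) updateStatus(url string, status int) {
	if c, ok := j.openRequests[url]; ok {
		c <- status // signal the waiting SchedulableArticle
	}
}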
// AddArticle adds an article to the ready queue. The article will be scraped by
// a client then sent back up to the Jefe. The chan signals back to the
// schedulable article.
func (j *Jefe) AddArticle(article scraper.Article, c chan int) {
	j.mutex.Lock()
	defer j.mutex.Unlock()

	log.Info("adding article", article.GetLink(), "to ready queue")
	j.queue = append(j.queue, article)
	j.openRequests[article.GetLink()] = c
	j.openArticles[article.GetLink()] = article
}
func main() {
	infoFile, err := os.OpenFile("scrapeInfoLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}
	errFile, err := os.OpenFile("scrapeErrorLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}
	warnFile, err := os.OpenFile("scrapeWarnLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}
	log.Init(infoFile, errFile, warnFile)
	defer infoFile.Close()
	defer errFile.Close()
	defer warnFile.Close()

	// create the new server
	s := server.NewScrapeServer()
	j := s.GetJefe()
	j.SetCycleTime(1)
	j.Start()
	log.Info("started jefe")

	// make the server scrape the NYT RSS feed
	rss := server.CreateSchedulableRSS(&scraper.NYTRSS{}, 10, j)
	j.AddSchedulable(rss)
	log.Info("going to start server")

	// start up the server
	http.HandleFunc("/", s.Handle())
	if err := http.ListenAndServe(":8080", nil); err != nil {
		panic(err)
	}
}
// storeArticle writes the provided article to storage.
func storeArticle(article scraper.Article) error {
	jsonStr, err := json.Marshal(article)
	if err != nil {
		return err
	}

	// take all spaces out of the title
	// TODO: think about cleaning this up a little more
	fileName := strings.Replace(article.GetTitle(), " ", "", -1)
	path := "opinionatedData/" + fileName + ".json"

	err = ioutil.WriteFile(path, jsonStr, 0644)
	if err != nil {
		return err
	}
	log.Info("wrote article:", article.GetTitle(), "to location:", path)
	return nil
}
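// The TODO above asks for more clean-up of the title. One possible
// direction is a stricter sanitizer that keeps only letters and digits,
// sketched below; sanitizeTitle is a hypothetical helper (it also needs
// the unicode package), not the project's actual rule.
func sanitizeTitle(title string) string {
	return strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return r
		}
		return -1 // a negative rune drops the character
	}, title)
}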
// NextArticle returns the next article to scrape. If there is an article to
// scrape, it returns the article and true, else it returns nil and false. Tells
// the schedulableArticle that the article has been sent.
func (j *Jefe) NextArticle() (scraper.Article, bool) {
	j.mutex.Lock()
	defer j.mutex.Unlock()

	// look for the next article that has an open request (the jefe may receive
	// an article after it has been requeued)
	for j.hasNext() {
		next := j.pop()
		if _, ok := j.openRequests[next.GetLink()]; ok {
			j.updateStatus(next.GetLink(), ARTICLE_SENT)
			log.Info("going to send article", next.GetLink())
			return next, true
		}
	}
	return nil, false
}
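// hasNext and pop are not shown in this section. Given that AddArticle
// appends to a plain slice, they are presumably simple queue operations;
// minimal sketches under that assumption.
func (j *Jefe) hasNext() bool {
	return len(j.queue) > 0
}

func (j *Jefe) pop() scraper.Article {
	next := j.queue[0]
	j.queue = j.queue[1:] // dequeue from the front
	return next
}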
func main() {
	infoFile, err := os.OpenFile("rateInfoLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}
	errFile, err := os.OpenFile("rateErrorLog.txt", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}
	defer infoFile.Close()
	defer errFile.Close()
	log.Init(infoFile, errFile, nil)

	feeds := []rssMonitor{newMonitor(&scraper.WSJRSS{}), newMonitor(&scraper.NYTRSS{})}

	// get what's currently in the feeds without sending an update signal
	for _, monitor := range feeds {
		monitor.didChange()
	}

	ticker := time.NewTicker(5 * time.Minute)
	// flip makes the averages print only every other tick (once every 10 minutes)
	flip := false
	for {
		// wait for the next ticker value
		<-ticker.C
		updateFeeds(feeds)
		if flip {
			for _, monitor := range feeds {
				log.Info("average time is: ", monitor.tracker.getAverage())
			}
		}
		flip = !flip
	}
}
func (task rssMonitor) didChange() (bool, error) {
	err := scraper.UpdateRSS(task.rss)
	if err != nil {
		log.Error("error reading rss:", err)
		return false, err
	}

	// mark all articles as not in the list
	for key := range task.oldArticles {
		task.oldArticles[key] = false
	}

	// an article is new if it wasn't in the last RSS ping
	found := false
	for i := 0; i < task.rss.GetChannel().GetNumArticles(); i++ {
		article := task.rss.GetChannel().GetArticle(i)
		if _, inOld := task.oldArticles[article.GetLink()]; !inOld {
			found = true
		}
		// add or update what we found
		task.oldArticles[article.GetLink()] = true
	}

	// remove any articles that are no longer in the feed
	for key, inList := range task.oldArticles {
		if !inList {
			delete(task.oldArticles, key)
		}
	}

	if found {
		log.Info("found new article")
	}
	task.rss.GetChannel().ClearArticles()
	return found, nil
}
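// updateFeeds is called from main above but is not shown in this section.
// A minimal sketch, assuming it simply polls every monitor and lets
// didChange handle logging; the real version presumably also feeds the
// tracker whose average main prints.
func updateFeeds(feeds []rssMonitor) {
	for _, monitor := range feeds {
		monitor.didChange()
	}
}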
func TestLine(t *testing.T) {
	log.Info("test")
}