// ScrapeArticle fetches and parses the article.
// article should be provided as a *Article.
func ScrapeArticle(article Article) error {
	cookies := NewCookieJar()
	client := &http.Client{Jar: cookies}

	// build the http request
	req, err := http.NewRequest("GET", article.GetLink(), nil)
	if err != nil {
		log.Error("could not create article request:", err)
		return err
	}
	err = buildArticleHeader(req)
	if err != nil {
		log.Error("could not build article request:", err)
		return err
	}

	// send the http request
	resp, err := client.Do(req)
	if err != nil {
		log.Error("error sending article request:", err)
		return err
	}
	defer resp.Body.Close()
	// TODO: check resp.Header to see if X-Article-Template is [full]

	// parse the html body
	parser := html.NewTokenizer(resp.Body)
	err = article.DoParse(parser)
	if err != nil {
		log.Error("error parsing article:", err)
		return err
	}
	return nil
}
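// A minimal usage sketch for ScrapeArticle, mirroring how the client loop below
// calls it: pass a pointer to a concrete Article implementation such as
// scraper.NYTArticle. The URL is only a placeholder.
//
//	article := scraper.NYTArticle{}
//	article.Link = "https://www.nytimes.com/..." // placeholder URL
//	if err := scraper.ScrapeArticle(&article); err != nil {
//		log.Error("scrape failed:", err)
//	}
//	log.Info("scraped", len(article.GetData()), "bytes")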
// UpdateRSS finds articles currently in the RSS feed.
// It clears old articles out of the RSS feed before getting new ones.
// rss should be passed as an *RSS.
func UpdateRSS(rss RSS) error {
	// clear out old articles so we don't double add
	rss.GetChannel().ClearArticles()

	// send the request
	resp, err := http.Get(rss.GetLink())
	if err != nil {
		// TODO: more robust error handling here
		log.Error("error getting RSS:", err)
		return err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		// TODO: more robust error handling here
		log.Error("error reading RSS body:", err)
		return err
	}

	err = xml.Unmarshal(body, rss)
	if err != nil {
		log.Error("could not build RSS obj from rss request:", err)
		return err
	}
	return nil
}
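// A minimal usage sketch for UpdateRSS, under the assumption that some concrete
// feed type satisfies the RSS interface (the name nytFeed below is hypothetical,
// not a type from this repo):
//
//	feed := &nytFeed{}
//	if err := scraper.UpdateRSS(feed); err != nil {
//		log.Error("update failed:", err)
//	}
//	for i := 0; i < feed.GetChannel().GetNumArticles(); i++ {
//		log.Info("in feed:", feed.GetChannel().GetArticle(i).GetLink())
//	}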
// Run runs the client in an infinite loop.
func (c *Client) Run() {
	ticker := time.NewTicker(10 * time.Second)
	c.ignoreBad = make(map[string]struct{})
	defer c.printBad()

	for {
		// wait for the next iteration
		<-ticker.C

		// go get the article
		req, err := Get(c.IP)
		if err != nil {
			log.Error(err)
			continue // go around to the next iteration
		}

		// don't reply to empty requests
		if netScraper.IsEmptyRequest(req) {
			log.Info("got empty request")
			continue
		}
		if _, contains := c.ignoreBad[req.URL]; contains {
			log.Warn("got bad article:", req.URL)
			continue
		}
		log.Info("got article", req.URL)

		// for now only use the NYT
		article := scraper.NYTArticle{}
		article.Link = req.URL

		err = scraper.ScrapeArticle(&article)
		if err != nil {
			c.ignoreBad[req.URL] = struct{}{} // add to ignore list
			log.Error("could not scrape article", req.URL, ":", err)
			continue
		}
		if len(article.GetData()) == 0 {
			c.ignoreBad[req.URL] = struct{}{} // add to ignore list
			log.Error("bad article body for url:", req.URL)
			continue
		}

		// send the article back up
		result := netScraper.Response{URL: req.URL, Data: article.Data, Error: netScraper.ResponseOk}
		err = Post(c.IP, result)
		if err != nil {
			time.Sleep(10 * time.Second) // sleep in case the WiFi is briefly down
			err = Post(c.IP, result)
			if err != nil {
				log.Error(err)
			}
		}
	}
}
// Get asks the server for an article to scrape.
func Get(target string) (netScraper.Request, error) {
	c := &http.Client{}

	// get the next work unit
	resp, err := c.Get(target)
	if err != nil {
		return netScraper.EmptyRequest(), err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Error("did not get OK status:", resp.StatusCode)
	}

	js, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return netScraper.EmptyRequest(), err
	}

	toDo := netScraper.Request{}
	err = json.Unmarshal(js, &toDo)
	if err != nil {
		return netScraper.EmptyRequest(), err
	}
	return toDo, nil
}
// handleScrapedArticle handles a scraped article: check it, then store it.
// TODO: think about where this should be
func handleScrapedArticle(article scraper.Article) {
	if err := scraper.CheckFile(article.GetData()); err != nil {
		log.Warn("when checking article", article.GetTitle(), "got err:", err)
	}

	if err := storeArticle(article); err != nil {
		log.Error("failed to write article", article.GetTitle(), ":", err)
		return
	}
}
// updateStatus is a helper that pushes an article status update to its open
// request channel. Note that exposed methods need to protect access.
func (j Jefe) updateStatus(name string, status int) {
	if j.openRequests[name] == nil {
		// not in open requests
		return
	}

	select {
	case j.openRequests[name] <- status:
	default:
		// this is a real problem: the status channel should never block
		log.Error("could not send status for article", name)
	}
}
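// A standalone sketch of the non-blocking send pattern used above (the names
// are illustrative, not from this repo): a select with a default case returns
// immediately instead of blocking when the channel cannot accept the value.
//
//	statusCh := make(chan int, 1)
//	statusCh <- 1 // fills the buffer
//	select {
//	case statusCh <- 2:
//		// sent without blocking
//	default:
//		log.Error("receiver not keeping up, dropped status")
//	}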
// Run updates the RSS feed, schedules a scrape for any new articles, and then
// reschedules itself.
func (task *SchedulableRSS) Run(scheduler *scheduler.Scheduler) {
	err := scraper.UpdateRSS(task.rss)
	if err != nil {
		log.Error("error updating rss stories:", err)
		// requeue this task and try again later
		task.start = time.Now()
		task.rss.GetChannel().ClearArticles()
		go scheduler.Add(task)
		return
	}

	// mark all known articles as not seen in this ping
	for key := range task.oldArticles {
		task.oldArticles[key] = false
	}

	// schedule any new articles
	// an article is new if it wasn't in the last RSS ping
	delay := 60 // TODO: create legitimate task delays
	for i := 0; i < task.rss.GetChannel().GetNumArticles(); i++ {
		article := task.rss.GetChannel().GetArticle(i)
		if _, inOld := task.oldArticles[article.GetLink()]; !inOld {
			toSchedule := CreateSchedulableArticle(article, delay, task.j)
			delay += 600
			go scheduler.Add(toSchedule)
		}
		// add or update what we found
		task.oldArticles[article.GetLink()] = true
	}

	// remove any articles no longer in the feed
	for key, inList := range task.oldArticles {
		if !inList {
			delete(task.oldArticles, key)
		}
	}

	// reschedule this task
	if task.IsLoopable() && scheduler.IsRunning() {
		task.start = time.Now()
		task.rss.GetChannel().ClearArticles()
		go scheduler.Add(task)
	}
}
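// A standalone sketch of the mark-and-sweep bookkeeping used above and in
// didChange below (the seen map and links slice are illustrative): every known
// entry is first marked false, every link in the current feed is (re)marked
// true, and anything still false afterwards has dropped out of the feed.
//
//	seen := map[string]bool{"a": true, "b": true}
//	links := []string{"b", "c"} // links in the current feed
//	for k := range seen {
//		seen[k] = false
//	}
//	for _, l := range links {
//		if _, ok := seen[l]; !ok {
//			log.Info("new article:", l) // "c" is new
//		}
//		seen[l] = true
//	}
//	for k, marked := range seen {
//		if !marked {
//			delete(seen, k) // "a" dropped out of the feed
//		}
//	}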
// didChange reports whether the RSS feed contains any articles that were not
// present in the previous check.
func (task rssMonitor) didChange() (bool, error) {
	err := scraper.UpdateRSS(task.rss)
	if err != nil {
		log.Error("error reading rss:", err)
		return false, err
	}

	// mark all known articles as not seen in this ping
	for key := range task.oldArticles {
		task.oldArticles[key] = false
	}

	// an article is new if it wasn't in the last RSS ping
	found := false
	for i := 0; i < task.rss.GetChannel().GetNumArticles(); i++ {
		article := task.rss.GetChannel().GetArticle(i)
		if _, inOld := task.oldArticles[article.GetLink()]; !inOld {
			found = true
		}
		// add or update what we found
		task.oldArticles[article.GetLink()] = true
	}

	// remove any articles no longer in the feed
	for key, inList := range task.oldArticles {
		if !inList {
			delete(task.oldArticles, key)
		}
	}

	if found {
		log.Info("found new article")
	}

	task.rss.GetChannel().ClearArticles()
	return found, nil
}