func processArticle(articleUrl string, session *mgo.Session) bool { artDebugger.Println("article url ", articleUrl) processor := a.ParseArticleAtURL(articleUrl, true) if processor.Err != nil { artDebugger.Println("Failed to process article: ", processor.Err) return false } var isNew bool var err error if globalConfig.MongoUrl != "" { if session == nil { session = lib.DBConnect(globalConfig.MongoUrl) defer lib.DBClose(session) } artDebugger.Println("Attempting to save article: ", processor.Article) isNew, err = processor.Article.Save(session) if err != nil { lib.Logger.Println(err) } } artDebugger.Println(processor.Article) return isNew }
func runTest(t *testing.T, rec *TestRec) { t.Logf("Testing URL: %v", rec.url) var extract *m.ExtractedBody if rec.html == "" { processor := a.ParseArticleAtURL(rec.url, true) extract = processor.ExtractedBody if processor.Err != nil { t.Fatalf("Failed to parse article: %v", processor.Err) } println("Here's the HTML to embed for", rec.url) println(processor.Html) } else { extract = extraction.ExtractDataFromHTMLString(rec.html, rec.url, false) } text := extract.Text if text == "" { t.Errorf("Body extractor returned no text.") } else { for _, s := range rec.expected { if !strings.Contains(text, s) { t.Errorf("Expected body fragment not found: %#v", s) } } for _, s := range rec.forbidden { if strings.Contains(text, s) { t.Errorf("Forbidden body fragment found: %#v", s) } } for _, s := range globalForbidden { if strings.Contains(text, s) { t.Errorf("Globally forbidden body fragment found: %#v", s) } } t.Logf("in body: %#v", text) } }
/* Fetch the top pages data for each url in the urls parameter. Url expected to be http://api.chartbeat.com/live/toppages/v3 */ func (t TopPages) Fetch(urls []string, session *mgo.Session) mc.Snapshot { chartbeatDebugger.Println("Fetching chartbeat top pages") topArticles := make([]*mc.TopArticle, 0, 100*len(urls)) topArticlesProcessed := make([]*m.Article, 0, 100*len(urls)) articleQueue := make(chan *mc.TopArticle, 100*len(urls)) var wg sync.WaitGroup for i := 0; i < len(urls); i++ { wg.Add(1) go func(url string) { pages, err := GetTopPages(url) host, _ := GetHostFromParams(url) if err != nil { chartbeatError.Println("Failed to json parse url %s: %v", url, err) wg.Done() return } for i := 0; i < len(pages.Pages); i++ { page := pages.Pages[i] articleUrl := page.Path articleId := lib.GetArticleId(articleUrl) article := &mc.TopArticle{} // this means we can't find an article ID. It's probably a section front, // so ignore if articleId < 0 || lib.IsBlacklisted(articleUrl) { continue } article.ArticleId = articleId article.Headline = page.Title article.Url = page.Path article.Sections = page.Sections article.Visits = page.Stats.Visits article.Loyalty = page.Stats.Loyalty article.Authors = e.ParseAuthors(page.Authors) article.Source = strings.Replace(host, ".com", "", -1) articleQueue <- article } wg.Done() }(urls[i]) } wg.Wait() chartbeatDebugger.Println("Done") close(articleQueue) for topArticle := range articleQueue { topArticles = append(topArticles, topArticle) } chartbeatDebugger.Printf("Num article: %d", len(topArticles)) chartbeatDebugger.Println("Done fetching and parsing URLs...") // The snapshot object that will be saved snapshotDoc := mc.TopPagesSnapshotDocument{} snapshotDoc.Articles = SortTopArticles(topArticles) snapshotDoc.Created_at = time.Now() // For the top 50 pages, make sure we've processed the body and generated // an Article{} document (and summary) var articleBodyWait sync.WaitGroup articleCol := session.DB("").C("Article") numToSummarize := 50 if len(snapshotDoc.Articles) < numToSummarize { numToSummarize = len(snapshotDoc.Articles) } chartbeatDebugger.Printf("Number summarizing: %d", numToSummarize) for i := 0; i < numToSummarize; i++ { topArticle := snapshotDoc.Articles[i] articleBodyWait.Add(1) // Process each article go func(url string, index int) { // First, see if the article exists in the DB. if it does, don't worry about it article := &m.Article{} url = "http://" + url articleCol.Find(bson.M{"url": url}).One(&article) if article.Id.Valid() { articleBodyWait.Done() return } chartbeatDebugger.Printf("Processing article %d (url %s)", index, url) processor := a.ParseArticleAtURL(url, true) if processor.Err != nil { chartbeatError.Println("Failed to process article: ", processor.Err) } else { topArticlesProcessed = append(topArticlesProcessed, processor.Article) } articleBodyWait.Done() }(topArticle.Url, i) } articleBodyWait.Wait() // Compile the snapshot snapshot := mc.TopPagesSnapshot{} snapshot.Document = snapshotDoc snapshot.Articles = topArticlesProcessed return snapshot }