// Fetch retrieves the top pages data for each url in the urls parameter.
// Each url is expected to be of the form http://api.chartbeat.com/live/toppages/v3
func (t TopPages) Fetch(urls []string, session *mgo.Session) mc.Snapshot {
	chartbeatDebugger.Println("Fetching chartbeat top pages")
	topArticles := make([]*mc.TopArticle, 0, 100*len(urls))
	topArticlesProcessed := make([]*m.Article, 0, 100*len(urls))
	articleQueue := make(chan *mc.TopArticle, 100*len(urls))

	var wg sync.WaitGroup

	for i := 0; i < len(urls); i++ {
		wg.Add(1)

		go func(url string) {
			defer wg.Done()

			pages, err := GetTopPages(url)
			host, _ := GetHostFromParams(url)

			if err != nil {
				chartbeatError.Printf("Failed to json parse url %s: %v", url, err)
				return
			}

			for i := 0; i < len(pages.Pages); i++ {
				page := pages.Pages[i]
				articleUrl := page.Path
				articleId := lib.GetArticleId(articleUrl)

				// A negative ID means we couldn't find an article ID. The page
				// is probably a section front, so ignore it
				if articleId < 0 || lib.IsBlacklisted(articleUrl) {
					continue
				}

				article := &mc.TopArticle{}
				article.ArticleId = articleId
				article.Headline = page.Title
				article.Url = page.Path
				article.Sections = page.Sections
				article.Visits = page.Stats.Visits
				article.Loyalty = page.Stats.Loyalty
				article.Authors = e.ParseAuthors(page.Authors)
				article.Source = strings.Replace(host, ".com", "", -1)

				articleQueue <- article
			}
		}(urls[i])
	}

	wg.Wait()
	chartbeatDebugger.Println("Done")

	close(articleQueue)

	for topArticle := range articleQueue {
		topArticles = append(topArticles, topArticle)
	}

	chartbeatDebugger.Printf("Num articles: %d", len(topArticles))
	chartbeatDebugger.Println("Done fetching and parsing URLs...")

	// The snapshot object that will be saved
	snapshotDoc := mc.TopPagesSnapshotDocument{}
	snapshotDoc.Articles = SortTopArticles(topArticles)
	snapshotDoc.Created_at = time.Now()

	// For the top 50 pages, make sure we've processed the body and generated
	// an Article{} document (and summary)
	var articleBodyWait sync.WaitGroup
	// Guards topArticlesProcessed, which the goroutines below append to
	// concurrently
	var processedMutex sync.Mutex
	articleCol := session.DB("").C("Article")

	numToSummarize := 50
	if len(snapshotDoc.Articles) < numToSummarize {
		numToSummarize = len(snapshotDoc.Articles)
	}

	chartbeatDebugger.Printf("Number summarizing: %d", numToSummarize)

	for i := 0; i < numToSummarize; i++ {
		topArticle := snapshotDoc.Articles[i]
		articleBodyWait.Add(1)

		// Process each article
		go func(url string, index int) {
			defer articleBodyWait.Done()

			// First, see if the article exists in the DB. If it does, don't
			// worry about it
			article := &m.Article{}
			url = "http://" + url

			articleCol.Find(bson.M{"url": url}).One(article)
			if article.Id.Valid() {
				return
			}

			chartbeatDebugger.Printf("Processing article %d (url %s)", index, url)

			processor := a.ParseArticleAtURL(url, true)
			if processor.Err != nil {
				chartbeatError.Println("Failed to process article: ", processor.Err)
			} else {
				processedMutex.Lock()
				topArticlesProcessed = append(topArticlesProcessed, processor.Article)
				processedMutex.Unlock()
			}
		}(topArticle.Url, i)
	}

	articleBodyWait.Wait()

	// Compile the snapshot
	snapshot := mc.TopPagesSnapshot{}
	snapshot.Document = snapshotDoc
	snapshot.Articles = topArticlesProcessed
	return snapshot
}
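// For context, a minimal sketch of how a caller might drive Fetch. This is
// not the repo's actual entry point, just an illustration: it assumes a local
// MongoDB instance, that TopPages needs no configuration, and that the
// Chartbeat URLs carry apikey and host query parameters (APIKEY and the host
// values below are placeholders; limit=100 matches the capacity hints above):
//
//	session, err := mgo.Dial("localhost")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer session.Close()
//
//	urls := []string{
//		"http://api.chartbeat.com/live/toppages/v3?apikey=APIKEY&host=freep.com&limit=100",
//		"http://api.chartbeat.com/live/toppages/v3?apikey=APIKEY&host=detroitnews.com&limit=100",
//	}
//
//	snapshot := TopPages{}.Fetch(urls, session)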
// isBlacklisted reports whether the article's URL matches the lib package
// blacklist
func (a *ArticleIn) isBlacklisted() bool {
	return lib.IsBlacklisted(a.Url)
}
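// For illustration only, a hypothetical sketch of the kind of substring check
// lib.IsBlacklisted might perform; the real patterns live in the lib package
// and may differ:
//
//	var blacklist = []string{"/videos/", "/photos/", "/section/"}
//
//	func IsBlacklisted(url string) bool {
//		for _, pattern := range blacklist {
//			if strings.Contains(url, pattern) {
//				return true
//			}
//		}
//		return false
//	}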