Esempio n. 1
0
// Fetch retrieves the Chartbeat top pages data for each URL in urls and
// compiles the results into a top pages snapshot. Each URL is expected to be
// http://api.chartbeat.com/live/toppages/v3.
//
// The URLs are fetched concurrently. Pages for which no article ID can be
// derived, or whose path is blacklisted, are dropped. The remaining articles
// are sorted with SortTopArticles, and for (at most) the first 50 of them an
// Article document is generated unless one with the same URL already exists
// in the "Article" collection reachable through session.
//
// A URL that fails to fetch/parse is logged and skipped; it does not abort the
// whole snapshot. The order of the processed articles in the returned snapshot
// depends on goroutine scheduling and is not deterministic.
func (t TopPages) Fetch(urls []string, session *mgo.Session) mc.Snapshot {
	chartbeatDebugger.Println("Fetching chartbeat top pages")
	topArticles := make([]*mc.TopArticle, 0, 100*len(urls))
	articleQueue := make(chan *mc.TopArticle, 100*len(urls))

	var wg sync.WaitGroup

	for _, url := range urls {
		wg.Add(1)

		go func(url string) {
			// Deferred so every exit path releases the WaitGroup.
			defer wg.Done()

			pages, err := GetTopPages(url)
			if err != nil {
				chartbeatError.Printf("Failed to json parse url %s: %v", url, err)
				return
			}

			// The second return value is ignored, as before; if the host
			// cannot be determined the Source field is derived from whatever
			// host value comes back.
			host, _ := GetHostFromParams(url)
			// Loop-invariant: the source name is the same for every page of
			// this URL.
			source := strings.Replace(host, ".com", "", -1)

			for _, page := range pages.Pages {
				articleId := lib.GetArticleId(page.Path)

				// A negative ID means we can't find an article ID. It's
				// probably a section front, so ignore it.
				if articleId < 0 || lib.IsBlacklisted(page.Path) {
					continue
				}

				articleQueue <- &mc.TopArticle{
					ArticleId: articleId,
					Headline:  page.Title,
					Url:       page.Path,
					Sections:  page.Sections,
					Visits:    page.Stats.Visits,
					Loyalty:   page.Stats.Loyalty,
					Authors:   e.ParseAuthors(page.Authors),
					Source:    source,
				}
			}
		}(url)
	}

	// Close the queue once all producers are finished. This runs in its own
	// goroutine so the loop below can drain the queue while producers are
	// still sending; waiting first and draining afterwards would deadlock as
	// soon as the producers emit more articles than the channel can buffer.
	go func() {
		wg.Wait()
		close(articleQueue)
	}()

	for topArticle := range articleQueue {
		topArticles = append(topArticles, topArticle)
	}

	chartbeatDebugger.Println("Done")
	chartbeatDebugger.Printf("Num article: %d", len(topArticles))
	chartbeatDebugger.Println("Done fetching and parsing URLs...")

	// The snapshot object that will be saved
	snapshotDoc := mc.TopPagesSnapshotDocument{}
	snapshotDoc.Articles = SortTopArticles(topArticles)
	snapshotDoc.Created_at = time.Now()

	// For the top 50 pages, make sure we've processed the body and generated
	// an Article{} document (and summary)
	numToSummarize := 50
	if len(snapshotDoc.Articles) < numToSummarize {
		numToSummarize = len(snapshotDoc.Articles)
	}

	chartbeatDebugger.Printf("Number summarizing: %d", numToSummarize)

	var (
		articleBodyWait sync.WaitGroup
		// processedMu guards topArticlesProcessed, which is appended to from
		// many goroutines at once.
		processedMu          sync.Mutex
		topArticlesProcessed = make([]*m.Article, 0, numToSummarize)
	)
	articleCol := session.DB("").C("Article")

	for i := 0; i < numToSummarize; i++ {
		topArticle := snapshotDoc.Articles[i]
		articleBodyWait.Add(1)

		// Process each article
		go func(url string, index int) {
			defer articleBodyWait.Done()

			// First, see if the article exists in the DB. If it does, don't
			// worry about it. The lookup error is intentionally not checked:
			// when nothing is loaded the ID stays invalid and the article is
			// processed below.
			article := &m.Article{}
			url = "http://" + url
			articleCol.Find(bson.M{"url": url}).One(&article)

			if article.Id.Valid() {
				return
			}

			chartbeatDebugger.Printf("Processing article %d (url %s)", index, url)

			processor := a.ParseArticleAtURL(url, true)
			if processor.Err != nil {
				chartbeatError.Println("Failed to process article: ", processor.Err)
				return
			}

			processedMu.Lock()
			topArticlesProcessed = append(topArticlesProcessed, processor.Article)
			processedMu.Unlock()
		}(topArticle.Url, i)
	}

	articleBodyWait.Wait()

	// Compile the snapshot
	snapshot := mc.TopPagesSnapshot{}
	snapshot.Document = snapshotDoc
	snapshot.Articles = topArticlesProcessed
	return snapshot
}
Esempio n. 2
0
// isBlacklisted reports whether lib.IsBlacklisted considers this article's
// Url to be blacklisted.
func (a *ArticleIn) isBlacklisted() bool {
	blacklisted := lib.IsBlacklisted(a.Url)
	return blacklisted
}