Example no. 1
0
// processArticle fetches and parses the article at articleUrl, then
// persists it to Mongo when a MongoUrl is configured. It returns true
// only when the save reported the article as newly inserted; parse
// failures (and unconfigured Mongo) return false.
func processArticle(articleUrl string, session *mgo.Session) bool {
	artDebugger.Println("article url ", articleUrl)

	processor := a.ParseArticleAtURL(articleUrl, true)
	if processor.Err != nil {
		artDebugger.Println("Failed to process article: ", processor.Err)
		return false
	}

	isNew := false
	if globalConfig.MongoUrl != "" {
		if session == nil {
			// Caller gave us no session; open our own and make sure it
			// is closed when this function returns.
			session = lib.DBConnect(globalConfig.MongoUrl)
			defer lib.DBClose(session)
		}

		artDebugger.Println("Attempting to save article: ", processor.Article)

		var saveErr error
		isNew, saveErr = processor.Article.Save(session)
		if saveErr != nil {
			// Log-and-continue: a failed save still falls through to the
			// final debug print, matching the original behavior.
			lib.Logger.Println(saveErr)
		}
	}

	artDebugger.Println(processor.Article)
	return isNew
}
Example no. 2
0
// runTest parses the article referenced by rec (or extracts directly from
// rec.html when supplied) and verifies the extracted body text: every
// rec.expected fragment must appear, and no rec.forbidden or
// globalForbidden fragment may appear.
func runTest(t *testing.T, rec *TestRec) {
	t.Logf("Testing URL: %v", rec.url)

	var extract *m.ExtractedBody
	if rec.html == "" {
		processor := a.ParseArticleAtURL(rec.url, true)
		// Check the parse error before reading any other processor field;
		// the original read ExtractedBody first, which is pointless (and
		// misleading) when the parse failed.
		if processor.Err != nil {
			t.Fatalf("Failed to parse article: %v", processor.Err)
		}
		extract = processor.ExtractedBody

		// Use t.Logf instead of the builtin println: output is attached
		// to this test and respects -v, rather than going raw to stderr.
		t.Logf("Here's the HTML to embed for %v", rec.url)
		t.Logf("%v", processor.Html)
	} else {
		extract = extraction.ExtractDataFromHTMLString(rec.html, rec.url, false)
	}

	text := extract.Text

	if text == "" {
		t.Errorf("Body extractor returned no text.")
	} else {
		for _, s := range rec.expected {
			if !strings.Contains(text, s) {
				t.Errorf("Expected body fragment not found: %#v", s)
			}
		}
		for _, s := range rec.forbidden {
			if strings.Contains(text, s) {
				t.Errorf("Forbidden body fragment found: %#v", s)
			}
		}
		for _, s := range globalForbidden {
			if strings.Contains(text, s) {
				t.Errorf("Globally forbidden body fragment found: %#v", s)
			}
		}
		t.Logf("in body: %#v", text)
	}
}
Example no. 3
0
/*
	Fetch retrieves the Chartbeat top-pages data for each url in the urls
	parameter. Each url is expected to be of the form
	http://api.chartbeat.com/live/toppages/v3. It returns a snapshot
	containing the sorted top articles plus the fully-processed Article
	documents for (up to) the top 50 pages.
*/
func (t TopPages) Fetch(urls []string, session *mgo.Session) mc.Snapshot {
	chartbeatDebugger.Println("Fetching chartbeat top pages")
	topArticles := make([]*mc.TopArticle, 0, 100*len(urls))
	topArticlesProcessed := make([]*m.Article, 0, 100*len(urls))

	// NOTE(review): senders block once the buffer is full, and draining only
	// starts after wg.Wait() — this deadlocks if any url ever yields more
	// than 100 pages. Assumes the API caps results at 100 per url — confirm.
	articleQueue := make(chan *mc.TopArticle, 100*len(urls))

	var wg sync.WaitGroup

	for i := 0; i < len(urls); i++ {
		wg.Add(1)

		go func(url string) {
			defer wg.Done()

			pages, err := GetTopPages(url)
			host, _ := GetHostFromParams(url)

			if err != nil {
				// BUGFIX: was Println with a format string, which printed
				// the verbs literally; Printf substitutes %s/%v properly.
				chartbeatError.Printf("Failed to json parse url %s: %v", url, err)
				return
			}

			for i := 0; i < len(pages.Pages); i++ {
				page := pages.Pages[i]
				articleUrl := page.Path
				articleId := lib.GetArticleId(articleUrl)
				article := &mc.TopArticle{}

				// this means we can't find an article ID. It's probably a section front,
				// so ignore
				if articleId < 0 || lib.IsBlacklisted(articleUrl) {
					continue
				}

				article.ArticleId = articleId
				article.Headline = page.Title
				article.Url = page.Path
				article.Sections = page.Sections
				article.Visits = page.Stats.Visits
				article.Loyalty = page.Stats.Loyalty
				article.Authors = e.ParseAuthors(page.Authors)
				article.Source = strings.Replace(host, ".com", "", -1)

				articleQueue <- article
			}
		}(urls[i])
	}

	wg.Wait()
	chartbeatDebugger.Println("Done")
	close(articleQueue)

	for topArticle := range articleQueue {
		topArticles = append(topArticles, topArticle)
	}

	chartbeatDebugger.Printf("Num article: %d", len(topArticles))
	chartbeatDebugger.Println("Done fetching and parsing URLs...")

	// The snapshot object that will be saved
	snapshotDoc := mc.TopPagesSnapshotDocument{}
	snapshotDoc.Articles = SortTopArticles(topArticles)
	snapshotDoc.Created_at = time.Now()

	// For the top 50 pages, make sure we've processed the body and generated
	// an Article{} document (and summary)
	var articleBodyWait sync.WaitGroup

	// Guards topArticlesProcessed: the goroutines below append to it
	// concurrently, which was an unsynchronized data race before.
	var processedMu sync.Mutex

	// NOTE(review): one mgo session/collection is shared across goroutines;
	// mgo usually wants session.Copy() per goroutine — confirm this session's
	// consistency mode makes concurrent use safe.
	articleCol := session.DB("").C("Article")

	numToSummarize := 50
	if len(snapshotDoc.Articles) < numToSummarize {
		numToSummarize = len(snapshotDoc.Articles)
	}

	chartbeatDebugger.Printf("Number summarizing: %d", numToSummarize)

	for i := 0; i < numToSummarize; i++ {
		topArticle := snapshotDoc.Articles[i]
		articleBodyWait.Add(1)

		// Process each article
		go func(url string, index int) {
			defer articleBodyWait.Done()

			// First, see if the article exists in the DB. if it does, don't worry about it
			article := &m.Article{}
			url = "http://" + url
			articleCol.Find(bson.M{"url": url}).One(&article)

			if article.Id.Valid() {
				return
			}

			chartbeatDebugger.Printf("Processing article %d (url %s)", index, url)

			processor := a.ParseArticleAtURL(url, true)
			if processor.Err != nil {
				chartbeatError.Println("Failed to process article: ", processor.Err)
			} else {
				processedMu.Lock()
				topArticlesProcessed = append(topArticlesProcessed, processor.Article)
				processedMu.Unlock()
			}
		}(topArticle.Url, i)
	}

	articleBodyWait.Wait()

	// Compile the snapshot
	snapshot := mc.TopPagesSnapshot{}
	snapshot.Document = snapshotDoc
	snapshot.Articles = topArticlesProcessed
	return snapshot
}