// DownloadRecipesFromUrls crawls the given URLs breadth-first, extracting
// recipe data from each article page. Embedded article links discovered
// during extraction are pushed onto the work queue and crawled too. Each
// URL is visited at most once; URLs without a determinable article ID are
// skipped, and URLs yielding neither recipes nor embedded links are
// recorded in URLsWithoutRecipes.
func DownloadRecipesFromUrls(urls []string) DownloadRecipesResult {
	result := DownloadRecipesResult{}
	result.URLs = make([]string, 0)
	result.URLsWithoutRecipes = make([]string, 0)

	visited := make(map[string]bool)
	for len(urls) > 0 {
		url := urls[0]
		urls = urls[1:]

		if visited[url] {
			continue
		}
		visited[url] = true

		articleId := lib.GetArticleId(url)
		if articleId < 1 {
			recipeDebugger.Println("Skipped, cannot determine article ID")
			continue
		}

		extracted := extraction.ExtractDataFromHTMLAtURL(url, false)

		recipesInArticle := extracted.RecipeData.Recipes

		// Index into the slice so the assignment mutates the stored
		// element; ranging by value would only update a copy and the
		// ArticleId would be silently dropped for value-typed slices.
		for i := range recipesInArticle {
			recipesInArticle[i].ArticleId = articleId
		}

		if recipeDebugger.IsEnabled() {
			fmt.Printf("Found %d recipes + %d links in %s\n", len(recipesInArticle), len(extracted.RecipeData.EmbeddedArticleUrls), url)
		}

		if len(recipesInArticle) == 0 && len(extracted.RecipeData.EmbeddedArticleUrls) == 0 {
			result.URLsWithoutRecipes = append(result.URLsWithoutRecipes, url)
		}
		result.URLs = append(result.URLs, url)

		// recipesInArticle aliases extracted.RecipeData.Recipes, so the
		// ArticleId updates above are reflected in what we append here.
		result.Recipes = append(result.Recipes, recipesInArticle...)
		urls = append(urls, extracted.RecipeData.EmbeddedArticleUrls...)
	}

	return result
}
// Example #2
// GetData downloads the document at a.Url and fills in the receiver's
// Doc, ArticleId, and Site fields from it. It returns any error from
// fetching the document; the host lookup error is intentionally ignored.
func (a *ArticleIn) GetData() error {
	artDebugger.Println("Fetching: ", a.Url)

	document, err := gq.NewDocument(a.Url)
	if err != nil {
		return err
	}

	a.Doc = document
	a.ArticleId = lib.GetArticleId(a.Url)
	a.Site, _ = lib.GetHost(a.Url)

	return nil
}
// Example #3
// Fetch retrieves the "recent articles" feed for each url concurrently,
// keeps only the articles whose URL maps to a valid article ID, and
// assembles the surviving responses into a snapshot. URLs that fail to
// download are logged and omitted from the result.
func (r Recent) Fetch(urls []string, session *mgo.Session) m.Snapshot {
	var wait sync.WaitGroup
	// Buffered to hold one response per URL so workers never block on send.
	queue := make(chan *m.RecentResp, len(urls))

	for _, url := range urls {
		wait.Add(1)

		go func(url string) {
			// Deferred so Done runs even if a downstream call panics.
			defer wait.Done()

			recent, err := GetRecents(url)
			if err != nil {
				chartbeatDebugger.Printf("Failed to get %s: %v", url, err)
				return
			}

			// Filter out entries without a valid article ID (e.g. section
			// fronts), trimming ".com" from the host as elsewhere.
			parsedArticles := make([]m.Recent, 0, 100)
			for _, article := range recent.Recents {
				articleId := lib.GetArticleId(article.Url)

				if articleId > 0 {
					article.Host = strings.Replace(article.Host, ".com", "", -1)
					parsedArticles = append(parsedArticles, article)
				}
			}

			recent.Recents = parsedArticles
			queue <- recent
		}(url)
	}

	wait.Wait()
	close(queue)

	// Drain the (now closed) channel into a slice for the snapshot.
	recents := make([]*m.RecentResp, 0, len(urls))
	for recent := range queue {
		recents = append(recents, recent)
	}

	snapshot := m.RecentSnapshot{}
	snapshot.Created_at = time.Now()
	snapshot.Recents = recents
	return snapshot
}
// Example #4
/*
	Fetch the top pages data for each url in the urls parameter. Url expected
	to be http://api.chartbeat.com/live/toppages/v3

	Stage 1 fans out one goroutine per URL to download and parse the top
	pages, funneling valid articles through articleQueue. Stage 2 summarizes
	the top pages that do not already have an Article document in Mongo.
*/
func (t TopPages) Fetch(urls []string, session *mgo.Session) mc.Snapshot {
	chartbeatDebugger.Println("Fetching chartbeat top pages")
	topArticles := make([]*mc.TopArticle, 0, 100*len(urls))
	topArticlesProcessed := make([]*m.Article, 0, 100*len(urls))
	articleQueue := make(chan *mc.TopArticle, 100*len(urls))

	var wg sync.WaitGroup

	for i := 0; i < len(urls); i++ {
		wg.Add(1)

		go func(url string) {
			// Deferred so Done runs on every exit path, including panics.
			defer wg.Done()

			pages, err := GetTopPages(url)
			host, _ := GetHostFromParams(url)

			if err != nil {
				// Printf, not Println: the message contains format verbs.
				chartbeatError.Printf("Failed to json parse url %s: %v", url, err)
				return
			}

			for i := 0; i < len(pages.Pages); i++ {
				page := pages.Pages[i]
				articleUrl := page.Path
				articleId := lib.GetArticleId(articleUrl)

				// this means we can't find an article ID. It's probably a section front,
				// so ignore. (< 1: zero is not a valid ID either, matching the
				// GetArticleId checks elsewhere in this package.)
				if articleId < 1 || lib.IsBlacklisted(articleUrl) {
					continue
				}

				article := &mc.TopArticle{}
				article.ArticleId = articleId
				article.Headline = page.Title
				article.Url = page.Path
				article.Sections = page.Sections
				article.Visits = page.Stats.Visits
				article.Loyalty = page.Stats.Loyalty
				article.Authors = e.ParseAuthors(page.Authors)
				article.Source = strings.Replace(host, ".com", "", -1)

				articleQueue <- article
			}
		}(urls[i])
	}

	wg.Wait()
	chartbeatDebugger.Println("Done")
	// All senders have finished; close so the drain loop below terminates.
	close(articleQueue)

	for topArticle := range articleQueue {
		topArticles = append(topArticles, topArticle)
	}

	chartbeatDebugger.Printf("Num article: %d", len(topArticles))
	chartbeatDebugger.Println("Done fetching and parsing URLs...")

	// The snapshot object that will be saved
	snapshotDoc := mc.TopPagesSnapshotDocument{}
	snapshotDoc.Articles = SortTopArticles(topArticles)
	snapshotDoc.Created_at = time.Now()

	// For the top 50 pages, make sure we've processed the body and generated
	// an Article{} document (and summary)
	var articleBodyWait sync.WaitGroup
	// Guards topArticlesProcessed: append from multiple goroutines without
	// synchronization is a data race and can drop results.
	var processedMu sync.Mutex
	articleCol := session.DB("").C("Article")

	numToSummarize := 50
	if len(snapshotDoc.Articles) < numToSummarize {
		numToSummarize = len(snapshotDoc.Articles)
	}

	chartbeatDebugger.Printf("Number summarizing: %d", numToSummarize)

	for i := 0; i < numToSummarize; i++ {
		topArticle := snapshotDoc.Articles[i]
		articleBodyWait.Add(1)

		// Process each article
		go func(url string, index int) {
			defer articleBodyWait.Done()

			// First, see if the article exists in the DB. if it does, don't worry about it
			article := &m.Article{}
			url = "http://" + url
			articleCol.Find(bson.M{"url": url}).One(&article)

			if article.Id.Valid() {
				return
			}

			chartbeatDebugger.Printf("Processing article %d (url %s)", index, url)

			processor := a.ParseArticleAtURL(url, true)
			if processor.Err != nil {
				chartbeatError.Println("Failed to process article: ", processor.Err)
			} else {
				processedMu.Lock()
				topArticlesProcessed = append(topArticlesProcessed, processor.Article)
				processedMu.Unlock()
			}
		}(topArticle.Url, i)
	}

	articleBodyWait.Wait()

	// Compile the snapshot
	snapshot := mc.TopPagesSnapshot{}
	snapshot.Document = snapshotDoc
	snapshot.Articles = topArticlesProcessed
	return snapshot
}