// DownloadRecipesFromUrls crawls the given URLs breadth-first, extracting any
// recipes found and following embedded article links until the queue is empty.
func DownloadRecipesFromUrls(urls []string) DownloadRecipesResult {
	result := DownloadRecipesResult{}
	result.URLs = make([]string, 0)
	result.URLsWithoutRecipes = make([]string, 0)
	visited := make(map[string]bool)

	for len(urls) > 0 {
		// Pop the next URL off the queue and skip it if we've already seen it
		url := urls[0]
		urls = urls[1:]

		if visited[url] {
			continue
		}
		visited[url] = true

		articleId := lib.GetArticleId(url)
		if articleId < 1 {
			recipeDebugger.Println("Skipped, cannot determine article ID")
			continue
		}

		extracted := extraction.ExtractDataFromHTMLAtURL(url, false)
		recipesInArticle := extracted.RecipeData.Recipes

		// Assign by index so the update sticks even if Recipes holds values
		// rather than pointers
		for i := range recipesInArticle {
			recipesInArticle[i].ArticleId = articleId
		}

		if recipeDebugger.IsEnabled() {
			fmt.Printf(
				"Found %d recipes + %d links in %s\n",
				len(recipesInArticle),
				len(extracted.RecipeData.EmbeddedArticleUrls),
				url,
			)
		}

		if len(recipesInArticle) == 0 && len(extracted.RecipeData.EmbeddedArticleUrls) == 0 {
			result.URLsWithoutRecipes = append(result.URLsWithoutRecipes, url)
		}

		result.URLs = append(result.URLs, url)
		result.Recipes = append(result.Recipes, extracted.RecipeData.Recipes...)

		// Queue up any embedded article links for later iterations
		urls = append(urls, extracted.RecipeData.EmbeddedArticleUrls...)
	}

	return result
}
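// Usage sketch for DownloadRecipesFromUrls; the seed URL below is illustrative.
// It feeds a starting list into the breadth-first crawl above and reports what
// came back via the result's URLs, Recipes and URLsWithoutRecipes fields.
func downloadRecipesExample() {
	seeds := []string{
		"http://example.com/life/food/2016/01/01/some-recipe-article/",
	}

	result := DownloadRecipesFromUrls(seeds)

	fmt.Printf("Visited %d urls, found %d recipes, %d urls had no recipes\n",
		len(result.URLs), len(result.Recipes), len(result.URLsWithoutRecipes))
}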
// GetData fetches the article's HTML document and fills in the derived
// fields (Site, Doc, ArticleId) on the receiver.
func (a *ArticleIn) GetData() error {
	artDebugger.Println("Fetching: ", a.Url)

	doc, err := gq.NewDocument(a.Url)
	if err != nil {
		return err
	}

	a.Site, _ = lib.GetHost(a.Url)
	a.Doc = doc
	a.ArticleId = lib.GetArticleId(a.Url)

	return nil
}
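// Usage sketch for GetData; the URL is illustrative. After a successful call
// the receiver's Site, Doc and ArticleId fields are populated.
func getDataExample() error {
	article := &ArticleIn{Url: "http://example.com/story/news/2016/01/01/some-article/"}

	if err := article.GetData(); err != nil {
		return err
	}

	artDebugger.Println("Fetched article", article.ArticleId, "from", article.Site)
	return nil
}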
func (r Recent) Fetch(urls []string, session *mgo.Session) m.Snapshot {
	var wait sync.WaitGroup
	queue := make(chan *m.RecentResp, len(urls))

	// Fan out: fetch each url concurrently and push the parsed response onto
	// the buffered queue
	for _, url := range urls {
		wait.Add(1)

		go func(url string) {
			defer wait.Done()

			recent, err := GetRecents(url)
			if err != nil {
				chartbeatDebugger.Printf("Failed to get %s: %v", url, err)
				return
			}

			// Keep only the articles we can resolve to an article ID
			parsedArticles := make([]m.Recent, 0, 100)
			for _, article := range recent.Recents {
				articleId := lib.GetArticleId(article.Url)
				if articleId > 0 {
					article.Host = strings.Replace(article.Host, ".com", "", -1)
					parsedArticles = append(parsedArticles, article)
				}
			}

			recent.Recents = parsedArticles
			queue <- recent
		}(url)
	}

	wait.Wait()
	close(queue)

	// Fan in: drain the queue into a slice once all workers have finished
	recents := make([]*m.RecentResp, 0, len(urls))
	for recent := range queue {
		recents = append(recents, recent)
	}

	snapshot := m.RecentSnapshot{}
	snapshot.Created_at = time.Now()
	snapshot.Recents = recents

	return snapshot
}
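// Recent.Fetch and TopPages.Fetch share the same fan-out/fan-in shape: one
// goroutine per URL, results pushed onto a channel buffered to the number of
// producers, then a drain after Wait/close. Below is a stripped-down sketch of
// that pattern; fetchResult and fetchOne are hypothetical names used only to
// illustrate the shape, not part of the existing code.
type fetchResult struct {
	Url  string
	Body string
}

func fetchAll(urls []string, fetchOne func(string) (fetchResult, error)) []fetchResult {
	var wait sync.WaitGroup
	queue := make(chan fetchResult, len(urls)) // buffered so workers never block on send

	for _, url := range urls {
		wait.Add(1)
		go func(url string) {
			defer wait.Done()
			if res, err := fetchOne(url); err == nil {
				queue <- res
			}
		}(url)
	}

	wait.Wait()
	close(queue) // safe to close: every send has completed

	results := make([]fetchResult, 0, len(urls))
	for res := range queue {
		results = append(results, res)
	}
	return results
}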
/*
	Fetch the top pages data for each url in the urls parameter. Url expected
	to be http://api.chartbeat.com/live/toppages/v3
*/
func (t TopPages) Fetch(urls []string, session *mgo.Session) mc.Snapshot {
	chartbeatDebugger.Println("Fetching chartbeat top pages")

	topArticles := make([]*mc.TopArticle, 0, 100*len(urls))
	topArticlesProcessed := make([]*m.Article, 0, 100*len(urls))
	articleQueue := make(chan *mc.TopArticle, 100*len(urls))

	var wg sync.WaitGroup

	for i := 0; i < len(urls); i++ {
		wg.Add(1)

		go func(url string) {
			defer wg.Done()

			pages, err := GetTopPages(url)
			host, _ := GetHostFromParams(url)

			if err != nil {
				chartbeatError.Printf("Failed to json parse url %s: %v", url, err)
				return
			}

			for i := 0; i < len(pages.Pages); i++ {
				page := pages.Pages[i]
				articleUrl := page.Path
				articleId := lib.GetArticleId(articleUrl)

				// A missing article ID usually means a section front, so skip it
				if articleId < 0 || lib.IsBlacklisted(articleUrl) {
					continue
				}

				article := &mc.TopArticle{}
				article.ArticleId = articleId
				article.Headline = page.Title
				article.Url = page.Path
				article.Sections = page.Sections
				article.Visits = page.Stats.Visits
				article.Loyalty = page.Stats.Loyalty
				article.Authors = e.ParseAuthors(page.Authors)
				article.Source = strings.Replace(host, ".com", "", -1)

				articleQueue <- article
			}
		}(urls[i])
	}

	wg.Wait()
	chartbeatDebugger.Println("Done")
	close(articleQueue)

	for topArticle := range articleQueue {
		topArticles = append(topArticles, topArticle)
	}

	chartbeatDebugger.Printf("Num article: %d", len(topArticles))
	chartbeatDebugger.Println("Done fetching and parsing URLs...")

	// The snapshot object that will be saved
	snapshotDoc := mc.TopPagesSnapshotDocument{}
	snapshotDoc.Articles = SortTopArticles(topArticles)
	snapshotDoc.Created_at = time.Now()

	// For the top 50 pages, make sure we've processed the body and generated
	// an Article{} document (and summary)
	var articleBodyWait sync.WaitGroup
	// Guards topArticlesProcessed, which is appended to from multiple goroutines
	var processedMutex sync.Mutex
	articleCol := session.DB("").C("Article")

	numToSummarize := 50
	if len(snapshotDoc.Articles) < numToSummarize {
		numToSummarize = len(snapshotDoc.Articles)
	}

	chartbeatDebugger.Printf("Number summarizing: %d", numToSummarize)

	for i := 0; i < numToSummarize; i++ {
		topArticle := snapshotDoc.Articles[i]
		articleBodyWait.Add(1)

		// Process each article
		go func(url string, index int) {
			defer articleBodyWait.Done()

			// First, see if the article exists in the DB. If it does, don't worry about it
			article := &m.Article{}
			url = "http://" + url
			articleCol.Find(bson.M{"url": url}).One(article)

			if article.Id.Valid() {
				return
			}

			chartbeatDebugger.Printf("Processing article %d (url %s)", index, url)

			processor := a.ParseArticleAtURL(url, true)
			if processor.Err != nil {
				chartbeatError.Println("Failed to process article: ", processor.Err)
				return
			}

			processedMutex.Lock()
			topArticlesProcessed = append(topArticlesProcessed, processor.Article)
			processedMutex.Unlock()
		}(topArticle.Url, i)
	}

	articleBodyWait.Wait()

	// Compile the snapshot
	snapshot := mc.TopPagesSnapshot{}
	snapshot.Document = snapshotDoc
	snapshot.Articles = topArticlesProcessed

	return snapshot
}
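// Usage sketch for TopPages.Fetch; the Mongo dial string, API key and host are
// placeholders, and the database name is assumed to live in the dial string
// since Fetch calls session.DB("").
func fetchTopPagesExample() error {
	session, err := mgo.Dial("localhost/articles")
	if err != nil {
		return err
	}
	defer session.Close()

	urls := []string{
		"http://api.chartbeat.com/live/toppages/v3/?apikey=APIKEY&host=example.com&limit=100",
	}

	snapshot := TopPages{}.Fetch(urls, session)
	_ = snapshot // persist or inspect the snapshot here
	return nil
}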