func DownloadRecipesFromUrls(urls []string) DownloadRecipesResult { result := DownloadRecipesResult{} result.URLs = make([]string, 0) result.URLsWithoutRecipes = make([]string, 0) visited := make(map[string]bool) for len(urls) > 0 { url := urls[0] urls = urls[1:] if visited[url] { continue } visited[url] = true articleId := lib.GetArticleId(url) if articleId < 1 { recipeDebugger.Println("Skipped, cannot determine article ID") continue } extracted := extraction.ExtractDataFromHTMLAtURL(url, false) recipesInArticle := extracted.RecipeData.Recipes for _, recipe := range recipesInArticle { recipe.ArticleId = articleId } if recipeDebugger.IsEnabled() { fmt.Printf("Found %d recipes + %d links in %s\n", len(recipesInArticle), len(extracted.RecipeData.EmbeddedArticleUrls), url) } if len(recipesInArticle) == 0 && len(extracted.RecipeData.EmbeddedArticleUrls) == 0 { result.URLsWithoutRecipes = append(result.URLsWithoutRecipes, url) } result.URLs = append(result.URLs, url) if false { for i, recipe := range result.Recipes { recipeDebugger.Println() recipeDebugger.Println("Recipe ", i, "=", recipe.String()) recipeDebugger.Println() } } result.Recipes = append(result.Recipes, extracted.RecipeData.Recipes...) urls = append(urls, extracted.RecipeData.EmbeddedArticleUrls...) } return result }
package commands

import (
	"strings"
	"time"

	"github.com/michigan-com/newsfetch/extraction"
	"github.com/michigan-com/newsfetch/lib"
	"github.com/spf13/cobra"
)

// cmdBody is the `body` subcommand: it fetches a Gannett article page and
// prints its extracted body text, with paragraphs separated by blank lines.
var cmdBody = &cobra.Command{
	Use:   "body",
	Short: "Get article body content from Gannett URL",
	Run: func(cmd *cobra.Command, args []string) {
		startTime = time.Now()

		// A URL passed as the first positional argument overrides the
		// package-level articleUrl flag value.
		if len(args) > 0 && args[0] != "" {
			articleUrl = args[0]
		}

		extracted := extraction.ExtractDataFromHTMLAtURL(articleUrl, includeTitle)

		// Double every newline so single line breaks render as
		// blank-line-separated paragraphs.
		bodyFmt := strings.Replace(extracted.Text, "\n", "\n\n", -1)
		lib.Logger.Println(bodyFmt)

		getElapsedTime(&startTime)
	},
}