// DownloadRecipesFromUrls processes the given article URLs as a work queue
// and collects the recipes extracted from each of them.
//
// Every URL is handled at most once. A URL for which lib.GetArticleId returns
// a value below 1 is skipped and does not appear in the result. For each
// remaining URL the extracted recipes are tagged with the article ID and
// appended to the result's Recipes, and any embedded article URLs reported by
// the extractor are queued so they get processed too.
//
// In the returned result, URLs lists every processed URL, and
// URLsWithoutRecipes lists the ones that yielded neither recipes nor embedded
// article URLs.
func DownloadRecipesFromUrls(urls []string) DownloadRecipesResult {
	result := DownloadRecipesResult{}
	result.URLs = make([]string, 0)
	result.URLsWithoutRecipes = make([]string, 0)

	// Work on a private copy of the queue: appending to a re-slice of the
	// caller's slice could write into spare capacity of its backing array.
	queue := make([]string, len(urls))
	copy(queue, urls)

	visited := make(map[string]bool)
	for len(queue) > 0 {
		articleURL := queue[0]
		queue = queue[1:]

		// Embedded links may point back at articles already seen.
		if visited[articleURL] {
			continue
		}
		visited[articleURL] = true

		articleID := lib.GetArticleId(articleURL)
		if articleID < 1 {
			recipeDebugger.Println("Skipped, cannot determine article ID")
			continue
		}

		extracted := extraction.ExtractDataFromHTMLAtURL(articleURL, false)
		recipesInArticle := extracted.RecipeData.Recipes
		embeddedURLs := extracted.RecipeData.EmbeddedArticleUrls

		// Assign through the index so the ID is stored in the slice element
		// itself; assigning to a range value variable would only update a
		// copy if the elements are structs rather than pointers.
		for i := range recipesInArticle {
			recipesInArticle[i].ArticleId = articleID
		}

		if recipeDebugger.IsEnabled() {
			fmt.Printf("Found %d recipes + %d links in %s\n", len(recipesInArticle), len(embeddedURLs), articleURL)
		}

		// A page counts as "without recipes" only when it also links to no
		// further articles.
		if len(recipesInArticle) == 0 && len(embeddedURLs) == 0 {
			result.URLsWithoutRecipes = append(result.URLsWithoutRecipes, articleURL)
		}
		result.URLs = append(result.URLs, articleURL)

		result.Recipes = append(result.Recipes, recipesInArticle...)
		queue = append(queue, embeddedURLs...)
	}

	return result
}
Esempio n. 2
0
package commands

import (
	"strings"
	"time"

	"github.com/michigan-com/newsfetch/extraction"
	"github.com/michigan-com/newsfetch/lib"
	"github.com/spf13/cobra"
)

// cmdBody is the "body" subcommand. It runs the extractor on an article URL
// and logs the extracted text with every newline doubled, then reports the
// elapsed time via getElapsedTime.
var cmdBody = &cobra.Command{
	Use:   "body",
	Short: "Get article body content from Gannett URL",
	Run: func(_ *cobra.Command, args []string) {
		startTime = time.Now()

		// A non-empty first positional argument replaces the package-level
		// articleUrl; otherwise its current value is used as is.
		if len(args) != 0 {
			if first := args[0]; first != "" {
				articleUrl = first
			}
		}

		text := extraction.ExtractDataFromHTMLAtURL(articleUrl, includeTitle).Text

		// Turn each single line break into a blank line between lines.
		lib.Logger.Println(strings.Replace(text, "\n", "\n\n", -1))

		getElapsedTime(&startTime)
	},
}