import (
	"encoding/json"
	"fmt"
	"os/exec"
	"strings"
	"sync"
	"time"

	"gopkg.in/mgo.v2"

	a "github.com/michigan-com/newsfetch/fetch/article"
	"github.com/michigan-com/newsfetch/lib"
	"github.com/spf13/cobra"
)

// artDebugger is the logger obtained from lib.NewCondLogger for the
// "newsfetch:commands:article" namespace.
var artDebugger = lib.NewCondLogger("newsfetch:commands:article")

// SummaryResponse carries two counters, mapped to the JSON keys "skipped"
// and "summarized".
// NOTE(review): presumably decoded from the summarizer script's output;
// confirm against the remainder of processSummaries.
type SummaryResponse struct {
	Skipped    int `json:"skipped"`
	Summarized int `json:"summarized"`
}

// processSummaries logs that a summary request is being sent and returns an
// error when globalConfig.SummaryVENV is empty. Otherwise it derives the
// python interpreter path and the summary.py script path from that directory.
func processSummaries() (*SummaryResponse, error) {
	artDebugger.Println("Sending request to brevity to process summaries")

	// Without a configured virtualenv directory neither path below can be built.
	if globalConfig.SummaryVENV == "" {
		return nil, fmt.Errorf("Missing SUMMARY_VENV environtment variable, skipping summarizer")
	}

	// Both the interpreter and the script are looked up under <venv>/bin.
	cmd := fmt.Sprintf("%s/bin/python", globalConfig.SummaryVENV)
	pyScript := fmt.Sprintf("%s/bin/summary.py", globalConfig.SummaryVENV)
import ( "errors" "fmt" "net/http" "net/url" "strings" "gopkg.in/mgo.v2" "github.com/michigan-com/newsfetch/lib" m "github.com/michigan-com/newsfetch/model/chartbeat" ) const chartbeatApiUrlFormat = "http://api.chartbeat.com/%s/?apikey=%s&host=%s&limit=100" var chartbeatDebugger = lib.NewCondLogger("newsfetch:fetch:chartbeat") var chartbeatError = lib.NewCondLogger("newsfetch:fetch:chartbeat:error") type ChartbeatFetch interface { Fetch([]string, *mgo.Session) m.Snapshot } // Types // Beat type to be called from command package type Beat interface { Run(*mgo.Session, string, string, []string) } // ChartbeatApi to be used in the chartbeat package type ChartbeatApi struct { Url ChartbeatUrl
package model

import (
	"fmt"
	"time"

	"github.com/michigan-com/newsfetch/lib"
	"gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"
)

// Debugger is the logger obtained from lib.NewCondLogger for the
// "newsfetch:model:article" namespace.
var Debugger = lib.NewCondLogger("newsfetch:model:article")

// articleIdIndex describes a unique index on the "article_id" key.
var articleIdIndex = mgo.Index{
	Key:    []string{"article_id"},
	Unique: true,
}

// Article is the article document; every field carries both a bson and a
// json key.
//
// The json tags on Headline and Sections were malformed (an unterminated
// quote and a missing colon), so reflect could not parse them and
// encoding/json fell back to the Go field names "Headline" and "Sections".
// They now use the same lowercase keys as the other fields.
type Article struct {
	Id          bson.ObjectId `bson:"_id,omitempty" json:"_id"`
	ArticleId   int           `bson:"article_id" json:"article_id"`
	Headline    string        `bson:"headline" json:"headline"`
	Subheadline string        `bson:"subheadline" json:"subheadline"`
	Section     string        `bson:"section" json:"section"`
	Subsection  string        `bson:"subsection" json:"subsection"`
	Sections    []string      `bson:"sections" json:"sections"`
	Source      string        `bson:"source" json:"source"`
	Created_at  time.Time     `bson:"created_at" json:"created_at"`
	Updated_at  time.Time     `bson:"updated_at" json:"updated_at"`
	Timestamp   time.Time     `bson:"timestamp" json:"timestamp"`
	Url         string        `bson:"url" json:"url"`
package model import ( "gopkg.in/mgo.v2" "gopkg.in/mgo.v2/bson" "github.com/michigan-com/newsfetch/lib" ) var debugger = lib.NewCondLogger("newsfetch:model:chartbeat") type Snapshot interface { Save(session *mgo.Session) } func removeOldSnapshots(col *mgo.Collection) { var snapshot = bson.M{ "_id": -1, } // Remove old snapshots col.Find(bson.M{}). Select(bson.M{"_id": 1}). Sort("-_id"). One(&snapshot) _, err := col.RemoveAll(bson.M{ "_id": bson.M{ "$ne": snapshot["_id"], }, })
package model

import (
	"github.com/michigan-com/newsfetch/lib"
)

// debugger is the package-level logger obtained from lib.NewCondLogger for
// the "newsfetch:model" namespace.
var debugger = lib.NewCondLogger("newsfetch:model")
import ( "encoding/json" "fmt" "net/http" "strconv" "time" "gopkg.in/mgo.v2/bson" r "github.com/michigan-com/newsfetch/fetch/recipe" "github.com/michigan-com/newsfetch/lib" m "github.com/michigan-com/newsfetch/model" "github.com/spf13/cobra" ) var recipeDebugger = lib.NewCondLogger("newsfetch:commands:recipes") func printRecipies(articles []*m.Article) { for _, article := range articles { lib.Logger.Printf("%s/%s/%s - %s - %s\n", article.Source, article.Section, article.Subsection, article.Headline, article.Url) } } var cmdRecipes = &cobra.Command{ Use: "recipes", Short: "Command for Gannett recipe articles", } var cmdReprocessRecipies = &cobra.Command{ Use: "reprocess-all", Short: "Re-extract recipes from all articles saved in Mongo",
package commands

import (
	"strings"
	"sync"
	"time"

	f "github.com/michigan-com/newsfetch/fetch/chartbeat"
	"github.com/michigan-com/newsfetch/lib"
	"github.com/spf13/cobra"
	"gopkg.in/mgo.v2"
)

// chartbeatDebugger is the logger obtained from lib.NewCondLogger for the
// "newsfetch:commands:chartbeat" namespace.
var chartbeatDebugger = lib.NewCondLogger("newsfetch:commands:chartbeat")

// cmdChartbeat is the "chartbeat" cobra command; it declares only usage text
// here.
var cmdChartbeat = &cobra.Command{
	Use:   "chartbeat",
	Short: "Hit the Chartbeat API",
}

// cmdAllBeats is the "all" cobra command. Its Run handler passes a list of
// beats to RunChartbeatCommands.
// NOTE(review): the Short text names only toppages and quickstats, while the
// list below contains further beats.
var cmdAllBeats = &cobra.Command{
	Use:   "all",
	Short: "Fetch all Chartbeat Beats (toppages, quickstats)",
	Run: func(cmd *cobra.Command, argv []string) {
		RunChartbeatCommands([]f.Beat{
			f.TopPagesApi,
			f.QuickStatsApi,
			f.TopGeoApi,
			f.ReferrersApi,
			f.RecentApi,
			f.TrafficSeriesApi,
"github.com/spf13/cobra" ) var ( articleUrl string siteStr string sectionStr string title string includeTitle bool noprompt bool startTime time.Time VERSION string COMMITHASH string loop int noUpdate bool timeLogger = lib.NewCondLogger("timer") NewsfetchCmd = &cobra.Command{Use: "newsfetch"} url = "http://www.freep.com/story/news/local/michigan/2015/08/06/farid-fata-cancer-sentencing/31213475/" //w = new(tabwriter.Writer) ) func Execute(ver, commit string) { VERSION = ver COMMITHASH = commit loadConfig() AddCommands() AddFlags() NewsfetchCmd.Execute() }
package fetch

import (
	"errors"
	"fmt"
	"strings"

	"github.com/michigan-com/newsfetch/extraction"
	"github.com/michigan-com/newsfetch/lib"
	m "github.com/michigan-com/newsfetch/model"
)

// artDebugger is the logger obtained from lib.NewCondLogger for the
// "newsfetch:fetch:article" namespace.
var artDebugger = lib.NewCondLogger("newsfetch:fetch:article")

// ArticleProcess is the processor object that contains the Article to be
// saved as well as the body text. It embeds *m.Article and *m.ExtractedBody
// and also holds an Html string and an Err value.
type ArticleProcess struct {
	*m.Article
	*m.ExtractedBody
	Html string
	Err  error
}

// String formats the embedded Article, the embedded ExtractedBody and Err
// into a multi-line "<ArticleProcess ...>" description.
func (p *ArticleProcess) String() string {
	return fmt.Sprintf("<ArticleProcess %s\n %s\n Error: %v>\n", p.Article, p.ExtractedBody, p.Err)
}

// ParseArticleAtURL is the primary entry point to process an article's json
// based on the article Url.
func ParseArticleAtURL(articleUrl string, runExtraction bool) *ArticleProcess {
	processor := &ArticleProcess{}
import ( "fmt" "strconv" "strings" "time" gq "github.com/PuerkitoBio/goquery" "github.com/michigan-com/newsfetch/extraction/classify" "github.com/michigan-com/newsfetch/extraction/dateline" "github.com/michigan-com/newsfetch/extraction/recipe_parsing" "github.com/michigan-com/newsfetch/lib" m "github.com/michigan-com/newsfetch/model" ) var Debugger = lib.NewCondLogger("newsfetch:extraction:body_parsing") func withoutEmptyStrings(strings []string) []string { result := make([]string, 0, len(strings)) for _, el := range strings { if el != "" { result = append(result, el) } } return result } func ExtractBodyFromDocument(doc *gq.Document, fromJSON bool, includeTitle bool) *m.ExtractedBody { msg := new(m.Messages) var paragraphs *gq.Selection
package fetch

import (
	"fmt"

	"github.com/michigan-com/newsfetch/extraction"
	"github.com/michigan-com/newsfetch/lib"
	m "github.com/michigan-com/newsfetch/model"
)

// recipeDebugger is the logger obtained from lib.NewCondLogger for the
// "newsfetch:fetch:recipe" namespace.
var recipeDebugger = lib.NewCondLogger("newsfetch:fetch:recipe")

// DownloadAndSaveRecipesForArticles calls DownloadAndSaveRecipesForArticle
// for each article in order. It stops at the first failure and returns that
// error; it returns nil when every article succeeds.
func DownloadAndSaveRecipesForArticles(mongoUrl string, articles []*m.Article) error {
	for i := range articles {
		if err := DownloadAndSaveRecipesForArticle(mongoUrl, articles[i]); err != nil {
			return err
		}
	}
	return nil
}

func DownloadAndSaveRecipesForArticle(mongoUrl string, article *m.Article) error {
	recipes := DownloadRecipesForArticle(article)

	if mongoUrl != "" {
		err := SaveRecipes(mongoUrl, recipes)
		return err
	} else {
		return nil
	}