Пример #1
0
import (
	"encoding/json"
	"fmt"
	"os/exec"
	"strings"
	"sync"
	"time"

	"gopkg.in/mgo.v2"

	a "github.com/michigan-com/newsfetch/fetch/article"
	"github.com/michigan-com/newsfetch/lib"
	"github.com/spf13/cobra"
)

// artDebugger is the logger created by lib.NewCondLogger with the label
// "newsfetch:commands:article" (presumably a debug namespace — confirm
// against the lib package). processSummaries writes to it via Println.
var artDebugger = lib.NewCondLogger("newsfetch:commands:article")

// SummaryResponse is the result type that processSummaries returns by
// pointer. Its fields map to the JSON keys "skipped" and "summarized".
// NOTE(review): presumably these are counts reported by the summary script —
// confirm against the code that decodes it.
type SummaryResponse struct {
	// Skipped is encoded/decoded under the JSON key "skipped".
	Skipped    int `json:"skipped"`
	// Summarized is encoded/decoded under the JSON key "summarized".
	Summarized int `json:"summarized"`
}

func processSummaries() (*SummaryResponse, error) {
	artDebugger.Println("Sending request to brevity to process summaries")

	if globalConfig.SummaryVENV == "" {
		return nil, fmt.Errorf("Missing SUMMARY_VENV environtment variable, skipping summarizer")
	}

	cmd := fmt.Sprintf("%s/bin/python", globalConfig.SummaryVENV)
	pyScript := fmt.Sprintf("%s/bin/summary.py", globalConfig.SummaryVENV)
Пример #2
0
import (
	"errors"
	"fmt"
	"net/http"
	"net/url"
	"strings"

	"gopkg.in/mgo.v2"

	"github.com/michigan-com/newsfetch/lib"
	m "github.com/michigan-com/newsfetch/model/chartbeat"
)

// chartbeatApiUrlFormat is a fmt-style template for Chartbeat API URLs. It
// takes three string arguments, in order: the path segment placed after the
// host, the value of the "apikey" query parameter, and the value of the
// "host" query parameter. The "limit" query parameter is fixed at 100.
const chartbeatApiUrlFormat = "http://api.chartbeat.com/%s/?apikey=%s&host=%s&limit=100"

// chartbeatDebugger and chartbeatError are two separate loggers created by
// lib.NewCondLogger; they differ only in the label passed in, the second one
// carrying an ":error" suffix.
var chartbeatDebugger = lib.NewCondLogger("newsfetch:fetch:chartbeat")
var chartbeatError = lib.NewCondLogger("newsfetch:fetch:chartbeat:error")

// ChartbeatFetch is implemented by types that can produce a chartbeat
// model Snapshot.
type ChartbeatFetch interface {
	// Fetch takes a slice of strings and a mgo session and returns a
	// Snapshot. NOTE(review): the slice presumably holds the URLs or hosts
	// to query — confirm against the implementations.
	Fetch([]string, *mgo.Session) m.Snapshot
}

// Types

// Beat is the interface the command package calls into to run a Chartbeat
// job.
type Beat interface {
	// Run receives a mgo session, two strings and a slice of strings, and
	// returns nothing. NOTE(review): the meaning of the two string
	// arguments and of the slice is not visible here — confirm against the
	// implementations before relying on their order.
	Run(*mgo.Session, string, string, []string)
}

// ChartbeatApi to be used in the chartbeat package
type ChartbeatApi struct {
	Url           ChartbeatUrl
Пример #3
0
package model

import (
	"fmt"
	"time"

	"github.com/michigan-com/newsfetch/lib"
	"gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"
)

// Debugger is the exported logger created by lib.NewCondLogger with the
// label "newsfetch:model:article".
var Debugger = lib.NewCondLogger("newsfetch:model:article")

// articleIdIndex describes a unique index on the single key "article_id",
// the bson name of Article.ArticleId. Where the index is applied to a
// collection is not visible in this excerpt.
var articleIdIndex = mgo.Index{
	Key:    []string{"article_id"},
	Unique: true,
}

type Article struct {
	Id          bson.ObjectId  `bson:"_id,omitempty" json:"_id"`
	ArticleId   int            `bson:"article_id" json:"article_id"`
	Headline    string         `bson:"headline" json:"headline`
	Subheadline string         `bson:"subheadline" json:"subheadline"`
	Section     string         `bson:"section" json:"section"`
	Subsection  string         `bson:"subsection" json:"subsection"`
	Sections    []string       `bson:"sections" json"sections"`
	Source      string         `bson:"source" json:"source"`
	Created_at  time.Time      `bson:"created_at" json:"created_at"`
	Updated_at  time.Time      `bson:"updated_at" json:"updated_at"`
	Timestamp   time.Time      `bson:"timestamp" json:"timestamp"`
	Url         string         `bson:"url" json:"url"`
Пример #4
0
package model

import (
	"gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"

	"github.com/michigan-com/newsfetch/lib"
)

// debugger is the package-private logger created by lib.NewCondLogger with
// the label "newsfetch:model:chartbeat".
var debugger = lib.NewCondLogger("newsfetch:model:chartbeat")

// Snapshot is implemented by values that can save themselves given a mgo
// session.
type Snapshot interface {
	// Save takes the mgo session to use and returns nothing, so any failure
	// has to be handled inside the implementation.
	Save(session *mgo.Session)
}

func removeOldSnapshots(col *mgo.Collection) {
	var snapshot = bson.M{
		"_id": -1,
	}
	// Remove old snapshots
	col.Find(bson.M{}).
		Select(bson.M{"_id": 1}).
		Sort("-_id").
		One(&snapshot)

	_, err := col.RemoveAll(bson.M{
		"_id": bson.M{
			"$ne": snapshot["_id"],
		},
	})
Пример #5
0
package model

import (
	"github.com/michigan-com/newsfetch/lib"
)

// debugger is the package-private logger created by lib.NewCondLogger with
// the label "newsfetch:model".
var debugger = lib.NewCondLogger("newsfetch:model")
Пример #6
0
import (
	"encoding/json"
	"fmt"
	"net/http"
	"strconv"
	"time"

	"gopkg.in/mgo.v2/bson"

	r "github.com/michigan-com/newsfetch/fetch/recipe"
	"github.com/michigan-com/newsfetch/lib"
	m "github.com/michigan-com/newsfetch/model"
	"github.com/spf13/cobra"
)

// recipeDebugger is the logger created by lib.NewCondLogger with the label
// "newsfetch:commands:recipes".
var recipeDebugger = lib.NewCondLogger("newsfetch:commands:recipes")

// printRecipies writes one line per article to lib.Logger, laid out as
// "source/section/subsection - headline - url". Articles are printed in
// slice order; an empty or nil slice prints nothing.
func printRecipies(articles []*m.Article) {
	const lineFormat = "%s/%s/%s - %s - %s\n"

	for i := range articles {
		entry := articles[i]
		lib.Logger.Printf(lineFormat, entry.Source, entry.Section, entry.Subsection, entry.Headline, entry.Url)
	}
}

// cmdRecipes is the cobra command named "recipes". This literal sets only
// Use and Short and defines no Run function.
// NOTE(review): presumably it exists as a parent for subcommands such as
// cmdReprocessRecipies — confirm where the commands are wired together.
var cmdRecipes = &cobra.Command{
	Use:   "recipes",
	Short: "Command for Gannett recipe articles",
}

var cmdReprocessRecipies = &cobra.Command{
	Use:   "reprocess-all",
	Short: "Re-extract recipes from all articles saved in Mongo",
Пример #7
0
package commands

import (
	"strings"
	"sync"
	"time"

	f "github.com/michigan-com/newsfetch/fetch/chartbeat"
	"github.com/michigan-com/newsfetch/lib"
	"github.com/spf13/cobra"
	"gopkg.in/mgo.v2"
)

// chartbeatDebugger is the logger created by lib.NewCondLogger with the
// label "newsfetch:commands:chartbeat".
var chartbeatDebugger = lib.NewCondLogger("newsfetch:commands:chartbeat")

// cmdChartbeat is the cobra command named "chartbeat". This literal sets
// only Use and Short and defines no Run function.
// NOTE(review): presumably it is the parent of cmdAllBeats and similar
// subcommands — confirm where the commands are wired together.
var cmdChartbeat = &cobra.Command{
	Use:   "chartbeat",
	Short: "Hit the Chartbeat API",
}

var cmdAllBeats = &cobra.Command{
	Use:   "all",
	Short: "Fetch all Chartbeat Beats (toppages, quickstats)",
	Run: func(cmd *cobra.Command, argv []string) {
		RunChartbeatCommands([]f.Beat{
			f.TopPagesApi,
			f.QuickStatsApi,
			f.TopGeoApi,
			f.ReferrersApi,
			f.RecentApi,
			f.TrafficSeriesApi,
Пример #8
0
	"github.com/spf13/cobra"
)

// Package-level state for the command package.
//
// VERSION and COMMITHASH are assigned by Execute from its arguments, and
// NewsfetchCmd is the cobra command (Use: "newsfetch") that Execute runs.
// timeLogger is a logger created by lib.NewCondLogger with the label "timer".
// url is initialised to a fixed freep.com article address.
// NOTE(review): where the remaining variables are read or written is not
// visible in this excerpt; they are presumably flag targets — confirm
// against AddFlags.
var (
	articleUrl   string
	siteStr      string
	sectionStr   string
	title        string
	includeTitle bool
	noprompt     bool
	startTime    time.Time
	VERSION      string
	COMMITHASH   string
	loop         int
	noUpdate     bool
	timeLogger   = lib.NewCondLogger("timer")
	NewsfetchCmd = &cobra.Command{Use: "newsfetch"}
	url          = "http://www.freep.com/story/news/local/michigan/2015/08/06/farid-fata-cancer-sentencing/31213475/"
	//w            = new(tabwriter.Writer)
)

// Execute records the build identity and then runs the root command.
//
// ver and commit are stored in the package-level VERSION and COMMITHASH
// variables. After that it calls loadConfig, AddCommands and AddFlags, in
// that order, and finally executes NewsfetchCmd.
func Execute(ver, commit string) {
	VERSION, COMMITHASH = ver, commit

	// Setup steps; they run before the root command is executed.
	loadConfig()
	AddCommands()
	AddFlags()

	// NOTE(review): whatever NewsfetchCmd.Execute returns is discarded here
	// (cobra documents an error return) — confirm that is intended.
	NewsfetchCmd.Execute()
}
Пример #9
0
package fetch

import (
	"errors"
	"fmt"
	"strings"

	"github.com/michigan-com/newsfetch/extraction"
	"github.com/michigan-com/newsfetch/lib"
	m "github.com/michigan-com/newsfetch/model"
)

// artDebugger is the logger created by lib.NewCondLogger with the label
// "newsfetch:fetch:article".
var artDebugger = lib.NewCondLogger("newsfetch:fetch:article")

// ArticleProcess is the processor object that carries the Article to be
// saved together with its extracted body text.
//
// It embeds *m.Article and *m.ExtractedBody, so both pointers are nil on a
// zero-value ArticleProcess. ParseArticleAtURL returns a pointer to this type.
type ArticleProcess struct {
	*m.Article
	*m.ExtractedBody
	// Html is a string field. NOTE(review): presumably the raw page markup
	// the body was extracted from — confirm against ParseArticleAtURL.
	Html string
	// Err is the error associated with this processing run; String prints it.
	Err  error
}

// String renders the processor for logging: the embedded Article, the
// embedded ExtractedBody and the Err field, wrapped in angle brackets and
// terminated by a newline.
func (p *ArticleProcess) String() string {
	const layout = "<ArticleProcess %s\n %s\n Error: %v>\n"

	return fmt.Sprintf(layout, p.Article, p.ExtractedBody, p.Err)
}

// Primary entry point to process an article's json
// based on the article Url
func ParseArticleAtURL(articleUrl string, runExtraction bool) *ArticleProcess {
	processor := &ArticleProcess{}
Пример #10
0
import (
	"fmt"
	"strconv"
	"strings"
	"time"

	gq "github.com/PuerkitoBio/goquery"
	"github.com/michigan-com/newsfetch/extraction/classify"
	"github.com/michigan-com/newsfetch/extraction/dateline"
	"github.com/michigan-com/newsfetch/extraction/recipe_parsing"
	"github.com/michigan-com/newsfetch/lib"
	m "github.com/michigan-com/newsfetch/model"
)

// Debugger is the exported logger created by lib.NewCondLogger with the
// label "newsfetch:extraction:body_parsing".
var Debugger = lib.NewCondLogger("newsfetch:extraction:body_parsing")

// withoutEmptyStrings returns a new slice holding every non-empty element of
// the input, in the original order. The input is not modified. The result is
// never nil and is allocated with capacity equal to the input length.
//
// Note: the parameter is named like the imported "strings" package and
// shadows it inside this function.
func withoutEmptyStrings(strings []string) []string {
	kept := make([]string, 0, len(strings))
	for i := range strings {
		if len(strings[i]) == 0 {
			continue
		}
		kept = append(kept, strings[i])
	}
	return kept
}

func ExtractBodyFromDocument(doc *gq.Document, fromJSON bool, includeTitle bool) *m.ExtractedBody {
	msg := new(m.Messages)

	var paragraphs *gq.Selection
Пример #11
0
package fetch

import (
	"fmt"

	"github.com/michigan-com/newsfetch/extraction"
	"github.com/michigan-com/newsfetch/lib"
	m "github.com/michigan-com/newsfetch/model"
)

// recipeDebugger is the logger created by lib.NewCondLogger with the label
// "newsfetch:fetch:recipe".
var recipeDebugger = lib.NewCondLogger("newsfetch:fetch:recipe")

// DownloadAndSaveRecipesForArticles calls DownloadAndSaveRecipesForArticle
// for each article in slice order, passing mongoUrl through unchanged.
//
// It stops at the first failure and returns that error, leaving the
// remaining articles unprocessed. It returns nil when every call succeeds or
// when the slice is empty.
func DownloadAndSaveRecipesForArticles(mongoUrl string, articles []*m.Article) error {
	for i := range articles {
		if err := DownloadAndSaveRecipesForArticle(mongoUrl, articles[i]); err != nil {
			return err
		}
	}
	return nil
}

func DownloadAndSaveRecipesForArticle(mongoUrl string, article *m.Article) error {
	recipes := DownloadRecipesForArticle(article)

	if mongoUrl != "" {
		err := SaveRecipes(mongoUrl, recipes)
		return err
	} else {
		return nil
	}