Example #1
func indexSpecifiedMember(res *wcg.Response, req *wcg.Request, app *App) {
	var appCtx = lib.NewAppContextFromRequest(req)
	member, ok := app.Members[req.Param("member")]
	if !ok {
		lib.NotFound(res, req)
		return
	}

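	// "n" caps how many entry-list pages are fetched; 0 falls back to wcg.ParseIntMax,
	// leaving the break conditions below to stop the loop.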
	num := wcg.ParseInt(req.Param("n"), 0, 0, wcg.ParseIntMax)
	if num == 0 {
		num = wcg.ParseIntMax
	}
	// Crawling
	crawler := ameblo.NewCrawler(appCtx.NewHttpClient())
	prefix := strings.TrimSuffix(member.BlogUrl, ".html") // xxxx.html => xxxx-{num}.html
	entries := make([]*ameblo.AmebloEntry, 0)
	for i := 1; i < num; i++ {
		url := fmt.Sprintf("%s-%d.html", prefix, i)
		req.Logger.Info("Indexing from %s ... ", url)
		newentries, err := crawler.CrawlEntryList(url)
		if err != nil {
			lib.InternalError(res, req, err)
			return
		}
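		// Each entry-list page appears to hold at most 20 entries: more than that is treated as a
		// crawler error, an empty page ends the loop, and a short page is appended as the last page.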
		if len(newentries) > 20 {
			panic(fmt.Errorf("unexpected number of entries (%d) returned during indexing", len(newentries)))
		}
		if len(newentries) == 0 {
			break
		}
		if len(newentries) < 20 {
			entries = append(entries, newentries...)
			break
		}
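		// A full page whose last entry matches the last one already collected suggests the crawler
		// is seeing the same list again, so stop instead of appending duplicates.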
		if len(entries) > 0 && entries[len(entries)-1].Url == newentries[len(newentries)-1].Url {
			break
		}
		entries = append(entries, newentries...)
	}

	// Save and return results
	results := make([]string, 0)
	for _, ent := range entries {
		ent.Owner = member.Name
		results = append(results, ent.Url)
	}
	if err := updateIndexes(appCtx, entries); err != nil {
		req.Logger.Error("Failed to update the entry: %v", err)
		lib.InternalError(res, req, err) // stop here; skip cache invalidation and the JSON response.
	} else {
		time.Sleep(10 * time.Second) // TODO: wait until all indexes are updated in the datastore.
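		// Invalidate the cached history for this member so the next read reflects the new indexes.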
		mc := appCtx.NewMemcacheDriver()
		mckey := fmt.Sprintf(MC_KEY_HISTORY, app.Key, member.Name)
		mc.Delete(mckey)
		res.WriteJson(results)
	}
}
Example #2
func setupApi(app *App) {
	app.Api.Get("/ameblo/insights/:member/history.json", func(res *wcg.Response, req *wcg.Request) {
		historyInsights(res, req, app)
	})
	app.Api.Get("/ameblo/indexes/", lib.Admin.Required(func(res *wcg.Response, req *wcg.Request) {
		indexAllMembers(res, req, app)
	}))
	app.Api.Get("/ameblo/indexes/:member.json", lib.Admin.Required(func(res *wcg.Response, req *wcg.Request) {
		indexSpecifiedMember(res, req, app)
	}))
	app.Api.Get("/ameblo/contents/", lib.Admin.Required(func(res *wcg.Response, req *wcg.Request) {
		crawlAllMembers(res, req, app)
	}))
	app.Api.Get("/ameblo/contents/:member.json", lib.Admin.Required(func(res *wcg.Response, req *wcg.Request) {
		crawlSpecifiedMembers(res, req, app)
	}))
	app.Api.Delete("/ameblo/contents/:member.json", lib.Admin.Required(
		func(res *wcg.Response, req *wcg.Request) {
			var appCtx = lib.NewAppContextFromRequest(req)

			member, ok := app.Members[req.Param("member")]
			if !ok {
				lib.NotFound(res, req)
				return
			}

			PER_ENT := 100
			offset := 0
			d := NewAmebloEntryDriver(appCtx)

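			// Reset CrawledAt on the member's entries in batches of PER_ENT so the content
			// crawler will pick them up again (it only queries entries with a zero CrawledAt).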
			for {
				var list []*ameblo.AmebloEntry
				var q = d.NewQuery().Filter("Owner =", member.Name).Offset(offset).Limit(PER_ENT)
				if keys, err := q.GetAll(&list); err != nil {
					lib.InternalError(res, req, err)
					return
				} else {
					for _, ent := range list {
						ent.CrawledAt = time.Time{}
					}
					if _, err = d.PutMulti(keys, list); datastore.IsDatastoreError(err) {
						lib.InternalError(res, req, err)
						return
					}
					offset += len(list)
					if len(list) < PER_ENT {
						break
					}
				}
			}
			app.Api.Ok(res, req)
		},
	))
}
Example #3
func crawl(res *wcg.Response, req *wcg.Request, member *ameblo.Member, app *App) {
	var appCtx = lib.NewAppContextFromRequest(req)
	var logger = appCtx.Logger
	var targets []*ameblo.AmebloEntry
	result := make([]string, 0)

	d := NewAmebloEntryDriver(appCtx)
	crawler := ameblo.NewCrawler(appCtx.NewHttpClient())

	// Prioritize entries that have not been crawled yet and were posted recently.
	q := d.NewQuery().Filter("CrawledAt =", time.Time{}).Order("PostAt").Limit(NUM_ENTRIES_TO_CRAWL_PER_CALL)
	if member != nil {
		q = q.Filter("Owner =", member.Name)
	}

	if _, err := q.GetAll(&targets); err != nil {
		lib.InternalError(res, req, err)
		return
	}

	// Crawl Contents
	for _, e := range targets {
		logger.Info("Crawling %s ... ", e.Url)
		if e1, err := crawler.CrawlEntry(e.Url); err != nil {
			logger.Warn("Failed to crawl %s, skipped: %v", e.Url, err)
			continue
		} else {
			if e1 == nil {
				logger.Warn("CrawlEntry returned a nil entry for %s", e.Url)
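				// Mark the entry as crawled with placeholder content so it is not retried on every run.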
				e.Content = "<No Content>"
				e.CrawledAt = time.Now()
			} else {
				logger.Debug("CrawlEntry scraped %d bytes.", len(e1.Content))
				e.Content = e1.Content
			}
			result = append(result, e.Url)
		}
	}
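	// Hand the refreshed entries to updateContents; any failure aborts the request before the results are written.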
	if err := updateContents(appCtx, targets, app.MemberList); err != nil {
		lib.InternalError(res, req, err)
		return
	}
	res.WriteJson(result)
}
Example #4
func init() {
	app.Api.Get("/channels/",
		func(res *wcg.Response, req *wcg.Request) {
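			// Return the full channel list as JSON, reporting failures through app.Api.InternalError.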
			if list, err := listTvChannels(res, req); err != nil {
				app.Api.InternalError(res, req, err)
			} else {
				res.WriteJson(list)
			}
		},
	)
	app.Api.Post("/channels/", lib.Admin.Required(
		func(res *wcg.Response, req *wcg.Request) {
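			// Register a new channel from the posted form values and invalidate the cached channel list.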
			ctx := gae.NewContext(req)
			d := NewTvChannelDriver(app.Key, ctx, req.Logger)
			mc := memcache.NewDriver(ctx, req.Logger)
			if err := d.AddChannel(req.Form("cid"), req.Form("sid"), req.Form("name"), req.Form("iepg_station_id")); err != nil {
				lib.InternalError(res, req, err)
			} else {
				mc.Delete(MC_KEY_CHANNELS)
				id := fmt.Sprintf("%s/%s", req.Form("cid"), req.Form("sid"))
				app.Api.Created(res, req, id)
			}
		},
	))
	app.Api.Delete("/channels/:cid/:sid.json", lib.Admin.Required(
		func(res *wcg.Response, req *wcg.Request) {
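			// Delete the channel identified by the URL parameters and invalidate the cached channel list.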
			ctx := gae.NewContext(req)
			d := NewTvChannelDriver(app.Key, ctx, req.Logger)
			mc := memcache.NewDriver(ctx, req.Logger)
			if err := d.DelChannel(req.Param("cid"), req.Param("sid")); err != nil {
				lib.InternalError(res, req, err)
			} else {
				mc.Delete(MC_KEY_CHANNELS)
				app.Api.Ok(res, req)
			}
		},
	))
}
Example #5
func init() {
	app.Api.Get("/keywords/",
		func(res *wcg.Response, req *wcg.Request) {
			var list []*tv.CrawlerConfig
			ctx := gae.NewContext(req)
			mc := memcache.NewDriver(ctx, req.Logger)
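			// Serve the keyword list through the memcache-backed loader; the "force" query
			// parameter appears to make CachedObject bypass the cached value.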
			err := mc.CachedObject(MC_KEY_KEYWORDS, &list, func() (interface{}, error) {
				var list []*tv.CrawlerConfig
				d := NewCrawlerConfigDriver(app.Key, ctx, req.Logger)
				q := d.NewQuery().Order("-CreatedAt")
				if _, err := q.GetAll(&list); err != nil {
					return nil, err
				}
				if list == nil {
					// Return an empty slice rather than nil when there are no keywords.
					return make([]*tv.CrawlerConfig, 0), nil
				}
				return list, nil
			}, req.Query("force") == "1")

			if err != nil {
				lib.InternalError(res, req, err)
			} else {
				res.WriteJson(list)
			}
		},
	)

	app.Api.Get("/keywords/preview/:keyword.json", lib.Family.Required(
		func(res *wcg.Response, req *wcg.Request) {
			var channels []*tv.TvChannel
			ctx := gae.NewContext(req)
			mc := memcache.NewDriver(ctx, req.Logger)
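			// Resolve the (cached) channel list before building a preview for the requested keyword.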
			err := mc.CachedObject(MC_KEY_CHANNELS, &channels, func() (interface{}, error) {
				return NewTvChannelDriver(app.Key, ctx, req.Logger).AllAsList()
			}, false)
			if err != nil {
				lib.InternalError(res, req, err)
				return
			}

			keyword := req.Param("keyword")
			scope, _ := strconv.Atoi(req.Query("scope"))
			list, err := getIEpgListFromCrawlerConfig(res, req, &tv.CrawlerConfig{
				Keyword:  keyword,
				Scope:    scope,
				Category: "dummy",
			}, channels)
			if err != nil {
				lib.InternalError(res, req, err)
			} else {
				res.WriteJson(map[string]interface{}{
					"samples": list,
					"total":   len(list),
				})
			}
		},
	))

	app.Api.Post("/keywords/", lib.Family.Required(
		func(res *wcg.Response, req *wcg.Request) {
			ctx := gae.NewContext(req)
			mc := memcache.NewDriver(ctx, req.Logger)
			d := NewCrawlerConfigDriver(app.Key, ctx, req.Logger)
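			// Build the crawler config from the form values; a non-numeric scope silently becomes 0.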
			scope, _ := strconv.Atoi(req.Form("scope"))
			cfg := &tv.CrawlerConfig{
				Keyword:  req.Form("keyword"),
				Category: req.Form("category"),
				Scope:    scope,
			}
			if err := d.Add(cfg); err != nil {
				app.Api.InternalError(res, req, err)
				return
			}
			mc.Delete(MC_KEY_KEYWORDS)
			app.Api.Created(res, req, req.Form("keyword"))
		},
	))

	app.Api.Delete("/keywords/:keyword.json", lib.Family.Required(
		func(res *wcg.Response, req *wcg.Request) {
			ctx := gae.NewContext(req)
			d := NewCrawlerConfigDriver(app.Key, ctx, req.Logger)
			mc := memcache.NewDriver(ctx, req.Logger)
			keyword := req.Param("keyword")
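			// Delete the keyword's crawler config and drop the cached keyword list.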
			if err := d.Delete(keyword); err != nil {
				lib.InternalError(res, req, err)
				return
			}
			mc.Delete(MC_KEY_KEYWORDS)
			app.Api.Ok(res, req)
		},
	))

}