func indexSpecifiedMember(res *wcg.Response, req *wcg.Request, app *App) { var appCtx = lib.NewAppContextFromRequest(req) member, ok := app.Members[req.Param("member")] if !ok { lib.NotFound(res, req) return } num := wcg.ParseInt(req.Param("n"), 0, 0, wcg.ParseIntMax) if num == 0 { num = wcg.ParseIntMax } // Crawling crawler := ameblo.NewCrawler(appCtx.NewHttpClient()) prefix := strings.TrimSuffix(member.BlogUrl, ".html") // xxxx.html => xxxx-{num}.html entries := make([]*ameblo.AmebloEntry, 0) for i := 1; i < num; i += 1 { url := fmt.Sprintf("%s-%d.html", prefix, i) req.Logger.Info("Indexing from %s ... ", url) newentries, err := crawler.CrawlEntryList(url) if err != nil { lib.InternalError(res, req, err) return } if len(newentries) > 20 { panic(fmt.Errorf("Unexpected number of entries (%d) are returned during indexing.", len(newentries))) } if len(newentries) == 0 { break } if len(newentries) < 20 { entries = append(entries, newentries...) break } if len(entries) > 0 && entries[len(entries)-1].Url == newentries[len(newentries)-1].Url { break } entries = append(entries, newentries...) } // Save and return resutls results := make([]string, 0) for _, ent := range entries { ent.Owner = member.Name results = append(results, ent.Url) } if err := updateIndexes(appCtx, entries); err != nil { req.Logger.Error("Failed to update the entry: %v", err) lib.InternalError(res, req, err) // stopped. } else { time.Sleep(10 * time.Second) // TODO: wait for all indexes are updated on datastore. mc := appCtx.NewMemcacheDriver() mckey := fmt.Sprintf(MC_KEY_HISTORY, app.Key, member.Name) mc.Delete(mckey) res.WriteJson(results) } }
func setupApi(app *App) { app.Api.Get("/ameblo/insights/:member/history.json", func(res *wcg.Response, req *wcg.Request) { historyInsights(res, req, app) }) app.Api.Get("/ameblo/indexes/", lib.Admin.Required(func(res *wcg.Response, req *wcg.Request) { indexAllMembers(res, req, app) })) app.Api.Get("/ameblo/indexes/:member.json", lib.Admin.Required(func(res *wcg.Response, req *wcg.Request) { indexSpecifiedMember(res, req, app) })) app.Api.Get("/ameblo/contents/", lib.Admin.Required(func(res *wcg.Response, req *wcg.Request) { crawlAllMembers(res, req, app) })) app.Api.Get("/ameblo/contents/:member.json", lib.Admin.Required(func(res *wcg.Response, req *wcg.Request) { crawlSpecifiedMembers(res, req, app) })) app.Api.Delete("/ameblo/contents/:member.json", lib.Admin.Required( func(res *wcg.Response, req *wcg.Request) { var appCtx = lib.NewAppContextFromRequest(req) member, ok := app.Members[req.Param("member")] if !ok { lib.NotFound(res, req) return } PER_ENT := 100 offset := 0 d := NewAmebloEntryDriver(appCtx) for { var list []*ameblo.AmebloEntry var q = d.NewQuery().Filter("Owner =", member.Name).Offset(offset).Limit(PER_ENT) if keys, err := q.GetAll(&list); err != nil { lib.InternalError(res, req, err) return } else { for _, ent := range list { ent.CrawledAt = time.Time{} } if _, err = d.PutMulti(keys, list); datastore.IsDatastoreError(err) { lib.InternalError(res, req, err) } offset = offset + len(list) if len(list) < PER_ENT { break } } } app.Api.Ok(res, req) }, )) }
func crawl(res *wcg.Response, req *wcg.Request, member *ameblo.Member, app *App) { var appCtx = lib.NewAppContextFromRequest(req) var logger = appCtx.Logger var targets []*ameblo.AmebloEntry result := make([]string, 0) d := NewAmebloEntryDriver(appCtx) crawler := ameblo.NewCrawler(appCtx.NewHttpClient()) // prioritize the entries which are not crawled and are posted recently. q := d.NewQuery().Filter("CrawledAt =", time.Time{}).Order("PostAt").Limit(NUM_ENTRIES_TO_CRAWL_PER_CALL) if member != nil { q = q.Filter("Owner =", member.Name) } if _, err := q.GetAll(&targets); err != nil { lib.InternalError(res, req, err) return } // Crawl Contents for _, e := range targets { logger.Info("Crawling %s ... ", e.Url) if e1, err := crawler.CrawlEntry(e.Url); err != nil { logger.Warn("Failed to crawl %s, skipped: %v", e.Url, err) continue } else { if e1 == nil { logger.Warn("CrawlEntry returns nil entry for %s", e.Url) e.Content = "<No Content>" e.CrawledAt = time.Now() } else { logger.Debug("CrawlEntry scraped %d bytes.", len(e1.Content)) e.Content = e1.Content } result = append(result, e.Url) } } if err := updateContents(appCtx, targets, app.MemberList); err != nil { lib.InternalError(res, req, err) return } res.WriteJson(result) }
func init() { app.Api.Get("/channels/", func(res *wcg.Response, req *wcg.Request) { if list, err := listTvChannels(res, req); err != nil { app.Api.InternalError(res, req, err) } else { res.WriteJson(list) } }, ) app.Api.Post("/channels/", lib.Admin.Required( func(res *wcg.Response, req *wcg.Request) { ctx := gae.NewContext(req) d := NewTvChannelDriver(app.Key, ctx, req.Logger) mc := memcache.NewDriver(ctx, req.Logger) if err := d.AddChannel(req.Form("cid"), req.Form("sid"), req.Form("name"), req.Form("iepg_station_id")); err != nil { lib.InternalError(res, req, err) } else { mc.Delete(MC_KEY_CHANNELS) id := fmt.Sprintf("%s/%s", req.Form("cid"), req.Form("sid")) app.Api.Created(res, req, id) } }, )) app.Api.Delete("/channels/:cid/:sid.json", lib.Admin.Required( func(res *wcg.Response, req *wcg.Request) { ctx := gae.NewContext(req) d := NewTvChannelDriver(app.Key, ctx, req.Logger) mc := memcache.NewDriver(ctx, req.Logger) if err := d.DelChannel(req.Param("cid"), req.Param("sid")); err != nil { lib.InternalError(res, req, err) } else { mc.Delete(MC_KEY_CHANNELS) app.Api.Ok(res, req) } }, )) }
func init() { app.Api.Get("/keywords/", func(res *wcg.Response, req *wcg.Request) { var list []*tv.CrawlerConfig ctx := gae.NewContext(req) mc := memcache.NewDriver(ctx, req.Logger) err := mc.CachedObject(MC_KEY_KEYWORDS, &list, func() (interface{}, error) { var list []*tv.CrawlerConfig d := NewCrawlerConfigDriver(app.Key, ctx, req.Logger) q := d.NewQuery().Order("-CreatedAt") _, err := q.GetAll(&list) if list == nil { return make([]*tv.CrawlerConfig, 0), nil } else { return list, err } }, req.Query("force") == "1") if err != nil { lib.InternalError(res, req, err) } else { res.WriteJson(list) } }, ) app.Api.Get("/keywords/preview/:keyword.json", lib.Family.Required( func(res *wcg.Response, req *wcg.Request) { var channels []*tv.TvChannel ctx := gae.NewContext(req) mc := memcache.NewDriver(ctx, req.Logger) err := mc.CachedObject(MC_KEY_CHANNELS, &channels, func() (interface{}, error) { return NewTvChannelDriver(app.Key, ctx, req.Logger).AllAsList() }, false) keyword := req.Param("keyword") scope, _ := strconv.Atoi(req.Query("scope")) list, err := getIEpgListFromCrawlerConfig(res, req, &tv.CrawlerConfig{ Keyword: keyword, Scope: scope, Category: "dummy", }, channels) if err != nil { lib.InternalError(res, req, err) } else { res.WriteJson(map[string]interface{}{ "samples": list, "total": len(list), }) } }, )) app.Api.Post("/keywords/", lib.Family.Required( func(res *wcg.Response, req *wcg.Request) { ctx := gae.NewContext(req) mc := memcache.NewDriver(ctx, req.Logger) d := NewCrawlerConfigDriver(app.Key, ctx, req.Logger) scope, _ := strconv.Atoi(req.Form("scope")) cfg := &tv.CrawlerConfig{ Keyword: req.Form("keyword"), Category: req.Form("category"), Scope: scope, } if err := d.Add(cfg); err != nil { app.Api.InternalError(res, req, err) return } else { mc.Delete(MC_KEY_KEYWORDS) app.Api.Created(res, req, req.Form("keyword")) return } }, )) app.Api.Delete("/keywords/:keyword.json", lib.Family.Required( func(res *wcg.Response, req *wcg.Request) { ctx := gae.NewContext(req) d := NewCrawlerConfigDriver(app.Key, ctx, req.Logger) mc := memcache.NewDriver(ctx, req.Logger) keyword := req.Param("keyword") if err := d.Delete(keyword); err != nil { lib.InternalError(res, req, err) return } else { mc.Delete(MC_KEY_KEYWORDS) app.Api.Ok(res, req) return } }, )) }