func indexAllMembers(res *wcg.Response, req *wcg.Request, app *App) {
	var appCtx = lib.NewAppContextFromRequest(req)
	result := make(map[string][]string)
	crawler := ameblo.NewCrawler(appCtx.NewHttpClient())
	for _, m := range app.Members {
		req.Logger.Debug("Crawling %s (%s)", m.BlogUrl, m.Name)
		entries, err := crawler.CrawlEntryList(m.BlogUrl)
		if err != nil {
			req.Logger.Error("An error occurred while crawling %s: %v", m.BlogUrl, err)
			continue
		}
		req.Logger.Debug("Found %d entries.", len(entries))
		list := make([]string, 0)
		for _, e := range entries {
			e.Owner = m.Name
			list = append(list, e.Url)
		}
		if err := updateIndexes(appCtx, entries); err != nil {
			req.Logger.Error("Failed to update the indexes for %s: %v", m.Name, err)
		} else {
			result[m.Name] = list
		}
	}
	// Invalidate the cached history for every member so the next read
	// picks up the new indexes.
	time.Sleep(10 * time.Second) // TODO: wait until all indexes are updated on the datastore.
	mc := appCtx.NewMemcacheDriver()
	for _, m := range app.Members {
		mckey := fmt.Sprintf(MC_KEY_HISTORY, app.Key, m.Name)
		mc.Delete(mckey)
	}
	res.WriteJson(result)
}
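// historyCacheKeys is a hypothetical helper, not used by the handlers in this
// file: it only documents the memcache key layout that the invalidation steps
// above and below rely on. MC_KEY_HISTORY is defined elsewhere in this
// package; its call sites imply a two-verb fmt format taking the app key
// first and the member name second.
func historyCacheKeys(app *App) []string {
	keys := make([]string, 0, len(app.Members))
	for _, m := range app.Members {
		keys = append(keys, fmt.Sprintf(MC_KEY_HISTORY, app.Key, m.Name))
	}
	return keys
}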
func indexSpecifiedMember(res *wcg.Response, req *wcg.Request, app *App) {
	var appCtx = lib.NewAppContextFromRequest(req)
	member, ok := app.Members[req.Param("member")]
	if !ok {
		lib.NotFound(res, req)
		return
	}
	num := wcg.ParseInt(req.Param("n"), 0, 0, wcg.ParseIntMax)
	if num == 0 {
		num = wcg.ParseIntMax
	}
	// Crawl the paginated entry lists: xxxx.html => xxxx-{i}.html
	crawler := ameblo.NewCrawler(appCtx.NewHttpClient())
	prefix := strings.TrimSuffix(member.BlogUrl, ".html")
	entries := make([]*ameblo.AmebloEntry, 0)
	for i := 1; i < num; i++ {
		url := fmt.Sprintf("%s-%d.html", prefix, i)
		req.Logger.Info("Indexing from %s ...", url)
		newentries, err := crawler.CrawlEntryList(url)
		if err != nil {
			lib.InternalError(res, req, err)
			return
		}
		if len(newentries) > 20 {
			panic(fmt.Errorf("unexpected number of entries (%d) returned during indexing", len(newentries)))
		}
		if len(newentries) == 0 {
			break
		}
		if len(newentries) < 20 {
			// A partial page is the last page.
			entries = append(entries, newentries...)
			break
		}
		if len(entries) > 0 && entries[len(entries)-1].Url == newentries[len(newentries)-1].Url {
			// The same page came back again: we are past the end.
			break
		}
		entries = append(entries, newentries...)
	}
	// Save and return the results.
	results := make([]string, 0)
	for _, ent := range entries {
		ent.Owner = member.Name
		results = append(results, ent.Url)
	}
	if err := updateIndexes(appCtx, entries); err != nil {
		req.Logger.Error("Failed to update the indexes: %v", err)
		lib.InternalError(res, req, err)
		return
	}
	time.Sleep(10 * time.Second) // TODO: wait until all indexes are updated on the datastore.
	mc := appCtx.NewMemcacheDriver()
	mckey := fmt.Sprintf(MC_KEY_HISTORY, app.Key, member.Name)
	mc.Delete(mckey)
	res.WriteJson(results)
}
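// lastListPage is a hypothetical extraction of the termination rules in the
// pagination loop above, shown only to make them explicit; the handler does
// not call it. It assumes, as the loop does, that a full Ameblo list page
// holds exactly 20 entries.
func lastListPage(sofar, page []*ameblo.AmebloEntry) bool {
	// An empty or partial page means there is nothing beyond it.
	if len(page) < 20 {
		return true
	}
	// The duplicate check in the loop above suggests that out-of-range page
	// indexes re-serve the last page, so seeing the same trailing URL twice
	// means pagination has wrapped around.
	return len(sofar) > 0 && sofar[len(sofar)-1].Url == page[len(page)-1].Url
}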
func crawl(res *wcg.Response, req *wcg.Request, member *ameblo.Member, app *App) {
	var appCtx = lib.NewAppContextFromRequest(req)
	var logger = appCtx.Logger
	var targets []*ameblo.AmebloEntry
	result := make([]string, 0)
	d := NewAmebloEntryDriver(appCtx)
	crawler := ameblo.NewCrawler(appCtx.NewHttpClient())
	// Pick the entries that have not been crawled yet (zero CrawledAt),
	// in PostAt order, up to the per-call limit.
	q := d.NewQuery().Filter("CrawledAt =", time.Time{}).Order("PostAt").Limit(NUM_ENTRIES_TO_CRAWL_PER_CALL)
	if member != nil {
		q = q.Filter("Owner =", member.Name)
	}
	if _, err := q.GetAll(&targets); err != nil {
		lib.InternalError(res, req, err)
		return
	}
	// Crawl the content of each target entry.
	for _, e := range targets {
		logger.Info("Crawling %s ...", e.Url)
		e1, err := crawler.CrawlEntry(e.Url)
		if err != nil {
			logger.Warn("Failed to crawl %s, skipped: %v", e.Url, err)
			continue
		}
		if e1 == nil {
			logger.Warn("CrawlEntry returned a nil entry for %s", e.Url)
			e.Content = "<No Content>"
			e.CrawledAt = time.Now()
		} else {
			logger.Debug("CrawlEntry scraped %d bytes.", len(e1.Content))
			e.Content = e1.Content
		}
		result = append(result, e.Url)
	}
	if err := updateContents(appCtx, targets, app.MemberList); err != nil {
		lib.InternalError(res, req, err)
		return
	}
	res.WriteJson(result)
}
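// applyCrawlResult is a hypothetical extraction of the per-entry folding
// logic in crawl above, shown to document the sentinel behavior; the handler
// inlines it. CrawlEntry may return a nil entry without an error, in which
// case the stored record is stamped so it is not re-queued on the next call.
func applyCrawlResult(stored, crawled *ameblo.AmebloEntry) {
	if crawled == nil {
		stored.Content = "<No Content>"
		stored.CrawledAt = time.Now()
		return
	}
	stored.Content = crawled.Content
}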