Beispiel #1
0
// indexAllMembers crawls the entry list of every member in app.Members,
// stores the crawled entries through updateIndexes, and writes a JSON object
// mapping each successfully indexed member name to the entry URLs found.
//
// A member whose crawl or index update fails is logged and left out of the
// response; the remaining members are still processed. After the loop the
// history cache key of every member (built from MC_KEY_HISTORY) is deleted,
// regardless of whether that member was indexed successfully.
func indexAllMembers(res *wcg.Response, req *wcg.Request, app *App) {
	appCtx := lib.NewAppContextFromRequest(req)
	indexed := make(map[string][]string)
	crawler := ameblo.NewCrawler(appCtx.NewHttpClient())

	for _, member := range app.Members {
		req.Logger.Debug("Crawling %s (%s)", member.BlogUrl, member.Name)
		entries, err := crawler.CrawlEntryList(member.BlogUrl)
		if err != nil {
			req.Logger.Error("An error occurred while crawling %s: %v", member.BlogUrl, err)
			continue
		}
		req.Logger.Debug("Found %d entries.", len(entries))

		// Tag every entry with its owner before it is handed to
		// updateIndexes, collecting the URLs for the response on the way.
		urls := make([]string, 0, len(entries))
		for _, entry := range entries {
			entry.Owner = member.Name
			urls = append(urls, entry.Url)
		}

		if err := updateIndexes(appCtx, entries); err != nil {
			req.Logger.Error("Failed to update the entry: %v", err)
			continue
		}
		indexed[member.Name] = urls
	}

	// Invalidate the cached history of every member.
	// TODO: the fixed sleep is a stand-in for waiting until all indexes are
	// updated on the datastore.
	time.Sleep(10 * time.Second)
	cache := appCtx.NewMemcacheDriver()
	for _, member := range app.Members {
		cache.Delete(fmt.Sprintf(MC_KEY_HISTORY, app.Key, member.Name))
	}
	res.WriteJson(indexed)
}
Beispiel #2
0
// indexSpecifiedMember crawls the paginated entry lists of the member named by
// the "member" request parameter, stores the crawled entries through
// updateIndexes, deletes that member's history cache key, and writes the list
// of indexed entry URLs as JSON.
//
// Request parameters:
//   - "member": key into app.Members; an unknown key yields lib.NotFound.
//   - "n": upper bound for the page counter; 0 (the parse default) means
//     wcg.ParseIntMax, i.e. crawl until the pages run out.
//
// Any crawl failure, an over-long page, or an index update failure is reported
// through lib.InternalError and nothing is written as JSON.
func indexSpecifiedMember(res *wcg.Response, req *wcg.Request, app *App) {
	var appCtx = lib.NewAppContextFromRequest(req)
	member, ok := app.Members[req.Param("member")]
	if !ok {
		lib.NotFound(res, req)
		return
	}

	num := wcg.ParseInt(req.Param("n"), 0, 0, wcg.ParseIntMax)
	if num == 0 {
		num = wcg.ParseIntMax
	}

	// Crawling
	crawler := ameblo.NewCrawler(appCtx.NewHttpClient())
	prefix := strings.TrimSuffix(member.BlogUrl, ".html") // xxxx.html => xxxx-{num}.html
	entries := make([]*ameblo.AmebloEntry, 0)
	// NOTE(review): the bound is exclusive, so n=1 crawls no page and n=k
	// crawls k-1 pages — confirm whether "n" is meant to be a page count.
	for i := 1; i < num; i++ {
		url := fmt.Sprintf("%s-%d.html", prefix, i)
		req.Logger.Info("Indexing from %s ... ", url)
		newentries, err := crawler.CrawlEntryList(url)
		if err != nil {
			lib.InternalError(res, req, err)
			return
		}
		if len(newentries) > 20 {
			// The page content comes from the remote site, so an unexpected
			// size is a runtime failure, not a programming error: report
			// it like the other failures in this handler instead of panicking.
			lib.InternalError(res, req, fmt.Errorf("Unexpected number of entries (%d) are returned during indexing.", len(newentries)))
			return
		}
		if len(newentries) == 0 {
			break
		}
		if len(newentries) < 20 {
			// A page that is not full is treated as the last one.
			entries = append(entries, newentries...)
			break
		}
		// A full page ending with the same URL as the entries collected so
		// far stops the crawl — presumably the site repeats the last page
		// when paging past the end; verify against the crawler.
		if len(entries) > 0 && entries[len(entries)-1].Url == newentries[len(newentries)-1].Url {
			break
		}
		entries = append(entries, newentries...)
	}

	// Save and return results
	results := make([]string, 0, len(entries))
	for _, ent := range entries {
		ent.Owner = member.Name
		results = append(results, ent.Url)
	}
	if err := updateIndexes(appCtx, entries); err != nil {
		req.Logger.Error("Failed to update the entry: %v", err)
		lib.InternalError(res, req, err) // stopped.
		return
	}

	// TODO: the fixed sleep is a stand-in for waiting until all indexes are
	// updated on the datastore.
	time.Sleep(10 * time.Second)
	mc := appCtx.NewMemcacheDriver()
	mckey := fmt.Sprintf(MC_KEY_HISTORY, app.Key, member.Name)
	mc.Delete(mckey)
	res.WriteJson(results)
}
Beispiel #3
0
// crawl fetches the content of entries whose CrawledAt is still the zero time,
// copies the scraped content onto them, persists them through updateContents,
// and writes the URLs of the entries that were processed as JSON.
//
// When member is non-nil only entries owned by that member are considered. At
// most NUM_ENTRIES_TO_CRAWL_PER_CALL entries are handled per call. A query or
// update failure is reported through lib.InternalError.
func crawl(res *wcg.Response, req *wcg.Request, member *ameblo.Member, app *App) {
	appCtx := lib.NewAppContextFromRequest(req)
	logger := appCtx.Logger

	driver := NewAmebloEntryDriver(appCtx)
	crawler := ameblo.NewCrawler(appCtx.NewHttpClient())

	// Select entries that have not been crawled yet, ordered by PostAt.
	// NOTE(review): the previous comment said recently posted entries are
	// prioritized, but if Order follows App Engine datastore semantics an
	// unprefixed property sorts ascending (oldest first) — confirm intent.
	query := driver.NewQuery().
		Filter("CrawledAt =", time.Time{}).
		Order("PostAt").
		Limit(NUM_ENTRIES_TO_CRAWL_PER_CALL)
	if member != nil {
		query = query.Filter("Owner =", member.Name)
	}

	var targets []*ameblo.AmebloEntry
	if _, err := query.GetAll(&targets); err != nil {
		lib.InternalError(res, req, err)
		return
	}

	// Crawl contents. Entries whose crawl fails are skipped in the response
	// but remain in targets and are still passed to updateContents below.
	crawledUrls := make([]string, 0, len(targets))
	for _, entry := range targets {
		logger.Info("Crawling %s ... ", entry.Url)
		scraped, err := crawler.CrawlEntry(entry.Url)
		if err != nil {
			logger.Warn("Failed to crawl %s, skipped: %v", entry.Url, err)
			continue
		}
		if scraped != nil {
			logger.Debug("CrawlEntry scraped %d bytes.", len(scraped.Content))
			entry.Content = scraped.Content
		} else {
			// No entry came back: store a placeholder and stamp the entry
			// as crawled. CrawledAt is only set here; presumably
			// updateContents stamps the other entries — TODO confirm.
			logger.Warn("CrawlEntry returns nil entry for %s", entry.Url)
			entry.Content = "<No Content>"
			entry.CrawledAt = time.Now()
		}
		crawledUrls = append(crawledUrls, entry.Url)
	}

	if err := updateContents(appCtx, targets, app.MemberList); err != nil {
		lib.InternalError(res, req, err)
		return
	}
	res.WriteJson(crawledUrls)
}