コード例 #1
0
// _crawlAmebloPost crawls the page at post.URL and copies the crawled values
// (counters, timestamp, title, theme and images) onto post in place, marking
// it as IsContentsCrawled.
//
// Image caching is best-effort: each image URL is cached with up to 5
// attempts, and an image that cannot be cached (or all images, when the
// image cacher itself cannot be created) is stored with its original URL
// only, so post.Images never contains an empty entry.
//
// post.MemberKey is filled from _guessMember only when it is not already set.
//
// The only error returned is the one from crawling the post page itself.
func _crawlAmebloPost(req *wcg.Request, post *hplink.AmebloPost, members []hplink.Member) error {
	crawler := crawlers.NewAmebloPostCrawler(cacheutil.NewURLCacheAwareClient(req, entities.URLCache))
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] crawling URL: %s", post.URL)
	crawled, err := crawler.RunOnPostURL(post.URL)
	if err != nil {
		return err
	}
	post.NumLikes = crawled.NumLikes
	post.NumComments = crawled.NumComments
	post.NumReblogs = crawled.NumReblogs
	post.PostAt = crawled.PostAt
	post.Title = crawled.Title
	post.Theme = crawled.Theme
	post.IsContentsCrawled = true
	post.Images = make([]models.Image, len(crawled.ImageURLs))
	if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
		// Each callback writes only to its own index of post.Images.
		// Failures are logged per image inside the callback.
		iterator.ParallelSlice(crawled.ImageURLs, func(i int, url string) error {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] caching image URL %s (on %s)", url, post.URL)
			const retries = 5
			var err error
			var cache *models.ImageCache
			for j := 0; j < retries; j++ {
				if j != 0 {
					// Back off only between attempts, so the final failure
					// does not pay a pointless extra wait before giving up.
					lib.WaitAndEnsureAfter(lib.Now(), 3*time.Second)
					req.Logger.Infof("[Task.Crawlers.AmebloPosts] Retry image URL caching: %s (on %s) (%d)", url, post.URL, j)
				}
				cache, err = cacher.Cache(url)
				if err == nil {
					post.Images[i] = *cache.ToImage()
					return nil
				}
			}
			req.Logger.Warnf("[Task.Crawlers.AmebloPosts] Failed to cache image URL %s (on %s) - %v", url, post.URL, err)
			// Keep the original URL rather than leaving a zero-value Image,
			// matching what is stored when the cacher is unavailable.
			post.Images[i] = models.Image{
				URL: url,
			}
			return err
		})
	} else {
		for j, url := range crawled.ImageURLs {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] skip caching image URL %s (on %s)", url, post.URL)
			post.Images[j] = models.Image{
				URL: url,
			}
		}
	}
	// Update MemberKey only if not set.
	if post.MemberKey == "" {
		if m := _guessMember(req, post, members); m != nil {
			post.MemberKey = m.Key
		}
	}
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] finished crawling")
	return nil
}
コード例 #2
0
// _crawlAmebloEntryList crawls the Ameblo entry list page at url and stores
// an AmebloPost entity for every listed post that has not had its contents
// crawled yet.
//
// Posts whose stored entity already has IsContentsCrawled set are left
// untouched. New entities take their MemberKey from settings.MemberKey when
// it is set, and from _guessMember otherwise.
//
// It returns postList.Next from the crawled list on success. It returns an
// empty string with a nil error when the artist for settings.ArtistKey does
// not exist (a warning is logged) or when the list page has no posts.
func _crawlAmebloEntryList(req *wcg.Request, settings *hplink.CrawlerSettings, url string) (string, error) {
	_, _artist, err := entities.Artist.Get().Key(settings.ArtistKey).One(req)
	if err != nil {
		return "", err
	}
	// Bail out before querying members: nothing can be stored without an artist.
	if _artist == nil {
		req.Logger.Warnf("No artist found by the key %q (CralwerSettings: %s)", settings.ArtistKey, settings.URL)
		return "", nil
	}
	artist := _artist.(*hplink.Artist)

	p, err := entities.Member.Query().Filter("ArtistKey=", settings.ArtistKey).Execute(req)
	if err != nil {
		return "", err
	}
	members := p.Data.([]hplink.Member)

	crawler := crawlers.NewAmebloPostCrawler(urlfetch.NewHTTPClient(req))
	postList, err := crawler.RunOnListURL(url)
	if err != nil {
		return "", fmt.Errorf("Error crawling %q - %v", url, err)
	}
	if len(postList.List) == 0 {
		return "", nil
	}

	// Look up the currently stored entity for every listed post.
	amebloPostKeys := make([]*datastore.Key, len(postList.List))
	for i, post := range postList.List {
		amebloPostKeys[i] = entities.AmebloPost.NewKey(req, post.URL, nil)
	}
	_, _currentPostList, err := entities.AmebloPost.GetMulti().DatastoreKeys(amebloPostKeys...).List(req)
	if err != nil {
		return "", err
	}
	currentPostList := _currentPostList.([]*hplink.AmebloPost)

	// updateKeys and amebloPostList are appended in lockstep so that every
	// entity is written under its own key even when some posts are skipped.
	amebloPostList := make([]*hplink.AmebloPost, 0, len(postList.List))
	updateKeys := make([]*datastore.Key, 0, len(postList.List))
	for i, post := range postList.List {
		if currentPostList[i] != nil && currentPostList[i].IsContentsCrawled {
			// we don't update entities if they are already crawled by _processAmebloPosts
			continue
		}
		amebloPost := &hplink.AmebloPost{
			URL:               post.URL,
			Title:             post.Title,
			PostAt:            post.PostAt,
			Theme:             post.Theme,
			ArtistKey:         artist.Key,
			SettingsURL:       settings.URL,
			IsContentsCrawled: false,
		}
		if settings.MemberKey != "" {
			amebloPost.MemberKey = settings.MemberKey
		} else if m := _guessMember(req, amebloPost, members); m != nil {
			amebloPost.MemberKey = m.Key
		}
		amebloPostList = append(amebloPostList, amebloPost)
		updateKeys = append(updateKeys, amebloPostKeys[i])
	}
	// Every listed post was already crawled: nothing to write.
	if len(amebloPostList) == 0 {
		return postList.Next, nil
	}
	_, err = entities.AmebloPost.PutMulti().DatastoreKeys(updateKeys...).Update(req, amebloPostList)
	if err != nil {
		return "", err
	}
	return postList.Next, nil
}