// _crawlAmebloPost crawls the page at post.URL and copies the crawled
// attributes (like/comment/reblog counters, post time, title, theme and
// images) onto post, marking it as contents-crawled.
//
// When an image cacher can be created, every image URL is cached via
// iterator.ParallelSlice with up to 5 attempts per URL. An image that still
// cannot be cached falls back to its source URL, the same representation used
// when no cacher is available, so post.Images never contains an entry with an
// empty URL.
//
// post.MemberKey is filled from _guessMember only when it is still empty.
//
// The returned error is non-nil only when crawling the post page itself
// fails; image caching failures are logged and do not fail the crawl.
func _crawlAmebloPost(req *wcg.Request, post *hplink.AmebloPost, members []hplink.Member) error {
	crawler := crawlers.NewAmebloPostCrawler(cacheutil.NewURLCacheAwareClient(req, entities.URLCache))
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] crawling URL: %s", post.URL)
	crawled, err := crawler.RunOnPostURL(post.URL)
	if err != nil {
		return err
	}
	post.NumLikes = crawled.NumLikes
	post.NumComments = crawled.NumComments
	post.NumReblogs = crawled.NumReblogs
	post.PostAt = crawled.PostAt
	post.Title = crawled.Title
	post.Theme = crawled.Theme
	post.IsContentsCrawled = true
	post.Images = make([]models.Image, len(crawled.ImageURLs))
	if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
		// Each callback writes only to its own index of post.Images.
		iterator.ParallelSlice(crawled.ImageURLs, func(i int, url string) error {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] caching image URL %s (on %s)", url, post.URL)
			const retries = 5
			var err error
			var cache *models.ImageCache
			for j := 0; j < retries; j++ {
				if j != 0 {
					// Pause only between attempts; waiting after the final
					// failure would delay the task without any retry following.
					lib.WaitAndEnsureAfter(lib.Now(), 3*time.Second)
					req.Logger.Infof("[Task.Crawlers.AmebloPosts] Retry image URL caching: %s (on %s) (%d)", url, post.URL, j)
				}
				cache, err = cacher.Cache(url)
				if err == nil {
					post.Images[i] = *cache.ToImage()
					return nil
				}
			}
			req.Logger.Warnf("[Task.Crawlers.AmebloPosts] Failed to cache image URL %s (on %s) - %v", url, post.URL, err)
			// Keep the uncached source URL rather than leaving a zero-value
			// image in the post.
			post.Images[i] = models.Image{
				URL: url,
			}
			return err
		})
	} else {
		// No cacher available: store the source URLs as they are.
		for j, url := range crawled.ImageURLs {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] skip caching image URL %s (on %s)", url, post.URL)
			post.Images[j] = models.Image{
				URL: url,
			}
		}
	}
	// Update MemberKey only if not set.
	if post.MemberKey == "" {
		if m := _guessMember(req, post, members); m != nil {
			post.MemberKey = m.Key
		}
	}
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] finished crawling")
	return nil
}
Пример #2
0
// _syncArtistProfiles crawls the artist list and merges it into the Artist
// entities, which are then written back with PutMulti.
//
// Index, Key, Name and Thumbnail of every entity are overwritten from the
// crawled data. In production the thumbnail is additionally replaced by its
// cached image; a caching failure is logged and the source URL is kept.
//
// It returns the datastore keys and the updated entities. An error is
// returned when the crawl fails, or errSyncNoArtistInfo when the crawl yields
// no artists.
func _syncArtistProfiles(req *wcg.Request) ([]*datastore.Key, []*hplink.Artist, error) {
	req.Logger.Infof("Importing the artist list")
	crawled, err := crawlers.NewHelloProject(urlfetch.NewHTTPClient(req)).Run()
	switch {
	case err != nil:
		return nil, nil, err
	case len(crawled) == 0:
		return nil, nil, errSyncNoArtistInfo
	}

	// Collect the entity keys in crawl order.
	artistKeys := make([]string, len(crawled))
	for idx := range crawled {
		artistKeys[idx] = crawled[idx].Key
	}

	dskeys, fetched := entities.Artist.GetMulti().Keys(artistKeys...).UseDefaultIfNil(true).MustList(req)
	artists := fetched.([]*hplink.Artist)

	// Overwrite the crawled attributes; the thumbnail starts as the source URL.
	for idx, src := range crawled {
		artist := artists[idx]
		artist.Index = src.Index
		artist.Key = src.Key
		artist.Name = src.Name
		artist.Thumbnail = models.Image{URL: src.ImageURL}
	}

	// Image caching is attempted in production only and is best effort.
	if lib.IsProduction() {
		cacher, cacherErr := cacheutil.NewImageCacher(req, entities.ImageCache)
		if cacherErr == nil {
			for idx := range crawled {
				cached, cacheErr := cacher.Cache(crawled[idx].ImageURL)
				if cacheErr != nil {
					req.Logger.Warnf("Image cache failed: %v", cacheErr)
					continue
				}
				artists[idx].Thumbnail = *cached.ToImage()
			}
		}
	}

	entities.Artist.PutMulti().DatastoreKeys(dskeys...).Cache(ckAllArtits).MustUpdate(req, artists)
	return dskeys, artists, nil
}
Пример #3
0
// _syncMemberProfiles crawls the artist page and each member page listed on
// it, merges the results into the Member and MemberPublicProfile entities and
// writes both back with PutMulti.
//
// Member keys have the form "<artist.Key>.<member.Key>". While the entities
// are being updated, syncCrawlerSettings runs in a separate goroutine; the
// function waits for it before returning.
//
// It returns the member datastore keys and the merged Member entities, or the
// first crawl error encountered.
func _syncMemberProfiles(req *wcg.Request, key *datastore.Key, artist *hplink.Artist) ([]*datastore.Key, []*hplink.Member, error) {
	// ---- Crawl the artist page, then every member page. ----
	req.Logger.Infof("Importing the member list from the artist page ...")
	artistPageInfo, err := crawlers.NewArtist(urlfetch.NewHTTPClient(req), artist.Key).Run()
	if err != nil {
		return nil, nil, err
	}
	numMembers := len(artistPageInfo.Members)
	keys := make([]string, numMembers)
	details := make([]*crawlers.MemberPageInfo, numMembers)
	for idx := range artistPageInfo.Members {
		shortKey := artistPageInfo.Members[idx].Key
		req.Logger.Infof("Importing member details")
		detail, crawlErr := crawlers.NewMember(urlfetch.NewHTTPClient(req), artist.Key, shortKey).Run()
		if crawlErr != nil {
			return nil, nil, crawlErr
		}
		keys[idx] = fmt.Sprintf("%s.%s", artist.Key, shortKey)
		details[idx] = detail
	}

	// Optional task: update the crawler settings concurrently. The deferred
	// Wait keeps this function from returning before the task has finished.
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		syncCrawlerSettings(req, key, artistPageInfo)
	}()
	defer wg.Wait()

	// ---- Merge the crawl result into the datastore entities. ----
	// Existing Member entities are loaded so the crawl result is merged into them.
	memberKeys, fetched := entities.Member.GetMulti().Keys(keys...).UseDefaultIfNil(true).MustList(req)
	members := fetched.([]*hplink.Member)

	// MemberPublicProfile is always rebuilt from scratch.
	profiles := make([]*hplink.MemberPublicProfile, len(members))
	for idx := range profiles {
		profiles[idx] = &hplink.MemberPublicProfile{}
	}

	for idx, listed := range artistPageInfo.Members {
		m := members[idx]
		info := details[idx]
		profile := profiles[idx]

		m.Key = keys[idx]
		m.ShortKey = listed.Key
		m.ArtistKey = artist.Key
		m.Name = listed.Name
		m.Index = listed.Index
		m.Birthday = info.Birthday
		m.Joinday = info.Joinday
		// TODO: integrate blob service
		images := make([]models.Image, 0, len(info.ImageURLs))
		for _, imageURL := range info.ImageURLs {
			images = append(images, models.Image{URL: imageURL})
		}
		m.Images = images

		profile.Key = keys[idx]
		profile.Nicknames = info.Nicknames
		profile.BloodType = info.BloodType
		profile.Hometown = info.Hometown
		profile.Skills = info.Skills
		profile.Hobbies = info.Hobbies
		profile.MusicGenres = info.MusicGenres
		profile.Sports = info.Sports
		profile.Motto = info.Motto
	}

	// Best-effort image caching: on failure the source URL is kept.
	if cacher, cacherErr := cacheutil.NewImageCacher(req, entities.ImageCache); cacherErr == nil {
		for _, m := range members {
			for j := range m.Images {
				cached, cacheErr := cacher.Cache(m.Images[j].URL)
				if cacheErr != nil {
					req.Logger.Warnf("Image cache failed: %v", cacheErr)
					continue
				}
				m.Images[j] = *cached.ToImage()
			}
		}
	}

	// Both updates are associated with the same per-artist cache key.
	allMembersCacheKey := fmt.Sprintf(ckAllMembersTemplate, artist.Key)

	entities.Member.PutMulti().DatastoreKeys(memberKeys...).Cache(allMembersCacheKey).MustUpdate(req, members)

	// Profile keys reuse the string IDs of the corresponding member keys.
	profileKeys := make([]*datastore.Key, len(memberKeys))
	for idx, memberKey := range memberKeys {
		profileKeys[idx] = entities.MemberPublicProfile.NewKey(req, memberKey.StringID(), nil)
	}
	entities.MemberPublicProfile.PutMulti().DatastoreKeys(profileKeys...).Cache(allMembersCacheKey).MustUpdate(req, profiles)

	return memberKeys, members, nil
}