// _crawlAmebloPost crawls the ameblo post at post.URL and updates the post
// fields (counters, title, theme, and images) in place. MemberKey is guessed
// from the crawled contents only when it has not been set yet.
func _crawlAmebloPost(req *wcg.Request, post *hplink.AmebloPost, members []hplink.Member) error {
	crawler := crawlers.NewAmebloPostCrawler(cacheutil.NewURLCacheAwareClient(req, entities.URLCache))
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] crawling URL: %s", post.URL)
	crawled, err := crawler.RunOnPostURL(post.URL)
	if err != nil {
		return err
	}
	post.NumLikes = crawled.NumLikes
	post.NumComments = crawled.NumComments
	post.NumReblogs = crawled.NumReblogs
	post.PostAt = crawled.PostAt
	post.Title = crawled.Title
	post.Theme = crawled.Theme
	post.IsContentsCrawled = true
	post.Images = make([]models.Image, len(crawled.ImageURLs))
	if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
		// Cache images in parallel, retrying each URL a few times. The
		// aggregated error is intentionally discarded so that a failed image
		// does not fail the whole post crawl.
		iterator.ParallelSlice(crawled.ImageURLs, func(i int, url string) error {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] caching image URL %s (on %s)", url, post.URL)
			const retries = 5
			var err error
			var cache *models.ImageCache
			for j := 0; j < retries; j++ {
				if j != 0 {
					req.Logger.Infof("[Task.Crawlers.AmebloPosts] Retry image URL caching: %s (on %s) (%d)", url, post.URL, j)
				}
				cache, err = cacher.Cache(url)
				if err == nil {
					post.Images[i] = *cache.ToImage()
					return nil
				}
				lib.WaitAndEnsureAfter(lib.Now(), 3*time.Second)
			}
			req.Logger.Warnf("[Task.Crawlers.AmebloPosts] Failed to cache image URL %s (on %s) - %v", url, post.URL, err)
			return err
		})
	} else {
		// No image cacher is available; keep the original image URLs as-is.
		for j, url := range crawled.ImageURLs {
			req.Logger.Infof("[Task.Crawlers.AmebloPosts] skip caching image URL %s (on %s)", url, post.URL)
			post.Images[j] = models.Image{
				URL: url,
			}
		}
	}
	// Update MemberKey only if not set.
	if post.MemberKey == "" {
		if m := _guessMember(req, post, members); m != nil {
			post.MemberKey = m.Key
		}
	}
	req.Logger.Infof("[Task.Crawlers.AmebloPosts] finished crawling %s", post.URL)
	return nil
}
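
// _syncArtistProfiles crawls the Hello! Project artist list and upserts the
// corresponding Artist entities. It returns the datastore keys and the
// entities that were written.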
func _syncArtistProfiles(req *wcg.Request) ([]*datastore.Key, []*hplink.Artist, error) {
	req.Logger.Infof("Importing the artist list")
	sourceList, err := crawlers.NewHelloProject(urlfetch.NewHTTPClient(req)).Run()
	if err != nil {
		return nil, nil, err
	}
	if len(sourceList) == 0 {
		return nil, nil, errSyncNoArtistInfo
	}
	keys := make([]string, len(sourceList))
	for i, source := range sourceList {
		keys[i] = source.Key
	}
	// Merge the crawled fields into the existing Artist entities
	// (or into fresh defaults for artists seen for the first time).
	dskeys, _list := entities.Artist.GetMulti().Keys(keys...).UseDefaultIfNil(true).MustList(req)
	list := _list.([]*hplink.Artist)
	for i, source := range sourceList {
		list[i].Index = source.Index
		list[i].Key = source.Key
		list[i].Name = source.Name
		list[i].Thumbnail = models.Image{
			URL: source.ImageURL,
		}
	}
	// Thumbnail images are cached only in production; on a cache failure the
	// original image URL set above is kept.
	if lib.IsProduction() {
		if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
			for i, source := range sourceList {
				cache, err := cacher.Cache(source.ImageURL)
				if err != nil {
					req.Logger.Warnf("Image cache failed: %v", err)
				} else {
					list[i].Thumbnail = *cache.ToImage()
				}
			}
		}
	}
	entities.Artist.PutMulti().DatastoreKeys(dskeys...).Cache(ckAllArtits).MustUpdate(req, list)
	return dskeys, list, nil
}
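
// _syncMemberProfiles crawls the artist page and each member page of the
// given artist, then upserts the Member and MemberPublicProfile entities.
// It returns the Member datastore keys and the entities that were written.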
func _syncMemberProfiles(req *wcg.Request, key *datastore.Key, artist *hplink.Artist) ([]*datastore.Key, []*hplink.Member, error) {
	//
	// Crawling pages.
	//
	req.Logger.Infof("Importing the member list from the artist page ...")
	artistPageInfo, err := crawlers.NewArtist(urlfetch.NewHTTPClient(req), artist.Key).Run()
	if err != nil {
		return nil, nil, err
	}
	keys := make([]string, len(artistPageInfo.Members))
	memberPageInfoList := make([]*crawlers.MemberPageInfo, len(artistPageInfo.Members))
	for i, member := range artistPageInfo.Members {
		req.Logger.Infof("Importing member details for %s", member.Key)
		result, err := crawlers.NewMember(urlfetch.NewHTTPClient(req), artist.Key, member.Key).Run()
		if err != nil {
			return nil, nil, err
		}
		keys[i] = fmt.Sprintf("%s.%s", artist.Key, member.Key)
		memberPageInfoList[i] = result
	}

	// Optional task to update crawler settings; runs concurrently and is
	// awaited before this function returns.
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		syncCrawlerSettings(req, key, artistPageInfo)
	}()
	defer wg.Wait()

	//
	// Update Datastore
	//
	// Check existing Member entities to merge with the crawling result.
	memberKeys, _members := entities.Member.GetMulti().Keys(keys...).UseDefaultIfNil(true).MustList(req)
	members := _members.([]*hplink.Member)
	// MemberPublicProfile entities can always be overwritten by the crawling result.
	profiles := make([]*hplink.MemberPublicProfile, len(members))
	for i := range profiles {
		profiles[i] = &hplink.MemberPublicProfile{}
	}
	for i, member := range artistPageInfo.Members {
		members[i].Key = keys[i]
		members[i].ShortKey = member.Key
		members[i].ArtistKey = artist.Key
		members[i].Name = member.Name
		members[i].Index = member.Index
		members[i].Birthday = memberPageInfoList[i].Birthday
		members[i].Joinday = memberPageInfoList[i].Joinday
		members[i].Images = make([]models.Image, len(memberPageInfoList[i].ImageURLs))
		for j, u := range memberPageInfoList[i].ImageURLs {
			// TODO: integrate blob service
			members[i].Images[j] = models.Image{
				URL: u,
			}
		}
		profiles[i].Key = keys[i]
		profiles[i].Nicknames = memberPageInfoList[i].Nicknames
		profiles[i].BloodType = memberPageInfoList[i].BloodType
		profiles[i].Hometown = memberPageInfoList[i].Hometown
		profiles[i].Skills = memberPageInfoList[i].Skills
		profiles[i].Hobbies = memberPageInfoList[i].Hobbies
		profiles[i].MusicGenres = memberPageInfoList[i].MusicGenres
		profiles[i].Sports = memberPageInfoList[i].Sports
		profiles[i].Motto = memberPageInfoList[i].Motto
	}

	// Cache member images if an image cacher is available; on a cache
	// failure the original image URL set above is kept.
	if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
		for _, m := range members {
			for i := range m.Images {
				cache, err := cacher.Cache(m.Images[i].URL)
				if err != nil {
					req.Logger.Warnf("Image cache failed: %v", err)
				} else {
					m.Images[i] = *cache.ToImage()
				}
			}
		}
	}
	entities.Member.PutMulti().DatastoreKeys(memberKeys...).Cache(
		fmt.Sprintf(ckAllMembersTemplate, artist.Key),
	).MustUpdate(req, members)

	// MemberPublicProfile entities share the string IDs of the Member keys.
	profileKeys := make([]*datastore.Key, len(memberKeys))
	for i, memberKey := range memberKeys {
		profileKeys[i] = entities.MemberPublicProfile.NewKey(req, memberKey.StringID(), nil)
	}
	entities.MemberPublicProfile.PutMulti().DatastoreKeys(profileKeys...).Cache(
		fmt.Sprintf(ckAllMembersTemplate, artist.Key),
	).MustUpdate(req, profiles)
	return memberKeys, members, nil
}