Example #1
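_syncArtistProfiles builds an HTTP client from the request, crawls the artist list, merges it into existing Artist entities (falling back to defaults for new keys), optionally swaps thumbnails for cached images in production, and persists everything with PutMulti.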
func _syncArtistProfiles(req *wcg.Request) ([]*datastore.Key, []*hplink.Artist, error) {
	req.Logger.Infof("Importing the artist list")
	sourceList, err := crawlers.NewHelloProject(urlfetch.NewHTTPClient(req)).Run()
	if err != nil {
		return nil, nil, err
	}
	if len(sourceList) == 0 {
		return nil, nil, errSyncNoArtistInfo
	}
	keys := make([]string, len(sourceList))
	for i, source := range sourceList {
		keys[i] = source.Key
	}
	// Load existing Artist entities, using zero-value defaults for keys
	// that are not stored yet, then merge in the crawled fields below.
	dskeys, _list := entities.Artist.GetMulti().Keys(keys...).UseDefaultIfNil(true).MustList(req)
	list := _list.([]*hplink.Artist)
	for i, source := range sourceList {
		list[i].Index = source.Index
		list[i].Key = source.Key
		list[i].Name = source.Name
		list[i].Thumbnail = models.Image{
			URL: source.ImageURL,
		}
	}

	// Only cache thumbnails in production; development keeps the remote URLs.
	if lib.IsProduction() {
		if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
			for i, source := range sourceList {
				cache, err := cacher.Cache(source.ImageURL)
				if err != nil {
					req.Logger.Warnf("Image cache failed: %v", err)
				} else {
					list[i].Thumbnail = *cache.ToImage()
				}
			}
		}
	}
	entities.Artist.PutMulti().DatastoreKeys(dskeys...).Cache(ckAllArtits).MustUpdate(req, list)
	return dskeys, list, nil
}
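Every example in this list starts the same way: urlfetch.NewHTTPClient adapts a *wcg.Request into a standard *http.Client, so ordinary net/http code runs on top of App Engine's urlfetch service. A minimal sketch of that pattern on its own (fetchBody is a hypothetical helper, assuming the usual net/http and io/ioutil imports; it is not part of this codebase):

func fetchBody(req *wcg.Request, url string) ([]byte, error) {
	// Request-scoped client backed by the urlfetch service.
	client := urlfetch.NewHTTPClient(req)
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return ioutil.ReadAll(resp.Body)
}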
Example #2
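openImageStream shows defensive fetching with the same client: the response is handed to the caller only after the status code, the image/ content type, and a non-empty content length have been checked, and every rejection path closes the body.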
func (cacher *ImageCacher) openImageStream(url string) (*http.Response, error) {
	client := urlfetch.NewHTTPClient(cacher.req)
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf(errInvalidImageStatus, url, resp.Status)
	}
	contentType := resp.Header.Get("Content-Type")
	contentSize := resp.ContentLength
	if !strings.HasPrefix(contentType, "image/") {
		resp.Body.Close()
		return nil, fmt.Errorf(errInvalidImageContentType, url, contentType)
	}
	// Note: ContentLength is -1 when the server sends no Content-Length
	// header, so only an explicit zero-length body is rejected here.
	if contentSize == 0 {
		resp.Body.Close()
		return nil, fmt.Errorf(errInvalidImageContentSize, url)
	}
	return resp, nil
}
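On success the body is left open, so the caller is responsible for draining and closing it. A hypothetical caller (fetchImageBytes is not part of the original ImageCacher):

func (cacher *ImageCacher) fetchImageBytes(url string) ([]byte, error) {
	resp, err := cacher.openImageStream(url)
	if err != nil {
		return nil, err
	}
	// openImageStream leaves the body open on success, so close it here.
	defer resp.Body.Close()
	return ioutil.ReadAll(resp.Body)
}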
Example #3
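_syncMemberProfiles is a larger variant of Example #1: it crawls the artist page plus one page per member, merges the results into Member and MemberPublicProfile entities, caches member images, and pushes an optional crawler-settings update into a background goroutine.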
func _syncMemberProfiles(req *wcg.Request, key *datastore.Key, artist *hplink.Artist) ([]*datastore.Key, []*hplink.Member, error) {
	//
	// Crawling pages.
	//
	req.Logger.Infof("Importing the member list from the artist page ...")
	artistPageInfo, err := crawlers.NewArtist(urlfetch.NewHTTPClient(req), artist.Key).Run()
	if err != nil {
		return nil, nil, err
	}
	keys := make([]string, len(artistPageInfo.Members))
	memberPageInfoList := make([]*crawlers.MemberPageInfo, len(artistPageInfo.Members))
	for i, member := range artistPageInfo.Members {
		req.Logger.Infof("Importing member details")
		result, err := crawlers.NewMember(urlfetch.NewHTTPClient(req), artist.Key, member.Key).Run()
		if err != nil {
			return nil, nil, err
		}
		keys[i] = fmt.Sprintf("%s.%s", artist.Key, member.Key)
		memberPageInfoList[i] = result
	}
	// optional task to update crawler settings
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		syncCrawlerSettings(req, key, artistPageInfo)
	}()
	defer wg.Wait()
	//
	// Update Datastore
	//
	// Check existing Member entities to merge with the crawling result.
	memberKeys, _members := entities.Member.GetMulti().Keys(keys...).UseDefaultIfNil(true).MustList(req)
	members := _members.([]*hplink.Member)
	// MemberPublicProfile can always be overwritten
	profiles := make([]*hplink.MemberPublicProfile, len(members))
	for i := range profiles {
		profiles[i] = &hplink.MemberPublicProfile{}
	}

	for i, member := range artistPageInfo.Members {
		members[i].Key = keys[i]
		members[i].ShortKey = member.Key
		members[i].ArtistKey = artist.Key
		members[i].Name = member.Name
		members[i].Index = member.Index
		members[i].Birthday = memberPageInfoList[i].Birthday
		members[i].Joinday = memberPageInfoList[i].Joinday
		members[i].Images = make([]models.Image, len(memberPageInfoList[i].ImageURLs))
		for j, u := range memberPageInfoList[i].ImageURLs {
			// TODO: integrate blob service
			members[i].Images[j] = models.Image{
				URL: u,
			}
		}
		profiles[i].Key = keys[i]
		profiles[i].Nicknames = memberPageInfoList[i].Nicknames
		profiles[i].BloodType = memberPageInfoList[i].BloodType
		profiles[i].Hometown = memberPageInfoList[i].Hometown
		profiles[i].Skills = memberPageInfoList[i].Skills
		profiles[i].Hobbies = memberPageInfoList[i].Hobbies
		profiles[i].MusicGenres = memberPageInfoList[i].MusicGenres
		profiles[i].Sports = memberPageInfoList[i].Sports
		profiles[i].Motto = memberPageInfoList[i].Motto
	}

	// cache images
	if cacher, err := cacheutil.NewImageCacher(req, entities.ImageCache); err == nil {
		for _, m := range members {
			for i := range m.Images {
				cache, err := cacher.Cache(m.Images[i].URL)
				if err != nil {
					req.Logger.Warnf("Image cache failed: %v", err)
				} else {
					m.Images[i] = *cache.ToImage()
				}
			}
		}
	}

	entities.Member.PutMulti().DatastoreKeys(memberKeys...).Cache(
		fmt.Sprintf(ckAllMembersTemplate, artist.Key),
	).MustUpdate(req, members)

	profileKeys := make([]*datastore.Key, len(memberKeys))
	for i, key := range memberKeys {
		profileKeys[i] = entities.MemberPublicProfile.NewKey(req, key.StringID(), nil)
	}
	entities.MemberPublicProfile.PutMulti().DatastoreKeys(profileKeys...).Cache(
		fmt.Sprintf(ckAllMembersTemplate, artist.Key),
	).MustUpdate(req, profiles)

	return memberKeys, members, nil
}
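Note the concurrency here: syncCrawlerSettings runs in parallel with the Datastore updates, and the deferred wg.Wait() guarantees the goroutine has finished before _syncMemberProfiles returns.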
Example #4
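The final example pairs a one-line Crawler#NewHTTPClient implementation with _crawlAmebloEntryList, which crawls a single page of an Ameblo entry list, skips posts already crawled by _processAmebloPosts, attributes each post to a member where possible, and returns the URL of the next list page.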
// NewHTTPClient implements Crawler#NewHTTPClient()
func (c *IEPGDataCrawler) NewHTTPClient() *http.Client {
	return urlfetch.NewHTTPClient(c.request)
}

func _crawlAmebloEntryList(req *wcg.Request, settings *hplink.CrawlerSettings, url string) (string, error) {
	_, _artist, err := entities.Artist.Get().Key(settings.ArtistKey).One(req)
	if err != nil {
		return "", err
	}
	if _artist == nil {
		req.Logger.Warnf("No artist found by the key %q (CrawlerSettings: %s)", settings.ArtistKey, settings.URL)
		return "", nil
	}
	artist := _artist.(*hplink.Artist)
	// Load the artist's members up front so posts can be attributed below.
	p, err := entities.Member.Query().Filter("ArtistKey=", settings.ArtistKey).Execute(req)
	if err != nil {
		return "", err
	}
	members := p.Data.([]hplink.Member)
	crawler := crawlers.NewAmebloPostCrawler(urlfetch.NewHTTPClient(req))
	postList, err := crawler.RunOnListURL(url)
	if err != nil {
		return "", fmt.Errorf("Error crawling %q - %v", url, err)
	}
	if len(postList.List) == 0 {
		return "", nil
	}
	var amebloPostList []*hplink.AmebloPost
	amebloPostKeys := make([]*datastore.Key, len(postList.List))
	for i, post := range postList.List {
		amebloPostKeys[i] = entities.AmebloPost.NewKey(req, post.URL, nil)
	}
	_, _currentPostList, err := entities.AmebloPost.GetMulti().DatastoreKeys(amebloPostKeys...).List(req)
	if err != nil {
		return "", err
	}
	currentPostList := _currentPostList.([]*hplink.AmebloPost)
	for i, post := range postList.List {
		if currentPostList[i] != nil && currentPostList[i].IsContentsCrawled {
			// we don't update entities if they are already crawled by _processAmebloPosts
			continue
		}
		amebloPost := &hplink.AmebloPost{
			URL:               post.URL,
			Title:             post.Title,
			PostAt:            post.PostAt,
			Theme:             post.Theme,
			ArtistKey:         artist.Key,
			SettingsURL:       settings.URL,
			IsContentsCrawled: false,
		}
		if settings.MemberKey != "" {
			amebloPost.MemberKey = settings.MemberKey
		} else {
			if m := _guessMember(req, amebloPost, members); m != nil {
				amebloPost.MemberKey = m.Key
			}
		}
		amebloPostList = append(amebloPostList, amebloPost)
	}
	_, err = entities.AmebloPost.PutMulti().DatastoreKeys(amebloPostKeys...).Update(req, amebloPostList)
	if err != nil {
		return "", err
	}
	return postList.Next, nil
}
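Since the function returns postList.Next, or an empty string once the listing is exhausted, a driver can walk the whole blog page by page. A hedged sketch of such a driver (crawlAllAmebloEntries is hypothetical, not code from this repository):

func crawlAllAmebloEntries(req *wcg.Request, settings *hplink.CrawlerSettings) error {
	url := settings.URL
	for url != "" {
		next, err := _crawlAmebloEntryList(req, settings, url)
		if err != nil {
			return err
		}
		// An empty Next means there are no more list pages to crawl.
		url = next
	}
	return nil
}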