func (r *run) importPlace(parent *importer.Object, place *venueItem) (*importer.Object, error) { placeNode, err := parent.ChildPathObject(place.Id) if err != nil { return nil, err } catName := "" if cat := place.primaryCategory(); cat != nil { catName = cat.Name } icon := place.icon() if err := placeNode.SetAttrs( "foursquareId", place.Id, "camliNodeType", "foursquare.com:venue", "camliContentImage", r.urlFileRef(icon, path.Base(icon)), "foursquareCategoryName", catName, "title", place.Name, "streetAddress", place.Location.Address, "addressLocality", place.Location.City, "postalCode", place.Location.PostalCode, "addressRegion", place.Location.State, "addressCountry", place.Location.Country, "latitude", fmt.Sprint(place.Location.Lat), "longitude", fmt.Sprint(place.Location.Lng)); err != nil { return nil, err } return placeNode, nil }
func (r *run) importPlace(parent *importer.Object, place *venueItem) (*importer.Object, error) { placeNode, err := parent.ChildPathObject(place.Id) if err != nil { return nil, err } catName := "" if cat := place.primaryCategory(); cat != nil { catName = cat.Name } icon := place.icon() if err := placeNode.SetAttrs( attrFoursquareId, place.Id, nodeattr.Type, "foursquare.com:venue", nodeattr.CamliContentImage, r.urlFileRef(icon, path.Base(icon)), attrFoursquareCategoryName, catName, nodeattr.Title, place.Name, nodeattr.StreetAddress, place.Location.Address, nodeattr.AddressLocality, place.Location.City, nodeattr.PostalCode, place.Location.PostalCode, nodeattr.AddressRegion, place.Location.State, nodeattr.AddressCountry, place.Location.Country, nodeattr.Latitude, fmt.Sprint(place.Location.Lat), nodeattr.Longitude, fmt.Sprint(place.Location.Lng)); err != nil { return nil, err } return placeNode, nil }
func (r *run) importPost(post *apiPost, parent *importer.Object) error { postNode, err := parent.ChildPathObject(post.Hash) if err != nil { return err } t, err := time.Parse(timeFormat, post.Time) if err != nil { return err } attrs := []string{ "pinboard.in:hash", post.Hash, nodeattr.Type, "pinboard.in:post", nodeattr.DateCreated, schema.RFC3339FromTime(t), nodeattr.Title, post.Description, nodeattr.URL, post.Href, "pinboard.in:extended", post.Extended, "pinboard.in:meta", post.Meta, "pinboard.in:shared", post.Shared, "pinboard.in:toread", post.ToRead, } if err = postNode.SetAttrs(attrs...); err != nil { return err } if err = postNode.SetAttrValues("tag", strings.Split(post.Tags, " ")); err != nil { return err } return nil }
func (r *run) importPhotoset(parent *importer.Object, photoset *photosetInfo, page int) (int, error) { photosetNode, err := parent.ChildPathObject(photoset.Id) if err != nil { return 0, err } if err := photosetNode.SetAttrs( attrFlickrId, photoset.Id, nodeattr.Title, photoset.Title.Content, nodeattr.Description, photoset.Description.Content); err != nil { return 0, err } // keep track of primary photo so we can set the fileRef of the photo as CamliContentImage // on photosetNode when we eventually know that fileRef. r.primaryPhoto[photoset.Id] = photoset.PrimaryPhotoId resp := struct { Photoset photosetItems }{} if err := r.flickrAPIRequest(&resp, photosetAPIPath, "user_id", r.userID, "page", fmt.Sprintf("%d", page), "photoset_id", photoset.Id, "extras", "original_format"); err != nil { return 0, err } log.Printf("Importing page %d from photoset %s", page, photoset.Id) photosNode, err := r.getPhotosNode() if err != nil { return 0, err } for _, item := range resp.Photoset.Photo { filename := fmt.Sprintf("%s.%s", item.Id, item.OriginalFormat) photoNode, err := photosNode.ChildPathObject(filename) if err != nil { log.Printf("Flickr importer: error finding photo node %s for addition to photoset %s: %s", item.Id, photoset.Id, err) continue } if err := photosetNode.SetAttr("camliPath:"+filename, photoNode.PermanodeRef().String()); err != nil { log.Printf("Flickr importer: error adding photo %s to photoset %s: %s", item.Id, photoset.Id, err) } } if resp.Photoset.Page < resp.Photoset.Pages { return page + 1, nil } else { return 0, nil } }
func (im *imp) importPhotoset(parent *importer.Object, photoset *photosetsGetListItem, page int) (int, error) { photosetNode, err := parent.ChildPathObject(photoset.Id) if err != nil { return 0, err } if err := photosetNode.SetAttrs( "flickrId", photoset.Title.Content, "title", photoset.Title.Content, "description", photoset.Description.Content, "primaryPhotoId", photoset.PrimaryPhotoId); err != nil { return 0, err } resp := photosetsGetPhotos{} if err := im.flickrAPIRequest(&resp, "flickr.photosets.getPhotos", "page", fmt.Sprintf("%d", page), "photoset_id", photoset.Id, "extras", "original_format"); err != nil { return 0, err } log.Printf("Importing page %d from photoset %s", page, photoset.Id) photosNode, err := im.getPhotosNode() if err != nil { return 0, err } for _, item := range resp.Photoset.Photo { filename := fmt.Sprintf("%s.%s", item.Id, item.Originalformat) photoNode, err := photosNode.ChildPathObject(filename) if err != nil { log.Printf("Flickr importer: error finding photo node %s for addition to photoset %s: %s", item.Id, photoset.Id, err) continue } if err := photosetNode.SetAttr("camliPath:"+filename, photoNode.PermanodeRef().String()); err != nil { log.Printf("Flickr importer: error adding photo %s to photoset %s: %s", item.Id, photoset.Id, err) } } if resp.Photoset.Page < resp.Photoset.Pages { return page + 1, nil } else { return 0, nil } }
func (r *run) importCheckin(parent *importer.Object, checkin *checkinItem, placeRef blob.Ref) (checkinNode *importer.Object, dup bool, err error) { checkinNode, err = parent.ChildPathObject(checkin.Id) if err != nil { return } title := fmt.Sprintf("Checkin at %s", checkin.Venue.Name) dup = checkinNode.Attr(nodeattr.StartDate) != "" if err := checkinNode.SetAttrs( attrFoursquareId, checkin.Id, attrFoursquareVenuePermanode, placeRef.String(), nodeattr.Type, "foursquare.com:checkin", nodeattr.StartDate, schema.RFC3339FromTime(time.Unix(checkin.CreatedAt, 0)), nodeattr.Title, title); err != nil { return nil, false, err } return checkinNode, dup, nil }
func (r *run) importPhotos(placeNode *importer.Object) error { photosNode, err := placeNode.ChildPathObject("photos") if err != nil { return err } if err := photosNode.SetAttrs( "title", "Photos of "+placeNode.Attr("title"), "camliDefVis", "hide"); err != nil { return err } resp := photosList{} if err := r.im.doAPI(r.Context, r.token(), &resp, "venues/"+placeNode.Attr("foursquareId")+"/photos", "limit", "10"); err != nil { return err } var need []*photoItem for _, photo := range resp.Response.Photos.Items { attr := "camliPath:" + photo.Id + filepath.Ext(photo.Suffix) if photosNode.Attr(attr) == "" { need = append(need, photo) } } if len(need) > 0 { log.Printf("foursquare: importing %d photos for venue %s", len(need), placeNode.Attr("title")) for _, photo := range need { attr := "camliPath:" + photo.Id + filepath.Ext(photo.Suffix) url := photo.Prefix + "original" + photo.Suffix log.Printf("foursquare: importing photo for venue %s: %s", placeNode.Attr("title"), url) ref := r.urlFileRef(url, "") if ref == "" { log.Printf("Error slurping photo: %s", url) continue } if err := photosNode.SetAttr(attr, ref); err != nil { log.Printf("Error adding venue photo: %#v", err) } } } return nil }
func (r *run) importCheckin(parent *importer.Object, checkin *checkinItem, placeRef blob.Ref) (*importer.Object, error) { checkinNode, err := parent.ChildPathObject(checkin.Id) if err != nil { return nil, err } title := fmt.Sprintf("Checkin at %s", checkin.Venue.Name) if err := checkinNode.SetAttrs( "foursquareId", checkin.Id, "foursquareVenuePermanode", placeRef.String(), "camliNodeType", "foursquare.com:checkin", "startDate", schema.RFC3339FromTime(time.Unix(checkin.CreatedAt, 0)), "title", title); err != nil { return nil, err } return checkinNode, nil }
// TODO(aa): // * Parallelize: http://golang.org/doc/effective_go.html#concurrency // * Do more than one "page" worth of results // * Report progress and errors back through host interface // * All the rest of the metadata (see photoMeta) // * Conflicts: For all metadata changes, prefer any non-imported claims // * Test! func (im *imp) importPhoto(parent *importer.Object, photo *photosSearchItem) error { filename := fmt.Sprintf("%s.%s", photo.Id, photo.Originalformat) photoNode, err := parent.ChildPathObject(filename) if err != nil { return err } // Import all the metadata. SetAttrs() is a no-op if the value hasn't changed, so there's no cost to doing these on every run. // And this way if we add more things to import, they will get picked up. if err := photoNode.SetAttrs( "flickrId", photo.Id, "title", photo.Title, "description", photo.Description.Content); err != nil { return err } // Import the photo itself. Since it is expensive to fetch the image, we store its lastupdate and only refetch if it might have changed. if photoNode.Attr("flickrLastupdate") == photo.Lastupdate { return nil } res, err := im.flickrRequest(photo.URL, url.Values{}) if err != nil { log.Printf("Flickr importer: Could not fetch %s: %s", photo.URL, err) return err } defer res.Body.Close() fileRef, err := schema.WriteFileFromReader(im.host.Target(), filename, res.Body) if err != nil { return err } if err := photoNode.SetAttr("camliContent", fileRef.String()); err != nil { return err } // Write lastupdate last, so that if any of the preceding fails, we will try again next time. if err := photoNode.SetAttr("flickrLastupdate", photo.Lastupdate); err != nil { return err } return nil }
func (r *run) importTweet(parent *importer.Object, tweet *tweetItem) error { tweetNode, err := parent.ChildPathObject(tweet.Id) if err != nil { return err } title := "Tweet id " + tweet.Id createdTime, err := time.Parse(time.RubyDate, tweet.CreatedAt) if err != nil { return fmt.Errorf("could not parse time %q: %v", tweet.CreatedAt, err) } // TODO: import photos referenced in tweets return tweetNode.SetAttrs( "twitterId", tweet.Id, "camliNodeType", "twitter.com:tweet", "startDate", schema.RFC3339FromTime(createdTime), "content", tweet.Text, "title", title) }
func (r *run) importPhotos(placeNode *importer.Object) error { photosNode, err := placeNode.ChildPathObject("photos") if err != nil { return err } photosNode.SetAttrs( "title", "Photos of "+placeNode.Attr("title"), "camliDefVis", "hide") resp := photosList{} if err := r.im.doAPI(r.Context, r.token(), &resp, "venues/"+placeNode.Attr("foursquareId")+"/photos", "limit", "10"); err != nil { return err } itemcount := len(resp.Response.Photos.Items) log.Printf("Importing %d photos for venue %s", itemcount, placeNode.Attr("title")) for _, photo := range resp.Response.Photos.Items { attr := "camliPath:" + photo.Id + filepath.Ext(photo.Suffix) if photosNode.Attr(attr) != "" { log.Printf("Skipping photo, we already have it") // Assume we have this photo already and don't need to refetch. continue } url := photo.Prefix + "original" + photo.Suffix ref := r.urlFileRef(url, "") if ref == "" { log.Printf("Error slurping photo: %s", url) continue } err = photosNode.SetAttr(attr, ref) if err != nil { log.Printf("Error adding venue photo: %#v", err) } } return nil }
func (r *run) importItem(parent *importer.Object, item *item) error { itemNode, err := parent.ChildPathObject(item.ID) if err != nil { return err } fileRef, err := schema.WriteFileFromReader(r.Host.Target(), "", bytes.NewBufferString(item.Content)) if err != nil { return err } if err := itemNode.SetAttrs( "feedItemId", item.ID, "camliNodeType", "feed:item", "title", item.Title, "link", item.Link, "author", item.Author, "camliContent", fileRef.String(), "feedMediaContentURL", item.MediaContent, ); err != nil { return err } return nil }
func (r *run) importCompanions(parent *importer.Object, companions []*user) (companionRefs []string, err error) { for _, user := range companions { personNode, err := parent.ChildPathObject(user.Id) if err != nil { return nil, err } attrs := []string{ attrFoursquareId, user.Id, nodeattr.Type, "foursquare.com:person", nodeattr.Title, user.FirstName + " " + user.LastName, nodeattr.GivenName, user.FirstName, nodeattr.FamilyName, user.LastName, } if icon := user.icon(); icon != "" { attrs = append(attrs, nodeattr.CamliContentImage, r.urlFileRef(icon, path.Base(icon))) } if err := personNode.SetAttrs(attrs...); err != nil { return nil, err } companionRefs = append(companionRefs, personNode.PermanodeRef().String()) } return companionRefs, nil }
// importPhotos imports venue photos for placeNode into a hidden
// "photos" child node. When the checkin for this venue was already a
// duplicate (checkinWasDup), only one photo is wanted; otherwise up to
// photosRequestLimit. Photos already present are not refetched.
func (r *run) importPhotos(placeNode *importer.Object, checkinWasDup bool) error {
	photosNode, err := placeNode.ChildPathObject("photos")
	if err != nil {
		return err
	}
	if err := photosNode.SetAttrs(
		nodeattr.Title, "Photos of "+placeNode.Attr("title"),
		nodeattr.DefaultVisibility, "hide"); err != nil {
		return err
	}
	// Count the photos already imported (stored as camliPath:* attrs)
	// so we can skip the API call entirely when we have enough.
	nHave := 0
	photosNode.ForeachAttr(func(key, value string) {
		if strings.HasPrefix(key, "camliPath:") {
			nHave++
		}
	})
	nWant := photosRequestLimit
	if checkinWasDup {
		nWant = 1
	}
	if nHave >= nWant {
		return nil
	}
	resp := photosList{}
	if err := r.im.doAPI(r.Context, r.token(), &resp,
		"venues/"+placeNode.Attr(attrFoursquareId)+"/photos",
		"limit", strconv.Itoa(nWant)); err != nil {
		return err
	}
	// Collect only the photos we don't have yet.
	var need []*photoItem
	for _, photo := range resp.Response.Photos.Items {
		attr := "camliPath:" + photo.Id + filepath.Ext(photo.Suffix)
		if photosNode.Attr(attr) == "" {
			need = append(need, photo)
		}
	}
	if len(need) > 0 {
		venueTitle := placeNode.Attr(nodeattr.Title)
		log.Printf("foursquare: importing %d photos for venue %s", len(need), venueTitle)
		for _, photo := range need {
			attr := "camliPath:" + photo.Id + filepath.Ext(photo.Suffix)
			// Re-check: guards against duplicate photo ids within
			// this batch being imported twice.
			if photosNode.Attr(attr) != "" {
				continue
			}
			url := photo.Prefix + "original" + photo.Suffix
			log.Printf("foursquare: importing photo for venue %s: %s", venueTitle, url)
			ref := r.urlFileRef(url, "")
			if ref == "" {
				r.errorf("Error slurping photo: %s", url)
				continue
			}
			if err := photosNode.SetAttr(attr, ref); err != nil {
				r.errorf("Error adding venue photo: %#v", err)
			}
		}
	}
	return nil
}
// viaAPI is true if it came via the REST API, or false if it came via a zip file. func (r *run) importTweet(parent *importer.Object, tweet tweetItem, viaAPI bool) (dup bool, err error) { select { case <-r.Context().Done(): r.errorf("Twitter importer: interrupted") return false, r.Context().Err() default: } id := tweet.ID() tweetNode, err := parent.ChildPathObject(id) if err != nil { return false, err } // Because the zip format and the API format differ a bit, and // might diverge more in the future, never use the zip content // to overwrite data fetched via the API. If we add new // support for different fields in the future, we might want // to revisit this decision. Be wary of flip/flopping data if // modifying this, though. if tweetNode.Attr(attrImportMethod) == "api" && !viaAPI { return true, nil } // e.g. "2014-06-12 19:11:51 +0000" createdTime, err := timeParseFirstFormat(tweet.CreatedAt(), time.RubyDate, "2006-01-02 15:04:05 -0700") if err != nil { return false, fmt.Errorf("could not parse time %q: %v", tweet.CreatedAt(), err) } url := fmt.Sprintf("https://twitter.com/%s/status/%v", r.AccountNode().Attr(importer.AcctAttrUserName), id) attrs := []string{ "twitterId", id, nodeattr.Type, "twitter.com:tweet", nodeattr.StartDate, schema.RFC3339FromTime(createdTime), nodeattr.Content, tweet.Text(), nodeattr.URL, url, } if lat, long, ok := tweet.LatLong(); ok { attrs = append(attrs, nodeattr.Latitude, fmt.Sprint(lat), nodeattr.Longitude, fmt.Sprint(long), ) } if viaAPI { attrs = append(attrs, attrImportMethod, "api") } else { attrs = append(attrs, attrImportMethod, "zip") } for i, m := range tweet.Media() { filename := m.BaseFilename() if tweetNode.Attr("camliPath:"+filename) != "" && (i > 0 || tweetNode.Attr("camliContentImage") != "") { // Don't re-import media we've already fetched. 
continue } tried, gotMedia := 0, false for _, mediaURL := range m.URLs() { tried++ res, err := ctxutil.Client(r.Context()).Get(mediaURL) if err != nil { return false, fmt.Errorf("Error fetching %s for tweet %s : %v", mediaURL, url, err) } if res.StatusCode == http.StatusNotFound { continue } if res.StatusCode != 200 { return false, fmt.Errorf("HTTP status %d fetching %s for tweet %s", res.StatusCode, mediaURL, url) } if !viaAPI { log.Printf("For zip tweet %s, reading %v", url, mediaURL) } fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body) res.Body.Close() if err != nil { return false, fmt.Errorf("Error fetching media %s for tweet %s: %v", mediaURL, url, err) } attrs = append(attrs, "camliPath:"+filename, fileRef.String()) if i == 0 { attrs = append(attrs, "camliContentImage", fileRef.String()) } log.Printf("Slurped %s as %s for tweet %s (%v)", mediaURL, fileRef.String(), url, tweetNode.PermanodeRef()) gotMedia = true break } if !gotMedia && tried > 0 { return false, fmt.Errorf("All media URLs 404s for tweet %s", url) } } changes, err := tweetNode.SetAttrs2(attrs...) if err == nil && changes { log.Printf("Imported tweet %s", url) } return !changes, err }
// TODO(aa):
// * Parallelize: http://golang.org/doc/effective_go.html#concurrency
// * Do more than one "page" worth of results
// * Report progress and errors back through host interface
// * All the rest of the metadata (see photoMeta)
// * Conflicts: For all metadata changes, prefer any non-imported claims
// * Test!

// importPhoto imports a single photo's metadata under parent, and its
// image bytes too when its lastupdate stamp differs from the one
// stored on the node from a previous run.
func (r *run) importPhoto(parent *importer.Object, photo *photosSearchItem) error {
	filename := fmt.Sprintf("%s.%s", photo.Id, photo.OriginalFormat)
	photoNode, err := parent.ChildPathObject(filename)
	if err != nil {
		return err
	}

	// https://www.flickr.com/services/api/misc.dates.html
	dateTaken, err := time.ParseInLocation("2006-01-02 15:04:05", photo.DateTaken, schema.UnknownLocation)
	if err != nil {
		// default to the published date otherwise
		log.Printf("Flickr importer: problem with date taken of photo %v, defaulting to published date instead.", photo.Id)
		seconds, err := strconv.ParseInt(photo.DateUpload, 10, 64)
		if err != nil {
			return fmt.Errorf("could not parse date upload time %q for image %v: %v", photo.DateUpload, photo.Id, err)
		}
		dateTaken = time.Unix(seconds, 0)
	}

	attrs := []string{
		attrFlickrId, photo.Id,
		nodeattr.DateCreated, schema.RFC3339FromTime(dateTaken),
		nodeattr.Description, photo.Description.Content,
	}
	if schema.IsInterestingTitle(photo.Title) {
		attrs = append(attrs, nodeattr.Title, photo.Title)
	}
	// Import all the metadata. SetAttrs() is a no-op if the value hasn't changed, so there's no cost to doing these on every run.
	// And this way if we add more things to import, they will get picked up.
	if err := photoNode.SetAttrs(attrs...); err != nil {
		return err
	}

	// Import the photo itself. Since it is expensive to fetch the image, we store its lastupdate and only refetch if it might have changed.
	// lastupdate is a Unix timestamp according to https://www.flickr.com/services/api/flickr.photos.getInfo.html
	seconds, err := strconv.ParseInt(photo.LastUpdate, 10, 64)
	if err != nil {
		return fmt.Errorf("could not parse lastupdate time for image %v: %v", photo.Id, err)
	}
	lastUpdate := time.Unix(seconds, 0)
	if lastUpdateString := photoNode.Attr(nodeattr.DateModified); lastUpdateString != "" {
		oldLastUpdate, err := time.Parse(time.RFC3339, lastUpdateString)
		if err != nil {
			return fmt.Errorf("could not parse last stored update time for image %v: %v", photo.Id, err)
		}
		if lastUpdate.Equal(oldLastUpdate) {
			// Unchanged since last import; still keep the photoset's
			// primary-photo link up to date, then skip the fetch.
			if err := r.updatePrimaryPhoto(photoNode); err != nil {
				return err
			}
			return nil
		}
	}
	form := url.Values{}
	form.Set("user_id", r.userID)
	res, err := r.fetch(photo.URL, form)
	if err != nil {
		log.Printf("Flickr importer: Could not fetch %s: %s", photo.URL, err)
		return err
	}
	defer res.Body.Close()

	fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
	if err != nil {
		return err
	}
	if err := photoNode.SetAttr(nodeattr.CamliContent, fileRef.String()); err != nil {
		return err
	}
	if err := r.updatePrimaryPhoto(photoNode); err != nil {
		return err
	}
	// Write lastupdate last, so that if any of the preceding fails, we will try again next time.
	if err := photoNode.SetAttr(nodeattr.DateModified, schema.RFC3339FromTime(lastUpdate)); err != nil {
		return err
	}
	return nil
}
// importAlbum imports a single Picasa album under albumsNode: it sets
// the album permanode's metadata and then imports every photo in the
// album (concurrently, bounded by r.photoGate). On full success the
// album node's DateModified is updated so an unchanged album can be
// skipped on a later incremental run.
func (r *run) importAlbum(ctx context.Context, albumsNode *importer.Object, album picago.Album) (ret error) {
	if album.ID == "" {
		return errors.New("album has no ID")
	}
	albumNode, err := albumsNode.ChildPathObject(album.ID)
	if err != nil {
		return fmt.Errorf("importAlbum: error listing album: %v", err)
	}

	dateMod := schema.RFC3339FromTime(album.Updated)

	// Data reference: https://developers.google.com/picasa-web/docs/2.0/reference
	// TODO(tgulacsi): add more album info
	changes, err := albumNode.SetAttrs2(
		attrPicasaId, album.ID,
		nodeattr.Type, "picasaweb.google.com:album",
		nodeattr.Title, album.Title,
		nodeattr.DatePublished, schema.RFC3339FromTime(album.Published),
		nodeattr.LocationText, album.Location,
		nodeattr.Description, album.Description,
		nodeattr.URL, album.URL,
	)
	if err != nil {
		return fmt.Errorf("error setting album attributes: %v", err)
	}
	// Skip the photo listing entirely when nothing about the album
	// changed and an incremental run already imported this version.
	if !changes && r.incremental && albumNode.Attr(nodeattr.DateModified) == dateMod {
		return nil
	}
	defer func() {
		// Don't update DateModified on the album node until
		// we've successfully imported all the photos.
		if ret == nil {
			ret = albumNode.SetAttr(nodeattr.DateModified, dateMod)
		}
	}()

	log.Printf("Importing album %v: %v/%v (published %v, updated %v)", album.ID, album.Name, album.Title, album.Published, album.Updated)

	// TODO(bradfitz): GetPhotos does multiple HTTP requests to
	// return a slice of all photos. My "InstantUpload/Auto
	// Backup" album has 6678 photos (and growing) and this
	// currently takes like 40 seconds. Fix.
	photos, err := picago.GetPhotos(ctxutil.Client(ctx), "default", album.ID)
	if err != nil {
		return err
	}

	log.Printf("Importing %d photos from album %q (%s)", len(photos), albumNode.Attr(nodeattr.Title), albumNode.PermanodeRef())

	var grp syncutil.Group
	for i := range photos {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		photo := photos[i]
		// Start() outside the goroutine bounds concurrency before
		// the goroutine is spawned.
		r.photoGate.Start()
		grp.Go(func() error {
			defer r.photoGate.Done()
			return r.updatePhotoInAlbum(ctx, albumNode, photo)
		})
	}
	return grp.Err()
}