func (r *run) importPost(post *apiPost, parent *importer.Object) error {
	postNode, err := parent.ChildPathObject(post.Hash)
	if err != nil {
		return err
	}
	t, err := time.Parse(timeFormat, post.Time)
	if err != nil {
		return err
	}
	attrs := []string{
		"pinboard.in:hash", post.Hash,
		nodeattr.Type, "pinboard.in:post",
		nodeattr.DateCreated, schema.RFC3339FromTime(t),
		nodeattr.Title, post.Description,
		nodeattr.URL, post.Href,
		"pinboard.in:extended", post.Extended,
		"pinboard.in:meta", post.Meta,
		"pinboard.in:shared", post.Shared,
		"pinboard.in:toread", post.ToRead,
	}
	if err = postNode.SetAttrs(attrs...); err != nil {
		return err
	}
	if err = postNode.SetAttrValues("tag", strings.Split(post.Tags, " ")); err != nil {
		return err
	}
	return nil
}
func (r *run) importCheckin(parent *importer.Object, checkin *checkinItem, placeRef blob.Ref) (checkinNode *importer.Object, dup bool, err error) {
	checkinNode, err = parent.ChildPathObject(checkin.Id)
	if err != nil {
		return
	}

	title := fmt.Sprintf("Checkin at %s", checkin.Venue.Name)
	dup = checkinNode.Attr(nodeattr.StartDate) != ""
	if err := checkinNode.SetAttrs(
		attrFoursquareId, checkin.Id,
		attrFoursquareVenuePermanode, placeRef.String(),
		nodeattr.Type, "foursquare.com:checkin",
		nodeattr.StartDate, schema.RFC3339FromTime(time.Unix(checkin.CreatedAt, 0)),
		nodeattr.Title, title); err != nil {
		return nil, false, err
	}
	return checkinNode, dup, nil
}
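The attrFoursquareId and attrFoursquareVenuePermanode constants are declared elsewhere in the package. Judging from the older variant of importCheckin below, which still uses raw string literals, they plausibly read as follows (an assumption, not the canonical source):

const (
	// Assumed declarations, inferred from the literals in the older
	// importCheckin variant below.
	attrFoursquareId             = "foursquareId"
	attrFoursquareVenuePermanode = "foursquareVenuePermanode"
)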
func (r *run) importCheckin(parent *importer.Object, checkin *checkinItem, placeRef blob.Ref) (*importer.Object, error) {
	checkinNode, err := parent.ChildPathObject(checkin.Id)
	if err != nil {
		return nil, err
	}

	title := fmt.Sprintf("Checkin at %s", checkin.Venue.Name)

	if err := checkinNode.SetAttrs(
		"foursquareId", checkin.Id,
		"foursquareVenuePermanode", placeRef.String(),
		"camliNodeType", "foursquare.com:checkin",
		"startDate", schema.RFC3339FromTime(time.Unix(checkin.CreatedAt, 0)),
		"title", title); err != nil {
		return nil, err
	}

	return checkinNode, nil
}
func (r *run) importTweet(parent *importer.Object, tweet *tweetItem) error {
	tweetNode, err := parent.ChildPathObject(tweet.Id)
	if err != nil {
		return err
	}

	title := "Tweet id " + tweet.Id

	createdTime, err := time.Parse(time.RubyDate, tweet.CreatedAt)
	if err != nil {
		return fmt.Errorf("could not parse time %q: %v", tweet.CreatedAt, err)
	}

	// TODO: import photos referenced in tweets
	return tweetNode.SetAttrs(
		"twitterId", tweet.Id,
		"camliNodeType", "twitter.com:tweet",
		"startDate", schema.RFC3339FromTime(createdTime),
		"content", tweet.Text,
		"title", title)
}
// TODO(aa):
// * Parallelize: http://golang.org/doc/effective_go.html#concurrency
// * Do more than one "page" worth of results
// * Report progress and errors back through host interface
// * All the rest of the metadata (see photoMeta)
// * Conflicts: For all metadata changes, prefer any non-imported claims
// * Test!
func (r *run) importPhoto(parent *importer.Object, photo *photosSearchItem) error {
	filename := fmt.Sprintf("%s.%s", photo.Id, photo.OriginalFormat)
	photoNode, err := parent.ChildPathObject(filename)
	if err != nil {
		return err
	}

	// https://www.flickr.com/services/api/misc.dates.html
	dateTaken, err := time.ParseInLocation("2006-01-02 15:04:05", photo.DateTaken, schema.UnknownLocation)
	if err != nil {
		// Default to the published date otherwise.
		log.Printf("Flickr importer: problem with date taken of photo %v, defaulting to published date instead.", photo.Id)
		seconds, err := strconv.ParseInt(photo.DateUpload, 10, 64)
		if err != nil {
			return fmt.Errorf("could not parse date upload time %q for image %v: %v", photo.DateUpload, photo.Id, err)
		}
		dateTaken = time.Unix(seconds, 0)
	}

	attrs := []string{
		attrFlickrId, photo.Id,
		nodeattr.DateCreated, schema.RFC3339FromTime(dateTaken),
		nodeattr.Description, photo.Description.Content,
	}
	if schema.IsInterestingTitle(photo.Title) {
		attrs = append(attrs, nodeattr.Title, photo.Title)
	}
	// Import all the metadata. SetAttrs() is a no-op if the value hasn't
	// changed, so there's no cost to doing these on every run. And this
	// way, if we add more things to import, they will get picked up.
	if err := photoNode.SetAttrs(attrs...); err != nil {
		return err
	}

	// Import the photo itself. Since it is expensive to fetch the image,
	// we store its lastupdate and only refetch if it might have changed.
	// lastupdate is a Unix timestamp according to
	// https://www.flickr.com/services/api/flickr.photos.getInfo.html
	seconds, err := strconv.ParseInt(photo.LastUpdate, 10, 64)
	if err != nil {
		return fmt.Errorf("could not parse lastupdate time for image %v: %v", photo.Id, err)
	}
	lastUpdate := time.Unix(seconds, 0)
	if lastUpdateString := photoNode.Attr(nodeattr.DateModified); lastUpdateString != "" {
		oldLastUpdate, err := time.Parse(time.RFC3339, lastUpdateString)
		if err != nil {
			return fmt.Errorf("could not parse last stored update time for image %v: %v", photo.Id, err)
		}
		if lastUpdate.Equal(oldLastUpdate) {
			if err := r.updatePrimaryPhoto(photoNode); err != nil {
				return err
			}
			return nil
		}
	}
	form := url.Values{}
	form.Set("user_id", r.userID)
	res, err := r.fetch(photo.URL, form)
	if err != nil {
		log.Printf("Flickr importer: Could not fetch %s: %s", photo.URL, err)
		return err
	}
	defer res.Body.Close()

	fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
	if err != nil {
		return err
	}
	if err := photoNode.SetAttr(nodeattr.CamliContent, fileRef.String()); err != nil {
		return err
	}
	if err := r.updatePrimaryPhoto(photoNode); err != nil {
		return err
	}
	// Write lastupdate last, so that if any of the preceding fails, we
	// will try again next time.
	if err := photoNode.SetAttr(nodeattr.DateModified, schema.RFC3339FromTime(lastUpdate)); err != nil {
		return err
	}
	return nil
}
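The refetch check above leans on RFC 3339 round-tripping: the DateModified string written with schema.RFC3339FromTime parses back, via time.Parse(time.RFC3339, ...), to an instant that Equal() can compare against lastUpdate. A minimal sketch of that round trip (the concrete timestamp is illustrative only):

// schema.RFC3339FromTime formats in UTC, so a time built from a
// whole-second Unix timestamp survives the round trip intact.
lastUpdate := time.Unix(1257894000, 0)
stored := schema.RFC3339FromTime(lastUpdate) // "2009-11-10T23:00:00Z"
parsed, err := time.Parse(time.RFC3339, stored)
if err == nil && parsed.Equal(lastUpdate) {
	// Nothing changed since the last run: skip the expensive refetch.
}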
func (r *run) updatePhotoInAlbum(ctx context.Context, albumNode *importer.Object, photo picago.Photo) (ret error) {
	if photo.ID == "" {
		return errors.New("photo has no ID")
	}

	getMediaBytes := func() (io.ReadCloser, error) {
		log.Printf("Importing media from %v", photo.URL)
		resp, err := ctxutil.Client(ctx).Get(photo.URL)
		if err != nil {
			return nil, fmt.Errorf("importing photo %s: %v", photo.ID, err)
		}
		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			return nil, fmt.Errorf("importing photo %s: status code = %d", photo.ID, resp.StatusCode)
		}
		return resp.Body, nil
	}

	var fileRefStr string
	idFilename := photo.ID + "-" + photo.Filename
	photoNode, err := albumNode.ChildPathObjectOrFunc(idFilename, func() (*importer.Object, error) {
		h := blob.NewHash()
		rc, err := getMediaBytes()
		if err != nil {
			return nil, err
		}
		fileRef, err := schema.WriteFileFromReader(r.Host.Target(), photo.Filename, io.TeeReader(rc, h))
		if err != nil {
			return nil, err
		}
		fileRefStr = fileRef.String()
		wholeRef := blob.RefFromHash(h)
		if pn, err := findExistingPermanode(r.Host.Searcher(), wholeRef); err == nil {
			return r.Host.ObjectFromRef(pn)
		}
		return r.Host.NewObject()
	})
	if err != nil {
		return err
	}

	const attrMediaURL = "picasaMediaURL"
	if fileRefStr == "" {
		fileRefStr = photoNode.Attr(nodeattr.CamliContent)
		// Only re-download the source photo if its URL has changed.
		// Empirically this seems to work: cropping a photo in the
		// photos.google.com UI causes its URL to change. And it makes
		// sense, looking at the ugliness of the URLs with all their
		// encoded/signed state.
		if !mediaURLsEqual(photoNode.Attr(attrMediaURL), photo.URL) {
			rc, err := getMediaBytes()
			if err != nil {
				return err
			}
			fileRef, err := schema.WriteFileFromReader(r.Host.Target(), photo.Filename, rc)
			rc.Close()
			if err != nil {
				return err
			}
			fileRefStr = fileRef.String()
		}
	}

	title := strings.TrimSpace(photo.Description)
	if strings.Contains(title, "\n") {
		title = title[:strings.Index(title, "\n")]
	}
	if title == "" && schema.IsInterestingTitle(photo.Filename) {
		title = photo.Filename
	}

	// TODO(tgulacsi): add more attrs (comments ?)
	// For names, see http://schema.org/ImageObject and http://schema.org/CreativeWork.
	attrs := []string{
		nodeattr.CamliContent, fileRefStr,
		attrPicasaId, photo.ID,
		nodeattr.Title, title,
		nodeattr.Description, photo.Description,
		nodeattr.LocationText, photo.Location,
		nodeattr.DateModified, schema.RFC3339FromTime(photo.Updated),
		nodeattr.DatePublished, schema.RFC3339FromTime(photo.Published),
		nodeattr.URL, photo.PageURL,
	}
	if photo.Latitude != 0 || photo.Longitude != 0 {
		attrs = append(attrs,
			nodeattr.Latitude, fmt.Sprintf("%f", photo.Latitude),
			nodeattr.Longitude, fmt.Sprintf("%f", photo.Longitude),
		)
	}
	if err := photoNode.SetAttrs(attrs...); err != nil {
		return err
	}
	if err := photoNode.SetAttrValues("tag", photo.Keywords); err != nil {
		return err
	}
	if photo.Position > 0 {
		if err := albumNode.SetAttr(
			nodeattr.CamliPathOrderColon+strconv.Itoa(photo.Position-1),
			photoNode.PermanodeRef().String()); err != nil {
			return err
		}
	}

	// Do this last, after we're sure the "camliContent" attribute
	// has been saved successfully, because this is the one that
	// causes us to do it again in the future or not.
	if err := photoNode.SetAttrs(attrMediaURL, photo.URL); err != nil {
		return err
	}
	return nil
}
func (r *run) importAlbum(ctx context.Context, albumsNode *importer.Object, album picago.Album) (ret error) {
	if album.ID == "" {
		return errors.New("album has no ID")
	}
	albumNode, err := albumsNode.ChildPathObject(album.ID)
	if err != nil {
		return fmt.Errorf("importAlbum: error listing album: %v", err)
	}

	dateMod := schema.RFC3339FromTime(album.Updated)

	// Data reference: https://developers.google.com/picasa-web/docs/2.0/reference
	// TODO(tgulacsi): add more album info
	changes, err := albumNode.SetAttrs2(
		attrPicasaId, album.ID,
		nodeattr.Type, "picasaweb.google.com:album",
		nodeattr.Title, album.Title,
		nodeattr.DatePublished, schema.RFC3339FromTime(album.Published),
		nodeattr.LocationText, album.Location,
		nodeattr.Description, album.Description,
		nodeattr.URL, album.URL,
	)
	if err != nil {
		return fmt.Errorf("error setting album attributes: %v", err)
	}
	if !changes && r.incremental && albumNode.Attr(nodeattr.DateModified) == dateMod {
		return nil
	}
	defer func() {
		// Don't update DateModified on the album node until
		// we've successfully imported all the photos.
		if ret == nil {
			ret = albumNode.SetAttr(nodeattr.DateModified, dateMod)
		}
	}()

	log.Printf("Importing album %v: %v/%v (published %v, updated %v)", album.ID, album.Name, album.Title, album.Published, album.Updated)

	// TODO(bradfitz): GetPhotos does multiple HTTP requests to
	// return a slice of all photos. My "InstantUpload/Auto
	// Backup" album has 6678 photos (and growing) and this
	// currently takes like 40 seconds. Fix.
	photos, err := picago.GetPhotos(ctxutil.Client(ctx), "default", album.ID)
	if err != nil {
		return err
	}

	log.Printf("Importing %d photos from album %q (%s)", len(photos), albumNode.Attr(nodeattr.Title), albumNode.PermanodeRef())

	var grp syncutil.Group
	for i := range photos {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		photo := photos[i]
		r.photoGate.Start()
		grp.Go(func() error {
			defer r.photoGate.Done()
			return r.updatePhotoInAlbum(ctx, albumNode, photo)
		})
	}
	return grp.Err()
}
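The fan-out at the end of importAlbum pairs two go4.org/syncutil types: a Gate bounds how many photo downloads run at once, and a Group collects the first error from the goroutines. Here is the same pattern in isolation, a minimal sketch assuming r.photoGate is a *syncutil.Gate (processAll and process are illustrative names, not part of the importer):

import "go4.org/syncutil"

// processAll runs process over items with bounded concurrency and
// returns the first error, mirroring the loop in importAlbum above.
func processAll(items []string, process func(string) error) error {
	gate := syncutil.NewGate(3) // at most 3 goroutines in flight
	var grp syncutil.Group
	for _, it := range items {
		it := it     // capture the loop variable for the closure
		gate.Start() // blocks until a slot frees up
		grp.Go(func() error {
			defer gate.Done()
			return process(it)
		})
	}
	return grp.Err() // first error, or nil if all succeeded
}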
// advanceTime advances the test clock by one second and returns the
// new fake "now" as an RFC 3339 string.
func (id *IndexDeps) advanceTime() string {
	id.now = id.now.Add(1 * time.Second)
	return schema.RFC3339FromTime(id.now)
}
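advanceTime exists so index tests never mint two claims at the same instant; each mutation gets a strictly later timestamp and claim ordering stays deterministic. A hedged usage sketch:

// Two successive calls yield distinct RFC 3339 strings one second
// apart, so claims created in sequence sort stably in the index.
t1 := id.advanceTime()
t2 := id.advanceTime()
// UTC RFC 3339 strings of equal length compare lexically in time
// order, so t1 < t2 both as strings and as instants.
_ = t1 < t2 // always true here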
// viaAPI is true if it came via the REST API, or false if it came via a zip file.
func (r *run) importTweet(parent *importer.Object, tweet tweetItem, viaAPI bool) (dup bool, err error) {
	select {
	case <-r.Context().Done():
		r.errorf("Twitter importer: interrupted")
		return false, r.Context().Err()
	default:
	}
	id := tweet.ID()
	tweetNode, err := parent.ChildPathObject(id)
	if err != nil {
		return false, err
	}

	// Because the zip format and the API format differ a bit, and
	// might diverge more in the future, never use the zip content
	// to overwrite data fetched via the API. If we add new
	// support for different fields in the future, we might want
	// to revisit this decision. Be wary of flip/flopping data if
	// modifying this, though.
	if tweetNode.Attr(attrImportMethod) == "api" && !viaAPI {
		return true, nil
	}

	// e.g. "2014-06-12 19:11:51 +0000"
	createdTime, err := timeParseFirstFormat(tweet.CreatedAt(), time.RubyDate, "2006-01-02 15:04:05 -0700")
	if err != nil {
		return false, fmt.Errorf("could not parse time %q: %v", tweet.CreatedAt(), err)
	}

	url := fmt.Sprintf("https://twitter.com/%s/status/%v", r.AccountNode().Attr(importer.AcctAttrUserName), id)

	attrs := []string{
		"twitterId", id,
		nodeattr.Type, "twitter.com:tweet",
		nodeattr.StartDate, schema.RFC3339FromTime(createdTime),
		nodeattr.Content, tweet.Text(),
		nodeattr.URL, url,
	}
	if lat, long, ok := tweet.LatLong(); ok {
		attrs = append(attrs,
			nodeattr.Latitude, fmt.Sprint(lat),
			nodeattr.Longitude, fmt.Sprint(long),
		)
	}
	if viaAPI {
		attrs = append(attrs, attrImportMethod, "api")
	} else {
		attrs = append(attrs, attrImportMethod, "zip")
	}

	for i, m := range tweet.Media() {
		filename := m.BaseFilename()
		if tweetNode.Attr("camliPath:"+filename) != "" && (i > 0 || tweetNode.Attr("camliContentImage") != "") {
			// Don't re-import media we've already fetched.
			continue
		}
		tried, gotMedia := 0, false
		for _, mediaURL := range m.URLs() {
			tried++
			res, err := ctxutil.Client(r.Context()).Get(mediaURL)
			if err != nil {
				return false, fmt.Errorf("error fetching %s for tweet %s: %v", mediaURL, url, err)
			}
			if res.StatusCode == http.StatusNotFound {
				res.Body.Close()
				continue
			}
			if res.StatusCode != http.StatusOK {
				res.Body.Close()
				return false, fmt.Errorf("HTTP status %d fetching %s for tweet %s", res.StatusCode, mediaURL, url)
			}
			if !viaAPI {
				log.Printf("For zip tweet %s, reading %v", url, mediaURL)
			}
			fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
			res.Body.Close()
			if err != nil {
				return false, fmt.Errorf("error writing media %s for tweet %s: %v", mediaURL, url, err)
			}
			attrs = append(attrs, "camliPath:"+filename, fileRef.String())
			if i == 0 {
				attrs = append(attrs, "camliContentImage", fileRef.String())
			}
			log.Printf("Slurped %s as %s for tweet %s (%v)", mediaURL, fileRef.String(), url, tweetNode.PermanodeRef())
			gotMedia = true
			break
		}
		if !gotMedia && tried > 0 {
			return false, fmt.Errorf("all media URLs 404ed for tweet %s", url)
		}
	}

	changes, err := tweetNode.SetAttrs2(attrs...)
	if err == nil && changes {
		log.Printf("Imported tweet %s", url)
	}
	return !changes, err
}
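timeParseFirstFormat is not shown in this excerpt; from the call site it evidently tries each layout in turn, accommodating both the API's Ruby-style dates and the zip export's format. A plausible reconstruction (an assumption, not the actual source):

// Try each time layout in order and return the first successful
// parse, or the last parse error if none matched.
func timeParseFirstFormat(timeStr string, formats ...string) (t time.Time, err error) {
	if len(formats) == 0 {
		return time.Time{}, errors.New("no formats given")
	}
	for _, f := range formats {
		t, err = time.Parse(f, timeStr)
		if err == nil {
			return t, nil
		}
	}
	return time.Time{}, err
}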
func (up *Uploader) uploadNodeRegularFile(n *node) (*client.PutResult, error) {
	m := schema.NewCommonFileMap(n.fullPath, n.fi)
	m["camliType"] = "file"
	file, err := up.open(n.fullPath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	if up.fileOpts.exifTime {
		ra, ok := file.(io.ReaderAt)
		if !ok {
			return nil, errors.New("error asserting local file to io.ReaderAt")
		}
		modtime, err := schema.FileTime(ra)
		if err != nil {
			log.Printf("warning: getting time from EXIF failed for %v: %v", n.fullPath, err)
		} else {
			m["unixMtime"] = schema.RFC3339FromTime(modtime)
		}
	}

	size := n.fi.Size()
	var fileContents io.Reader = io.LimitReader(file, size)

	if up.fileOpts.wantVivify() {
		err := schema.WriteFileChunks(up.statReceiver(), m, fileContents)
		if err != nil {
			return nil, err
		}
		json, err := m.JSON()
		if err != nil {
			return nil, err
		}
		bref := blobref.SHA1FromString(json)
		h := &client.UploadHandle{
			BlobRef:  bref,
			Size:     int64(len(json)),
			Contents: strings.NewReader(json),
			Vivify:   true,
		}
		return up.Upload(h)
	}

	var (
		blobref *blobref.BlobRef // of file schemaref
		sum     string           // "sha1-xxxxx"
	)
	const dupCheckThreshold = 256 << 10
	if size > dupCheckThreshold {
		sumRef, err := up.wholeFileDigest(n.fullPath)
		if err == nil {
			sum = sumRef.String()
			if ref, ok := up.fileMapFromDuplicate(up.statReceiver(), m, sum); ok {
				blobref = ref
			}
		}
	}

	if blobref == nil {
		if sum == "" && up.fileOpts.wantFilePermanode() {
			fileContents = &trackDigestReader{r: fileContents}
		}
		blobref, err = schema.WriteFileMap(up.statReceiver(), m, fileContents)
		if err != nil {
			return nil, err
		}
	}

	// TODO(mpl): test that none of these claims get uploaded if they've already been done
	if up.fileOpts.wantFilePermanode() {
		if td, ok := fileContents.(*trackDigestReader); ok {
			sum = td.Sum()
		}
		// Use a fixed time value for signing; not using modtime
		// so two identical files don't have different modtimes?
		// TODO(bradfitz): consider this more?
		permaNodeSigTime := time.Unix(0, 0)
		permaNode, err := up.UploadPlannedPermanode(sum, permaNodeSigTime)
		if err != nil {
			return nil, fmt.Errorf("error uploading permanode for node %v: %v", n, err)
		}
		handleResult("node-permanode", permaNode, nil)

		// claimTime is both the time of the "claimDate" in the
		// JSON claim, as well as the date in the OpenPGP
		// header.
		// TODO(bradfitz): this is a little clumsy to do by hand.
		// There should probably be a method on *Uploader to do this
		// from an unsigned schema map. Maybe ditch the schema.Claimer
		// type and just have the Uploader override the claimDate.
		claimTime, err := time.Parse(time.RFC3339, m["unixMtime"].(string))
		if err != nil {
			return nil, fmt.Errorf("while parsing modtime for file %v: %v", n.fullPath, err)
		}
		contentAttr := schema.NewSetAttributeClaim(permaNode.BlobRef, "camliContent", blobref.String())
		contentAttr.SetClaimDate(claimTime)
		signed, err := up.SignMap(contentAttr, claimTime)
		if err != nil {
			return nil, fmt.Errorf("failed to sign content claim for node %v: %v", n, err)
		}
		put, err := up.uploadString(signed)
		if err != nil {
			return nil, fmt.Errorf("error uploading permanode's attribute for node %v: %v", n, err)
		}
		handleResult("node-permanode-contentattr", put, nil)
		if tags := up.fileOpts.tags(); len(tags) > 0 {
			// TODO(mpl): do these claims concurrently, not in series
			for _, tag := range tags {
				m := schema.NewAddAttributeClaim(permaNode.BlobRef, "tag", tag)
				m.SetClaimDate(claimTime)
				// TODO(mpl): verify that SetClaimDate does modify the GPG signature date of the claim
				signed, err := up.SignMap(m, claimTime)
				if err != nil {
					return nil, fmt.Errorf("failed to sign tag claim for node %v: %v", n, err)
				}
				put, err := up.uploadString(signed)
				if err != nil {
					return nil, fmt.Errorf("error uploading permanode's tag attribute %v for node %v: %v", tag, n, err)
				}
				handleResult("node-permanode-tag", put, nil)
			}
		}
	}

	// TODO(bradfitz): faking a PutResult here to return
	// is kinda gross. should instead make a
	// blobserver.Storage wrapper type (wrapping
	// statReceiver) that can track some of this? or make
	// schemaWriteFileMap return it?
	json, _ := m.JSON()
	pr := &client.PutResult{BlobRef: blobref, Size: int64(len(json)), Skipped: false}
	return pr, nil
}
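trackDigestReader is referenced above but not defined in this excerpt. From its use, wrapping fileContents and later answering td.Sum(), it evidently hashes bytes as they stream through, yielding the "sha1-xxx" sum needed for the planned permanode. A minimal sketch under that assumption:

import (
	"crypto/sha1"
	"fmt"
	"hash"
	"io"
)

// trackDigestReader (assumed shape) wraps an io.Reader and hashes
// everything read through it.
type trackDigestReader struct {
	r io.Reader
	h hash.Hash
}

func (t *trackDigestReader) Read(p []byte) (n int, err error) {
	if t.h == nil {
		t.h = sha1.New()
	}
	n, err = t.r.Read(p)
	t.h.Write(p[:n]) // hash.Hash writes never return an error
	return
}

// Sum reports the digest of all bytes read so far, in the
// "sha1-xxxxx" notation used elsewhere in this file.
func (t *trackDigestReader) Sum() string {
	return fmt.Sprintf("sha1-%x", t.h.Sum(nil))
}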