Example 1
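Pinboard importer: each post becomes a child permanode keyed by the post hash. The post's time is parsed and stored as nodeattr.DateCreated via schema.RFC3339FromTime, and the space-separated tag string is split into individual "tag" values.
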
func (r *run) importPost(post *apiPost, parent *importer.Object) error {
	postNode, err := parent.ChildPathObject(post.Hash)
	if err != nil {
		return err
	}

	t, err := time.Parse(timeFormat, post.Time)
	if err != nil {
		return err
	}

	attrs := []string{
		"pinboard.in:hash", post.Hash,
		nodeattr.Type, "pinboard.in:post",
		nodeattr.DateCreated, schema.RFC3339FromTime(t),
		nodeattr.Title, post.Description,
		nodeattr.URL, post.Href,
		"pinboard.in:extended", post.Extended,
		"pinboard.in:meta", post.Meta,
		"pinboard.in:shared", post.Shared,
		"pinboard.in:toread", post.ToRead,
	}
	if err = postNode.SetAttrs(attrs...); err != nil {
		return err
	}
	if err = postNode.SetAttrValues("tag", strings.Split(post.Tags, " ")); err != nil {
		return err
	}

	return nil
}
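
SetAttrs takes a flat list of alternating attribute names and values, which is why attrs above is a plain []string. A minimal sketch of that convention (attrsToMap is a hypothetical illustration, not part of the importer API):

package main

import "fmt"

// attrsToMap consumes an alternating key/value list, the same shape
// SetAttrs accepts. Hypothetical helper for illustration only.
func attrsToMap(keyval ...string) (map[string]string, error) {
	if len(keyval)%2 != 0 {
		return nil, fmt.Errorf("got %d strings, want complete key/value pairs", len(keyval))
	}
	m := make(map[string]string, len(keyval)/2)
	for i := 0; i < len(keyval); i += 2 {
		m[keyval[i]] = keyval[i+1]
	}
	return m, nil
}

func main() {
	attrs, err := attrsToMap("pinboard.in:hash", "abc123", "title", "Some bookmark")
	if err != nil {
		panic(err)
	}
	fmt.Println(attrs["title"]) // "Some bookmark"
}
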
Example 2
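Foursquare checkin importer: a checkin that already carries a startDate attribute is reported as a duplicate, and the CreatedAt Unix timestamp is converted with time.Unix before being stored as an RFC 3339 start date.
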
func (r *run) importCheckin(parent *importer.Object, checkin *checkinItem, placeRef blob.Ref) (checkinNode *importer.Object, dup bool, err error) {
	checkinNode, err = parent.ChildPathObject(checkin.Id)
	if err != nil {
		return
	}

	title := fmt.Sprintf("Checkin at %s", checkin.Venue.Name)
	dup = checkinNode.Attr(nodeattr.StartDate) != ""
	if err := checkinNode.SetAttrs(
		attrFoursquareId, checkin.Id,
		attrFoursquareVenuePermanode, placeRef.String(),
		nodeattr.Type, "foursquare.com:checkin",
		nodeattr.StartDate, schema.RFC3339FromTime(time.Unix(checkin.CreatedAt, 0)),
		nodeattr.Title, title); err != nil {
		return nil, false, err
	}
	return checkinNode, dup, nil
}
Example 3
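A variant of the same Foursquare importer that uses literal attribute names instead of the nodeattr constants and does not report duplicates; the RFC 3339 conversion of the creation time is identical.
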
func (r *run) importCheckin(parent *importer.Object, checkin *checkinItem, placeRef blob.Ref) (*importer.Object, error) {
	checkinNode, err := parent.ChildPathObject(checkin.Id)
	if err != nil {
		return nil, err
	}

	title := fmt.Sprintf("Checkin at %s", checkin.Venue.Name)

	if err := checkinNode.SetAttrs(
		"foursquareId", checkin.Id,
		"foursquareVenuePermanode", placeRef.String(),
		"camliNodeType", "foursquare.com:checkin",
		"startDate", schema.RFC3339FromTime(time.Unix(checkin.CreatedAt, 0)),
		"title", title); err != nil {
		return nil, err
	}

	return checkinNode, nil
}
Example 4
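Twitter importer: tweet creation times arrive in time.RubyDate format and are re-encoded as RFC 3339 for the startDate attribute.
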
func (r *run) importTweet(parent *importer.Object, tweet *tweetItem) error {
	tweetNode, err := parent.ChildPathObject(tweet.Id)
	if err != nil {
		return err
	}

	title := "Tweet id " + tweet.Id

	createdTime, err := time.Parse(time.RubyDate, tweet.CreatedAt)
	if err != nil {
		return fmt.Errorf("could not parse time %q: %v", tweet.CreatedAt, err)
	}

	// TODO: import photos referenced in tweets
	return tweetNode.SetAttrs(
		"twitterId", tweet.Id,
		"camliNodeType", "twitter.com:tweet",
		"startDate", schema.RFC3339FromTime(createdTime),
		"content", tweet.Text,
		"title", title)
}
Example 5
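Flickr importer: the photo's "date taken" is parsed with an unknown time zone (schema.UnknownLocation), falling back to the upload timestamp when it is malformed. The RFC 3339 DateModified attribute doubles as a cache key: the image bytes are refetched only when Flickr's lastupdate differs, and DateModified is written last so a partial failure is retried on the next run.
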
// TODO(aa):
// * Parallelize: http://golang.org/doc/effective_go.html#concurrency
// * Do more than one "page" worth of results
// * Report progress and errors back through host interface
// * All the rest of the metadata (see photoMeta)
// * Conflicts: For all metadata changes, prefer any non-imported claims
// * Test!
func (r *run) importPhoto(parent *importer.Object, photo *photosSearchItem) error {
	filename := fmt.Sprintf("%s.%s", photo.Id, photo.OriginalFormat)
	photoNode, err := parent.ChildPathObject(filename)
	if err != nil {
		return err
	}

	// https://www.flickr.com/services/api/misc.dates.html
	dateTaken, err := time.ParseInLocation("2006-01-02 15:04:05", photo.DateTaken, schema.UnknownLocation)
	if err != nil {
		// default to the published date otherwise
		log.Printf("Flickr importer: problem with date taken of photo %v, defaulting to published date instead.", photo.Id)
		seconds, err := strconv.ParseInt(photo.DateUpload, 10, 64)
		if err != nil {
			return fmt.Errorf("could not parse date upload time %q for image %v: %v", photo.DateUpload, photo.Id, err)
		}
		dateTaken = time.Unix(seconds, 0)
	}

	attrs := []string{
		attrFlickrId, photo.Id,
		nodeattr.DateCreated, schema.RFC3339FromTime(dateTaken),
		nodeattr.Description, photo.Description.Content,
	}
	if schema.IsInterestingTitle(photo.Title) {
		attrs = append(attrs, nodeattr.Title, photo.Title)
	}
	// Import all the metadata. SetAttrs() is a no-op if the value hasn't changed, so there's no cost to doing these on every run.
	// And this way if we add more things to import, they will get picked up.
	if err := photoNode.SetAttrs(attrs...); err != nil {
		return err
	}

	// Import the photo itself. Since it is expensive to fetch the image, we store its lastupdate and only refetch if it might have changed.
	// lastupdate is a Unix timestamp according to https://www.flickr.com/services/api/flickr.photos.getInfo.html
	seconds, err := strconv.ParseInt(photo.LastUpdate, 10, 64)
	if err != nil {
		return fmt.Errorf("could not parse lastupdate time for image %v: %v", photo.Id, err)
	}
	lastUpdate := time.Unix(seconds, 0)
	if lastUpdateString := photoNode.Attr(nodeattr.DateModified); lastUpdateString != "" {
		oldLastUpdate, err := time.Parse(time.RFC3339, lastUpdateString)
		if err != nil {
			return fmt.Errorf("could not parse last stored update time for image %v: %v", photo.Id, err)
		}
		if lastUpdate.Equal(oldLastUpdate) {
			if err := r.updatePrimaryPhoto(photoNode); err != nil {
				return err
			}
			return nil
		}
	}
	form := url.Values{}
	form.Set("user_id", r.userID)
	res, err := r.fetch(photo.URL, form)
	if err != nil {
		log.Printf("Flickr importer: Could not fetch %s: %s", photo.URL, err)
		return err
	}
	defer res.Body.Close()

	fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
	if err != nil {
		return err
	}
	if err := photoNode.SetAttr(nodeattr.CamliContent, fileRef.String()); err != nil {
		return err
	}
	if err := r.updatePrimaryPhoto(photoNode); err != nil {
		return err
	}
	// Write lastupdate last, so that if any of the preceding fails, we will try again next time.
	if err := photoNode.SetAttr(nodeattr.DateModified, schema.RFC3339FromTime(lastUpdate)); err != nil {
		return err
	}

	return nil
}
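
The lastupdate handling above is a general refetch-avoidance pattern: persist a remote timestamp as RFC 3339 and download again only when it changes. A minimal sketch of the same idea detached from Flickr (needsRefetch is a hypothetical name, not part of the importer):

package main

import (
	"fmt"
	"time"
)

// needsRefetch reports whether remote content should be downloaded again,
// comparing the RFC 3339 timestamp stored on the node against the
// remote's current last-update time.
func needsRefetch(stored string, remoteLastUpdate time.Time) (bool, error) {
	if stored == "" {
		return true, nil // never fetched before
	}
	old, err := time.Parse(time.RFC3339, stored)
	if err != nil {
		return false, fmt.Errorf("could not parse stored update time %q: %v", stored, err)
	}
	return !remoteLastUpdate.Equal(old), nil
}

func main() {
	last := time.Unix(1400000000, 0)
	refetch, _ := needsRefetch(last.UTC().Format(time.RFC3339), last)
	fmt.Println(refetch) // false: timestamps match, skip the download
}
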
Example 6
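Picasa photo importer: media bytes are downloaded only when the photo's URL changes, metadata (including DateModified and DatePublished rendered with schema.RFC3339FromTime) is set on every run, and the media-URL attribute is written last for the same retry-on-failure reason.
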
func (r *run) updatePhotoInAlbum(ctx context.Context, albumNode *importer.Object, photo picago.Photo) (ret error) {
	if photo.ID == "" {
		return errors.New("photo has no ID")
	}

	getMediaBytes := func() (io.ReadCloser, error) {
		log.Printf("Importing media from %v", photo.URL)
		resp, err := ctxutil.Client(ctx).Get(photo.URL)
		if err != nil {
			return nil, fmt.Errorf("importing photo %s: %v", photo.ID, err)
		}
		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			return nil, fmt.Errorf("importing photo %s: status code = %d", photo.ID, resp.StatusCode)
		}
		return resp.Body, nil
	}

	var fileRefStr string
	idFilename := photo.ID + "-" + photo.Filename
	photoNode, err := albumNode.ChildPathObjectOrFunc(idFilename, func() (*importer.Object, error) {
		h := blob.NewHash()
		rc, err := getMediaBytes()
		if err != nil {
			return nil, err
		}
		fileRef, err := schema.WriteFileFromReader(r.Host.Target(), photo.Filename, io.TeeReader(rc, h))
		if err != nil {
			return nil, err
		}
		fileRefStr = fileRef.String()
		wholeRef := blob.RefFromHash(h)
		if pn, err := findExistingPermanode(r.Host.Searcher(), wholeRef); err == nil {
			return r.Host.ObjectFromRef(pn)
		}
		return r.Host.NewObject()
	})
	if err != nil {
		return err
	}

	const attrMediaURL = "picasaMediaURL"
	if fileRefStr == "" {
		fileRefStr = photoNode.Attr(nodeattr.CamliContent)
		// Only re-download the source photo if its URL has changed.
		// Empirically this seems to work: cropping a photo in the
		// photos.google.com UI causes its URL to change. And it makes
		// sense, looking at the ugliness of the URLs with all their
		// encoded/signed state.
		if !mediaURLsEqual(photoNode.Attr(attrMediaURL), photo.URL) {
			rc, err := getMediaBytes()
			if err != nil {
				return err
			}
			fileRef, err := schema.WriteFileFromReader(r.Host.Target(), photo.Filename, rc)
			rc.Close()
			if err != nil {
				return err
			}
			fileRefStr = fileRef.String()
		}
	}

	title := strings.TrimSpace(photo.Description)
	if strings.Contains(title, "\n") {
		title = title[:strings.Index(title, "\n")]
	}
	if title == "" && schema.IsInterestingTitle(photo.Filename) {
		title = photo.Filename
	}

	// TODO(tgulacsi): add more attrs (comments ?)
	// for names, see http://schema.org/ImageObject and http://schema.org/CreativeWork
	attrs := []string{
		nodeattr.CamliContent, fileRefStr,
		attrPicasaId, photo.ID,
		nodeattr.Title, title,
		nodeattr.Description, photo.Description,
		nodeattr.LocationText, photo.Location,
		nodeattr.DateModified, schema.RFC3339FromTime(photo.Updated),
		nodeattr.DatePublished, schema.RFC3339FromTime(photo.Published),
		nodeattr.URL, photo.PageURL,
	}
	if photo.Latitude != 0 || photo.Longitude != 0 {
		attrs = append(attrs,
			nodeattr.Latitude, fmt.Sprintf("%f", photo.Latitude),
			nodeattr.Longitude, fmt.Sprintf("%f", photo.Longitude),
		)
	}
	if err := photoNode.SetAttrs(attrs...); err != nil {
		return err
	}
	if err := photoNode.SetAttrValues("tag", photo.Keywords); err != nil {
		return err
	}
	if photo.Position > 0 {
		if err := albumNode.SetAttr(
			nodeattr.CamliPathOrderColon+strconv.Itoa(photo.Position-1),
			photoNode.PermanodeRef().String()); err != nil {
			return err
		}
	}

	// Do this last, after we're sure the "camliContent" attribute
	// has been saved successfully, because this is the one that
	// causes us to do it again in the future or not.
	if err := photoNode.SetAttrs(attrMediaURL, photo.URL); err != nil {
		return err
	}
	return nil
}
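
mediaURLsEqual is an unexported helper not shown in this excerpt. A plausible reconstruction (an assumption, not the verified source) compares only what follows the common host suffix, since Google serves the same media from rotating lhN hosts:

package main

import (
	"fmt"
	"strings"
)

// mediaURLsEqual: hypothetical reconstruction. Ignore the rotating
// lhN.googleusercontent.com host prefix and compare the path and
// signed state that follow the common suffix.
func mediaURLsEqual(a, b string) bool {
	const sub = ".googleusercontent.com/"
	ai := strings.Index(a, sub)
	bi := strings.Index(b, sub)
	if ai >= 0 && bi >= 0 {
		return a[ai:] == b[bi:]
	}
	return a == b
}

func main() {
	fmt.Println(mediaURLsEqual(
		"https://lh3.googleusercontent.com/x/y/photo.jpg",
		"https://lh5.googleusercontent.com/x/y/photo.jpg")) // true under this reconstruction
}
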
Example 7
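Picasa album importer: SetAttrs2 reports whether anything changed, letting incremental runs skip albums whose stored DateModified already matches. DateModified itself is only updated once every photo has imported successfully, and the photo imports run concurrently behind a gate.
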
func (r *run) importAlbum(ctx context.Context, albumsNode *importer.Object, album picago.Album) (ret error) {
	if album.ID == "" {
		return errors.New("album has no ID")
	}
	albumNode, err := albumsNode.ChildPathObject(album.ID)
	if err != nil {
		return fmt.Errorf("importAlbum: error listing album: %v", err)
	}

	dateMod := schema.RFC3339FromTime(album.Updated)

	// Data reference: https://developers.google.com/picasa-web/docs/2.0/reference
	// TODO(tgulacsi): add more album info
	changes, err := albumNode.SetAttrs2(
		attrPicasaId, album.ID,
		nodeattr.Type, "picasaweb.google.com:album",
		nodeattr.Title, album.Title,
		nodeattr.DatePublished, schema.RFC3339FromTime(album.Published),
		nodeattr.LocationText, album.Location,
		nodeattr.Description, album.Description,
		nodeattr.URL, album.URL,
	)
	if err != nil {
		return fmt.Errorf("error setting album attributes: %v", err)
	}
	if !changes && r.incremental && albumNode.Attr(nodeattr.DateModified) == dateMod {
		return nil
	}
	defer func() {
		// Don't update DateModified on the album node until
		// we've successfully imported all the photos.
		if ret == nil {
			ret = albumNode.SetAttr(nodeattr.DateModified, dateMod)
		}
	}()

	log.Printf("Importing album %v: %v/%v (published %v, updated %v)", album.ID, album.Name, album.Title, album.Published, album.Updated)

	// TODO(bradfitz): GetPhotos does multiple HTTP requests to
	// return a slice of all photos. My "InstantUpload/Auto
	// Backup" album has 6678 photos (and growing) and this
	// currently takes like 40 seconds. Fix.
	photos, err := picago.GetPhotos(ctxutil.Client(ctx), "default", album.ID)
	if err != nil {
		return err
	}

	log.Printf("Importing %d photos from album %q (%s)", len(photos), albumNode.Attr(nodeattr.Title),
		albumNode.PermanodeRef())

	var grp syncutil.Group
	for i := range photos {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		photo := photos[i]
		r.photoGate.Start()
		grp.Go(func() error {
			defer r.photoGate.Done()
			return r.updatePhotoInAlbum(ctx, albumNode, photo)
		})
	}
	return grp.Err()
}
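
The gate-plus-group pattern at the end bounds how many photo imports run at once while still surfacing the first error. A minimal sketch, assuming the go4.org/syncutil home of this syncutil package (older Camlistore trees vendored it as camlistore.org/pkg/syncutil); the worker body is a stand-in:

package main

import (
	"fmt"

	"go4.org/syncutil"
)

func main() {
	gate := syncutil.NewGate(3) // at most 3 workers in flight
	var grp syncutil.Group
	for i := 0; i < 10; i++ {
		i := i // capture the loop variable, as photo := photos[i] does above
		gate.Start()
		grp.Go(func() error {
			defer gate.Done()
			fmt.Println("importing photo", i)
			return nil // a real worker would return its import error
		})
	}
	// Err waits for all goroutines and returns the first non-nil error.
	if err := grp.Err(); err != nil {
		fmt.Println("import failed:", err)
	}
}
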
Example 8
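A test helper: advances a fake clock by one second per call and returns it as an RFC 3339 string, so successive test claims get distinct, ordered timestamps.
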
func (id *IndexDeps) advanceTime() string {
	id.now = id.now.Add(1 * time.Second)
	return schema.RFC3339FromTime(id.now)
}
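
Every example here funnels times through schema.RFC3339FromTime so attribute values compare and sort as plain strings. A minimal round-trip sketch, assuming the camlistore.org/pkg/schema import path of this era (later perkeep.org/pkg/schema); the package's exact normalization and sub-second handling are not shown in these excerpts:

package main

import (
	"fmt"
	"time"

	"camlistore.org/pkg/schema"
)

func main() {
	// A time carrying a non-UTC zone, like the importers receive.
	t := time.Date(2014, 6, 12, 19, 11, 51, 0, time.FixedZone("CEST", 2*60*60))

	s := schema.RFC3339FromTime(t)
	fmt.Println(s)

	// Whatever normalization the schema package applies, the string is
	// RFC 3339 and parses back to the same instant with the stdlib.
	back, err := time.Parse(time.RFC3339, s)
	if err != nil {
		panic(err)
	}
	fmt.Println(back.Equal(t)) // true
}
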
Example 9
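A fuller Twitter importer that accepts tweets from both the REST API and a zip export, never letting zip data overwrite API-imported data. Creation times are tried against the API's RubyDate layout and the zip's layout before being stored as RFC 3339, and attached media is fetched once per filename.
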
// viaAPI is true if it came via the REST API, or false if it came via a zip file.
func (r *run) importTweet(parent *importer.Object, tweet tweetItem, viaAPI bool) (dup bool, err error) {
	select {
	case <-r.Context().Done():
		r.errorf("Twitter importer: interrupted")
		return false, r.Context().Err()
	default:
	}
	id := tweet.ID()
	tweetNode, err := parent.ChildPathObject(id)
	if err != nil {
		return false, err
	}

	// Because the zip format and the API format differ a bit, and
	// might diverge more in the future, never use the zip content
	// to overwrite data fetched via the API. If we add new
	// support for different fields in the future, we might want
	// to revisit this decision.  Be wary of flip/flopping data if
	// modifying this, though.
	if tweetNode.Attr(attrImportMethod) == "api" && !viaAPI {
		return true, nil
	}

	// e.g. "2014-06-12 19:11:51 +0000"
	createdTime, err := timeParseFirstFormat(tweet.CreatedAt(), time.RubyDate, "2006-01-02 15:04:05 -0700")
	if err != nil {
		return false, fmt.Errorf("could not parse time %q: %v", tweet.CreatedAt(), err)
	}

	url := fmt.Sprintf("https://twitter.com/%s/status/%v",
		r.AccountNode().Attr(importer.AcctAttrUserName),
		id)

	attrs := []string{
		"twitterId", id,
		nodeattr.Type, "twitter.com:tweet",
		nodeattr.StartDate, schema.RFC3339FromTime(createdTime),
		nodeattr.Content, tweet.Text(),
		nodeattr.URL, url,
	}
	if lat, long, ok := tweet.LatLong(); ok {
		attrs = append(attrs,
			nodeattr.Latitude, fmt.Sprint(lat),
			nodeattr.Longitude, fmt.Sprint(long),
		)
	}
	if viaAPI {
		attrs = append(attrs, attrImportMethod, "api")
	} else {
		attrs = append(attrs, attrImportMethod, "zip")
	}

	for i, m := range tweet.Media() {
		filename := m.BaseFilename()
		if tweetNode.Attr("camliPath:"+filename) != "" && (i > 0 || tweetNode.Attr("camliContentImage") != "") {
			// Don't re-import media we've already fetched.
			continue
		}
		tried, gotMedia := 0, false
		for _, mediaURL := range m.URLs() {
			tried++
			res, err := ctxutil.Client(r.Context()).Get(mediaURL)
			if err != nil {
				return false, fmt.Errorf("Error fetching %s for tweet %s : %v", mediaURL, url, err)
			}
			if res.StatusCode == http.StatusNotFound {
				continue
			}
			if res.StatusCode != http.StatusOK {
				return false, fmt.Errorf("HTTP status %d fetching %s for tweet %s", res.StatusCode, mediaURL, url)
			}
			if !viaAPI {
				log.Printf("For zip tweet %s, reading %v", url, mediaURL)
			}
			fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
			res.Body.Close()
			if err != nil {
				return false, fmt.Errorf("Error fetching media %s for tweet %s: %v", mediaURL, url, err)
			}
			attrs = append(attrs, "camliPath:"+filename, fileRef.String())
			if i == 0 {
				attrs = append(attrs, "camliContentImage", fileRef.String())
			}
			log.Printf("Slurped %s as %s for tweet %s (%v)", mediaURL, fileRef.String(), url, tweetNode.PermanodeRef())
			gotMedia = true
			break
		}
		if !gotMedia && tried > 0 {
			return false, fmt.Errorf("All media URLs 404s for tweet %s", url)
		}
	}

	changes, err := tweetNode.SetAttrs2(attrs...)
	if err == nil && changes {
		log.Printf("Imported tweet %s", url)
	}
	return !changes, err
}
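
timeParseFirstFormat is an unexported helper not shown in this excerpt. A plausible reconstruction (an assumption, not the verified source) tries each layout in order and keeps the first one that parses:

package main

import (
	"errors"
	"fmt"
	"time"
)

// timeParseFirstFormat: hypothetical reconstruction. Returns the first
// successful parse of value against the given layouts, or the last error.
func timeParseFirstFormat(value string, formats ...string) (t time.Time, err error) {
	if len(formats) == 0 {
		return time.Time{}, errors.New("no formats provided")
	}
	for _, format := range formats {
		t, err = time.Parse(format, value)
		if err == nil {
			break
		}
	}
	return
}

func main() {
	// The zip-export layout from the call above; RubyDate fails first.
	t, err := timeParseFirstFormat("2014-06-12 19:11:51 +0000",
		time.RubyDate, "2006-01-02 15:04:05 -0700")
	if err != nil {
		panic(err)
	}
	fmt.Println(t.UTC()) // 2014-06-12 19:11:51 +0000 UTC
}
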
Example 10
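camput's regular-file uploader: builds a file schema map, optionally taking the modification time from EXIF data, short-circuits large duplicates via a whole-file digest, and, when a file permanode is requested, signs the camliContent and tag claims using the file's modification time as the claim date.
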
func (up *Uploader) uploadNodeRegularFile(n *node) (*client.PutResult, error) {
	m := schema.NewCommonFileMap(n.fullPath, n.fi)
	m["camliType"] = "file"
	file, err := up.open(n.fullPath)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	if up.fileOpts.exifTime {
		ra, ok := file.(io.ReaderAt)
		if !ok {
			return nil, errors.New("Error asserting local file to io.ReaderAt")
		}
		modtime, err := schema.FileTime(ra)
		if err != nil {
			log.Printf("warning: getting time from EXIF failed for %v: %v", n.fullPath, err)
		} else {
			m["unixMtime"] = schema.RFC3339FromTime(modtime)
		}
	}

	size := n.fi.Size()
	var fileContents io.Reader = io.LimitReader(file, size)

	if up.fileOpts.wantVivify() {
		err := schema.WriteFileChunks(up.statReceiver(), m, fileContents)
		if err != nil {
			return nil, err
		}
		json, err := m.JSON()
		if err != nil {
			return nil, err
		}
		bref := blobref.SHA1FromString(json)
		h := &client.UploadHandle{
			BlobRef:  bref,
			Size:     int64(len(json)),
			Contents: strings.NewReader(json),
			Vivify:   true,
		}
		return up.Upload(h)
	}

	var (
		blobref *blobref.BlobRef // of file schemaref
		sum     string           // "sha1-xxxxx"
	)

	const dupCheckThreshold = 256 << 10
	if size > dupCheckThreshold {
		sumRef, err := up.wholeFileDigest(n.fullPath)
		if err == nil {
			sum = sumRef.String()
			if ref, ok := up.fileMapFromDuplicate(up.statReceiver(), m, sum); ok {
				blobref = ref
			}
		}
	}

	if blobref == nil {
		if sum == "" && up.fileOpts.wantFilePermanode() {
			fileContents = &trackDigestReader{r: fileContents}
		}
		blobref, err = schema.WriteFileMap(up.statReceiver(), m, fileContents)
		if err != nil {
			return nil, err
		}
	}

	// TODO(mpl): test that none of these claims get uploaded if they've already been done
	if up.fileOpts.wantFilePermanode() {
		if td, ok := fileContents.(*trackDigestReader); ok {
			sum = td.Sum()
		}
		// Use a fixed time value for signing; not using modtime
		// so two identical files don't have different modtimes?
		// TODO(bradfitz): consider this more?
		permaNodeSigTime := time.Unix(0, 0)
		permaNode, err := up.UploadPlannedPermanode(sum, permaNodeSigTime)
		if err != nil {
			return nil, fmt.Errorf("Error uploading permanode for node %v: %v", n, err)
		}
		handleResult("node-permanode", permaNode, nil)

		// claimTime is both the time of the "claimDate" in the
		// JSON claim, as well as the date in the OpenPGP
		// header.
		// TODO(bradfitz): this is a little clumsy to do by hand.
		// There should probably be a method on *Uploader to do this
		// from an unsigned schema map. Maybe ditch the schema.Claimer
		// type and just have the Uploader override the claimDate.
		claimTime, err := time.Parse(time.RFC3339, m["unixMtime"].(string))
		if err != nil {
			return nil, fmt.Errorf("While parsing modtime for file %v: %v", n.fullPath, err)
		}
		contentAttr := schema.NewSetAttributeClaim(permaNode.BlobRef, "camliContent", blobref.String())
		contentAttr.SetClaimDate(claimTime)
		signed, err := up.SignMap(contentAttr, claimTime)
		if err != nil {
			return nil, fmt.Errorf("Failed to sign content claim for node %v: %v", n, err)
		}
		put, err := up.uploadString(signed)
		if err != nil {
			return nil, fmt.Errorf("Error uploading permanode's attribute for node %v: %v", n, err)
		}
		handleResult("node-permanode-contentattr", put, nil)
		if tags := up.fileOpts.tags(); len(tags) > 0 {
			// TODO(mpl): do these claims concurrently, not in series
			for _, tag := range tags {
				m := schema.NewAddAttributeClaim(permaNode.BlobRef, "tag", tag)
				m.SetClaimDate(claimTime)
				// TODO(mpl): verify that SetClaimDate does modify the GPG signature date of the claim
				signed, err := up.SignMap(m, claimTime)
				if err != nil {
					return nil, fmt.Errorf("Failed to sign tag claim for node %v: %v", n, err)
				}
				put, err := up.uploadString(signed)
				if err != nil {
					return nil, fmt.Errorf("Error uploading permanode's tag attribute %v for node %v: %v", tag, n, err)
				}
				handleResult("node-permanode-tag", put, nil)
			}
		}
	}

	// TODO(bradfitz): faking a PutResult here to return
	// is kinda gross.  should instead make a
	// blobserver.Storage wrapper type (wrapping
	// statReceiver) that can track some of this?  or make
	// schemaWriteFileMap return it?
	json, _ := m.JSON()
	pr := &client.PutResult{BlobRef: blobref, Size: int64(len(json)), Skipped: false}
	return pr, nil
}
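
trackDigestReader (not shown) hashes the file while its bytes stream to the blob server, so the permanode's digest comes for free without a second read. A minimal sketch of the same idea with io.TeeReader and SHA-1, matching the "sha1-xxxxx" sum format noted above (digestWhileCopying is a hypothetical stand-in, not the uploader's type):

package main

import (
	"crypto/sha1"
	"fmt"
	"io"
	"os"
	"strings"
)

// digestWhileCopying streams src to dst while hashing it, returning the
// byte count and a "sha1-..." sum like the one the uploader tracks.
func digestWhileCopying(dst io.Writer, src io.Reader) (int64, string, error) {
	h := sha1.New()
	n, err := io.Copy(dst, io.TeeReader(src, h))
	return n, fmt.Sprintf("sha1-%x", h.Sum(nil)), err
}

func main() {
	n, sum, err := digestWhileCopying(os.Stdout, strings.NewReader("hello\n"))
	if err != nil {
		panic(err)
	}
	fmt.Println(n, sum)
}
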