Beispiel #1
0
// b: the parsed file schema blob
// mm: keys to populate
func (ix *Index) populateFile(fetcher blob.Fetcher, b *schema.Blob, mm *mutationMap) (err error) {
	var times []time.Time // all creation or mod times seen; may be zero
	times = append(times, b.ModTime())

	blobRef := b.BlobRef()
	fr, err := b.NewFileReader(fetcher)
	if err != nil {
		return err
	}
	defer fr.Close()
	mime, reader := magic.MIMETypeFromReader(fr)

	sha1 := sha1.New()
	var copyDest io.Writer = sha1
	var imageBuf *keepFirstN // or nil
	if strings.HasPrefix(mime, "image/") {
		imageBuf = &keepFirstN{N: 512 << 10}
		copyDest = io.MultiWriter(copyDest, imageBuf)
	}
	size, err := io.Copy(copyDest, reader)
	if err != nil {
		return err
	}
	wholeRef := blob.RefFromHash(sha1)

	if imageBuf != nil {
		conf, err := images.DecodeConfig(bytes.NewReader(imageBuf.Bytes))
		// If our optimistic 512KB in-memory prefix from above was too short to get the dimensions, pass the whole thing instead and try again.
		if err == io.ErrUnexpectedEOF {
			var fr *schema.FileReader
			fr, err = b.NewFileReader(fetcher)
			if err == nil {
				conf, err = images.DecodeConfig(fr)
				fr.Close()
			}
		}
		if err == nil {
			mm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height)))
		}
		if ft, err := schema.FileTime(bytes.NewReader(imageBuf.Bytes)); err == nil {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
			times = append(times, ft)
		} else {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
		}

		// TODO(mpl): find (generate?) more broken EXIF images to experiment with.
		err = indexEXIF(wholeRef, bytes.NewReader(imageBuf.Bytes), mm)
		if err == io.EOF {
			var fr *schema.FileReader
			fr, err = b.NewFileReader(fetcher)
			if err == nil {
				err = indexEXIF(wholeRef, fr, mm)
				fr.Close()
			}
		}
		if err != nil {
			log.Printf("error parsing EXIF: %v", err)
		}
	}

	var sortTimes []time.Time
	for _, t := range times {
		if !t.IsZero() {
			sortTimes = append(sortTimes, t)
		}
	}
	sort.Sort(types.ByTime(sortTimes))
	var time3339s string
	switch {
	case len(sortTimes) == 1:
		time3339s = types.Time3339(sortTimes[0]).String()
	case len(sortTimes) >= 2:
		oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1]
		time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String()
	}

	mm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1")
	mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime, wholeRef))
	mm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s))

	if strings.HasPrefix(mime, "audio/") {
		indexMusic(io.NewSectionReader(fr, 0, fr.Size()), wholeRef, mm)
	}

	return nil
}
Beispiel #2
0
// b: the parsed file schema blob
// mm: keys to populate
func (ix *Index) populateFile(fetcher blob.Fetcher, b *schema.Blob, mm *mutationMap) (err error) {
	var times []time.Time // all creation or mod times seen; may be zero
	times = append(times, b.ModTime())

	blobRef := b.BlobRef()
	fr, err := b.NewFileReader(fetcher)
	if err != nil {
		return err
	}
	defer fr.Close()
	mime, reader := magic.MIMETypeFromReader(fr)

	sha1 := sha1.New()
	var copyDest io.Writer = sha1
	var imageBuf *keepFirstN // or nil
	if strings.HasPrefix(mime, "image/") {
		// Emperically derived 1MiB assuming CR2 images require more than any
		// other filetype we support:
		//   https://gist.github.com/wathiede/7982372
		imageBuf = &keepFirstN{N: 1 << 20}
		copyDest = io.MultiWriter(copyDest, imageBuf)
	}
	size, err := io.Copy(copyDest, reader)
	if err != nil {
		return err
	}
	wholeRef := blob.RefFromHash(sha1)

	if imageBuf != nil {
		if conf, err := images.DecodeConfig(bytes.NewReader(imageBuf.Bytes)); err == nil {
			mm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height)))
		}
		if ft, err := schema.FileTime(bytes.NewReader(imageBuf.Bytes)); err == nil {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
			times = append(times, ft)
		} else {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
		}

		indexEXIF(wholeRef, imageBuf.Bytes, mm)
	}

	var sortTimes []time.Time
	for _, t := range times {
		if !t.IsZero() {
			sortTimes = append(sortTimes, t)
		}
	}
	sort.Sort(types.ByTime(sortTimes))
	var time3339s string
	switch {
	case len(sortTimes) == 1:
		time3339s = types.Time3339(sortTimes[0]).String()
	case len(sortTimes) >= 2:
		oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1]
		time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String()
	}

	mm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1")
	mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime))
	mm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s))

	if strings.HasPrefix(mime, "audio/") {
		indexMusic(io.NewSectionReader(fr, 0, fr.Size()), wholeRef, mm)
	}

	return nil
}
Beispiel #3
0
func (up *Uploader) uploadNodeRegularFile(n *node) (*client.PutResult, error) {
	var filebb *schema.Builder
	if up.fileOpts.contentsOnly {
		filebb = schema.NewFileMap("")
	} else {
		filebb = schema.NewCommonFileMap(n.fullPath, n.fi)
	}
	filebb.SetType("file")

	up.fdGate.Start()
	defer up.fdGate.Done()

	file, err := up.open(n.fullPath)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	if !up.fileOpts.contentsOnly {
		if up.fileOpts.exifTime {
			ra, ok := file.(io.ReaderAt)
			if !ok {
				return nil, errors.New("Error asserting local file to io.ReaderAt")
			}
			modtime, err := schema.FileTime(ra)
			if err != nil {
				log.Printf("warning: getting time from EXIF failed for %v: %v", n.fullPath, err)
			} else {
				filebb.SetModTime(modtime)
			}
		}
		if up.fileOpts.wantCapCtime() {
			filebb.CapCreationTime()
		}
	}

	var (
		size                           = n.fi.Size()
		fileContents io.Reader         = io.LimitReader(file, size)
		br           blob.Ref          // of file schemaref
		sum          string            // sha1 hashsum of the file to upload
		pr           *client.PutResult // of the final "file" schema blob
	)

	const dupCheckThreshold = 256 << 10
	if size > dupCheckThreshold {
		sumRef, err := up.wholeFileDigest(n.fullPath)
		if err == nil {
			sum = sumRef.String()
			ok := false
			pr, ok = up.fileMapFromDuplicate(up.statReceiver(n), filebb, sum)
			if ok {
				br = pr.BlobRef
				android.NoteFileUploaded(n.fullPath, !pr.Skipped)
				if up.fileOpts.wantVivify() {
					// we can return early in that case, because the other options
					// are disallowed in the vivify case.
					return pr, nil
				}
			}
		}
	}

	if up.fileOpts.wantVivify() {
		// If vivify wasn't already done in fileMapFromDuplicate.
		err := schema.WriteFileChunks(up.noStatReceiver(up.statReceiver(n)), filebb, fileContents)
		if err != nil {
			return nil, err
		}
		json, err := filebb.JSON()
		if err != nil {
			return nil, err
		}
		br = blob.SHA1FromString(json)
		h := &client.UploadHandle{
			BlobRef:  br,
			Size:     uint32(len(json)),
			Contents: strings.NewReader(json),
			Vivify:   true,
		}
		pr, err = up.Upload(h)
		if err != nil {
			return nil, err
		}
		android.NoteFileUploaded(n.fullPath, true)
		return pr, nil
	}

	if !br.Valid() {
		// br still zero means fileMapFromDuplicate did not find the file on the server,
		// and the file has not just been uploaded subsequently to a vivify request.
		// So we do the full file + file schema upload here.
		if sum == "" && up.fileOpts.wantFilePermanode() {
			fileContents = &trackDigestReader{r: fileContents}
		}
		br, err = schema.WriteFileMap(up.noStatReceiver(up.statReceiver(n)), filebb, fileContents)
		if err != nil {
			return nil, err
		}
	}

	// The work for those planned permanodes (and the claims) is redone
	// everytime we get here (i.e past the stat cache). However, they're
	// caught by the have cache, so they won't be reuploaded for nothing
	// at least.
	if up.fileOpts.wantFilePermanode() {
		if td, ok := fileContents.(*trackDigestReader); ok {
			sum = td.Sum()
		}
		// claimTime is both the time of the "claimDate" in the
		// JSON claim, as well as the date in the OpenPGP
		// header.
		// TODO(bradfitz): this is a little clumsy to do by hand.
		// There should probably be a method on *Uploader to do this
		// from an unsigned schema map. Maybe ditch the schema.Claimer
		// type and just have the Uploader override the claimDate.
		claimTime, ok := filebb.ModTime()
		if !ok {
			return nil, fmt.Errorf("couldn't get modtime for file %v", n.fullPath)
		}
		err = up.uploadFilePermanode(sum, br, claimTime)
		if err != nil {
			return nil, fmt.Errorf("Error uploading permanode for node %v: %v", n, err)
		}
	}

	// TODO(bradfitz): faking a PutResult here to return
	// is kinda gross.  should instead make a
	// blobserver.Storage wrapper type (wrapping
	// statReceiver) that can track some of this?  or make
	// schemaWriteFileMap return it?
	json, _ := filebb.JSON()
	pr = &client.PutResult{BlobRef: br, Size: uint32(len(json)), Skipped: false}
	return pr, nil
}
Beispiel #4
0
// blobref: of the file or schema blob
//      blob: the parsed file schema blob
//      bm: keys to populate
func (ix *Index) populateFile(b *schema.Blob, bm BatchMutation) error {
	var times []time.Time // all creation or mod times seen; may be zero
	times = append(times, b.ModTime())

	blobRef := b.BlobRef()
	seekFetcher := blob.SeekerFromStreamingFetcher(ix.BlobSource)
	fr, err := b.NewFileReader(seekFetcher)
	if err != nil {
		// TODO(bradfitz): propagate up a transient failure
		// error type, so we can retry indexing files in the
		// future if blobs are only temporarily unavailable.
		// Basically the same as the TODO just below.
		log.Printf("index: error indexing file, creating NewFileReader %s: %v", blobRef, err)
		return nil
	}
	defer fr.Close()
	mime, reader := magic.MIMETypeFromReader(fr)

	sha1 := sha1.New()
	var copyDest io.Writer = sha1
	var imageBuf *keepFirstN // or nil
	if strings.HasPrefix(mime, "image/") {
		imageBuf = &keepFirstN{N: 256 << 10}
		copyDest = io.MultiWriter(copyDest, imageBuf)
	}
	size, err := io.Copy(copyDest, reader)
	if err != nil {
		// TODO: job scheduling system to retry this spaced
		// out max n times.  Right now our options are
		// ignoring this error (forever) or returning the
		// error and making the indexing try again (likely
		// forever failing).  Both options suck.  For now just
		// log and act like all's okay.
		log.Printf("index: error indexing file %s: %v", blobRef, err)
		return nil
	}

	if imageBuf != nil {
		if conf, err := images.DecodeConfig(bytes.NewReader(imageBuf.Bytes)); err == nil {
			bm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height)))
		}
		if ft, err := schema.FileTime(bytes.NewReader(imageBuf.Bytes)); err == nil {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
			times = append(times, ft)
		} else {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
		}
	}

	var sortTimes []time.Time
	for _, t := range times {
		if !t.IsZero() {
			sortTimes = append(sortTimes, t)
		}
	}
	sort.Sort(types.ByTime(sortTimes))
	var time3339s string
	switch {
	case len(sortTimes) == 1:
		time3339s = types.Time3339(sortTimes[0]).String()
	case len(sortTimes) >= 2:
		oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1]
		time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String()
	}

	wholeRef := blob.RefFromHash(sha1)
	bm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1")
	bm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime))
	bm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s))

	if strings.HasPrefix(mime, "audio/") {
		tag, err := taglib.Decode(fr, fr.Size())
		if err == nil {
			indexMusic(tag, wholeRef, bm)
		} else {
			log.Print("index: error parsing tag: ", err)
		}
	}

	return nil
}
Beispiel #5
0
func (up *Uploader) uploadNodeRegularFile(n *node) (*client.PutResult, error) {
	// TODO(mpl): maybe break this func into more maintainable pieces?
	filebb := schema.NewCommonFileMap(n.fullPath, n.fi)
	filebb.SetType("file")
	file, err := up.open(n.fullPath)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	if up.fileOpts.exifTime {
		ra, ok := file.(io.ReaderAt)
		if !ok {
			return nil, errors.New("Error asserting local file to io.ReaderAt")
		}
		modtime, err := schema.FileTime(ra)
		if err != nil {
			log.Printf("warning: getting time from EXIF failed for %v: %v", n.fullPath, err)
		} else {
			filebb.SetModTime(modtime)
		}
	}

	var (
		size                           = n.fi.Size()
		fileContents io.Reader         = io.LimitReader(file, size)
		br           *blobref.BlobRef  // of file schemaref
		sum          string            // sha1 hashsum of the file to upload
		pr           *client.PutResult // of the final "file" schema blob
	)

	const dupCheckThreshold = 256 << 10
	if size > dupCheckThreshold {
		sumRef, err := up.wholeFileDigest(n.fullPath)
		if err == nil {
			sum = sumRef.String()
			ok := false
			pr, ok = up.fileMapFromDuplicate(up.statReceiver(n), filebb, sum)
			if ok {
				br = pr.BlobRef
				noteFileUploaded(n.fullPath, !pr.Skipped)
				if up.fileOpts.wantVivify() {
					// we can return early in that case, because the other options
					// are disallowed in the vivify case.
					return pr, nil
				}
			}
		}
	}

	if up.fileOpts.wantVivify() {
		// If vivify wasn't already done in fileMapFromDuplicate.
		err := schema.WriteFileChunks(up.statReceiver(n), filebb, fileContents)
		if err != nil {
			return nil, err
		}
		json, err := filebb.JSON()
		if err != nil {
			return nil, err
		}
		br = blobref.SHA1FromString(json)
		h := &client.UploadHandle{
			BlobRef:  br,
			Size:     int64(len(json)),
			Contents: strings.NewReader(json),
			Vivify:   true,
		}
		pr, err = up.Upload(h)
		if err != nil {
			return nil, err
		}
		noteFileUploaded(n.fullPath, true)
		return pr, nil
	}

	if br == nil {
		// br still nil means fileMapFromDuplicate did not find the file on the server,
		// and the file has not just been uploaded subsequently to a vivify request.
		// So we do the full file + file schema upload here.
		if sum == "" && up.fileOpts.wantFilePermanode() {
			fileContents = &trackDigestReader{r: fileContents}
		}
		br, err = schema.WriteFileMap(up.statReceiver(n), filebb, fileContents)
		if err != nil {
			return nil, err
		}
	}

	// TODO(mpl): test that none of these claims get uploaded if they've already been done
	if up.fileOpts.wantFilePermanode() {
		if td, ok := fileContents.(*trackDigestReader); ok {
			sum = td.Sum()
		}
		// Use a fixed time value for signing; not using modtime
		// so two identical files don't have different modtimes?
		// TODO(bradfitz): consider this more?
		permaNodeSigTime := time.Unix(0, 0)
		permaNode, err := up.UploadPlannedPermanode(sum, permaNodeSigTime)
		if err != nil {
			return nil, fmt.Errorf("Error uploading permanode for node %v: %v", n, err)
		}
		handleResult("node-permanode", permaNode, nil)

		// claimTime is both the time of the "claimDate" in the
		// JSON claim, as well as the date in the OpenPGP
		// header.
		// TODO(bradfitz): this is a little clumsy to do by hand.
		// There should probably be a method on *Uploader to do this
		// from an unsigned schema map. Maybe ditch the schema.Claimer
		// type and just have the Uploader override the claimDate.
		claimTime, ok := filebb.ModTime()
		if !ok {
			return nil, fmt.Errorf("couldn't get modtime back for file %v", n.fullPath)
		}
		contentAttr := schema.NewSetAttributeClaim(permaNode.BlobRef, "camliContent", br.String())
		contentAttr.SetClaimDate(claimTime)
		signed, err := up.SignBlob(contentAttr, claimTime)
		if err != nil {
			return nil, fmt.Errorf("Failed to sign content claim for node %v: %v", n, err)
		}
		put, err := up.uploadString(signed)
		if err != nil {
			return nil, fmt.Errorf("Error uploading permanode's attribute for node %v: %v", n, err)
		}
		handleResult("node-permanode-contentattr", put, nil)
		if tags := up.fileOpts.tags(); len(tags) > 0 {
			errch := make(chan error)
			for _, tag := range tags {
				go func(tag string) {
					m := schema.NewAddAttributeClaim(permaNode.BlobRef, "tag", tag)
					m.SetClaimDate(claimTime)
					signed, err := up.SignBlob(m, claimTime)
					if err != nil {
						errch <- fmt.Errorf("Failed to sign tag claim for node %v: %v", n, err)
						return
					}
					put, err := up.uploadString(signed)
					if err != nil {
						errch <- fmt.Errorf("Error uploading permanode's tag attribute %v for node %v: %v", tag, n, err)
						return
					}
					handleResult("node-permanode-tag", put, nil)
					errch <- nil
				}(tag)
			}

			for _ = range tags {
				if e := <-errch; e != nil && err == nil {
					err = e
				}
			}
			if err != nil {
				return nil, err
			}
		}
	}

	// TODO(bradfitz): faking a PutResult here to return
	// is kinda gross.  should instead make a
	// blobserver.Storage wrapper type (wrapping
	// statReceiver) that can track some of this?  or make
	// schemaWriteFileMap return it?
	json, _ := filebb.JSON()
	pr = &client.PutResult{BlobRef: br, Size: int64(len(json)), Skipped: false}
	return pr, nil
}
Beispiel #6
0
func (up *Uploader) uploadNodeRegularFile(n *node) (*client.PutResult, error) {
	m := schema.NewCommonFileMap(n.fullPath, n.fi)
	m["camliType"] = "file"
	file, err := up.open(n.fullPath)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	if up.fileOpts.exifTime {
		ra, ok := file.(io.ReaderAt)
		if !ok {
			return nil, errors.New("Error asserting local file to io.ReaderAt")
		}
		modtime, err := schema.FileTime(ra)
		if err != nil {
			log.Printf("warning: getting time from EXIF failed for %v: %v", n.fullPath, err)
		} else {
			m["unixMtime"] = schema.RFC3339FromTime(modtime)
		}
	}

	size := n.fi.Size()
	var fileContents io.Reader = io.LimitReader(file, size)

	if up.fileOpts.wantVivify() {
		err := schema.WriteFileChunks(up.statReceiver(), m, fileContents)
		if err != nil {
			return nil, err
		}
		json, err := m.JSON()
		if err != nil {
			return nil, err
		}
		bref := blobref.SHA1FromString(json)
		h := &client.UploadHandle{
			BlobRef:  bref,
			Size:     int64(len(json)),
			Contents: strings.NewReader(json),
			Vivify:   true,
		}
		return up.Upload(h)
	}

	var (
		blobref *blobref.BlobRef // of file schemaref
		sum     string           // "sha1-xxxxx"
	)

	const dupCheckThreshold = 256 << 10
	if size > dupCheckThreshold {
		sumRef, err := up.wholeFileDigest(n.fullPath)
		if err == nil {
			sum = sumRef.String()
			if ref, ok := up.fileMapFromDuplicate(up.statReceiver(), m, sum); ok {
				blobref = ref
			}
		}
	}

	if blobref == nil {
		if sum == "" && up.fileOpts.wantFilePermanode() {
			fileContents = &trackDigestReader{r: fileContents}
		}
		blobref, err = schema.WriteFileMap(up.statReceiver(), m, fileContents)
		if err != nil {
			return nil, err
		}
	}

	// TODO(mpl): test that none of these claims get uploaded if they've already been done
	if up.fileOpts.wantFilePermanode() {
		if td, ok := fileContents.(*trackDigestReader); ok {
			sum = td.Sum()
		}
		// Use a fixed time value for signing; not using modtime
		// so two identical files don't have different modtimes?
		// TODO(bradfitz): consider this more?
		permaNodeSigTime := time.Unix(0, 0)
		permaNode, err := up.UploadPlannedPermanode(sum, permaNodeSigTime)
		if err != nil {
			return nil, fmt.Errorf("Error uploading permanode for node %v: %v", n, err)
		}
		handleResult("node-permanode", permaNode, nil)

		// claimTime is both the time of the "claimDate" in the
		// JSON claim, as well as the date in the OpenPGP
		// header.
		// TODO(bradfitz): this is a little clumsy to do by hand.
		// There should probably be a method on *Uploader to do this
		// from an unsigned schema map. Maybe ditch the schema.Claimer
		// type and just have the Uploader override the claimDate.
		claimTime, err := time.Parse(time.RFC3339, m["unixMtime"].(string))
		if err != nil {
			return nil, fmt.Errorf("While parsing modtime for file %v: %v", n.fullPath, err)
		}
		contentAttr := schema.NewSetAttributeClaim(permaNode.BlobRef, "camliContent", blobref.String())
		contentAttr.SetClaimDate(claimTime)
		signed, err := up.SignMap(contentAttr, claimTime)
		if err != nil {
			return nil, fmt.Errorf("Failed to sign content claim for node %v: %v", n, err)
		}
		put, err := up.uploadString(signed)
		if err != nil {
			return nil, fmt.Errorf("Error uploading permanode's attribute for node %v: %v", n, err)
		}
		handleResult("node-permanode-contentattr", put, nil)
		if tags := up.fileOpts.tags(); len(tags) > 0 {
			// TODO(mpl): do these claims concurrently, not in series
			for _, tag := range tags {
				m := schema.NewAddAttributeClaim(permaNode.BlobRef, "tag", tag)
				m.SetClaimDate(claimTime)
				// TODO(mpl): verify that SetClaimDate does modify the GPG signature date of the claim
				signed, err := up.SignMap(m, claimTime)
				if err != nil {
					return nil, fmt.Errorf("Failed to sign tag claim for node %v: %v", n, err)
				}
				put, err := up.uploadString(signed)
				if err != nil {
					return nil, fmt.Errorf("Error uploading permanode's tag attribute %v for node %v: %v", tag, n, err)
				}
				handleResult("node-permanode-tag", put, nil)
			}
		}
	}

	// TODO(bradfitz): faking a PutResult here to return
	// is kinda gross.  should instead make a
	// blobserver.Storage wrapper type (wrapping
	// statReceiver) that can track some of this?  or make
	// schemaWriteFileMap return it?
	json, _ := m.JSON()
	pr := &client.PutResult{BlobRef: blobref, Size: int64(len(json)), Skipped: false}
	return pr, nil
}
Beispiel #7
0
// b: the parsed file schema blob
// mm: keys to populate
func (ix *Index) populateFile(b *schema.Blob, mm *mutationMap) (err error) {
	var times []time.Time // all creation or mod times seen; may be zero
	times = append(times, b.ModTime())

	blobRef := b.BlobRef()
	fetcher := &seekFetcherMissTracker{
		// TODO(bradfitz): cache this SeekFetcher on ix so it
		// it's have to be re-made each time? Probably small.
		src: blob.SeekerFromStreamingFetcher(ix.BlobSource),
	}
	defer func() {
		if err == nil {
			return
		}
		fetcher.mu.Lock()
		defer fetcher.mu.Unlock()
		if len(fetcher.missing) == 0 {
			return
		}
		// TODO(bradfitz): there was an error indexing this file, and
		// we failed to load the blobs in f.missing.  Add those as dependencies
		// somewhere so when we get one of those missing blobs, we kick off
		// a re-index of this file for whenever the indexer is idle.
	}()
	fr, err := b.NewFileReader(fetcher)
	if err != nil {
		// TODO(bradfitz): propagate up a transient failure
		// error type, so we can retry indexing files in the
		// future if blobs are only temporarily unavailable.
		// Basically the same as the TODO just below.
		//
		// We'll also want to bump the schemaVersion after this,
		// to fix anybody's index which is only partial due to
		// this old bug where it would return nil instead of doing
		// the necessary work.
		log.Printf("index: error indexing file, creating NewFileReader %s: %v", blobRef, err)
		return nil
	}
	defer fr.Close()
	mime, reader := magic.MIMETypeFromReader(fr)

	sha1 := sha1.New()
	var copyDest io.Writer = sha1
	var imageBuf *keepFirstN // or nil
	if strings.HasPrefix(mime, "image/") {
		// Emperically derived 1MiB assuming CR2 images require more than any
		// other filetype we support:
		//   https://gist.github.com/wathiede/7982372
		imageBuf = &keepFirstN{N: 1 << 20}
		copyDest = io.MultiWriter(copyDest, imageBuf)
	}
	size, err := io.Copy(copyDest, reader)
	if err != nil {
		// TODO: job scheduling system to retry this spaced
		// out max n times.  Right now our options are
		// ignoring this error (forever) or returning the
		// error and making the indexing try again (likely
		// forever failing).  Both options suck.  For now just
		// log and act like all's okay.
		//
		// See TODOs above, and the fetcher.missing stuff.
		log.Printf("index: error indexing file %s: %v", blobRef, err)
		return nil
	}
	wholeRef := blob.RefFromHash(sha1)

	if imageBuf != nil {
		if conf, err := images.DecodeConfig(bytes.NewReader(imageBuf.Bytes)); err == nil {
			mm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height)))
		}
		if ft, err := schema.FileTime(bytes.NewReader(imageBuf.Bytes)); err == nil {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
			times = append(times, ft)
		} else {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
		}

		indexEXIF(wholeRef, imageBuf.Bytes, mm)
	}

	var sortTimes []time.Time
	for _, t := range times {
		if !t.IsZero() {
			sortTimes = append(sortTimes, t)
		}
	}
	sort.Sort(types.ByTime(sortTimes))
	var time3339s string
	switch {
	case len(sortTimes) == 1:
		time3339s = types.Time3339(sortTimes[0]).String()
	case len(sortTimes) >= 2:
		oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1]
		time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String()
	}

	mm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1")
	mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime))
	mm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s))

	if strings.HasPrefix(mime, "audio/") {
		indexMusic(io.NewSectionReader(fr, 0, fr.Size()), wholeRef, mm)
	}

	return nil
}
Beispiel #8
0
// b: the parsed file schema blob
// mm: keys to populate
func (ix *Index) populateFile(fetcher blob.Fetcher, b *schema.Blob, mm *mutationMap) (err error) {
	var times []time.Time // all creation or mod times seen; may be zero
	times = append(times, b.ModTime())

	blobRef := b.BlobRef()
	fr, err := b.NewFileReader(fetcher)
	if err != nil {
		return err
	}
	defer fr.Close()
	mime, mr := magic.MIMETypeFromReader(fr)

	sha1 := sha1.New()
	var copyDest io.Writer = sha1
	var imageBuf *keepFirstN // or nil
	if strings.HasPrefix(mime, "image/") {
		imageBuf = &keepFirstN{N: 512 << 10}
		copyDest = io.MultiWriter(copyDest, imageBuf)
	}
	size, err := io.Copy(copyDest, mr)
	if err != nil {
		return err
	}
	wholeRef := blob.RefFromHash(sha1)

	if imageBuf != nil {
		var conf images.Config
		decodeConfig := func(r filePrefixReader) error {
			conf, err = images.DecodeConfig(r)
			return err
		}
		if err := readPrefixOrFile(imageBuf.Bytes, fetcher, b, decodeConfig); err == nil {
			mm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height)))
		}

		var ft time.Time
		fileTime := func(r filePrefixReader) error {
			ft, err = schema.FileTime(r)
			return err
		}
		if err = readPrefixOrFile(imageBuf.Bytes, fetcher, b, fileTime); err == nil {
			times = append(times, ft)
		}
		if exifDebug {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
		}

		// TODO(mpl): find (generate?) more broken EXIF images to experiment with.
		indexEXIFData := func(r filePrefixReader) error {
			return indexEXIF(wholeRef, r, mm)
		}
		if err = readPrefixOrFile(imageBuf.Bytes, fetcher, b, indexEXIFData); err != nil {
			if exifDebug {
				log.Printf("error parsing EXIF: %v", err)
			}
		}
	}

	var sortTimes []time.Time
	for _, t := range times {
		if !t.IsZero() {
			sortTimes = append(sortTimes, t)
		}
	}
	sort.Sort(types.ByTime(sortTimes))
	var time3339s string
	switch {
	case len(sortTimes) == 1:
		time3339s = types.Time3339(sortTimes[0]).String()
	case len(sortTimes) >= 2:
		oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1]
		time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String()
	}

	mm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1")
	mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime, wholeRef))
	mm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s))

	if strings.HasPrefix(mime, "audio/") {
		indexMusic(io.NewSectionReader(fr, 0, fr.Size()), wholeRef, mm)
	}

	return nil
}