Beispiel #1
0
func (c *ContainerMatcher) identify(n string, rdr Reader, res chan core.Result) {
	// safe to call on a nil matcher (i.e. container matching switched off)
	if c == nil {
		close(res)
		return
	}
	id := c.newIdentifier(len(c.parts))
	var err error
	for err = rdr.Next(); err == nil; err = rdr.Next() {
		ct, ok := c.nameCTest[rdr.Name()]
		if !ok {
			continue
		}
		if config.Debug() {
			fmt.Fprintf(config.Out(), "{Name match - %s (container %d))}\n", rdr.Name(), c.conType)
		}
		// name has matched, let's test the CTests
		// ct.identify will generate a slice of hits which pass to
		// processHits which will return true if we can stop
		if c.processHits(ct.identify(c, id, rdr, rdr.Name()), id, ct, rdr.Name(), res) {
			break
		}
	}
	// send a default hit if no result and extension matches
	if c.extension != "" && !id.result && filepath.Ext(n) == "."+c.extension {
		res <- defaultHit(-1 - int(c.conType))
	}
	close(res)
}
Beispiel #2
0
// identify function - brings a new matcher into existence
func (b *Matcher) identify(buf *siegreader.Buffer, quit chan struct{}, r chan core.Result) {
	buf.Quit = quit
	incoming := b.scorer(buf, quit, r)
	rdr := siegreader.LimitReaderFrom(buf, b.maxBOF)
	// First test BOF frameset
	bfchan := b.bofFrames.index(buf, false, quit)
	for bf := range bfchan {
		if config.Debug() {
			fmt.Fprintln(config.Out(), strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true})
		}
		incoming <- strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true}
	}
	select {
	case <-quit: // the matcher has called quit
		for _ = range bfchan {
		} // drain first
		close(incoming)
		return
	default:
	}

	// Do an initial check of BOF sequences
	b.start(true) // start bof matcher if not yet started
	var bchan chan wac.Result
	bchan = b.bAho.Index(rdr)
	for br := range bchan {
		if br.Index[0] == -1 {
			incoming <- progressStrike(br.Offset, false)
			if br.Offset > 131072 && (b.maxBOF < 0 || b.maxBOF > b.maxEOF*5) { // del buf.Stream
				break
			}
		} else {
			if config.Debug() {
				fmt.Fprintln(config.Out(), strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false})
			}
			incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}
		}
	}
	select {
	case <-quit: // the matcher has called quit
		for _ = range bchan {
		} // drain first
		close(incoming)
		return
	default:
	}

	// Setup EOF tests
	efchan := b.eofFrames.index(buf, true, quit)
	b.start(false)
	rrdr := siegreader.LimitReverseReaderFrom(buf, b.maxEOF)
	echan := b.eAho.Index(rrdr)

	// if we have a maximum value on EOF do a sequential search
	if b.maxEOF >= 0 {
		if b.maxEOF != 0 {
			_, _ = buf.CanSeek(0, true) // force a full read to enable EOF scan to proceed for streams
		}
		for ef := range efchan {
			if config.Debug() {
				fmt.Fprintln(config.Out(), strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true})
			}
			incoming <- strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true}
		}
		// Scan complete EOF
		for er := range echan {
			if er.Index[0] == -1 {
				incoming <- progressStrike(er.Offset, true)
			} else {
				if config.Debug() {
					fmt.Fprintln(config.Out(), strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false})
				}
				incoming <- strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false}
			}
		}
		// send a final progress strike with the maximum EOF
		incoming <- progressStrike(int64(b.maxEOF), true)
		// Finally, finish BOF scan
		for br := range bchan {
			if br.Index[0] == -1 {
				incoming <- progressStrike(br.Offset, false)
			} else {
				if config.Debug() {
					fmt.Fprintln(config.Out(), strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false})
				}
				incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}
			}
		}
		close(incoming)
		return
	}
	// If no maximum on EOF do a parallel search
	for {
		select {
		case br, ok := <-bchan:
			if !ok {
				if b.maxBOF < 0 && b.maxEOF != 0 {
					_, _ = buf.CanSeek(0, true) // if we've a limit BOF reader, force a full read to enable EOF scan to proceed for streams
				}
				bchan = nil
			} else {
				if br.Index[0] == -1 {
					incoming <- progressStrike(br.Offset, false)
				} else {
					if config.Debug() {
						fmt.Fprintln(config.Out(), strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false})
					}
					incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}
				}
			}
		case ef, ok := <-efchan:
			if !ok {
				efchan = nil
			} else {
				if config.Debug() {
					fmt.Fprintln(config.Out(), strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true})
				}
				incoming <- strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true}
			}
		case er, ok := <-echan:
			if !ok {
				echan = nil
			} else {
				if er.Index[0] == -1 {
					incoming <- progressStrike(er.Offset, true)
				} else {
					if config.Debug() {
						fmt.Fprintln(config.Out(), strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false})
					}
					incoming <- strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false}
				}
			}
		}
		if bchan == nil && efchan == nil && echan == nil {
			close(incoming)
			return
		}
	}
}
Beispiel #3
0
func (b *Matcher) scorer(buf *siegreader.Buffer, q chan struct{}, r chan<- core.Result) chan<- strike {
	incoming := make(chan strike)
	waitSet := b.priorities.WaitSet()
	hits := make(map[int]*hitItem)
	strikes := make(map[int]*strikeItem)

	var bof int64
	var eof int64

	var quitting bool
	quit := func() {
		close(q)
		quitting = true
	}

	newHit := func(i int) *hitItem {
		l := len(b.keyFrames[i])
		hit := &hitItem{
			potentialIdxs: make([]int, l),
			partials:      make([][][2]int64, l),
		}
		hits[i] = hit
		return hit
	}

	// given the current bof and eof, is there anything worth waiting for?
	continueWaiting := func(w []int) bool {
		var keepScanning bool
		// now for each of the possible signatures we are either waiting on or have partial/potential matches for, check whether there are live contenders
		for _, v := range w {
			kf := b.keyFrames[v]
			for i, f := range kf {
				off := bof
				if f.typ > frames.PREV {
					off = eof
				}
				var waitfor, excludable bool
				if f.key.pMax == -1 || f.key.pMax+int64(f.key.lMax) > off {
					waitfor = true
				} else if hit, ok := hits[v]; ok {
					if hit.partials[i] != nil {
						waitfor = true
					} else if hit.potentialIdxs[i] > 0 && strikes[hit.potentialIdxs[i]-1].hasPotential() {
						waitfor, excludable = true, true
					}
				}
				// if we've got to the end of the signature, and have determined this is a live one - return immediately & continue scan
				if waitfor {
					if i == len(kf)-1 {
						if !config.Slow() || !config.Checkpoint(bof) {
							return true
						}
						keepScanning = true
						fmt.Fprintf(config.Out(), "waiting on: %d, potentially excludable: %t\n", v, excludable)
					}
					continue
				}
				break
			}
		}
		return keepScanning
	}

	testStrike := func(st strike) []kfHit {
		// the offsets we *record* are always BOF offsets - these can be interpreted as EOF offsets when necessary
		off := st.offset
		if st.reverse {
			off = buf.Size() - st.offset - int64(st.length)
		}
		// grab the relevant testTree
		t := b.tests[st.idxa+st.idxb]
		res := make([]kfHit, 0, 10)
		// immediately apply key frames for the completes
		for _, kf := range t.complete {
			if b.keyFrames[kf[0]][kf[1]].check(st.offset) && waitSet.Check(kf[0]) {
				res = append(res, kfHit{kf, off, st.length})
			}
		}
		// if there are no incompletes, we are done
		if len(t.incomplete) < 1 {
			return res
		}
		// see what incompletes are worth pursuing
		var checkl, checkr bool
		for _, v := range t.incomplete {
			if checkl && checkr {
				break
			}
			if b.keyFrames[v.kf[0]][v.kf[1]].check(st.offset) && waitSet.Check(v.kf[0]) {
				if v.l {
					checkl = true
				}
				if v.r {
					checkr = true
				}
			}
		}
		if !checkl && !checkr {
			return res
		}
		// calculate the offset and lengths for the left and right test slices
		var lslc, rslc []byte
		var lpos, rpos int64
		var llen, rlen int
		if st.reverse {
			lpos, llen = st.offset+int64(st.length), t.maxLeftDistance
			rpos, rlen = st.offset-int64(t.maxRightDistance), t.maxRightDistance
			if rpos < 0 {
				rlen = rlen + int(rpos)
				rpos = 0
			}
		} else {
			lpos, llen = st.offset-int64(t.maxLeftDistance), t.maxLeftDistance
			rpos, rlen = st.offset+int64(st.length), t.maxRightDistance
			if lpos < 0 {
				llen = llen + int(lpos)
				lpos = 0
			}
		}
		//  the partials slice has a mirror entry for each of the testTree incompletes
		partials := make([]partial, len(t.incomplete))
		// test left (if there are valid left tests to try)
		if checkl {
			if st.reverse {
				lslc, _ = buf.EofSlice(lpos, llen)
			} else {
				lslc, _ = buf.Slice(lpos, llen)
			}
			left := matchTestNodes(t.left, lslc, true)
			for _, lp := range left {
				if partials[lp.followUp].l {
					partials[lp.followUp].ldistances = append(partials[lp.followUp].ldistances, lp.distances...)
				} else {
					partials[lp.followUp].l = true
					partials[lp.followUp].ldistances = lp.distances
				}
			}
		}
		// test right (if there are valid right tests to try)
		if checkr {
			if st.reverse {
				rslc, _ = buf.EofSlice(rpos, rlen)
			} else {
				rslc, _ = buf.Slice(rpos, rlen)
			}
			right := matchTestNodes(t.right, rslc, false)
			for _, rp := range right {
				if partials[rp.followUp].r {
					partials[rp.followUp].rdistances = append(partials[rp.followUp].rdistances, rp.distances...)
				} else {
					partials[rp.followUp].r = true
					partials[rp.followUp].rdistances = rp.distances
				}
			}
		}
		// now iterate through the partials, checking whether they fulfil any of the incompletes
		for i, p := range partials {
			if p.l == t.incomplete[i].l && p.r == t.incomplete[i].r {
				kf := t.incomplete[i].kf
				if b.keyFrames[kf[0]][kf[1]].check(st.offset) && waitSet.Check(kf[0]) {
					if !p.l {
						p.ldistances = []int{0}
					}
					if !p.r {
						p.rdistances = []int{0}
					}
					if oneEnough(kf[1], b.keyFrames[kf[0]]) {
						res = append(res, kfHit{kf, off - int64(p.ldistances[0]), p.ldistances[0] + st.length + p.rdistances[0]})
						continue
					}
					for _, ldistance := range p.ldistances {
						for _, rdistance := range p.rdistances {
							res = append(res, kfHit{kf, off - int64(ldistance), ldistance + st.length + rdistance})
						}
					}
				}
			}
		}
		return res
	}

	applyKeyFrame := func(hit kfHit) (bool, string) {
		kfs := b.keyFrames[hit.id[0]]
		if len(kfs) == 1 {
			return true, fmt.Sprintf("byte match at %d, %d", hit.offset, hit.length)
		}
		h, ok := hits[hit.id[0]]
		if !ok {
			h = newHit(hit.id[0])
		}
		if h.partials[hit.id[1]] == nil {
			h.partials[hit.id[1]] = [][2]int64{[2]int64{hit.offset, int64(hit.length)}}
		} else {
			h.partials[hit.id[1]] = append(h.partials[hit.id[1]], [2]int64{hit.offset, int64(hit.length)})
		}
		for _, p := range h.partials {
			if p == nil {
				return false, ""
			}
		}
		prevOff := h.partials[0]
		basis := make([][][2]int64, len(kfs))
		basis[0] = prevOff
		prevKf := kfs[0]
		ok = false
		for i, kf := range kfs[1:] {
			var nextKf keyFrame
			if i+2 < len(kfs) {
				nextKf = kfs[i+2]
			}
			thisOff := h.partials[i+1]
			prevOff, ok = kf.checkRelated(prevKf, nextKf, thisOff, prevOff)
			if !ok {
				return false, ""
			}
			basis[i+1] = prevOff
			prevKf = kf
		}
		return true, fmt.Sprintf("byte match at %v", basis)
	}

	go func() {
		for in := range incoming {
			// if we've got a postive result, drain any remaining strikes from the matchers
			if quitting {
				continue
			}
			// if the strike reports progress, check if we should be continuing to wait
			if in.idxa == -1 {
				// update with the latest offset
				if in.reverse {
					eof = in.offset
				} else {
					bof = in.offset
				}
				w := waitSet.WaitingOn()
				// if any of the waitlists are nil, we will continue - unless we are past the known bof and known eof (points at which we *should* have got at least partial matches), in which case we will check if any partial/potential matches are live
				if w == nil {
					// keep going if we don't have a maximum known bof, or if our current bof/eof are less than the maximum known bof/eof
					if b.knownBOF < 0 || int64(b.knownBOF) > bof || int64(b.knownEOF) > eof {
						continue
					}
					// if we don't have a waitlist, and we are past the known bof and known eof, grab all the partials and potentials to check if any are live
					w = all(hits)
				}
				// exhausted all contenders, we can stop scanning
				if !continueWaiting(w) {
					quit()
				}
				continue
			}
			// now cache or satisfy the strike
			var hasPotential bool
			potentials := filterKF(b.tests[in.idxa+in.idxb].keyFrames(), waitSet)
			for _, pot := range potentials {
				// if any of the signatures are single keyframe we can satisfy immediately and skip cache
				if len(b.keyFrames[pot[0]]) == 1 {
					hasPotential = true
					break
				}
				if hit, ok := hits[pot[0]]; ok && hit.potentiallyComplete(pot[1], strikes) {
					hasPotential = true
					break
				}
			}
			if !hasPotential {
				// cache the strike
				s, ok := strikes[in.idxa+in.idxb]
				if !ok {
					s = &strikeItem{in, -1, nil}
					strikes[in.idxa+in.idxb] = s
				} else {
					if s.successive == nil {
						s.successive = make([][2]int64, 0, 10)
					}
					s.successive = append(s.successive, [2]int64{in.offset, int64(in.length)})
				}
				// range over the potentials, linking to the strike
				for _, pot := range potentials {
					if b.keyFrames[pot[0]][pot[1]].check(in.offset) {
						hit, ok := hits[pot[0]]
						if !ok {
							hit = newHit(pot[0])
						}
						hit.potentialIdxs[pot[1]] = in.idxa + in.idxb + 1
					}
				}
				goto end
			}
			// satisfy the strike
			for {
				ks := testStrike(in)
				for _, k := range ks {
					if match, basis := applyKeyFrame(k); match {
						if waitSet.Check(k.id[0]) {
							r <- result{k.id[0], basis}
							if waitSet.Put(k.id[0]) {
								quit()
								goto end
							}
						}
						if h, ok := hits[k.id[0]]; ok {
							h.matched = true
						}
					}
				}
				potentials = filterKF(potentials, waitSet)
				var ok bool
				for _, pot := range potentials {
					in, ok = hits[pot[0]].nextPotential(strikes)
					if ok {
						break
					}
				}
				if !ok {
					break
				}
			}
		end: // keep looping until incoming is closed
		}
		close(r)
	}()
	return incoming
}
Beispiel #4
0
// Identify identifies a stream or file object.
// It takes the name of the file/stream (if unknown, give an empty string) and an io.Reader
// It returns a channel of identifications and an error.
func (s *Siegfried) Identify(r io.Reader, name, mime string) (chan core.Identification, error) {
	buffer, err := s.buffers.Get(r)
	if err == io.EOF {
		err = nil
	}
	if err != nil && err != siegreader.ErrEmpty {
		return nil, fmt.Errorf("siegfried: error reading file; got %v", err)
	}
	res := make(chan core.Identification)
	recs := make([]core.Recorder, len(s.ids))
	for i, v := range s.ids {
		recs[i] = v.Recorder()
		if name != "" {
			recs[i].Active(core.ExtensionMatcher)
		}
		if mime != "" {
			recs[i].Active(core.MIMEMatcher)
		}
		if err == nil {
			//recs[i].Active(core.ContainerMatcher)
			//recs[i].Active(core.ByteMatcher)
			recs[i].Active(core.TextMatcher)
		}
	}
	// Extension Matcher
	if len(name) > 0 {
		ext := stringmatcher.NormaliseExt(name)
		ems, _ := s.em.Identify(ext, nil) // we don't care about an error here
		for v := range ems {
			for _, rec := range recs {
				if rec.Record(core.ExtensionMatcher, stringmatcher.ExtResult{v}) {
					break
				}
			}
		}
	}
	// MIME Matcher
	if len(mime) > 0 {
		mime = stringmatcher.NormaliseMIME(mime)
		mms, _ := s.mm.Identify(mime, nil) // we don't care about an error here
		for v := range mms {
			for _, rec := range recs {
				if rec.Record(core.MIMEMatcher, stringmatcher.MIMEResult{v}) {
					break
				}
			}
		}
	}
	// Container Matcher
	if s.cm != nil {
		if config.Debug() {
			fmt.Fprintln(config.Out(), ">>START CONTAINER MATCHER")
		}
		cms, cerr := s.cm.Identify(name, buffer)
		for v := range cms {
			for _, rec := range recs {
				if rec.Record(core.ContainerMatcher, v) {
					break
				}
			}
		}
		if err == nil {
			err = cerr
		}
	}
	satisfied := true
	for _, rec := range recs {
		if !rec.Satisfied(core.ByteMatcher) {
			satisfied = false
		}
	}
	// Byte Matcher
	if !satisfied {
		if config.Debug() {
			fmt.Fprintln(config.Out(), ">>START BYTE MATCHER")
		}
		ids, _ := s.bm.Identify("", buffer) // we don't care about an error here
		for v := range ids {
			for _, rec := range recs {
				if rec.Record(core.ByteMatcher, v) {
					break
				}
			}
		}
	}
	satisfied = true
	for _, rec := range recs {
		if !rec.Satisfied(core.TextMatcher) {
			satisfied = false
			break
		}
	}
	// Text Matcher
	if !satisfied {
		ids, _ := s.tm.Identify("", buffer) // we don't care about an error here
		for v := range ids {
			for _, rec := range recs {
				if rec.Record(core.TextMatcher, v) {
					break
				}
			}
		}
	}
	s.buffers.Put(buffer)
	go func() {
		for _, rec := range recs {
			rec.Report(res)
		}
		close(res)
	}()
	return res, err
}