func (c *ContainerMatcher) identify(n string, rdr Reader, res chan core.Result) { // safe to call on a nil matcher (i.e. container matching switched off) if c == nil { close(res) return } id := c.newIdentifier(len(c.parts)) var err error for err = rdr.Next(); err == nil; err = rdr.Next() { ct, ok := c.nameCTest[rdr.Name()] if !ok { continue } if config.Debug() { fmt.Fprintf(config.Out(), "{Name match - %s (container %d))}\n", rdr.Name(), c.conType) } // name has matched, let's test the CTests // ct.identify will generate a slice of hits which pass to // processHits which will return true if we can stop if c.processHits(ct.identify(c, id, rdr, rdr.Name()), id, ct, rdr.Name(), res) { break } } // send a default hit if no result and extension matches if c.extension != "" && !id.result && filepath.Ext(n) == "."+c.extension { res <- defaultHit(-1 - int(c.conType)) } close(res) }
// identify function - brings a new matcher into existence func (b *Matcher) identify(buf *siegreader.Buffer, quit chan struct{}, r chan core.Result, exclude ...int) { buf.Quit = quit waitSet := b.priorities.WaitSet(exclude...) var maxBOF, maxEOF int if len(exclude) > 0 { maxBOF, maxEOF = waitSet.MaxOffsets() } else { maxBOF, maxEOF = b.maxBOF, b.maxEOF } incoming := b.scorer(buf, waitSet, quit, r) rdr := siegreader.LimitReaderFrom(buf, maxBOF) // First test BOF frameset bfchan := b.bofFrames.index(buf, false, quit) for bf := range bfchan { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true}) } incoming <- strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true} } select { case <-quit: // the matcher has called quit for range bfchan { } // drain first close(incoming) return default: } // Do an initial check of BOF sequences b.start(true) // start bof matcher if not yet started var bchan chan wac.Result bchan = b.bAho.Index(rdr) for br := range bchan { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) if br.Offset > 131072 && (maxBOF < 0 || maxBOF > maxEOF*5) { // del buf.Stream 2^16 65536 2^17 131072 break } } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false} } } select { case <-quit: // the matcher has called quit for range bchan { } // drain first close(incoming) return default: } // Setup EOF tests efchan := b.eofFrames.index(buf, true, quit) b.start(false) rrdr := siegreader.LimitReverseReaderFrom(buf, maxEOF) echan := b.eAho.Index(rrdr) // if we have a maximum value on EOF do a sequential search if maxEOF >= 0 { if maxEOF != 0 { _, _ = buf.CanSeek(0, true) // force a full read to enable EOF scan to proceed for streams } for ef := range efchan { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true}) } incoming <- strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true} } // Scan complete EOF for er := range echan { if er.Index[0] == -1 { incoming <- progressStrike(er.Offset, true) } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false}) } incoming <- strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false} } } // send a final progress strike with the maximum EOF incoming <- progressStrike(int64(maxEOF), true) // Finally, finish BOF scan for br := range bchan { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false} } } close(incoming) return } // If no maximum on EOF do a parallel search for { select { case br, ok := <-bchan: if !ok { if maxBOF < 0 && maxEOF != 0 { _, _ = buf.CanSeek(0, true) // if we've a limit BOF reader, force a full read to enable EOF scan to proceed for streams } bchan = nil } else { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false} } } case ef, ok := <-efchan: if !ok { efchan = nil } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true}) } incoming <- strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true} } case er, ok := <-echan: if !ok { echan = nil } else { if er.Index[0] == -1 { incoming <- progressStrike(er.Offset, true) } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false}) } incoming <- strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false} } } } if bchan == nil && efchan == nil && echan == nil { close(incoming) return } } }
func (b *Matcher) scorer(buf *siegreader.Buffer, waitSet *priority.WaitSet, q chan struct{}, r chan<- core.Result) chan<- strike { incoming := make(chan strike) hits := make(map[int]*hitItem) strikes := make(map[int]*strikeItem) var bof int64 var eof int64 var quitting bool quit := func() { close(q) quitting = true } newHit := func(i int) *hitItem { l := len(b.keyFrames[i]) hit := &hitItem{ potentialIdxs: make([]int, l), partials: make([][][2]int64, l), } hits[i] = hit return hit } // given the current bof and eof, is there anything worth waiting for? continueWaiting := func(w []int) bool { var keepScanning bool // now for each of the possible signatures we are either waiting on or have partial/potential matches for, check whether there are live contenders for _, v := range w { kf := b.keyFrames[v] for i, f := range kf { off := bof if f.typ > frames.PREV { off = eof } var waitfor, excludable bool if f.key.pMax == -1 || f.key.pMax+int64(f.key.lMax) > off { waitfor = true } else if hit, ok := hits[v]; ok { if hit.partials[i] != nil { waitfor = true } else if hit.potentialIdxs[i] > 0 && strikes[hit.potentialIdxs[i]-1].hasPotential() { waitfor, excludable = true, true } } // if we've got to the end of the signature, and have determined this is a live one - return immediately & continue scan if waitfor { if i == len(kf)-1 { if !config.Slow() || !config.Checkpoint(bof) { return true } keepScanning = true fmt.Fprintf(config.Out(), "waiting on: %d, potentially excludable: %t\n", v, excludable) } continue } break } } return keepScanning } testStrike := func(st strike) []kfHit { // the offsets we *record* are always BOF offsets - these can be interpreted as EOF offsets when necessary off := st.offset if st.reverse { off = buf.Size() - st.offset - int64(st.length) } // grab the relevant testTree t := b.tests[st.idxa+st.idxb] res := make([]kfHit, 0, 10) // immediately apply key frames for the completes for _, kf := range t.complete { if b.keyFrames[kf[0]][kf[1]].check(st.offset) && waitSet.Check(kf[0]) { res = append(res, kfHit{kf, off, st.length}) } } // if there are no incompletes, we are done if len(t.incomplete) < 1 { return res } // see what incompletes are worth pursuing var checkl, checkr bool for _, v := range t.incomplete { if checkl && checkr { break } if b.keyFrames[v.kf[0]][v.kf[1]].check(st.offset) && waitSet.Check(v.kf[0]) { if v.l { checkl = true } if v.r { checkr = true } } } if !checkl && !checkr { return res } // calculate the offset and lengths for the left and right test slices var lslc, rslc []byte var lpos, rpos int64 var llen, rlen int if st.reverse { lpos, llen = st.offset+int64(st.length), t.maxLeftDistance rpos, rlen = st.offset-int64(t.maxRightDistance), t.maxRightDistance if rpos < 0 { rlen = rlen + int(rpos) rpos = 0 } } else { lpos, llen = st.offset-int64(t.maxLeftDistance), t.maxLeftDistance rpos, rlen = st.offset+int64(st.length), t.maxRightDistance if lpos < 0 { llen = llen + int(lpos) lpos = 0 } } // the partials slice has a mirror entry for each of the testTree incompletes partials := make([]partial, len(t.incomplete)) // test left (if there are valid left tests to try) if checkl { if st.reverse { lslc, _ = buf.EofSlice(lpos, llen) } else { lslc, _ = buf.Slice(lpos, llen) } left := matchTestNodes(t.left, lslc, true) for _, lp := range left { if partials[lp.followUp].l { partials[lp.followUp].ldistances = append(partials[lp.followUp].ldistances, lp.distances...) } else { partials[lp.followUp].l = true partials[lp.followUp].ldistances = lp.distances } } } // test right (if there are valid right tests to try) if checkr { if st.reverse { rslc, _ = buf.EofSlice(rpos, rlen) } else { rslc, _ = buf.Slice(rpos, rlen) } right := matchTestNodes(t.right, rslc, false) for _, rp := range right { if partials[rp.followUp].r { partials[rp.followUp].rdistances = append(partials[rp.followUp].rdistances, rp.distances...) } else { partials[rp.followUp].r = true partials[rp.followUp].rdistances = rp.distances } } } // now iterate through the partials, checking whether they fulfil any of the incompletes for i, p := range partials { if p.l == t.incomplete[i].l && p.r == t.incomplete[i].r { kf := t.incomplete[i].kf if b.keyFrames[kf[0]][kf[1]].check(st.offset) && waitSet.Check(kf[0]) { if !p.l { p.ldistances = []int{0} } if !p.r { p.rdistances = []int{0} } if oneEnough(kf[1], b.keyFrames[kf[0]]) { res = append(res, kfHit{kf, off - int64(p.ldistances[0]), p.ldistances[0] + st.length + p.rdistances[0]}) continue } for _, ldistance := range p.ldistances { for _, rdistance := range p.rdistances { res = append(res, kfHit{kf, off - int64(ldistance), ldistance + st.length + rdistance}) } } } } } return res } applyKeyFrame := func(hit kfHit) (bool, string) { kfs := b.keyFrames[hit.id[0]] if len(kfs) == 1 { return true, fmt.Sprintf("byte match at %d, %d", hit.offset, hit.length) } h, ok := hits[hit.id[0]] if !ok { h = newHit(hit.id[0]) } if h.partials[hit.id[1]] == nil { h.partials[hit.id[1]] = [][2]int64{{hit.offset, int64(hit.length)}} } else { h.partials[hit.id[1]] = append(h.partials[hit.id[1]], [2]int64{hit.offset, int64(hit.length)}) } for _, p := range h.partials { if p == nil { return false, "" } } prevOff := h.partials[0] basis := make([][][2]int64, len(kfs)) basis[0] = prevOff prevKf := kfs[0] ok = false for i, kf := range kfs[1:] { var nextKf keyFrame if i+2 < len(kfs) { nextKf = kfs[i+2] } thisOff := h.partials[i+1] prevOff, ok = kf.checkRelated(prevKf, nextKf, thisOff, prevOff) if !ok { return false, "" } basis[i+1] = prevOff prevKf = kf } return true, fmt.Sprintf("byte match at %v", basis) } go func() { for in := range incoming { // if we've got a positive result, drain any remaining strikes from the matchers if quitting { continue } // if the strike reports progress, check if we should be continuing to wait if in.idxa == -1 { // update with the latest offset if in.reverse { eof = in.offset } else { bof = in.offset } w := waitSet.WaitingOnAt(bof, eof) // if any of the waitlists are nil, we will continue - unless we are past the known bof and known eof (points at which we *should* have got at least partial matches), in which case we will check if any partial/potential matches are live if w == nil { // keep going if we don't have a maximum known bof, or if our current bof/eof are less than the maximum known bof/eof if b.knownBOF < 0 || int64(b.knownBOF) > bof || int64(b.knownEOF) > eof { continue } // if we don't have a waitlist, and we are past the known bof and known eof, grab all the partials and potentials to check if any are live w = all(hits) } // exhausted all contenders, we can stop scanning if !continueWaiting(w) { quit() } continue } // now cache or satisfy the strike var hasPotential bool potentials := filterKF(b.tests[in.idxa+in.idxb].keyFrames(), waitSet) for _, pot := range potentials { // if any of the signatures are single keyframe we can satisfy immediately and skip cache if len(b.keyFrames[pot[0]]) == 1 { hasPotential = true break } if hit, ok := hits[pot[0]]; ok && hit.potentiallyComplete(pot[1], strikes) { hasPotential = true break } } if !hasPotential { // cache the strike s, ok := strikes[in.idxa+in.idxb] if !ok { s = &strikeItem{in, -1, nil} strikes[in.idxa+in.idxb] = s } else { if s.successive == nil { s.successive = make([][2]int64, 0, 10) } s.successive = append(s.successive, [2]int64{in.offset, int64(in.length)}) } // range over the potentials, linking to the strike for _, pot := range potentials { if b.keyFrames[pot[0]][pot[1]].check(in.offset) { hit, ok := hits[pot[0]] if !ok { hit = newHit(pot[0]) } hit.potentialIdxs[pot[1]] = in.idxa + in.idxb + 1 } } goto end } // satisfy the strike for { ks := testStrike(in) for _, k := range ks { if match, basis := applyKeyFrame(k); match { if waitSet.Check(k.id[0]) { r <- result{k.id[0], basis} if waitSet.PutAt(k.id[0], bof, eof) { quit() goto end } } if h, ok := hits[k.id[0]]; ok { h.matched = true } } } potentials = filterKF(potentials, waitSet) var ok bool for _, pot := range potentials { in, ok = hits[pot[0]].nextPotential(strikes) if ok { break } } if !ok { break } } end: // keep looping until incoming is closed } close(r) }() return incoming }
// IdentifyBuffer identifies a siegreader buffer. Supply the error from Get as the second argument. func (s *Siegfried) IdentifyBuffer(buffer *siegreader.Buffer, err error, name, mime string) ([]core.Identification, error) { if err != nil && err != siegreader.ErrEmpty { return nil, fmt.Errorf("siegfried: error reading file; got %v", err) } recs := make([]core.Recorder, len(s.ids)) for i, v := range s.ids { recs[i] = v.Recorder() if name != "" { recs[i].Active(core.NameMatcher) } if mime != "" { recs[i].Active(core.MIMEMatcher) } if err == nil { recs[i].Active(core.XMLMatcher) recs[i].Active(core.TextMatcher) } } // Log name for debug/slow if config.Debug() || config.Slow() { fmt.Fprintf(config.Out(), "[FILE] %s\n", name) } // Name Matcher if len(name) > 0 && s.nm != nil { nms, _ := s.nm.Identify(name, nil) // we don't care about an error here for v := range nms { for _, rec := range recs { if rec.Record(core.NameMatcher, v) { break } } } } // MIME Matcher if len(mime) > 0 && s.mm != nil { mms, _ := s.mm.Identify(mime, nil) // we don't care about an error here for v := range mms { for _, rec := range recs { if rec.Record(core.MIMEMatcher, v) { break } } } } // Container Matcher if s.cm != nil { if config.Debug() { fmt.Fprintln(config.Out(), ">>START CONTAINER MATCHER") } cms, cerr := s.cm.Identify(name, buffer) for v := range cms { for _, rec := range recs { if rec.Record(core.ContainerMatcher, v) { break } } } if err == nil { err = cerr } } satisfied := true // XML Matcher if s.xm != nil { for _, rec := range recs { if ok, _ := rec.Satisfied(core.XMLMatcher); !ok { satisfied = false break } } if !satisfied { if config.Debug() { fmt.Fprintln(config.Out(), ">>START XML MATCHER") } xms, xerr := s.xm.Identify("", buffer) for v := range xms { for _, rec := range recs { if rec.Record(core.XMLMatcher, v) { break } } } if err == nil { err = xerr } } } satisfied = true // RIFF Matcher if s.rm != nil { for _, rec := range recs { if ok, _ := rec.Satisfied(core.RIFFMatcher); !ok { satisfied = false break } } if !satisfied { if config.Debug() { fmt.Fprintln(config.Out(), ">>START RIFF MATCHER") } rms, rerr := s.rm.Identify("", buffer) for v := range rms { for _, rec := range recs { if rec.Record(core.RIFFMatcher, v) { break } } } if err == nil { err = rerr } } } satisfied = true exclude := make([]int, 0, len(recs)) for _, rec := range recs { ok, ex := rec.Satisfied(core.ByteMatcher) if !ok { satisfied = false } else { exclude = append(exclude, ex) } } // Byte Matcher if s.bm != nil && !satisfied { if config.Debug() { fmt.Fprintln(config.Out(), ">>START BYTE MATCHER") } ids, _ := s.bm.Identify("", buffer, exclude...) // we don't care about an error here for v := range ids { for _, rec := range recs { if rec.Record(core.ByteMatcher, v) { break } } } } satisfied = true for _, rec := range recs { if ok, _ := rec.Satisfied(core.TextMatcher); !ok { satisfied = false break } } // Text Matcher if s.tm != nil && !satisfied { ids, _ := s.tm.Identify("", buffer) // we don't care about an error here for v := range ids { for _, rec := range recs { if rec.Record(core.TextMatcher, v) { break } } } } if len(recs) < 2 { return recs[0].Report(), err } var res []core.Identification for idx, rec := range recs { if config.Slow() || config.Debug() { for _, id := range rec.Report() { fmt.Fprintf(config.Out(), "matched: %s\n", id.String()) } } if idx == 0 { res = rec.Report() continue } res = append(res, rec.Report()...) } return res, err }
func (m Matcher) Identify(na string, b *siegreader.Buffer, exclude ...int) (chan core.Result, error) { buf, err := b.Slice(0, 8) if err != nil || buf[0] != 'R' || buf[1] != 'I' || buf[2] != 'F' || buf[3] != 'F' { res := make(chan core.Result) close(res) return res, nil } rcc, rrdr, err := riff.NewReader(siegreader.ReaderFrom(b)) if err != nil { res := make(chan core.Result) close(res) return res, nil } // now make structures for testing uniqs := make(map[riff.FourCC]bool) res := make(chan core.Result) waitset := m.priorities.WaitSet(exclude...) // send and report if satisified send := func(cc riff.FourCC) bool { if config.Debug() { fmt.Fprintf(config.Out(), "riff match %s\n", string(cc[:])) } if uniqs[cc] { return false } uniqs[cc] = true for _, hit := range m.riffs[cc] { if waitset.Check(hit) { if config.Debug() { fmt.Fprintf(config.Out(), "sending riff match %s\n", string(cc[:])) } res <- result{hit, cc} if waitset.Put(hit) { return true } } } return false } // riff walk var descend func(*riff.Reader) bool descend = func(r *riff.Reader) bool { for { chunkID, chunkLen, chunkData, err := r.Next() if err != nil || send(chunkID) { return true } if chunkID == riff.LIST { listType, list, err := riff.NewListReader(chunkLen, chunkData) if err != nil || send(listType) { return true } if descend(list) { return true } } } } // go time go func() { if send(rcc) { close(res) return } descend(rrdr) close(res) }() return res, nil }