func identifyRdr(w writer, s *siegfried.Siegfried, r io.Reader, sz int64, path, mime, mod string) { lg.set(path) c, err := s.Identify(r, path, mime) lg.err(err) if c == nil { w.writeFile(path, sz, mod, nil, err, nil) lg.reset() return } var b *siegreader.Buffer var cs []byte if checksum != nil { b = s.Buffer() var i int64 l := checksum.BlockSize() for ; ; i += int64(l) { buf, _ := b.Slice(i, l) if buf == nil { break } checksum.Write(buf) } cs = checksum.Sum(nil) checksum.Reset() } a := w.writeFile(path, sz, mod, cs, err, idChan(c)) lg.reset() if !*archive || a == config.None { return } var d decompressor if b == nil { b = s.Buffer() } switch a { case config.Zip: d, err = newZip(siegreader.ReaderFrom(b), path, sz) case config.Gzip: d, err = newGzip(b, path) case config.Tar: d, err = newTar(siegreader.ReaderFrom(b), path) case config.ARC: d, err = newARC(siegreader.ReaderFrom(b), path) case config.WARC: d, err = newWARC(siegreader.ReaderFrom(b), path) } if err != nil { writeError(w, path, sz, mod, fmt.Errorf("failed to decompress, got: %v", err)) return } for err = d.next(); err == nil; err = d.next() { if *droido { for _, v := range d.dirs() { w.writeFile(v, -1, "", nil, nil, nil) } } identifyRdr(w, s, d.reader(), d.size(), d.path(), d.mime(), d.mod()) } }
func newGzip(b *siegreader.Buffer, path string) (decompressor, error) { _ = b.SizeNow() // in case of a stream, force full read buf, err := b.EofSlice(0, 4) // gzip stores uncompressed size in last 4 bytes of the stream if err != nil { return nil, err } sz := int64(uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24) g, err := gzip.NewReader(siegreader.ReaderFrom(b)) return &gzipD{sz: sz, p: path, rdr: g}, err }
func (fs *frameSet) index(buf *siegreader.Buffer, rev bool, quit chan struct{}) chan fsmatch { ret := make(chan fsmatch) go func() { var i int for { select { case <-quit: close(ret) return default: } if i >= len(fs.set) { close(ret) return } f := fs.set[i] var match bool var matches []int if rev { slc, err := buf.EofSlice(0, frames.TotalLength(f)) if err != nil { close(ret) return } match, matches = f.MatchR(slc) } else { slc, err := buf.Slice(0, frames.TotalLength(f)) if err != nil { close(ret) return } match, matches = f.Match(slc) } if match { var min int if !rev { min, _ = f.Length() } for _, off := range matches { ret <- fsmatch{i, int64(off - min), min} } } i++ } close(ret) }() return ret }
func (m *Matcher) Identify(na string, buf *siegreader.Buffer) (chan core.Result, error) { if *m > 0 { tt := buf.Text() if tt != characterize.DATA { res := make(chan core.Result, *m) for i := 1; i < int(*m)+1; i++ { res <- result{ idx: i, basis: "text match " + tt.String(), } } close(res) return res, nil } } res := make(chan core.Result) close(res) return res, nil }
func (m Matcher) Identify(n string, b siegreader.Buffer) (chan core.Result, error) { res := make(chan core.Result) // check trigger buf, err := b.Slice(0, 8) if err != nil { close(res) return res, nil } for _, c := range m { if c.trigger(buf) { rdr, err := c.rdr(b) if err != nil { close(res) return res, err } go c.identify(n, rdr, res) return res, nil } } // nothing ... move on close(res) return res, nil }
// Identify function - brings a new matcher into existence func (b *Matcher) identify(buf siegreader.Buffer, quit chan struct{}, r chan core.Result) { buf.SetQuit(quit) incoming := b.newScorer(buf, quit, r) rdr := siegreader.LimitReaderFrom(buf, b.maxBOF) // First test BOF frameset bfchan := b.bofFrames.index(buf, false, quit) for bf := range bfchan { if config.Debug() { fmt.Println(strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true, true}) } incoming <- strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true, true} } select { case <-quit: // the matcher has called quit close(incoming) return default: } // Do an initial check of BOF sequences b.start(true) // start bof matcher if not yet started var bchan chan wac.Result bchan = b.bAho.Index(rdr) for br := range bchan { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) if br.Offset > 2048 { break } } else { if config.Debug() { fmt.Println(strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false, br.Final}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false, br.Final} } } select { case <-quit: // the matcher has called quit for _ = range bchan { } // drain first close(incoming) return default: } // Check EOF frame tests efchan := b.eofFrames.index(buf, true, quit) for ef := range efchan { if config.Debug() { fmt.Println(strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true, true}) } incoming <- strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true, true} } // Setup EOF sequences test b.start(false) rrdr := siegreader.LimitReverseReaderFrom(buf, b.maxEOF) echan := b.eAho.Index(rrdr) // Now enter main search loop for { select { case br, ok := <-bchan: if !ok { bchan = nil } else { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) } else { if config.Debug() { 
fmt.Println(strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false, br.Final}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false, br.Final} } } case er, ok := <-echan: if !ok { echan = nil } else { if er.Index[0] == -1 { incoming <- progressStrike(er.Offset, true) } else { if config.Debug() { fmt.Println(strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false, er.Final}) } incoming <- strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false, er.Final} } } } if bchan == nil && echan == nil { close(incoming) return } } }
// identify function - brings a new matcher into existence func (b *Matcher) identify(buf *siegreader.Buffer, quit chan struct{}, r chan core.Result) { buf.Quit = quit incoming := b.scorer(buf, quit, r) rdr := siegreader.LimitReaderFrom(buf, b.maxBOF) // First test BOF frameset bfchan := b.bofFrames.index(buf, false, quit) for bf := range bfchan { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true}) } incoming <- strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true} } select { case <-quit: // the matcher has called quit for _ = range bfchan { } // drain first close(incoming) return default: } // Do an initial check of BOF sequences b.start(true) // start bof matcher if not yet started var bchan chan wac.Result bchan = b.bAho.Index(rdr) for br := range bchan { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) if br.Offset > 131072 && (b.maxBOF < 0 || b.maxBOF > b.maxEOF*5) { // del buf.Stream break } } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false} } } select { case <-quit: // the matcher has called quit for _ = range bchan { } // drain first close(incoming) return default: } // Setup EOF tests efchan := b.eofFrames.index(buf, true, quit) b.start(false) rrdr := siegreader.LimitReverseReaderFrom(buf, b.maxEOF) echan := b.eAho.Index(rrdr) // if we have a maximum value on EOF do a sequential search if b.maxEOF >= 0 { if b.maxEOF != 0 { _, _ = buf.CanSeek(0, true) // force a full read to enable EOF scan to proceed for streams } for ef := range efchan { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true}) } incoming <- strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, 
true, true} } // Scan complete EOF for er := range echan { if er.Index[0] == -1 { incoming <- progressStrike(er.Offset, true) } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false}) } incoming <- strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false} } } // send a final progress strike with the maximum EOF incoming <- progressStrike(int64(b.maxEOF), true) // Finally, finish BOF scan for br := range bchan { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false} } } close(incoming) return } // If no maximum on EOF do a parallel search for { select { case br, ok := <-bchan: if !ok { if b.maxBOF < 0 && b.maxEOF != 0 { _, _ = buf.CanSeek(0, true) // if we've a limit BOF reader, force a full read to enable EOF scan to proceed for streams } bchan = nil } else { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false} } } case ef, ok := <-efchan: if !ok { efchan = nil } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true}) } incoming <- strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true} } case er, ok := <-echan: if !ok { echan = nil } else { if er.Index[0] == -1 { incoming <- progressStrike(er.Offset, true) } else { if config.Debug() { fmt.Fprintln(config.Out(), strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], 
er.Offset, er.Length, true, false}) } incoming <- strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false} } } } if bchan == nil && efchan == nil && echan == nil { close(incoming) return } } }
// scorer starts the goroutine that evaluates strikes for one identification
// run and returns the channel on which the matchers must send those strikes.
//
// buf is the buffer being identified (used to fetch the bytes around a strike
// and to translate EOF offsets), q is the quit channel that the scorer closes
// once it decides scanning can stop, and r receives a result for every
// signature that is satisfied. r is closed after the returned channel has been
// closed and fully consumed. After the scorer has quit, further strikes are
// still received but ignored, so senders never block on it.
func (b *Matcher) scorer(buf *siegreader.Buffer, q chan struct{}, r chan<- core.Result) chan<- strike {
	incoming := make(chan strike)
	waitSet := b.priorities.WaitSet()
	// hits is keyed by signature index (the first element of a key frame id);
	// strikes is keyed by test index (idxa+idxb of the strike).
	hits := make(map[int]*hitItem)
	strikes := make(map[int]*strikeItem)
	// bof and eof hold the offsets carried by the latest progress strikes.
	var bof int64
	var eof int64
	var quitting bool
	// quit closes q and flips the scorer into drain mode.
	quit := func() {
		close(q)
		quitting = true
	}
	// newHit creates and registers an empty hitItem for signature i, with one
	// slot per key frame of that signature.
	newHit := func(i int) *hitItem {
		l := len(b.keyFrames[i])
		hit := &hitItem{
			potentialIdxs: make([]int, l),
			partials:      make([][][2]int64, l),
		}
		hits[i] = hit
		return hit
	}
	// given the current bof and eof, is there anything worth waiting for?
	continueWaiting := func(w []int) bool {
		var keepScanning bool
		// now for each of the possible signatures we are either waiting on or have partial/potential matches for, check whether there are live contenders
		for _, v := range w {
			kf := b.keyFrames[v]
			for i, f := range kf {
				// measure against the eof offset for key frames whose type is
				// greater than frames.PREV, otherwise against the bof offset
				off := bof
				if f.typ > frames.PREV {
					off = eof
				}
				var waitfor, excludable bool
				if f.key.pMax == -1 || f.key.pMax+int64(f.key.lMax) > off {
					waitfor = true
				} else if hit, ok := hits[v]; ok {
					if hit.partials[i] != nil {
						waitfor = true
					} else if hit.potentialIdxs[i] > 0 && strikes[hit.potentialIdxs[i]-1].hasPotential() {
						// potentialIdxs stores the strike key plus one, so zero means "no strike cached"
						waitfor, excludable = true, true
					}
				}
				// if we've got to the end of the signature, and have determined this is a live one - return immediately & continue scan
				if waitfor {
					if i == len(kf)-1 {
						if !config.Slow() || !config.Checkpoint(bof) {
							return true
						}
						// slow mode at a checkpoint: report what we are waiting on and keep checking the other signatures
						keepScanning = true
						fmt.Fprintf(config.Out(), "waiting on: %d, potentially excludable: %t\n", v, excludable)
					}
					continue
				}
				// this key frame is dead, so the whole signature is
				break
			}
		}
		return keepScanning
	}
	// testStrike runs the test tree for a strike and returns every key frame
	// hit that the strike satisfies, either directly (completes) or after
	// matching the bytes to its left and/or right (incompletes).
	testStrike := func(st strike) []kfHit {
		// the offsets we *record* are always BOF offsets - these can be interpreted as EOF offsets when necessary
		off := st.offset
		if st.reverse {
			off = buf.Size() - st.offset - int64(st.length)
		}
		// grab the relevant testTree
		t := b.tests[st.idxa+st.idxb]
		res := make([]kfHit, 0, 10)
		// immediately apply key frames for the completes
		for _, kf := range t.complete {
			if b.keyFrames[kf[0]][kf[1]].check(st.offset) && waitSet.Check(kf[0]) {
				res = append(res, kfHit{kf, off, st.length})
			}
		}
		// if there are no incompletes, we are done
		if len(t.incomplete) < 1 {
			return res
		}
		// see what incompletes are worth pursuing
		var checkl, checkr bool
		for _, v := range t.incomplete {
			if checkl && checkr {
				break
			}
			if b.keyFrames[v.kf[0]][v.kf[1]].check(st.offset) && waitSet.Check(v.kf[0]) {
				if v.l {
					checkl = true
				}
				if v.r {
					checkr = true
				}
			}
		}
		if !checkl && !checkr {
			return res
		}
		// calculate the offset and lengths for the left and right test slices
		var lslc, rslc []byte
		var lpos, rpos int64
		var llen, rlen int
		if st.reverse {
			lpos, llen = st.offset+int64(st.length), t.maxLeftDistance
			rpos, rlen = st.offset-int64(t.maxRightDistance), t.maxRightDistance
			// clamp the right slice so it does not start before position zero
			if rpos < 0 {
				rlen = rlen + int(rpos)
				rpos = 0
			}
		} else {
			lpos, llen = st.offset-int64(t.maxLeftDistance), t.maxLeftDistance
			rpos, rlen = st.offset+int64(st.length), t.maxRightDistance
			// clamp the left slice so it does not start before position zero
			if lpos < 0 {
				llen = llen + int(lpos)
				lpos = 0
			}
		}
		// the partials slice has a mirror entry for each of the testTree incompletes
		partials := make([]partial, len(t.incomplete))
		// test left (if there are valid left tests to try)
		if checkl {
			// slice errors are ignored here; whatever bytes come back are tested
			if st.reverse {
				lslc, _ = buf.EofSlice(lpos, llen)
			} else {
				lslc, _ = buf.Slice(lpos, llen)
			}
			left := matchTestNodes(t.left, lslc, true)
			for _, lp := range left {
				if partials[lp.followUp].l {
					partials[lp.followUp].ldistances = append(partials[lp.followUp].ldistances, lp.distances...)
				} else {
					partials[lp.followUp].l = true
					partials[lp.followUp].ldistances = lp.distances
				}
			}
		}
		// test right (if there are valid right tests to try)
		if checkr {
			// slice errors are ignored here; whatever bytes come back are tested
			if st.reverse {
				rslc, _ = buf.EofSlice(rpos, rlen)
			} else {
				rslc, _ = buf.Slice(rpos, rlen)
			}
			right := matchTestNodes(t.right, rslc, false)
			for _, rp := range right {
				if partials[rp.followUp].r {
					partials[rp.followUp].rdistances = append(partials[rp.followUp].rdistances, rp.distances...)
				} else {
					partials[rp.followUp].r = true
					partials[rp.followUp].rdistances = rp.distances
				}
			}
		}
		// now iterate through the partials, checking whether they fulfil any of the incompletes
		for i, p := range partials {
			if p.l == t.incomplete[i].l && p.r == t.incomplete[i].r {
				kf := t.incomplete[i].kf
				if b.keyFrames[kf[0]][kf[1]].check(st.offset) && waitSet.Check(kf[0]) {
					// a side that was not required contributes a single zero distance
					if !p.l {
						p.ldistances = []int{0}
					}
					if !p.r {
						p.rdistances = []int{0}
					}
					// when one hit suffices, record only the first left/right distance pair
					if oneEnough(kf[1], b.keyFrames[kf[0]]) {
						res = append(res, kfHit{kf, off - int64(p.ldistances[0]), p.ldistances[0] + st.length + p.rdistances[0]})
						continue
					}
					// otherwise record every left/right distance combination
					for _, ldistance := range p.ldistances {
						for _, rdistance := range p.rdistances {
							res = append(res, kfHit{kf, off - int64(ldistance), ldistance + st.length + rdistance})
						}
					}
				}
			}
		}
		return res
	}
	// applyKeyFrame records a key frame hit against its signature and reports
	// whether the signature is now fully matched, together with a
	// human-readable basis string for the match.
	applyKeyFrame := func(hit kfHit) (bool, string) {
		kfs := b.keyFrames[hit.id[0]]
		// a signature with a single key frame is satisfied by this hit alone
		if len(kfs) == 1 {
			return true, fmt.Sprintf("byte match at %d, %d", hit.offset, hit.length)
		}
		h, ok := hits[hit.id[0]]
		if !ok {
			h = newHit(hit.id[0])
		}
		// store the hit as an (offset, length) pair under its key frame
		if h.partials[hit.id[1]] == nil {
			h.partials[hit.id[1]] = [][2]int64{[2]int64{hit.offset, int64(hit.length)}}
		} else {
			h.partials[hit.id[1]] = append(h.partials[hit.id[1]], [2]int64{hit.offset, int64(hit.length)})
		}
		// every key frame needs at least one hit before the signature can match
		for _, p := range h.partials {
			if p == nil {
				return false, ""
			}
		}
		// walk the key frames in order, checking each against its predecessor
		// (and successor, where there is one) via checkRelated
		prevOff := h.partials[0]
		basis := make([][][2]int64, len(kfs))
		basis[0] = prevOff
		prevKf := kfs[0]
		ok = false
		for i, kf := range kfs[1:] {
			// i indexes kfs[1:], so kf is kfs[i+1] and its successor is kfs[i+2]
			var nextKf keyFrame
			if i+2 < len(kfs) {
				nextKf = kfs[i+2]
			}
			thisOff := h.partials[i+1]
			prevOff, ok = kf.checkRelated(prevKf, nextKf, thisOff, prevOff)
			if !ok {
				return false, ""
			}
			basis[i+1] = prevOff
			prevKf = kf
		}
		return true, fmt.Sprintf("byte match at %v", basis)
	}
	go func() {
		for in := range incoming {
			// if we've got a positive result, drain any remaining strikes from the matchers
			if quitting {
				continue
			}
			// if the strike reports progress, check if we should be continuing to wait
			if in.idxa == -1 {
				// update with the latest offset
				if in.reverse {
					eof = in.offset
				} else {
					bof = in.offset
				}
				w := waitSet.WaitingOn()
				// if any of the waitlists are nil, we will continue - unless we are past the known bof and known eof (points at which we *should* have got at least partial matches), in which case we will check if any partial/potential matches are live
				if w == nil {
					// keep going if we don't have a maximum known bof, or if our current bof/eof are less than the maximum known bof/eof
					if b.knownBOF < 0 || int64(b.knownBOF) > bof || int64(b.knownEOF) > eof {
						continue
					}
					// if we don't have a waitlist, and we are past the known bof and known eof, grab all the partials and potentials to check if any are live
					w = all(hits)
				}
				// exhausted all contenders, we can stop scanning
				if !continueWaiting(w) {
					quit()
				}
				continue
			}
			// now cache or satisfy the strike
			var hasPotential bool
			potentials := filterKF(b.tests[in.idxa+in.idxb].keyFrames(), waitSet)
			for _, pot := range potentials {
				// if any of the signatures are single keyframe we can satisfy immediately and skip cache
				if len(b.keyFrames[pot[0]]) == 1 {
					hasPotential = true
					break
				}
				if hit, ok := hits[pot[0]]; ok && hit.potentiallyComplete(pot[1], strikes) {
					hasPotential = true
					break
				}
			}
			if !hasPotential {
				// cache the strike
				s, ok := strikes[in.idxa+in.idxb]
				if !ok {
					s = &strikeItem{in, -1, nil}
					strikes[in.idxa+in.idxb] = s
				} else {
					// a repeat strike for the same test: keep only its offset and length
					if s.successive == nil {
						s.successive = make([][2]int64, 0, 10)
					}
					s.successive = append(s.successive, [2]int64{in.offset, int64(in.length)})
				}
				// range over the potentials, linking to the strike
				for _, pot := range potentials {
					if b.keyFrames[pot[0]][pot[1]].check(in.offset) {
						hit, ok := hits[pot[0]]
						if !ok {
							hit = newHit(pot[0])
						}
						// stored as strike key + 1 so that zero means "no strike"
						hit.potentialIdxs[pot[1]] = in.idxa + in.idxb + 1
					}
				}
				goto end
			}
			// satisfy the strike
			for {
				ks := testStrike(in)
				for _, k := range ks {
					if match, basis := applyKeyFrame(k); match {
						if waitSet.Check(k.id[0]) {
							r <- result{k.id[0], basis}
							// Put returning true is treated as "nothing left to wait for": stop scanning
							if waitSet.Put(k.id[0]) {
								quit()
								goto end
							}
						}
						if h, ok := hits[k.id[0]]; ok {
							h.matched = true
						}
					}
				}
				// the wait set may have changed, so re-filter before pulling a cached strike to test next
				potentials = filterKF(potentials, waitSet)
				var ok bool
				for _, pot := range potentials {
					in, ok = hits[pot[0]].nextPotential(strikes)
					if ok {
						break
					}
				}
				if !ok {
					break
				}
			}
		end:
			// keep looping until incoming is closed
		}
		close(r)
	}()
	return incoming
}
func zipRdr(b *siegreader.Buffer) (Reader, error) { r, err := zip.NewReader(siegreader.ReaderFrom(b), b.SizeNow()) return &zipReader{idx: -1, rdr: r}, err }
// Identify function - brings a new matcher into existence func (b *Matcher) identify(buf siegreader.Buffer, quit chan struct{}, r chan core.Result) { buf.SetQuit(quit) incoming := b.scorer(buf, quit, r) rdr := siegreader.LimitReaderFrom(buf, b.maxBOF) // First test BOF frameset bfchan := b.bofFrames.index(buf, false, quit) for bf := range bfchan { if config.Debug() { fmt.Println(strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true}) } incoming <- strike{b.bofFrames.testTreeIndex[bf.idx], 0, bf.off, bf.length, false, true} } select { case <-quit: // the matcher has called quit for _ = range bfchan { } // drain first close(incoming) return default: } // Do an initial check of BOF sequences b.start(true) // start bof matcher if not yet started var bchan chan wac.Result bchan = b.bAho.Index(rdr) for br := range bchan { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) if !buf.Stream() && br.Offset > 131072 && (b.maxBOF < 0 || b.maxBOF > b.maxEOF*5) { break } } else { if config.Debug() { fmt.Println(strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false} } } select { case <-quit: // the matcher has called quit for _ = range bchan { } // drain first close(incoming) return default: } // Setup EOF tests efchan := b.eofFrames.index(buf, true, quit) b.start(false) rrdr := siegreader.LimitReverseReaderFrom(buf, b.maxEOF) echan := b.eAho.Index(rrdr) // if we have a maximum value on EOF do a sequential search if b.maxEOF >= 0 { for ef := range efchan { if config.Debug() { fmt.Println(strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true}) } incoming <- strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true} } // Scan complete EOF for er := range echan { if er.Index[0] == -1 { incoming <- progressStrike(er.Offset, true) } else { if config.Debug() { 
fmt.Println(strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false}) } incoming <- strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false} } } // let the scorer known we have reached the end of the EOF scan incoming <- progressStrike(-1, true) // Finally, finish BOF scan for br := range bchan { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) } else { if config.Debug() { fmt.Println(strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false} } } close(incoming) return } // If no maximum on EOF do a parallel search for { select { case br, ok := <-bchan: if !ok { bchan = nil } else { if br.Index[0] == -1 { incoming <- progressStrike(br.Offset, false) } else { if config.Debug() { fmt.Println(strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false}) } incoming <- strike{b.bofSeq.testTreeIndex[br.Index[0]], br.Index[1], br.Offset, br.Length, false, false} } } case ef, ok := <-efchan: if !ok { efchan = nil } else { if config.Debug() { fmt.Println(strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true}) } incoming <- strike{b.eofFrames.testTreeIndex[ef.idx], 0, ef.off, ef.length, true, true} } case er, ok := <-echan: if !ok { echan = nil } else { if er.Index[0] == -1 { incoming <- progressStrike(er.Offset, true) } else { if config.Debug() { fmt.Println(strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false}) } incoming <- strike{b.eofSeq.testTreeIndex[er.Index[0]], er.Index[1], er.Offset, er.Length, true, false} } } } if bchan == nil && efchan == nil && echan == nil { close(incoming) return } } }