// Searches the given Nucleotide-Sequence file-path for the given 'searchString' using sophisticated compression func search(fileName string, sequenceToSearchFor string) (bool, []int) { var isPresent bool = false var master []byte var offsets []int lines, err := readLines(fileName) if err != nil { log.Fatalf("readLines: %s", err) } else { for _, line := range lines { master = *(compress(&line)) } } // dat, _ := ioutil.ReadFile("./encoded.txt") index := suffixarray.New(master) searchString := sequenceToSearchFor searchBytes := *(compress(&searchString)) //https://code.google.com/p/go/source/browse/src/pkg/index/suffixarray/suffixarray.go?name=release#190 //Gets exactly the first match alone // offsets = index.Lookup([]byte(searchBytes), -1) //https://code.google.com/p/go/source/browse/src/pkg/index/suffixarray/suffixarray.go?name=release#174 //Gets all matches offsets = index.Lookup([]byte(searchBytes), 1) if len(offsets) == 1 { isPresent = true } return isPresent, offsets }
// main indexes a short space-delimited word list with a suffix array and
// prints every offset at which "sphere" occurs.
func main() {
	// Space-delimited vocabulary to be indexed.
	const words = "a apple sphere atom atmosphere"

	// The suffix array is built over the raw bytes of the text.
	sa := suffixarray.New([]byte(words))

	// Lookup runs in O(log(N)*len(s) + len(result)) time, where N is the size
	// of the indexed data, s the substring being searched for and result the
	// returned offsets. The needle is handed over as bytes; for example
	// []byte("apple") is [97 112 112 108 101]. Passing -1 asks for every
	// occurrence.
	needle := []byte("sphere")
	hits := sa.Lookup(needle, -1)

	// The offsets are printed as returned, i.e. unsorted.
	fmt.Println(hits)
}
// _indexSuffixArray concatenates a fixed list of planet names, indexes the
// result with a suffix array and returns a sentence naming the documents (by
// position in the list) that contain "earth".
func _indexSuffixArray() string {
	planets := []string{
		"mercury",
		"venus",
		"earth",
		"mars",
		"jupiter",
		"saturn",
		"uranus",
		"pluto",
	}

	// ends[k] is the exclusive end position of planets[k] inside corpus.
	var corpus []byte
	var ends []int
	for k := range planets {
		corpus = append(corpus, planets[k]...)
		ends = append(ends, len(corpus))
	}

	const query = "earth"
	hits := suffixarray.New(corpus).Lookup([]byte(query), -1)

	var results []int
	for _, pos := range hits {
		// First document whose end lies strictly beyond the match start.
		doc := sort.SearchInts(ends, pos+1)
		// Drop matches that straddle a document boundary.
		if pos+len(query) > ends[doc] {
			continue
		}
		results = append(results, doc)
	}
	return fmt.Sprintf("%q is in documents %v\n", query, results)
}
// NewTruncIndex returns an empty TruncIndex. Its id set is empty, and both its
// backing text and its suffix array start out holding a single space.
func NewTruncIndex() *TruncIndex {
	idx := new(TruncIndex)
	idx.ids = make(map[string]bool)
	idx.bytes = []byte{' '}
	// The suffix array gets its own one-byte slice rather than sharing idx.bytes.
	idx.index = suffixarray.New([]byte{' '})
	return idx
}
// main repeatedly builds a suffix array over random data from several
// goroutines and performs random lookups against it, discarding the results.
func main() {
	const (
		N = 6e5 // size of the indexed data, in bytes
		M = 100 // length of each random needle
		I = 20  // suffix arrays built per goroutine
		J = 10  // lookups per suffix array
		P = 2   // number of goroutines
	)

	data := make([]byte, N)
	for i := range data {
		data[i] = byte(rand.Intn(255))
	}

	done := make(chan bool, P)
	worker := func() {
		for i := 0; i < I; i++ {
			suffix := suffixarray.New(data)
			for j := 0; j < J; j++ {
				str := make([]byte, M)
				for m := range str {
					str[m] = byte(rand.Intn(255))
				}
				_ = suffix.Lookup(str, 10)
			}
		}
		done <- true
	}

	for p := 0; p < P; p++ {
		go worker()
	}
	// Wait for every worker to report completion.
	for p := 0; p < P; p++ {
		<-done
	}
}
func (idx *TruncIndex) Add(id string) error { idx.Lock() defer idx.Unlock() if err := idx.addId(id); err != nil { return err } idx.index = suffixarray.New(idx.bytes) return nil }
func BenchmarkBuildSuffixArray(b *testing.B) { words := getDictWords() b.ResetTimer() for i := 0; i < b.N; i++ { data := []byte("\x00" + strings.Join(words, "\x00") + "\x00") _ = suffixarray.New(data) } }
func (waf *WAF) GoSufArray_UserAgentIsClean(UA []byte) bool { index := suffixarray.New(UA) for _, robot := range waf.bad_robots { if len(index.Lookup(robot, 1)) > 0 { return false } } return true }
// check_url reports whether str begins with substr.
//
// The raw bytes of both strings are compared. Quoting them first (as with
// strconv.AppendQuoteToASCII) appends a closing quote to the pattern, which
// can then only match when the two strings are identical.
//
// An empty substr matches only an empty str, so that an empty pattern does not
// accept every URL.
func check_url(substr, str string) bool {
	if substr == "" {
		return str == ""
	}

	index := suffixarray.New([]byte(str))
	// Lookup returns offsets in no particular order, so every offset must be
	// inspected rather than only the first one.
	for _, off := range index.Lookup([]byte(substr), -1) {
		if off == 0 {
			return true
		}
	}
	return false
}
func benchFast(line string, substr string) time.Duration { index := fs.New([]byte(line)) start := time.Now() for i := 0; i < LOOPS; i++ { index.Lookup([]byte(substr), 1) } end := time.Now() delta := end.Sub(start) fmt.Printf("%10s: %20s\t%16s %10s\n", "fast", substr, delta, delta/LOOPS) return delta }
func NewTruncIndex(ids []string) (idx *TruncIndex) { idx = &TruncIndex{ ids: make(map[string]bool), bytes: []byte{' '}, } for _, id := range ids { idx.ids[id] = true idx.bytes = append(idx.bytes, []byte(id+" ")...) } idx.index = suffixarray.New(idx.bytes) return }
func (s *Searcher) indexFile(path string, info os.FileInfo, err error) error { // only index 1/4 of the files per server if int(path[len(path)-1])%4 != s.id { return nil } if info.Mode().IsRegular() && info.Size() < (1<<20) { name := strings.TrimPrefix(path, s.base_path) data, _ := ioutil.ReadFile(path) s.files[name] = suffixarray.New(data) } return nil }
func (idx *TruncIndex) Add(id string) error { if strings.Contains(id, " ") { return fmt.Errorf("Illegal character: ' '") } if _, exists := idx.ids[id]; exists { return fmt.Errorf("Id already exists: %s", id) } idx.ids[id] = true idx.bytes = append(idx.bytes, []byte(id+" ")...) idx.index = suffixarray.New(idx.bytes) return nil }
func BenchmarkLookupXX(b *testing.B) { words := getDictWords() data := []byte("\x00" + strings.Join(words, "\x00") + "\x00") sa := suffixarray.New(data) b.ResetTimer() for i := 0; i < b.N; i++ { indices := sa.Lookup([]byte(XXwordToTry), 1) if len(indices) > 0 { _ = getStringFromIndex(data, indices[0]) } } }
func (idx *TruncIndex) Delete(id string) error { if _, exists := idx.ids[id]; !exists { return fmt.Errorf("No such id: %s", id) } before, after, err := idx.lookup(id) if err != nil { return err } delete(idx.ids, id) idx.bytes = append(idx.bytes[:before], idx.bytes[after:]...) idx.index = suffixarray.New(idx.bytes) return nil }
// Index builds a new suffixarray for the package names previously registered // with this instance. func (idx *ManualIndex) Index() error { var buf bytes.Buffer for pkg := range idx.packages { _, err := fmt.Fprintf(&buf, "\x00%s", pkg) if err != nil { return err } } idx.index = suffixarray.New(buf.Bytes()) return nil }
func (idx *SuffixIndex) sort() { if idx.index == nil { keys := make([][]byte, len(idx.tiles)) i := 0 for k := range idx.tiles { keys[i] = []byte(k) i++ } d := []byte{zero} b := bytes.Join(keys, d) //join w/ zeros idx.indexed = bytes.Join([][]byte{d, d}, b) //pad w/ zeros idx.index = suffixarray.New(idx.indexed) } }
func TestSuffixArrayFind(t *testing.T) { words := getDictWords() data := []byte("\x00" + strings.Join(words, "\x00") + "\x00") sa := suffixarray.New(data) buf := &bytes.Buffer{} sa.Write(buf) fmt.Println("size:", buf.Len()) indices := sa.Lookup([]byte("yrate"), 1) if indices == nil || len(indices) < 1 { t.Fatal("not found") } }
func test(data, what []byte) { s := suffixarray.New(data) idx0 := s.Lookup(what, -1) idx1 := simple(data, what) if len(idx0) != len(idx1) { panic(fmt.Sprintf("len mismatch: %+v, %+v", idx0, idx1)) } sort.Ints(idx0) for i, x := range idx0 { if x != idx1[i] { panic(fmt.Sprintf("data mismatch: %+v, %+v", idx0, idx1)) } } }
// main reads the file named by the first command-line argument, indexes it
// with a suffix array and logs up to 100 offsets at which "*" occurs, together
// with the byte found at each offset.
//
// It exits with an error message when no file is given or the file cannot be
// read.
func main() {
	if len(os.Args) < 2 {
		log.Fatalf("usage: %s FILE", os.Args[0])
	}

	sometext, err := ioutil.ReadFile(os.Args[1])
	if err != nil {
		// Continuing would only index an empty slice.
		log.Fatal(err)
	}

	index := suffixarray.New(sometext)
	// Ask for at most 100 offsets; slicing the full result with [0:100] would
	// panic whenever there are fewer matches.
	offsets := index.Lookup([]byte("*"), 100)
	for _, value := range offsets {
		log.Println(value, string(sometext[value]))
	}
}
// Takes the known dictionary listing and creates a suffix array // model for these terms. If a model already existed, it is discarded func (model *Model) updateSuffixArr() { if !model.UseAutocomplete { return } model.RLock() termArr := make([]string, 0, 1000) for term, count := range model.Data { if count.Corpus > model.Threshold || count.Query > 0 { // TODO: query threshold? termArr = append(termArr, term) } } model.SuffixArrConcat = "\x00" + strings.Join(termArr, "\x00") + "\x00" model.SuffixArr = suffixarray.New([]byte(model.SuffixArrConcat)) model.SuffDivergence = 0 model.RUnlock() }
// The full-text search on golang.org is implemented on top of a suffix array
// [http://t.cn/hBJekg]; presumably because it worked well, the suffix array
// was then added to Go's standard library [http://t.cn/hBJekd].
// http://blog.csdn.net/fxsjy/article/details/6297523
//
// main prints a greeting, indexes a paragraph of text and, for every
// occurrence of "hyd", prints the text from that occurrence to the end.
func main() {
	fmt.Println("Hello, 世界")

	str := `The Go programming language is an open source project to make programmers more productive. Go is expressive, concise, clean, and efficient. Its concurrency mechanisms make it easy to write programs that get the most out of multicore and networked machines, hyd, while its novel type system enables flexible and modular program construction. Go compiles quickly to machine code yet has the convenience of garbage collection and the power of run-time reflection. It's a fast, statically typed, compiled language that feels like a dynamically typed, interpreted language.`

	sa := suffixarray.New([]byte(str))
	needle := []byte("hyd")
	for _, start := range sa.Lookup(needle, -1) {
		fmt.Println(str[start:])
	}
}
// test_sa demonstrates suffixarray.Lookup on a repeated phrase: it looks up
// the two bytes at data[2:4], once without a limit and once limited to three
// results, prints both offset lists, and then prints the two bytes found at
// each unlimited offset.
func test_sa() {
	data := []byte("i am a test i am a test i am a test i am a test i am a test")
	needle := data[2:4]

	// Index the data once; both lookups share it.
	sa := suffixarray.New(data)
	all := sa.Lookup(needle, -1)      // every offset where needle occurs
	firstThree := sa.Lookup(needle, 3) // at most three such offsets

	fmt.Println("test_sa")
	fmt.Println(string(needle))
	fmt.Println(all)
	fmt.Println(firstThree)

	for _, at := range all {
		fmt.Println(string(data[at : at+2]))
	}
}
// main times 100,000,000 calls to a token iterator over the contents of
// "large.in" and logs the elapsed time in nanoseconds.
//
// The iterator yields the whitespace-delimited tokens of the file in order
// and returns "" once they are exhausted. Token positions are computed up
// front with a suffix array and FindAllIndex, outside the timed loop.
func main() {
	raw, err := ioutil.ReadFile("large.in")
	if err != nil {
		// Without the input the loop below would only time the empty case.
		log.Fatal(err)
	}

	next := func(str string) func() string {
		// The pattern is a constant, so MustCompile cannot fail at run time.
		reg := regexp.MustCompile(`\S+`)
		is := suffixarray.New([]byte(str)).FindAllIndex(reg, -1)
		return func() (result string) {
			if len(is) < 1 {
				return ""
			}
			result = str[is[0][0]:is[0][1]]
			is = is[1:]
			return result
		}
	}(string(raw))

	t := time.Now().UnixNano()
	for i := 0; i < 100000000; i++ {
		next()
	}
	log.Println(time.Now().UnixNano() - t)
}
// main logs each whitespace-delimited token of a short sample string, in
// order. Token positions are found with a suffix array and FindAllIndex.
func main() {
	str := `1234 2 3333 4 5 aaaaa s aaa `

	reg, _ := regexp.Compile("\\S+")
	spans := suffixarray.New([]byte(str)).FindAllIndex(reg, -1)

	// next pops the first remaining span and returns the token it covers, or
	// "" when no spans are left.
	next := func() string {
		if len(spans) == 0 {
			return ""
		}
		lo, hi := spans[0][0], spans[0][1]
		spans = spans[1:]
		return str[lo:hi]
	}

	for {
		x := next()
		if x == "" {
			break
		}
		log.Println(x)
	}
}
// Convert ... func Convert(path string) error { file, err := os.Open(path) if err != nil { return err } defer file.Close() d := createDictReader(createTokenReader(file)) var buffer bytes.Buffer _, _ = buffer.WriteRune('@') for !d.isDone() { g, err := d.nextGroup() if err != nil { return err } for _, w := range g.words { _, _ = buffer.WriteString(w.name) _, _ = buffer.WriteRune('@') } } sa := suffixarray.New(buffer.Bytes()[:]) flags := os.O_CREATE | os.O_WRONLY dictFile, err := os.OpenFile("morph.dict", flags, 0666) if err != nil { return err } defer dictFile.Close() err = sa.Write(dictFile) if err != nil { return err } return nil }
func main() { words := []string{ "banana", "apple", "pear", "tangerine", "orange", "lemon", "peach", "persimmon", } // Combine all words into a single byte slice, separated by \x00 bytes (which // do not appear in words), adding one on each end too. data := []byte("\x00" + strings.Join(words, "\x00") + "\x00") sa := suffixarray.New(data) indices := sa.Lookup([]byte("an"), -1) if len(indices) > 0 { fmt.Println("Lookup returns:", indices) } else { fmt.Println("Lookup: not found") } // Reconstruct matches from indices found by Lookup. for _, idx := range indices { fmt.Println(getStringFromIndex(data, idx)) } // Here using a completely "literal" regexp, similar to the usage of Lookup, // to compare what the two methods return. FindAllIndex can take an arbitrary // regexp - but beware of the caveat discussed in the blog post. r := regexp.MustCompile("an") matches := sa.FindAllIndex(r, -1) fmt.Println("FindAllIndex returns:", matches) }
// NewIndex creates a new index for the .go files // in the directories given by dirnames. // func NewIndex(dirnames <-chan string, fulltextIndex bool, throttle float64) *Index { var x Indexer th := NewThrottle(throttle, 100*time.Millisecond) // run at least 0.1s at a time // initialize Indexer // (use some reasonably sized maps to start) x.fset = token.NewFileSet() x.packages = make(map[string]*Pak, 256) x.words = make(map[string]*IndexResult, 8192) // index all files in the directories given by dirnames for dirname := range dirnames { list, err := fs.ReadDir(dirname) if err != nil { continue // ignore this directory } for _, f := range list { if !f.IsDir() { x.visitFile(dirname, f, fulltextIndex) } th.Throttle() } } if !fulltextIndex { // the file set, the current file, and the sources are // not needed after indexing if no text index is built - // help GC and clear them x.fset = nil x.sources.Reset() x.current = nil // contains reference to fset! } // for each word, reduce the RunLists into a LookupResult; // also collect the word with its canonical spelling in a // word list for later computation of alternative spellings words := make(map[string]*LookupResult) var wlist RunList for w, h := range x.words { decls := reduce(h.Decls) others := reduce(h.Others) words[w] = &LookupResult{ Decls: decls, Others: others, } wlist = append(wlist, &wordPair{canonical(w), w}) th.Throttle() } x.stats.Words = len(words) // reduce the word list {canonical(w), w} into // a list of AltWords runs {canonical(w), {w}} alist := wlist.reduce(lessWordPair, newAltWords) // convert alist into a map of alternative spellings alts := make(map[string]*AltWords) for i := 0; i < len(alist); i++ { a := alist[i].(*AltWords) alts[a.Canon] = a } // create text index var suffixes *suffixarray.Index if fulltextIndex { suffixes = suffixarray.New(x.sources.Bytes()) } return &Index{x.fset, suffixes, words, alts, x.snippets, x.stats} }
// NewIndex creates a new index for the .go files provided by the corpus. func (c *Corpus) NewIndex() *Index { // initialize Indexer // (use some reasonably sized maps to start) x := &Indexer{ c: c, fset: token.NewFileSet(), fsOpenGate: make(chan bool, maxOpenFiles), strings: make(map[string]string), packages: make(map[Pak]*Pak, 256), words: make(map[string]*IndexResult, 8192), throttle: util.NewThrottle(c.throttle(), 100*time.Millisecond), // run at least 0.1s at a time importCount: make(map[string]int), packagePath: make(map[string]map[string]bool), exports: make(map[string]map[string]SpotKind), idents: make(map[SpotKind]map[string][]Ident, 4), } // index all files in the directories given by dirnames var wg sync.WaitGroup // outstanding ReadDir + visitFile dirGate := make(chan bool, maxOpenDirs) for dirname := range c.fsDirnames() { if c.IndexDirectory != nil && !c.IndexDirectory(dirname) { continue } dirGate <- true wg.Add(1) go func(dirname string) { defer func() { <-dirGate }() defer wg.Done() list, err := c.fs.ReadDir(dirname) if err != nil { log.Printf("ReadDir(%q): %v; skipping directory", dirname, err) return // ignore this directory } for _, fi := range list { wg.Add(1) go func(fi os.FileInfo) { defer wg.Done() x.visitFile(dirname, fi) }(fi) } }(dirname) } wg.Wait() if !c.IndexFullText { // the file set, the current file, and the sources are // not needed after indexing if no text index is built - // help GC and clear them x.fset = nil x.sources.Reset() x.current = nil // contains reference to fset! 
} // for each word, reduce the RunLists into a LookupResult; // also collect the word with its canonical spelling in a // word list for later computation of alternative spellings words := make(map[string]*LookupResult) var wlist RunList for w, h := range x.words { decls := reduce(h.Decls) others := reduce(h.Others) words[w] = &LookupResult{ Decls: decls, Others: others, } wlist = append(wlist, &wordPair{canonical(w), w}) x.throttle.Throttle() } x.stats.Words = len(words) // reduce the word list {canonical(w), w} into // a list of AltWords runs {canonical(w), {w}} alist := wlist.reduce(lessWordPair, newAltWords) // convert alist into a map of alternative spellings alts := make(map[string]*AltWords) for i := 0; i < len(alist); i++ { a := alist[i].(*AltWords) alts[a.Canon] = a } // create text index var suffixes *suffixarray.Index if c.IndexFullText { suffixes = suffixarray.New(x.sources.Bytes()) } for _, idMap := range x.idents { for _, ir := range idMap { sort.Sort(byPackage(ir)) } } return &Index{ fset: x.fset, suffixes: suffixes, words: words, alts: alts, snippets: x.snippets, stats: x.stats, importCount: x.importCount, packagePath: x.packagePath, exports: x.exports, idents: x.idents, opts: indexOptions{ Docs: x.c.IndexDocs, GoCode: x.c.IndexGoCode, FullText: x.c.IndexFullText, MaxResults: x.c.MaxResults, }, } }
// NewIndex creates a new index for the .go files // in the directories given by dirnames. // func NewIndex(dirnames <-chan string, fulltextIndex bool) *Index { var x Indexer // initialize Indexer x.fset = token.NewFileSet() x.words = make(map[string]*IndexResult) // index all files in the directories given by dirnames for dirname := range dirnames { list, err := ioutil.ReadDir(dirname) if err != nil { continue // ignore this directory } for _, f := range list { if !f.IsDirectory() { x.visitFile(dirname, f, fulltextIndex) } } } if !fulltextIndex { // the file set, the current file, and the sources are // not needed after indexing if no text index is built - // help GC and clear them x.fset = nil x.sources.Reset() x.current = nil // contains reference to fset! } // for each word, reduce the RunLists into a LookupResult; // also collect the word with its canonical spelling in a // word list for later computation of alternative spellings words := make(map[string]*LookupResult) var wlist RunList for w, h := range x.words { decls := reduce(&h.Decls) others := reduce(&h.Others) words[w] = &LookupResult{ Decls: decls, Others: others, } wlist.Push(&wordPair{canonical(w), w}) } x.stats.Words = len(words) // reduce the word list {canonical(w), w} into // a list of AltWords runs {canonical(w), {w}} alist := wlist.reduce(lessWordPair, newAltWords) // convert alist into a map of alternative spellings alts := make(map[string]*AltWords) for i := 0; i < alist.Len(); i++ { a := alist.At(i).(*AltWords) alts[a.Canon] = a } // convert snippet vector into a list snippets := make([]*Snippet, x.snippets.Len()) for i := 0; i < x.snippets.Len(); i++ { snippets[i] = x.snippets.At(i).(*Snippet) } // create text index var suffixes *suffixarray.Index if fulltextIndex { suffixes = suffixarray.New(x.sources.Bytes()) } return &Index{x.fset, suffixes, words, alts, snippets, x.stats} }