// Create a new index. func (col *Col) Index(indexPath []string) error { jointPath := strings.Join(indexPath, INDEX_PATH_SEP) // Check whether the index already exists if _, alreadyExists := col.SecIndexes[jointPath]; alreadyExists { return errors.New(fmt.Sprintf("Path %v is already indexed in collection %s", indexPath, col.BaseDir)) } // Make the new index indexBaseDir := path.Join(col.BaseDir, HASHTABLE_DIRNAME_MAGIC+jointPath) col.openIndex(indexPath, indexBaseDir) // Put all documents on the new index newIndex := col.SecIndexes[jointPath] col.ForAll(func(id uint64, doc map[string]interface{}) bool { for _, toBeIndexed := range GetIn(doc, indexPath) { if toBeIndexed != nil { // Figure out where to put it hash := chunk.StrHash(toBeIndexed) dest := newIndex[hash%col.NumChunksI64] lock := dest.Mutex lock.Lock() dest.Put(hash, id) lock.Unlock() } } return true }) return nil }
// Execute value equity check ("attribute == value") using hash lookup or collection scan. func Lookup(lookupValue interface{}, expr map[string]interface{}, src *Col, result *map[uint64]struct{}) (err error) { // Figure out lookup path - JSON array "in" path, hasPath := expr["in"] if !hasPath { return errors.New("Missing lookup path `in`") } vecPath := make([]string, 0) if vecPathInterface, ok := path.([]interface{}); ok { for _, v := range vecPathInterface { vecPath = append(vecPath, fmt.Sprint(v)) } } else { return errors.New(fmt.Sprintf("Expecting vector lookup path `in`, but %v given", path)) } // Figure out result number limit intLimit := uint64(0) if limit, hasLimit := expr["limit"]; hasLimit { if floatLimit, ok := limit.(float64); ok { intLimit = uint64(floatLimit) } else { return errors.New(fmt.Sprintf("Expecting `limit` as a number, but %v given", limit)) } } lookupStrValue := fmt.Sprint(lookupValue) // the value to match lookupValueHash := chunk.StrHash(lookupStrValue) scanPath := strings.Join(vecPath, INDEX_PATH_SEP) // Is it PK index? if path == uid.PK_NAME { // Convert lookup string value (which is the Persistent ID) to integer and put it into result strint, err := strconv.ParseUint(lookupStrValue, 10, 64) if err != nil { return err } (*result)[strint] = struct{}{} return nil } // It might be a secondary index if secIndex, ok := src.SecIndexes[scanPath]; ok { num := lookupValueHash % src.NumChunksI64 ht := secIndex[num] ht.Mutex.RLock() _, vals := ht.Get(lookupValueHash, intLimit) ht.Mutex.RUnlock() for _, v := range vals { (*result)[v] = struct{}{} } return } // Neither PK or secondary index... return errors.New(fmt.Sprintf("Please index %v and retry query %v", scanPath, expr)) }
// Remove the document from all secondary indexes. func (col *Col) unindexDoc(id uint64, doc interface{}) { for _, index := range col.SecIndexes { for _, toBeIndexed := range GetIn(doc, index[0].Path) { if toBeIndexed != nil { // Figure out where it was put hashKey := chunk.StrHash(toBeIndexed) num := hashKey % col.NumChunksI64 ht := index[num] ht.Mutex.Lock() index[num].Remove(hashKey, id) ht.Mutex.Unlock() } } } }
func SecIndexContainsAll(path string, col *Col, expectedKV map[uint64][]uint64) bool { // expectedKV is a mapping between expected Hash Value VS PK values for k, ids := range expectedKV { fmt.Printf("Looking for key %v, id %v\n", k, ids) keys, vals := col.HashScan(path, k, 0) if len(keys) == 0 || len(vals) == 0 { fmt.Printf("Hash table does not have the key\n") return false } if len(vals) != len(ids) { fmt.Printf("Number not matched: %v %v\n", vals, ids) return false } for _, id := range ids { fmt.Printf("Checking for ID %s match among physical IDs %v\n", id, vals) var doc interface{} _, err := col.Read(id, &doc) if err != nil { fmt.Printf("ID given by function parameter does not exist %s\n", id) panic(err) } match := false for _, v := range vals { if uint64(id) == v { match = true break } } if !match { fmt.Printf("Hash table value does not match with ID hash %v %v\n", chunk.StrHash(id), vals[0]) return false } } } return true }
func TestIndex(t *testing.T) { fmt.Println("Running index test") tmp := "/tmp/tiedot_col_test" os.RemoveAll(tmp) defer os.RemoveAll(tmp) col, err := OpenCol(tmp, 4) if err != nil { t.Fatalf("Failed to open: %v", err) return } docs := []string{ `{"a": {"b": {"c": 1}}, "d": 0}`, `{"a": {"b": [{"c": 2}]}, "d": 0}`, `{"a": [{"b": {"c": 3}}], "d": 0}`, `{"a": [{"b": {"c": [4]}}, {"b": {"c": [5, 6]}}], "d": [0, 9]}`, `{"a": {"b": {"c": null}}, "d": null}`} var jsonDoc [4]map[string]interface{} json.Unmarshal([]byte(docs[0]), &jsonDoc[0]) json.Unmarshal([]byte(docs[1]), &jsonDoc[1]) json.Unmarshal([]byte(docs[2]), &jsonDoc[2]) json.Unmarshal([]byte(docs[3]), &jsonDoc[3]) var ids [4]uint64 // Insert a document, create two indexes and verify them ids[0], _ = col.Insert(jsonDoc[0]) col.Index([]string{"a", "b", "c"}) col.Index([]string{"d"}) if !SecIndexContainsAll("a,b,c", col, map[uint64][]uint64{chunk.StrHash("1"): []uint64{ids[0]}}) { t.Fatal() } if !SecIndexContainsAll("a,b,c", col, map[uint64][]uint64{chunk.StrHash("1"): []uint64{ids[0]}}) { t.Fatal() } // Do the following: // 1. Insert second and third document // 2. Replace the third document by the fourth document // 3. Remove the second document ids[1], _ = col.Insert(jsonDoc[1]) ids[2], _ = col.Insert(jsonDoc[2]) col.Update(ids[2], jsonDoc[3]) col.Delete(ids[1]) // Now the first and fourth documents are left, scrub and reopen the collection and verify index // col.Scrub() col.Close() col, err = OpenCol(tmp, 4) if err != nil { t.Fatalf("Failed to reopen: %v", err) } if !SecIndexContainsAll("d", col, map[uint64][]uint64{chunk.StrHash("0"): []uint64{ids[0], ids[2]}}) { t.Fatal() } if !SecIndexContainsAll("a,b,c", col, map[uint64][]uint64{chunk.StrHash("1"): []uint64{ids[0]}}) { t.Fatal() } if !SecIndexContainsAll("a,b,c", col, map[uint64][]uint64{chunk.StrHash("4"): []uint64{ids[2]}}) { t.Fatal() } // Insert one more document and verify indexes newID, _ := col.Insert(jsonDoc[0]) if !SecIndexContainsAll("d", col, map[uint64][]uint64{chunk.StrHash("0"): []uint64{ids[0], ids[2], newID}}) { t.Fatal() } if !SecIndexContainsAll("a,b,c", col, map[uint64][]uint64{chunk.StrHash("1"): []uint64{ids[0], newID}}) { t.Fatal() } if !SecIndexContainsAll("a,b,c", col, map[uint64][]uint64{chunk.StrHash("4"): []uint64{ids[2]}}) { t.Fatal() } if err = col.Flush(); err != nil { t.Fatal(err) } col.Close() }
// Scan hash table or collection documents using an integer range. func IntRange(intFrom interface{}, expr map[string]interface{}, src *Col, result *map[uint64]struct{}) (err error) { path, hasPath := expr["in"] if !hasPath { return errors.New("Missing path `in`") } // Figure out the path vecPath := make([]string, 0) if vecPathInterface, ok := path.([]interface{}); ok { for _, v := range vecPathInterface { vecPath = append(vecPath, fmt.Sprint(v)) } } else { return errors.New(fmt.Sprintf("Expecting vector path `in`, but %v given", path)) } if vecPath[0] == uid.PK_NAME { return errors.New("_pk is the primary index, integer range scan on _pk is meaningless") } // Figure out result number limit intLimit := int(0) if limit, hasLimit := expr["limit"]; hasLimit { if floatLimit, ok := limit.(float64); ok { intLimit = int(floatLimit) } else { return errors.New(fmt.Sprintf("Expecting `limit` as a number, but %v given", limit)) } } // Figure out the range ("from" value & "to" value) from, to := int(0), int(0) if floatFrom, ok := intFrom.(float64); ok { from = int(floatFrom) } else { return errors.New(fmt.Sprintf("Expecting `int-from` as an integer, but %v given", from)) } if intTo, ok := expr["int-to"]; ok { if floatTo, ok := intTo.(float64); ok { to = int(floatTo) } else { return errors.New(fmt.Sprintf("Expecting `int-to` as an integer, but %v given", to)) } } else if intTo, ok := expr["int to"]; ok { if floatTo, ok := intTo.(float64); ok { to = int(floatTo) } else { return errors.New(fmt.Sprintf("Expecting `int-to` as an integer, but %v given", to)) } } else { return errors.New(fmt.Sprintf("Missing `int-to`")) } if to > from && to-from > 1000 || from > to && from-to > 1000 { tdlog.Printf("Query %v is an index lookup of more than 1000 values, which may be inefficient", expr) } counter := int(0) // Number of results already collected htPath := strings.Join(vecPath, ",") if _, indexScan := src.SecIndexes[htPath]; indexScan { // Use index scan if it is available if from < to { // Forward scan - from low value to high value for lookupValue := from; lookupValue <= to; lookupValue++ { lookupStrValue := fmt.Sprint(lookupValue) hashValue := chunk.StrHash(lookupStrValue) _, vals := src.HashScan(htPath, hashValue, uint64(intLimit)) for _, docID := range vals { if intLimit > 0 && counter == intLimit { break } counter += 1 (*result)[docID] = struct{}{} } } } else { // Backward scan - from high value to low value for lookupValue := from; lookupValue >= to; lookupValue-- { lookupStrValue := fmt.Sprint(lookupValue) hashValue := chunk.StrHash(lookupStrValue) _, vals := src.HashScan(htPath, hashValue, uint64(intLimit)) for _, docID := range vals { if intLimit > 0 && counter == intLimit { break } counter += 1 (*result)[docID] = struct{}{} } } } } else { return errors.New(fmt.Sprintf("Please index %v and retry query %v", vecPath, expr)) } return }