func benchmarkDeduplication(b *testing.B, numTrn, factsPerTrn, eLen, aLen, vLen int) {
	domain := "test"

	engine := randStorageWRepeats(domain, numTrn, factsPerTrn, eLen, aLen, vLen)

	log, err := view.OpenLog(engine, domain, "commit")

	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		now := log.Now()

		iter := view.Deduplicate(now)

		if err = iter.Err(); err != nil {
			b.Fatal(err)
		}

		// The Deduplicate() operation is lazy, and most of the actual work happens
		// during Next(), so to evaluate the true cost of deduplication we need to
		// time how long it takes to step through the resulting iterator.
		_, err = testNext(iter)

		if err != nil {
			b.Fatal(err)
		}
	}
}
func TestLogExcludeDuplicates(t *testing.T) {
	domain := "test"

	// Number of transactions.
	n := 100

	// Number of facts per transaction.
	m := 100

	eLen, aLen, vLen := 2, 3, 4

	engine := randStorageWRepeats(domain, n, m, eLen, aLen, vLen)

	// Open the commit log.
	log, err := view.OpenLog(engine, domain, "commit")

	if err != nil {
		t.Fatal(err)
	}

	// First check the total number of facts.
	num, err := testNext(log.Now())

	if err != nil {
		t.Fatal(err)
	}

	if num != n*m {
		t.Errorf("expected %d total facts, got %d", n*m, num)
	}

	// Now check that Next() works on the deduplicated stream,
	// and verify the number of unique facts.
	iter := view.Deduplicate(log.Now())

	if err := iter.Err(); err != nil {
		t.Fatal(err)
	}

	num, err = testNext(iter)

	if err != nil {
		t.Fatal(err)
	}

	// With the dictionary size being very small (e.g. 24) compared to the number
	// of generated facts (e.g. 10,000), the probability that any of the possible
	// facts didn't get generated is negligible.
	if num != eLen*aLen*vLen {
		t.Errorf("expected %d unique facts, got %d", eLen*aLen*vLen, num)
	}
}
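// The benchmark and test above rely on a testNext helper defined elsewhere in
// the package to step through an iterator and count the facts it yields. The
// sketch below is purely illustrative of one way such a helper could look; the
// Fact placeholder type and the factIterator interface are assumptions based
// on how the iterators returned by log.Now() and view.Deduplicate are used
// above (a Next() method that yields nil at exhaustion, plus Err()), not the
// actual types of the view package.

// Fact is a placeholder for the concrete fact type produced by the iterator.
type Fact struct{}

// factIterator captures the minimal iterator shape assumed by testNext.
type factIterator interface {
	// Next returns the next fact in the stream, or nil once it is exhausted.
	Next() *Fact
	// Err reports any error encountered while iterating.
	Err() error
}

// testNext steps through the iterator until it is exhausted and returns the
// number of facts it produced along with any iteration error.
func testNext(iter factIterator) (int, error) {
	var n int

	for iter.Next() != nil {
		n++
	}

	return n, iter.Err()
}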