// Fingerprint generates the fingerprint of an HTML from the io.Reader r and a shingle factor. // Shingle refers to the level of shuffling. // E.g. with shingle factor =2, input "a", "b", "c" will be converted to "a b", "b c" func Fingerprint(r io.Reader, shingle int) uint64 { if shingle < 1 { shingle = 1 } // collect the features via this cf channel. cf := make(chan string, 1000) cs := make(chan uint64, 1000) v := simhash.Vector{} // Tokenize and then Generate Features. . go func() { defer close(cf) z := html.NewTokenizer(r) // TODO - export the max token count as an function argument. count := 0 for tt := z.Next(); count < 5000 && tt != html.ErrorToken; tt = z.Next() { t := z.Token() count++ genFeatures(&t, cf) } }() // Collect the features. go func() { defer close(cs) a := make([][]byte, shingle) for f := <-cf; f != ""; f = <-cf { // shingle: generate the k-gram token as a single feature. a = append(a[1:], []byte(f)) // fmt.Printf("%#v\n", a) // fmt.Printf("%s\n", bytes.Join(a, []byte(" "))) cs <- simhash.NewFeature(bytes.Join(a, []byte(" "))).Sum() // cs <- simhash.NewFeature([]byte(f)).Sum() } }() // from the checksum (of feature), append to vector. for s := <-cs; s != 0; s = <-cs { for i := uint8(0); i < 64; i++ { bit := ((s >> i) & 1) if bit == 1 { v[i]++ } else { v[i]-- } } } return simhash.Fingerprint(v) }
func Compute(r io.Reader, N, shingle int) (uint64, error) { if shingle < 1 { shingle = 1 } chFeature := make(chan string, 128) z := html.NewTokenizer(r) go func() { count := 1 for tt := z.Next(); count < N && tt != html.ErrorToken; tt = z.Next() { t := z.Token() count++ genFeature(&t, chFeature) } close(chFeature) }() ch := make(chan uint64, 128) go func() { // Avoid allocation s := make([][]byte, shingle) joined := make([][]byte, 2*shingle-1) space := []byte(" ") var i, n int for f := range chFeature { // Collect enough features if n < shingle { s[n] = []byte(f) if n++; n == shingle { goto JOIN } continue } // Shift array to produce one space for i = 0; i < shingle-1; i++ { s[i] = s[i+1] } s[i] = []byte(f) JOIN: for i, f := range s { joined[2*i] = f if i+1 != len(s) { joined[2*i+1] = space } } ch <- hash(joined...) } close(ch) }() v := simhash.Vector{} var i uint var bit int for n := range ch { for i = 0; i < 64; i++ { bit = int((n >> i) & 1) // bit == 1 ? 1 : -1 v[i] += (bit ^ (bit - 1)) } } if err := z.Err(); err != io.EOF { return 0, err } return simhash.Fingerprint(v), nil }