func testBytesFind(t *testing.T, line, token []byte, startat int, exp []int) { got := []int{} tokenlen := len(token) for { foundat := tekstus.BytesFind(line, token, startat) if foundat < 0 { break } got = append(got, foundat) startat = foundat + tokenlen } assert(t, exp, got, true) }
func searchForward(atx, aty int, x, y *int, oldleft, newleft *[]byte) ( adds, dels tekstus.Chunks, ) { oldleftlen := len(*oldleft) newleftlen := len(*newleft) minlen := DefMatchLen if oldleftlen < minlen { minlen = oldleftlen } // Loop through old line to find matching token xaty := -1 xx := 1 for ; xx < oldleftlen-minlen; xx++ { token := (*oldleft)[xx : xx+minlen] xaty = tekstus.BytesFind(*newleft, token, 0) if xaty > 0 { break } } minlen = DefMatchLen if newleftlen < minlen { minlen = newleftlen } yatx := -1 yy := 1 for ; yy < newleftlen-minlen; yy++ { token := (*newleft)[yy : yy+minlen] yatx = tekstus.BytesFind(*oldleft, token, 0) if yatx > 0 { break } } if xaty < 0 && yatx < 0 { // still no token found, means whole chunk has been replaced. dels = append(dels, tekstus.Chunk{StartAt: atx + *x, V: *oldleft}) adds = append(adds, tekstus.Chunk{StartAt: aty + *y, V: *newleft}) *oldleft = []byte{} *newleft = []byte{} return adds, dels } // Some chunk has been replaced. v := (*oldleft)[:xx] dels = append(dels, tekstus.Chunk{StartAt: atx + *x, V: v}) *oldleft = (*oldleft)[xx:] *x = *x + xx v = (*newleft)[:yy] adds = append(adds, tekstus.Chunk{StartAt: aty + *y, V: v}) *newleft = (*newleft)[yy:] *y = *y + yy return adds, dels }
/* Lines given two similar lines, find and return the differences (additions and deletion) between them. Case 1: addition on new or deletion on old. old: 00000 new: 00000111 or old: 00000111 new: 00000 Case 2: addition on new line old: 000000 new: 0001000 Case 3: deletion on old line (reverse of case 2) old: 0001000 new: 000000 Case 4: change happened in the beginning old: 11000 new: 22000 Case 5: both changed old: 0001000 new: 0002000 */ func Lines(old, new []byte, atx, aty int) (adds, dels tekstus.Chunks) { oldlen := len(old) newlen := len(new) minlen := 0 if oldlen < newlen { minlen = oldlen } else { minlen = newlen } // Find the position of unmatched byte from the beginning. x, y := 0, 0 for ; x < minlen; x++ { if old[x] != new[x] { break } } y = x // Case 1: Check if addition or deletion is at the end. if x == minlen { if oldlen < newlen { v := new[y:] adds = append(adds, tekstus.Chunk{StartAt: atx + y, V: v}) } else { v := old[x:] dels = append(dels, tekstus.Chunk{StartAt: atx + x, V: v}) } return } // Find the position of unmatched byte from the end xend := oldlen - 1 yend := newlen - 1 for xend >= x && yend >= y { if old[xend] != new[yend] { break } xend-- yend-- } // Case 2: addition in new line. if x == xend+1 { v := new[y : yend+1] adds = append(adds, tekstus.Chunk{StartAt: aty + y, V: v}) return } // Case 3: deletion in old line. if y == yend+1 { v := old[x : xend+1] dels = append(dels, tekstus.Chunk{StartAt: atx + x, V: v}) return } // Calculate possible match len. // After we found similar bytes in the beginning and end of line, now // we have `n` number of bytes left in old and new. oldleft := old[x : xend+1] newleft := new[y : yend+1] oldleftlen := len(oldleft) newleftlen := len(newleft) // Get minimal token to search in the new left over. minlen = DefMatchLen if oldleftlen < DefMatchLen { minlen = oldleftlen } xtoken := oldleft[:minlen] xaty := tekstus.BytesFind(newleft, xtoken, 0) // Get miniminal token to search in the old left over. minlen = DefMatchLen if newleftlen < DefMatchLen { minlen = newleftlen } ytoken := newleft[:minlen] yatx := tekstus.BytesFind(oldleft, ytoken, 0) // Case 4: // We did not find matching token of x in y, its mean the some chunk // in x and y has been replaced. if xaty < 0 && yatx < 0 { addsleft, delsleft := searchForward(atx, aty, &x, &y, &oldleft, &newleft) if len(addsleft) > 0 { adds = append(adds, addsleft...) } if len(delsleft) > 0 { dels = append(dels, delsleft...) } // Check for possible empty left if len(oldleft) == 0 { if len(newleft) > 0 { adds = append(adds, tekstus.Chunk{ StartAt: atx + x, V: newleft, }) } return } if len(newleft) == 0 { if len(oldleft) > 0 { dels = append(dels, tekstus.Chunk{ StartAt: aty + y, V: oldleft, }) } return } } // Case 5: is combination of case 2 and 3. // Case 2: We found x token at y: xaty. Previous byte before that must // be an addition. if xaty >= 0 { v := new[y : y+xaty] adds = append(adds, tekstus.Chunk{StartAt: aty + y, V: v}) newleft = new[y+xaty : yend+1] } else { if yatx >= 0 { // Case 3: We found y token at x: yatx. Previous byte before that must // be a deletion. v := old[x : x+yatx] dels = append(dels, tekstus.Chunk{StartAt: atx + x, V: v}) oldleft = old[x+yatx : xend+1] } } addsleft, delsleft := Lines(oldleft, newleft, atx+x, aty+y) if len(addsleft) > 0 { adds = append(adds, addsleft...) } if len(delsleft) > 0 { dels = append(dels, delsleft...) } return }
/* BytesRatio compare two slice of bytes and return ratio of matching bytes. The ratio in in range of 0.0 to 1.0, where 1.0 if both are similar, and 0.0 if no matchs even found. `minTokenLen` define the minimum length of token for searching in both of slice. */ func BytesRatio(old, new []byte, minTokenLen int) ( ratio float32, m int, maxlen int, ) { x, y := 0, 0 oldlen := len(old) newlen := len(new) minlen := oldlen maxlen = newlen if newlen < oldlen { minlen = newlen maxlen = oldlen } if minTokenLen < 0 { minTokenLen = DefMatchLen } for { // Count matching bytes from beginning of slice. for x < minlen { if old[x] != new[y] { break } m++ x++ y++ } if x == minlen { // All bytes is matched but probably some trailing in // one of them. break } // Count matching bytes from end of slice xend := oldlen - 1 yend := newlen - 1 for xend >= x && yend >= y { if old[xend] != new[yend] { break } m++ xend-- yend-- } // One of the line have changes in the middle. if xend == x || yend == y { break } // Cut the matching bytes old = old[x : xend+1] new = new[y : yend+1] oldlen = len(old) newlen = len(new) // Get minimal token to search in the new left over. minlen = minTokenLen if oldlen < minlen { minlen = oldlen } // Search old token in new, chunk by chunk. x = 0 y = -1 max := oldlen - minlen for ; x < max; x++ { token := old[x : x+minlen] y = tekstus.BytesFind(new, token, 0) if y > 0 { break } } if y < 0 { // We did not found anything. break } // Cut the changes old = old[x:] new = new[y:] oldlen = len(old) newlen = len(new) minlen = oldlen if newlen < minlen { minlen = newlen } x, y = 0, 0 // start again from begining... } ratio = float32(m) / float32(maxlen) return ratio, m, maxlen }