Пример #1
0
// DivSSE4 applies the multiply-and-shift sequence stored in u to a, which is
// treated as a vector of 4 32-bit unsigned integers, and returns the four
// per-lane results. It differs from Div only in merging the partial results
// with a single sse4.BlendvEpi8 instead of an and/or pair.
//
// Per lane the value computed is
//
//	hi := high 32 bits of (a * multiplier)
//	r  := (hi + ((a - hi) >> shift1)) >> shift2
//
// NOTE(review): presumably this yields a / divisor for the divisor u was built
// from; multiplier, shift1 and shift2 are initialised outside this block, and
// multiplier is assumed to carry the same value in dwords 0 and 2 — confirm
// against the constructor.
func (u Uint32Div) DivSSE4(a M128i) M128i {
	// 32x32->64 bit unsigned multiplication of a[0] and a[2].
	evenProd := sse2.MulEpu32(a, u.multiplier)
	// Move the high dword of products 0 and 2 down into dwords 0 and 2.
	evenHigh := sse2.SrliEpi64(evenProd, 32)
	// Bring a[1] and a[3] into the even positions so they can be multiplied.
	oddInput := sse2.SrliEpi64(a, 32)
	// 32x32->64 bit unsigned multiplication of a[1] and a[3]; the high dwords
	// of these products already sit in dwords 1 and 3.
	oddProd := sse2.MulEpu32(oddInput, u.multiplier)
	// Mask selecting dwords 1 and 3.
	oddMask := sse2.SetEpi32(-1, 0, -1, 0)
	// Take dwords 1 and 3 from oddProd and dwords 0 and 2 from evenHigh.
	high := sse4.BlendvEpi8(evenHigh, oddProd, oddMask)
	// (a - high) >> shift1
	diff := sse2.SubEpi32(a, high)
	shifted := sse2.SrlEpi32(diff, u.shift1)
	// (high + shifted) >> shift2
	sum := sse2.AddEpi32(high, shifted)
	return sse2.SrlEpi32(sum, u.shift2)
}
Пример #2
0
// Div applies the multiply-and-shift sequence stored in u to a, which is
// treated as a vector of 4 32-bit unsigned integers, and returns the four
// per-lane results. Only sse2 operations are used.
//
// Per lane the value computed is
//
//	hi := high 32 bits of (a * multiplier)
//	r  := (hi + ((a - hi) >> shift1)) >> shift2
//
// NOTE(review): presumably this yields a / divisor for the divisor u was built
// from; multiplier, shift1 and shift2 are initialised outside this block, and
// multiplier is assumed to carry the same value in dwords 0 and 2 — confirm
// against the constructor.
func (u Uint32Div) Div(a M128i) M128i {
	// 32x32->64 bit unsigned multiplication of a[0] and a[2].
	evenProd := sse2.MulEpu32(a, u.multiplier)
	// Move the high dword of products 0 and 2 down into dwords 0 and 2; the
	// logical shift leaves dwords 1 and 3 zero.
	evenHigh := sse2.SrliEpi64(evenProd, 32)
	// Bring a[1] and a[3] into the even positions so they can be multiplied.
	oddInput := sse2.SrliEpi64(a, 32)
	// 32x32->64 bit unsigned multiplication of a[1] and a[3]; the high dwords
	// of these products already sit in dwords 1 and 3.
	oddProd := sse2.MulEpu32(oddInput, u.multiplier)
	// Mask selecting dwords 1 and 3.
	oddMask := sse2.SetEpi32(-1, 0, -1, 0)
	// Keep only the high dword of products 1 and 3.
	oddHigh := sse2.AndSi128(oddProd, oddMask)
	// Combine all four high dwords into one vector.
	high := sse2.OrSi128(evenHigh, oddHigh)
	// (a - high) >> shift1
	diff := sse2.SubEpi32(a, high)
	shifted := sse2.SrlEpi32(diff, u.shift1)
	// (high + shifted) >> shift2
	sum := sse2.AddEpi32(high, shifted)
	return sse2.SrlEpi32(sum, u.shift2)
}