// vector of 4 32-bit unsigned integers func (u Uint32Div) DivSSE4(a M128i) M128i { t1 := sse2.MulEpu32(a, u.multiplier) // 32x32->64 bit unsigned multiplication of a[0] and a[2] t2 := sse2.SrliEpi64(t1, 32) // high dword of result 0 and 2 t3 := sse2.SrliEpi64(a, 32) // get a[1] and a[3] into position for multiplication t4 := sse2.MulEpu32(t3, u.multiplier) // 32x32->64 bit unsigned multiplication of a[1] and a[3] t5 := sse2.SetEpi32(-1, 0, -1, 0) // mask of dword 1 and 3 t7 := sse4.BlendvEpi8(t2, t4, t5) // blend two results t8 := sse2.SubEpi32(a, t7) // subtract t9 := sse2.SrlEpi32(t8, u.shift1) // shift right logical t10 := sse2.AddEpi32(t7, t9) // add return sse2.SrlEpi32(t10, d.u.shift2) // shift right logical }
// vector of 4 32-bit unsigned integers func (d Uint32Div) Div(a M128i) M128i { t1 := sse2.MulEpu32(a, u.multiplier) // 32x32->64 bit unsigned multiplication of a[0] and a[2] t2 := sse2.SrliEpi64(t1, 32) // high dword of result 0 and 2 t3 := sse2.SrliEpi64(a, 32) // get a[1] and a[3] into position for multiplication t4 := sse2.MulEpu32(t3, u.multiplier) // 32x32->64 bit unsigned multiplication of a[1] and a[3] t5 := sse2.SetEpi32(-1, 0, -1, 0) // mask of dword 1 and 3 t6 := sse2.AndSi128(t4, t5) // high dword of result 1 and 3 t7 := sse2.OrSi128(t2, t6) // combine all four results into one vector t8 := sse2.SubEpi32(a, t7) // subtract t9 := sse2.SrlEpi32(t8, u.shift1) // shift right logical t10 := sse2.AddEpi32(t7, t9) // add return sse2.SrlEpi32(t10, d.u.shift2) // shift right logical }