// AbsPi32: Compute the absolute value of packed 32-bit integers in 'a', and // store the unsigned results in 'dst'. // // FOR j := 0 to 1 // i := j*32 // dst[i+31:i] := ABS(a[i+31:i]) // ENDFOR // // Instruction: 'PABSD'. Intrinsic: '_mm_abs_pi32'. // Requires SSSE3. func AbsPi32(a x86.M64) (dst x86.M64) { return x86.M64(absPi32(a)) }
// ShufflePi8: Shuffle packed 8-bit integers in 'a' according to shuffle // control mask in the corresponding 8-bit element of 'b', and store the // results in 'dst'. // // FOR j := 0 to 7 // i := j*8 // IF b[i+7] == 1 // dst[i+7:i] := 0 // ELSE // index[2:0] := b[i+2:i] // dst[i+7:i] := a[index*8+7:index*8] // FI // ENDFOR // // Instruction: 'PSHUFB'. Intrinsic: '_mm_shuffle_pi8'. // Requires SSSE3. func ShufflePi8(a x86.M64, b x86.M64) (dst x86.M64) { return x86.M64(shufflePi8(a, b)) }
// SignPi8: Negate packed 8-bit integers in 'a' when the corresponding signed // 8-bit integer in 'b' is negative, and store the results in 'dst'. Element in // 'dst' are zeroed out when the corresponding element in 'b' is zero. // // FOR j := 0 to 7 // i := j*8 // IF b[i+7:i] < 0 // dst[i+7:i] := NEG(a[i+7:i]) // ELSE IF b[i+7:i] = 0 // dst[i+7:i] := 0 // ELSE // dst[i+7:i] := a[i+7:i] // FI // ENDFOR // // Instruction: 'PSIGNB'. Intrinsic: '_mm_sign_pi8'. // Requires SSSE3. func SignPi8(a x86.M64, b x86.M64) (dst x86.M64) { return x86.M64(signPi8(a, b)) }
// MaddubsPi16: Vertically multiply each unsigned 8-bit integer from 'a' with // the corresponding signed 8-bit integer from 'b', producing intermediate // signed 16-bit integers. Horizontally add adjacent pairs of intermediate // signed 16-bit integers, and pack the saturated results in 'dst'. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) // ENDFOR // // Instruction: 'PMADDUBSW'. Intrinsic: '_mm_maddubs_pi16'. // Requires SSSE3. func MaddubsPi16(a x86.M64, b x86.M64) (dst x86.M64) { return x86.M64(maddubsPi16(a, b)) }
// MulhrsPi16: Multiply packed 16-bit integers in 'a' and 'b', producing // intermediate signed 32-bit integers. Truncate each intermediate integer to // the 18 most significant bits, round by adding 1, and store bits [16:1] to // 'dst'. // // FOR j := 0 to 3 // i := j*16 // tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 // dst[i+15:i] := tmp[16:1] // ENDFOR // // Instruction: 'PMULHRSW'. Intrinsic: '_mm_mulhrs_pi16'. // Requires SSSE3. func MulhrsPi16(a x86.M64, b x86.M64) (dst x86.M64) { return x86.M64(mulhrsPi16(a, b)) }
// HsubsPi16: Horizontally subtract adjacent pairs of 16-bit integers in 'a' // and 'b' using saturation, and pack the signed 16-bit results in 'dst'. // // dst[15:0]= Saturate_To_Int16(a[15:0] - a[31:16]) // dst[31:16] = Saturate_To_Int16(a[47:32] - a[63:48]) // dst[47:32] = Saturate_To_Int16(b[15:0] - b[31:16]) // dst[63:48] = Saturate_To_Int16(b[47:32] - b[63:48]) // // Instruction: 'PHSUBSW'. Intrinsic: '_mm_hsubs_pi16'. // Requires SSSE3. func HsubsPi16(a x86.M64, b x86.M64) (dst x86.M64) { return x86.M64(hsubsPi16(a, b)) }
// HsubPi32: Horizontally subtract adjacent pairs of 32-bit integers in 'a' and // 'b', and pack the signed 32-bit results in 'dst'. // // dst[31:0] := a[31:0] - a[63:32] // dst[63:32] := b[31:0] - b[63:32] // // Instruction: 'PHSUBD'. Intrinsic: '_mm_hsub_pi32'. // Requires SSSE3. func HsubPi32(a x86.M64, b x86.M64) (dst x86.M64) { return x86.M64(hsubPi32(a, b)) }
// HaddsPi16: Horizontally add adjacent pairs of 16-bit integers in 'a' and 'b' // using saturation, and pack the signed 16-bit results in 'dst'. // // dst[15:0]= Saturate_To_Int16(a[31:16] + a[15:0]) // dst[31:16] = Saturate_To_Int16(a[63:48] + a[47:32]) // dst[47:32] = Saturate_To_Int16(b[31:16] + b[15:0]) // dst[63:48] = Saturate_To_Int16(b[63:48] + b[47:32]) // // Instruction: 'PHADDSW'. Intrinsic: '_mm_hadds_pi16'. // Requires SSSE3. func HaddsPi16(a x86.M64, b x86.M64) (dst x86.M64) { return x86.M64(haddsPi16(a, b)) }
// HaddPi32: Horizontally add adjacent pairs of 32-bit integers in 'a' and 'b', // and pack the signed 32-bit results in 'dst'. // // dst[31:0] := a[63:32] + a[31:0] // dst[63:32] := b[63:32] + b[31:0] // // Instruction: 'PHADDW'. Intrinsic: '_mm_hadd_pi32'. // Requires SSSE3. func HaddPi32(a x86.M64, b x86.M64) (dst x86.M64) { return x86.M64(haddPi32(a, b)) }
// AlignrPi8: Concatenate 8-byte blocks in 'a' and 'b' into a 16-byte temporary // result, shift the result right by 'count' bytes, and store the low 16 bytes // in 'dst'. // // tmp[127:0] := ((a[63:0] << 64) OR b[63:0]) >> (count[7:0]*8) // dst[63:0] := tmp[63:0] // // Instruction: 'PALIGNR'. Intrinsic: '_mm_alignr_pi8'. // Requires SSSE3. func AlignrPi8(a x86.M64, b x86.M64, count int) (dst x86.M64) { return x86.M64(alignrPi8(a, b, count)) }