// MaskzConflictEpi32: Test each 32-bit element of 'a' for equality with all // other elements in 'a' closer to the least significant bit using zeromask 'k' // (elements are zeroed out when the corresponding mask bit is not set). Each // element's comparison forms a zero extended bit vector in 'dst'. // // FOR j := 0 to 3 // i := j*32 // IF k[j] // FOR l := 0 to j-1 // m := l*32 // dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 // ENDFOR // dst[i+31:i+j] := 0 // ELSE // dst[i+31:i] := 0 // FI // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VPCONFLICTD'. Intrinsic: '_mm_maskz_conflict_epi32'. // Requires AVX512CD. func MaskzConflictEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i) { return x86.M128i(maskzConflictEpi32(uint8(k), [16]byte(a))) }
// ShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle // control mask in the corresponding 8-bit element of 'b', and store the // results in 'dst'. // // FOR j := 0 to 15 // i := j*8 // IF b[i+7] == 1 // dst[i+7:i] := 0 // ELSE // index[3:0] := b[i+3:i] // dst[i+7:i] := a[index*8+7:index*8] // FI // ENDFOR // // Instruction: 'PSHUFB'. Intrinsic: '_mm_shuffle_epi8'. // Requires SSSE3. func ShuffleEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i) { return x86.M128i(shuffleEpi8([16]byte(a), [16]byte(b))) }
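// The following is a minimal pure-Go sketch of the PSHUFB semantics described
// above, useful as a reference when reading the pseudocode. The function name
// is hypothetical and not part of this package's API: for each destination
// byte, bit 7 of the control byte forces zero, otherwise the low four control
// bits select a source byte from 'a'.
func shuffleEpi8Ref(a, b [16]byte) (dst [16]byte) {
	for j := 0; j < 16; j++ {
		if b[j]&0x80 != 0 {
			dst[j] = 0 // control MSB set: zero the destination byte
		} else {
			dst[j] = a[b[j]&0x0F] // low 4 bits index into 'a'
		}
	}
	return
}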
// Sha1rnds4Epu32: Perform four rounds of SHA1 operation using an initial SHA1 // state (A,B,C,D) from 'a' and some pre-computed sum of the next 4 round // message values (unsigned 32-bit integers), and state variable E from 'b', // and store the updated SHA1 state (A,B,C,D) in 'dst'. 'func' contains the // logic functions and round constants. // // IF (func[1:0] = 0) THEN // f() := f0(), K := K0; // ELSE IF (func[1:0] = 1) THEN // f() := f1(), K := K1; // ELSE IF (func[1:0] = 2) THEN // f() := f2(), K := K2; // ELSE IF (func[1:0] = 3) THEN // f() := f3(), K := K3; // FI; // // A := a[127:96]; // B := a[95:64]; // C := a[63:32]; // D := a[31:0]; // // W[0] := b[127:96]; // W[1] := b[95:64]; // W[2] := b[63:32]; // W[3] := b[31:0]; // // A[1] := f(B, C, D) + (A <<< 5) + W[0] + K; // B[1] := A; // C[1] := B <<< 30; // D[1] := C; // E[1] := D; // // FOR i = 1 to 3 // A[i+1] := f(B[i], C[i], D[i]) + (A[i] <<< 5) + W[i] + E[i] + K; // B[i+1] := A[i]; // C[i+1] := B[i] <<< 30; // D[i+1] := C[i]; // E[i+1] := D[i]; // ENDFOR; // // dst[127:96] := A[4]; // dst[95:64] := B[4]; // dst[63:32] := C[4]; // dst[31:0] := D[4]; // // Instruction: 'SHA1RNDS4'. Intrinsic: '_mm_sha1rnds4_epu32'. // Requires SHA. func Sha1rnds4Epu32(a x86.M128i, b x86.M128i, fnc int) (dst x86.M128i) { return x86.M128i(sha1rnds4Epu32([16]byte(a), [16]byte(b), fnc)) }
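// The pseudocode above refers to f0()..f3() and K0..K3 without defining them.
// In SHA-1 (FIPS 180-4) these are the standard per-round-group logic functions
// and constants, sketched here for reference; the helper name is hypothetical
// and not part of this package's API.
func sha1RoundFunc(sel int, b, c, d uint32) (f, k uint32) {
	switch sel & 3 {
	case 0: // rounds 0-19: Ch
		return (b & c) | (^b & d), 0x5A827999
	case 1: // rounds 20-39: Parity
		return b ^ c ^ d, 0x6ED9EBA1
	case 2: // rounds 40-59: Maj
		return (b & c) | (b & d) | (c & d), 0x8F1BBCDC
	default: // rounds 60-79: Parity
		return b ^ c ^ d, 0xCA62C1D6
	}
}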
// Sha256rnds2Epu32: Perform 2 rounds of SHA256 operation using an initial // SHA256 state (C,D,G,H) from 'a', an initial SHA256 state (A,B,E,F) from 'b', // and a pre-computed sum of the next 2 round message values (unsigned 32-bit // integers) and the corresponding round constants from 'k', and store the // updated SHA256 state (A,B,E,F) in 'dst'. // // A[0] := b[127:96]; // B[0] := b[95:64]; // C[0] := a[127:96]; // D[0] := a[95:64]; // E[0] := b[63:32]; // F[0] := b[31:0]; // G[0] := a[63:32]; // H[0] := a[31:0]; // // W_K[0] := k[31:0]; // W_K[1] := k[63:32]; // // FOR i = 0 to 1 // A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]); // B[i+1] := A[i]; // C[i+1] := B[i]; // D[i+1] := C[i]; // E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i]; // F[i+1] := E[i]; // G[i+1] := F[i]; // H[i+1] := G[i]; // ENDFOR; // // dst[127:96] := A[2]; // dst[95:64] := B[2]; // dst[63:32] := E[2]; // dst[31:0] := F[2]; // // Instruction: 'SHA256RNDS2'. Intrinsic: '_mm_sha256rnds2_epu32'. // Requires SHA. func Sha256rnds2Epu32(a x86.M128i, b x86.M128i, k x86.M128i) (dst x86.M128i) { return x86.M128i(sha256rnds2Epu32([16]byte(a), [16]byte(b), [16]byte(k))) }
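// The pseudocode above uses Ch, Maj, sum0 and sum1 without defining them. In
// SHA-256 (FIPS 180-4) these are the standard compression functions, sketched
// here for reference; the helper names are hypothetical and not part of this
// package's API.
func sha256Ch(e, f, g uint32) uint32 { return (e & f) ^ (^e & g) }

func sha256Maj(a, b, c uint32) uint32 { return (a & b) ^ (a & c) ^ (b & c) }

// sum0 is ROTR2(a) XOR ROTR13(a) XOR ROTR22(a).
func sha256Sum0(a uint32) uint32 {
	return (a>>2 | a<<30) ^ (a>>13 | a<<19) ^ (a>>22 | a<<10)
}

// sum1 is ROTR6(e) XOR ROTR11(e) XOR ROTR25(e).
func sha256Sum1(e uint32) uint32 {
	return (e>>6 | e<<26) ^ (e>>11 | e<<21) ^ (e>>25 | e<<7)
}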
// Clmulepi64Si128: Perform a carry-less multiplication of two 64-bit integers, // selected from 'a' and 'b' according to 'imm8', and store the results in // 'dst'. // // IF (imm8[0] = 0) // TEMP1 := a[63:0]; // ELSE // TEMP1 := a[127:64]; // FI // IF (imm8[4] = 0) // TEMP2 := b[63:0]; // ELSE // TEMP2 := b[127:64]; // FI // // FOR i := 0 to 63 // TEMP[i] := (TEMP1[0] AND TEMP2[i]); // FOR j := 1 to i // TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) // ENDFOR // dst[i] := TEMP[i]; // ENDFOR // FOR i := 64 to 127 // TEMP[i] := 0; // FOR j := (i - 63) to 63 // TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) // ENDFOR // dst[i] := TEMP[i]; // ENDFOR // dst[127] := 0 // // Instruction: 'PCLMULQDQ'. Intrinsic: '_mm_clmulepi64_si128'. // Requires PCLMULQDQ. // // FIXME: Requires compiler support (has immediate) func Clmulepi64Si128(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i) { return x86.M128i(clmulepi64Si128([16]byte(a), [16]byte(b), imm8)) }
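// A pure-Go sketch of the carry-less (polynomial) multiplication above,
// operating on the two 64-bit operands already selected by 'imm8'. The
// function name is hypothetical and not part of this package's API.
func clmul64Ref(x, y uint64) (hi, lo uint64) {
	for i := uint(0); i < 64; i++ {
		if y&(1<<i) != 0 {
			lo ^= x << i // low 64 bits of x shifted into position i
			if i > 0 {
				hi ^= x >> (64 - i) // bits carried above position 63
			}
		}
	}
	return // note: bit 127 of the 128-bit product is always zero
}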
// AesenclastSi128: Perform the last round of an AES encryption flow on data // (state) in 'a' using the round key in 'RoundKey', and store the result in // 'dst'. // // state := a // a[127:0] := ShiftRows(a[127:0]) // a[127:0] := SubBytes(a[127:0]) // dst[127:0] := a[127:0] XOR RoundKey[127:0] // // Instruction: 'AESENCLAST'. Intrinsic: '_mm_aesenclast_si128'. // Requires AES. func AesenclastSi128(a x86.M128i, RoundKey x86.M128i) (dst x86.M128i) { return x86.M128i(aesenclastSi128([16]byte(a), [16]byte(RoundKey))) }
// MaskzLzcntEpi64: Count the number of leading zero bits in each packed // 64-bit integer in 'a', and store the results in 'dst' using zeromask 'k' // (elements are zeroed out when the corresponding mask bit is not set). // // FOR j := 0 to 1 // i := j*64 // IF k[j] // tmp := 63 // dst[i+63:i] := 0 // DO WHILE (tmp >= 0 AND a[i+tmp] == 0) // tmp := tmp - 1 // dst[i+63:i] := dst[i+63:i] + 1 // OD // ELSE // dst[i+63:i] := 0 // FI // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VPLZCNTQ'. Intrinsic: '_mm_maskz_lzcnt_epi64'. // Requires AVX512CD. func MaskzLzcntEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i) { return x86.M128i(maskzLzcntEpi64(uint8(k), [16]byte(a))) }
// CvtpsPh: Convert packed single-precision (32-bit) floating-point elements in // 'a' to packed half-precision (16-bit) floating-point elements, and store the // results in 'dst'. // Rounding is done according to the 'rounding' parameter, which can be one // of: // (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions // (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions // (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions // (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions // _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE // // FOR j := 0 to 3 // i := 16*j // l := 32*j // dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) // ENDFOR // dst[MAX:64] := 0 // // Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_cvtps_ph'. // Requires FP16C. func CvtpsPh(a x86.M128, rounding int) (dst x86.M128i) { return x86.M128i(cvtpsPh([4]float32(a), rounding)) }
// LzcntEpi64: Count the number of leading zero bits in each packed 64-bit // integer in 'a', and store the results in 'dst'. // // FOR j := 0 to 1 // i := j*64 // tmp := 63 // dst[i+63:i] := 0 // DO WHILE (tmp >= 0 AND a[i+tmp] == 0) // tmp := tmp - 1 // dst[i+63:i] := dst[i+63:i] + 1 // OD // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VPLZCNTQ'. Intrinsic: '_mm_lzcnt_epi64'. // Requires AVX512CD. func LzcntEpi64(a x86.M128i) (dst x86.M128i) { return x86.M128i(lzcntEpi64([16]byte(a))) }
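// A scalar sketch of the per-lane leading-zero count above; it is equivalent
// to math/bits.LeadingZeros64 and returns 64 for a zero input. The function
// name is hypothetical and not part of this package's API.
func lzcnt64Ref(x uint64) int {
	n := 0
	for mask := uint64(1) << 63; mask != 0 && x&mask == 0; mask >>= 1 {
		n++
	}
	return n
}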
// MaskLzcntEpi64: Count the number of leading zero bits in each packed 64-bit // integer in 'a', and store the results in 'dst' using writemask 'k' (elements // are copied from 'src' when the corresponding mask bit is not set). // // FOR j := 0 to 1 // i := j*64 // IF k[j] // tmp := 63 // dst[i+63:i] := 0 // DO WHILE (tmp >= 0 AND a[i+tmp] == 0) // tmp := tmp - 1 // dst[i+63:i] := dst[i+63:i] + 1 // OD // ELSE // dst[i+63:i] := src[i+63:i] // FI // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VPLZCNTQ'. Intrinsic: '_mm_mask_lzcnt_epi64'. // Requires AVX512CD. func MaskLzcntEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i) { return x86.M128i(maskLzcntEpi64([16]byte(src), uint8(k), [16]byte(a))) }
// BroadcastmwEpi32: Broadcast the low 16 bits from input mask 'k' to all // 32-bit elements of 'dst'. // // FOR j := 0 to 3 // i := j*32 // dst[i+31:i] := ZeroExtend(k[15:0]) // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VPBROADCASTMW2D'. Intrinsic: '_mm_broadcastmw_epi32'. // Requires AVX512CD. func BroadcastmwEpi32(k x86.Mmask16) (dst x86.M128i) { return x86.M128i(broadcastmwEpi32(uint16(k))) }
// ConflictEpi64: Test each 64-bit element of 'a' for equality with all other // elements in 'a' closer to the least significant bit. Each element's // comparison forms a zero extended bit vector in 'dst'. // // FOR j := 0 to 1 // i := j*64 // FOR k := 0 to j-1 // m := k*64 // dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 // ENDFOR // dst[i+63:i+j] := 0 // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VPCONFLICTQ'. Intrinsic: '_mm_conflict_epi64'. // Requires AVX512CD. func ConflictEpi64(a x86.M128i) (dst x86.M128i) { return x86.M128i(conflictEpi64([16]byte(a))) }
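// A scalar sketch of the conflict test above for the two 64-bit lanes of the
// 128-bit case: each result element gets bit 'l' set for every earlier (less
// significant) element 'l' holding an equal value. The function name is
// hypothetical and not part of this package's API.
func conflictEpi64Ref(a [2]uint64) (dst [2]uint64) {
	for j := 0; j < 2; j++ {
		for l := 0; l < j; l++ {
			if a[j] == a[l] {
				dst[j] |= 1 << uint(l)
			}
		}
	}
	return
}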
// BroadcastmbEpi64: Broadcast the low 8 bits from input mask 'k' to all 64-bit // elements of 'dst'. // // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := ZeroExtend(k[7:0]) // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VPBROADCASTMB2Q'. Intrinsic: '_mm_broadcastmb_epi64'. // Requires AVX512CD. func BroadcastmbEpi64(k x86.Mmask8) (dst x86.M128i) { return x86.M128i(broadcastmbEpi64(uint8(k))) }
// SignEpi8: Negate packed 8-bit integers in 'a' when the corresponding signed // 8-bit integer in 'b' is negative, and store the results in 'dst'. Elements // in 'dst' are zeroed out when the corresponding element in 'b' is zero. // // FOR j := 0 to 15 // i := j*8 // IF b[i+7:i] < 0 // dst[i+7:i] := NEG(a[i+7:i]) // ELSE IF b[i+7:i] = 0 // dst[i+7:i] := 0 // ELSE // dst[i+7:i] := a[i+7:i] // FI // ENDFOR // // Instruction: 'PSIGNB'. Intrinsic: '_mm_sign_epi8'. // Requires SSSE3. func SignEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i) { return x86.M128i(signEpi8([16]byte(a), [16]byte(b))) }
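// A scalar sketch of the per-byte PSIGNB rule above; as with the hardware NEG,
// negating -128 wraps back to -128. The function name is hypothetical and not
// part of this package's API.
func sign8Ref(a, b int8) int8 {
	switch {
	case b < 0:
		return -a
	case b == 0:
		return 0
	default:
		return a
	}
}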
// AlignrEpi8: Concatenate 16-byte blocks in 'a' and 'b' into a 32-byte // temporary result, shift the result right by 'count' bytes, and store the low // 16 bytes in 'dst'. // // tmp[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8) // dst[127:0] := tmp[127:0] // // Instruction: 'PALIGNR'. Intrinsic: '_mm_alignr_epi8'. // Requires SSSE3. func AlignrEpi8(a x86.M128i, b x86.M128i, count int) (dst x86.M128i) { return x86.M128i(alignrEpi8([16]byte(a), [16]byte(b), count)) }
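// A pure-Go sketch of the byte alignment above: the 32-byte temporary holds
// 'b' in its low half and 'a' in its high half, and the result is 16 bytes
// taken starting at byte offset 'count' (counts of 32 or more yield zeros).
// The function name is hypothetical and not part of this package's API; it
// assumes 0 <= count <= 255, as with the imm8 of the real instruction.
func alignrEpi8Ref(a, b [16]byte, count int) (dst [16]byte) {
	var tmp [32]byte
	copy(tmp[:16], b[:]) // b occupies the low 16 bytes
	copy(tmp[16:], a[:]) // a occupies the high 16 bytes
	for i := 0; i < 16; i++ {
		if count+i < 32 {
			dst[i] = tmp[count+i]
		}
	}
	return
}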
// AbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and // store the unsigned results in 'dst'. // // FOR j := 0 to 15 // i := j*8 // dst[i+7:i] := ABS(a[i+7:i]) // ENDFOR // // Instruction: 'PABSB'. Intrinsic: '_mm_abs_epi8'. // Requires SSSE3. func AbsEpi8(a x86.M128i) (dst x86.M128i) { return x86.M128i(absEpi8([16]byte(a))) }
// HaddEpi16: Horizontally add adjacent pairs of 16-bit integers in 'a' and // 'b', and pack the signed 16-bit results in 'dst'. // // dst[15:0] := a[31:16] + a[15:0] // dst[31:16] := a[63:48] + a[47:32] // dst[47:32] := a[95:80] + a[79:64] // dst[63:48] := a[127:112] + a[111:96] // dst[79:64] := b[31:16] + b[15:0] // dst[95:80] := b[63:48] + b[47:32] // dst[111:96] := b[95:80] + b[79:64] // dst[127:112] := b[127:112] + b[111:96] // // Instruction: 'PHADDW'. Intrinsic: '_mm_hadd_epi16'. // Requires SSSE3. func HaddEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i) { return x86.M128i(haddEpi16([16]byte(a), [16]byte(b))) }
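// A scalar sketch of the horizontal add above on the eight 16-bit lanes of
// each operand; the additions wrap on overflow, exactly as PHADDW does. The
// function name is hypothetical and not part of this package's API.
func haddEpi16Ref(a, b [8]int16) (dst [8]int16) {
	for j := 0; j < 4; j++ {
		dst[j] = a[2*j] + a[2*j+1]   // low half of dst comes from 'a'
		dst[j+4] = b[2*j] + b[2*j+1] // high half of dst comes from 'b'
	}
	return
}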
// M256CvtpsPh: Convert packed single-precision (32-bit) floating-point // elements in 'a' to packed half-precision (16-bit) floating-point elements, // and store the results in 'dst'. // Rounding is done according to the 'rounding' parameter, which can be one // of: // (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions // (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions // (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions // (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions // _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE // // FOR j := 0 to 7 // i := 16*j // l := 32*j // dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_cvtps_ph'. // Requires FP16C. func M256CvtpsPh(a x86.M256, rounding int) (dst x86.M128i) { return x86.M128i(m256CvtpsPh([8]float32(a), rounding)) }
// HsubEpi32: Horizontally subtract adjacent pairs of 32-bit integers in 'a' // and 'b', and pack the signed 32-bit results in 'dst'. // // dst[31:0] := a[31:0] - a[63:32] // dst[63:32] := a[95:64] - a[127:96] // dst[95:64] := b[31:0] - b[63:32] // dst[127:96] := b[95:64] - b[127:96] // // Instruction: 'PHSUBD'. Intrinsic: '_mm_hsub_epi32'. // Requires SSSE3. func HsubEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i) { return x86.M128i(hsubEpi32([16]byte(a), [16]byte(b))) }
// AeskeygenassistSi128: Assist in expanding the AES cipher key by computing // steps towards generating a round key for encryption cipher using data from // 'a' and an 8-bit round constant specified in 'imm8', and store the result in // 'dst'. // // X3[31:0] := a[127:96] // X2[31:0] := a[95:64] // X1[31:0] := a[63:32] // X0[31:0] := a[31:0] // RCON[31:0] := ZeroExtend(imm8[7:0]); // dst[31:0] := SubWord(X1) // dst[63:32] := RotWord(SubWord(X1)) XOR RCON; // dst[95:64] := SubWord(X3) // dst[127:96] := RotWord(SubWord(X3)) XOR RCON; // // Instruction: 'AESKEYGENASSIST'. Intrinsic: '_mm_aeskeygenassist_si128'. // Requires AES. // // FIXME: Requires compiler support (has immediate) func AeskeygenassistSi128(a x86.M128i, imm8 byte) (dst x86.M128i) { return x86.M128i(aeskeygenassistSi128([16]byte(a), imm8)) }
// HsubsEpi16: Horizontally subtract adjacent pairs of 16-bit integers in 'a' // and 'b' using saturation, and pack the signed 16-bit results in 'dst'. // // dst[15:0] := Saturate_To_Int16(a[15:0] - a[31:16]) // dst[31:16] := Saturate_To_Int16(a[47:32] - a[63:48]) // dst[47:32] := Saturate_To_Int16(a[79:64] - a[95:80]) // dst[63:48] := Saturate_To_Int16(a[111:96] - a[127:112]) // dst[79:64] := Saturate_To_Int16(b[15:0] - b[31:16]) // dst[95:80] := Saturate_To_Int16(b[47:32] - b[63:48]) // dst[111:96] := Saturate_To_Int16(b[79:64] - b[95:80]) // dst[127:112] := Saturate_To_Int16(b[111:96] - b[127:112]) // // Instruction: 'PHSUBSW'. Intrinsic: '_mm_hsubs_epi16'. // Requires SSSE3. func HsubsEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i) { return x86.M128i(hsubsEpi16([16]byte(a), [16]byte(b))) }
// AesimcSi128: Perform the InvMixColumns transformation on 'a' and store the // result in 'dst'. // // dst[127:0] := InvMixColumns(a[127:0]) // // Instruction: 'AESIMC'. Intrinsic: '_mm_aesimc_si128'. // Requires AES. func AesimcSi128(a x86.M128i) (dst x86.M128i) { return x86.M128i(aesimcSi128([16]byte(a))) }
// MaddubsEpi16: Vertically multiply each unsigned 8-bit integer from 'a' with // the corresponding signed 8-bit integer from 'b', producing intermediate // signed 16-bit integers. Horizontally add adjacent pairs of intermediate // signed 16-bit integers, and pack the saturated results in 'dst'. // // FOR j := 0 to 7 // i := j*16 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) // ENDFOR // // Instruction: 'PMADDUBSW'. Intrinsic: '_mm_maddubs_epi16'. // Requires SSSE3. func MaddubsEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i) { return x86.M128i(maddubsEpi16([16]byte(a), [16]byte(b))) }
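// A scalar sketch of one 16-bit output lane of PMADDUBSW above: both products
// multiply an unsigned byte from 'a' by a signed byte from 'b', and their sum
// saturates to the int16 range. The function name is hypothetical and not part
// of this package's API.
func maddubsLaneRef(aLo, aHi uint8, bLo, bHi int8) int16 {
	sum := int32(aHi)*int32(bHi) + int32(aLo)*int32(bLo)
	switch {
	case sum > 32767:
		return 32767
	case sum < -32768:
		return -32768
	default:
		return int16(sum)
	}
}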
// Sha256msg2Epu32: Perform the final calculation for the next four SHA256 // message values (unsigned 32-bit integers) using previous message values from // 'a' and 'b', and store the result in 'dst'. // // W14 := b[95:64]; // W15 := b[127:96]; // W16 := a[31:0] + sigma1(W14); // W17 := a[63:32] + sigma1(W15); // W18 := a[95:64] + sigma1(W16); // W19 := a[127:96] + sigma1(W17); // // dst[127:96] := W19; // dst[95:64] := W18; // dst[63:32] := W17; // dst[31:0] := W16; // // Instruction: 'SHA256MSG2'. Intrinsic: '_mm_sha256msg2_epu32'. // Requires SHA. func Sha256msg2Epu32(a x86.M128i, b x86.M128i) (dst x86.M128i) { return x86.M128i(sha256msg2Epu32([16]byte(a), [16]byte(b))) }
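// The pseudocode above uses sigma1 without defining it. In SHA-256
// (FIPS 180-4) it is ROTR17(x) XOR ROTR19(x) XOR SHR10(x), sketched here for
// reference; the helper name is hypothetical and not part of this package's
// API.
func sha256Sigma1(x uint32) uint32 {
	return (x>>17 | x<<15) ^ (x>>19 | x<<13) ^ (x >> 10)
}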
// MulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing // intermediate signed 32-bit integers. Truncate each intermediate integer to // the 18 most significant bits, round by adding 1, and store bits [16:1] to // 'dst'. // // FOR j := 0 to 7 // i := j*16 // tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 // dst[i+15:i] := tmp[16:1] // ENDFOR // // Instruction: 'PMULHRSW'. Intrinsic: '_mm_mulhrs_epi16'. // Requires SSSE3. func MulhrsEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i) { return x86.M128i(mulhrsEpi16([16]byte(a), [16]byte(b))) }
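// A scalar sketch of one lane of PMULHRSW above: widen to 32 bits, multiply,
// arithmetic-shift right by 14, add 1 to round, then keep bits [16:1]. The
// function name is hypothetical and not part of this package's API.
func mulhrs16Ref(a, b int16) int16 {
	tmp := (int32(a)*int32(b))>>14 + 1
	return int16(tmp >> 1) // bits [16:1] of the rounded intermediate
}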
// Sha1nexteEpu32: Calculate SHA1 state variable E after four rounds of // operation from the current SHA1 state variable 'a', add that value to the // scheduled values (unsigned 32-bit integers) in 'b', and store the result in // 'dst'. // // tmp := (a[127:96] <<< 30); // dst[127:96] := b[127:96] + tmp; // dst[95:64] := b[95:64]; // dst[63:32] := b[63:32]; // dst[31:0] := b[31:0]; // // Instruction: 'SHA1NEXTE'. Intrinsic: '_mm_sha1nexte_epu32'. // Requires SHA. func Sha1nexteEpu32(a x86.M128i, b x86.M128i) (dst x86.M128i) { return x86.M128i(sha1nexteEpu32([16]byte(a), [16]byte(b))) }
// MaskConflictEpi32: Test each 32-bit element of 'a' for equality with all // other elements in 'a' closer to the least significant bit using writemask // 'k' (elements are copied from 'src' when the corresponding mask bit is not // set). Each element's comparison forms a zero extended bit vector in 'dst'. // // FOR j := 0 to 3 // i := j*32 // IF k[j] // FOR l := 0 to j-1 // m := l*32 // dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 // ENDFOR // dst[i+31:i+j] := 0 // ELSE // dst[i+31:i] := src[i+31:i] // FI // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VPCONFLICTD'. Intrinsic: '_mm_mask_conflict_epi32'. // Requires AVX512CD. func MaskConflictEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i) { return x86.M128i(maskConflictEpi32([16]byte(src), uint8(k), [16]byte(a))) }