Example #1
0
// MaskzConflictEpi32: Test each 32-bit element of 'a' for equality with all
// other elements in 'a' closer to the least significant bit using zeromask 'k'
// (elements are zeroed out when the corresponding mask bit is not set). Each
// element's comparison forms a zero extended bit vector in 'dst'.
//
//		FOR j := 0 to 3
//			i := j*32
//			IF k[i]
//				FOR l := 0 to j-1
//					m := l*32
//					dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
//				ENDFOR
//				dst[i+31:i+j] := 0
//			ELSE
//				dst[i+31:i] := 0
//			FI
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VPCONFLICTD'. Intrinsic: '_mm_maskz_conflict_epi32'.
// Requires AVX512CD.
func MaskzConflictEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i) {
	return x86.M128i(maskzConflictEpi32(uint8(k), [16]byte(a)))
}
Example #2
0
// ShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle
// control mask in the corresponding 8-bit element of 'b', and store the
// results in 'dst'.
//
//		FOR j := 0 to 15
//			i := j*8
//			IF b[i+7] == 1
//				dst[i+7:i] := 0
//			ELSE
//				index[3:0] := b[i+3:i]
//				dst[i+7:i] := a[index*8+7:index*8]
//			FI
//		ENDFOR
//
// Instruction: 'PSHUFB'. Intrinsic: '_mm_shuffle_epi8'.
// Requires SSSE3.
func ShuffleEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i) {
	return x86.M128i(shuffleEpi8([16]byte(a), [16]byte(b)))
}
Example #3
0
// Sha1rnds4Epu32: Perform four rounds of SHA1 operation using an initial SHA1
// state (A,B,C,D) from 'a' and some pre-computed sum of the next 4 round
// message values (unsigned 32-bit integers), and state variable E from 'b',
// and store the updated SHA1 state (A,B,C,D) in 'dst'. 'func' contains the
// logic functions and round constants.
//
//		IF (func[1:0] = 0) THEN
//			f() := f0(), K := K0;
//		ELSE IF (func[1:0] = 1) THEN
//			f() := f1(), K := K1;
//		ELSE IF (func[1:0] = 2) THEN
//			f() := f2(), K := K2;
//		ELSE IF (func[1:0] = 3) THEN
//			f() := f3(), K := K3;
//		FI;
//
//		A := a[127:96];
//		B := a[95:64];
//		C := a[63:32];
//		D := a[31:0];
//
//		W[0] := b[127:96];
//		W[1] := b[95:64];
//		W[2] := b[63:32];
//		W[3] := b[31:0];
//
//		A[1] := f(B, C, D) + (A <<< 5) + W[0] + K;
//		B[1] := A;
//		C[1] := B <<< 30;
//		D[1] := C;
//		E[1] := D;
//
//		FOR i = 1 to 3
//				A[i+1] := f(B[i], C[i], D[i]) + (A[i] <<< 5) + W[i] + E[i] + K;
//				B[i+1] := A[i];
//				C[i+1] := B[i] <<< 30;
//				D[i+1] := C[i];
//				E[i+1] := D[i];
//		ENDFOR;
//
//		dst[127:96] := A[4];
//		dst[95:64] := B[4];
//		dst[63:32] := C[4];
//		dst[31:0] := D[4];
//
// Instruction: 'SHA1RNDS4'. Intrinsic: '_mm_sha1rnds4_epu32'.
// Requires SHA.
func Sha1rnds4Epu32(a x86.M128i, b x86.M128i, fnc int) (dst x86.M128i) {
	return x86.M128i(sha1rnds4Epu32([16]byte(a), [16]byte(b), fnc))
}
Example #4
0
// Sha256rnds2Epu32: Perform 2 rounds of SHA256 operation using an initial
// SHA256 state (C,D,G,H) from 'a', an initial SHA256 state (A,B,E,F) from 'b',
// and a pre-computed sum of the next 2 round message values (unsigned 32-bit
// integers) and the corresponding round constants from 'k', and store the
// updated SHA256 state (A,B,E,F) in 'dst'.
//
//		A[0] := b[127:96];
//		B[0] := b[95:64];
//		C[0] := a[127:96];
//		D[0] := a[95:64];
//		E[0] := b[63:32];
//		F[0] := b[31:0];
//		G[0] := a[63:32];
//		H[0] := a[31:0];
//
//		W_K0 := k[31:0];
//		W_K1 := k[63:32];
//
//		FOR i = 0 to 1
//				A_(i+1) := Ch(E[i], F[i], G[i]) + sum1(E[i]) + WKi + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]);
//				B_(i+1) := A[i];
//				C_(i+1) := B[i];
//				D_(i+1) := C[i];
//				E_(i+1) := Ch(E[i], F[i], G[i]) + sum1(E[i]) + WKi + H[i] + D[i];
//				F_(i+1) := E[i];
//				G_(i+1) := F[i];
//				H_(i+1) := G[i];
//		ENDFOR;
//
//		dst[127:96] := A[2];
//		dst[95:64] := B[2];
//		dst[63:32] := E[2];
//		dst[31:0] := F[2];
//
// Instruction: 'SHA256RNDS2'. Intrinsic: '_mm_sha256rnds2_epu32'.
// Requires SHA.
func Sha256rnds2Epu32(a x86.M128i, b x86.M128i, k x86.M128i) (dst x86.M128i) {
	return x86.M128i(sha256rnds2Epu32([16]byte(a), [16]byte(b), [16]byte(k)))
}
Example #5
0
// Clmulepi64Si128: Perform a carry-less multiplication of two 64-bit integers,
// selected from 'a' and 'b' according to 'imm8', and store the results in
// 'dst'.
//
//		IF (imm8[0] = 0)
//			TEMP1 := a[63:0];
//		ELSE
//			TEMP1 := a[127:64];
//		FI
//		IF (imm8[4] = 0)
//			TEMP2 := b[63:0];
//		ELSE
//			TEMP2 := b[127:64];
//		FI
//
//		FOR i := 0 to 63
//			TEMP[i] := (TEMP1[0] and TEMP2[i]);
//			FOR j := 1 to i
//				TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j])
//			ENDFOR
//			dst[i] := TEMP[i];
//		ENDFOR
//		FOR i := 64 to 127
//			TEMP [i] := 0;
//			FOR j := (i - 63) to 63
//				TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j])
//			ENDFOR
//			dst[i] := TEMP[i];
//		ENDFOR
//		dst[127] := 0
//
// Instruction: 'PCLMULQDQ'. Intrinsic: '_mm_clmulepi64_si128'.
// Requires PCLMULQDQ.
//
// FIXME: Requires compiler support (has immediate)
func Clmulepi64Si128(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i) {
	return x86.M128i(clmulepi64Si128([16]byte(a), [16]byte(b), imm8))
}
Example #6
0
// AesenclastSi128: Perform the last round of an AES encryption flow on data
// (state) in 'a' using the round key in 'RoundKey', and store the result in
// 'dst'."
//
//		state := a
//		a[127:0] := ShiftRows(a[127:0])
//		a[127:0] := SubBytes(a[127:0])
//		dst[127:0] := a[127:0] XOR RoundKey[127:0]
//
// Instruction: 'AESENCLAST'. Intrinsic: '_mm_aesenclast_si128'.
// Requires AES.
func AesenclastSi128(a x86.M128i, RoundKey x86.M128i) (dst x86.M128i) {
	return x86.M128i(aesenclastSi128([16]byte(a), [16]byte(RoundKey)))
}
Example #7
0
// MaskzLzcntEpi64: Counts the number of leading zero bits in each packed
// 64-bit integer in 'a', and store the results in 'dst' using zeromask 'k'
// (elements are zeroed out when the corresponding mask bit is not set).
//
//		FOR j := 0 to 1
//			i := j*64
//			IF k[j]
//				tmp := 63
//				dst[i+63:i] := 0
//				DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//					tmp := tmp - 1
//					dst[i+63:i] := dst[i+63:i] + 1
//				OD
//			ELSE
//				dst[i+63:i] := 0
//			FI
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VPLZCNTQ'. Intrinsic: '_mm_maskz_lzcnt_epi64'.
// Requires AVX512CD.
func MaskzLzcntEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i) {
	return x86.M128i(maskzLzcntEpi64(uint8(k), [16]byte(a)))
}
Example #8
0
// CvtpsPh: Convert packed single-precision (32-bit) floating-point elements in
// 'a' to packed half-precision (16-bit) floating-point elements, and store the
// results in 'dst'.
// 	Rounding is done according to the 'rounding' parameter, which can be one
// of:
//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
//     _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
//
//		FOR j := 0 to 3
//			i := 16*j
//			l := 32*j
//			dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_cvtps_ph'.
// Requires FP16C.
func CvtpsPh(a x86.M128, rounding int) (dst x86.M128i) {
	return x86.M128i(cvtpsPh([4]float32(a), rounding))
}
Example #9
0
// LzcntEpi64: Counts the number of leading zero bits in each packed 64-bit
// integer in 'a', and store the results in 'dst'.
//
//		FOR j := 0 to 1
//			i := j*64
//			tmp := 63
//			dst[i+63:i] := 0
//			DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//				tmp := tmp - 1
//				dst[i+63:i] := dst[i+63:i] + 1
//			OD
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VPLZCNTQ'. Intrinsic: '_mm_lzcnt_epi64'.
// Requires AVX512CD.
func LzcntEpi64(a x86.M128i) (dst x86.M128i) {
	return x86.M128i(lzcntEpi64([16]byte(a)))
}
Example #10
0
// MaskLzcntEpi64: Counts the number of leading zero bits in each packed 64-bit
// integer in 'a', and store the results in 'dst' using writemask 'k' (elements
// are copied from 'src' when the corresponding mask bit is not set).
//
//		FOR j := 0 to 1
//			i := j*64
//			IF k[j]
//				tmp := 63
//				dst[i+63:i] := 0
//				DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//					tmp := tmp - 1
//					dst[i+63:i] := dst[i+63:i] + 1
//				OD
//			ELSE
//				dst[i+63:i] := src[i+63:i]
//			FI
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VPLZCNTQ'. Intrinsic: '_mm_mask_lzcnt_epi64'.
// Requires AVX512CD.
func MaskLzcntEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i) {
	return x86.M128i(maskLzcntEpi64([16]byte(src), uint8(k), [16]byte(a)))
}
Example #11
0
// BroadcastmwEpi32: Broadcast the low 16-bits from input mask 'k' to all
// 32-bit elements of 'dst'.
//
//		FOR j := 0 to 3
//			i := j*32
//			dst[i+31:i] := ZeroExtend(k[15:0])
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VPBROADCASTMW2D'. Intrinsic: '_mm_broadcastmw_epi32'.
// Requires AVX512CD.
func BroadcastmwEpi32(k x86.Mmask16) (dst x86.M128i) {
	return x86.M128i(broadcastmwEpi32(uint16(k)))
}
Example #12
0
// ConflictEpi64: Test each 64-bit element of 'a' for equality with all other
// elements in 'a' closer to the least significant bit. Each element's
// comparison forms a zero extended bit vector in 'dst'.
//
//		FOR j := 0 to 1
//			i := j*64
//			FOR k := 0 to j-1
//				m := k*64
//				dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
//			ENDFOR
//			dst[i+63:i+j] := 0
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VPCONFLICTQ'. Intrinsic: '_mm_conflict_epi64'.
// Requires AVX512CD.
func ConflictEpi64(a x86.M128i) (dst x86.M128i) {
	return x86.M128i(conflictEpi64([16]byte(a)))
}
Example #13
0
// BroadcastmbEpi64: Broadcast the low 8-bits from input mask 'k' to all 64-bit
// elements of 'dst'.
//
//		FOR j := 0 to 1
//			i := j*64
//			dst[i+63:i] := ZeroExtend(k[7:0])
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VPBROADCASTMB2Q'. Intrinsic: '_mm_broadcastmb_epi64'.
// Requires AVX512CD.
func BroadcastmbEpi64(k x86.Mmask8) (dst x86.M128i) {
	return x86.M128i(broadcastmbEpi64(uint8(k)))
}
Example #14
0
// SignEpi8: Negate packed 8-bit integers in 'a' when the corresponding signed
// 8-bit integer in 'b' is negative, and store the results in 'dst'. Element in
// 'dst' are zeroed out when the corresponding element in 'b' is zero.
//
//		FOR j := 0 to 15
//			i := j*8
//			IF b[i+7:i] < 0
//				dst[i+7:i] := NEG(a[i+7:i])
//			ELSE IF b[i+7:i] = 0
//				dst[i+7:i] := 0
//			ELSE
//				dst[i+7:i] := a[i+7:i]
//			FI
//		ENDFOR
//
// Instruction: 'PSIGNB'. Intrinsic: '_mm_sign_epi8'.
// Requires SSSE3.
func SignEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i) {
	return x86.M128i(signEpi8([16]byte(a), [16]byte(b)))
}
Example #15
0
// AlignrEpi8: Concatenate 16-byte blocks in 'a' and 'b' into a 32-byte
// temporary result, shift the result right by 'count' bytes, and store the low
// 16 bytes in 'dst'.
//
//		tmp[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8)
//		dst[127:0] := tmp[127:0]
//
// Instruction: 'PALIGNR'. Intrinsic: '_mm_alignr_epi8'.
// Requires SSSE3.
func AlignrEpi8(a x86.M128i, b x86.M128i, count int) (dst x86.M128i) {
	return x86.M128i(alignrEpi8([16]byte(a), [16]byte(b), count))
}
Example #16
0
// AbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and
// store the unsigned results in 'dst'.
//
//		FOR j := 0 to 15
//			i := j*8
//			dst[i+7:i] := ABS(a[i+7:i])
//		ENDFOR
//
// Instruction: 'PABSB'. Intrinsic: '_mm_abs_epi8'.
// Requires SSSE3.
func AbsEpi8(a x86.M128i) (dst x86.M128i) {
	return x86.M128i(absEpi8([16]byte(a)))
}
Example #17
0
// HaddEpi16: Horizontally add adjacent pairs of 16-bit integers in 'a' and
// 'b', and pack the signed 16-bit results in 'dst'.
//
//		dst[15:0] := a[31:16] + a[15:0]
//		dst[31:16] := a[63:48] + a[47:32]
//		dst[47:32] := a[95:80] + a[79:64]
//		dst[63:48] := a[127:112] + a[111:96]
//		dst[79:64] := b[31:16] + b[15:0]
//		dst[95:80] := b[63:48] + b[47:32]
//		dst[111:96] := b[95:80] + b[79:64]
//		dst[127:112] := b[127:112] + b[111:96]
//
// Instruction: 'PHADDW'. Intrinsic: '_mm_hadd_epi16'.
// Requires SSSE3.
func HaddEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i) {
	return x86.M128i(haddEpi16([16]byte(a), [16]byte(b)))
}
Example #18
0
// M256CvtpsPh: Convert packed single-precision (32-bit) floating-point
// elements in 'a' to packed half-precision (16-bit) floating-point elements,
// and store the results in 'dst'.
// 	Rounding is done according to the 'rounding' parameter, which can be one
// of:
//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
//     _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
//
//		FOR j := 0 to 7
//			i := 16*j
//			l := 32*j
//			dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_cvtps_ph'.
// Requires FP16C.
func M256CvtpsPh(a x86.M256, rounding int) (dst x86.M128i) {
	return x86.M128i(m256CvtpsPh([8]float32(a), rounding))
}
Example #19
0
// HsubEpi32: Horizontally subtract adjacent pairs of 32-bit integers in 'a'
// and 'b', and pack the signed 32-bit results in 'dst'.
//
//		dst[31:0] := a[31:0] - a[63:32]
//		dst[63:32] := a[95:64] - a[127:96]
//		dst[95:64] := b[31:0] - b[63:32]
//		dst[127:96] := b[95:64] - b[127:96]
//
// Instruction: 'PHSUBD'. Intrinsic: '_mm_hsub_epi32'.
// Requires SSSE3.
func HsubEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i) {
	return x86.M128i(hsubEpi32([16]byte(a), [16]byte(b)))
}
Example #20
0
// AeskeygenassistSi128: Assist in expanding the AES cipher key by computing
// steps towards generating a round key for encryption cipher using data from
// 'a' and an 8-bit round constant specified in 'imm8', and store the result in
// 'dst'."
//
//		X3[31:0] := a[127:96]
//		X2[31:0] := a[95:64]
//		X1[31:0] := a[63:32]
//		X0[31:0] := a[31:0]
//		RCON[31:0] := ZeroExtend(imm8[7:0]);
//		dst[31:0] := SubWord(X1)
//		dst[63:32] := (RotWord(SubWord(X1)) XOR RCON;
//		dst[95:64] := SubWord(X3)
//		dst[127:96] := RotWord(SubWord(X3)) XOR RCON;
//
// Instruction: 'AESKEYGENASSIST'. Intrinsic: '_mm_aeskeygenassist_si128'.
// Requires AES.
//
// FIXME: Requires compiler support (has immediate)
func AeskeygenassistSi128(a x86.M128i, imm8 byte) (dst x86.M128i) {
	return x86.M128i(aeskeygenassistSi128([16]byte(a), imm8))
}
Example #21
0
// HsubsEpi16: Horizontally subtract adjacent pairs of 16-bit integers in 'a'
// and 'b' using saturation, and pack the signed 16-bit results in 'dst'.
//
//		dst[15:0]= Saturate_To_Int16(a[15:0] - a[31:16])
//		dst[31:16] = Saturate_To_Int16(a[47:32] - a[63:48])
//		dst[47:32] = Saturate_To_Int16(a[79:64] - a[95:80])
//		dst[63:48] = Saturate_To_Int16(a[111:96] - a[127:112])
//		dst[79:64] = Saturate_To_Int16(b[15:0] - b[31:16])
//		dst[95:80] = Saturate_To_Int16(b[47:32] - b[63:48])
//		dst[111:96] = Saturate_To_Int16(b[79:64] - b[95:80])
//		dst[127:112] = Saturate_To_Int16(b[111:96] - b[127:112])
//
// Instruction: 'PHSUBSW'. Intrinsic: '_mm_hsubs_epi16'.
// Requires SSSE3.
func HsubsEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i) {
	return x86.M128i(hsubsEpi16([16]byte(a), [16]byte(b)))
}
Example #22
0
// AesimcSi128: Perform the InvMixColumns transformation on 'a' and store the
// result in 'dst'.
//
//		dst[127:0] := InvMixColumns(a[127:0])
//
// Instruction: 'AESIMC'. Intrinsic: '_mm_aesimc_si128'.
// Requires AES.
func AesimcSi128(a x86.M128i) (dst x86.M128i) {
	return x86.M128i(aesimcSi128([16]byte(a)))
}
Example #23
0
// MaddubsEpi16: Vertically multiply each unsigned 8-bit integer from 'a' with
// the corresponding signed 8-bit integer from 'b', producing intermediate
// signed 16-bit integers. Horizontally add adjacent pairs of intermediate
// signed 16-bit integers, and pack the saturated results in 'dst'.
//
//		FOR j := 0 to 7
//			i := j*16
//			dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
//		ENDFOR
//
// Instruction: 'PMADDUBSW'. Intrinsic: '_mm_maddubs_epi16'.
// Requires SSSE3.
func MaddubsEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i) {
	return x86.M128i(maddubsEpi16([16]byte(a), [16]byte(b)))
}
Example #24
0
// Sha256msg2Epu32: Perform the final calculation for the next four SHA256
// message values (unsigned 32-bit integers) using previous message values from
// 'a' and 'b', and store the result in 'dst'."
//
//		W14 := b[95:64];
//		W15 := b[127:96];
//		W16 := a[31:0] + sigma1(W14);
//		W17 := a[63:32] + sigma1(W15);
//		W18 := a[95:64] + sigma1(W16);
//		W19 := a[127:96] + sigma1(W17);
//
//		dst[127:96] := W19;
//		dst[95:64] := W18;
//		dst[63:32] := W17;
//		dst[31:0] := W16;
//
// Instruction: 'SHA256MSG2'. Intrinsic: '_mm_sha256msg2_epu32'.
// Requires SHA.
func Sha256msg2Epu32(a x86.M128i, b x86.M128i) (dst x86.M128i) {
	return x86.M128i(sha256msg2Epu32([16]byte(a), [16]byte(b)))
}
Example #25
0
// MulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing
// intermediate signed 32-bit integers. Truncate each intermediate integer to
// the 18 most significant bits, round by adding 1, and store bits [16:1] to
// 'dst'.
//
//		FOR j := 0 to 7
//			i := j*16
//			tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
//			dst[i+15:i] := tmp[16:1]
//		ENDFOR
//
// Instruction: 'PMULHRSW'. Intrinsic: '_mm_mulhrs_epi16'.
// Requires SSSE3.
func MulhrsEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i) {
	return x86.M128i(mulhrsEpi16([16]byte(a), [16]byte(b)))
}
Example #26
0
// Sha1nexteEpu32: Calculate SHA1 state variable E after four rounds of
// operation from the current SHA1 state variable 'a', add that value to the
// scheduled values (unsigned 32-bit integers) in 'b', and store the result in
// 'dst'.
//
//		tmp := (a[127:96] <<< 30);
//		dst[127:96] := b[127:96] + tmp;
//		dst[95:64] := b[95:64];
//		dst[63:32] := b[63:32];
//		dst[31:0] := b[31:0];
//
// Instruction: 'SHA1NEXTE'. Intrinsic: '_mm_sha1nexte_epu32'.
// Requires SHA.
func Sha1nexteEpu32(a x86.M128i, b x86.M128i) (dst x86.M128i) {
	return x86.M128i(sha1nexteEpu32([16]byte(a), [16]byte(b)))
}
Example #27
0
// MaskConflictEpi32: Test each 32-bit element of 'a' for equality with all
// other elements in 'a' closer to the least significant bit using writemask
// 'k' (elements are copied from 'src' when the corresponding mask bit is not
// set). Each element's comparison forms a zero extended bit vector in 'dst'.
//
//		FOR j := 0 to 3
//			i := j*32
//			IF k[i]
//				FOR l := 0 to j-1
//					m := l*32
//					dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
//				ENDFOR
//				dst[i+31:i+j] := 0
//			ELSE
//				dst[i+31:i] := src[i+31:i]
//			FI
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VPCONFLICTD'. Intrinsic: '_mm_mask_conflict_epi32'.
// Requires AVX512CD.
func MaskConflictEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i) {
	return x86.M128i(maskConflictEpi32([16]byte(src), uint8(k), [16]byte(a)))
}