// M512MaskPermutexvarEpi8: Shuffle 8-bit integers in 'a' across lanes using
// the corresponding index in 'idx', and store the results in 'dst' using
// writemask 'k' (elements are copied from 'src' when the corresponding mask
// bit is not set).
//
//		FOR j := 0 to 63
//			i := j*8
//			id := idx[i+5:i]*8
//			IF k[j]
//				dst[i+7:i] := a[id+7:id]
//			ELSE
//				dst[i+7:i] := src[i+7:i]
//			FI
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPERMB'. Intrinsic: '_mm512_mask_permutexvar_epi8'.
// Requires AVX512VBMI.
func M512MaskPermutexvarEpi8(src x86.M512i, k x86.Mmask64, idx x86.M512i, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskPermutexvarEpi8([64]byte(src), uint64(k), [64]byte(idx), [64]byte(a)))
}
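
// A minimal pure-Go reference for the byte-permute pseudocode above, handy
// for testing on machines without AVX512VBMI. The name and unpacked
// signature are illustrative, not part of this package's API; the same loop
// with the mask test dropped (or with the ELSE branch zeroing) models the
// plain and maskz variants below.
func maskPermutexvarEpi8Ref(src [64]byte, k uint64, idx, a [64]byte) (dst [64]byte) {
	for j := 0; j < 64; j++ {
		if k&(1<<j) != 0 {
			dst[j] = a[idx[j]&63] // id := idx[i+5:i]: the low 6 bits pick the source byte
		} else {
			dst[j] = src[j] // writemask bit clear: copy through from src
		}
	}
	return
}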
// M512MaskzPermutexvarEpi8: Shuffle 8-bit integers in 'a' across lanes using
// the corresponding index in 'idx', and store the results in 'dst' using
// zeromask 'k' (elements are zeroed out when the corresponding mask bit is not
// set).
//
//		FOR j := 0 to 63
//			i := j*8
//			id := idx[i+5:i]*8
//			IF k[j]
//				dst[i+7:i] := a[id+7:id]
//			ELSE
//				dst[i+7:i] := 0
//			FI
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPERMB'. Intrinsic: '_mm512_maskz_permutexvar_epi8'.
// Requires AVX512VBMI.
func M512MaskzPermutexvarEpi8(k x86.Mmask64, idx x86.M512i, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzPermutexvarEpi8(uint64(k), [64]byte(idx), [64]byte(a)))
}
// M512Mask2Permutex2varEpi8: Shuffle 8-bit integers in 'a' and 'b' across
// lanes using the corresponding selector and index in 'idx', and store the
// results in 'dst' using writemask 'k' (elements are copied from 'idx' when
// the corresponding mask bit is not set).
//
//		FOR j := 0 to 63
//			i := j*8
//			IF k[j]
//				off := 8*idx[i+5:i]
//				dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
//			ELSE
//				dst[i+7:i] := idx[i+7:i]
//			FI
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPERMI2B'. Intrinsic: '_mm512_mask2_permutex2var_epi8'.
// Requires AVX512VBMI.
func M512Mask2Permutex2varEpi8(a x86.M512i, idx x86.M512i, k x86.Mmask64, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512Mask2Permutex2varEpi8([64]byte(a), [64]byte(idx), uint64(k), [64]byte(b)))
}
// M512Permutex2varEpi8: Shuffle 8-bit integers in 'a' and 'b' across lanes
// using the corresponding selector and index in 'idx', and store the results
// in 'dst'.
//
//		FOR j := 0 to 63
//			i := j*8
//			off := 8*idx[i+5:i]
//			dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPERMI2B'. Intrinsic: '_mm512_permutex2var_epi8'.
// Requires AVX512VBMI.
func M512Permutex2varEpi8(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512Permutex2varEpi8([64]byte(a), [64]byte(idx), [64]byte(b)))
}
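
// A minimal pure-Go reference for the unmasked two-source permute above;
// illustrative only. Bit 6 of each index byte selects between 'a' and 'b',
// and the low six bits address a byte within the chosen source.
func permutex2varEpi8Ref(a, idx, b [64]byte) (dst [64]byte) {
	for j := 0; j < 64; j++ {
		off := idx[j] & 63
		if idx[j]&64 != 0 { // idx[i+6] set: take the byte from b
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return
}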
// M512ConflictEpi64: Test each 64-bit element of 'a' for equality with all
// other elements in 'a' closer to the least significant bit. Each element's
// comparison forms a zero-extended bit vector in 'dst'.
//
//		FOR j := 0 to 7
//			i := j*64
//			FOR k := 0 to j-1
//				m := k*64
//				dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
//			ENDFOR
//			dst[i+63:i+j] := 0
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPCONFLICTQ'. Intrinsic: '_mm512_conflict_epi64'.
// Requires AVX512CD.
func M512ConflictEpi64(a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512ConflictEpi64([64]byte(a)))
}
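
// A minimal pure-Go sketch of the VPCONFLICTQ semantics spelled out above,
// operating on [8]uint64 lanes rather than the packed [64]byte form the
// wrappers use; the name is illustrative. Lane j receives one bit per
// lower-indexed lane that compares equal, so a result of 0 means "no
// conflict with any earlier element".
func conflictEpi64Ref(a [8]uint64) (dst [8]uint64) {
	for j := 0; j < 8; j++ {
		for k := 0; k < j; k++ {
			if a[j] == a[k] {
				dst[j] |= 1 << k // dst[i+k] := (a[j] == a[k]) ? 1 : 0
			}
		}
	}
	return
}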
// M512Madd52loEpu64: Multiply packed unsigned 52-bit integers in each 64-bit
// element of 'b' and 'c' to form a 104-bit intermediate result. Add the low
// 52-bit unsigned integer from the intermediate result with the corresponding
// unsigned 64-bit integer in 'a', and store the results in 'dst'.
//
//		FOR j := 0 to 7
//			i := j*64
//			tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
//			dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPMADD52LUQ'. Intrinsic: '_mm512_madd52lo_epu64'.
// Requires AVX512IFMA52.
func M512Madd52loEpu64(a x86.M512i, b x86.M512i, c x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512Madd52loEpu64([64]byte(a), [64]byte(b), [64]byte(c)))
}
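
// A minimal pure-Go sketch of the 52-bit multiply-add above, again on
// [8]uint64 lanes; illustrative only, and it assumes "math/bits" is imported
// at the top of the file. bits.Mul64 supplies the full 104-bit intermediate
// product; the lo form (VPMADD52LUQ) adds tmp[51:0], while the hi form
// (VPMADD52HUQ, see M512MaskzMadd52hiEpu64 below) adds tmp[103:52].
func madd52Epu64Ref(a, b, c [8]uint64, high bool) (dst [8]uint64) {
	const mask52 = 1<<52 - 1
	for j := 0; j < 8; j++ {
		hi, lo := bits.Mul64(b[j]&mask52, c[j]&mask52) // 104-bit product in hi:lo
		p := lo & mask52                               // tmp[51:0]
		if high {
			p = lo>>52 | hi<<12 // tmp[103:52]; already < 2^52 since hi < 2^40
		}
		dst[j] = a[j] + p
	}
	return
}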
// M512MaskLzcntEpi64: Count the number of leading zero bits in each packed
// 64-bit integer in 'a', and store the results in 'dst' using writemask 'k'
// (elements are copied from 'src' when the corresponding mask bit is not set).
//
//		FOR j := 0 to 7
//			i := j*64
//			IF k[j]
//				tmp := 63
//				dst[i+63:i] := 0
//				DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//					tmp := tmp - 1
//					dst[i+63:i] := dst[i+63:i] + 1
//				OD
//			ELSE
//				dst[i+63:i] := src[i+63:i]
//			FI
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPLZCNTQ'. Intrinsic: '_mm512_mask_lzcnt_epi64'.
// Requires AVX512CD.
func M512MaskLzcntEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskLzcntEpi64([64]byte(src), uint8(k), [64]byte(a)))
}
// M512MaskzMultishiftEpi64Epi8: For each 64-bit element in 'b', select 8
// unaligned bytes using a byte-granular shift control within the corresponding
// 64-bit element of 'a', and store the 8 assembled bytes to the corresponding
// 64-bit element of 'dst' using zeromask 'k' (elements are zeroed out when the
// corresponding mask bit is not set).
//
//		FOR i := 0 to 7
//			q := i * 64
//			FOR j := 0 to 7
//				tmp8 := 0
//				ctrl := a[q+j*8+7:q+j*8] & 63
//				FOR l := 0 to 7
//					tmp8[l] := b[q+((ctrl+l) & 63)]
//				ENDFOR
//				IF k[i*8+j]
//					dst[q+j*8+7:q+j*8] := tmp8[7:0]
//				ELSE
//					dst[q+j*8+7:q+j*8] := 0
//				FI
//			ENDFOR
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPMULTISHIFTQB'. Intrinsic: '_mm512_maskz_multishift_epi64_epi8'.
// Requires AVX512VBMI.
func M512MaskzMultishiftEpi64Epi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzMultishiftEpi64Epi8(uint64(k), [64]byte(a), [64]byte(b)))
}
// M512MaskzLzcntEpi32: Count the number of leading zero bits in each packed
// 32-bit integer in 'a', and store the results in 'dst' using zeromask 'k'
// (elements are zeroed out when the corresponding mask bit is not set).
//
//		FOR j := 0 to 15
//			i := j*32
//			IF k[j]
//				tmp := 31
//				dst[i+31:i] := 0
//				DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//					tmp := tmp - 1
//					dst[i+31:i] := dst[i+31:i] + 1
//				OD
//			ELSE
//				dst[i+31:i] := 0
//			FI
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPLZCNTD'. Intrinsic: '_mm512_maskz_lzcnt_epi32'.
// Requires AVX512CD.
func M512MaskzLzcntEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzLzcntEpi32(uint16(k), [64]byte(a)))
}
// M512LzcntEpi64: Count the number of leading zero bits in each packed 64-bit
// integer in 'a', and store the results in 'dst'.
//
//		FOR j := 0 to 7
//			i := j*64
//			tmp := 63
//			dst[i+63:i] := 0
//			DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//				tmp := tmp - 1
//				dst[i+63:i] := dst[i+63:i] + 1
//			OD
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPLZCNTQ'. Intrinsic: '_mm512_lzcnt_epi64'.
// Requires AVX512CD.
func M512LzcntEpi64(a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512LzcntEpi64([64]byte(a)))
}
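
// A minimal pure-Go sketch of the per-lane leading-zero count; illustrative
// only, assuming "math/bits" is imported. bits.LeadingZeros64 returns 64 for
// a zero lane, which matches the DO WHILE loop above running to completion.
func lzcntEpi64Ref(a [8]uint64) (dst [8]uint64) {
	for j := range a {
		dst[j] = uint64(bits.LeadingZeros64(a[j]))
	}
	return
}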
// M512MaskLzcntEpi32: Count the number of leading zero bits in each packed
// 32-bit integer in 'a', and store the results in 'dst' using writemask 'k'
// (elements are copied from 'src' when the corresponding mask bit is not set).
//
//		FOR j := 0 to 15
//			i := j*32
//			IF k[j]
//				tmp := 31
//				dst[i+31:i] := 0
//				DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//					tmp := tmp - 1
//					dst[i+31:i] := dst[i+31:i] + 1
//				OD
//			ELSE
//				dst[i+31:i] := src[i+31:i]
//			FI
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPLZCNTD'. Intrinsic: '_mm512_mask_lzcnt_epi32'.
// Requires AVX512CD.
func M512MaskLzcntEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskLzcntEpi32([64]byte(src), uint16(k), [64]byte(a)))
}
// M512MaskzConflictEpi64: Test each 64-bit element of 'a' for equality with
// all other elements in 'a' closer to the least significant bit using zeromask
// 'k' (elements are zeroed out when the corresponding mask bit is not set).
// Each element's comparison forms a zero-extended bit vector in 'dst'.
//
//		FOR j := 0 to 7
//			i := j*64
//			IF k[j]
//				FOR l := 0 to j-1
//					m := l*64
//					dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
//				ENDFOR
//				dst[i+63:i+j] := 0
//			ELSE
//				dst[i+63:i] := 0
//			FI
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPCONFLICTQ'. Intrinsic: '_mm512_maskz_conflict_epi64'.
// Requires AVX512CD.
func M512MaskzConflictEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzConflictEpi64(uint8(k), [64]byte(a)))
}
// M512BroadcastmbEpi64: Broadcast the low 8 bits from input mask 'k' to all
// 64-bit elements of 'dst'.
//
//		FOR j := 0 to 7
//			i := j*64
//			dst[i+63:i] := ZeroExtend(k[7:0])
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPBROADCASTMB2Q'. Intrinsic: '_mm512_broadcastmb_epi64'.
// Requires AVX512CD.
func M512BroadcastmbEpi64(k x86.Mmask8) (dst x86.M512i) {
	return x86.M512i(m512BroadcastmbEpi64(uint8(k)))
}
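
// A minimal pure-Go sketch of the mask broadcast above; illustrative only.
// Every 64-bit lane receives the zero-extended low byte of 'k'. The
// VPBROADCASTMW2D form (M512BroadcastmwEpi32 below) is analogous, fanning
// the low 16 bits of the mask out to each 32-bit lane.
func broadcastmbEpi64Ref(k uint8) (dst [8]uint64) {
	for j := range dst {
		dst[j] = uint64(k) // ZeroExtend(k[7:0])
	}
	return
}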
// M512PermutexvarEpi8: Shuffle 8-bit integers in 'a' across lanes using the
// corresponding index in 'idx', and store the results in 'dst'.
//
//		FOR j := 0 to 63
//			i := j*8
//			id := idx[i+5:i]*8
//			dst[i+7:i] := a[id+7:id]
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPERMB'. Intrinsic: '_mm512_permutexvar_epi8'.
// Requires AVX512VBMI.
func M512PermutexvarEpi8(idx x86.M512i, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512PermutexvarEpi8([64]byte(idx), [64]byte(a)))
}
// M512MaskMadd52loEpu64: Multiply packed unsigned 52-bit integers in each
// 64-bit element of 'b' and 'c' to form a 104-bit intermediate result. Add the
// low 52-bit unsigned integer from the intermediate result with the
// corresponding unsigned 64-bit integer in 'a', and store the results in 'dst'
// using writemask 'k' (elements are copied from 'a' when the corresponding
// mask bit is not set).
//
//		FOR j := 0 to 7
//			i := j*64
//			IF k[j]
//				tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
//				dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
//			ELSE
//				dst[i+63:i] := a[i+63:i]
//			FI
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPMADD52LUQ'. Intrinsic: '_mm512_mask_madd52lo_epu64'.
// Requires AVX512IFMA52.
func M512MaskMadd52loEpu64(a x86.M512i, k x86.Mmask8, b x86.M512i, c x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskMadd52loEpu64([64]byte(a), uint8(k), [64]byte(b), [64]byte(c)))
}
// M512MaskMultishiftEpi64Epi8: For each 64-bit element in 'b', select 8
// unaligned bytes using a byte-granular shift control within the corresponding
// 64-bit element of 'a', and store the 8 assembled bytes to the corresponding
// 64-bit element of 'dst' using writemask 'k' (elements are copied from 'src'
// when the corresponding mask bit is not set).
//
//		FOR i := 0 to 7
//			q := i * 64
//			FOR j := 0 to 7
//				tmp8 := 0
//				ctrl := a[q+j*8+7:q+j*8] & 63
//				FOR l := 0 to 7
//					tmp8[l] := b[q+((ctrl+l) & 63)]
//				ENDFOR
//				IF k[i*8+j]
//					dst[q+j*8+7:q+j*8] := tmp8[7:0]
//				ELSE
//					dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
//				FI
//			ENDFOR
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPMULTISHIFTQB'. Intrinsic: '_mm512_mask_multishift_epi64_epi8'.
// Requires AVX512VBMI.
func M512MaskMultishiftEpi64Epi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskMultishiftEpi64Epi8([64]byte(src), uint64(k), [64]byte(a), [64]byte(b)))
}
// M512MaskzMadd52hiEpu64: Multiply packed unsigned 52-bit integers in each
// 64-bit element of 'b' and 'c' to form a 104-bit intermediate result. Add the
// high 52-bit unsigned integer from the intermediate result with the
// corresponding unsigned 64-bit integer in 'a', and store the results in 'dst'
// using zeromask 'k' (elements are zeroed out when the corresponding mask bit
// is not set).
//
//		FOR j := 0 to 7
//			i := j*64
//			IF k[j]
//				tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
//				dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
//			ELSE
//				dst[i+63:i] := 0
//			FI
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPMADD52HUQ'. Intrinsic: '_mm512_maskz_madd52hi_epu64'.
// Requires AVX512IFMA52.
func M512MaskzMadd52hiEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i, c x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzMadd52hiEpu64(uint8(k), [64]byte(a), [64]byte(b), [64]byte(c)))
}
// M512MultishiftEpi64Epi8: For each 64-bit element in 'b', select 8 unaligned
// bytes using a byte-granular shift control within the corresponding 64-bit
// element of 'a', and store the 8 assembled bytes to the corresponding 64-bit
// element of 'dst'.
//
//		FOR i := 0 to 7
//			q := i * 64
//			FOR j := 0 to 7
//				tmp8 := 0
//				ctrl := a[q+j*8+7:q+j*8] & 63
//				FOR l := 0 to 7
//					tmp8[l] := b[q+((ctrl+l) & 63)]
//				ENDFOR
//				dst[q+j*8+7:q+j*8] := tmp8[7:0]
//			ENDFOR
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPMULTISHIFTQB'. Intrinsic: '_mm512_multishift_epi64_epi8'.
// Requires AVX512VBMI.
func M512MultishiftEpi64Epi8(a x86.M512i, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MultishiftEpi64Epi8([64]byte(a), [64]byte(b)))
}
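
// A minimal pure-Go sketch of the unmasked multishift above, on [8]uint64
// lanes; illustrative only, assuming "math/bits" is imported. Because bit l
// of each result byte comes from bit (ctrl+l) mod 64 of the source qword,
// assembling a byte amounts to a rotate-right by ctrl followed by a mask.
func multishiftEpi64Epi8Ref(a, b [8]uint64) (dst [8]uint64) {
	for i := 0; i < 8; i++ {
		var out uint64
		for j := 0; j < 8; j++ {
			ctrl := int((a[i] >> (8 * j)) & 63)         // low 6 bits of control byte j
			b8 := bits.RotateLeft64(b[i], -ctrl) & 0xff // tmp8: bits ctrl..ctrl+7 of b[i], wrapping
			out |= b8 << (8 * j)
		}
		dst[i] = out
	}
	return
}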
// M512BroadcastmwEpi32: Broadcast the low 16 bits from input mask 'k' to all
// 32-bit elements of 'dst'.
//
//		FOR j := 0 to 15
//			i := j*32
//			dst[i+31:i] := ZeroExtend(k[15:0])
//		ENDFOR
//		dst[MAX:512] := 0
//
// Instruction: 'VPBROADCASTMW2D'. Intrinsic: '_mm512_broadcastmw_epi32'.
// Requires AVX512CD.
func M512BroadcastmwEpi32(k x86.Mmask16) (dst x86.M512i) {
	return x86.M512i(m512BroadcastmwEpi32(uint16(k)))
}