// M512MaskPermutexvarEpi8: Shuffle 8-bit integers in 'a' across lanes using
// the corresponding index in 'idx', and store the results in 'dst' using
// writemask 'k' (elements are copied from 'src' when the corresponding mask
// bit is not set).
//
//	FOR j := 0 to 63
//		i := j*8
//		id := idx[i+5:i]*8
//		IF k[j]
//			dst[i+7:i] := a[id+7:id]
//		ELSE
//			dst[i+7:i] := src[i+7:i]
//		FI
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPERMB'. Intrinsic: '_mm512_mask_permutexvar_epi8'.
// Requires AVX512VBMI.
func M512MaskPermutexvarEpi8(src x86.M512i, k x86.Mmask64, idx x86.M512i, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskPermutexvarEpi8([64]byte(src), uint64(k), [64]byte(idx), [64]byte(a)))
}

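// The masked variants in this file all blend per the same rule: result
// element j is the computed value when mask bit j is set, and the
// pass-through operand otherwise. A minimal pure-Go sketch of that writemask
// blend for byte elements; blendBytes is a hypothetical helper, not part of
// this package's API.
func blendBytes(k uint64, src, computed [64]byte) (dst [64]byte) {
	for j := 0; j < 64; j++ {
		if k&(1<<j) != 0 {
			dst[j] = computed[j] // mask bit set: keep the computed element
		} else {
			dst[j] = src[j] // mask bit clear: copy through from 'src'
		}
	}
	return
}
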
// M512MaskzPermutexvarEpi8: Shuffle 8-bit integers in 'a' across lanes using
// the corresponding index in 'idx', and store the results in 'dst' using
// zeromask 'k' (elements are zeroed out when the corresponding mask bit is not
// set).
//
//	FOR j := 0 to 63
//		i := j*8
//		id := idx[i+5:i]*8
//		IF k[j]
//			dst[i+7:i] := a[id+7:id]
//		ELSE
//			dst[i+7:i] := 0
//		FI
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPERMB'. Intrinsic: '_mm512_maskz_permutexvar_epi8'.
// Requires AVX512VBMI.
func M512MaskzPermutexvarEpi8(k x86.Mmask64, idx x86.M512i, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzPermutexvarEpi8(uint64(k), [64]byte(idx), [64]byte(a)))
}

// M512Mask2Permutex2varEpi8: Shuffle 8-bit integers in 'a' and 'b' across
// lanes using the corresponding selector and index in 'idx', and store the
// results in 'dst' using writemask 'k' (elements are copied from 'idx' when
// the corresponding mask bit is not set).
//
//	FOR j := 0 to 63
//		i := j*8
//		IF k[j]
//			off := 8*idx[i+5:i]
//			dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
//		ELSE
//			dst[i+7:i] := idx[i+7:i]
//		FI
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPERMI2B'. Intrinsic: '_mm512_mask2_permutex2var_epi8'.
// Requires AVX512VBMI.
func M512Mask2Permutex2varEpi8(a x86.M512i, idx x86.M512i, k x86.Mmask64, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512Mask2Permutex2varEpi8([64]byte(a), [64]byte(idx), uint64(k), [64]byte(b)))
}

// M512Permutex2varEpi8: Shuffle 8-bit integers in 'a' and 'b' across lanes
// using the corresponding selector and index in 'idx', and store the results
// in 'dst'.
//
//	FOR j := 0 to 63
//		i := j*8
//		off := 8*idx[i+5:i]
//		dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPERMI2B'. Intrinsic: '_mm512_permutex2var_epi8'.
// Requires AVX512VBMI.
func M512Permutex2varEpi8(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512Permutex2varEpi8([64]byte(a), [64]byte(idx), [64]byte(b)))
}

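// A minimal pure-Go reference sketch of the VPERMI2B semantics above, useful
// for tests or fallback paths; refPermutex2varEpi8 is a hypothetical name,
// not part of the generated API.
func refPermutex2varEpi8(a, idx, b [64]byte) (dst [64]byte) {
	for j := 0; j < 64; j++ {
		off := idx[j] & 0x3F // low 6 bits select a byte position
		if idx[j]&0x40 != 0 {
			dst[j] = b[off] // bit 6 set: select from 'b'
		} else {
			dst[j] = a[off] // bit 6 clear: select from 'a'
		}
	}
	return
}
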
// M512ConflictEpi64: Test each 64-bit element of 'a' for equality with all
// other elements in 'a' closer to the least significant bit. Each element's
// comparison forms a zero extended bit vector in 'dst'.
//
//	FOR j := 0 to 7
//		i := j*64
//		FOR k := 0 to j-1
//			m := k*64
//			dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
//		ENDFOR
//		dst[i+63:i+j] := 0
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPCONFLICTQ'. Intrinsic: '_mm512_conflict_epi64'.
// Requires AVX512CD.
func M512ConflictEpi64(a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512ConflictEpi64([64]byte(a)))
}

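// A minimal pure-Go reference sketch of the VPCONFLICTQ semantics above: bit
// l of result element j is set when element j equals the earlier element l.
// refConflictEpi64 is a hypothetical name, and [8]uint64 is used in place of
// the packed [64]byte representation for clarity.
func refConflictEpi64(a [8]uint64) (dst [8]uint64) {
	for j := 0; j < 8; j++ {
		for l := 0; l < j; l++ {
			if a[j] == a[l] {
				dst[j] |= 1 << l // record a match with an earlier element
			}
		}
	}
	return
}
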
// M512Madd52loEpu64: Multiply packed unsigned 52-bit integers in each 64-bit
// element of 'b' and 'c' to form a 104-bit intermediate result. Add the low
// 52-bit unsigned integer from the intermediate result with the corresponding
// unsigned 64-bit integer in 'a', and store the results in 'dst'.
//
//	FOR j := 0 to 7
//		i := j*64
//		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
//		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPMADD52LUQ'. Intrinsic: '_mm512_madd52lo_epu64'.
// Requires AVX512IFMA52.
func M512Madd52loEpu64(a x86.M512i, b x86.M512i, c x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512Madd52loEpu64([64]byte(a), [64]byte(b), [64]byte(c)))
}

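// A minimal pure-Go sketch of the VPMADD52LUQ semantics above for a single
// 64-bit element: the low 52 bits of the 104-bit product are unaffected by
// 64-bit truncation, so plain uint64 multiplication suffices. refMadd52lo is
// a hypothetical name.
func refMadd52lo(a, b, c uint64) uint64 {
	const mask52 = (1 << 52) - 1
	prod := (b & mask52) * (c & mask52) // low 64 bits of the 104-bit product
	return a + (prod & mask52)          // add the low 52 product bits to 'a'
}
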
// M512MaskLzcntEpi64: Count the number of leading zero bits in each packed
// 64-bit integer in 'a', and store the results in 'dst' using writemask 'k'
// (elements are copied from 'src' when the corresponding mask bit is not set).
//
//	FOR j := 0 to 7
//		i := j*64
//		IF k[j]
//			tmp := 63
//			dst[i+63:i] := 0
//			DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//				tmp := tmp - 1
//				dst[i+63:i] := dst[i+63:i] + 1
//			OD
//		ELSE
//			dst[i+63:i] := src[i+63:i]
//		FI
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPLZCNTQ'. Intrinsic: '_mm512_mask_lzcnt_epi64'.
// Requires AVX512CD.
func M512MaskLzcntEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskLzcntEpi64([64]byte(src), uint8(k), [64]byte(a)))
}

// M512MaskzMultishiftEpi64Epi8: For each 64-bit element in 'b', select 8
// unaligned bytes using a byte-granular shift control within the corresponding
// 64-bit element of 'a', and store the 8 assembled bytes to the corresponding
// 64-bit element of 'dst' using zeromask 'k' (elements are zeroed out when the
// corresponding mask bit is not set).
//
//	FOR i := 0 to 7
//		q := i * 64
//		FOR j := 0 to 7
//			tmp8 := 0
//			ctrl := a[q+j*8+7:q+j*8] & 63
//			FOR l := 0 to 7
//				tmp8[l] := b[q+((ctrl+l) & 63)]
//			ENDFOR
//			IF k[i*8+j]
//				dst[q+j*8+7:q+j*8] := tmp8[7:0]
//			ELSE
//				dst[q+j*8+7:q+j*8] := 0
//			FI
//		ENDFOR
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPMULTISHIFTQB'. Intrinsic: '_mm512_maskz_multishift_epi64_epi8'.
// Requires AVX512VBMI.
func M512MaskzMultishiftEpi64Epi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzMultishiftEpi64Epi8(uint64(k), [64]byte(a), [64]byte(b)))
}

// M512MaskzLzcntEpi32: Count the number of leading zero bits in each packed
// 32-bit integer in 'a', and store the results in 'dst' using zeromask 'k'
// (elements are zeroed out when the corresponding mask bit is not set).
//
//	FOR j := 0 to 15
//		i := j*32
//		IF k[j]
//			tmp := 31
//			dst[i+31:i] := 0
//			DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//				tmp := tmp - 1
//				dst[i+31:i] := dst[i+31:i] + 1
//			OD
//		ELSE
//			dst[i+31:i] := 0
//		FI
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPLZCNTD'. Intrinsic: '_mm512_maskz_lzcnt_epi32'.
// Requires AVX512CD.
func M512MaskzLzcntEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzLzcntEpi32(uint16(k), [64]byte(a)))
}

// M512LzcntEpi64: Count the number of leading zero bits in each packed 64-bit
// integer in 'a', and store the results in 'dst'.
//
//	FOR j := 0 to 7
//		i := j*64
//		tmp := 63
//		dst[i+63:i] := 0
//		DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//			tmp := tmp - 1
//			dst[i+63:i] := dst[i+63:i] + 1
//		OD
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPLZCNTQ'. Intrinsic: '_mm512_lzcnt_epi64'.
// Requires AVX512CD.
func M512LzcntEpi64(a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512LzcntEpi64([64]byte(a)))
}

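// The DO WHILE loop above is simply a leading-zero count, which Go's standard
// library provides directly. A minimal per-element sketch; refLzcntEpi64 is a
// hypothetical name, and "math/bits" is assumed to be imported.
func refLzcntEpi64(a [8]uint64) (dst [8]uint64) {
	for j := range a {
		dst[j] = uint64(bits.LeadingZeros64(a[j])) // 64 when the element is zero
	}
	return
}
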
// M512MaskLzcntEpi32: Count the number of leading zero bits in each packed
// 32-bit integer in 'a', and store the results in 'dst' using writemask 'k'
// (elements are copied from 'src' when the corresponding mask bit is not set).
//
//	FOR j := 0 to 15
//		i := j*32
//		IF k[j]
//			tmp := 31
//			dst[i+31:i] := 0
//			DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
//				tmp := tmp - 1
//				dst[i+31:i] := dst[i+31:i] + 1
//			OD
//		ELSE
//			dst[i+31:i] := src[i+31:i]
//		FI
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPLZCNTD'. Intrinsic: '_mm512_mask_lzcnt_epi32'.
// Requires AVX512CD.
func M512MaskLzcntEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskLzcntEpi32([64]byte(src), uint16(k), [64]byte(a)))
}

// M512MaskzConflictEpi64: Test each 64-bit element of 'a' for equality with
// all other elements in 'a' closer to the least significant bit using zeromask
// 'k' (elements are zeroed out when the corresponding mask bit is not set).
// Each element's comparison forms a zero extended bit vector in 'dst'.
//
//	FOR j := 0 to 7
//		i := j*64
//		IF k[j]
//			FOR l := 0 to j-1
//				m := l*64
//				dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
//			ENDFOR
//			dst[i+63:i+j] := 0
//		ELSE
//			dst[i+63:i] := 0
//		FI
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPCONFLICTQ'. Intrinsic: '_mm512_maskz_conflict_epi64'.
// Requires AVX512CD.
func M512MaskzConflictEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzConflictEpi64(uint8(k), [64]byte(a)))
}

// M512BroadcastmbEpi64: Broadcast the low 8 bits from input mask 'k' to all
// 64-bit elements of 'dst'.
//
//	FOR j := 0 to 7
//		i := j*64
//		dst[i+63:i] := ZeroExtend(k[7:0])
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPBROADCASTMB2Q'. Intrinsic: '_mm512_broadcastmb_epi64'.
// Requires AVX512CD.
func M512BroadcastmbEpi64(k x86.Mmask8) (dst x86.M512i) {
	return x86.M512i(m512BroadcastmbEpi64(uint8(k)))
}

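// A minimal pure-Go sketch of the VPBROADCASTMB2Q semantics above: the mask
// byte is zero-extended into every 64-bit element. refBroadcastmbEpi64 is a
// hypothetical name.
func refBroadcastmbEpi64(k uint8) (dst [8]uint64) {
	for j := range dst {
		dst[j] = uint64(k) // zero-extend the low 8 mask bits into each element
	}
	return
}
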
// M512PermutexvarEpi8: Shuffle 8-bit integers in 'a' across lanes using the
// corresponding index in 'idx', and store the results in 'dst'.
//
//	FOR j := 0 to 63
//		i := j*8
//		id := idx[i+5:i]*8
//		dst[i+7:i] := a[id+7:id]
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPERMB'. Intrinsic: '_mm512_permutexvar_epi8'.
// Requires AVX512VBMI.
func M512PermutexvarEpi8(idx x86.M512i, a x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512PermutexvarEpi8([64]byte(idx), [64]byte(a)))
}

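// A minimal pure-Go reference sketch of the VPERMB semantics above: only the
// low 6 bits of each index byte are used, so every index is in range.
// refPermutexvarEpi8 is a hypothetical name, not part of the generated API.
func refPermutexvarEpi8(idx, a [64]byte) (dst [64]byte) {
	for j := 0; j < 64; j++ {
		dst[j] = a[idx[j]&0x3F] // low 6 bits select one of the 64 source bytes
	}
	return
}
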
// M512MaskMadd52loEpu64: Multiply packed unsigned 52-bit integers in each
// 64-bit element of 'b' and 'c' to form a 104-bit intermediate result. Add the
// low 52-bit unsigned integer from the intermediate result with the
// corresponding unsigned 64-bit integer in 'a', and store the results in 'dst'
// using writemask 'k' (elements are copied from 'a' when the corresponding
// mask bit is not set).
//
//	FOR j := 0 to 7
//		i := j*64
//		IF k[j]
//			tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
//			dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
//		ELSE
//			dst[i+63:i] := a[i+63:i]
//		FI
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPMADD52LUQ'. Intrinsic: '_mm512_mask_madd52lo_epu64'.
// Requires AVX512IFMA52.
func M512MaskMadd52loEpu64(a x86.M512i, k x86.Mmask8, b x86.M512i, c x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskMadd52loEpu64([64]byte(a), uint8(k), [64]byte(b), [64]byte(c)))
}

// M512MaskMultishiftEpi64Epi8: For each 64-bit element in 'b', select 8
// unaligned bytes using a byte-granular shift control within the corresponding
// 64-bit element of 'a', and store the 8 assembled bytes to the corresponding
// 64-bit element of 'dst' using writemask 'k' (elements are copied from 'src'
// when the corresponding mask bit is not set).
//
//	FOR i := 0 to 7
//		q := i * 64
//		FOR j := 0 to 7
//			tmp8 := 0
//			ctrl := a[q+j*8+7:q+j*8] & 63
//			FOR l := 0 to 7
//				tmp8[l] := b[q+((ctrl+l) & 63)]
//			ENDFOR
//			IF k[i*8+j]
//				dst[q+j*8+7:q+j*8] := tmp8[7:0]
//			ELSE
//				dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
//			FI
//		ENDFOR
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPMULTISHIFTQB'. Intrinsic: '_mm512_mask_multishift_epi64_epi8'.
// Requires AVX512VBMI.
func M512MaskMultishiftEpi64Epi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskMultishiftEpi64Epi8([64]byte(src), uint64(k), [64]byte(a), [64]byte(b)))
}

// M512MaskzMadd52hiEpu64: Multiply packed unsigned 52-bit integers in each
// 64-bit element of 'b' and 'c' to form a 104-bit intermediate result. Add the
// high 52-bit unsigned integer from the intermediate result with the
// corresponding unsigned 64-bit integer in 'a', and store the results in 'dst'
// using zeromask 'k' (elements are zeroed out when the corresponding mask bit
// is not set).
//
//	FOR j := 0 to 7
//		i := j*64
//		IF k[j]
//			tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
//			dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
//		ELSE
//			dst[i+63:i] := 0
//		FI
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPMADD52HUQ'. Intrinsic: '_mm512_maskz_madd52hi_epu64'.
// Requires AVX512IFMA52.
func M512MaskzMadd52hiEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i, c x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MaskzMadd52hiEpu64(uint8(k), [64]byte(a), [64]byte(b), [64]byte(c)))
}

// M512MultishiftEpi64Epi8: For each 64-bit element in 'b', select 8 unaligned
// bytes using a byte-granular shift control within the corresponding 64-bit
// element of 'a', and store the 8 assembled bytes to the corresponding 64-bit
// element of 'dst'.
//
//	FOR i := 0 to 7
//		q := i * 64
//		FOR j := 0 to 7
//			tmp8 := 0
//			ctrl := a[q+j*8+7:q+j*8] & 63
//			FOR l := 0 to 7
//				tmp8[l] := b[q+((ctrl+l) & 63)]
//			ENDFOR
//			dst[q+j*8+7:q+j*8] := tmp8[7:0]
//		ENDFOR
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPMULTISHIFTQB'. Intrinsic: '_mm512_multishift_epi64_epi8'.
// Requires AVX512VBMI.
func M512MultishiftEpi64Epi8(a x86.M512i, b x86.M512i) (dst x86.M512i) {
	return x86.M512i(m512MultishiftEpi64Epi8([64]byte(a), [64]byte(b)))
}

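// A minimal pure-Go reference sketch of the VPMULTISHIFTQB semantics above.
// Selecting bits ctrl..ctrl+7 (mod 64) of a qword is a right rotation by
// ctrl followed by truncation to 8 bits. refMultishiftEpi64Epi8 is a
// hypothetical name, and "math/bits" is assumed to be imported.
func refMultishiftEpi64Epi8(a, b [8]uint64) (dst [8]uint64) {
	for i := 0; i < 8; i++ {
		for j := 0; j < 8; j++ {
			ctrl := int((a[i] >> (8 * j)) & 63)              // shift control: low 6 bits of byte j of a[i]
			sel := byte(bits.RotateLeft64(b[i], -ctrl))      // rotate right by ctrl, keep the low byte
			dst[i] |= uint64(sel) << (8 * j)                 // place the assembled byte
		}
	}
	return
}
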
// M512BroadcastmwEpi32: Broadcast the low 16 bits from input mask 'k' to all
// 32-bit elements of 'dst'.
//
//	FOR j := 0 to 15
//		i := j*32
//		dst[i+31:i] := ZeroExtend(k[15:0])
//	ENDFOR
//	dst[MAX:512] := 0
//
// Instruction: 'VPBROADCASTMW2D'. Intrinsic: '_mm512_broadcastmw_epi32'.
// Requires AVX512CD.
func M512BroadcastmwEpi32(k x86.Mmask16) (dst x86.M512i) {
	return x86.M512i(m512BroadcastmwEpi32(uint16(k)))
}