// CvtphPs: Convert packed half-precision (16-bit) floating-point elements in // 'a' to packed single-precision (32-bit) floating-point elements, and store // the results in 'dst'. // // FOR j := 0 to 3 // i := j*32 // m := j*16 // dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) // ENDFOR // dst[MAX:128] := 0 // // Instruction: 'VCVTPH2PS'. Intrinsic: '_mm_cvtph_ps'. // Requires FP16C. func CvtphPs(a x86.M128i) (dst x86.M128) { return x86.M128(cvtphPs([16]byte(a))) }
// AddsubPs: Alternatively add and subtract packed single-precision (32-bit) // floating-point elements in 'a' to/from packed elements in 'b', and store the // results in 'dst'. // // FOR j := 0 to 3 // i := j*32 // IF (j is even) // dst[i+31:i] := a[i+31:i] - b[i+31:i] // ELSE // dst[i+31:i] := a[i+31:i] + b[i+31:i] // FI // ENDFOR // // Instruction: 'ADDSUBPS'. Intrinsic: '_mm_addsub_ps'. // Requires SSE3. func AddsubPs(a x86.M128, b x86.M128) (dst x86.M128) { return x86.M128(addsubPs([4]float32(a), [4]float32(b))) }
// HaddPs: Horizontally add adjacent pairs of single-precision (32-bit) // floating-point elements in 'a' and 'b', and pack the results in 'dst'. // // dst[31:0] := a[63:32] + a[31:0] // dst[63:32] := a[127:96] + a[95:64] // dst[95:64] := b[63:32] + b[31:0] // dst[127:96] := b[127:96] + b[95:64] // // Instruction: 'HADDPS'. Intrinsic: '_mm_hadd_ps'. // Requires SSE3. func HaddPs(a x86.M128, b x86.M128) (dst x86.M128) { return x86.M128(haddPs([4]float32(a), [4]float32(b))) }
// MoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point // elements from 'a', and store the results in 'dst'. // // dst[31:0] := a[31:0] // dst[63:32] := a[31:0] // dst[95:64] := a[95:64] // dst[127:96] := a[95:64] // // Instruction: 'MOVSLDUP'. Intrinsic: '_mm_moveldup_ps'. // Requires SSE3. func MoveldupPs(a x86.M128) (dst x86.M128) { return x86.M128(moveldupPs([4]float32(a))) }