示例#1
0
// CvtphPs: Convert packed half-precision (16-bit) floating-point elements in
// 'a' to packed single-precision (32-bit) floating-point elements, and store
// the results in 'dst'.
//
//		FOR j := 0 to 3
//			i := j*32
//			m := j*16
//			dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
//		ENDFOR
//		dst[MAX:128] := 0
//
// Instruction: 'VCVTPH2PS'. Intrinsic: '_mm_cvtph_ps'.
// Requires FP16C.
func CvtphPs(a x86.M128i) (dst x86.M128) {
	return x86.M128(cvtphPs([16]byte(a)))
}
示例#2
0
// AddsubPs: Alternatively add and subtract packed single-precision (32-bit)
// floating-point elements in 'a' to/from packed elements in 'b', and store the
// results in 'dst'.
//
//		FOR j := 0 to 3
//			i := j*32
//			IF (j is even)
//				dst[i+31:i] := a[i+31:i] - b[i+31:i]
//			ELSE
//				dst[i+31:i] := a[i+31:i] + b[i+31:i]
//			FI
//		ENDFOR
//
// Instruction: 'ADDSUBPS'. Intrinsic: '_mm_addsub_ps'.
// Requires SSE3.
func AddsubPs(a x86.M128, b x86.M128) (dst x86.M128) {
	return x86.M128(addsubPs([4]float32(a), [4]float32(b)))
}
示例#3
0
// HaddPs: Horizontally add adjacent pairs of single-precision (32-bit)
// floating-point elements in 'a' and 'b', and pack the results in 'dst'.
//
//		dst[31:0] := a[63:32] + a[31:0]
//		dst[63:32] := a[127:96] + a[95:64]
//		dst[95:64] := b[63:32] + b[31:0]
//		dst[127:96] := b[127:96] + b[95:64]
//
// Instruction: 'HADDPS'. Intrinsic: '_mm_hadd_ps'.
// Requires SSE3.
func HaddPs(a x86.M128, b x86.M128) (dst x86.M128) {
	return x86.M128(haddPs([4]float32(a), [4]float32(b)))
}
示例#4
0
// MoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point
// elements from 'a', and store the results in 'dst'.
//
//		dst[31:0] := a[31:0]
//		dst[63:32] := a[31:0]
//		dst[95:64] := a[95:64]
//		dst[127:96] := a[95:64]
//
// Instruction: 'MOVSLDUP'. Intrinsic: '_mm_moveldup_ps'.
// Requires SSE3.
func MoveldupPs(a x86.M128) (dst x86.M128) {
	return x86.M128(moveldupPs([4]float32(a)))
}