コード例 #1
0
ファイル: m128_test.go プロジェクト: klauspost/intrinsics
func log2f4(x x86.M128) x86.M128 {
	exp := sse2.Set1Epi32(exp_mask)
	mant := sse2.Set1Epi32(mantissa_mask)
	one := sse.Set1Ps(1.0)
	i := sse2.CastpsSi128(x)
	e := sse2.Cvtepi32Ps(sse2.SubEpi32(sse2.SrliEpi32(sse2.AndSi128(i, exp), 23), sse2.Set1Epi32(127)))
	m := sse.OrPs(sse2.Castsi128Ps(sse2.AndSi128(i, mant)), one)
	var p x86.M128

	/* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */

	if LOG_poly_DEGREE == 6 {
		p = poly5(m, log_p5_0, log_p5_1, log_p5_2, log_p5_3, log_p5_4, log_p5_5)
	} else if LOG_poly_DEGREE == 5 {
		p = poly4(m, log_p4_0, log_p4_1, log_p4_2, log_p4_3, log_p4_4)
	} else if LOG_poly_DEGREE == 4 {
		p = poly3(m, log_p3_0, log_p3_1, log_p3_2, log_p3_3)
	} else if LOG_poly_DEGREE == 3 {
		p = poly2(m, log_p2_0, log_p2_1, log_p2_2)
	} else {
		panic("unsupported poly degree")
	}

	/* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
	p = sse.MulPs(p, sse.SubPs(m, one))

	return sse.AddPs(p, e)
}
コード例 #2
0
ファイル: intdiv.go プロジェクト: klauspost/intrinsics
// vector of 4 32-bit unsigned integers
func (u Uint32Div) DivSSE4(a M128i) M128i {
	t1 := sse2.MulEpu32(a, u.multiplier)  // 32x32->64 bit unsigned multiplication of a[0] and a[2]
	t2 := sse2.SrliEpi64(t1, 32)          // high dword of result 0 and 2
	t3 := sse2.SrliEpi64(a, 32)           // get a[1] and a[3] into position for multiplication
	t4 := sse2.MulEpu32(t3, u.multiplier) // 32x32->64 bit unsigned multiplication of a[1] and a[3]
	t5 := sse2.SetEpi32(-1, 0, -1, 0)     // mask of dword 1 and 3
	t7 := sse4.BlendvEpi8(t2, t4, t5)     // blend two results
	t8 := sse2.SubEpi32(a, t7)            // subtract
	t9 := sse2.SrlEpi32(t8, u.shift1)     // shift right logical
	t10 := sse2.AddEpi32(t7, t9)          // add
	return sse2.SrlEpi32(t10, d.u.shift2) // shift right logical
}
コード例 #3
0
ファイル: intdiv.go プロジェクト: klauspost/intrinsics
// vector of 4 32-bit unsigned integers
func (d Uint32Div) Div(a M128i) M128i {
	t1 := sse2.MulEpu32(a, u.multiplier)  // 32x32->64 bit unsigned multiplication of a[0] and a[2]
	t2 := sse2.SrliEpi64(t1, 32)          // high dword of result 0 and 2
	t3 := sse2.SrliEpi64(a, 32)           // get a[1] and a[3] into position for multiplication
	t4 := sse2.MulEpu32(t3, u.multiplier) // 32x32->64 bit unsigned multiplication of a[1] and a[3]
	t5 := sse2.SetEpi32(-1, 0, -1, 0)     // mask of dword 1 and 3
	t6 := sse2.AndSi128(t4, t5)           // high dword of result 1 and 3
	t7 := sse2.OrSi128(t2, t6)            // combine all four results into one vector
	t8 := sse2.SubEpi32(a, t7)            // subtract
	t9 := sse2.SrlEpi32(t8, u.shift1)     // shift right logical
	t10 := sse2.AddEpi32(t7, t9)          // add
	return sse2.SrlEpi32(t10, d.u.shift2) // shift right logical
}
コード例 #4
0
ファイル: m128_test.go プロジェクト: klauspost/intrinsics
// Try some complex intrinsics (SSE2)
// Doesn't test any values (since intrisics are unimplemented)
// Converted from https://github.com/klauspost/rawspeed/blob/develop/RawSpeed/RawImageDataU16.cpp#L152
func TestComplex(t *testing.T) {
	full_scale_fp := 1000
	half_scale_fp := 500
	mDitherScale := true
	var sub_mul [4]x86.M128i

	var rand_mul x86.M128i
	sseround := sse2.SetEpi32(512, 512, 512, 512)
	ssesub2 := sse2.SetEpi32(32768, 32768, 32768, 32768)
	ssesign := sse2.SetEpi32(0x80008000, 0x80008000, 0x80008000, 0x80008000)
	sse_full_scale_fp := sse2.Set1Epi32(full_scale_fp | (full_scale_fp << 16))
	sse_half_scale_fp := sse2.Set1Epi32(half_scale_fp >> 4)

	if mDitherScale {
		rand_mul = sse2.Set1Epi32(0x4d9f1d32)
	} else {
		rand_mul = sse2.SetzeroSi128()
	}
	rand_mask := sse2.Set1Epi32(0x00ff00ff) // 8 random bits

	width := 1024
	height := 1024

	// Emulate 1024 x 1024 x 16bpp
	input := make([]byte, 1024*1024*2)

	for y := 0; y < height; y++ {
		// Convert current line to []M128i
		line := x86.BytesToM128i(input[y*width*2 : y*width*2+width*2])

		var sserandom x86.M128i
		if mDitherScale {
			sserandom = sse2.SetEpi32(width*1676+y*18000, width*2342+y*34311, width*4272+y*12123, width*1234+y*23464)
		} else {
			sserandom = sse2.SetzeroSi128()
		}

		var ssescale, ssesub x86.M128i
		if (y & 1) == 0 {
			ssesub = sub_mul[0]
			ssescale = sub_mul[1]
		} else {
			ssesub = sub_mul[2]
			ssescale = sub_mul[3]
		}

		for x, pix_low := range line {
			// Subtract black
			pix_low = sse2.SubsEpu16(pix_low, ssesub)
			// Multiply the two unsigned shorts and combine it to 32 bit result
			pix_high := sse2.MulhiEpu16(pix_low, ssescale)

			temp := sse2.MulloEpi16(pix_low, ssescale)

			pix_low = sse2.UnpackloEpi16(temp, pix_high)
			pix_high = sse2.UnpackhiEpi16(temp, pix_high)

			// Add rounder
			pix_low = sse2.AddEpi32(pix_low, sseround)
			pix_high = sse2.AddEpi32(pix_high, sseround)

			sserandom = sse2.XorSi128(sse2.MulhiEpi16(sserandom, rand_mul), sse2.MulloEpi16(sserandom, rand_mul))
			rand_masked := sse2.AndSi128(sserandom, rand_mask) // Get 8 random bits
			rand_masked = sse2.MulloEpi16(rand_masked, sse_full_scale_fp)

			zero := sse2.SetzeroSi128()
			rand_lo := sse2.SubEpi32(sse_half_scale_fp, sse2.UnpackloEpi16(rand_masked, zero))
			rand_hi := sse2.SubEpi32(sse_half_scale_fp, sse2.UnpackhiEpi16(rand_masked, zero))

			pix_low = sse2.AddEpi32(pix_low, rand_lo)
			pix_high = sse2.AddEpi32(pix_high, rand_hi)

			// Shift down
			pix_low = sse2.SraiEpi32(pix_low, 10)
			pix_high = sse2.SraiEpi32(pix_high, 10)

			// Subtract to avoid clipping
			pix_low = sse2.SubEpi32(pix_low, ssesub2)
			pix_high = sse2.SubEpi32(pix_high, ssesub2)

			// Pack
			pix_low = sse2.PacksEpi32(pix_low, pix_high)

			// Shift sign off
			pix_low = sse2.XorSi128(pix_low, ssesign)
			line[x] = pix_low
		}
	}
}