Example #1
0
// exp2f4 approximates 2**x for the packed floats in x.
//
// The clamped argument is split into a whole part and a remainder; the power
// of the whole part is built directly from float bits, the power of the
// remainder comes from a polynomial, and the two are multiplied.
// It panics when EXP_poly_DEGREE is not one of 2, 3, 4 or 5.
func exp2f4(x x86.M128) x86.M128 {
	// Clamp the argument to [-126.99999, 129] before splitting it.
	x = sse.MaxPs(sse.MinPs(x, sse.Set1Ps(129)), sse.Set1Ps(-126.99999))

	/* whole = int(x - 0.5) */
	whole := sse2.CvtpsEpi32(sse.SubPs(x, sse.Set1Ps(0.5)))

	/* frac = x - whole */
	frac := sse.SubPs(x, sse2.Cvtepi32Ps(whole))

	/* pow2Whole = (float) (1 << whole): add 127 and shift left by 23 bits,
	   then reinterpret the integer lanes as floats */
	biased := sse2.AddEpi32(whole, sse2.Set1Epi32(127))
	pow2Whole := sse2.Castsi128Ps(sse2.SlliEpi32(biased, 23))

	/* pow2Frac: minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
	var pow2Frac x86.M128
	switch EXP_poly_DEGREE {
	case 5:
		pow2Frac = poly5(frac, exp_p5_0, exp_p5_1, exp_p5_2, exp_p5_3, exp_p5_4, exp_p5_5)
	case 4:
		pow2Frac = poly4(frac, exp_p4_0, exp_p4_1, exp_p4_2, exp_p4_3, exp_p4_4)
	case 3:
		pow2Frac = poly3(frac, exp_p3_0, exp_p3_1, exp_p3_2, exp_p3_3)
	case 2:
		pow2Frac = poly2(frac, exp_p2_0, exp_p2_1, exp_p2_2)
	default:
		panic("invalid poly degree")
	}

	return sse.MulPs(pow2Whole, pow2Frac)
}
Example #2
0
// log2f4 approximates log2(x) for the packed floats in x.
//
// The float bits are split with exp_mask and mantissa_mask: the exponent
// field becomes the integer part of the result, and a polynomial in the
// mantissa (rebuilt as a value OR-ed with 1.0) supplies the rest.
// It panics when LOG_poly_DEGREE is not one of 3, 4, 5 or 6.
func log2f4(x x86.M128) x86.M128 {
	expMask := sse2.Set1Epi32(exp_mask)
	mantMask := sse2.Set1Epi32(mantissa_mask)
	one := sse.Set1Ps(1.0)
	bits := sse2.CastpsSi128(x)

	/* exponent = float(((bits & exp_mask) >> 23) - 127) */
	rawExp := sse2.SrliEpi32(sse2.AndSi128(bits, expMask), 23)
	exponent := sse2.Cvtepi32Ps(sse2.SubEpi32(rawExp, sse2.Set1Epi32(127)))

	/* mantissa = (bits & mantissa_mask) | bits of 1.0 */
	mantissa := sse.OrPs(sse2.Castsi128Ps(sse2.AndSi128(bits, mantMask)), one)

	/* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[.
	   The polynomial used is one degree lower than LOG_poly_DEGREE because
	   of the multiplication by (mantissa - 1) below. */
	var fit x86.M128
	switch LOG_poly_DEGREE {
	case 6:
		fit = poly5(mantissa, log_p5_0, log_p5_1, log_p5_2, log_p5_3, log_p5_4, log_p5_5)
	case 5:
		fit = poly4(mantissa, log_p4_0, log_p4_1, log_p4_2, log_p4_3, log_p4_4)
	case 4:
		fit = poly3(mantissa, log_p3_0, log_p3_1, log_p3_2, log_p3_3)
	case 3:
		fit = poly2(mantissa, log_p2_0, log_p2_1, log_p2_2)
	default:
		panic("unsupported poly degree")
	}

	/* Multiplying by (mantissa - 1) raises the effective degree by one and
	   ensures that log2(1) == 0 */
	fit = sse.MulPs(fit, sse.SubPs(mantissa, one))

	return sse.AddPs(fit, exponent)
}
Example #3
0
// Transform8sRGB is a mixed sse+sse2 example.
// It applies a 3x3 matrix to a 16 bit per component image, sRGB gamma
// corrects the result and writes it to an 8 bit per component RGBA output.
// https://github.com/rawstudio/rawstudio/blob/master/plugins/colorspace-transform/colorspace_transform_sse2.c#L205
//
// The image size, the (zero) matrix and both buffers are locals of this
// function, so it takes no arguments and returns nothing.
func Transform8sRGB() {
	w := 1048
	h := 1024
	var matrix [3][3]float32

	// Input is a w * h * 4 components 16 bit per component image
	input := make([]byte, w*h*8)

	// Output is 4 byte/pixel, RGBA image
	output := make([]byte, w*h*4)

	// The matrix with each value splatted to all lanes of a register,
	// stored row-major: row n occupies matPs[3*n : 3*n+3].
	matPs := make([]x86.M128, 3*3)
	for row := range matrix {
		for col := range matrix[row] {
			matPs[row*3+col] = sse.Set1Ps(matrix[row][col])
		}
	}

	// Constant vectors; none of them depend on the pixel being processed,
	// so they are built once instead of once per loop iteration.
	zero := sse2.SetzeroSi128()
	normalize := sse.Set1Ps(1.0 / 65535.0)
	maxVal := sse.Set1Ps(1.0)
	minVal := sse.SetzeroPs()
	mulOver := sse.Set1Ps(1.055)
	subOver := sse.Set1Ps(0.055)
	powOver := sse.Set1Ps(1.0 / 2.4)
	junction := sse.Set1Ps(0.0031308)
	mulUnder := sse.Set1Ps(12.92)
	upscale := sse.Set1Ps(255.5)
	alphaMask := sse2.Set1Epi32(0xff000000)

	for y := 0; y < h; y++ {
		// One row of input (w*8 bytes) and output (w*4 bytes) viewed as
		// 128 bit registers.
		// NOTE(review): the store to o[x] below only reaches output if
		// BytesToM128i aliases the byte slice rather than copying - confirm.
		i := x86.BytesToM128i(input[y*w*8 : y*w*8+w*8])
		o := x86.BytesToM128i(output[y*w*4 : y*w*4+w*4])

		// Converts 4 pixels per loop
		for x := 0; x < w/4; x++ {

			/* Load and convert to float */
			in := i[x*2]    // Load two pixels
			in2 := i[x*2+1] // Load two pixels
			p1f := sse2.Cvtepi32Ps(sse2.UnpackloEpi16(in, zero))
			p2f := sse2.Cvtepi32Ps(sse2.UnpackhiEpi16(in, zero))
			p3f := sse2.Cvtepi32Ps(sse2.UnpackloEpi16(in2, zero))
			p4f := sse2.Cvtepi32Ps(sse2.UnpackhiEpi16(in2, zero))

			/* Convert to planar */
			g1g0r1r0 := sse.UnpackloPs(p1f, p2f)
			b1b0 := sse.UnpackhiPs(p1f, p2f)
			g3g2r3r2 := sse.UnpackloPs(p3f, p4f)
			b3b2 := sse.UnpackhiPs(p3f, p4f)
			rIn := sse.MovelhPs(g1g0r1r0, g3g2r3r2)
			gIn := sse.MovehlPs(g3g2r3r2, g1g0r1r0)
			bIn := sse.MovelhPs(b1b0, b3b2)

			/* Apply matrix to convert to sRGB.
			   Every row must see the untransformed r, g and b, so the
			   results go to new variables instead of overwriting the
			   inputs between rows. */
			r := sseMatrix3Mul(matPs[0:3], rIn, gIn, bIn)
			g := sseMatrix3Mul(matPs[3:6], rIn, gIn, bIn)
			b := sseMatrix3Mul(matPs[6:9], rIn, gIn, bIn)

			/* Normalize to 0->1 and clamp */
			r = sse.MinPs(maxVal, sse.MaxPs(minVal, sse.MulPs(normalize, r)))
			g = sse.MinPs(maxVal, sse.MaxPs(minVal, sse.MulPs(normalize, g)))
			b = sse.MinPs(maxVal, sse.MaxPs(minVal, sse.MulPs(normalize, b)))

			/* Apply Gamma */
			/* Calculate values to be used if larger than junction point:
			   1.055 * c^(1/2.4) - 0.055 */
			rGam := sse.SubPs(sse.MulPs(mulOver, FastPowPs(r, powOver)), subOver)
			gGam := sse.SubPs(sse.MulPs(mulOver, FastPowPs(g, powOver)), subOver)
			bGam := sse.SubPs(sse.MulPs(mulOver, FastPowPs(b, powOver)), subOver)

			/* Create mask for values smaller than junction point */
			maskR := sse.CmpltPs(r, junction)
			maskG := sse.CmpltPs(g, junction)
			maskB := sse.CmpltPs(b, junction)

			/* Calculate value to be used if under junction: 12.92 * c */
			rMul := sse.AndPs(maskR, sse.MulPs(mulUnder, r))
			gMul := sse.AndPs(maskG, sse.MulPs(mulUnder, g))
			bMul := sse.AndPs(maskB, sse.MulPs(mulUnder, b))

			/* Select the value to be used based on the junction mask and scale to 8 bit */
			r = sse.MulPs(upscale, sse.OrPs(rMul, sse.AndnotPs(maskR, rGam)))
			g = sse.MulPs(upscale, sse.OrPs(gMul, sse.AndnotPs(maskG, gGam)))
			b = sse.MulPs(upscale, sse.OrPs(bMul, sse.AndnotPs(maskB, bGam)))

			/* Convert to 8 bit unsigned and interleave */
			rI := sse2.CvtpsEpi32(r)
			gI := sse2.CvtpsEpi32(g)
			bI := sse2.CvtpsEpi32(b)

			rI = sse2.PacksEpi32(rI, rI)
			gI = sse2.PacksEpi32(gI, gI)
			bI = sse2.PacksEpi32(bI, bI)

			/* Set alpha value to 255 and store */
			rgI := sse2.UnpackloEpi16(rI, gI)
			bbI := sse2.UnpackloEpi16(bI, bI)
			lo := sse2.UnpackloEpi32(rgI, bbI)
			hi := sse2.UnpackhiEpi32(rgI, bbI)

			o[x] = sse2.OrSi128(alphaMask, sse2.PackusEpi16(lo, hi))
		}
	}
}
Example #4
0
// RGBtoHSV is a floating point example.
// If intrinsics were working, this would convert 4 RGB values to HSV.
// Converted from: https://github.com/rawstudio/rawstudio/blob/master/plugins/dcp/dcp-sse4.c#L74
//
// r, g and b each hold one channel for the pixels being converted and are
// clamped to [verySmall, 1.0] before use. The results are returned per lane:
// h is built from the formulas below (0 + .., 2 + .., 4 + .., with negative
// values wrapped by adding 6.0 - verySmall), s is gap/v (0 where gap is 0)
// and v is the largest of the three channels.
func RGBtoHSV(r, g, b x86.M128) (h, s, v x86.M128) {
	zeroPs := sse.SetzeroPs()
	smallPs := sse.Set1Ps(verySmall)
	onesPs := sse.Set1Ps(1.0)

	// Any number > 1. It is added to v in lanes that have already been
	// handled; since r, g and b are clamped to <= 1, the later "== v"
	// comparisons can no longer match in those lanes.
	add_v := sse.Set1Ps(2.0)

	// Clamp every channel to [verySmall, 1.0]
	r = sse.MinPs(sse.MaxPs(r, smallPs), onesPs)
	g = sse.MinPs(sse.MaxPs(g, smallPs), onesPs)
	b = sse.MinPs(sse.MaxPs(b, smallPs), onesPs)

	// v = max(r, g, b); h starts out as 0
	v = sse.MaxPs(b, sse.MaxPs(r, g))
	h = zeroPs

	// gap = max - min; v_mask marks lanes where gap == 0 (all channels equal).
	// Those lanes are bumped immediately so none of the hue cases below apply.
	m := sse.MinPs(b, sse.MinPs(r, g))
	gap := sse.SubPs(v, m)
	v_mask := sse.CmpeqPs(gap, zeroPs)
	v = sse.AddPs(v, sse.AndPs(add_v, v_mask))

	// Set gap to one where sat = 0, this will avoid divisions by zero, these values will not be used
	// (gap is all zero bits in exactly those lanes, so OR-ing in 1.0 yields 1.0)
	onesPs = sse.AndPs(onesPs, v_mask)
	gap = sse.OrPs(gap, onesPs)

	//  gap_inv = 1.0 / gap
	// NOTE(review): RcpPs presumably mirrors _mm_rcp_ps, which is an
	// approximate reciprocal - confirm for this package.
	gap_inv := sse.RcpPs(gap)

	// if r == v
	// h = (g - b) / gap;
	mask := sse.CmpeqPs(r, v)
	val := sse.MulPs(gap_inv, sse.SubPs(g, b))

	// fill h, and bump v in the matched lanes so the g and b cases skip them
	// (presumably BlendvPs picks val where mask is set and keeps h elsewhere)
	v = sse.AddPs(v, sse.AndPs(add_v, mask))
	h = sse4.BlendvPs(h, val, mask)

	// if g == v
	// h = 2.0f + (b - r) / gap;
	twoPs := sse.Set1Ps(2.0)
	mask = sse.CmpeqPs(g, v)
	val = sse.SubPs(b, r)
	val = sse.MulPs(val, gap_inv)
	val = sse.AddPs(val, twoPs)

	// fill h, and bump v in the matched lanes so the b case skips them
	v = sse.AddPs(v, sse.AndPs(add_v, mask))
	h = sse4.BlendvPs(h, val, mask)

	// If (b == v)
	// h = 4.0f + (r - g) / gap;
	fourPs := sse.AddPs(twoPs, twoPs)
	mask = sse.CmpeqPs(b, v)
	val = sse.AddPs(fourPs, sse.MulPs(gap_inv, sse.SubPs(r, g)))

	v = sse.AddPs(v, sse.AndPs(add_v, mask))
	h = sse4.BlendvPs(h, val, mask)

	// Fill s, if gap > 0
	// Every lane has had add_v added exactly once by now (either through
	// v_mask or through the first matching channel), so one subtraction
	// restores v before s = gap / v is computed.
	// NOTE(review): adding and then subtracting 2.0 in float32 may drop
	// low-order bits of v - confirm this is acceptable.
	v = sse.SubPs(v, add_v)
	val = sse.MulPs(gap, sse.RcpPs(v))
	s = sse.AndnotPs(v_mask, val)

	// Check if h < 0, and if so wrap it by adding 6.0 - verySmall
	zeroPs = sse.SetzeroPs()
	sixPs := sse.Set1Ps(6.0 - verySmall)
	mask = sse.CmpltPs(h, zeroPs)
	h = sse.AddPs(h, sse.AndPs(mask, sixPs))
	return
}