Ejemplo n.º 1
0
// sseMatrix3Mul returns the lane-wise sum a*mul[0] + b*mul[1] + c*mul[2],
// i.e. one output component of a 3x3 matrix applied to planar values.
// mul must hold at least three vectors; a shorter slice panics on indexing.
func sseMatrix3Mul(mul []x86.M128, a, b, c x86.M128) x86.M128 {
	termA := sse.MulPs(a, mul[0])
	termB := sse.MulPs(b, mul[1])
	termC := sse.MulPs(c, mul[2])

	// Keep the (A + B) + C association so rounding matches lane for lane.
	return sse.AddPs(sse.AddPs(termA, termB), termC)
}
Ejemplo n.º 2
0
// log2f4 approximates log2 for every lane of x.
//
// Each lane's bit pattern is split into its exponent field (selected with
// exp_mask, shifted down by 23 bits and reduced by 127) and its mantissa
// field (selected with mantissa_mask and OR-ed with the bits of 1.0). A
// minimax polynomial in the mantissa is then added to the exponent.
//
// It panics with "unsupported poly degree" when LOG_poly_DEGREE is not one
// of 3, 4, 5 or 6.
func log2f4(x x86.M128) x86.M128 {
	unit := sse.Set1Ps(1.0)
	bits := sse2.CastpsSi128(x)

	// Exponent part, converted to float lanes.
	expField := sse2.AndSi128(bits, sse2.Set1Epi32(exp_mask))
	unbiased := sse2.SubEpi32(sse2.SrliEpi32(expField, 23), sse2.Set1Epi32(127))
	exponent := sse2.Cvtepi32Ps(unbiased)

	// Mantissa part, with the bit pattern of 1.0 merged in.
	mantField := sse2.AndSi128(bits, sse2.Set1Epi32(mantissa_mask))
	mantissa := sse.OrPs(sse2.Castsi128Ps(mantField), unit)

	// Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2).
	var fit x86.M128
	switch LOG_poly_DEGREE {
	case 6:
		fit = poly5(mantissa, log_p5_0, log_p5_1, log_p5_2, log_p5_3, log_p5_4, log_p5_5)
	case 5:
		fit = poly4(mantissa, log_p4_0, log_p4_1, log_p4_2, log_p4_3, log_p4_4)
	case 4:
		fit = poly3(mantissa, log_p3_0, log_p3_1, log_p3_2, log_p3_3)
	case 3:
		fit = poly2(mantissa, log_p2_0, log_p2_1, log_p2_2)
	default:
		panic("unsupported poly degree")
	}

	// Multiplying by (mantissa - 1) raises the effective polynomial degree
	// by one and guarantees that log2(1) == 0.
	fit = sse.MulPs(fit, sse.SubPs(mantissa, unit))

	return sse.AddPs(fit, exponent)
}
Ejemplo n.º 3
0
// TestMulAddPs checks that chaining sse.AddPs into sse.MulPs matches the
// same (a + a) * b computation done lane by lane in scalar code.
func TestMulAddPs(t *testing.T) {
	lhs := sse.SetPs(1.0, 2.0, 3.0, 4.0)
	scale := sse.SetPs(1000.0, 100.0, 10.0, 1.0)

	doubled := sse.AddPs(lhs, lhs)
	got := sse.MulPs(doubled, scale)

	want := x86.M128{
		(lhs[0] + lhs[0]) * scale[0],
		(lhs[1] + lhs[1]) * scale[1],
		(lhs[2] + lhs[2]) * scale[2],
		(lhs[3] + lhs[3]) * scale[3],
	}
	if reflect.DeepEqual(got, want) {
		t.Log("correctly got", want)
		return
	}
	t.Fatal("got", got, "expected", want)
}
Ejemplo n.º 4
0
// TestAddPs checks sse.AddPs against a lane-by-lane scalar sum of the same
// two input vectors.
func TestAddPs(t *testing.T) {
	lhs := sse.SetPs(10.0, 20.0, 30.0, 40.0)
	rhs := sse.SetPs(100.0, 200.0, 300.0, 400.0)

	got := sse.AddPs(lhs, rhs)

	want := x86.M128{
		lhs[0] + rhs[0],
		lhs[1] + rhs[1],
		lhs[2] + rhs[2],
		lhs[3] + rhs[3],
	}
	if reflect.DeepEqual(got, want) {
		t.Log("correctly got", want)
		return
	}
	t.Fatal("got", got, "expected", want)
}
Ejemplo n.º 5
0
// poly5 returns poly4(x, c1, c2, c3, c4, c5)*x + c0 for every lane of x,
// with c0 broadcast to all lanes. Presumably poly4 follows the same pattern,
// making this one Horner step of a degree-5 polynomial; confirm against
// poly4.
func poly5(x x86.M128, c0, c1, c2, c3, c4, c5 float32) x86.M128 {
	higher := poly4(x, c1, c2, c3, c4, c5)
	scaled := sse.MulPs(higher, x)
	return sse.AddPs(scaled, sse.Set1Ps(c0))
}
Ejemplo n.º 6
0
// poly3 returns poly2(x, c1, c2, c3)*x + c0 for every lane of x, with c0
// broadcast to all lanes. Presumably poly2 follows the same pattern, making
// this one Horner step of a degree-3 polynomial; confirm against poly2.
func poly3(x x86.M128, c0, c1, c2, c3 float32) x86.M128 {
	higher := poly2(x, c1, c2, c3)
	scaled := sse.MulPs(higher, x)
	return sse.AddPs(scaled, sse.Set1Ps(c0))
}
Ejemplo n.º 7
0
// poly1 returns poly0(x, c1)*x + c0 for every lane of x, with c0 broadcast
// to all lanes. Presumably poly0 yields the broadcast constant c1, making
// this c0 + c1*x; confirm against poly0.
func poly1(x x86.M128, c0, c1 float32) x86.M128 {
	higher := poly0(x, c1)
	scaled := sse.MulPs(higher, x)
	return sse.AddPs(scaled, sse.Set1Ps(c0))
}
Ejemplo n.º 8
0
// RGBtoHSV is a floating-point example: if intrinsics were working, it would
// convert 4 RGB values (one per vector lane) to HSV.
//
// The inputs are first clamped to [verySmall, 1.0]. The results are:
//   - h: hue, computed per lane from whichever channel equals the maximum
//     ((g-b)/gap, 2+(b-r)/gap or 4+(r-g)/gap), with negative values wrapped
//     by adding 6.0 - verySmall; lanes with zero gap keep h = 0.
//   - s: gap / v, forced to zero in lanes where gap is zero.
//   - v: the maximum of the clamped r, g and b.
//
// Converted from: https://github.com/rawstudio/rawstudio/blob/master/plugins/dcp/dcp-sse4.c#L74
func RGBtoHSV(r, g, b x86.M128) (h, s, v x86.M128) {
	zeroPs := sse.SetzeroPs()
	smallPs := sse.Set1Ps(verySmall)
	onesPs := sse.Set1Ps(1.0)

	// Any number > 1. It is added to v in lanes that have already been
	// handled so that later equality tests against v cannot match there
	// (the clamped channels never exceed 1.0).
	add_v := sse.Set1Ps(2.0)

	// Clamp each channel to [verySmall, 1.0].
	r = sse.MinPs(sse.MaxPs(r, smallPs), onesPs)
	g = sse.MinPs(sse.MaxPs(g, smallPs), onesPs)
	b = sse.MinPs(sse.MaxPs(b, smallPs), onesPs)

	// v = max(r, g, b); hue defaults to zero.
	v = sse.MaxPs(b, sse.MaxPs(r, g))
	h = zeroPs

	// gap = max - min. v_mask selects lanes where gap == 0 (all channels
	// equal); those lanes are marked as handled right away.
	m := sse.MinPs(b, sse.MinPs(r, g))
	gap := sse.SubPs(v, m)
	v_mask := sse.CmpeqPs(gap, zeroPs)
	v = sse.AddPs(v, sse.AndPs(add_v, v_mask))

	// Set gap to one where sat = 0; this avoids divisions by zero, and these
	// values will not be used. Note that onesPs is reduced here to 1.0 in the
	// v_mask lanes only and zero elsewhere.
	onesPs = sse.AndPs(onesPs, v_mask)
	gap = sse.OrPs(gap, onesPs)

	// gap_inv = 1.0 / gap
	// NOTE(review): the hardware rcpps instruction is an approximation;
	// confirm the precision of this package's RcpPs.
	gap_inv := sse.RcpPs(gap)

	// if r == v
	// h = (g - b) / gap;
	mask := sse.CmpeqPs(r, v)
	val := sse.MulPs(gap_inv, sse.SubPs(g, b))

	// Fill h in the matching lanes and mark those lanes as handled.
	v = sse.AddPs(v, sse.AndPs(add_v, mask))
	h = sse4.BlendvPs(h, val, mask)

	// if g == v
	// h = 2.0f + (b - r) / gap;
	twoPs := sse.Set1Ps(2.0)
	mask = sse.CmpeqPs(g, v)
	val = sse.SubPs(b, r)
	val = sse.MulPs(val, gap_inv)
	val = sse.AddPs(val, twoPs)

	// Fill h in the matching lanes and mark those lanes as handled.
	v = sse.AddPs(v, sse.AndPs(add_v, mask))
	h = sse4.BlendvPs(h, val, mask)

	// If (b == v)
	// h = 4.0f + (r - g) / gap;
	fourPs := sse.AddPs(twoPs, twoPs)
	mask = sse.CmpeqPs(b, v)
	val = sse.AddPs(fourPs, sse.MulPs(gap_inv, sse.SubPs(r, g)))

	// Fill h in the matching lanes and mark those lanes as handled.
	v = sse.AddPs(v, sse.AndPs(add_v, mask))
	h = sse4.BlendvPs(h, val, mask)

	// Remove the marker offset from v again, then fill s = gap / v where
	// gap > 0 (lanes in v_mask are cleared to zero).
	// NOTE(review): adding and then subtracting 2.0 may drop low-order bits
	// of small v values — confirm this is acceptable.
	v = sse.SubPs(v, add_v)
	val = sse.MulPs(gap, sse.RcpPs(v))
	s = sse.AndnotPs(v_mask, val)

	// Check if h < 0 and, if so, wrap it by adding 6.0 - verySmall.
	zeroPs = sse.SetzeroPs()
	sixPs := sse.Set1Ps(6.0 - verySmall)
	mask = sse.CmpltPs(h, zeroPs)
	h = sse.AddPs(h, sse.AndPs(mask, sixPs))
	return
}