// Multiply planar values by a 3x3 matrix func sseMatrix3Mul(mul []x86.M128, a, b, c x86.M128) x86.M128 { acc := sse.MulPs(a, mul[0]) acc = sse.AddPs(acc, sse.MulPs(b, mul[1])) acc = sse.AddPs(acc, sse.MulPs(c, mul[2])) return acc }
func log2f4(x x86.M128) x86.M128 { exp := sse2.Set1Epi32(exp_mask) mant := sse2.Set1Epi32(mantissa_mask) one := sse.Set1Ps(1.0) i := sse2.CastpsSi128(x) e := sse2.Cvtepi32Ps(sse2.SubEpi32(sse2.SrliEpi32(sse2.AndSi128(i, exp), 23), sse2.Set1Epi32(127))) m := sse.OrPs(sse2.Castsi128Ps(sse2.AndSi128(i, mant)), one) var p x86.M128 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */ if LOG_poly_DEGREE == 6 { p = poly5(m, log_p5_0, log_p5_1, log_p5_2, log_p5_3, log_p5_4, log_p5_5) } else if LOG_poly_DEGREE == 5 { p = poly4(m, log_p4_0, log_p4_1, log_p4_2, log_p4_3, log_p4_4) } else if LOG_poly_DEGREE == 4 { p = poly3(m, log_p3_0, log_p3_1, log_p3_2, log_p3_3) } else if LOG_poly_DEGREE == 3 { p = poly2(m, log_p2_0, log_p2_1, log_p2_2) } else { panic("unsupported poly degree") } /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ p = sse.MulPs(p, sse.SubPs(m, one)) return sse.AddPs(p, e) }
func TestMulAddPs(t *testing.T) { a := sse.SetPs(1.0, 2.0, 3.0, 4.0) b := sse.SetPs(1000.0, 100.0, 10.0, 1.0) c := sse.MulPs(sse.AddPs(a, a), b) expect := x86.M128{(a[0] + a[0]) * b[0], (a[1] + a[1]) * b[1], (a[2] + a[2]) * b[2], (a[3] + a[3]) * b[3]} if !reflect.DeepEqual(c, expect) { t.Fatal("got", c, "expected", expect) } t.Log("correctly got", expect) }
func TestAddPs(t *testing.T) { a := sse.SetPs(10.0, 20.0, 30.0, 40.0) b := sse.SetPs(100.0, 200.0, 300.0, 400.0) c := sse.AddPs(a, b) expect := x86.M128{a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]} if !reflect.DeepEqual(c, expect) { t.Fatal("got", c, "expected", expect) } t.Log("correctly got", expect) }
// poly5 returns poly4(x, c1, c2, c3, c4, c5)*x + c0, with c0 broadcast via
// sse.Set1Ps. Assuming poly4 follows the same scheme for its own
// coefficients, this is one more Horner step, giving a degree-5 polynomial
// in x with c0 as the constant term.
func poly5(x x86.M128, c0, c1, c2, c3, c4, c5 float32) x86.M128 {
	higher := poly4(x, c1, c2, c3, c4, c5)
	scaled := sse.MulPs(higher, x)
	return sse.AddPs(scaled, sse.Set1Ps(c0))
}
// poly3 returns poly2(x, c1, c2, c3)*x + c0, with c0 broadcast via
// sse.Set1Ps. Assuming poly2 follows the same scheme for its own
// coefficients, this is one more Horner step, giving a degree-3 polynomial
// in x with c0 as the constant term.
func poly3(x x86.M128, c0, c1, c2, c3 float32) x86.M128 {
	higher := poly2(x, c1, c2, c3)
	scaled := sse.MulPs(higher, x)
	return sse.AddPs(scaled, sse.Set1Ps(c0))
}
// poly1 returns poly0(x, c1)*x + c0, with c0 broadcast via sse.Set1Ps.
// Assuming poly0 simply yields its coefficient, this evaluates the linear
// polynomial c1*x + c0.
func poly1(x x86.M128, c0, c1 float32) x86.M128 {
	higher := poly0(x, c1)
	scaled := sse.MulPs(higher, x)
	return sse.AddPs(scaled, sse.Set1Ps(c0))
}
// Float point example. // If intrinsics where working, this would convert 4 RGB values to HSV. // Converted from: https://github.com/rawstudio/rawstudio/blob/master/plugins/dcp/dcp-sse4.c#L74 func RGBtoHSV(r, g, b x86.M128) (h, s, v x86.M128) { zeroPs := sse.SetzeroPs() smallPs := sse.Set1Ps(verySmall) onesPs := sse.Set1Ps(1.0) // Any number > 1 add_v := sse.Set1Ps(2.0) // Clamp r = sse.MinPs(sse.MaxPs(r, smallPs), onesPs) g = sse.MinPs(sse.MaxPs(g, smallPs), onesPs) b = sse.MinPs(sse.MaxPs(b, smallPs), onesPs) v = sse.MaxPs(b, sse.MaxPs(r, g)) h = zeroPs m := sse.MinPs(b, sse.MinPs(r, g)) gap := sse.SubPs(v, m) v_mask := sse.CmpeqPs(gap, zeroPs) v = sse.AddPs(v, sse.AndPs(add_v, v_mask)) // Set gap to one where sat = 0, this will avoid divisions by zero, these values will not be used onesPs = sse.AndPs(onesPs, v_mask) gap = sse.OrPs(gap, onesPs) // gap_inv = 1.0 / gap gap_inv := sse.RcpPs(gap) // if r == v // h = (g - b) / gap; mask := sse.CmpeqPs(r, v) val := sse.MulPs(gap_inv, sse.SubPs(g, b)) // fill h v = sse.AddPs(v, sse.AndPs(add_v, mask)) h = sse4.BlendvPs(h, val, mask) // if g == v // h = 2.0f + (b - r) / gap; twoPs := sse.Set1Ps(2.0) mask = sse.CmpeqPs(g, v) val = sse.SubPs(b, r) val = sse.MulPs(val, gap_inv) val = sse.AddPs(val, twoPs) v = sse.AddPs(v, sse.AndPs(add_v, mask)) h = sse4.BlendvPs(h, val, mask) // If (b == v) // h = 4.0f + (r - g) / gap; fourPs := sse.AddPs(twoPs, twoPs) mask = sse.CmpeqPs(b, v) val = sse.AddPs(fourPs, sse.MulPs(gap_inv, sse.SubPs(r, g))) v = sse.AddPs(v, sse.AndPs(add_v, mask)) h = sse4.BlendvPs(h, val, mask) // Fill s, if gap > 0 v = sse.SubPs(v, add_v) val = sse.MulPs(gap, sse.RcpPs(v)) s = sse.AndnotPs(v_mask, val) // Check if h < 0 zeroPs = sse.SetzeroPs() sixPs := sse.Set1Ps(6.0 - verySmall) mask = sse.CmpltPs(h, zeroPs) h = sse.AddPs(h, sse.AndPs(mask, sixPs)) return }