func exp2f4(x x86.M128) x86.M128 { var ipart x86.M128i var fpart, expipart, expfpart x86.M128 x = sse.MinPs(x, sse.Set1Ps(129)) x = sse.MaxPs(x, sse.Set1Ps(-126.99999)) /* ipart = int(x - 0.5) */ ipart = sse2.CvtpsEpi32(sse.SubPs(x, sse.Set1Ps(0.5))) /* fpart = x - ipart */ fpart = sse.SubPs(x, sse2.Cvtepi32Ps(ipart)) /* expipart = (float) (1 << ipart) */ expipart = sse2.Castsi128Ps(sse2.SlliEpi32(sse2.AddEpi32(ipart, sse2.Set1Epi32(127)), 23)) /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */ if EXP_poly_DEGREE == 5 { expfpart = poly5(fpart, exp_p5_0, exp_p5_1, exp_p5_2, exp_p5_3, exp_p5_4, exp_p5_5) } else if EXP_poly_DEGREE == 4 { expfpart = poly4(fpart, exp_p4_0, exp_p4_1, exp_p4_2, exp_p4_3, exp_p4_4) } else if EXP_poly_DEGREE == 3 { expfpart = poly3(fpart, exp_p3_0, exp_p3_1, exp_p3_2, exp_p3_3) } else if EXP_poly_DEGREE == 2 { expfpart = poly2(fpart, exp_p2_0, exp_p2_1, exp_p2_2) } else { panic("invalid poly degree") } return sse.MulPs(expipart, expfpart) }
// vector of 4 32-bit unsigned integers func (u Uint32Div) DivSSE4(a M128i) M128i { t1 := sse2.MulEpu32(a, u.multiplier) // 32x32->64 bit unsigned multiplication of a[0] and a[2] t2 := sse2.SrliEpi64(t1, 32) // high dword of result 0 and 2 t3 := sse2.SrliEpi64(a, 32) // get a[1] and a[3] into position for multiplication t4 := sse2.MulEpu32(t3, u.multiplier) // 32x32->64 bit unsigned multiplication of a[1] and a[3] t5 := sse2.SetEpi32(-1, 0, -1, 0) // mask of dword 1 and 3 t7 := sse4.BlendvEpi8(t2, t4, t5) // blend two results t8 := sse2.SubEpi32(a, t7) // subtract t9 := sse2.SrlEpi32(t8, u.shift1) // shift right logical t10 := sse2.AddEpi32(t7, t9) // add return sse2.SrlEpi32(t10, d.u.shift2) // shift right logical }
// vector of 4 32-bit unsigned integers func (d Uint32Div) Div(a M128i) M128i { t1 := sse2.MulEpu32(a, u.multiplier) // 32x32->64 bit unsigned multiplication of a[0] and a[2] t2 := sse2.SrliEpi64(t1, 32) // high dword of result 0 and 2 t3 := sse2.SrliEpi64(a, 32) // get a[1] and a[3] into position for multiplication t4 := sse2.MulEpu32(t3, u.multiplier) // 32x32->64 bit unsigned multiplication of a[1] and a[3] t5 := sse2.SetEpi32(-1, 0, -1, 0) // mask of dword 1 and 3 t6 := sse2.AndSi128(t4, t5) // high dword of result 1 and 3 t7 := sse2.OrSi128(t2, t6) // combine all four results into one vector t8 := sse2.SubEpi32(a, t7) // subtract t9 := sse2.SrlEpi32(t8, u.shift1) // shift right logical t10 := sse2.AddEpi32(t7, t9) // add return sse2.SrlEpi32(t10, d.u.shift2) // shift right logical }
// Try some complex intrinsics (SSE2) // Doesn't test any values (since intrisics are unimplemented) // Converted from https://github.com/klauspost/rawspeed/blob/develop/RawSpeed/RawImageDataU16.cpp#L152 func TestComplex(t *testing.T) { full_scale_fp := 1000 half_scale_fp := 500 mDitherScale := true var sub_mul [4]x86.M128i var rand_mul x86.M128i sseround := sse2.SetEpi32(512, 512, 512, 512) ssesub2 := sse2.SetEpi32(32768, 32768, 32768, 32768) ssesign := sse2.SetEpi32(0x80008000, 0x80008000, 0x80008000, 0x80008000) sse_full_scale_fp := sse2.Set1Epi32(full_scale_fp | (full_scale_fp << 16)) sse_half_scale_fp := sse2.Set1Epi32(half_scale_fp >> 4) if mDitherScale { rand_mul = sse2.Set1Epi32(0x4d9f1d32) } else { rand_mul = sse2.SetzeroSi128() } rand_mask := sse2.Set1Epi32(0x00ff00ff) // 8 random bits width := 1024 height := 1024 // Emulate 1024 x 1024 x 16bpp input := make([]byte, 1024*1024*2) for y := 0; y < height; y++ { // Convert current line to []M128i line := x86.BytesToM128i(input[y*width*2 : y*width*2+width*2]) var sserandom x86.M128i if mDitherScale { sserandom = sse2.SetEpi32(width*1676+y*18000, width*2342+y*34311, width*4272+y*12123, width*1234+y*23464) } else { sserandom = sse2.SetzeroSi128() } var ssescale, ssesub x86.M128i if (y & 1) == 0 { ssesub = sub_mul[0] ssescale = sub_mul[1] } else { ssesub = sub_mul[2] ssescale = sub_mul[3] } for x, pix_low := range line { // Subtract black pix_low = sse2.SubsEpu16(pix_low, ssesub) // Multiply the two unsigned shorts and combine it to 32 bit result pix_high := sse2.MulhiEpu16(pix_low, ssescale) temp := sse2.MulloEpi16(pix_low, ssescale) pix_low = sse2.UnpackloEpi16(temp, pix_high) pix_high = sse2.UnpackhiEpi16(temp, pix_high) // Add rounder pix_low = sse2.AddEpi32(pix_low, sseround) pix_high = sse2.AddEpi32(pix_high, sseround) sserandom = sse2.XorSi128(sse2.MulhiEpi16(sserandom, rand_mul), sse2.MulloEpi16(sserandom, rand_mul)) rand_masked := sse2.AndSi128(sserandom, rand_mask) // Get 8 random bits rand_masked = sse2.MulloEpi16(rand_masked, sse_full_scale_fp) zero := sse2.SetzeroSi128() rand_lo := sse2.SubEpi32(sse_half_scale_fp, sse2.UnpackloEpi16(rand_masked, zero)) rand_hi := sse2.SubEpi32(sse_half_scale_fp, sse2.UnpackhiEpi16(rand_masked, zero)) pix_low = sse2.AddEpi32(pix_low, rand_lo) pix_high = sse2.AddEpi32(pix_high, rand_hi) // Shift down pix_low = sse2.SraiEpi32(pix_low, 10) pix_high = sse2.SraiEpi32(pix_high, 10) // Subtract to avoid clipping pix_low = sse2.SubEpi32(pix_low, ssesub2) pix_high = sse2.SubEpi32(pix_high, ssesub2) // Pack pix_low = sse2.PacksEpi32(pix_low, pix_high) // Shift sign off pix_low = sse2.XorSi128(pix_low, ssesign) line[x] = pix_low } } }