MeteorRain
4th April 2006, 09:07
I'd like to calculate the distance between two colors in RGB color space. I wrote two functions: one uses sqrt, the other uses SSE2 code. In my 2000-frame test, the former takes 39 seconds while the optimized code takes 48 seconds. Theoretically the SSE code should run much faster than the ordinary code, so I suspect there is a bottleneck in my code. Please help me find the problem. Thanks in advance.
my code:
/* Add the distance between (tr, tg, tb) and (r, g, b) to diff;
 * the sse flag selects which implementation computes it. */
if(sse)
/* Each per-channel difference is converted to float before the call. */
diff += _sse2_dist((float)(tr - r), (float)(tg - g), (float)(tb - b));
else
/* Only the first factor carries a float cast.
 * NOTE(review): if the channel values are integers, the green and blue
 * squares are formed in integer arithmetic before being added to the
 * float term -- confirm the operand types. */
diff += sqrt((float)(tr - r) * (tr - r) + (tg - g) * (tg - g) + (tb - b) * (tb - b));
/*
 * Length of the 3-vector (a, b, c): sqrt(a*a + b*b + c*c).
 *
 * Only single-precision SSE intrinsics from <xmmintrin.h> are used.
 * The value stays in an XMM register from start to finish; no stack
 * array is involved.
 */
float _sse2_dist(float a, float b, float c)
{
    /* Lanes, low to high: a, b, c, 0.  Packing with _mm_set_ps avoids
     * writing three 4-byte scalars to memory and reading them back with
     * one 16-byte load, a pattern that typically cannot be store-forwarded
     * and stalls the pipeline. */
    const __m128 x = _mm_set_ps(0.0f, c, b, a);
    const __m128 s = _mm_mul_ps(x, x);

    /* movehl brings lanes 2..3 down to 0..1: lane 0 becomes a*a + c*c. */
    __m128 r = _mm_add_ss(s, _mm_movehl_ps(s, s));

    /* Shuffle immediate 1 puts lane 1 (b*b) into lane 0: lane 0 += b*b. */
    r = _mm_add_ss(r, _mm_shuffle_ps(r, r, 1));

    /* Only lane 0 is needed, so take a scalar square root and read the
     * result straight out of the register. */
    return _mm_cvtss_f32(_mm_sqrt_ss(r));
}
regards,
MeteorRain
my code:
/* Add the distance between (tr, tg, tb) and (r, g, b) to diff;
 * the sse flag selects which implementation computes it. */
if(sse)
/* Each per-channel difference is converted to float before the call. */
diff += _sse2_dist((float)(tr - r), (float)(tg - g), (float)(tb - b));
else
/* Only the first factor carries a float cast.
 * NOTE(review): if the channel values are integers, the green and blue
 * squares are formed in integer arithmetic before being added to the
 * float term -- confirm the operand types. */
diff += sqrt((float)(tr - r) * (tr - r) + (tg - g) * (tg - g) + (tb - b) * (tb - b));
/*
 * Length of the 3-vector (a, b, c): sqrt(a*a + b*b + c*c).
 *
 * Only single-precision SSE intrinsics from <xmmintrin.h> are used.
 * The value stays in an XMM register from start to finish; no stack
 * array is involved.
 */
float _sse2_dist(float a, float b, float c)
{
    /* Lanes, low to high: a, b, c, 0.  Packing with _mm_set_ps avoids
     * writing three 4-byte scalars to memory and reading them back with
     * one 16-byte load, a pattern that typically cannot be store-forwarded
     * and stalls the pipeline. */
    const __m128 x = _mm_set_ps(0.0f, c, b, a);
    const __m128 s = _mm_mul_ps(x, x);

    /* movehl brings lanes 2..3 down to 0..1: lane 0 becomes a*a + c*c. */
    __m128 r = _mm_add_ss(s, _mm_movehl_ps(s, s));

    /* Shuffle immediate 1 puts lane 1 (b*b) into lane 0: lane 0 += b*b. */
    r = _mm_add_ss(r, _mm_shuffle_ps(r, r, 1));

    /* Only lane 0 is needed, so take a scalar square root and read the
     * result straight out of the register. */
    return _mm_cvtss_f32(_mm_sqrt_ss(r));
}
regards,
MeteorRain