lol_123
8th January 2008, 16:29
Hi All,
I used SSE2 to optimize a video filter. So far, it runs at about 110 fps for 640x480 on a P4 1.4 GHz. I feel that is pretty slow, but I cannot find out where the problem is. So I have posted my code below;
I hope someone can give me some advice.
many thanks, L
// Row stride in __m128i units: bytes per row of 16-bit samples divided by 16.
int Linesize = (m_ImgWidth*sizeof(short))>>4;
// The performance counter and its frequency are 64-bit integers. Keep them in
// LARGE_INTEGER instead of type-punning double storage, which only produced a
// sensible ratio by accident and breaks when denormals are flushed to zero.
LARGE_INTEGER t1, t2;
LARGE_INTEGER persecond;
QueryPerformanceFrequency(&persecond);
QueryPerformanceCounter(&t1);
for(int nRow = 0; nRow < m_ImgHeight; nRow++)
{
    // NOTE(review): the image pointers are passed by value and are not advanced
    // in this loop, so every call appears to receive the same row - confirm.
    ImgProcessing( Img1,
                   Img2,
                   OutImg,
                   para1,
                   para2, Linesize);
}
QueryPerformanceCounter(&t2);
// Frames per second = 1 / (elapsed ticks / ticks per second).
m_FrameRate = 1/((double)(t2.QuadPart - t1.QuadPart)/(double)persecond.QuadPart);
// Processes one image row with SSE2 (MSVC x86 inline assembly, Intel syntax).
//
// Img1, Img2 : rows of 16-bit samples; read with movdqa, so they must be
//              16-byte aligned.
// OutImg     : receives the four result vectors in OutImg[0..3].
// para1      : table of 32 coefficient vectors, addressed below by byte
//              offset from esi (offset = index * 16).
// para2      : loaded into edi but not otherwise read by this routine.
// Linesize   : row length in __m128i units, used to advance the local pointers.
//
// Fixes relative to the posted code: the reference declarators "(&para1)" and
// "(&para2)" were restored (they had been mis-decoded as a pilcrow character),
// "LineSize" was corrected to the parameter name "Linesize", and the missing
// semicolon on the last statement was added. The instruction stream itself is
// unchanged.
__forceinline void ImgProcessing(__m128i* Img1,
                                 __m128i* Img2,
                                 __m128i* OutImg,
                                 const __m128i (&para1)[32],
                                 const int (&para2)[4],
                                 int& Linesize)
{
    // Eight words of 0x00ff: upper clamp bound, loop-invariant so built once.
    __m128i _FFH = _mm_set1_epi16((short)0xff);

    for(int nCol = 0; nCol<((int)m_ImgWidth>>3);nCol++)
    {
        __m128i temp1, temp2, temp3, temp4;
        __asm{
            // ---- setup: pointers and byte offset of the current column ----
            mov ecx, Img1               // begin address
            mov eax, Img2
            mov esi, para1
            mov edi, para2              // edi is not read again below
            mov edx, nCol
            shl edx, 4                  // edx = nCol * 16 bytes

            // xmm0 = (Img1[nCol] * [esi+464]) >> 6   (low 16 bits of each product)
            // xmm2 = ((Img2[nCol] - xmm0) * [esi+448]) >> 6, clamped to [0, 255]
            movdqa xmm0, [ecx+edx]
            pmullw xmm0, [esi+464]
            movdqa xmm2, [eax+edx]
            psraw xmm0, 6
            psubw xmm2, xmm0
            pmullw xmm2, [esi+448]
            psraw xmm2, 6
            pxor xmm1, xmm1
            pminsw xmm2, _FFH
            pmaxsw xmm2, xmm1
            // NOTE(review): edx is reloaded here and never read afterwards, and
            // the clamped xmm2 above is overwritten two instructions later
            // without being used - confirm whether this work is needed.
            mov edx, [esi]

            // ================= result 1 -> temp1 =================
            // Select step: per word, xmm6 = [esi+16] where the mask is set, else
            // xmm0. The mask is built from comparing xmm0 with [esi+16] and from
            // the sign / zero test of [esi+48].
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+16]
            pcmpgtw xmm1, xmm3          // xmm1 = (xmm0 >  [esi+16])
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1              // xmm2 = (xmm0 >= [esi+16])
            pcmpeqd xmm7, xmm7          // xmm7 = all ones
            pandn xmm1, xmm7            // xmm1 = (xmm0 <= [esi+16])
            pxor xmm4, xmm4
            movdqa xmm5, [esi+48]
            pcmpgtw xmm5, xmm4          // xmm5 = ([esi+48] > 0)
            pand xmm1, xmm5
            pandn xmm5, xmm7            // xmm5 = ([esi+48] <= 0)
            pand xmm2, xmm5
            movdqa xmm6, [esi+48]
            pcmpeqw xmm6, xmm4          // xmm6 = ([esi+48] == 0)
            por xmm1, xmm2
            pandn xmm6, xmm1            // final mask, cleared where [esi+48] == 0
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3

            // Second value:
            // xmm0 = ((([ecx] - [esi+96]) * [esi+64]) >> 3) +sat(unsigned) [esi+32]
            // NOTE(review): this and every later image load use [ecx] without the
            // column offset, so they read the first vector of the row on every
            // iteration - confirm against the intended addressing.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+96]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+64]
            psraw xmm0, 3
            paddusw xmm0, [esi+32]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+32]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+64]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            // NOTE(review): the two partial masks are combined with pand here
            // (the select step above uses por), and the final por merges xmm0
            // rather than xmm5 - confirm this asymmetry is intended.
            pand xmm1, xmm2
            movdqa xmm5, [esi+64]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0
            por xmm3, xmm0

            // Combine: masks ([esi+80] > [ecx]) and ([esi+64] > [ecx]) gate xmm6,
            // [esi+16] and xmm3, which are OR-ed together; the result is clamped
            // to [0, 255] and stored in temp1.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+80]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2              // xmm1 is not read again before being reloaded
            movdqa xmm4, [esi+64]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+16]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7            // xmm7 still holds all ones
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6, _FFH
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp1, xmm6

            // ================= result 2 -> temp2 =================
            // Same sequence as result 1, using para1 offsets 128..208.
            // xmm0 = ((([ecx] - [esi+192]) * [esi+176]) >> 3) +sat(unsigned) [esi+128]
            movdqa xmm0, [ecx]
            movdqa xmm2, [esi+192]
            psubw xmm0, xmm2
            pmullw xmm0, [esi+176]
            psraw xmm0, 3
            movdqa xmm1, [esi+128]
            paddusw xmm0, xmm1
            mov edx, [esi]              // edx is not read afterwards
            // Select step against [esi+128], mask from the sign of [esi+160].
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+128]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+160]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            movdqa xmm6, [esi+160]
            pcmpeqw xmm6, xmm4
            por xmm1, xmm2
            pandn xmm6, xmm1
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3
            // Second value:
            // xmm0 = ((([ecx] - [esi+208]) * [esi+176]) >> 3) +sat(unsigned) [esi+144]
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+208]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+176]
            psraw xmm0, 3
            paddusw xmm0, [esi+144]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+144]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+176]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2
            movdqa xmm5, [esi+176]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0
            por xmm3, xmm0
            // Combine with masks ([esi+192] > [ecx]) and ([esi+176] > [ecx]),
            // clamp to [0, 255], store in temp2.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+192]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2
            movdqa xmm4, [esi+176]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+128]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6, _FFH
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp2, xmm6

            // ================= result 3 -> temp3 =================
            // Same sequence, using para1 offsets 240..320.
            // xmm0 = ((([ecx] - [esi+304]) * [esi+288]) >> 3) +sat(unsigned) [esi+240]
            movdqa xmm0, [ecx]
            movdqa xmm2, [esi+304]
            psubw xmm0, xmm2
            pmullw xmm0, [esi+288]
            psraw xmm0, 3
            movdqa xmm1, [esi+240]
            paddusw xmm0, xmm1
            mov edx, [esi]              // edx is not read afterwards
            // Select step against [esi+240], mask from the sign of [esi+272].
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+240]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+272]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            movdqa xmm6, [esi+272]
            pcmpeqw xmm6, xmm4
            por xmm1, xmm2
            pandn xmm6, xmm1
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3
            // Second value:
            // xmm0 = ((([ecx] - [esi+320]) * [esi+288]) >> 3) +sat(unsigned) [esi+256]
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+320]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+288]
            psraw xmm0, 3
            paddusw xmm0, [esi+256]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+256]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+288]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2
            movdqa xmm5, [esi+288]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0
            por xmm3, xmm0
            // Combine with masks ([esi+304] > [ecx]) and ([esi+288] > [ecx]),
            // clamp to [0, 255], store in temp3.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+304]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2
            movdqa xmm4, [esi+288]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+240]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6, _FFH
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp3, xmm6

            // ================= result 4 -> temp4 =================
            // Same sequence, using para1 offsets 352..432.
            // xmm0 = ((([ecx] - [esi+416]) * [esi+400]) >> 3) +sat(unsigned) [esi+352]
            movdqa xmm0, [ecx]
            movdqa xmm2, [esi+416]
            psubw xmm0, xmm2
            pmullw xmm0, [esi+400]
            psraw xmm0, 3
            movdqa xmm1, [esi+352]
            paddusw xmm0, xmm1
            mov edx, [esi]              // edx is not read afterwards
            // Select step against [esi+352], mask from the sign of [esi+384].
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+352]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+384]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            movdqa xmm6, [esi+384]
            pcmpeqw xmm6, xmm4
            por xmm1, xmm2
            pandn xmm6, xmm1
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3
            // Second value:
            // xmm0 = ((([ecx] - [esi+432]) * [esi+400]) >> 3) +sat(unsigned) [esi+368]
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+432]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+400]
            psraw xmm0, 3
            paddusw xmm0, [esi+368]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+368]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+400]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2
            movdqa xmm5, [esi+400]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0
            por xmm3, xmm0
            // Combine with masks ([esi+416] > [ecx]) and ([esi+400] > [ecx]),
            // clamp to [0, 255], store in temp4.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+416]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2
            movdqa xmm4, [esi+400]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+352]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6, _FFH
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp4, xmm6
        }
        // NOTE(review): the same four output slots are written on every column
        // iteration, so only the last iteration's results remain - confirm.
        OutImg[0] = temp1;
        OutImg[1] = temp2;
        OutImg[2] = temp3;
        OutImg[3] = temp4;
    }
    // NOTE(review): these pointers are by-value parameters, so the increments
    // below are not visible to the caller - confirm the intended row stepping.
    Img1 += Linesize;
    Img2 += Linesize;
    OutImg += Linesize*6;
}
I used SSE2 to optimize a video filter. So far, it runs at about 110 fps for 640x480 on a P4 1.4 GHz. I feel that is pretty slow, but I cannot find out where the problem is. So I have posted my code below;
I hope someone can give me some advice.
many thanks, L
// Row stride in __m128i units: bytes per row of 16-bit samples divided by 16.
int Linesize = (m_ImgWidth*sizeof(short))>>4;
// The performance counter and its frequency are 64-bit integers. Keep them in
// LARGE_INTEGER instead of type-punning double storage, which only produced a
// sensible ratio by accident and breaks when denormals are flushed to zero.
LARGE_INTEGER t1, t2;
LARGE_INTEGER persecond;
QueryPerformanceFrequency(&persecond);
QueryPerformanceCounter(&t1);
for(int nRow = 0; nRow < m_ImgHeight; nRow++)
{
    // NOTE(review): the image pointers are passed by value and are not advanced
    // in this loop, so every call appears to receive the same row - confirm.
    ImgProcessing( Img1,
                   Img2,
                   OutImg,
                   para1,
                   para2, Linesize);
}
QueryPerformanceCounter(&t2);
// Frames per second = 1 / (elapsed ticks / ticks per second).
m_FrameRate = 1/((double)(t2.QuadPart - t1.QuadPart)/(double)persecond.QuadPart);
// Processes one image row with SSE2 (MSVC x86 inline assembly, Intel syntax).
//
// Img1, Img2 : rows of 16-bit samples; read with movdqa, so they must be
//              16-byte aligned.
// OutImg     : receives the four result vectors in OutImg[0..3].
// para1      : table of 32 coefficient vectors, addressed below by byte
//              offset from esi (offset = index * 16).
// para2      : loaded into edi but not otherwise read by this routine.
// Linesize   : row length in __m128i units, used to advance the local pointers.
//
// Fixes relative to the posted code: the reference declarators "(&para1)" and
// "(&para2)" were restored (they had been mis-decoded as a pilcrow character),
// "LineSize" was corrected to the parameter name "Linesize", and the missing
// semicolon on the last statement was added. The instruction stream itself is
// unchanged.
__forceinline void ImgProcessing(__m128i* Img1,
                                 __m128i* Img2,
                                 __m128i* OutImg,
                                 const __m128i (&para1)[32],
                                 const int (&para2)[4],
                                 int& Linesize)
{
    // Eight words of 0x00ff: upper clamp bound, loop-invariant so built once.
    __m128i _FFH = _mm_set1_epi16((short)0xff);

    for(int nCol = 0; nCol<((int)m_ImgWidth>>3);nCol++)
    {
        __m128i temp1, temp2, temp3, temp4;
        __asm{
            // ---- setup: pointers and byte offset of the current column ----
            mov ecx, Img1               // begin address
            mov eax, Img2
            mov esi, para1
            mov edi, para2              // edi is not read again below
            mov edx, nCol
            shl edx, 4                  // edx = nCol * 16 bytes

            // xmm0 = (Img1[nCol] * [esi+464]) >> 6   (low 16 bits of each product)
            // xmm2 = ((Img2[nCol] - xmm0) * [esi+448]) >> 6, clamped to [0, 255]
            movdqa xmm0, [ecx+edx]
            pmullw xmm0, [esi+464]
            movdqa xmm2, [eax+edx]
            psraw xmm0, 6
            psubw xmm2, xmm0
            pmullw xmm2, [esi+448]
            psraw xmm2, 6
            pxor xmm1, xmm1
            pminsw xmm2, _FFH
            pmaxsw xmm2, xmm1
            // NOTE(review): edx is reloaded here and never read afterwards, and
            // the clamped xmm2 above is overwritten two instructions later
            // without being used - confirm whether this work is needed.
            mov edx, [esi]

            // ================= result 1 -> temp1 =================
            // Select step: per word, xmm6 = [esi+16] where the mask is set, else
            // xmm0. The mask is built from comparing xmm0 with [esi+16] and from
            // the sign / zero test of [esi+48].
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+16]
            pcmpgtw xmm1, xmm3          // xmm1 = (xmm0 >  [esi+16])
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1              // xmm2 = (xmm0 >= [esi+16])
            pcmpeqd xmm7, xmm7          // xmm7 = all ones
            pandn xmm1, xmm7            // xmm1 = (xmm0 <= [esi+16])
            pxor xmm4, xmm4
            movdqa xmm5, [esi+48]
            pcmpgtw xmm5, xmm4          // xmm5 = ([esi+48] > 0)
            pand xmm1, xmm5
            pandn xmm5, xmm7            // xmm5 = ([esi+48] <= 0)
            pand xmm2, xmm5
            movdqa xmm6, [esi+48]
            pcmpeqw xmm6, xmm4          // xmm6 = ([esi+48] == 0)
            por xmm1, xmm2
            pandn xmm6, xmm1            // final mask, cleared where [esi+48] == 0
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3

            // Second value:
            // xmm0 = ((([ecx] - [esi+96]) * [esi+64]) >> 3) +sat(unsigned) [esi+32]
            // NOTE(review): this and every later image load use [ecx] without the
            // column offset, so they read the first vector of the row on every
            // iteration - confirm against the intended addressing.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+96]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+64]
            psraw xmm0, 3
            paddusw xmm0, [esi+32]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+32]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+64]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            // NOTE(review): the two partial masks are combined with pand here
            // (the select step above uses por), and the final por merges xmm0
            // rather than xmm5 - confirm this asymmetry is intended.
            pand xmm1, xmm2
            movdqa xmm5, [esi+64]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0
            por xmm3, xmm0

            // Combine: masks ([esi+80] > [ecx]) and ([esi+64] > [ecx]) gate xmm6,
            // [esi+16] and xmm3, which are OR-ed together; the result is clamped
            // to [0, 255] and stored in temp1.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+80]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2              // xmm1 is not read again before being reloaded
            movdqa xmm4, [esi+64]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+16]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7            // xmm7 still holds all ones
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6, _FFH
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp1, xmm6

            // ================= result 2 -> temp2 =================
            // Same sequence as result 1, using para1 offsets 128..208.
            // xmm0 = ((([ecx] - [esi+192]) * [esi+176]) >> 3) +sat(unsigned) [esi+128]
            movdqa xmm0, [ecx]
            movdqa xmm2, [esi+192]
            psubw xmm0, xmm2
            pmullw xmm0, [esi+176]
            psraw xmm0, 3
            movdqa xmm1, [esi+128]
            paddusw xmm0, xmm1
            mov edx, [esi]              // edx is not read afterwards
            // Select step against [esi+128], mask from the sign of [esi+160].
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+128]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+160]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            movdqa xmm6, [esi+160]
            pcmpeqw xmm6, xmm4
            por xmm1, xmm2
            pandn xmm6, xmm1
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3
            // Second value:
            // xmm0 = ((([ecx] - [esi+208]) * [esi+176]) >> 3) +sat(unsigned) [esi+144]
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+208]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+176]
            psraw xmm0, 3
            paddusw xmm0, [esi+144]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+144]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+176]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2
            movdqa xmm5, [esi+176]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0
            por xmm3, xmm0
            // Combine with masks ([esi+192] > [ecx]) and ([esi+176] > [ecx]),
            // clamp to [0, 255], store in temp2.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+192]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2
            movdqa xmm4, [esi+176]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+128]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6, _FFH
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp2, xmm6

            // ================= result 3 -> temp3 =================
            // Same sequence, using para1 offsets 240..320.
            // xmm0 = ((([ecx] - [esi+304]) * [esi+288]) >> 3) +sat(unsigned) [esi+240]
            movdqa xmm0, [ecx]
            movdqa xmm2, [esi+304]
            psubw xmm0, xmm2
            pmullw xmm0, [esi+288]
            psraw xmm0, 3
            movdqa xmm1, [esi+240]
            paddusw xmm0, xmm1
            mov edx, [esi]              // edx is not read afterwards
            // Select step against [esi+240], mask from the sign of [esi+272].
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+240]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+272]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            movdqa xmm6, [esi+272]
            pcmpeqw xmm6, xmm4
            por xmm1, xmm2
            pandn xmm6, xmm1
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3
            // Second value:
            // xmm0 = ((([ecx] - [esi+320]) * [esi+288]) >> 3) +sat(unsigned) [esi+256]
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+320]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+288]
            psraw xmm0, 3
            paddusw xmm0, [esi+256]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+256]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+288]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2
            movdqa xmm5, [esi+288]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0
            por xmm3, xmm0
            // Combine with masks ([esi+304] > [ecx]) and ([esi+288] > [ecx]),
            // clamp to [0, 255], store in temp3.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+304]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2
            movdqa xmm4, [esi+288]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+240]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6, _FFH
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp3, xmm6

            // ================= result 4 -> temp4 =================
            // Same sequence, using para1 offsets 352..432.
            // xmm0 = ((([ecx] - [esi+416]) * [esi+400]) >> 3) +sat(unsigned) [esi+352]
            movdqa xmm0, [ecx]
            movdqa xmm2, [esi+416]
            psubw xmm0, xmm2
            pmullw xmm0, [esi+400]
            psraw xmm0, 3
            movdqa xmm1, [esi+352]
            paddusw xmm0, xmm1
            mov edx, [esi]              // edx is not read afterwards
            // Select step against [esi+352], mask from the sign of [esi+384].
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+352]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+384]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            movdqa xmm6, [esi+384]
            pcmpeqw xmm6, xmm4
            por xmm1, xmm2
            pandn xmm6, xmm1
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3
            // Second value:
            // xmm0 = ((([ecx] - [esi+432]) * [esi+400]) >> 3) +sat(unsigned) [esi+368]
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+432]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+400]
            psraw xmm0, 3
            paddusw xmm0, [esi+368]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+368]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+400]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2
            movdqa xmm5, [esi+400]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0
            por xmm3, xmm0
            // Combine with masks ([esi+416] > [ecx]) and ([esi+400] > [ecx]),
            // clamp to [0, 255], store in temp4.
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+416]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2
            movdqa xmm4, [esi+400]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+352]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6, _FFH
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp4, xmm6
        }
        // NOTE(review): the same four output slots are written on every column
        // iteration, so only the last iteration's results remain - confirm.
        OutImg[0] = temp1;
        OutImg[1] = temp2;
        OutImg[2] = temp3;
        OutImg[3] = temp4;
    }
    // NOTE(review): these pointers are by-value parameters, so the increments
    // below are not visible to the caller - confirm the intended row stepping.
    Img1 += Linesize;
    Img2 += Linesize;
    OutImg += Linesize*6;
}