PDA

View Full Version : Avisynth filter optimization


lol_123
8th January 2008, 16:29
Hi All,
I used SSE2 to optimize a video filter. So far it runs at about 110 fps for 640x480 on a P4 1.4 GHz. I feel that is pretty slow, but I cannot find out where the problem is, so I have posted my code below;
hope someone can give me some advice.

many thanks, L


int Linesize = (m_ImgWidth*sizeof(short))>>4;
double t1,t2;
double persecond;
QueryPerformanceFrequency((LARGE_INTEGER *)&persecond);
QueryPerformanceCounter((LARGE_INTEGER *)&t1);

for(int nRow = 0;nRow < m_ImgHeight; nRow++)
{

ImgProcessing( Img1,
Img2,
OutImg,
para1,
para2,Linesize);
}
QueryPerformanceCounter((LARGE_INTEGER *)&t2);
m_FrameRate=1/((double)(t2-t1)/(double)persecond);



// Runs the SSE2 kernel once per group of 8 16-bit samples in a row
// (m_ImgWidth >> 3 iterations) and stores four result vectors to OutImg[0..3].
//
//   Img1, Img2 : input rows, read with movdqa (must be 16-byte aligned).
//   OutImg     : output, four __m128i written per iteration.
//   para1      : table of 32 coefficient vectors, addressed in the asm by
//                byte offset (offset / 16 == array index).
//   para2      : loaded into edi but never read by the asm below.
//   Linesize   : row length in __m128i units, used only for the trailing
//                pointer increments.
//
// Syntax: MSVC 32-bit inline assembler (Intel operand order).
// Registers used: eax, ecx, edx, esi, edi, xmm0-xmm7.
//
// Changes relative to the posted version: `LineSize` -> `Linesize` (the
// former is not declared), the missing ';' on the last statement, and the
// 0x00FF constant is created once instead of on every iteration. The
// instruction stream itself is unchanged; suspicious places are marked
// NOTE(review) instead of being altered, because the intended algorithm is
// not shown.
__forceinline void ImgProcessing(__m128i* Img1,
                                 __m128i* Img2,
                                 __m128i* OutImg,
                                 const __m128i (&para1)[32],
                                 const int (&para2)[4],
                                 int& Linesize)
{
    // 0x00FF in each 16-bit lane: upper bound for the pminsw/pmaxsw clamps.
    __m128i maxByte = _mm_set1_epi16((short)0xff);

    for(int nCol = 0; nCol<((int)m_ImgWidth>>3);nCol++)
    {
        __m128i temp1, temp2, temp3, temp4;

        __asm{
            // ---- setup: base pointers and byte offset of this column ----
            mov ecx, Img1 //begin address
            mov eax, Img2
            mov esi, para1              // base of the coefficient table
            mov edi, para2              // NOTE(review): edi is never read afterwards
            mov edx, nCol
            shl edx, 4                  // edx = nCol * 16 bytes

            // xmm0 = (Img1[nCol] * para1[+464]) >> 6
            movdqa xmm0, [ecx+edx]
            pmullw xmm0, [esi+464]
            movdqa xmm2, [eax+edx]
            psraw xmm0, 6
            // xmm2 = clamp(((Img2[nCol] - xmm0) * para1[+448]) >> 6, 0, 255)
            // NOTE(review): xmm2 is overwritten below before being read, so this
            // Img2-based result is discarded -- dead code or a missing store?
            psubw xmm2, xmm0
            pmullw xmm2, [esi+448]
            psraw xmm2, 6
            pxor xmm1, xmm1
            pminsw xmm2, maxByte
            pmaxsw xmm2, xmm1
            // NOTE(review): this overwrites the column offset in edx; the loaded
            // value is never used, and every later Img1 load is [ecx] (column 0).
            mov edx, [esi]

            // ---- temp1, stage 1: xmm6 = mask ? para1[+16] : xmm0 ----
            // mask is built from compares of xmm0 against para1[+16] and of
            // para1[+48] against zero.
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+16]
            pcmpgtw xmm1, xmm3          // xmm1 = xmm0 >  limit
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1              // xmm2 = xmm0 >= limit
            pcmpeqd xmm7, xmm7          // xmm7 = all ones
            pandn xmm1, xmm7            // xmm1 = xmm0 <= limit
            pxor xmm4, xmm4
            movdqa xmm5, [esi+48]
            pcmpgtw xmm5, xmm4          // xmm5 = para1[+48] > 0
            pand xmm1, xmm5
            pandn xmm5, xmm7            // xmm5 = para1[+48] <= 0
            pand xmm2, xmm5
            movdqa xmm6, [esi+48]
            pcmpeqw xmm6, xmm4          // xmm6 = para1[+48] == 0
            por xmm1, xmm2
            pandn xmm6, xmm1            // final select mask
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3

            // ---- temp1, stage 2: second candidate in xmm3 ----
            // xmm0 = satadd(((Img1[0] - para1[+96]) * para1[+64]) >> 3, para1[+32])
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+96]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+64]
            psraw xmm0, 3
            paddusw xmm0, [esi+32]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+32]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+64]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2             // NOTE(review): stage 1 combines with por here
            movdqa xmm5, [esi+64]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0            // NOTE(review): result in xmm5 is not used...
            por xmm3, xmm0              // ...this ORs the unmasked xmm0 instead

            // ---- temp1, stage 3: merge by range of the raw Img1 value ----
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+80]
            pcmpgtw xmm1, xmm0          // xmm1 = para1[+80] > Img1
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0            // NOTE(review): xmm1 is not read again
            por xmm1, xmm2              //               before being reloaded
            movdqa xmm4, [esi+64]
            pcmpgtw xmm4, xmm0          // xmm4 = para1[+64] > Img1
            movdqa xmm5, [esi+16]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6,maxByte         // clamp to [0, 255]
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp1, xmm6

            // ---- temp2, stage 1 (para1 offsets +128 .. +208) ----
            // xmm0 = satadd(((Img1[0] - para1[+192]) * para1[+176]) >> 3, para1[+128])
            movdqa xmm0, [ecx]
            movdqa xmm2, [esi+192]
            psubw xmm0, xmm2
            pmullw xmm0, [esi+176]
            psraw xmm0, 3
            movdqa xmm1, [esi+128]
            paddusw xmm0, xmm1
            mov edx, [esi]              // NOTE(review): loaded value unused
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+128]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+160]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            movdqa xmm6, [esi+160]
            pcmpeqw xmm6, xmm4
            por xmm1, xmm2
            pandn xmm6, xmm1
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3

            // ---- temp2, stage 2 ----
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+208]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+176]
            psraw xmm0, 3
            paddusw xmm0, [esi+144]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+144]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+176]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2
            movdqa xmm5, [esi+176]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0            // NOTE(review): xmm5 result unused (as in temp1)
            por xmm3, xmm0

            // ---- temp2, stage 3 ----
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+192]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2
            movdqa xmm4, [esi+176]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+128]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6,maxByte         // clamp to [0, 255]
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp2, xmm6

            // ---- temp3, stage 1 (para1 offsets +240 .. +320) ----
            movdqa xmm0, [ecx]
            movdqa xmm2, [esi+304]
            psubw xmm0, xmm2
            pmullw xmm0, [esi+288]
            psraw xmm0, 3
            movdqa xmm1, [esi+240]
            paddusw xmm0, xmm1
            mov edx, [esi]              // NOTE(review): loaded value unused
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+240]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+272]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            movdqa xmm6, [esi+272]
            pcmpeqw xmm6, xmm4
            por xmm1, xmm2
            pandn xmm6, xmm1
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3

            // ---- temp3, stage 2 ----
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+320]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+288]
            psraw xmm0, 3
            paddusw xmm0, [esi+256]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+256]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+288]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2
            movdqa xmm5, [esi+288]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0            // NOTE(review): xmm5 result unused (as in temp1)
            por xmm3, xmm0

            // ---- temp3, stage 3 ----
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+304]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2
            movdqa xmm4, [esi+288]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+240]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6,maxByte         // clamp to [0, 255]
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp3, xmm6

            // ---- temp4, stage 1 (para1 offsets +352 .. +432) ----
            movdqa xmm0, [ecx]
            movdqa xmm2, [esi+416]
            psubw xmm0, xmm2
            pmullw xmm0, [esi+400]
            psraw xmm0, 3
            movdqa xmm1, [esi+352]
            paddusw xmm0, xmm1
            mov edx, [esi]              // NOTE(review): loaded value unused
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+352]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+384]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            movdqa xmm6, [esi+384]
            pcmpeqw xmm6, xmm4
            por xmm1, xmm2
            pandn xmm6, xmm1
            pand xmm3, xmm6
            pandn xmm6, xmm0
            por xmm6, xmm3

            // ---- temp4, stage 2 ----
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+432]
            psubw xmm0, xmm1
            pmullw xmm0, [esi+400]
            psraw xmm0, 3
            paddusw xmm0, [esi+368]
            movdqa xmm1, xmm0
            movdqa xmm2, xmm0
            movdqa xmm3, [esi+368]
            pcmpgtw xmm1, xmm3
            pcmpeqw xmm2, xmm3
            por xmm2, xmm1
            pcmpeqd xmm7, xmm7
            pandn xmm1, xmm7
            pxor xmm4, xmm4
            movdqa xmm5, [esi+400]
            pcmpgtw xmm5, xmm4
            pand xmm1, xmm5
            pandn xmm5, xmm7
            pand xmm2, xmm5
            pand xmm1, xmm2
            movdqa xmm5, [esi+400]
            pcmpeqw xmm5, xmm4
            pandn xmm5, xmm1
            pand xmm3, xmm5
            pandn xmm5, xmm0            // NOTE(review): xmm5 result unused (as in temp1)
            por xmm3, xmm0

            // ---- temp4, stage 3 ----
            movdqa xmm0, [ecx]
            movdqa xmm1, [esi+416]
            pcmpgtw xmm1, xmm0
            pand xmm6, xmm1
            movdqa xmm2, xmm1
            pandn xmm1, xmm0
            por xmm1, xmm2
            movdqa xmm4, [esi+400]
            pcmpgtw xmm4, xmm0
            movdqa xmm5, [esi+352]
            pand xmm5, xmm4
            por xmm6, xmm5
            pandn xmm2, xmm7
            pandn xmm4, xmm2
            pand xmm3, xmm4
            por xmm6, xmm3
            pminsw xmm6,maxByte         // clamp to [0, 255]
            pxor xmm0, xmm0
            pmaxsw xmm6, xmm0
            movdqa temp4, xmm6

        }
        // NOTE(review): the destination does not depend on nCol, so every
        // iteration overwrites the same four vectors -- confirm intended layout.
        OutImg[0] = temp1;
        OutImg[1] = temp2;
        OutImg[2] = temp3;
        OutImg[3] = temp4;

    }
    // NOTE(review): the pointers are passed by value, so these increments are
    // not visible to the caller.
    Img1 += Linesize;
    Img2 += Linesize;
    OutImg += Linesize*6;

}

Leak
8th January 2008, 17:19
Ugh.

Please, please, please put your code into a [code] tag. And also - comment it, and tell us what it's supposed to do, and if possible also supply a non-assembler version of what your SSE code is supposed to do, so we can actually grasp the algorithm behind it without having to tear the SSE code apart...

Looking at a soup of assembler code just isn't a very good basis for suggesting improvements if you didn't write it yourself and thus know it in and out...

np: Mike Shannon - The Last Days (Possible Conclusions To Stories That Never End)

sh0dan
8th January 2008, 19:56
Seems good.

You have many memory reads. You can hide load latency by loading into a register earlier than you need it.

You also use the same data several, or at least two times. A load into an unused register and re-using the data is always faster than two memory reads.

Otherwise I doubt you can speed it up much.

IanB
8th January 2008, 22:18
The code has no comments, adding clear and concise comments will help you make it faster. There is dead code! (which may well be a bug) I suspect you may be a troll. There is just tooooo much code, if you delete about 80% of it, it will run a lot faster.

lol_123
9th January 2008, 04:15
Thank you Leak. I don't think there is room for architectural optimization of the current algorithm; that is the reason why I did not post the C code here.
Thank you Dan.
Due to register pressure, I have to load some values from memory twice. I will try loading earlier and report back to you.
Thanks again. L

lol_123
9th January 2008, 04:27
60-70% of my code is there to eliminate if/else branches; I am not sure whether it is worth doing.