-h
11th April 2002, 14:11
Just started working with SSE2, and I'd like a critique of some SAD code. It doesn't really matter if you don't know what a SAD routine is supposed to do; I'm just wondering how pipelining etc. should work. Assume esi is 16-byte aligned:
;-----------------------------------------------------------------------
; sad16_sse2 -- sum of absolute differences over 16 rows of 16 bytes.
; Presumed C view (types are not shown in this listing -- confirm):
;   unsigned sad16_sse2(const uint8_t *cur, const uint8_t *ref,
;                       unsigned stride)
; ABI:   32-bit x86, arguments on the stack (first one at [esp+4] on entry)
; In:    cur    -- used as a psadbw memory operand, so every row address
;                  (cur + k*stride, k = 0..15) must be 16-byte aligned
;        ref    -- fetched with movdqu, so no alignment is required
;        stride -- byte distance between consecutive rows of both blocks
; Out:   eax = total SAD
; Saves: esi, edi, ebx (pushed on entry, popped on exit)
; Clobb: ecx, edx, xmm0-xmm3, xmm6, xmm7
;
; Each psadbw leaves one partial sum (<= 8*255 = 2040) in the low word of
; each 64-bit half; 16 rows give at most 32640 per half, so the 16-bit
; saturating adds used for accumulation never actually saturate.
;-----------------------------------------------------------------------
sad16_sse2:
        push    esi
        push    edi
        push    ebx

        ; 12 bytes of saved registers + 4 bytes of return address
        mov     esi, [esp + 16]         ; cur
        mov     edi, [esp + 20]         ; ref
        mov     eax, [esp + 24]         ; stride

        lea     ebx, [eax + eax*2]      ; ebx = stride*3
        lea     ecx, [eax + eax*4]      ; ecx = stride*5
        lea     edx, [ebx + eax*4]      ; edx = stride*7

        pxor    xmm7, xmm7              ; xmm7 = running sum = 0

        ; ---- rows 0..3 ----
        movdqu  xmm0, [edi]             ; ref row 0
        movdqu  xmm1, [edi + eax]       ; ref row 1
        movdqu  xmm2, [edi + eax*2]     ; ref row 2
        movdqu  xmm3, [edi + ebx]       ; ref row 3
        psadbw  xmm0, [esi]             ; |ref - cur| summed per 8-byte half
        psadbw  xmm1, [esi + eax]
        psadbw  xmm2, [esi + eax*2]
        psadbw  xmm3, [esi + ebx]
        paddusw xmm7, xmm0              ; fold the four row results into sum
        paddusw xmm7, xmm1
        paddusw xmm7, xmm2
        paddusw xmm7, xmm3

        ; ---- rows 4..7 ----
        movdqu  xmm0, [edi + eax*4]     ; ref row 4
        movdqu  xmm1, [edi + ecx]       ; ref row 5
        movdqu  xmm2, [edi + ebx*2]     ; ref row 6 (2 * stride*3)
        movdqu  xmm3, [edi + edx]       ; ref row 7
        psadbw  xmm0, [esi + eax*4]
        psadbw  xmm1, [esi + ecx]
        psadbw  xmm2, [esi + ebx*2]
        psadbw  xmm3, [esi + edx]
        paddusw xmm7, xmm0
        paddusw xmm7, xmm1
        paddusw xmm7, xmm2
        paddusw xmm7, xmm3

        ; step both pointers down 8 rows so the same offsets cover rows 8..15
        lea     esi, [esi + eax*8]
        lea     edi, [edi + eax*8]

        ; ---- rows 8..11 ----
        movdqu  xmm0, [edi]             ; ref row 8
        movdqu  xmm1, [edi + eax]       ; ref row 9
        movdqu  xmm2, [edi + eax*2]     ; ref row 10
        movdqu  xmm3, [edi + ebx]       ; ref row 11
        psadbw  xmm0, [esi]
        psadbw  xmm1, [esi + eax]
        psadbw  xmm2, [esi + eax*2]
        psadbw  xmm3, [esi + ebx]
        paddusw xmm7, xmm0
        paddusw xmm7, xmm1
        paddusw xmm7, xmm2
        paddusw xmm7, xmm3

        ; ---- rows 12..15 ----
        movdqu  xmm0, [edi + eax*4]     ; ref row 12
        movdqu  xmm1, [edi + ecx]       ; ref row 13
        movdqu  xmm2, [edi + ebx*2]     ; ref row 14
        movdqu  xmm3, [edi + edx]       ; ref row 15
        psadbw  xmm0, [esi + eax*4]
        psadbw  xmm1, [esi + ecx]
        psadbw  xmm2, [esi + ebx*2]
        psadbw  xmm3, [esi + edx]
        paddusw xmm7, xmm0
        paddusw xmm7, xmm1
        paddusw xmm7, xmm2
        paddusw xmm7, xmm3

        ; horizontal reduce: add the high 64-bit half onto the low half
        movdqa  xmm6, xmm7              ; copy sum
        psrldq  xmm6, 8                 ; bring the high half down
        paddq   xmm7, xmm6              ; low qword = low + high
        movd    eax, xmm7               ; return value = low dword

        pop     ebx
        pop     edi
        pop     esi
        ret
Now I'm not all that interested in whether it would compile or not (probably not, I don't have a P4 to test on atm), but is that the way the reads/psadbw's/paddusw's should be structured? Or would this be better/faster:
; Interleaved variant of rows 0..3 from sad16_sse2 above.
; Register roles are the ones that routine sets up: edi = ref, esi = cur,
; eax = stride, ebx = stride*3, xmm7 = running sum.
; The accumulate for row k is placed after the load of row k+1 and before
; that row's psadbw, instead of grouping loads, psadbw's and adds in batches.
movdqu xmm0, [edi] ; ref row 0 (unaligned load)
psadbw xmm0, [esi] ; xmm0 = SAD of row 0 against cur
movdqu xmm1, [edi+eax] ; ref row 1
paddusw xmm7, xmm0 ; sum += row 0 result
psadbw xmm1, [esi+eax] ; xmm1 = SAD of row 1
movdqu xmm2, [edi+eax*2] ; ref row 2
paddusw xmm7, xmm1 ; sum += row 1 result
psadbw xmm2, [esi+eax*2] ; xmm2 = SAD of row 2
movdqu xmm3, [edi+ebx] ; ref row 3
paddusw xmm7, xmm2 ; sum += row 2 result
psadbw xmm3, [esi+ebx] ; xmm3 = SAD of row 3
paddusw xmm7, xmm3 ; sum += row 3 result
Where I've just interleaved the paddusw's to immediately follow the unaligned read, with the uninformed hope that it'll perform the paddusw while waiting for the read to complete, seeing as the psadbw will be waiting for the unaligned read to finish.
But yeah, this is my first time using SSE2, so I'm just after some general ideas.
Edit: corrected some mm6/mm7 -> xmm6/xmm7 mistakes.
-h
;-----------------------------------------------------------------------
; sad16_sse2 -- sum of absolute differences over 16 rows of 16 bytes.
; Presumed C view (types are not shown in this listing -- confirm):
;   unsigned sad16_sse2(const uint8_t *cur, const uint8_t *ref,
;                       unsigned stride)
; ABI:   32-bit x86, arguments on the stack (first one at [esp+4] on entry)
; In:    cur    -- used as a psadbw memory operand, so every row address
;                  (cur + k*stride, k = 0..15) must be 16-byte aligned
;        ref    -- fetched with movdqu, so no alignment is required
;        stride -- byte distance between consecutive rows of both blocks
; Out:   eax = total SAD
; Saves: esi, edi, ebx (pushed on entry, popped on exit)
; Clobb: ecx, edx, xmm0-xmm3, xmm6, xmm7
;
; Each psadbw leaves one partial sum (<= 8*255 = 2040) in the low word of
; each 64-bit half; 16 rows give at most 32640 per half, so the 16-bit
; saturating adds used for accumulation never actually saturate.
;-----------------------------------------------------------------------
sad16_sse2:
        push    esi
        push    edi
        push    ebx

        ; 12 bytes of saved registers + 4 bytes of return address
        mov     esi, [esp + 16]         ; cur
        mov     edi, [esp + 20]         ; ref
        mov     eax, [esp + 24]         ; stride

        lea     ebx, [eax + eax*2]      ; ebx = stride*3
        lea     ecx, [eax + eax*4]      ; ecx = stride*5
        lea     edx, [ebx + eax*4]      ; edx = stride*7

        pxor    xmm7, xmm7              ; xmm7 = running sum = 0

        ; ---- rows 0..3 ----
        movdqu  xmm0, [edi]             ; ref row 0
        movdqu  xmm1, [edi + eax]       ; ref row 1
        movdqu  xmm2, [edi + eax*2]     ; ref row 2
        movdqu  xmm3, [edi + ebx]       ; ref row 3
        psadbw  xmm0, [esi]             ; |ref - cur| summed per 8-byte half
        psadbw  xmm1, [esi + eax]
        psadbw  xmm2, [esi + eax*2]
        psadbw  xmm3, [esi + ebx]
        paddusw xmm7, xmm0              ; fold the four row results into sum
        paddusw xmm7, xmm1
        paddusw xmm7, xmm2
        paddusw xmm7, xmm3

        ; ---- rows 4..7 ----
        movdqu  xmm0, [edi + eax*4]     ; ref row 4
        movdqu  xmm1, [edi + ecx]       ; ref row 5
        movdqu  xmm2, [edi + ebx*2]     ; ref row 6 (2 * stride*3)
        movdqu  xmm3, [edi + edx]       ; ref row 7
        psadbw  xmm0, [esi + eax*4]
        psadbw  xmm1, [esi + ecx]
        psadbw  xmm2, [esi + ebx*2]
        psadbw  xmm3, [esi + edx]
        paddusw xmm7, xmm0
        paddusw xmm7, xmm1
        paddusw xmm7, xmm2
        paddusw xmm7, xmm3

        ; step both pointers down 8 rows so the same offsets cover rows 8..15
        lea     esi, [esi + eax*8]
        lea     edi, [edi + eax*8]

        ; ---- rows 8..11 ----
        movdqu  xmm0, [edi]             ; ref row 8
        movdqu  xmm1, [edi + eax]       ; ref row 9
        movdqu  xmm2, [edi + eax*2]     ; ref row 10
        movdqu  xmm3, [edi + ebx]       ; ref row 11
        psadbw  xmm0, [esi]
        psadbw  xmm1, [esi + eax]
        psadbw  xmm2, [esi + eax*2]
        psadbw  xmm3, [esi + ebx]
        paddusw xmm7, xmm0
        paddusw xmm7, xmm1
        paddusw xmm7, xmm2
        paddusw xmm7, xmm3

        ; ---- rows 12..15 ----
        movdqu  xmm0, [edi + eax*4]     ; ref row 12
        movdqu  xmm1, [edi + ecx]       ; ref row 13
        movdqu  xmm2, [edi + ebx*2]     ; ref row 14
        movdqu  xmm3, [edi + edx]       ; ref row 15
        psadbw  xmm0, [esi + eax*4]
        psadbw  xmm1, [esi + ecx]
        psadbw  xmm2, [esi + ebx*2]
        psadbw  xmm3, [esi + edx]
        paddusw xmm7, xmm0
        paddusw xmm7, xmm1
        paddusw xmm7, xmm2
        paddusw xmm7, xmm3

        ; horizontal reduce: add the high 64-bit half onto the low half
        movdqa  xmm6, xmm7              ; copy sum
        psrldq  xmm6, 8                 ; bring the high half down
        paddq   xmm7, xmm6              ; low qword = low + high
        movd    eax, xmm7               ; return value = low dword

        pop     ebx
        pop     edi
        pop     esi
        ret
Now I'm not all that interested in whether it would compile or not (probably not, I don't have a P4 to test on atm), but is that the way the reads/psadbw's/paddusw's should be structured? Or would this be better/faster:
; Interleaved variant of rows 0..3 from sad16_sse2 above.
; Register roles are the ones that routine sets up: edi = ref, esi = cur,
; eax = stride, ebx = stride*3, xmm7 = running sum.
; The accumulate for row k is placed after the load of row k+1 and before
; that row's psadbw, instead of grouping loads, psadbw's and adds in batches.
movdqu xmm0, [edi] ; ref row 0 (unaligned load)
psadbw xmm0, [esi] ; xmm0 = SAD of row 0 against cur
movdqu xmm1, [edi+eax] ; ref row 1
paddusw xmm7, xmm0 ; sum += row 0 result
psadbw xmm1, [esi+eax] ; xmm1 = SAD of row 1
movdqu xmm2, [edi+eax*2] ; ref row 2
paddusw xmm7, xmm1 ; sum += row 1 result
psadbw xmm2, [esi+eax*2] ; xmm2 = SAD of row 2
movdqu xmm3, [edi+ebx] ; ref row 3
paddusw xmm7, xmm2 ; sum += row 2 result
psadbw xmm3, [esi+ebx] ; xmm3 = SAD of row 3
paddusw xmm7, xmm3 ; sum += row 3 result
Where I've just interleaved the paddusw's to immediately follow the unaligned read, with the uninformed hope that it'll perform the paddusw while waiting for the read to complete, seeing as the psadbw will be waiting for the unaligned read to finish.
But yeah, this is my first time using SSE2, so I'm just after some general ideas.
Edit: corrected some mm6/mm7 -> xmm6/xmm7 mistakes.
-h