Code:
; Gather a 3x3-ish neighborhood of 16 consecutive pixels centered on [esi]:
; the same 16 bytes shifted left/right by one, for the current row and for
; the rows at offsets edx and eax.
; NOTE(review): edx/eax presumably hold row strides set up outside this
; excerpt -- confirm against the surrounding loop.
movdqu xmm2, [esi-1]            ; current row, shifted left 1 byte (unaligned)
movdqa xmm3, [esi]              ; current row, center (aligned load)
movdqu xmm4, [esi+1]            ; current row, shifted right 1 byte (unaligned)
movdqu xmm5, [esi+edx-1]        ; row at +edx, left
movdqa xmm6, [esi+edx]          ; row at +edx, center (aligned)
movdqu xmm7, [esi+edx+1]        ; row at +edx, right
...
movdqu xmm1, [esi+eax-1]        ; row at +eax, left
movdqu xmm3, [esi+eax+1]        ; row at +eax, right -- reuses xmm3, so the
                                ; center value must already be consumed in
                                ; the elided code above (not visible here)
This is what palignr was made for; SSSE3-ifying this with palignr will avoid all the unaligned loads nicely. If you retain loads between loop iterations, you can reduce the number of memory accesses, too.
Code:
; Symmetric horizontal averaging: for each offset k = 6..1, compute the
; per-byte average of the pixels k bytes left and k bytes right of the
; current position.  pavgb computes (a + b + 1) >> 1 per unsigned byte
; (rounds up), so after this sequence xmmK = avg(p[-k], p[+k]) and xmm0
; holds the center pixels.  All off-center loads are unaligned (movdqu).
movdqu xmm6, [esi-6]            ; p[-6]
movdqu xmm0, [esi+6]            ; p[+6]
pavgb xmm6, xmm0                ; xmm6 = avg(p[-6], p[+6])
movdqu xmm5, [esi-5]            ; p[-5]
movdqu xmm7, [esi+5]            ; p[+5]
pavgb xmm5, xmm7                ; xmm5 = avg(p[-5], p[+5])
movdqu xmm4, [esi-4]            ; p[-4]
movdqu xmm0, [esi+4]            ; p[+4] (xmm0 reused as scratch)
pavgb xmm4, xmm0                ; xmm4 = avg(p[-4], p[+4])
movdqu xmm3, [esi-3]            ; p[-3]
movdqu xmm7, [esi+3]            ; p[+3] (xmm7 reused as scratch)
pavgb xmm3, xmm7                ; xmm3 = avg(p[-3], p[+3])
movdqu xmm2, [esi-2]            ; p[-2]
movdqu xmm0, [esi+2]            ; p[+2]
pavgb xmm2, xmm0                ; xmm2 = avg(p[-2], p[+2])
movdqu xmm1, [esi-1]            ; p[-1]
movdqu xmm7, [esi+1]            ; p[+1]
pavgb xmm1, xmm7                ; xmm1 = avg(p[-1], p[+1])
movdqa xmm0, [esi]              ; center pixels, loaded last (aligned)
Did someone say
made for palignr
Code:
; Scalar gather driven by eight packed 32-bit indices: the low four in
; xmm2, the high four in xmm7.  Each index is extracted to eax (movd),
; then psrldq shifts the next index down into the low dword.  pinsrw
; loads a 16-bit word from memory and inserts it into word slot k of
; xmm3 (base esi) and xmm4 (base edx).
; NOTE(review): the +k displacement advances 1 byte per 2-byte word
; slot, so successive loaded words overlap by one byte -- presumably
; only one byte of each word is meaningful downstream; confirm against
; the consumer of xmm3/xmm4.
movd eax, xmm2                  ; eax = index 0
psrldq xmm2, 4                  ; shift index 1 into low dword
pinsrw xmm3, [eax+esi], 0       ; word slot 0, base esi
pinsrw xmm4, [eax+edx], 0       ; word slot 0, base edx
movd eax, xmm2                  ; eax = index 1
psrldq xmm2, 4
pinsrw xmm3, [eax+esi+1], 1
pinsrw xmm4, [eax+edx+1], 1
movd eax, xmm2                  ; eax = index 2
psrldq xmm2, 4
pinsrw xmm3, [eax+esi+2], 2
pinsrw xmm4, [eax+edx+2], 2
movd eax, xmm2                  ; eax = index 3 (last in xmm2; no shift needed)
pinsrw xmm3, [eax+esi+3], 3
pinsrw xmm4, [eax+edx+3], 3
movd eax, xmm7                  ; eax = index 4 (high four indices in xmm7)
psrldq xmm7, 4
pinsrw xmm3, [eax+esi+4], 4
pinsrw xmm4, [eax+edx+4], 4
movd eax, xmm7                  ; eax = index 5
psrldq xmm7, 4
pinsrw xmm3, [eax+esi+5], 5
pinsrw xmm4, [eax+edx+5], 5
movd eax, xmm7                  ; eax = index 6
psrldq xmm7, 4
pinsrw xmm3, [eax+esi+6], 6
pinsrw xmm4, [eax+edx+6], 6
movd eax, xmm7                  ; eax = index 7 (last; no shift needed)
pinsrw xmm3, [eax+esi+7], 7
pinsrw xmm4, [eax+edx+7], 7
mov eax, [esp]                  ; restore eax -- assumes the original eax was
                                ; spilled to [esp] before this sequence
                                ; (not visible in this excerpt; confirm)
I'm going to have to start killing kittens if I keep seeing things like this.
Code:
; NOTE(review): deliberately loads 8 bytes from one row above (and one
; byte before) the nominal position.  The 1.4x speedup is not explained
; by this excerpt -- plausibly the prior row is still hot in cache from
; the previous iteration, or the -1 displacement changes how often the
; load splits a cache line; profile before depending on it, and verify
; the off-position data is visually acceptable.
movq xmm7, qword ptr [edi+ebx-1] // one line above actual position, but it gives 1.4x speedup
How about you figure out
why it does?