lol_123
10th December 2007, 14:08
I'm using Intel VTune for code optimization.
VTune found a hotspot (misaligned data memory references).
Does anyone know the reason?
The report from VTune and my code are as follows:
Counter dependencies:
This insight is dependent on the following performance counter function:
MisalignedRefPerInstRatio = Misaligned Data Memory References/Instructions Retired
Low value: 0
High value: 0.0015
Current value: 0.045
Units: misaligned data memory references per instruction retired
This insight is relevant when MisalignedRefPerInstRatio is high. Based on the current value of MisalignedRefPerInstRatio, this insight is extremely relevant (100%).
// Shorthand for the SSE2 packed 16-bit integer add intrinsic.
#define MARCO_ADD(x,y) _mm_add_epi16((x),(y))
// For each r below NOL2, derives three offsets into PB5 and issues 16-byte
// loads from the first of them.
//   PB5: base pointer of the buffer being read; offsets are applied to it in
//        __m128i units.
//   PB3: not referenced in the portion of the function shown here.
//   Q:   size parameter; consecutive offsets differ by (Q+16)>>4 vectors
//        (presumably the row pitch - confirm against the caller).
// NOTE(review): this listing appears truncated - the function's closing brace
// is missing and most of the locals declared below are never used in what is
// shown.
void CS::G(__m128i* PB5, __m128i *PB3, int Q)
{
// Scratch state (byte pointers, counters, offsets) grouped in one local struct.
struct {
BYTE* buf0 ,*buf1,*buf2 ,*buf3;
int r, c,k,l;
UINT OF0,OF1,OF2;
}a;
// SUM8, ZERO and the img_* vectors are declared but unused in the code shown.
__m128i SUM8;
__m128i ZERO = _mm_setzero_si128 ();
__m128i img_00_04_20_24_Hi, img_01_03_21_23_Hi, img_10_14_Hi, img_11_02_13_22_Hi, img_12_Hi;
__m128i img_00_04_20_24_Lo, img_01_03_21_23_Lo, img_10_14_Lo, img_11_02_13_22_Lo, img_12_Lo;
// Broadcast each GLF_* constant (defined elsewhere) into every 16-bit lane.
__m128i _GLF_00 = _mm_set1_epi16((short)GLF_00);
__m128i _GLF_01 = _mm_set1_epi16((short)GLF_01);
__m128i _GLF_10 = _mm_set1_epi16((short)GLF_10);
__m128i _GLF_11 = _mm_set1_epi16((short)GLF_11);
__m128i _GLF_12 = _mm_set1_epi16((short)GLF_12);
// PB5 advanced by (Q+16)>>4 vectors; not used in the code shown.
__m128i *LN1 = (PB5+((Q+16)>>4));
for( a.r =0;a.r < NOL2 ; a.r++)
{
// Offsets are counted in __m128i (16-byte) units because they are added to a
// __m128i pointer below.
// NOTE(review): (a.r*(Q+16))>>4 is only guaranteed to equal a.r*((Q+16)>>4)
// when Q+16 is a multiple of 16 - confirm the values Q can take.
a.OF0 = (a.r*(Q+16))>>4;
a.OF1 = a.OF0+((Q+16)>>4);//((r+1)*(Q))>>4;
a.OF2 = a.OF0+((Q+16)>>3);//((r+2)*(Q))>>4;
// Byte views of the three positions; each is PB5 plus a whole number of
// 16-byte vectors.
a.buf0 = (BYTE*)(PB5+a.OF0);
a.buf1 = (BYTE*)(PB5+a.OF1);
a.buf2 = (BYTE*)(PB5+a.OF2);
for(a.c = 0; a.c<(Q>>4); a.c++)
{
// 16-byte load through the intrinsic that has no alignment requirement.
// buf0 is not advanced in the code shown, so every iteration reads the same
// address. Whether that address is 16-byte aligned depends on PB5's
// alignment, which is not visible here.
__m128i subMat0 = _mm_loadu_si128((const __m128i*)a.buf0);
}
// NOTE(review): as pasted, this return is inside the outer loop body, so the
// function exits after the r == 0 pass - likely an artifact of the truncated
// listing.
return;
}
Thanks in advance, L
VTune found a hotspot (misaligned data memory references).
Does anyone know the reason?
The report from VTune and my code are as follows:
Counter dependencies:
This insight is dependent on the following performance counter function:
MisalignedRefPerInstRatio = Misaligned Data Memory References/Instructions Retired
Low value: 0
High value: 0.0015
Current value: 0.045
Units: misaligned data memory references per instruction retired
This insight is relevant when MisalignedRefPerInstRatio is high. Based on the current value of MisalignedRefPerInstRatio, this insight is extremely relevant (100%).
// Shorthand for the SSE2 packed 16-bit integer add intrinsic.
#define MARCO_ADD(x,y) _mm_add_epi16((x),(y))
// For each r below NOL2, derives three offsets into PB5 and issues 16-byte
// loads from the first of them.
//   PB5: base pointer of the buffer being read; offsets are applied to it in
//        __m128i units.
//   PB3: not referenced in the portion of the function shown here.
//   Q:   size parameter; consecutive offsets differ by (Q+16)>>4 vectors
//        (presumably the row pitch - confirm against the caller).
// NOTE(review): this listing appears truncated - the function's closing brace
// is missing and most of the locals declared below are never used in what is
// shown.
void CS::G(__m128i* PB5, __m128i *PB3, int Q)
{
// Scratch state (byte pointers, counters, offsets) grouped in one local struct.
struct {
BYTE* buf0 ,*buf1,*buf2 ,*buf3;
int r, c,k,l;
UINT OF0,OF1,OF2;
}a;
// SUM8, ZERO and the img_* vectors are declared but unused in the code shown.
__m128i SUM8;
__m128i ZERO = _mm_setzero_si128 ();
__m128i img_00_04_20_24_Hi, img_01_03_21_23_Hi, img_10_14_Hi, img_11_02_13_22_Hi, img_12_Hi;
__m128i img_00_04_20_24_Lo, img_01_03_21_23_Lo, img_10_14_Lo, img_11_02_13_22_Lo, img_12_Lo;
// Broadcast each GLF_* constant (defined elsewhere) into every 16-bit lane.
__m128i _GLF_00 = _mm_set1_epi16((short)GLF_00);
__m128i _GLF_01 = _mm_set1_epi16((short)GLF_01);
__m128i _GLF_10 = _mm_set1_epi16((short)GLF_10);
__m128i _GLF_11 = _mm_set1_epi16((short)GLF_11);
__m128i _GLF_12 = _mm_set1_epi16((short)GLF_12);
// PB5 advanced by (Q+16)>>4 vectors; not used in the code shown.
__m128i *LN1 = (PB5+((Q+16)>>4));
for( a.r =0;a.r < NOL2 ; a.r++)
{
// Offsets are counted in __m128i (16-byte) units because they are added to a
// __m128i pointer below.
// NOTE(review): (a.r*(Q+16))>>4 is only guaranteed to equal a.r*((Q+16)>>4)
// when Q+16 is a multiple of 16 - confirm the values Q can take.
a.OF0 = (a.r*(Q+16))>>4;
a.OF1 = a.OF0+((Q+16)>>4);//((r+1)*(Q))>>4;
a.OF2 = a.OF0+((Q+16)>>3);//((r+2)*(Q))>>4;
// Byte views of the three positions; each is PB5 plus a whole number of
// 16-byte vectors.
a.buf0 = (BYTE*)(PB5+a.OF0);
a.buf1 = (BYTE*)(PB5+a.OF1);
a.buf2 = (BYTE*)(PB5+a.OF2);
for(a.c = 0; a.c<(Q>>4); a.c++)
{
// 16-byte load through the intrinsic that has no alignment requirement.
// buf0 is not advanced in the code shown, so every iteration reads the same
// address. Whether that address is 16-byte aligned depends on PB5's
// alignment, which is not visible here.
__m128i subMat0 = _mm_loadu_si128((const __m128i*)a.buf0);
}
// NOTE(review): as pasted, this return is inside the outer loop body, so the
// function exits after the r == 0 pass - likely an artifact of the truncated
// listing.
return;
}
Thanks in advance, L