View Single Post
Old 3rd October 2007, 04:50   #59  |  Link
Dark Shikari
x264 developer
 
Dark Shikari's Avatar
 
Join Date: Sep 2005
Posts: 8,666
Quote:
Originally Posted by DeathTheSheep View Post
Lol, no problem. But next time could you put up the whole function (or source code?) instead of the diff? Much easier to manually apply that way.

Oh, I noticed the new prepass beefs up the filesize along with the SSIM at constant quantization. Is this normal, or is something b0rked for me?

And quality remains constant (and filesize increases!) as merange is increased... FtW? Tested with esa, of course... Satd.

[edit]Yes, as I suspected, there is something hideously wrong here. Even without any prepass at all, the output differs drastically from that of an old build without it. Yeah, some patched sources would help like crazy. XD
Here is the beginning of my source up to the start of ME-DIA and such:

Code:
/*****************************************************************************
 * me.c: h264 encoder library (Motion Estimation)
 *****************************************************************************
 * Copyright (C) 2003 Laurent Aimar
 * $Id: me.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 *****************************************************************************/

#include "common/common.h"
#include "me.h"
#include <limits.h>

/* Sub-pixel refinement presets, selected from good points on the
 * speed-vs-quality curve of several test videos.
 *
 *   row index : i_subpel_refine
 *   columns   : { refine_hpel, refine_qpel, me_hpel, me_qpel }
 *
 * The me_* columns are the number of EPZS iterations run on all candidate
 * block types; the refine_* columns are run only on the winner.
 *
 * The --subme 7 values are much higher than the others: because they bring
 * the motion search closer to the optimal value, they actually tend to save
 * time in the more intensive RD search that follows. */
static const int subpel_iterations[][4] =
{
    { 1, 0, 0,  0 },
    { 1, 1, 0,  0 },
    { 0, 1, 1,  0 },
    { 0, 2, 1,  0 },
    { 0, 2, 1,  1 },
    { 0, 2, 1,  2 },
    { 0, 0, 2,  2 },
    { 0, 0, 4, 10 },
};

static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );

/* Motion-vector bit cost of the full-pel vector (mx,my): the sum of the x and
 * y entries of the per-axis cost tables.  p_cost_mvx and p_cost_mvy must be in
 * scope where the macro is expanded; they are indexed at four times the
 * full-pel coordinate.
 * The scaling is written as a multiplication instead of `<<2` because the
 * coordinates may be negative, and left-shifting a negative signed value is
 * undefined behaviour in C. */
#define BITS_MVD( mx, my )\
    (p_cost_mvx[(mx)*4] + p_cost_mvy[(my)*4])

/* Evaluate a single full-pel candidate (mx,my).
 * The cost is the fpelcmp metric between the source block (m->p_fenc[0]) and
 * the reference plane at that offset, plus BITS_MVD(mx,my).  The result is
 * handed to COPY3_IF_LT together with bcost/bmx/bmy, so those three track the
 * best candidate.
 * Expects h, m, i_pixel, p_fref, bcost, bmx and bmy at the expansion site. */
#define COST_MV( mx, my )\
{\
    int cost = BITS_MVD(mx,my)\
             + h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE,\
                   p_fref + ((my)*m->i_stride[0]+(mx)), m->i_stride[0] );\
    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
}

/* Evaluate one predictor candidate (mx,my) through h->mc.get_ref and fold it
 * into bpred_cost/bpred_mx/bpred_my via COPY3_IF_LT.
 * Unlike BITS_MVD, the cost tables are indexed with mx/my directly (no x4
 * scaling).  get_ref is given the scratch buffer `pix` and a pointer to a
 * local stride initialised to 16; the compare then uses whatever pointer and
 * stride come back.
 * Expects h, m, i_pixel, pix, bw, bh, p_cost_mvx and p_cost_mvy in scope. */
#define COST_MV_HPEL( mx, my ) \
{ \
    int stride = 16; \
    uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
    int cost = p_cost_mvx[ mx ] + p_cost_mvy[ my ] \
             + h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ); \
    COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
}

/* Same cost computation as COST_MV_HPEL, but the result is only stored into
 * the caller-supplied lvalue `cost`; no best-candidate bookkeeping is done.
 * Expects h, m, i_pixel, pix, bw, bh, p_cost_mvx and p_cost_mvy in scope. */
#define COST_MV_HPEL2( mx, my, cost ) \
{ \
    int stride = 16; \
    uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
    cost = p_cost_mvx[ mx ] + p_cost_mvy[ my ] \
         + h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ); \
}

/* Variant of COST_MV_HPEL that tracks the best candidate in
 * bestcost/bestx/besty instead of bpred_cost/bpred_mx/bpred_my.
 * NOTE(review): mx/my are forwarded textually to COPY3_IF_LT, whose
 * destinations here are bestx/besty.  If a caller builds mx/my from
 * bestx/besty, later expansions see the updated values; confirm that this is
 * intended at each call site.
 * Expects h, m, i_pixel, pix, bw, bh, p_cost_mvx and p_cost_mvy in scope. */
#define COST_MV_HPEL3( mx, my) \
{ \
    int stride = 16; \
    uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
    int cost = p_cost_mvx[ mx ] + p_cost_mvy[ my ] \
             + h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ); \
    COPY3_IF_LT( bestcost, cost, bestx, mx, besty, my ); \
}

/* Cost three full-pel candidates with a single fpelcmp_x3 call.
 * The (mNx,mNy) pairs are offsets relative to the current best position
 * (bmx,bmy).  The three results are written to costs[0..2] and each gets its
 * BITS_MVD added.  Nothing is compared or updated here; picking the winner is
 * left to the caller.
 * Expects h, m, i_pixel, p_fref, bmx and bmy in scope. */
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
    uint8_t *center = &p_fref[bmx + bmy*m->i_stride[0]];\
    h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\
        &center[(m0x) + (m0y)*m->i_stride[0]],\
        &center[(m1x) + (m1y)*m->i_stride[0]],\
        &center[(m2x) + (m2y)*m->i_stride[0]],\
        m->i_stride[0], costs );\
    (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\
    (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\
    (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\
}

/* Cost four full-pel candidates with a single fpelcmp_x4 call.
 * The (mNx,mNy) pairs are offsets relative to the search origin (omx,omy).
 * Results go to the caller's costs[0..3], BITS_MVD is added to each, and the
 * four are then offered to COPY3_IF_LT in order 0,1,2,3, updating
 * bcost/bmx/bmy.
 * Expects h, m, i_pixel, p_fref, costs, omx, omy, bcost, bmx, bmy in scope. */
#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
    uint8_t *origin = &p_fref[omx + omy*m->i_stride[0]];\
    h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0],\
        &origin[(m0x) + (m0y)*m->i_stride[0]],\
        &origin[(m1x) + (m1y)*m->i_stride[0]],\
        &origin[(m2x) + (m2y)*m->i_stride[0]],\
        &origin[(m3x) + (m3y)*m->i_stride[0]],\
        m->i_stride[0], costs );\
    costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\
    costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\
    costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\
    costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\
    COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\
    COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\
    COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\
    COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
}

/* Cost four full-pel candidates given as ABSOLUTE positions (not offsets from
 * omx/omy) with a single fpelcmp_x4 call, then offer them to COPY3_IF_LT in
 * order 0,1,2,3, updating bcost/bmx/bmy.
 * Only the x component of the motion-vector cost is added; the y component is
 * deliberately left out (see the "no cost_mvy" remark below).
 * Every macro argument is parenthesised, and the x4 table scaling is written
 * as `*4` rather than `<<2`, so that compound argument expressions parse as
 * intended and negative coordinates do not trigger the undefined behaviour of
 * left-shifting a negative signed value.
 * Expects h, m, i_pixel, p_fref, p_cost_mvx, costs, bcost, bmx, bmy in scope. */
#define COST_MV_X4_ABS( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
    h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0],\
        p_fref + (m0x) + (m0y)*m->i_stride[0],\
        p_fref + (m1x) + (m1y)*m->i_stride[0],\
        p_fref + (m2x) + (m2y)*m->i_stride[0],\
        p_fref + (m3x) + (m3y)*m->i_stride[0],\
        m->i_stride[0], costs );\
    costs[0] += p_cost_mvx[(m0x)*4]; /* no cost_mvy */\
    costs[1] += p_cost_mvx[(m1x)*4];\
    costs[2] += p_cost_mvx[(m2x)*4];\
    costs[3] += p_cost_mvx[(m3x)*4];\
    COPY3_IF_LT( bcost, costs[0], bmx, (m0x), bmy, (m0y) );\
    COPY3_IF_LT( bcost, costs[1], bmx, (m1x), bmy, (m1y) );\
    COPY3_IF_LT( bcost, costs[2], bmx, (m2x), bmy, (m2y) );\
    COPY3_IF_LT( bcost, costs[3], bmx, (m3x), bmy, (m3y) );\
}

/* One iteration of the radius-1 diamond search.
 * Search pattern (1 = point tested, 0 = centre, which is not re-tested):
 *
 *      1
 *     101
 *      1
 *
 * The centre is set to (mx,my) and the four direct neighbours are costed with
 * COST_MV_X4, which updates bcost/bmx/bmy. */
#define DIA1_ITER( mx, my )\
{\
    omx = mx;\
    omy = my;\
    COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\
}

/* One iteration of the radius-2 diamond search: the centre is set to (mx,my)
 * and the four points two full-pel steps away along each axis are costed with
 * COST_MV_X4, which updates bcost/bmx/bmy. */
#define DIA2_ITER( mx, my )\
{\
    omx = mx;\
    omy = my;\
    COST_MV_X4( 0,-2, 0,2, -2,0, 2,0 );\
}

/* Cross-shaped scan around the search origin (omx,omy).
 * First the horizontal arm, then the vertical arm: offsets start, start+2,
 * start+4, ... (strictly below x_max / y_max) are tested on both sides of the
 * origin.
 *  - Fast path: when the whole arm fits inside the allowed MV range, offsets
 *    are consumed two at a time (i and i+2, both signs) with COST_MV_X4 and no
 *    per-point range checks.
 *  - Remaining offsets (or all of them when the arm does not fit) are tested
 *    one by one with COST_MV, each guarded by the MV range limits.
 * The arguments are parenthesised wherever they are used, so that expressions
 * such as `range>>1` or `a&b` keep their meaning inside `i < (x_max)-2` and
 * the comparisons.
 * Expects i, omx, omy, mv_x_min, mv_x_max, mv_y_min, mv_y_max, plus
 * everything COST_MV and COST_MV_X4 need, at the expansion site. */
#define CROSS( start, x_max, y_max )\
{\
    i = (start);\
    if( (x_max) <= X264_MIN(mv_x_max-omx, omx-mv_x_min) )\
        for( ; i < (x_max)-2; i+=4 )\
            COST_MV_X4( i,0, -i,0, i+2,0, -i-2,0 );\
    for( ; i < (x_max); i+=2 )\
    {\
        if( omx+i <= mv_x_max )\
            COST_MV( omx+i, omy );\
        if( omx-i >= mv_x_min )\
            COST_MV( omx-i, omy );\
    }\
    i = (start);\
    if( (y_max) <= X264_MIN(mv_y_max-omy, omy-mv_y_min) )\
        for( ; i < (y_max)-2; i+=4 )\
            COST_MV_X4( 0,i, 0,-i, 0,i+2, 0,-i-2 );\
    for( ; i < (y_max); i+=2 )\
    {\
        if( omy+i <= mv_y_max )\
            COST_MV( omx, omy+i );\
        if( omy-i >= mv_y_min )\
            COST_MV( omx, omy-i );\
    }\
}

/* Hexagon search starting at full-pel position (X,Y), followed by a square
 * refinement.
 *
 * 1. (bmx,bmy) is set to (X,Y) and the six hexagon points around it are
 *    costed with two COST_MV_X3_DIR calls (results in costs[0..5]).  `dir`
 *    records which point, if any, beat bcost; -2 means "no improvement".
 * 2. If a point won, the centre moves there and the search iterates: each
 *    step costs only the three hexagon points selected by mod6[dir+1]
 *    (indices odir, odir+1, odir+2 of hex2), moves to the best of them, and
 *    stops when none improves bcost, when `range` iterations are used up, or
 *    when CHECK_MVRANGE(bmx,bmy) fails.  hex2 carries 8 entries (the 6
 *    directions plus 2 wrap-around duplicates) and mod6 has 8 entries so that
 *    dir+1 and odir+2 always index in range.
 * 3. If that loop ended while still improving (dir != -2) and
 *    bcost <= bestCost, the identical loop is run a second time, allowing up
 *    to another `range` iterations.
 * 4. Finally omx/omy are set to the best position and its eight neighbours
 *    are costed with two COST_MV_X4 calls.
 *
 * Expects i, dir, costs (at least 6 ints), bmx, bmy, bcost, omx, omy,
 * bestCost and CHECK_MVRANGE to be available at the expansion site.
 * NOTE(review): bestCost is not declared in the part of the file visible
 * here -- confirm that it exists wherever this macro is expanded.
 * NOTE(review): the closing brace is followed by a line continuation, so the
 * macro definition extends onto the next (blank) line; keep that line blank. */
#define ME_HEX(X,Y,range)\
{\
	static const int mod6[8] = {5,0,1,2,3,4,5,0};\
	bmx = X;\
	bmy = Y;\
	dir = -2;\
	COST_MV_X3_DIR( -2,0, -1, 2,  1, 2, costs   );\
	COST_MV_X3_DIR(  2,0,  1,-2, -1,-2, costs+3 );\
	COPY2_IF_LT( bcost, costs[0], dir, 0 );\
	COPY2_IF_LT( bcost, costs[1], dir, 1 );\
	COPY2_IF_LT( bcost, costs[2], dir, 2 );\
	COPY2_IF_LT( bcost, costs[3], dir, 3 );\
	COPY2_IF_LT( bcost, costs[4], dir, 4 );\
	COPY2_IF_LT( bcost, costs[5], dir, 5 );\
	if( dir != -2 )	{\
		static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};\
		bmx += hex2[dir+1][0];\
		bmy += hex2[dir+1][1];\
		for( i = 1; i < range && CHECK_MVRANGE(bmx, bmy); i++ )\
		{\
			const int odir = mod6[dir+1];\
			COST_MV_X3_DIR( hex2[odir+0][0], hex2[odir+0][1],\
							hex2[odir+1][0], hex2[odir+1][1],\
							hex2[odir+2][0], hex2[odir+2][1],\
							costs );\
			dir = -2;\
			COPY2_IF_LT( bcost, costs[0], dir, odir-1 );\
			COPY2_IF_LT( bcost, costs[1], dir, odir   );\
			COPY2_IF_LT( bcost, costs[2], dir, odir+1 );\
			if( dir == -2 ) break;\
			bmx += hex2[dir+1][0];\
			bmy += hex2[dir+1][1];}\
		if(dir == -2 || bcost > bestCost) {}\
		else{\
			for( i = 1; i < range && CHECK_MVRANGE(bmx, bmy); i++ )\
			{\
				const int odir = mod6[dir+1];\
				COST_MV_X3_DIR( hex2[odir+0][0], hex2[odir+0][1],\
								hex2[odir+1][0], hex2[odir+1][1],\
								hex2[odir+2][0], hex2[odir+2][1],\
								costs );\
				dir = -2;\
				COPY2_IF_LT( bcost, costs[0], dir, odir-1 );\
				COPY2_IF_LT( bcost, costs[1], dir, odir   );\
				COPY2_IF_LT( bcost, costs[2], dir, odir+1 );\
				if( dir == -2 ) break;\
				bmx += hex2[dir+1][0];\
				bmy += hex2[dir+1][1];}}}\
	omx = bmx; omy = bmy;\
	COST_MV_X4(  0,-1,  0,1, -1,0, 1,0 );\
	COST_MV_X4( -1,-1, -1,1, 1,-1, 1,1 );\
}\

void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
{
    int cost;
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    const int i_pixel = m->i_pixel;
    int i_me_range = h->param.analyse.i_me_range;
    int bmx, bmy, bcost;
    int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
    int omx, omy, pmx, pmy;
    uint8_t *p_fref = m->p_fref[0];
    DECLARE_ALIGNED( uint8_t, pix[16*16], 16 );
    
    int i, j;
    int dir;
    int costs[6];

    int mv_x_min = h->mb.mv_min_fpel[0];
    int mv_y_min = h->mb.mv_min_fpel[1];
    int mv_x_max = h->mb.mv_max_fpel[0];
    int mv_y_max = h->mb.mv_max_fpel[1];
	int mv_x_min4 = h->mb.mv_min_fpel[0]<<2;
    int mv_y_min4 = h->mb.mv_min_fpel[1]<<2;
    int mv_x_max4 = h->mb.mv_max_fpel[0]<<2;
    int mv_y_max4 = h->mb.mv_max_fpel[1]<<2;

#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
#define CHECK_MVRANGE4(mx,my) ( mx >= mv_x_min4 && mx <= mv_x_max4 && my >= mv_y_min4 && my <= mv_y_max4 )

    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];

    bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
    bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
    pmx = ( bmx + 2 ) >> 2;
    pmy = ( bmy + 2 ) >> 2;
    bcost = COST_MAX;
    
    /* try extra predictors if provided */
    if( h->mb.i_subpel_refine >= 3 )
    {
        COST_MV_HPEL( bmx, bmy );
        if(!h->param.analyse.i_me_prepass)
        {
            for( i = 0; i < i_mvc; i++ )
            {
                 const int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
                 const int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
                 if( mx != bpred_mx || my != bpred_my )
                     COST_MV_HPEL( mx, my );
            }
        }
        else
        {
            for( i = 0; i < i_mvc; i++ )
            {
                const int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
                const int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
				int doSearch = 1;
				int j;
				for(j = 0; j < i; j++)
				{
					if(mvc[i][0] == mvc[j][0] && mvc[i][1] == mvc[j][1]) doSearch = 0;
				}
                if( ( mx != bpred_mx || my != bpred_my ) && doSearch)
                {
                    int bestcost;
                    int bestx = mx;
                    int besty = my;
                    COST_MV_HPEL2( mx, my, bestcost );
                    COPY3_IF_LT( bpred_cost, bestcost, bpred_mx, bestx, bpred_my, besty );
                    if(bestcost < 2*bpred_cost)
                    {
                        int n;
                        int dir = -2;
                        COST_MV_HPEL2(bestx-4,besty,costs[0]);
                        COST_MV_HPEL2(bestx-2,besty+4,costs[1]);
                        COST_MV_HPEL2(bestx+2,besty+4,costs[2]);
                        COST_MV_HPEL2(bestx+4,besty,costs[3]);
                        COST_MV_HPEL2(bestx+2,besty-4,costs[4]);
                        COST_MV_HPEL2(bestx-2,besty-4,costs[5]);
                        COPY2_IF_LT( bestcost, costs[0], dir, 0 );
                        COPY2_IF_LT( bestcost, costs[1], dir, 1 );
                        COPY2_IF_LT( bestcost, costs[2], dir, 2 );
                        COPY2_IF_LT( bestcost, costs[3], dir, 3 );
                        COPY2_IF_LT( bestcost, costs[4], dir, 4 );
                        COPY2_IF_LT( bestcost, costs[5], dir, 5 );
                        if( dir != -2 )
                        {
                            static const int hex2[8][2] = {{-2,-4}, {-4,0}, {-2,4}, {2,4}, {4,0}, {2,-4}, {-2,-4}, {-4,0}};
                            bestx += hex2[dir+1][0];
                            besty += hex2[dir+1][1];
                            for( n = 1; n < i_me_range && CHECK_MVRANGE4(bestx, besty); n++ )
                            {
                                static const int mod6[8] = {5,0,1,2,3,4,5,0};
                                const int odir = mod6[dir+1];
                                COST_MV_HPEL2(hex2[odir+0][0]+bestx,hex2[odir+0][1]+besty,costs[0]);
                                COST_MV_HPEL2(hex2[odir+1][0]+bestx,hex2[odir+1][1]+besty,costs[1]);
                                COST_MV_HPEL2(hex2[odir+2][0]+bestx,hex2[odir+2][1]+besty,costs[2]);
                                dir = -2;
                                COPY2_IF_LT( bestcost, costs[0], dir, odir-1 );
                                COPY2_IF_LT( bestcost, costs[1], dir, odir   );
                                COPY2_IF_LT( bestcost, costs[2], dir, odir+1 );
                                if( dir == -2 )
                                    break;
                                bestx += hex2[dir+1][0];
                                besty += hex2[dir+1][1];
                            }
                        }
                        COST_MV_HPEL3(bestx+2,besty-2);
                        COST_MV_HPEL3(bestx+2,besty);
                        COST_MV_HPEL3(bestx+2,besty+2);
                        COST_MV_HPEL3(bestx,besty-2);
                        COST_MV_HPEL3(bestx,besty+2);
                        COST_MV_HPEL3(bestx-2,besty-2);
                        COST_MV_HPEL3(bestx-2,besty);
                        COST_MV_HPEL3(bestx-2,besty+2);
                        COPY3_IF_LT(bpred_cost,bestcost,bpred_mx,bestx,bpred_my,besty);
                    }
                }
            }
        }
        bmx = ( bpred_mx + 2 ) >> 2;
        bmy = ( bpred_my + 2 ) >> 2;
        COST_MV( bmx, bmy );
    }
    else
    {
        /* check the MVP */
        COST_MV( pmx, pmy );
        /* I don't know why this helps */
        bcost -= BITS_MVD(bmx,bmy);
        
        for( i = 0; i < i_mvc; i++ )
        {
             const int mx = x264_clip3( ( mvc[i][0] + 2 ) >> 2, mv_x_min, mv_x_max );
             const int my = x264_clip3( ( mvc[i][1] + 2 ) >> 2, mv_y_min, mv_y_max );
             if( mx != bmx || my != bmy )
                 COST_MV( mx, my );
        }
    }
    
    COST_MV( 0, 0 );
Dark Shikari is offline   Reply With Quote