View Single Post
Old 30th September 2007, 06:30   #5  |  Link
Dark Shikari
x264 developer
 
Dark Shikari's Avatar
 
Join Date: Sep 2005
Posts: 8,666
Here's my fixed ME-prepass patch.

Code:
Index: common/common.c
===================================================================
--- common/common.c    (revision 675)
+++ common/common.c    (working copy)
@@ -441,6 +441,8 @@
         p->analyse.i_mv_range_thread = atoi(value);
     OPT2("subme", "subq")
         p->analyse.i_subpel_refine = atoi(value);
+    OPT2("me-prepass", "meprepass")
+        p->analyse.i_me_prepass = atobool(value);
     OPT("bime")
         p->analyse.b_bidir_me = atobool(value);
     OPT("chroma-me")
@@ -879,6 +881,7 @@
     s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
     s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
     s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
+    s += sprintf( s, " me-prepass=%d", p->analyse.i_me_prepass );
     s += sprintf( s, " brdo=%d", p->analyse.b_bframe_rdo );
     s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
     s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
Index: encoder/me.c
===================================================================
--- encoder/me.c    (revision 675)
+++ encoder/me.c    (working copy)
@@ -61,6 +61,23 @@
     COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
 }
 
+#define COST_MV_HPEL2( mx, my, cost ) \
+{ \
+    int stride = 16; \
+    uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
+    cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
+}
+
+#define COST_MV_HPEL3( mx, my) \
+{ \
+    int stride = 16; \
+    uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
+    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
+    COPY3_IF_LT( bestcost, cost, bestx, mx, besty, my ); \
+}
+
 #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
 {\
     uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\
@@ -177,18 +194,85 @@
     pmx = ( bmx + 2 ) >> 2;
     pmy = ( bmy + 2 ) >> 2;
     bcost = COST_MAX;
-
+    
     /* try extra predictors if provided */
     if( h->mb.i_subpel_refine >= 3 )
     {
         COST_MV_HPEL( bmx, bmy );
-        for( i = 0; i < i_mvc; i++ )
+        if(!h->param.analyse.i_me_prepass)
         {
-             const int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
-             const int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
-             if( mx != bpred_mx || my != bpred_my )
-                 COST_MV_HPEL( mx, my );
+            for( i = 0; i < i_mvc; i++ )
+            {
+                 const int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
+                 const int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
+                 if( mx != bpred_mx || my != bpred_my )
+                     COST_MV_HPEL( mx, my );
+            }
+        }
+        else
+        {
+            for( i = 0; i < i_mvc; i++ )
+            {
+                const int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
+                const int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
+                int doSearch = 1;
+                int j;
+                for(j = 0; j < i; j++)
+                {
+                    if(mvc[i][0] == mvc[j][0] && mvc[i][1] == mvc[j][1]) doSearch = 0;
+                }
+                if( ( mx != bpred_mx || my != bpred_my ) && doSearch)
+                {
+                    int bestcost;
+                    int bestx = mx;
+                    int besty = my;
+                    COST_MV_HPEL2( mx, my, bestcost );
+                    COPY3_IF_LT( bpred_cost, bestcost, bpred_mx, bestx, bpred_my, besty );
+                    if(bestcost < 2*bpred_cost)
+                    {
+                        int n;
+                        int dir = -2;
+                        COST_MV_HPEL2(bestx-4,besty,costs[0]);
+                        COST_MV_HPEL2(bestx-2,besty+4,costs[1]);
+                        COST_MV_HPEL2(bestx+2,besty+4,costs[2]);
+                        COST_MV_HPEL2(bestx+4,besty,costs[3]);
+                        COST_MV_HPEL2(bestx+2,besty-4,costs[4]);
+                        COST_MV_HPEL2(bestx-2,besty-4,costs[5]);
+                        COPY2_IF_LT( bestcost, costs[0], dir, 0 );
+                        COPY2_IF_LT( bestcost, costs[1], dir, 1 );
+                        COPY2_IF_LT( bestcost, costs[2], dir, 2 );
+                        COPY2_IF_LT( bestcost, costs[3], dir, 3 );
+                        COPY2_IF_LT( bestcost, costs[4], dir, 4 );
+                        COPY2_IF_LT( bestcost, costs[5], dir, 5 );
+                        if( dir != -2 )
+                        {
+                            static const int hex2[8][2] = {{-2,-4}, {-4,0}, {-2,4}, {2,4}, {4,0}, {2,-4}, {-2,-4}, {-4,0}};
+                            bestx += hex2[dir+1][0];
+                            besty += hex2[dir+1][1];
+                            for( n = 1; n < i_me_range && CHECK_MVRANGE4(bestx, besty); n++ )
+                            {
+                                static const int mod6[8] = {5,0,1,2,3,4,5,0};
+                                const int odir = mod6[dir+1];
+                                COST_MV_HPEL2(hex2[odir+0][0]+bestx,hex2[odir+0][1]+besty,costs[0]);
+                                COST_MV_HPEL2(hex2[odir+1][0]+bestx,hex2[odir+1][1]+besty,costs[1]);
+                                COST_MV_HPEL2(hex2[odir+2][0]+bestx,hex2[odir+2][1]+besty,costs[2]);
+                                dir = -2;
+                                COPY2_IF_LT( bestcost, costs[0], dir, odir-1 );
+                                COPY2_IF_LT( bestcost, costs[1], dir, odir   );
+                                COPY2_IF_LT( bestcost, costs[2], dir, odir+1 );
+                                if( dir == -2 )
+                                    break;
+                                bestx += hex2[dir+1][0];
+                                besty += hex2[dir+1][1];
+                            }
+                        }
+                        COST_MV_HPEL3(bestx+2,besty-2);
+                        COST_MV_HPEL3(bestx+2,besty);
+                        COST_MV_HPEL3(bestx+2,besty+2);
+                        COST_MV_HPEL3(bestx,besty-2);
+                        COST_MV_HPEL3(bestx,besty+2);
+                        COST_MV_HPEL3(bestx-2,besty-2);
+                        COST_MV_HPEL3(bestx-2,besty);
+                        COST_MV_HPEL3(bestx-2,besty+2);
+                        COPY3_IF_LT(bpred_cost,bestcost,bpred_mx,bestx,bpred_my,besty);
+                    }
+                }
+            }
         }
         bmx = ( bpred_mx + 2 ) >> 2;
         bmy = ( bpred_my + 2 ) >> 2;
         COST_MV( bmx, bmy );
     }
Index: x264.c
===================================================================
--- x264.c    (revision 675)
+++ x264.c    (working copy)
@@ -232,7 +232,8 @@
     H1( "      --mvrange-thread <int>  Minimum buffer between threads [-1 (auto)]\n" );
     H0( "  -m, --subme <integer>       Subpixel motion estimation and partition\n"
         "                                  decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine );
-    H0( "      --b-rdo                 RD based mode decision for B-frames. Requires subme 6.\n" );
+    H0( "      --me-prepass            Run an ME prepass on predictors.  Requires subme 3 or higher.\n");
+    H0( "      --b-rdo                 RD based mode decision for B-frames. Requires subme 6 or higher.\n" );
     H0( "      --mixed-refs            Decide references on a per partition basis\n" );
     H1( "      --no-chroma-me          Ignore chroma in motion estimation\n" );
     H1( "      --bime                  Jointly optimize both MVs in B-frames\n" );
@@ -398,6 +399,7 @@
             { "mvrange", required_argument, NULL, 0 },
             { "mvrange-thread", required_argument, NULL, 0 },
             { "subme",   required_argument, NULL, 'm' },
+            { "me-prepass", no_argument,    NULL, 0 },
             { "b-rdo",   no_argument,       NULL, 0 },
             { "mixed-refs", no_argument,    NULL, 0 },
             { "no-chroma-me", no_argument,  NULL, 0 },
Index: x264.h
===================================================================
--- x264.h    (revision 675)
+++ x264.h    (working copy)
@@ -220,6 +220,7 @@
         int          i_mv_range; /* maximum length of a mv (in pixels). -1 = auto, based on level */
         int          i_mv_range_thread; /* minimum space between threads. -1 = auto, based on number of threads. */
         int          i_subpel_refine; /* subpixel motion estimation quality */
+        int          i_me_prepass; /* run an ME prepass on predictors */
         int          b_bidir_me; /* jointly optimize both MVs in B-frames */
         int          b_chroma_me; /* chroma ME for subpel and mode decision in P-frames */
         int          b_bframe_rdo; /* RD based mode decision for B-frames */
Speed: 25% faster (i.e., the speed penalty is 25% smaller than with the old ME-prepass)
Quality: 42% better (i.e., the quality gain is 42% larger than with the old ME-prepass)

Not surprisingly, eliminating the qpel aspect of the search gave a huge speed boost, and it actually increased quality slightly as well.
Dark Shikari is offline   Reply With Quote