Commit eb23c134 authored by Leo Ma

Upgrade libx264


Signed-off-by: Leo Ma <begeekmyfriend@gmail.com>
parent 4b131dd7
Showing 374 additions and 324 deletions
@@ -1253,7 +1253,7 @@ load_deinterleave_chroma:
ret
endfunc
 
function x264_plane_copy_neon, export=1
function x264_plane_copy_core_neon, export=1
add x8, x4, #15
and x4, x8, #~15
sub x1, x1, x4
@@ -1281,6 +1281,34 @@ function x264_plane_copy_neon, export=1
ret
endfunc
 
function x264_plane_copy_swap_core_neon, export=1
lsl w4, w4, #1
sub x1, x1, x4
sub x3, x3, x4
1:
mov w8, w4
tbz w4, #4, 32f
subs w8, w8, #16
ld1 {v0.16b}, [x2], #16
rev16 v0.16b, v0.16b
st1 {v0.16b}, [x0], #16
b.eq 0f
32:
subs w8, w8, #32
ld1 {v0.16b,v1.16b}, [x2], #32
rev16 v0.16b, v0.16b
rev16 v1.16b, v1.16b
st1 {v0.16b,v1.16b}, [x0], #32
b.gt 32b
0:
subs w5, w5, #1
add x2, x2, x3
add x0, x0, x1
b.gt 1b
ret
endfunc
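
The rev16 instruction above reverses the two bytes inside each 16-bit lane, i.e. it swaps adjacent U/V bytes. A scalar sketch of the same per-row operation (the function name here is ours, not x264's; it mirrors the cleanup loop of the PLANE_COPY_SWAP wrapper added later in this commit, with w counting byte pairs, matching the lsl that doubles the width above):

    #include <stdint.h>

    /* Swap each adjacent byte pair in every row; w counts pairs. */
    static void plane_copy_swap_ref( uint8_t *dst, intptr_t i_dst,
                                     uint8_t *src, intptr_t i_src, int w, int h )
    {
        for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
            for( int x = 0; x < 2*w; x += 2 )
            {
                dst[x]   = src[x+1];
                dst[x+1] = src[x];
            }
    }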
function x264_plane_copy_deinterleave_neon, export=1
add w9, w6, #15
and w9, w9, #0xfffffff0
@@ -1352,7 +1380,7 @@ function x264_plane_copy_deinterleave_rgb_neon, export=1
ret
endfunc
 
function x264_plane_copy_interleave_neon, export=1
function x264_plane_copy_interleave_core_neon, export=1
add w9, w6, #15
and w9, w9, #0xfffffff0
sub x1, x1, x9, lsl #1
@@ -49,8 +49,10 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
@@ -58,9 +60,9 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
 
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
@@ -206,6 +208,10 @@ static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
uint8_t *src, intptr_t stride, int width,
int height, int16_t *buf );
PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
#endif // !HIGH_BIT_DEPTH
 
PROPAGATE_LIST(neon)
@@ -229,6 +235,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
 
pf->plane_copy = x264_plane_copy_neon;
pf->plane_copy_swap = x264_plane_copy_swap_neon;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
@@ -1468,7 +1468,7 @@ function x264_load_deinterleave_chroma_fenc_neon
bx lr
endfunc
 
function x264_plane_copy_neon
function x264_plane_copy_core_neon
push {r4,lr}
ldr r4, [sp, #8]
ldr lr, [sp, #12]
@@ -1577,7 +1577,7 @@ block4:
pop {r4-r8, r10, r11, pc}
endfunc
 
function x264_plane_copy_interleave_neon
function x264_plane_copy_interleave_core_neon
push {r4-r7, lr}
ldrd r6, r7, [sp, #28]
ldrd r4, r5, [sp, #20]
@@ -1604,7 +1604,7 @@ blocki:
pop {r4-r7, pc}
endfunc
 
function x264_plane_copy_swap_neon
function x264_plane_copy_swap_core_neon
push {r4-r5, lr}
ldrd r4, r5, [sp, #12]
add lr, r4, #15
@@ -48,8 +48,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
@@ -57,11 +57,11 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
 
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
@@ -232,6 +232,10 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
src += stride;
}
}
PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
#endif // !HIGH_BIT_DEPTH
 
PROPAGATE_LIST(neon)
@@ -221,7 +221,6 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
}
else if( !strcasecmp( preset, "veryfast" ) )
{
param->analyse.i_me_method = X264_ME_HEX;
param->analyse.i_subpel_refine = 2;
param->i_frame_reference = 1;
param->analyse.b_mixed_references = 0;
@@ -250,11 +249,10 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
}
else if( !strcasecmp( preset, "slow" ) )
{
param->analyse.i_me_method = X264_ME_UMH;
param->analyse.i_subpel_refine = 8;
param->i_frame_reference = 5;
param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
param->analyse.i_trellis = 2;
param->rc.i_lookahead = 50;
}
else if( !strcasecmp( preset, "slower" ) )
@@ -1074,18 +1072,6 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
/****************************************************************************
* x264_log:
****************************************************************************/
#ifdef __ANDROID__
#include <android/log.h>
#define LIBX264_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG, "libx264", __VA_ARGS__))
#define LIBX264_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO , "libx264", __VA_ARGS__))
#define LIBX264_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN , "libx264", __VA_ARGS__))
#define LIBX264_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR, "libx264", __VA_ARGS__))
#else
#define LIBX264_LOGD(...) do {} while (0)
#define LIBX264_LOGI(...) do {} while (0)
#define LIBX264_LOGW(...) do {} while (0)
#define LIBX264_LOGE(...) do {} while (0)
#endif
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... )
{
if( !h || i_level <= h->param.i_log_level )
@@ -1107,25 +1093,20 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
{
case X264_LOG_ERROR:
psz_prefix = "error";
LIBX264_LOGE(psz_fmt, arg);
break;
case X264_LOG_WARNING:
psz_prefix = "warning";
LIBX264_LOGW(psz_fmt, arg);
break;
case X264_LOG_INFO:
psz_prefix = "info";
LIBX264_LOGI(psz_fmt, arg);
break;
case X264_LOG_DEBUG:
psz_prefix = "debug";
LIBX264_LOGD(psz_fmt, arg);
break;
default:
psz_prefix = "unknown";
break;
}
fprintf( stderr, "x264 [%s]: ", psz_prefix );
x264_vfprintf( stderr, psz_fmt, arg );
}
@@ -990,10 +990,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
if( cpu&X264_CPU_MMX2 )
{
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
}
if( cpu&X264_CPU_SSE )
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse;
if( cpu&X264_CPU_SSE2_IS_FAST )
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
if( cpu&X264_CPU_SSSE3 )
@@ -100,6 +100,98 @@ static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, in
}\
}
 
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define PLANE_COPY(align, cpu)\
static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align) / sizeof(pixel) - 1;\
if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
else if( !(w&c_w) )\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
else\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
}\
/* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
memcpy( dst, src, w*sizeof(pixel) );\
}\
}
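
The wrapper rounds w up to the vector width, so the core routine may read up to c_w pixels past the end of each row; that is safe everywhere except on the last row in memory order, which is why that row is finished with a plain memcpy, and why a negative stride flips which row is handled out of line. The small-width fallback is essentially a row-wise memcpy; a sketch from memory of x264's C version (pixel is x264's uint8_t or uint16_t typedef):

    void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
                            pixel *src, intptr_t i_src, int w, int h )
    {
        while( h-- )
        {
            memcpy( dst, src, w * sizeof(pixel) );
            dst += i_dst;
            src += i_src;
        }
    }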
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define PLANE_COPY_SWAP(align, cpu)\
static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align>>1) / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
else if( w > c_w )\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
}\
x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
{\
dst[x] = src[x+1];\
dst[x+1] = src[x];\
}\
}\
else\
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
#define PLANE_INTERLEAVE(cpu) \
static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
pixel *srcu, intptr_t i_srcu,\
pixel *srcv, intptr_t i_srcv, int w, int h )\
{\
int c_w = 16 / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
{\
if( --h > 0 )\
{\
if( i_srcu > 0 )\
{\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
srcu += i_srcu * h;\
srcv += i_srcv * h;\
}\
else\
x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
}\
x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
}\
else\
x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
}
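
The (i_srcu ^ i_srcv) >= 0 guard is a branch-free same-sign test: the XOR of two two's-complement integers is non-negative exactly when their sign bits agree. A self-contained illustration:

    #include <assert.h>
    #include <stdint.h>

    /* The XOR's sign bit is the mismatch of the operands' sign bits,
       so the result is >= 0 iff both strides have the same sign. */
    static int same_sign( intptr_t a, intptr_t b )
    {
        return (a ^ b) >= 0;
    }

    int main( void )
    {
        assert(  same_sign(  64,  64 ) );
        assert(  same_sign( -64, -64 ) );
        assert( !same_sign( -64,  64 ) );
        return 0;
    }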
struct x264_weight_t;
typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int );
typedef struct x264_weight_t
@@ -3430,7 +3430,7 @@ uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
x264_mc_weight_w8_msa( p_dst, *p_dst_stride,
p_dst, *p_dst_stride,
pWeight, i_h4w );
for( i_cnt = i_h4w; i_cnt < i_height ; i_cnt++ )
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
{
uint64_t temp0;
v16i8 zero = {0};
@@ -3666,7 +3666,7 @@ uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
pWeight, i_h4w );
p_src1 = src1_org + i_h4w * i_src_stride;
 
for( i_cnt = i_h4w; i_cnt < i_height ; i_cnt++ )
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
{
uint64_t u_temp0;
v16i8 zero = {0};
@@ -3761,9 +3761,11 @@ uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
return p_src1;
}
}
#endif // !HIGH_BIT_DEPTH
 
void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
if( cpu & X264_CPU_MSA )
{
pf->mc_luma = x264_mc_luma_msa;
@@ -3803,5 +3805,5 @@ void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf )
pf->memzero_aligned = x264_memzero_aligned_msa;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa;
}
#endif // !HIGH_BIT_DEPTH
}
#endif
@@ -370,8 +370,8 @@ static ALWAYS_INLINE int x264_mb_predict_mv_direct16x16_spatial( x264_t *h, int
h->mb.i_partition = partition_col[0];
}
}
int i_mb_4x4 = b_interlaced ? 4 * (h->mb.i_b4_stride*mb_y + mb_x) : h->mb.i_b4_xy ;
int i_mb_8x8 = b_interlaced ? 2 * (h->mb.i_b8_stride*mb_y + mb_x) : h->mb.i_b8_xy ;
int i_mb_4x4 = b_interlaced ? 4 * (h->mb.i_b4_stride*mb_y + mb_x) : h->mb.i_b4_xy;
int i_mb_8x8 = b_interlaced ? 2 * (h->mb.i_b8_stride*mb_y + mb_x) : h->mb.i_b8_xy;
 
int8_t *l1ref0 = &h->fref[1][0]->ref[0][i_mb_8x8];
int8_t *l1ref1 = &h->fref[1][0]->ref[1][i_mb_8x8];
@@ -249,7 +249,7 @@ int x264_threading_init( void );
static ALWAYS_INLINE int x264_pthread_fetch_and_add( int *val, int add, x264_pthread_mutex_t *mutex )
{
#if HAVE_THREAD
#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && ARCH_X86
#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && (ARCH_X86 || ARCH_X86_64)
return __sync_fetch_and_add( val, add );
#else
x264_pthread_mutex_lock( mutex );
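
The widened guard merely lets x86-64 builds take the GCC builtin path too. As a reminder of the builtin's semantics (it returns the value held before the addition), a minimal usage sketch:

    #include <assert.h>

    int main( void )
    {
        int counter = 5;
        /* Atomic read-modify-write; yields the prior value. */
        int old = __sync_fetch_and_add( &counter, 3 );
        assert( old == 5 && counter == 8 );
        return 0;
    }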
@@ -556,6 +556,7 @@ INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
#if HIGH_BIT_DEPTH
#define x264_predict_8x8c_v_mmx2 x264_predict_8x8c_v_mmx
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_c
#define x264_predict_16x16_dc_mmx2 x264_predict_16x16_dc_c
#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse
#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse
#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse
@@ -884,7 +885,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
 
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
@@ -1070,7 +1070,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
#if ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
@@ -71,7 +71,7 @@ int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16]
vec_u32_t multEvenvA, multOddvA;
vec_u16_t mfvA;
vec_u16_t biasvA;
vec_s16_t one = vec_splat_s16(1);;
vec_s16_t one = vec_splat_s16(1);
vec_s16_t nz = zero_s16v;
 
vector bool short mskB;
@@ -216,7 +216,7 @@ int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64]
vec_u32_t multEvenvA, multOddvA;
vec_u16_t mfvA;
vec_u16_t biasvA;
vec_s16_t one = vec_splat_s16(1);;
vec_s16_t one = vec_splat_s16(1);
vec_s16_t nz = zero_s16v;
 
vector bool short mskB;
@@ -38,6 +38,8 @@ const pw_00ff, times 16 dw 0x00ff
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
const pd_1, times 8 dd 1
const pd_0123, dd 0,1,2,3
const pd_4567, dd 4,5,6,7
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
@@ -63,6 +65,7 @@ const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
 
const pd_8, times 4 dd 8
const pd_32, times 4 dd 32
const pd_1024, times 4 dd 1024
const pd_ffff, times 4 dd 0xffff
@@ -1463,9 +1463,9 @@ cglobal zigzag_scan_4x4_frame, 2,2
; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal zigzag_scan_4x4_field, 2,3
movu m4, [r1+ 8]
pshufd m0, m4, q3102
cglobal zigzag_scan_4x4_field, 2,2
movu m0, [r1+ 8]
pshufd m0, m0, q3102
mova m1, [r1+32]
mova m2, [r1+48]
movu [r0+ 8], m0
@@ -1480,19 +1480,14 @@ cglobal zigzag_scan_4x4_field, 2,3
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
INIT_MMX mmx2
cglobal zigzag_scan_4x4_field, 2,3
pshufw m0, [r1+4], q3102
mova m1, [r1+16]
mova m2, [r1+24]
movu [r0+4], m0
mova [r0+16], m1
mova [r0+24], m2
mov r2d, [r1]
mov [r0], r2d
mov r2d, [r1+12]
mov [r0+12], r2d
INIT_XMM sse
cglobal zigzag_scan_4x4_field, 2,2
mova m0, [r1]
mova m1, [r1+16]
pshufw mm0, [r1+4], q3102
mova [r0], m0
mova [r0+16], m1
movq [r0+4], mm0
RET
%endif ; HIGH_BIT_DEPTH
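
The new SSE version stores the whole block with two 16-byte moves, then patches coefficients 2..5 with the pshufw result. In scalar terms the 4x4 field scan only permutes that middle group; a sketch reconstructed from the shuffle (q3102 over the words dct[2..5]), matching x264's C scan to the best of our reading:

    #include <stdint.h>
    #include <string.h>

    static void zigzag_scan_4x4_field_ref( int16_t level[16], int16_t dct[16] )
    {
        memcpy( level, dct, 2 * sizeof(int16_t) );      /* 0 and 1 unchanged */
        level[2] = dct[4];                              /* q3102 permutation */
        level[3] = dct[2];
        level[4] = dct[3];
        level[5] = dct[5];
        memcpy( level+6, dct+6, 10 * sizeof(int16_t) ); /* 6..15 unchanged */
    }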
 
@@ -112,7 +112,7 @@ void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_field_mmx2 ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
@@ -67,7 +67,6 @@ pf_256: times 4 dd 256.0
pf_inv256: times 4 dd 0.00390625
 
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
 
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
@@ -94,6 +93,8 @@ cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
cextern pw_0to15
cextern pd_8
cextern pd_0123
cextern pd_ffff
 
%macro LOAD_ADD 4
@@ -285,7 +286,7 @@ cglobal hpel_filter_c, 3,3,10
psrad m1, 10
psrad m2, 10
pslld m2, 16
pand m1, [pd_0f]
pand m1, [pd_ffff]
por m1, m2
CLIPW m1, [pb_0], [pw_pixel_max]
mova [r0+r2], m1
@@ -2178,7 +2179,7 @@ MBTREE_AVX
 
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs,
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
; int16_t *output, int bipred_weight, int mb_y, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_list_internal, 4,6,8
@@ -2268,6 +2269,67 @@ MBTREE_PROPAGATE_LIST
INIT_XMM avx
MBTREE_PROPAGATE_LIST
 
INIT_YMM avx2
cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8
mova xm4, [pw_0xc000]
%if UNIX64
shl r4d, 9
shl r5d, 16
movd xm5, r4d
movd xm6, r5d
vpbroadcastw xm5, xm5
vpbroadcastd m6, xm6
%else
vpbroadcastw xm5, r4m
vpbroadcastd m6, r5m
psllw xm5, 9 ; bipred_weight << 9
pslld m6, 16
%endif
mov r4d, r6m
lea r1, [r1+r4*2]
lea r2, [r2+r4*2]
lea r0, [r0+r4*4]
neg r4
por m6, [pd_0123] ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y
vbroadcasti128 m7, [pw_31]
.loop:
mova xm3, [r1+r4*2]
pand xm0, xm4, [r2+r4*2]
pmulhrsw xm1, xm3, xm5 ; bipred_amount = (propagate_amount * bipred_weight + 32) >> 6
pcmpeqw xm0, xm4
pblendvb xm3, xm3, xm1, xm0 ; (lists_used == 3) ? bipred_amount : propagate_amount
vpermq m3, m3, q1100
movu m0, [r0+r4*4] ; {x, y}
vbroadcasti128 m1, [pd_8]
psraw m2, m0, 5
paddw m2, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
paddw m6, m1 ; i_mb_x += 8
mova [r3], m2
mova m1, [pw_32]
pand m0, m7
psubw m1, m0
packuswb m1, m0 ; {32-x, 32-y} {x, y} {32-x, 32-y} {x, y}
psrlw m0, m1, 3
pand m1, [pw_00ff] ; 32-x x 32-x x
pandn m0, m7, m0 ; (32-y y 32-y y) << 5
pshufd m2, m1, q1032
pmullw m1, m0 ; idx0 idx3 idx0 idx3
pmullw m2, m0 ; idx1 idx2 idx1 idx2
pmulhrsw m0, m1, m3 ; (idx0 idx3 idx0 idx3) * propagate_amount + 512 >> 10
pmulhrsw m2, m3 ; (idx1 idx2 idx1 idx2) * propagate_amount + 512 >> 10
psignw m0, m1 ; correct potential overflow in the idx0 input to pmulhrsw
punpcklwd m1, m0, m2 ; idx01weight
punpckhwd m2, m0 ; idx23weight
mova [r3+32], m1
mova [r3+64], m2
add r3, 3*mmsize
add r4, 8
jl .loop
RET
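
The inline comments spell out the fixed-point layout: macroblock coordinates come from the top bits of each motion-vector component, and the low 5 bits form bilinear weights that split propagate_amount across the four macroblocks the vector straddles. A scalar sketch of the per-vector arithmetic (names approximated, not x264's; the scatter into ref_costs happens in the C caller generated by PROPAGATE_LIST):

    #include <stdint.h>

    static void propagate_one_mv_ref( const int16_t mv[2], int propagate_amount,
                                      int lists_used, int bipred_weight,
                                      int i_mb_x, int i_mb_y,
                                      int mbxy[2], int16_t weights[4] )
    {
        int amount = propagate_amount;
        if( lists_used == 3 )                 /* both lists: scale by bipred weight */
            amount = ( amount * bipred_weight + 32 ) >> 6;
        mbxy[0] = ( mv[0] >> 5 ) + i_mb_x;    /* integer MB position */
        mbxy[1] = ( mv[1] >> 5 ) + i_mb_y;
        int x = mv[0] & 31, y = mv[1] & 31;   /* 5-bit fractional parts */
        int w[4] = { (32-x)*(32-y), x*(32-y), (32-x)*y, x*y };
        for( int j = 0; j < 4; j++ )          /* pmulhrsw-style rounding */
            weights[j] = ( w[j] * amount + 512 ) >> 10;
    }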
%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
@@ -88,10 +88,8 @@ void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
@@ -101,9 +99,6 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst,
void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
@@ -493,96 +488,12 @@ HPEL(32, avx2, avx2, avx2, avx2)
#endif
#endif // HIGH_BIT_DEPTH
 
#define PLANE_COPY(align, cpu)\
static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align) / sizeof(pixel) - 1;\
if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
else if( !(w&c_w) )\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
else\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
}\
/* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
memcpy( dst, src, w*sizeof(pixel) );\
}\
}
PLANE_COPY(16, sse)
PLANE_COPY(32, avx)
 
#define PLANE_COPY_SWAP(align, cpu)\
static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align>>1) / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
else if( w > c_w )\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
}\
x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
{\
dst[x] = src[x+1];\
dst[x+1] = src[x];\
}\
}\
else\
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}
PLANE_COPY_SWAP(16, ssse3)
PLANE_COPY_SWAP(32, avx2)
 
#define PLANE_INTERLEAVE(cpu) \
static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
pixel *srcu, intptr_t i_srcu,\
pixel *srcv, intptr_t i_srcv, int w, int h )\
{\
int c_w = 16 / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
{\
if( --h > 0 )\
{\
if( i_srcu > 0 )\
{\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
srcu += i_srcu * h;\
srcv += i_srcv * h;\
}\
else\
x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
}\
x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
}\
else\
x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
}
PLANE_INTERLEAVE(mmx2)
PLANE_INTERLEAVE(sse2)
#if HIGH_BIT_DEPTH
@@ -621,6 +532,7 @@ do\
 
PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)
PROPAGATE_LIST(avx2)
 
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
@@ -932,6 +844,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->plane_copy_swap = x264_plane_copy_swap_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
}
@@ -43,6 +43,9 @@ mask_ff: times 16 db 0xff
mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
%if HIGH_BIT_DEPTH
ssd_nv12_shuf: db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
%endif
%if BIT_DEPTH == 10
ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
@@ -531,8 +534,8 @@ SSD 16, 8
;
; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
;
; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
; distortion levels it will take much more than that though.
; For 10-bit XMM this means width >= 32832. At sane distortion levels
; it will take much more than that though.
;-----------------------------------------------------------------------------
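Plugging the remaining XMM case into that formula (reading mmsize in bits, 128, with BIT_DEPTH = 10) confirms the figure in the comment:

    2 * 128/32 * (2^32 - 1) / (2^10 - 1)^2
      = 8 * 4294967295 / 1046529
      ~= 32832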
%if HIGH_BIT_DEPTH
%macro SSD_NV12 0
@@ -541,13 +544,14 @@ cglobal pixel_ssd_nv12_core, 6,7,7
FIX_STRIDES r1, r3
add r0, r4
add r2, r4
xor r6, r6
neg r4
pxor m4, m4
pxor m5, m5
pxor m6, m6
%if mmsize == 32
vbroadcasti128 m6, [ssd_nv12_shuf]
%endif
.loopy:
mov r6, r4
neg r6
pxor m2, m2
pxor m3, m3
.loopx:
@@ -555,11 +559,11 @@ cglobal pixel_ssd_nv12_core, 6,7,7
mova m1, [r0+r6+mmsize]
psubw m0, [r2+r6]
psubw m1, [r2+r6+mmsize]
PSHUFLW m0, m0, q3120
PSHUFLW m1, m1, q3120
%if mmsize >= 16
pshufhw m0, m0, q3120
pshufhw m1, m1, q3120
%if mmsize == 32
pshufb m0, m6
pshufb m1, m6
%else
SBUTTERFLY wd, 0, 1, 6
%endif
%if cpuflag(xop)
pmadcswd m2, m0, m0, m2
@@ -577,59 +581,30 @@ cglobal pixel_ssd_nv12_core, 6,7,7
psubd m3, m1
.no_overread:
%endif
%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
; equation above, putting the width limit at 8208
punpckhdq m0, m2, m6
punpckhdq m1, m3, m6
punpckldq m2, m6
punpckldq m3, m6
paddq m3, m2
paddq m1, m0
paddq m4, m3
paddq m4, m1
%else ; unfortunately paddq is sse2
; emulate 48 bit precision for mmx2 instead
mova m0, m2
mova m1, m3
punpcklwd m2, m6
punpcklwd m3, m6
punpckhwd m0, m6
punpckhwd m1, m6
paddd m3, m2
paddd m1, m0
paddd m4, m3
paddd m5, m1
%endif
punpckhdq m0, m2, m5 ; using HADDD would remove the mmsize/32 part from the
punpckhdq m1, m3, m5 ; equation above, putting the width limit at 8208
punpckldq m2, m5
punpckldq m3, m5
paddq m0, m1
paddq m2, m3
paddq m4, m0
paddq m4, m2
add r0, r1
add r2, r3
dec r5d
jg .loopy
mov r3, r6m
mov r4, r7m
mov r0, r6m
mov r1, r7m
%if mmsize == 32
vextracti128 xm0, m4, 1
paddq xm4, xm0
%endif
%if mmsize >= 16
movq [r3], xm4
movhps [r4], xm4
%else ; fixup for mmx2
SBUTTERFLY dq, 4, 5, 0
mova m0, m4
psrld m4, 16
paddd m5, m4
pslld m0, 16
SBUTTERFLY dq, 0, 5, 4
psrlq m0, 16
psrlq m5, 16
movq [r3], m0
movq [r4], m5
%endif
movq [r0], xm4
movhps [r1], xm4
RET
%endmacro ; SSD_NV12
%endif ; HIGH_BIT_DEPTH
 
%if HIGH_BIT_DEPTH == 0
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
@@ -643,12 +618,12 @@ cglobal pixel_ssd_nv12_core, 6,7
add r4d, r4d
add r0, r4
add r2, r4
neg r4
pxor m3, m3
pxor m4, m4
mova m5, [pw_00ff]
.loopy:
mov r6, r4
neg r6
.loopx:
%if mmsize == 32 ; only 16-byte alignment is guaranteed
movu m2, [r0+r6]
@@ -686,21 +661,27 @@ cglobal pixel_ssd_nv12_core, 6,7
add r2, r3
dec r5d
jg .loopy
mov r3, r6m
mov r4, r7m
HADDD m3, m0
HADDD m4, m0
pxor xm0, xm0
punpckldq xm3, xm0
punpckldq xm4, xm0
movq [r3], xm3
movq [r4], xm4
mov r0, r6m
mov r1, r7m
%if cpuflag(ssse3)
phaddd m3, m4
%else
SBUTTERFLY qdq, 3, 4, 0
paddd m3, m4
%endif
%if mmsize == 32
vextracti128 xm4, m3, 1
paddd xm3, xm4
%endif
psllq xm4, xm3, 32
paddd xm3, xm4
psrlq xm3, 32
movq [r0], xm3
movhps [r1], xm3
RET
%endmacro ; SSD_NV12
%endif ; !HIGH_BIT_DEPTH
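
For orientation, ssd_nv12_core computes the U and V sums of squared differences of an interleaved chroma plane in one pass. A scalar sketch modeled on x264's C fallback:

    #include <stdint.h>

    static void ssd_nv12_core_ref( const uint8_t *pixuv1, intptr_t stride1,
                                   const uint8_t *pixuv2, intptr_t stride2,
                                   int width, int height,
                                   uint64_t *ssd_u, uint64_t *ssd_v )
    {
        *ssd_u = *ssd_v = 0;
        for( int y = 0; y < height; y++, pixuv1 += stride1, pixuv2 += stride2 )
            for( int x = 0; x < width; x++ )
            {
                int du = pixuv1[2*x]   - pixuv2[2*x];   /* U samples */
                int dv = pixuv1[2*x+1] - pixuv2[2*x+1]; /* V samples */
                *ssd_u += du * du;
                *ssd_v += dv * dv;
            }
    }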
 
INIT_MMX mmx2
SSD_NV12
INIT_XMM sse2
SSD_NV12
INIT_XMM avx
@@ -4614,67 +4595,82 @@ cglobal intra_sad_x9_8x8, 5,7,8
;-----------------------------------------------------------------------------
%macro SSIM_ITER 1
%if HIGH_BIT_DEPTH
movdqu m5, [r0+(%1&1)*r1]
movdqu m6, [r2+(%1&1)*r3]
movu m4, [r0+(%1&1)*r1]
movu m5, [r2+(%1&1)*r3]
%elif cpuflag(avx)
pmovzxbw m4, [r0+(%1&1)*r1]
pmovzxbw m5, [r2+(%1&1)*r3]
%else
movq m5, [r0+(%1&1)*r1]
movq m6, [r2+(%1&1)*r3]
punpcklbw m5, m0
punpcklbw m6, m0
movq m4, [r0+(%1&1)*r1]
movq m5, [r2+(%1&1)*r3]
punpcklbw m4, m7
punpcklbw m5, m7
%endif
%if %1==1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
%endif
%if %1==0
movdqa m1, m5
movdqa m2, m6
%if %1 == 0 && cpuflag(avx)
SWAP 0, 4
SWAP 1, 5
pmaddwd m4, m0, m0
pmaddwd m5, m1, m1
pmaddwd m6, m0, m1
%else
%if %1 == 0
mova m0, m4
mova m1, m5
%else
paddw m0, m4
paddw m1, m5
paddw m2, m6
%endif
pmaddwd m7, m5, m6
pmaddwd m6, m4, m5
pmaddwd m4, m4
pmaddwd m5, m5
pmaddwd m6, m6
ACCUM paddd, 3, 5, %1
ACCUM paddd, 4, 7, %1
paddd m3, m6
%endif
ACCUM paddd, 2, 4, %1
ACCUM paddd, 3, 6, %1
paddd m2, m5
%endmacro
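
The four accumulators SSIM_ITER builds are the usual SSIM block statistics. A scalar sketch of what pixel_ssim_4x4x2_core produces for two adjacent 4x4 blocks (modeled on x264's C version, from memory):

    #include <stdint.h>

    static void ssim_4x4x2_core_ref( const uint8_t *pix1, intptr_t stride1,
                                     const uint8_t *pix2, intptr_t stride2,
                                     int sums[2][4] )
    {
        for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
        {
            int s1 = 0, s2 = 0, ss = 0, s12 = 0;
            for( int y = 0; y < 4; y++ )
                for( int x = 0; x < 4; x++ )
                {
                    int a = pix1[x + y*stride1];
                    int b = pix2[x + y*stride2];
                    s1  += a;          /* sum of first block's pixels  */
                    s2  += b;          /* sum of second block's pixels */
                    ss  += a*a + b*b;  /* sum of squares               */
                    s12 += a*b;        /* cross term                   */
                }
            sums[z][0] = s1, sums[z][1] = s2, sums[z][2] = ss, sums[z][3] = s12;
        }
    }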
 
%macro SSIM 0
cglobal pixel_ssim_4x4x2_core, 4,4,8
%if HIGH_BIT_DEPTH
cglobal pixel_ssim_4x4x2_core, 4,4,7
FIX_STRIDES r1, r3
pxor m0, m0
%else
cglobal pixel_ssim_4x4x2_core, 4,4,7+notcpuflag(avx)
%if notcpuflag(avx)
pxor m7, m7
%endif
%endif
SSIM_ITER 0
SSIM_ITER 1
SSIM_ITER 2
SSIM_ITER 3
; PHADDW m1, m2
; PHADDD m3, m4
movdqa m7, [pw_1]
pshufd m5, m3, q2301
pmaddwd m1, m7
pmaddwd m2, m7
pshufd m6, m4, q2301
packssdw m1, m2
paddd m3, m5
pshufd m1, m1, q3120
paddd m4, m6
pmaddwd m1, m7
punpckhdq m5, m3, m4
punpckldq m3, m4
%if UNIX64
%define t0 r4
DECLARE_REG_TMP 4
%else
%define t0 rax
mov t0, r4mp
DECLARE_REG_TMP 0
mov t0, r4mp
%endif
movq [t0+ 0], m1
movq [t0+ 8], m3
movhps [t0+16], m1
movq [t0+24], m5
%if cpuflag(ssse3)
phaddw m0, m1
pmaddwd m0, [pw_1]
phaddd m2, m3
%else
mova m4, [pw_1]
pmaddwd m0, m4
pmaddwd m1, m4
packssdw m0, m1
shufps m1, m2, m3, q2020
shufps m2, m3, q3131
pmaddwd m0, m4
paddd m2, m1
%endif
shufps m1, m0, m2, q2020
shufps m0, m2, q3131
mova [t0], m1
mova [t0+16], m0
RET
 
;-----------------------------------------------------------------------------
@@ -145,9 +145,6 @@ int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, u
int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
 
void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
@@ -2092,63 +2092,28 @@ PREDICT_16x16_H
%endif
 
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core( pixel *src, int i_dc_left )
; void predict_16x16_dc( pixel *src )
;-----------------------------------------------------------------------------
%macro PRED16x16_DC_MMX 2
%if HIGH_BIT_DEPTH
mova m0, [r0 - FDEC_STRIDEB+ 0]
paddw m0, [r0 - FDEC_STRIDEB+ 8]
paddw m0, [r0 - FDEC_STRIDEB+16]
paddw m0, [r0 - FDEC_STRIDEB+24]
HADDW m0, m1
paddw m0, %1
psrlw m0, %2
SPLATW m0, m0
STORE16 m0, m0, m0, m0
%else ; !HIGH_BIT_DEPTH
pxor m0, m0
pxor m1, m1
psadbw m0, [r0 - FDEC_STRIDE]
psadbw m1, [r0 - FDEC_STRIDE + 8]
paddusw m0, m1
paddusw m0, %1
psrlw m0, %2 ; dc
pshufw m0, m0, 0
packuswb m0, m0 ; dc in bytes
STORE16 m0, m0
%endif
%endmacro
INIT_MMX mmx2
cglobal predict_16x16_dc_core, 1,2
%if ARCH_X86_64
movd m6, r1d
PRED16x16_DC_MMX m6, 5
%if WIN64
DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes
%else
PRED16x16_DC_MMX r1m, 5
DECLARE_REG_TMP 3
%endif
RET
INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC_MMX [pw_8], 4
RET
 
INIT_MMX mmx2
%if HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16 m0, m0, m0, m0
RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
pshufw m0, m0, 0
packuswb m0, m0
STORE16 m0, m0
INIT_XMM
; Returns the sum of the left pixels in r1d+r2d
cglobal predict_16x16_dc_left_internal, 0,4
movzx r1d, pixel [r0-SIZEOF_PIXEL]
movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL]
%assign i 2*FDEC_STRIDEB
%rep 7
movzx t0d, pixel [r0+i-SIZEOF_PIXEL]
add r1d, t0d
movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL]
add r2d, t0d
%assign i i+2*FDEC_STRIDEB
%endrep
RET
%endif
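
With the *_core entry points gone, each predictor is now self-contained: sum the 16 left and 16 top neighbours, add 16 for rounding, shift right by 5, and splat the result. A scalar sketch modeled on x264's predict_16x16_dc_c (FDEC_STRIDE, x264's decoded-block stride, is assumed to be 32 here):

    #include <stdint.h>
    #include <string.h>

    enum { FDEC_STRIDE = 32 };   /* x264 fdec buffer stride (assumption) */

    static void predict_16x16_dc_ref( uint8_t *src )
    {
        int dc = 16;                              /* rounding term */
        for( int i = 0; i < 16; i++ )
        {
            dc += src[i - FDEC_STRIDE];           /* top row       */
            dc += src[-1 + i * FDEC_STRIDE];      /* left column   */
        }
        dc >>= 5;
        for( int y = 0; y < 16; y++ )
            memset( &src[y * FDEC_STRIDE], dc, 16 );
    }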
 
%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
@@ -2176,9 +2141,11 @@ cglobal predict_16x16_dc_left_core, 1,1
%endif
%endmacro
 
%macro PREDICT_16x16_DC_CORE 0
cglobal predict_16x16_dc_core, 2,2,4
movd xm3, r1m
%macro PREDICT_16x16_DC 0
cglobal predict_16x16_dc, 1,3
call predict_16x16_dc_left_internal
lea r1d, [r1+r2+16]
movd xm3, r1d
PRED16x16_DC xm3, 5
RET
 
@@ -2186,8 +2153,11 @@ cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC [pw_8], 4
RET
 
cglobal predict_16x16_dc_left_core, 1,2
movd xm0, r1m
cglobal predict_16x16_dc_left, 1,3
call predict_16x16_dc_left_internal
lea r1d, [r1+r2+8]
shr r1d, 4
movd xm0, r1d
SPLATW m0, xm0
%if HIGH_BIT_DEPTH && mmsize == 16
STORE16 m0, m0
@@ -2201,11 +2171,11 @@ cglobal predict_16x16_dc_left_core, 1,2
%endmacro
 
INIT_XMM sse2
PREDICT_16x16_DC_CORE
PREDICT_16x16_DC
%if HIGH_BIT_DEPTH
INIT_YMM avx2
PREDICT_16x16_DC_CORE
PREDICT_16x16_DC
%else
INIT_XMM avx2
PREDICT_16x16_DC_CORE
PREDICT_16x16_DC
%endif