Commit eb23c134 authored by Leo Ma

Upgrade libx264


Signed-off-by: Leo Ma <begeekmyfriend@gmail.com>
parent 4b131dd7
Showing 374 additions and 324 deletions
@@ -1253,7 +1253,7 @@ load_deinterleave_chroma:
ret
endfunc
 
function x264_plane_copy_neon, export=1
function x264_plane_copy_core_neon, export=1
add x8, x4, #15
and x4, x8, #~15
sub x1, x1, x4
@@ -1281,6 +1281,34 @@ function x264_plane_copy_neon, export=1
ret
endfunc
 
function x264_plane_copy_swap_core_neon, export=1
lsl w4, w4, #1
sub x1, x1, x4
sub x3, x3, x4
1:
mov w8, w4
tbz w4, #4, 32f
subs w8, w8, #16
ld1 {v0.16b}, [x2], #16
rev16 v0.16b, v0.16b
st1 {v0.16b}, [x0], #16
b.eq 0f
32:
subs w8, w8, #32
ld1 {v0.16b,v1.16b}, [x2], #32
rev16 v0.16b, v0.16b
rev16 v1.16b, v1.16b
st1 {v0.16b,v1.16b}, [x0], #32
b.gt 32b
0:
subs w5, w5, #1
add x2, x2, x3
add x0, x0, x1
b.gt 1b
ret
endfunc
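
The rev16 instruction above reverses the two bytes inside each 16-bit lane, i.e. it swaps adjacent U/V bytes. A scalar sketch of the same per-row operation (the function name here is ours, not x264's; it mirrors the cleanup loop of the PLANE_COPY_SWAP wrapper added later in this commit, with w counting byte pairs, matching the lsl that doubles the width above):

    #include <stdint.h>

    /* Swap each adjacent byte pair in every row; w counts pairs. */
    static void plane_copy_swap_ref( uint8_t *dst, intptr_t i_dst,
                                     uint8_t *src, intptr_t i_src, int w, int h )
    {
        for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
            for( int x = 0; x < 2*w; x += 2 )
            {
                dst[x]   = src[x+1];
                dst[x+1] = src[x];
            }
    }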
function x264_plane_copy_deinterleave_neon, export=1
add w9, w6, #15
and w9, w9, #0xfffffff0
@@ -1352,7 +1380,7 @@ function x264_plane_copy_deinterleave_rgb_neon, export=1
ret
endfunc
 
function x264_plane_copy_interleave_neon, export=1
function x264_plane_copy_interleave_core_neon, export=1
add w9, w6, #15
and w9, w9, #0xfffffff0
sub x1, x1, x9, lsl #1
@@ -49,8 +49,10 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
@@ -58,9 +60,9 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
 
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
@@ -206,6 +208,10 @@ static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
uint8_t *src, intptr_t stride, int width,
int height, int16_t *buf );
PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
#endif // !HIGH_BIT_DEPTH
 
PROPAGATE_LIST(neon)
@@ -229,6 +235,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
 
pf->plane_copy = x264_plane_copy_neon;
pf->plane_copy_swap = x264_plane_copy_swap_neon;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
@@ -1468,7 +1468,7 @@ function x264_load_deinterleave_chroma_fenc_neon
bx lr
endfunc
 
function x264_plane_copy_neon
function x264_plane_copy_core_neon
push {r4,lr}
ldr r4, [sp, #8]
ldr lr, [sp, #12]
@@ -1577,7 +1577,7 @@ block4:
pop {r4-r8, r10, r11, pc}
endfunc
 
function x264_plane_copy_interleave_neon
function x264_plane_copy_interleave_core_neon
push {r4-r7, lr}
ldrd r6, r7, [sp, #28]
ldrd r4, r5, [sp, #20]
@@ -1604,7 +1604,7 @@ blocki:
pop {r4-r7, pc}
endfunc
 
function x264_plane_copy_swap_neon
function x264_plane_copy_swap_core_neon
push {r4-r5, lr}
ldrd r4, r5, [sp, #12]
add lr, r4, #15
@@ -48,8 +48,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
@@ -57,11 +57,11 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
 
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
@@ -232,6 +232,10 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
src += stride;
}
}
PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
#endif // !HIGH_BIT_DEPTH
 
PROPAGATE_LIST(neon)
@@ -221,7 +221,6 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
}
else if( !strcasecmp( preset, "veryfast" ) )
{
param->analyse.i_me_method = X264_ME_HEX;
param->analyse.i_subpel_refine = 2;
param->i_frame_reference = 1;
param->analyse.b_mixed_references = 0;
@@ -250,11 +249,10 @@ static int x264_param_apply_preset( x264_param_t *param, const char *preset )
}
else if( !strcasecmp( preset, "slow" ) )
{
param->analyse.i_me_method = X264_ME_UMH;
param->analyse.i_subpel_refine = 8;
param->i_frame_reference = 5;
param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
param->analyse.i_trellis = 2;
param->rc.i_lookahead = 50;
}
else if( !strcasecmp( preset, "slower" ) )
@@ -1074,18 +1072,6 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
/****************************************************************************
* x264_log:
****************************************************************************/
#ifdef __ANDROID__
#include <android/log.h>
#define LIBX264_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG, "libx264", __VA_ARGS__))
#define LIBX264_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO , "libx264", __VA_ARGS__))
#define LIBX264_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN , "libx264", __VA_ARGS__))
#define LIBX264_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR, "libx264", __VA_ARGS__))
#else
#define LIBX264_LOGD(...) do {} while (0)
#define LIBX264_LOGI(...) do {} while (0)
#define LIBX264_LOGW(...) do {} while (0)
#define LIBX264_LOGE(...) do {} while (0)
#endif
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... )
{
if( !h || i_level <= h->param.i_log_level )
@@ -1107,25 +1093,20 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
{
case X264_LOG_ERROR:
psz_prefix = "error";
LIBX264_LOGE(psz_fmt, arg);
break;
case X264_LOG_WARNING:
psz_prefix = "warning";
LIBX264_LOGW(psz_fmt, arg);
break;
case X264_LOG_INFO:
psz_prefix = "info";
LIBX264_LOGI(psz_fmt, arg);
break;
case X264_LOG_DEBUG:
psz_prefix = "debug";
LIBX264_LOGD(psz_fmt, arg);
break;
default:
psz_prefix = "unknown";
break;
}
fprintf( stderr, "x264 [%s]: ", psz_prefix );
x264_vfprintf( stderr, psz_fmt, arg );
}
@@ -990,10 +990,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
if( cpu&X264_CPU_MMX2 )
{
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
}
if( cpu&X264_CPU_SSE )
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse;
if( cpu&X264_CPU_SSE2_IS_FAST )
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
if( cpu&X264_CPU_SSSE3 )
@@ -100,6 +100,98 @@ static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, in
}\
}
 
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define PLANE_COPY(align, cpu)\
static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align) / sizeof(pixel) - 1;\
if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
else if( !(w&c_w) )\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
else\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
}\
/* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
memcpy( dst, src, w*sizeof(pixel) );\
}\
}
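
The wrapper rounds w up to the vector width, so the core routine may read up to c_w pixels past the end of each row; that is safe everywhere except on the last row in memory order, which is why that row is finished with a plain memcpy, and why a negative stride flips which row is handled out of line. The small-width fallback is essentially a row-wise memcpy; a sketch from memory of x264's C version (pixel is x264's uint8_t or uint16_t typedef):

    void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
                            pixel *src, intptr_t i_src, int w, int h )
    {
        while( h-- )
        {
            memcpy( dst, src, w * sizeof(pixel) );
            dst += i_dst;
            src += i_src;
        }
    }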
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define PLANE_COPY_SWAP(align, cpu)\
static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align>>1) / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
else if( w > c_w )\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
}\
x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
{\
dst[x] = src[x+1];\
dst[x+1] = src[x];\
}\
}\
else\
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
#define PLANE_INTERLEAVE(cpu) \
static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
pixel *srcu, intptr_t i_srcu,\
pixel *srcv, intptr_t i_srcv, int w, int h )\
{\
int c_w = 16 / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
{\
if( --h > 0 )\
{\
if( i_srcu > 0 )\
{\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
srcu += i_srcu * h;\
srcv += i_srcv * h;\
}\
else\
x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
}\
x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
}\
else\
x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
}
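
The (i_srcu ^ i_srcv) >= 0 guard is a branch-free same-sign test: the XOR of two two's-complement integers is non-negative exactly when their sign bits agree. A self-contained illustration:

    #include <assert.h>
    #include <stdint.h>

    /* The XOR's sign bit is the mismatch of the operands' sign bits,
       so the result is >= 0 iff both strides have the same sign. */
    static int same_sign( intptr_t a, intptr_t b )
    {
        return (a ^ b) >= 0;
    }

    int main( void )
    {
        assert(  same_sign(  64,  64 ) );
        assert(  same_sign( -64, -64 ) );
        assert( !same_sign( -64,  64 ) );
        return 0;
    }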
struct x264_weight_t;
typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int );
typedef struct x264_weight_t
@@ -3430,7 +3430,7 @@ uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
x264_mc_weight_w8_msa( p_dst, *p_dst_stride,
p_dst, *p_dst_stride,
pWeight, i_h4w );
for( i_cnt = i_h4w; i_cnt < i_height ; i_cnt++ )
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
{
uint64_t temp0;
v16i8 zero = {0};
@@ -3666,7 +3666,7 @@ uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
pWeight, i_h4w );
p_src1 = src1_org + i_h4w * i_src_stride;
 
for( i_cnt = i_h4w; i_cnt < i_height ; i_cnt++ )
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
{
uint64_t u_temp0;
v16i8 zero = {0};
@@ -3761,9 +3761,11 @@ uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
return p_src1;
}
}
#endif // !HIGH_BIT_DEPTH
 
void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
if( cpu & X264_CPU_MSA )
{
pf->mc_luma = x264_mc_luma_msa;
@@ -3803,5 +3805,5 @@ void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf )
pf->memzero_aligned = x264_memzero_aligned_msa;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa;
}
#endif // !HIGH_BIT_DEPTH
}
#endif
@@ -370,8 +370,8 @@ static ALWAYS_INLINE int x264_mb_predict_mv_direct16x16_spatial( x264_t *h, int
h->mb.i_partition = partition_col[0];
}
}
int i_mb_4x4 = b_interlaced ? 4 * (h->mb.i_b4_stride*mb_y + mb_x) : h->mb.i_b4_xy ;
int i_mb_8x8 = b_interlaced ? 2 * (h->mb.i_b8_stride*mb_y + mb_x) : h->mb.i_b8_xy ;
int i_mb_4x4 = b_interlaced ? 4 * (h->mb.i_b4_stride*mb_y + mb_x) : h->mb.i_b4_xy;
int i_mb_8x8 = b_interlaced ? 2 * (h->mb.i_b8_stride*mb_y + mb_x) : h->mb.i_b8_xy;
 
int8_t *l1ref0 = &h->fref[1][0]->ref[0][i_mb_8x8];
int8_t *l1ref1 = &h->fref[1][0]->ref[1][i_mb_8x8];
@@ -249,7 +249,7 @@ int x264_threading_init( void );
static ALWAYS_INLINE int x264_pthread_fetch_and_add( int *val, int add, x264_pthread_mutex_t *mutex )
{
#if HAVE_THREAD
#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && ARCH_X86
#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && (ARCH_X86 || ARCH_X86_64)
return __sync_fetch_and_add( val, add );
#else
x264_pthread_mutex_lock( mutex );
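
The widened guard merely lets x86-64 builds take the GCC builtin path too. As a reminder of the builtin's semantics (it returns the value held before the addition), a minimal usage sketch:

    #include <assert.h>

    int main( void )
    {
        int counter = 5;
        /* Atomic read-modify-write; yields the prior value. */
        int old = __sync_fetch_and_add( &counter, 3 );
        assert( old == 5 && counter == 8 );
        return 0;
    }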
@@ -556,6 +556,7 @@ INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
#if HIGH_BIT_DEPTH
#define x264_predict_8x8c_v_mmx2 x264_predict_8x8c_v_mmx
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_c
#define x264_predict_16x16_dc_mmx2 x264_predict_16x16_dc_c
#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse
#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse
#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse
@@ -884,7 +885,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
 
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
@@ -1070,7 +1070,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
#if ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
@@ -71,7 +71,7 @@ int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16]
vec_u32_t multEvenvA, multOddvA;
vec_u16_t mfvA;
vec_u16_t biasvA;
vec_s16_t one = vec_splat_s16(1);;
vec_s16_t one = vec_splat_s16(1);
vec_s16_t nz = zero_s16v;
 
vector bool short mskB;
@@ -216,7 +216,7 @@ int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64]
vec_u32_t multEvenvA, multOddvA;
vec_u16_t mfvA;
vec_u16_t biasvA;
vec_s16_t one = vec_splat_s16(1);;
vec_s16_t one = vec_splat_s16(1);
vec_s16_t nz = zero_s16v;
 
vector bool short mskB;
@@ -38,6 +38,8 @@ const pw_00ff, times 16 dw 0x00ff
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
const pd_1, times 8 dd 1
const pd_0123, dd 0,1,2,3
const pd_4567, dd 4,5,6,7
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
@@ -63,6 +65,7 @@ const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
 
const pd_8, times 4 dd 8
const pd_32, times 4 dd 32
const pd_1024, times 4 dd 1024
const pd_ffff, times 4 dd 0xffff
@@ -1463,9 +1463,9 @@ cglobal zigzag_scan_4x4_frame, 2,2
; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal zigzag_scan_4x4_field, 2,3
movu m4, [r1+ 8]
pshufd m0, m4, q3102
cglobal zigzag_scan_4x4_field, 2,2
movu m0, [r1+ 8]
pshufd m0, m0, q3102
mova m1, [r1+32]
mova m2, [r1+48]
movu [r0+ 8], m0
@@ -1480,19 +1480,14 @@ cglobal zigzag_scan_4x4_field, 2,3
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
INIT_MMX mmx2
cglobal zigzag_scan_4x4_field, 2,3
pshufw m0, [r1+4], q3102
mova m1, [r1+16]
mova m2, [r1+24]
movu [r0+4], m0
mova [r0+16], m1
mova [r0+24], m2
mov r2d, [r1]
mov [r0], r2d
mov r2d, [r1+12]
mov [r0+12], r2d
INIT_XMM sse
cglobal zigzag_scan_4x4_field, 2,2
mova m0, [r1]
mova m1, [r1+16]
pshufw mm0, [r1+4], q3102
mova [r0], m0
mova [r0+16], m1
movq [r0+4], mm0
RET
%endif ; HIGH_BIT_DEPTH
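
The new SSE version stores the whole block with two 16-byte moves, then patches coefficients 2..5 with the pshufw result. In scalar terms the 4x4 field scan only permutes that middle group; a sketch reconstructed from the shuffle (q3102 over the words dct[2..5]), matching x264's C scan to the best of our reading:

    #include <stdint.h>
    #include <string.h>

    static void zigzag_scan_4x4_field_ref( int16_t level[16], int16_t dct[16] )
    {
        memcpy( level, dct, 2 * sizeof(int16_t) );      /* 0 and 1 unchanged */
        level[2] = dct[4];                              /* q3102 permutation */
        level[3] = dct[2];
        level[4] = dct[3];
        level[5] = dct[5];
        memcpy( level+6, dct+6, 10 * sizeof(int16_t) ); /* 6..15 unchanged */
    }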
 
@@ -112,7 +112,7 @@ void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_field_mmx2 ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
@@ -67,7 +67,6 @@ pf_256: times 4 dd 256.0
pf_inv256: times 4 dd 0.00390625
 
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
 
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
@@ -94,6 +93,8 @@ cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
cextern pw_0to15
cextern pd_8
cextern pd_0123
cextern pd_ffff
 
%macro LOAD_ADD 4
@@ -285,7 +286,7 @@ cglobal hpel_filter_c, 3,3,10
psrad m1, 10
psrad m2, 10
pslld m2, 16
pand m1, [pd_0f]
pand m1, [pd_ffff]
por m1, m2
CLIPW m1, [pb_0], [pw_pixel_max]
mova [r0+r2], m1
@@ -2178,7 +2179,7 @@ MBTREE_AVX
 
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs,
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
; int16_t *output, int bipred_weight, int mb_y, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_list_internal, 4,6,8
@@ -2268,6 +2269,67 @@ MBTREE_PROPAGATE_LIST
INIT_XMM avx
MBTREE_PROPAGATE_LIST
 
INIT_YMM avx2
cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8
mova xm4, [pw_0xc000]
%if UNIX64
shl r4d, 9
shl r5d, 16
movd xm5, r4d
movd xm6, r5d
vpbroadcastw xm5, xm5
vpbroadcastd m6, xm6
%else
vpbroadcastw xm5, r4m
vpbroadcastd m6, r5m
psllw xm5, 9 ; bipred_weight << 9
pslld m6, 16
%endif
mov r4d, r6m
lea r1, [r1+r4*2]
lea r2, [r2+r4*2]
lea r0, [r0+r4*4]
neg r4
por m6, [pd_0123] ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y
vbroadcasti128 m7, [pw_31]
.loop:
mova xm3, [r1+r4*2]
pand xm0, xm4, [r2+r4*2]
pmulhrsw xm1, xm3, xm5 ; bipred_amount = (propagate_amount * bipred_weight + 32) >> 6
pcmpeqw xm0, xm4
pblendvb xm3, xm3, xm1, xm0 ; (lists_used == 3) ? bipred_amount : propagate_amount
vpermq m3, m3, q1100
movu m0, [r0+r4*4] ; {x, y}
vbroadcasti128 m1, [pd_8]
psraw m2, m0, 5
paddw m2, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
paddw m6, m1 ; i_mb_x += 8
mova [r3], m2
mova m1, [pw_32]
pand m0, m7
psubw m1, m0
packuswb m1, m0 ; {32-x, 32-y} {x, y} {32-x, 32-y} {x, y}
psrlw m0, m1, 3
pand m1, [pw_00ff] ; 32-x x 32-x x
pandn m0, m7, m0 ; (32-y y 32-y y) << 5
pshufd m2, m1, q1032
pmullw m1, m0 ; idx0 idx3 idx0 idx3
pmullw m2, m0 ; idx1 idx2 idx1 idx2
pmulhrsw m0, m1, m3 ; (idx0 idx3 idx0 idx3) * propagate_amount + 512 >> 10
pmulhrsw m2, m3 ; (idx1 idx2 idx1 idx2) * propagate_amount + 512 >> 10
psignw m0, m1 ; correct potential overflow in the idx0 input to pmulhrsw
punpcklwd m1, m0, m2 ; idx01weight
punpckhwd m2, m0 ; idx23weight
mova [r3+32], m1
mova [r3+64], m2
add r3, 3*mmsize
add r4, 8
jl .loop
RET
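
The inline comments spell out the fixed-point layout: macroblock coordinates come from the top bits of each motion-vector component, and the low 5 bits form bilinear weights that split propagate_amount across the four macroblocks the vector straddles. A scalar sketch of the per-vector arithmetic (names approximated, not x264's; the scatter into ref_costs happens in the C caller generated by PROPAGATE_LIST):

    #include <stdint.h>

    static void propagate_one_mv_ref( const int16_t mv[2], int propagate_amount,
                                      int lists_used, int bipred_weight,
                                      int i_mb_x, int i_mb_y,
                                      int mbxy[2], int16_t weights[4] )
    {
        int amount = propagate_amount;
        if( lists_used == 3 )                 /* both lists: scale by bipred weight */
            amount = ( amount * bipred_weight + 32 ) >> 6;
        mbxy[0] = ( mv[0] >> 5 ) + i_mb_x;    /* integer MB position */
        mbxy[1] = ( mv[1] >> 5 ) + i_mb_y;
        int x = mv[0] & 31, y = mv[1] & 31;   /* 5-bit fractional parts */
        int w[4] = { (32-x)*(32-y), x*(32-y), (32-x)*y, x*y };
        for( int j = 0; j < 4; j++ )          /* pmulhrsw-style rounding */
            weights[j] = ( w[j] * amount + 512 ) >> 10;
    }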
%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
@@ -88,10 +88,8 @@ void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
@@ -101,9 +99,6 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst,
void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
@@ -493,96 +488,12 @@ HPEL(32, avx2, avx2, avx2, avx2)
#endif
#endif // HIGH_BIT_DEPTH
 
#define PLANE_COPY(align, cpu)\
static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align) / sizeof(pixel) - 1;\
if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
else if( !(w&c_w) )\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
else\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
}\
/* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
memcpy( dst, src, w*sizeof(pixel) );\
}\
}
PLANE_COPY(16, sse)
PLANE_COPY(32, avx)
 
#define PLANE_COPY_SWAP(align, cpu)\
static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align>>1) / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
else if( w > c_w )\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
}\
x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
{\
dst[x] = src[x+1];\
dst[x+1] = src[x];\
}\
}\
else\
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}
PLANE_COPY_SWAP(16, ssse3)
PLANE_COPY_SWAP(32, avx2)
 
#define PLANE_INTERLEAVE(cpu) \
static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
pixel *srcu, intptr_t i_srcu,\
pixel *srcv, intptr_t i_srcv, int w, int h )\
{\
int c_w = 16 / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
{\
if( --h > 0 )\
{\
if( i_srcu > 0 )\
{\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
srcu += i_srcu * h;\
srcv += i_srcv * h;\
}\
else\
x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
}\
x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
}\
else\
x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
}
PLANE_INTERLEAVE(mmx2)
PLANE_INTERLEAVE(sse2)
#if HIGH_BIT_DEPTH
@@ -621,6 +532,7 @@ do\
 
PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)
PROPAGATE_LIST(avx2)
 
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
@@ -932,6 +844,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->plane_copy_swap = x264_plane_copy_swap_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
}
@@ -43,6 +43,9 @@ mask_ff: times 16 db 0xff
mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
%if HIGH_BIT_DEPTH
ssd_nv12_shuf: db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
%endif
%if BIT_DEPTH == 10
ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
@@ -531,8 +534,8 @@ SSD 16, 8
;
; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
;
; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
; distortion levels it will take much more than that though.
; For 10-bit XMM this means width >= 32832. At sane distortion levels
; it will take much more than that though.
;-----------------------------------------------------------------------------
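Plugging the remaining XMM case into that formula (reading mmsize in bits, 128, with BIT_DEPTH = 10) confirms the figure in the comment:

    2 * 128/32 * (2^32 - 1) / (2^10 - 1)^2
      = 8 * 4294967295 / 1046529
      ~= 32832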
%if HIGH_BIT_DEPTH
%macro SSD_NV12 0
@@ -541,13 +544,14 @@ cglobal pixel_ssd_nv12_core, 6,7,7
FIX_STRIDES r1, r3
add r0, r4
add r2, r4
xor r6, r6
neg r4
pxor m4, m4
pxor m5, m5
pxor m6, m6
%if mmsize == 32
vbroadcasti128 m6, [ssd_nv12_shuf]
%endif
.loopy:
mov r6, r4
neg r6
pxor m2, m2
pxor m3, m3
.loopx:
@@ -555,11 +559,11 @@ cglobal pixel_ssd_nv12_core, 6,7,7
mova m1, [r0+r6+mmsize]
psubw m0, [r2+r6]
psubw m1, [r2+r6+mmsize]
PSHUFLW m0, m0, q3120
PSHUFLW m1, m1, q3120
%if mmsize >= 16
pshufhw m0, m0, q3120
pshufhw m1, m1, q3120
%if mmsize == 32
pshufb m0, m6
pshufb m1, m6
%else
SBUTTERFLY wd, 0, 1, 6
%endif
%if cpuflag(xop)
pmadcswd m2, m0, m0, m2
@@ -577,59 +581,30 @@ cglobal pixel_ssd_nv12_core, 6,7,7
psubd m3, m1
.no_overread:
%endif
%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
; equation above, putting the width limit at 8208
punpckhdq m0, m2, m6
punpckhdq m1, m3, m6
punpckldq m2, m6
punpckldq m3, m6
paddq m3, m2
paddq m1, m0
paddq m4, m3
paddq m4, m1
%else ; unfortunately paddq is sse2
; emulate 48 bit precision for mmx2 instead
mova m0, m2
mova m1, m3
punpcklwd m2, m6
punpcklwd m3, m6
punpckhwd m0, m6
punpckhwd m1, m6
paddd m3, m2
paddd m1, m0
paddd m4, m3
paddd m5, m1
%endif
punpckhdq m0, m2, m5 ; using HADDD would remove the mmsize/32 part from the
punpckhdq m1, m3, m5 ; equation above, putting the width limit at 8208
punpckldq m2, m5
punpckldq m3, m5
paddq m0, m1
paddq m2, m3
paddq m4, m0
paddq m4, m2
add r0, r1
add r2, r3
dec r5d
jg .loopy
mov r3, r6m
mov r4, r7m
mov r0, r6m
mov r1, r7m
%if mmsize == 32
vextracti128 xm0, m4, 1
paddq xm4, xm0
%endif
%if mmsize >= 16
movq [r3], xm4
movhps [r4], xm4
%else ; fixup for mmx2
SBUTTERFLY dq, 4, 5, 0
mova m0, m4
psrld m4, 16
paddd m5, m4
pslld m0, 16
SBUTTERFLY dq, 0, 5, 4
psrlq m0, 16
psrlq m5, 16
movq [r3], m0
movq [r4], m5
%endif
movq [r0], xm4
movhps [r1], xm4
RET
%endmacro ; SSD_NV12
%endif ; HIGH_BIT_DEPTH
 
%if HIGH_BIT_DEPTH == 0
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
@@ -643,12 +618,12 @@ cglobal pixel_ssd_nv12_core, 6,7
add r4d, r4d
add r0, r4
add r2, r4
neg r4
pxor m3, m3
pxor m4, m4
mova m5, [pw_00ff]
.loopy:
mov r6, r4
neg r6
.loopx:
%if mmsize == 32 ; only 16-byte alignment is guaranteed
movu m2, [r0+r6]
@@ -686,21 +661,27 @@ cglobal pixel_ssd_nv12_core, 6,7
add r2, r3
dec r5d
jg .loopy
mov r3, r6m
mov r4, r7m
HADDD m3, m0
HADDD m4, m0
pxor xm0, xm0
punpckldq xm3, xm0
punpckldq xm4, xm0
movq [r3], xm3
movq [r4], xm4
mov r0, r6m
mov r1, r7m
%if cpuflag(ssse3)
phaddd m3, m4
%else
SBUTTERFLY qdq, 3, 4, 0
paddd m3, m4
%endif
%if mmsize == 32
vextracti128 xm4, m3, 1
paddd xm3, xm4
%endif
psllq xm4, xm3, 32
paddd xm3, xm4
psrlq xm3, 32
movq [r0], xm3
movhps [r1], xm3
RET
%endmacro ; SSD_NV12
%endif ; !HIGH_BIT_DEPTH
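
For orientation, ssd_nv12_core computes the U and V sums of squared differences of an interleaved chroma plane in one pass. A scalar sketch modeled on x264's C fallback:

    #include <stdint.h>

    static void ssd_nv12_core_ref( const uint8_t *pixuv1, intptr_t stride1,
                                   const uint8_t *pixuv2, intptr_t stride2,
                                   int width, int height,
                                   uint64_t *ssd_u, uint64_t *ssd_v )
    {
        *ssd_u = *ssd_v = 0;
        for( int y = 0; y < height; y++, pixuv1 += stride1, pixuv2 += stride2 )
            for( int x = 0; x < width; x++ )
            {
                int du = pixuv1[2*x]   - pixuv2[2*x];   /* U samples */
                int dv = pixuv1[2*x+1] - pixuv2[2*x+1]; /* V samples */
                *ssd_u += du * du;
                *ssd_v += dv * dv;
            }
    }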
 
INIT_MMX mmx2
SSD_NV12
INIT_XMM sse2
SSD_NV12
INIT_XMM avx
@@ -4614,67 +4595,82 @@ cglobal intra_sad_x9_8x8, 5,7,8
;-----------------------------------------------------------------------------
%macro SSIM_ITER 1
%if HIGH_BIT_DEPTH
movdqu m5, [r0+(%1&1)*r1]
movdqu m6, [r2+(%1&1)*r3]
movu m4, [r0+(%1&1)*r1]
movu m5, [r2+(%1&1)*r3]
%elif cpuflag(avx)
pmovzxbw m4, [r0+(%1&1)*r1]
pmovzxbw m5, [r2+(%1&1)*r3]
%else
movq m5, [r0+(%1&1)*r1]
movq m6, [r2+(%1&1)*r3]
punpcklbw m5, m0
punpcklbw m6, m0
movq m4, [r0+(%1&1)*r1]
movq m5, [r2+(%1&1)*r3]
punpcklbw m4, m7
punpcklbw m5, m7
%endif
%if %1==1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
%endif
%if %1==0
movdqa m1, m5
movdqa m2, m6
%if %1 == 0 && cpuflag(avx)
SWAP 0, 4
SWAP 1, 5
pmaddwd m4, m0, m0
pmaddwd m5, m1, m1
pmaddwd m6, m0, m1
%else
%if %1 == 0
mova m0, m4
mova m1, m5
%else
paddw m0, m4
paddw m1, m5
paddw m2, m6
%endif
pmaddwd m7, m5, m6
pmaddwd m6, m4, m5
pmaddwd m4, m4
pmaddwd m5, m5
pmaddwd m6, m6
ACCUM paddd, 3, 5, %1
ACCUM paddd, 4, 7, %1
paddd m3, m6
%endif
ACCUM paddd, 2, 4, %1
ACCUM paddd, 3, 6, %1
paddd m2, m5
%endmacro
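
The four accumulators SSIM_ITER builds are the usual SSIM block statistics. A scalar sketch of what pixel_ssim_4x4x2_core produces for two adjacent 4x4 blocks (modeled on x264's C version, from memory):

    #include <stdint.h>

    static void ssim_4x4x2_core_ref( const uint8_t *pix1, intptr_t stride1,
                                     const uint8_t *pix2, intptr_t stride2,
                                     int sums[2][4] )
    {
        for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
        {
            int s1 = 0, s2 = 0, ss = 0, s12 = 0;
            for( int y = 0; y < 4; y++ )
                for( int x = 0; x < 4; x++ )
                {
                    int a = pix1[x + y*stride1];
                    int b = pix2[x + y*stride2];
                    s1  += a;          /* sum of first block's pixels  */
                    s2  += b;          /* sum of second block's pixels */
                    ss  += a*a + b*b;  /* sum of squares               */
                    s12 += a*b;        /* cross term                   */
                }
            sums[z][0] = s1, sums[z][1] = s2, sums[z][2] = ss, sums[z][3] = s12;
        }
    }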
 
%macro SSIM 0
cglobal pixel_ssim_4x4x2_core, 4,4,8
%if HIGH_BIT_DEPTH
cglobal pixel_ssim_4x4x2_core, 4,4,7
FIX_STRIDES r1, r3
pxor m0, m0
%else
cglobal pixel_ssim_4x4x2_core, 4,4,7+notcpuflag(avx)
%if notcpuflag(avx)
pxor m7, m7
%endif
%endif
SSIM_ITER 0
SSIM_ITER 1
SSIM_ITER 2
SSIM_ITER 3
; PHADDW m1, m2
; PHADDD m3, m4
movdqa m7, [pw_1]
pshufd m5, m3, q2301
pmaddwd m1, m7
pmaddwd m2, m7
pshufd m6, m4, q2301
packssdw m1, m2
paddd m3, m5
pshufd m1, m1, q3120
paddd m4, m6
pmaddwd m1, m7
punpckhdq m5, m3, m4
punpckldq m3, m4
%if UNIX64
%define t0 r4
DECLARE_REG_TMP 4
%else
%define t0 rax
mov t0, r4mp
DECLARE_REG_TMP 0
mov t0, r4mp
%endif
movq [t0+ 0], m1
movq [t0+ 8], m3
movhps [t0+16], m1
movq [t0+24], m5
%if cpuflag(ssse3)
phaddw m0, m1
pmaddwd m0, [pw_1]
phaddd m2, m3
%else
mova m4, [pw_1]
pmaddwd m0, m4
pmaddwd m1, m4
packssdw m0, m1
shufps m1, m2, m3, q2020
shufps m2, m3, q3131
pmaddwd m0, m4
paddd m2, m1
%endif
shufps m1, m0, m2, q2020
shufps m0, m2, q3131
mova [t0], m1
mova [t0+16], m0
RET
 
;-----------------------------------------------------------------------------
@@ -145,9 +145,6 @@ int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, u
int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
 
void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
@@ -2092,63 +2092,28 @@ PREDICT_16x16_H
%endif
 
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core( pixel *src, int i_dc_left )
; void predict_16x16_dc( pixel *src )
;-----------------------------------------------------------------------------
%macro PRED16x16_DC_MMX 2
%if HIGH_BIT_DEPTH
mova m0, [r0 - FDEC_STRIDEB+ 0]
paddw m0, [r0 - FDEC_STRIDEB+ 8]
paddw m0, [r0 - FDEC_STRIDEB+16]
paddw m0, [r0 - FDEC_STRIDEB+24]
HADDW m0, m1
paddw m0, %1
psrlw m0, %2
SPLATW m0, m0
STORE16 m0, m0, m0, m0
%else ; !HIGH_BIT_DEPTH
pxor m0, m0
pxor m1, m1
psadbw m0, [r0 - FDEC_STRIDE]
psadbw m1, [r0 - FDEC_STRIDE + 8]
paddusw m0, m1
paddusw m0, %1
psrlw m0, %2 ; dc
pshufw m0, m0, 0
packuswb m0, m0 ; dc in bytes
STORE16 m0, m0
%endif
%endmacro
INIT_MMX mmx2
cglobal predict_16x16_dc_core, 1,2
%if ARCH_X86_64
movd m6, r1d
PRED16x16_DC_MMX m6, 5
%if WIN64
DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes
%else
PRED16x16_DC_MMX r1m, 5
DECLARE_REG_TMP 3
%endif
RET
INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC_MMX [pw_8], 4
RET
 
INIT_MMX mmx2
%if HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16 m0, m0, m0, m0
RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
pshufw m0, m0, 0
packuswb m0, m0
STORE16 m0, m0
INIT_XMM
; Returns the sum of the left pixels in r1d+r2d
cglobal predict_16x16_dc_left_internal, 0,4
movzx r1d, pixel [r0-SIZEOF_PIXEL]
movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL]
%assign i 2*FDEC_STRIDEB
%rep 7
movzx t0d, pixel [r0+i-SIZEOF_PIXEL]
add r1d, t0d
movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL]
add r2d, t0d
%assign i i+2*FDEC_STRIDEB
%endrep
RET
%endif
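
With the *_core entry points gone, each predictor is now self-contained: sum the 16 left and 16 top neighbours, add 16 for rounding, shift right by 5, and splat the result. A scalar sketch modeled on x264's predict_16x16_dc_c (FDEC_STRIDE, x264's decoded-block stride, is assumed to be 32 here):

    #include <stdint.h>
    #include <string.h>

    enum { FDEC_STRIDE = 32 };   /* x264 fdec buffer stride (assumption) */

    static void predict_16x16_dc_ref( uint8_t *src )
    {
        int dc = 16;                              /* rounding term */
        for( int i = 0; i < 16; i++ )
        {
            dc += src[i - FDEC_STRIDE];           /* top row       */
            dc += src[-1 + i * FDEC_STRIDE];      /* left column   */
        }
        dc >>= 5;
        for( int y = 0; y < 16; y++ )
            memset( &src[y * FDEC_STRIDE], dc, 16 );
    }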
 
%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
@@ -2176,9 +2141,11 @@ cglobal predict_16x16_dc_left_core, 1,1
%endif
%endmacro
 
%macro PREDICT_16x16_DC_CORE 0
cglobal predict_16x16_dc_core, 2,2,4
movd xm3, r1m
%macro PREDICT_16x16_DC 0
cglobal predict_16x16_dc, 1,3
call predict_16x16_dc_left_internal
lea r1d, [r1+r2+16]
movd xm3, r1d
PRED16x16_DC xm3, 5
RET
 
@@ -2186,8 +2153,11 @@ cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC [pw_8], 4
RET
 
cglobal predict_16x16_dc_left_core, 1,2
movd xm0, r1m
cglobal predict_16x16_dc_left, 1,3
call predict_16x16_dc_left_internal
lea r1d, [r1+r2+8]
shr r1d, 4
movd xm0, r1d
SPLATW m0, xm0
%if HIGH_BIT_DEPTH && mmsize == 16
STORE16 m0, m0
@@ -2201,11 +2171,11 @@ cglobal predict_16x16_dc_left_core, 1,2
%endmacro
 
INIT_XMM sse2
PREDICT_16x16_DC_CORE
PREDICT_16x16_DC
%if HIGH_BIT_DEPTH
INIT_YMM avx2
PREDICT_16x16_DC_CORE
PREDICT_16x16_DC
%else
INIT_XMM avx2
PREDICT_16x16_DC_CORE
PREDICT_16x16_DC
%endif