Commit d8de3cd6 authored by Leo Ma

Upgrade libx264


Signed-off-by: Leo Ma <begeekmyfriend@gmail.com>
parent 0984dd59
Showing changes with 1920 additions and 806 deletions
@@ -121,8 +121,8 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int i_mode = x264_size2pixel[height][width];
intptr_t i_stride0 = 16, i_stride1 = 16;
ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
ALIGNED_ARRAY_32( pixel, tmp0,[16*16] );
ALIGNED_ARRAY_32( pixel, tmp1,[16*16] );
pixel *src0, *src1;
 
MC_LUMA_BI( 0 );
@@ -260,7 +260,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint32_t) );
 
/* 0 -> 3 top(4), 4 -> 6 : left(3) */
PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
@@ -326,7 +326,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
 
PREALLOC_END( h->mb.base );
 
memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint32_t) );
 
for( int i = 0; i < 2; i++ )
{
@@ -388,7 +388,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
}
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+15)&~15) * sizeof(int16_t);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
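(Note on the changed rounding: (i_mb_width+15)&~15 rounds the row width up to the next multiple of 16 — e.g. i_mb_width = 25 gives 25+15 = 40 and 40 & ~15 = 32 — presumably so the mbtree scratch rows match the wider SIMD kernels added elsewhere in this upgrade.)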
@@ -532,16 +532,16 @@ void x264_macroblock_thread_init( x264_t *h )
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
if( CHROMA444 )
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE;
}
else
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16;
}
}
 
@@ -1738,7 +1738,7 @@ void x264_macroblock_cache_save( x264_t *h )
h->mb.i_last_dqp = 0;
h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2;
h->mb.i_cbp_luma = 0xf;
h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x700;
h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x1700;
h->mb.b_transform_8x8 = 0;
for( int i = 0; i < 48; i++ )
h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 1 : 16;
@@ -325,15 +325,14 @@ void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
}
}
 
static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h )
void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h )
{
for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, src+=i_src )
for( int x=0; x<w; x++ )
{
dstu[x] = src[2*x];
dstv[x] = src[2*x+1];
dsta[x] = src[2*x];
dstb[x] = src[2*x+1];
}
}
 
@@ -362,9 +361,9 @@ static ALWAYS_INLINE uint32_t v210_endian_fix32( uint32_t x )
#define v210_endian_fix32(x) (x)
#endif
 
void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
pixel *dstc, intptr_t i_dstc,
uint32_t *src, intptr_t i_src, int w, int h )
static void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
pixel *dstc, intptr_t i_dstc,
uint32_t *src, intptr_t i_src, int w, int h )
{
for( int l = 0; l < h; l++ )
{
@@ -649,6 +648,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
pf->plane_copy_swap = x264_plane_copy_swap_c;
pf->plane_copy_interleave = x264_plane_copy_interleave_c;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;
 
@@ -160,6 +160,39 @@ static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src,
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}
 
void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
/* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYVY
* input with the additional constraint that we cannot overread src. */
#define PLANE_COPY_YUYV(align, cpu)\
static void x264_plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\
pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align>>1) / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
else if( w > c_w )\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
dsta += i_dsta * h;\
dstb += i_dstb * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\
src+i_src, i_src, w, h );\
}\
x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\
}\
else\
x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
}
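
In plain C, the overread-avoidance pattern that PLANE_COPY_YUYV implements looks roughly like the sketch below. This is illustrative only: simd_deinterleave stands in for the cpu-specific x264_plane_copy_deinterleave_##cpu kernel, and the exact-multiple-width, very-narrow-width and negative-stride cases handled by the macro are omitted. Every row except the last one in memory can safely go through the SIMD kernel, because its overread lands inside the following row of the same buffer; only the final row falls back to the scalar C version.

    static void deinterleave_yuyv_sketch( pixel *dsta, intptr_t i_dsta,
                                          pixel *dstb, intptr_t i_dstb,
                                          pixel *src, intptr_t i_src, int w, int h )
    {
        if( h > 1 ) /* SIMD on all but the last row: any overread stays inside the buffer */
            simd_deinterleave( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h-1 );
        /* Finish the last row with the exact-width C version so src is never overread. */
        x264_plane_copy_deinterleave_c( dsta + (h-1)*i_dsta, 0, dstb + (h-1)*i_dstb, 0,
                                        src + (h-1)*i_src, 0, w, 1 );
    }
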
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
@@ -260,6 +293,8 @@ typedef struct
/* may write up to 15 pixels off the end of each plane */
void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h );
void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
@@ -108,10 +108,10 @@ int x264_is_pipe( const char *path );
#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#endif
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
 
// ARM compilers don't reliably align stack variables
// - EABI requires only 8 byte stack alignment to be maintained
@@ -125,39 +125,39 @@ int x264_is_pipe( const char *path );
type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask)
 
#if ARCH_ARM && SYS_MACOSX
#define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
#define ALIGNED_ARRAY_8( ... ) EXPAND( ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ ) )
#else
#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
ALIGNED_8( type name sub1 __VA_ARGS__ )
#define ALIGNED_ARRAY_8( type, name, sub1, ... ) ALIGNED_8( type name sub1 __VA_ARGS__ )
#endif
 
#if ARCH_ARM
#define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
#define ALIGNED_ARRAY_16( ... ) EXPAND( ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ ) )
#else
#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
ALIGNED_16( type name sub1 __VA_ARGS__ )
#define ALIGNED_ARRAY_16( type, name, sub1, ... ) ALIGNED_16( type name sub1 __VA_ARGS__ )
#endif
 
#define EXPAND(x) x
 
#if ARCH_X86 || ARCH_X86_64
#define NATIVE_ALIGN 64
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
#define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 )
#if STACK_ALIGNMENT >= 32
#define ALIGNED_ARRAY_32( type, name, sub1, ... )\
ALIGNED_32( type name sub1 __VA_ARGS__ )
#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
#endif
#if STACK_ALIGNMENT >= 64
#define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
/* For AVX2 */
#if ARCH_X86 || ARCH_X86_64
#define NATIVE_ALIGN 32
#define ALIGNED_N ALIGNED_32
#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32
#endif
#else
#define NATIVE_ALIGN 16
#define ALIGNED_N ALIGNED_16
#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
#define ALIGNED_32 ALIGNED_16
#define ALIGNED_64 ALIGNED_16
#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
#endif
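
A usage sketch for the reworked macros (the array name, element type and dimensions here are only an example): a stack buffer that AVX-512 code reads with full 64-byte loads is declared through ALIGNED_ARRAY_64, which degrades to the over-allocate-and-round ALIGNED_ARRAY_EMU emulation whenever STACK_ALIGNMENT cannot guarantee the requested alignment.

    /* 4 blocks of 64 coefficients, 64-byte aligned for AVX-512 full-register accesses */
    ALIGNED_ARRAY_64( dctcoef, dct8x8, [4],[64] );
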
 
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
@@ -201,28 +201,32 @@ PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 )
/****************************************************************************
* pixel_var2_wxh
****************************************************************************/
#define PIXEL_VAR2_C( name, w, h, shift ) \
static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \
#define PIXEL_VAR2_C( name, h, shift ) \
static int name( pixel *fenc, pixel *fdec, int ssd[2] ) \
{ \
int var = 0, sum = 0, sqr = 0; \
int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0; \
for( int y = 0; y < h; y++ ) \
{ \
for( int x = 0; x < w; x++ ) \
for( int x = 0; x < 8; x++ ) \
{ \
int diff = pix1[x] - pix2[x]; \
sum += diff; \
sqr += diff * diff; \
int diff_u = fenc[x] - fdec[x]; \
int diff_v = fenc[x+FENC_STRIDE/2] - fdec[x+FDEC_STRIDE/2]; \
sum_u += diff_u; \
sum_v += diff_v; \
sqr_u += diff_u * diff_u; \
sqr_v += diff_v * diff_v; \
} \
pix1 += i_stride1; \
pix2 += i_stride2; \
fenc += FENC_STRIDE; \
fdec += FDEC_STRIDE; \
} \
var = sqr - ((int64_t)sum * sum >> shift); \
*ssd = sqr; \
return var; \
ssd[0] = sqr_u; \
ssd[1] = sqr_v; \
return sqr_u - ((int64_t)sum_u * sum_u >> shift) + \
sqr_v - ((int64_t)sum_v * sum_v >> shift); \
}
 
PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 )
PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 8, 6 )
PIXEL_VAR2_C( x264_pixel_var2_8x16, 16, 7 )
PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 6 )
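
For reference, each plane's contribution to the new return value is the usual single-pass variance identity: with N = 8*h difference samples per plane, it is sum(d^2) - (sum(d))^2 / N, the division being the >> shift above (shift = log2(N): 6 for the 8x8 block, 7 for 8x16). The rewritten interface also drops the explicit pointers and strides: U and V sit side by side within each row of the fixed-stride fenc/fdec buffers, with V at an offset of FENC_STRIDE/2 resp. FDEC_STRIDE/2, which is what the fenc[x+FENC_STRIDE/2] / fdec[x+FDEC_STRIDE/2] accesses rely on.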
 
#if BIT_DEPTH > 8
typedef uint32_t sum_t;
@@ -885,13 +889,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
 
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
@@ -962,7 +959,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( sad, _ssse3 );
INIT7( sad_x3, _ssse3 );
INIT7( sad_x4, _ssse3 );
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
#endif
INIT6( satd, _ssse3 );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3;
 
@@ -1003,7 +1002,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_AVX )
{
INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs don't benefit from an aligned version */
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
#endif
INIT6( satd, _avx );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1028,8 +1029,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5( sad_x3, _xop );
INIT5( sad_x4, _xop );
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->vsad = x264_pixel_vsad_xop;
pixf->asd8 = x264_pixel_asd8_xop;
#if ARCH_X86_64
@@ -1044,10 +1043,19 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _avx2 );
INIT2( sad_x4, _avx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
pixf->vsad = x264_pixel_vsad_avx2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
@@ -1067,16 +1075,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
INIT_ADS( _mmx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
pixf->vsad = x264_pixel_vsad_mmx2;
 
if( cpu&X264_CPU_CACHELINE_32 )
@@ -1197,7 +1200,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3;
#endif
}
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
#endif
if( cpu&X264_CPU_SLOW_ATOM )
{
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom;
@@ -1280,7 +1285,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
#endif
INIT4( hadamard_ac, _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
@@ -1321,11 +1328,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
#endif
@@ -1338,7 +1340,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _avx2 );
INIT4( satd, _avx2 );
INIT2( hadamard_ac, _avx2 );
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx2 );
#endif
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
@@ -1351,6 +1355,21 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
#endif
}
if( cpu&X264_CPU_AVX512 )
{
INIT8( sad, _avx512 );
INIT8_NAME( sad_aligned, sad, _avx512 );
INIT7( sad_x3, _avx512 );
INIT7( sad_x4, _avx512 );
INIT8( satd, _avx512 );
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
}
#endif //HAVE_MMX
 
#if HAVE_ARMV6
@@ -1480,8 +1499,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa;
//pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa;
//pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8;
}
@@ -93,8 +93,7 @@ typedef struct
uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
 
uint64_t (*var[4])( pixel *pix, intptr_t stride );
int (*var2[4])( pixel *pix1, intptr_t stride1,
pixel *pix2, intptr_t stride2, int *ssd );
int (*var2[4])( pixel *fenc, pixel *fdec, int ssd[2] );
uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride );
 
void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1,
@@ -293,12 +293,8 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix
vec_vsx_st( dcvsum8, 0, dest ); \
}
 
static void idct8_dc_altivec( uint8_t *dst, int16_t dc1, int16_t dc2 )
static void idct8_dc_altivec( uint8_t *dst, vec_s16_t dcv )
{
dc1 = (dc1 + 32) >> 6;
dc2 = (dc2 + 32) >> 6;
vec_s16_t dcv = { dc1, dc1, dc1, dc1, dc2, dc2, dc2, dc2 };
LOAD_ZERO;
ALTIVEC_STORE8_DC_SUM_CLIP( &dst[0*FDEC_STRIDE], dcv );
ALTIVEC_STORE8_DC_SUM_CLIP( &dst[1*FDEC_STRIDE], dcv );
@@ -308,8 +304,18 @@ static void idct8_dc_altivec( uint8_t *dst, int16_t dc1, int16_t dc2 )
 
void x264_add8x8_idct_dc_altivec( uint8_t *p_dst, int16_t dct[4] )
{
idct8_dc_altivec( &p_dst[0], dct[0], dct[1] );
idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2], dct[3] );
vec_s16_t dcv;
vec_s16_t v32 = vec_sl( vec_splat_s16( 8 ), vec_splat_u16( 2 ) );
vec_u16_t v6 = vec_splat_u16( 6 );
vec_s16_t dctv = vec_vsx_ld( 0, dct );
dctv = vec_sra( vec_add( dctv, v32 ), v6 );
dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 0 ), (vec_s32_t)vec_splat( dctv, 1 ) );
dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv );
idct8_dc_altivec( &p_dst[0], dcv );
dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 2 ), (vec_s32_t)vec_splat( dctv, 3 ) );
dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv );
idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dcv );
}
 
#define IDCT_1D_ALTIVEC(s0, s1, s2, s3, d0, d1, d2, d3) \
@@ -32,19 +32,6 @@
typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src,
uint8_t *dst, intptr_t i_dst, int i_height );
 
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
pix[ 3*i_pix_next];
}
static inline int x264_tapfilter1( uint8_t *pix )
{
return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] +
pix[ 3];
}
static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst,
uint8_t *src1, intptr_t i_src1,
uint8_t *src2, int i_height )
@@ -460,9 +460,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
#if ARCH_X86
pf->denoise_dct = x264_denoise_dct_mmx;
pf->decimate_score15 = x264_decimate_score15_mmx2;
pf->decimate_score16 = x264_decimate_score16_mmx2;
pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
@@ -473,8 +470,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#endif
pf->coeff_last4 = x264_coeff_last4_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
}
if( cpu&X264_CPU_SSE2 )
{
@@ -499,17 +494,18 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
}
}
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_lzcnt;
pf->coeff_last8 = x264_coeff_last8_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt;
pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt;
}
if( cpu&X264_CPU_SSSE3 )
{
@@ -557,8 +553,20 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_8x8 = x264_dequant_8x8_avx2;
pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->dequant_4x4 = x264_dequant_4x4_avx512;
pf->dequant_8x8 = x264_dequant_8x8_avx512;
pf->decimate_score15 = x264_decimate_score15_avx512;
pf->decimate_score16 = x264_decimate_score16_avx512;
pf->decimate_score64 = x264_decimate_score64_avx512;
pf->coeff_last4 = x264_coeff_last4_avx512;
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
@@ -586,9 +594,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_mmx2;
pf->quant_8x8 = x264_quant_8x8_mmx2;
pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
pf->decimate_score15 = x264_decimate_score15_mmx2;
pf->decimate_score16 = x264_decimate_score16_mmx2;
pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
@@ -599,13 +604,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
}
}
 
if( cpu&X264_CPU_SSE2 )
@@ -634,14 +632,19 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
}
}
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_lzcnt;
pf->coeff_last8 = x264_coeff_last8_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt;
pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt;
}
 
if( cpu&X264_CPU_SSSE3 )
@@ -657,17 +660,19 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
#if ARCH_X86 || !defined( __MACH__ )
pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
}
#endif
}
 
if( cpu&X264_CPU_SSE4 )
@@ -717,12 +722,28 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
}
pf->decimate_score64 = x264_decimate_score64_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
#if ARCH_X86 || !defined( __MACH__ )
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
#endif
}
if( cpu&X264_CPU_AVX512 )
{
if( h->param.i_cqm_preset == X264_CQM_FLAT )
pf->dequant_8x8 = x264_dequant_8x8_flat16_avx512;
else
{
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
pf->dequant_4x4 = x264_dequant_4x4_avx512;
pf->dequant_8x8 = x264_dequant_8x8_avx512;
}
pf->decimate_score15 = x264_decimate_score15_avx512;
pf->decimate_score16 = x264_decimate_score16_avx512;
pf->decimate_score64 = x264_decimate_score64_avx512;
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
}
#endif // HAVE_MMX
 
@@ -53,21 +53,32 @@ coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
%endmacro
 
cextern coeff_last4_mmx2
cextern coeff_last4_mmx2_lzcnt
cextern coeff_last4_lzcnt
%if HIGH_BIT_DEPTH
cextern coeff_last4_avx512
%endif
cextern coeff_last15_sse2
cextern coeff_last15_sse2_lzcnt
cextern coeff_last15_lzcnt
cextern coeff_last15_avx512
cextern coeff_last16_sse2
cextern coeff_last16_sse2_lzcnt
cextern coeff_last16_lzcnt
cextern coeff_last16_avx512
cextern coeff_last64_sse2
cextern coeff_last64_sse2_lzcnt
cextern coeff_last64_avx2_lzcnt
cextern coeff_last64_lzcnt
cextern coeff_last64_avx2
cextern coeff_last64_avx512
 
%ifdef PIC
SECTION .data
%endif
coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_lzcnt: COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_avx2: COEFF_LAST_TABLE lzcnt, avx2, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%if HIGH_BIT_DEPTH
coeff_last_avx512: COEFF_LAST_TABLE avx512, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%else
coeff_last_avx512: COEFF_LAST_TABLE lzcnt, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%endif
%endif
 
SECTION .text
@@ -100,7 +111,7 @@ struc cb
.start: pointer 1
.p: pointer 1
.end: pointer 1
align 16, resb 1
align 64, resb 1
.bits_encoded: resd 1
.state: resb 1024
endstruc
@@ -352,25 +363,33 @@ CABAC bmi2
%endmacro
 
%macro ABS_DCTCOEFS 2
%assign i 0
%rep %2/16
%if HIGH_BIT_DEPTH
ABSD m0, [%1+ 0+i*64], m4
ABSD m1, [%1+16+i*64], m5
ABSD m2, [%1+32+i*64], m4
ABSD m3, [%1+48+i*64], m5
mova [rsp+ 0+i*64], m0
mova [rsp+16+i*64], m1
mova [rsp+32+i*64], m2
mova [rsp+48+i*64], m3
%define %%abs ABSD
%else
ABSW m0, [%1+ 0+i*32], m2
ABSW m1, [%1+16+i*32], m3
mova [rsp+ 0+i*32], m0
mova [rsp+16+i*32], m1
%endif
%define %%abs ABSW
%endif
%if mmsize == %2*SIZEOF_DCTCOEF
%%abs m0, [%1], m1
mova [rsp], m0
%elif mmsize == %2*SIZEOF_DCTCOEF/2
%%abs m0, [%1+0*mmsize], m2
%%abs m1, [%1+1*mmsize], m3
mova [rsp+0*mmsize], m0
mova [rsp+1*mmsize], m1
%else
%assign i 0
%rep %2*SIZEOF_DCTCOEF/(4*mmsize)
%%abs m0, [%1+(4*i+0)*mmsize], m4
%%abs m1, [%1+(4*i+1)*mmsize], m5
%%abs m2, [%1+(4*i+2)*mmsize], m4
%%abs m3, [%1+(4*i+3)*mmsize], m5
mova [rsp+(4*i+0)*mmsize], m0
mova [rsp+(4*i+1)*mmsize], m1
mova [rsp+(4*i+2)*mmsize], m2
mova [rsp+(4*i+3)*mmsize], m3
%assign i i+1
%endrep
%endif
%endmacro
 
%macro SIG_OFFSET 1
@@ -403,16 +422,14 @@ CABAC bmi2
%endif
 
%ifdef PIC
cglobal func, 4,13
cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
lea r12, [$$]
%define GLOBAL +r12-$$
%else
cglobal func, 4,12
cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF
%define GLOBAL
%endif
 
%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
SUB rsp, pad
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8
@@ -429,15 +446,13 @@ CABAC bmi2
ABS_DCTCOEFS r0, 64
%else
mov r4, r0 ; r4 = dct
mov r6, ~SIZEOF_DCTCOEF
and r6, r4 ; handle AC coefficient case
ABS_DCTCOEFS r6, 16
sub r4, r6 ; calculate our new dct pointer
and r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case
ABS_DCTCOEFS r4, 16
xor r4, r0 ; calculate our new dct pointer
add r4, rsp ; restore AC coefficient offset
%endif
mov r1, [%2+gprsize*r2 GLOBAL]
; for improved OOE performance, run coeff_last on the original coefficients.
call r1 ; coeff_last[ctx_block_cat]( dct )
call [%2+gprsize*r2 GLOBAL] ; coeff_last[ctx_block_cat]( dct )
; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
@@ -521,7 +536,6 @@ CABAC bmi2
jge .coeff_loop
.end:
mov [r3+cb.bits_encoded-cb.state], r0d
ADD rsp, pad
RET
%endmacro
 
@@ -529,15 +543,23 @@ CABAC bmi2
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
INIT_XMM lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
INIT_XMM ssse3
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM ssse3,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
%if HIGH_BIT_DEPTH
INIT_ZMM avx512
%else
INIT_YMM avx512
%endif
CABAC_RESIDUAL_RD 0, coeff_last_avx512
INIT_ZMM avx512
CABAC_RESIDUAL_RD 1, coeff_last_avx512
%endif
 
;-----------------------------------------------------------------------------
@@ -615,7 +637,7 @@ CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
%endmacro
 
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15
cglobal cabac_block_residual_internal, 4,15,0,-4*64
%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
lea r7, [$$]
@@ -625,8 +647,6 @@ cglobal cabac_block_residual_internal, 4,15
%define lastm r7d
%define GLOBAL
%endif
%assign pad gprsize+4*2+4*64-(stack_offset&15)
SUB rsp, pad
shl r1d, 4
 
%define sigoffq r8
@@ -653,8 +673,7 @@ cglobal cabac_block_residual_internal, 4,15
mov dct, r0
mov leveloffm, leveloffd
 
mov r1, [%1+gprsize*r2 GLOBAL]
call r1
call [%1+gprsize*r2 GLOBAL]
mov lastm, eax
; put cabac in r0; needed for cabac_encode_decision
mov r0, r3
@@ -742,15 +761,16 @@ cglobal cabac_block_residual_internal, 4,15
%endif
dec coeffidxd
jge .level_loop
ADD rsp, pad
RET
%endmacro
 
%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL coeff_last_sse2_lzcnt
INIT_XMM avx2,bmi2
CABAC_RESIDUAL coeff_last_avx2_lzcnt
INIT_XMM lzcnt
CABAC_RESIDUAL coeff_last_lzcnt
INIT_XMM avx2
CABAC_RESIDUAL coeff_last_avx2
INIT_XMM avx512
CABAC_RESIDUAL coeff_last_avx512
%endif
@@ -53,18 +53,16 @@ cglobal cpu_cpuid, 5,7
RET
 
;-----------------------------------------------------------------------------
; void cpu_xgetbv( int op, int *eax, int *edx )
; uint64_t cpu_xgetbv( int xcr )
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
push r2
push r1
mov ecx, r0d
cglobal cpu_xgetbv
movifnidn ecx, r0m
xgetbv
pop r4
mov [r4], eax
pop r4
mov [r4], edx
RET
%if ARCH_X86_64
shl rdx, 32
or rax, rdx
%endif
ret
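
On the C side, the new prototype returns XCR0 as a single 64-bit value; a minimal sketch of how it is typically consumed (bit 1 = XMM state, bit 2 = YMM state, bits 5-7 = opmask/ZMM state, so 0x6 gates AVX and 0xe6 gates AVX-512 — the surrounding cpu-detection code is not part of this hunk and the flag-setting is elided):

    uint64_t xcr0 = x264_cpu_xgetbv( 0 );   /* read XCR0 */
    if( (xcr0 & 0x6) == 0x6 )               /* OS saves XMM+YMM state -> AVX usable */
    {
        /* ... enable AVX-level flags ... */
        if( (xcr0 & 0xe6) == 0xe6 )         /* plus opmask/ZMM state -> AVX-512 usable */
        {
            /* ... enable AVX-512 flags ... */
        }
    }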
 
%if ARCH_X86_64
 
@@ -77,7 +75,7 @@ cglobal stack_align
%if WIN64
sub rsp, 32 ; shadow space
%endif
and rsp, ~31
and rsp, ~(STACK_ALIGNMENT-1)
mov rax, r0
mov r0, r1
mov r1, r2
@@ -118,7 +116,7 @@ cglobal stack_align
push ebp
mov ebp, esp
sub esp, 12
and esp, ~31
and esp, ~(STACK_ALIGNMENT-1)
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx
@@ -30,7 +30,41 @@
%include "x86inc.asm"
%include "x86util.asm"
 
SECTION_RODATA 32
SECTION_RODATA 64
; AVX-512 permutation indices are bit-packed to save cache
%if HIGH_BIT_DEPTH
scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3: 4x4_frame
dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8: 8x8_frame1
dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13: 8x8_frame2
dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3
; bits 19-23: 8x8_frame4
scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4: 8x8_field1
dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9: 8x8_field2
dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3
dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4
cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4: interleave1
dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9: interleave2
dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
dct_avx512: dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits 0-4: dct8x8_fenc bits 5-9: dct8x8_fdec
dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits 10-13: dct16x16_fenc bits 14-18: dct16x16_fdec
dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1 bits(e) 28-31: idct8x8_idct2
dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30
scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5: 8x8_field1
dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2
dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a
dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2
cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5: interleave1
dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11: interleave2
dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd
dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd
%endif
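
Each table element above packs one permutation index per destination table at the bit offset and width given in the comments; conceptually the extraction is just a shift and mask (illustrative C only — the AVX-512 kernels unpack these with vector shift/mask instructions, not scalar code):

    /* Extract one packed index given its bit offset and field width, e.g.
     * offset 0 / width 4 for the 4x4_frame index, offset 4 / width 5 for 8x8_frame1. */
    static inline int unpack_scan_index( uint32_t packed, int offset, int width )
    {
        return (int)((packed >> offset) & ((1u << width) - 1));
    }
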
pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
@@ -580,6 +614,217 @@ cglobal sub16x16_dct, 3,3,6
DCT4_1D 0, 1, 2, 3, 4
STORE16_DCT_AVX2 0, 1, 2, 3, 4
ret
%macro DCT4x4_AVX512 0
psubw m0, m2 ; 0 1
psubw m1, m3 ; 3 2
SUMSUB_BA w, 1, 0, 2
SBUTTERFLY wd, 1, 0, 2
paddw m2, m1, m0
psubw m3, m1, m0
paddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1
psubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3
shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2
punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1
SUMSUB_BA w, 1, 2, 3
shufps m3, m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 b1-b2 d1-d2
shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3
paddw m2, m1, m3
psubw m0, m1, m3
paddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1
psubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3'
%endmacro
INIT_XMM avx512
cglobal sub4x4_dct
mov eax, 0xf0aa
kmovw k1, eax
PROLOGUE 3,3
movd m0, [r1+0*FENC_STRIDE]
movd m2, [r2+0*FDEC_STRIDE]
vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE]
vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE]
movd m1, [r1+3*FENC_STRIDE]
movd m3, [r2+3*FDEC_STRIDE]
vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE]
vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE]
kshiftrw k2, k1, 8
pxor m4, m4
punpcklbw m0, m4
punpcklbw m2, m4
punpcklbw m1, m4
punpcklbw m3, m4
DCT4x4_AVX512
mova [r0], m2
mova [r0+16], m0
RET
INIT_ZMM avx512
cglobal dct4x4x4_internal
punpcklbw m0, m1, m4
punpcklbw m2, m3, m4
punpckhbw m1, m4
punpckhbw m3, m4
DCT4x4_AVX512
mova m1, m2
vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0
vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1
ret
%macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2
movu %1, [r1+%3*FENC_STRIDE]
vpermt2d %1, %2, [r1+%4*FENC_STRIDE]
%endmacro
%macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2
movu %1, [r2+(%4 )*FDEC_STRIDE]
vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE]
movu %3, [r2+(%5 )*FDEC_STRIDE]
vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE]
vpermt2d %1, %2, %3
%endmacro
cglobal sub8x8_dct, 3,3
mova m0, [dct_avx512]
DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3
mov r1d, 0xaaaaaaaa
kmovd k1, r1d
psrld m0, 5
DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4
mov r1d, 0xf0f0f0f0
kmovd k2, r1d
pxor xm4, xm4
knotw k3, k2
call dct4x4x4_internal_avx512
mova [r0], m0
mova [r0+64], m1
RET
%macro SUB4x16_DCT_AVX512 2 ; dst, src
vpermd m1, m5, [r1+1*%2*64]
mova m3, [r2+2*%2*64]
vpermt2d m3, m6, [r2+2*%2*64+64]
call dct4x4x4_internal_avx512
mova [r0+%1*64 ], m0
mova [r0+%1*64+128], m1
%endmacro
cglobal sub16x16_dct
psrld m5, [dct_avx512], 10
mov eax, 0xaaaaaaaa
kmovd k1, eax
mov eax, 0xf0f0f0f0
kmovd k2, eax
PROLOGUE 3,3
pxor xm4, xm4
knotw k3, k2
psrld m6, m5, 4
SUB4x16_DCT_AVX512 0, 0
SUB4x16_DCT_AVX512 1, 1
SUB4x16_DCT_AVX512 4, 2
SUB4x16_DCT_AVX512 5, 3
RET
cglobal sub8x8_dct_dc, 3,3
mova m3, [dct_avx512]
DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3
mov r1d, 0xaa
kmovb k1, r1d
psrld m3, 5
DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4
pxor xm3, xm3
psadbw m0, m3
psadbw m1, m3
psubw m0, m1
vpmovqw xmm0, m0
vprold xmm1, xmm0, 16
paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3
punpckhqdq xmm2, xmm0, xmm0
psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3
paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3
punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
punpcklqdq xmm1, xmm0, xmm0
psubw xmm0 {k1}, xm3, xmm0
paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3
movhps [r0], xmm0
RET
cglobal sub8x16_dct_dc, 3,3
mova m5, [dct_avx512]
DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8 ; 0 4 1 5
DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7
mov r1d, 0xaa
kmovb k1, r1d
psrld m5, 5
DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8
DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12
pxor xm4, xm4
psadbw m0, m4
psadbw m1, m4
psadbw m2, m4
psadbw m3, m4
psubw m0, m2
psubw m1, m3
SBUTTERFLY qdq, 0, 1, 2
paddw m0, m1
vpmovqw xmm0, m0 ; 0 2 4 6 1 3 5 7
psrlq xmm2, xmm0, 32
psubw xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7
paddw xmm0, xmm2 ; 0+4 2+6 1+5 3+7
punpckhdq xmm2, xmm0, xmm1
punpckldq xmm0, xmm1
psubw xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7
paddw xmm0, xmm2 ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7
punpcklwd xmm0, xmm1
psrlq xmm2, xmm0, 32
psubw xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7
paddw xmm0, xmm2 ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7
shufps xmm0, xmm1, q0220
mova [r0], xmm0
RET
%macro SARSUMSUB 3 ; a, b, tmp
mova m%3, m%1
vpsraw m%1 {k1}, 1
psubw m%1, m%2 ; 0-2 1>>1-3
vpsraw m%2 {k1}, 1
paddw m%2, m%3 ; 0+2 1+3>>1
%endmacro
cglobal add8x8_idct, 2,2
mova m1, [r1]
mova m2, [r1+64]
mova m3, [dct_avx512]
vbroadcasti32x4 m4, [pw_32]
mov r1d, 0xf0f0f0f0
kxnorb k2, k2, k2
kmovd k1, r1d
kmovb k3, k2
vshufi32x4 m0, m1, m2, q2020 ; 0 1 4 5 8 9 c d
vshufi32x4 m1, m2, q3131 ; 2 3 6 7 a b e f
psrlq m5, m3, 56 ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE
vpgatherqq m6 {k2}, [r0+m5]
SARSUMSUB 0, 1, 2
SBUTTERFLY wd, 1, 0, 2
psrlq m7, m3, 28
SUMSUB_BA w, 0, 1, 2 ; 0+1+2+3>>1 0+1>>1-2-3
vprold m1, 16 ; 0-1>>1-2+3 0-1+2-3>>1
SBUTTERFLY dq, 0, 1, 2
psrlq m3, 24
SARSUMSUB 0, 1, 2
vpermi2q m3, m1, m0
vpermt2q m1, m7, m0
paddw m3, m4 ; += 32
SUMSUB_BA w, 1, 3, 0
psraw m1, 6 ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3'
psraw m3, 6 ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3'
pxor xm0, xm0
SBUTTERFLY bw, 6, 0, 2
paddsw m1, m6
paddsw m3, m0
packuswb m1, m3
vpscatterqq [r0+m5] {k3}, m1
RET
%endif ; HIGH_BIT_DEPTH
 
INIT_MMX
@@ -1883,3 +2128,161 @@ cglobal zigzag_interleave_8x8_cavlc, 3,3,6
mov [r2+8], r0w
RET
%endif ; !HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
INIT_ZMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
mova m0, [scan_frame_avx512]
vpermd m0, m0, [r1]
mova [r0], m0
RET
cglobal zigzag_scan_4x4_field, 2,2
mova m0, [r1]
pshufd xmm1, [r1+8], q3102
mova [r0], m0
movu [r0+8], xmm1
RET
cglobal zigzag_scan_8x8_frame, 2,2
psrld m0, [scan_frame_avx512], 4
mova m1, [r1+0*64]
mova m2, [r1+1*64]
mova m3, [r1+2*64]
mova m4, [r1+3*64]
mov r1d, 0x01fe7f80
kmovd k1, r1d
kshiftrd k2, k1, 16
vpermd m5, m0, m3 ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40
psrld m6, m0, 5
vpermi2d m0, m1, m2 ; 0 8 1 2 9 16 24 17 10 3 4 11 18 25 __ __
vmovdqa64 m0 {k1}, m5
mova [r0+0*64], m0
mova m5, m1
vpermt2d m1, m6, m2 ; __ 26 19 12 5 6 13 20 27 __ __ __ __ __ __ __
psrld m0, m6, 5
vpermi2d m6, m3, m4 ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35
vmovdqa32 m6 {k2}, m1
mova [r0+1*64], m6
vpermt2d m5, m0, m2 ; 28 21 14 7 15 22 29 __ __ __ __ __ __ __ __ 30
psrld m1, m0, 5
vpermi2d m0, m3, m4 ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __
vmovdqa32 m5 {k1}, m0
mova [r0+2*64], m5
vpermt2d m3, m1, m4 ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63
vpermd m2, m1, m2 ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __
vmovdqa64 m2 {k2}, m3
mova [r0+3*64], m2
RET
cglobal zigzag_scan_8x8_field, 2,2
mova m0, [scan_field_avx512]
mova m1, [r1+0*64]
mova m2, [r1+1*64]
mova m3, [r1+2*64]
mova m4, [r1+3*64]
mov r1d, 0x3f
kmovb k1, r1d
psrld m5, m0, 5
vpermi2d m0, m1, m2
vmovdqa64 m1 {k1}, m3 ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15
vpermt2d m1, m5, m2
psrld m5, 5
vmovdqa64 m2 {k1}, m4 ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31
vpermt2d m2, m5, m3
psrld m5, 5
vpermt2d m3, m5, m4
mova [r0+0*64], m0
mova [r0+1*64], m1
mova [r0+2*64], m2
mova [r0+3*64], m3
RET
cglobal zigzag_interleave_8x8_cavlc, 3,3
mova m0, [cavlc_shuf_avx512]
mova m1, [r1+0*64]
mova m2, [r1+1*64]
mova m3, [r1+2*64]
mova m4, [r1+3*64]
kxnorb k1, k1, k1
por m7, m1, m2
psrld m5, m0, 5
vpermi2d m0, m1, m2 ; a0 a1 b0 b1
vpternlogd m7, m3, m4, 0xfe ; m1|m2|m3|m4
psrld m6, m5, 5
vpermi2d m5, m3, m4 ; b2 b3 a2 a3
vptestmd k0, m7, m7
vpermt2d m1, m6, m2 ; c0 c1 d0 d1
psrld m6, 5
vpermt2d m3, m6, m4 ; d2 d3 c2 c3
vshufi32x4 m2, m0, m5, q1032 ; b0 b1 b2 b3
vmovdqa32 m5 {k1}, m0 ; a0 a1 a2 a3
vshufi32x4 m4, m1, m3, q1032 ; d0 d1 d2 d3
vmovdqa32 m3 {k1}, m1 ; c0 c1 c2 c3
mova [r0+0*64], m5
mova [r0+1*64], m2
mova [r0+2*64], m3
mova [r0+3*64], m4
kmovw r1d, k0
test r1d, 0x1111
setnz [r2]
test r1d, 0x2222
setnz [r2+1]
test r1d, 0x4444
setnz [r2+8]
test r1d, 0x8888
setnz [r2+9]
RET
%else ; !HIGH_BIT_DEPTH
INIT_YMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
mova m0, [scan_frame_avx512]
vpermw m0, m0, [r1]
mova [r0], m0
RET
cglobal zigzag_scan_4x4_field, 2,2
mova m0, [r1]
pshuflw xmm1, [r1+4], q3102
mova [r0], m0
movq [r0+4], xmm1
RET
INIT_ZMM avx512
cglobal zigzag_scan_8x8_frame, 2,2
psrlw m0, [scan_frame_avx512], 4
scan8_avx512:
mova m1, [r1]
mova m2, [r1+64]
psrlw m3, m0, 6
vpermi2w m0, m1, m2
vpermt2w m1, m3, m2
mova [r0], m0
mova [r0+64], m1
RET
cglobal zigzag_scan_8x8_field, 2,2
mova m0, [scan_field_avx512]
jmp scan8_avx512
cglobal zigzag_interleave_8x8_cavlc, 3,3
mova m0, [cavlc_shuf_avx512]
mova m1, [r1]
mova m2, [r1+64]
psrlw m3, m0, 6
vpermi2w m0, m1, m2
vpermt2w m1, m3, m2
kxnorb k2, k2, k2
vptestmd k0, m0, m0
vptestmd k1, m1, m1
mova [r0], m0
mova [r0+64], m1
ktestw k2, k0
setnz [r2]
setnc [r2+1]
ktestw k2, k1
setnz [r2+8]
setnc [r2+9]
RET
%endif ; !HIGH_BIT_DEPTH
@@ -34,6 +34,7 @@ void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2
void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_avx512 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
@@ -41,12 +42,16 @@ void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
 
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
@@ -59,6 +64,7 @@ void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] );
void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx512 ( uint8_t *p_dst, int16_t dct[ 4][16] );
void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] );
@@ -101,22 +107,26 @@ void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] );
void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
 
void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] );
int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
@@ -125,9 +135,10 @@ int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, u
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2 ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx2 ( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx512( dctcoef *dst, dctcoef *src, uint8_t *nnz );
 
#endif
@@ -28,10 +28,14 @@
%include "x86inc.asm"
%include "x86util.asm"
 
SECTION_RODATA 32
load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15
insert_top_shuf: dd 0,1,4,5,7,2,3,6
SECTION_RODATA 64
load_bytes_zmm_shuf: dd 0x50404032, 0x70606053, 0xd0c0c0b4, 0xf0e0e0d5
dd 0x50404036, 0x70606057, 0xd0c0c0b8, 0xf0e0e0d9
dd 0x50104001, 0x70306023, 0xd090c083, 0xf0b0e0a5
dd 0x50104005, 0x70306027, 0xd090c087, 0xf0b0e0a9
load_bytes_ymm_shuf: dd 0x06050403, 0x0e0d0c1b, 0x07060544, 0x0f0e0d5c
dd 0x06050473, 0x0e0d0c2b, 0x07060534, 0x0f0e0d6c
transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
 
SECTION .text
@@ -906,9 +910,8 @@ DEBLOCK_LUMA_INTRA
movq m3, %4
punpcklwd m0, m2
punpcklwd m1, m3
mova m2, m0
punpckhdq m2, m0, m1
punpckldq m0, m1
punpckhdq m2, m1
 
movq m4, %5
movq m6, %6
@@ -916,9 +919,8 @@ DEBLOCK_LUMA_INTRA
movq m7, %8
punpcklwd m4, m6
punpcklwd m5, m7
mova m6, m4
punpckhdq m6, m4, m5
punpckldq m4, m5
punpckhdq m6, m5
 
punpckhqdq m1, m0, m4
punpckhqdq m3, m2, m6
@@ -2278,13 +2280,10 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
RET
%endif ; !HIGH_BIT_DEPTH
 
;-----------------------------------------------------------------------------
; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
; uint8_t bs[2][4][4], int mvy_limit, int bframe )
;-----------------------------------------------------------------------------
%define scan8start (4+1*8)
%define nnz r0+scan8start
%define ref r1+scan8start
@@ -2292,145 +2291,54 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
%define bs0 r3
%define bs1 r3+32
 
%macro LOAD_BYTES_MMX 1
movd m2, [%1+8*0-1]
movd m0, [%1+8*0]
movd m3, [%1+8*2-1]
movd m1, [%1+8*2]
punpckldq m2, [%1+8*1-1]
punpckldq m0, [%1+8*1]
punpckldq m3, [%1+8*3-1]
punpckldq m1, [%1+8*3]
%endmacro
%macro DEBLOCK_STRENGTH_REFS_MMX 0
LOAD_BYTES_MMX ref
pxor m2, m0
pxor m3, m1
por m2, [bs0+0]
por m3, [bs0+8]
movq [bs0+0], m2
movq [bs0+8], m3
movd m2, [ref-8*1]
movd m3, [ref+8*1]
punpckldq m2, m0 ; row -1, row 0
punpckldq m3, m1 ; row 1, row 2
pxor m0, m2
pxor m1, m3
por m0, [bs1+0]
por m1, [bs1+8]
movq [bs1+0], m0
movq [bs1+8], m1
%endmacro
%macro DEBLOCK_STRENGTH_MVS_MMX 2
mova m0, [mv-%2]
mova m1, [mv-%2+8]
psubw m0, [mv]
psubw m1, [mv+8]
packsswb m0, m1
ABSB m0, m1
psubusb m0, m7
packsswb m0, m0
por m0, [%1]
movd [%1], m0
%endmacro
%macro DEBLOCK_STRENGTH_NNZ_MMX 1
por m2, m0
por m3, m1
mova m4, [%1]
mova m5, [%1+8]
pminub m2, m6
pminub m3, m6
pminub m4, m6 ; mv ? 1 : 0
pminub m5, m6
paddb m2, m2 ; nnz ? 2 : 0
paddb m3, m3
pmaxub m2, m4
pmaxub m3, m5
%endmacro
%macro LOAD_BYTES_XMM 1
movu m2, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
%macro LOAD_BYTES_XMM 2 ; src, aligned
%if %2
mova m2, [%1-4]
mova m1, [%1+12]
%else
movu m2, [%1-4]
movu m1, [%1+12]
pslldq m0, m2, 1
%endif
psllq m0, m2, 8
shufps m2, m1, q3131 ; cur nnz, all rows
pslldq m1, 1
psllq m1, 8
shufps m0, m1, q3131 ; left neighbors
%if cpuflag(avx) || (%2 && cpuflag(ssse3))
palignr m1, m2, [%1-20], 12
%else
pslldq m1, m2, 4
movd m3, [%1-8] ; could be palignr if nnz was aligned
movd m3, [%1-8]
por m1, m3 ; top neighbors
%endif
%endmacro
 
INIT_MMX mmx2
cglobal deblock_strength, 6,6
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
movd m7, r4d
SPLATW m7, m7
mova m6, [pb_1]
pxor m0, m0
mova [bs0+0], m0
mova [bs0+8], m0
mova [bs1+0], m0
mova [bs1+8], m0
.lists:
DEBLOCK_STRENGTH_REFS_MMX
mov r4d, 4
.mvs:
DEBLOCK_STRENGTH_MVS_MMX bs0, 4
DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
add r2, 4*8
add r3, 4
dec r4d
jg .mvs
add r1, 40
add r2, 4*8
sub r3, 16
dec r5d
jge .lists
; Check nnz
LOAD_BYTES_MMX nnz
DEBLOCK_STRENGTH_NNZ_MMX bs0
; Transpose column output
SBUTTERFLY bw, 2, 3, 4
SBUTTERFLY bw, 2, 3, 4
mova [bs0+0], m2
mova [bs0+8], m3
movd m2, [nnz-8*1]
movd m3, [nnz+8*1]
punpckldq m2, m0 ; row -1, row 0
punpckldq m3, m1 ; row 1, row 2
DEBLOCK_STRENGTH_NNZ_MMX bs1
mova [bs1+0], m2
mova [bs1+8], m3
RET
%if UNIX64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 4
%endif
 
%macro DEBLOCK_STRENGTH_XMM 0
cglobal deblock_strength, 6,6,7
cglobal deblock_strength, 5,5,7
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
movd m6, r4d
movifnidn t0d, r5m
SPLATW m6, m6
pxor m4, m4 ; bs0
pxor m5, m5 ; bs1
 
.lists:
; Check refs
LOAD_BYTES_XMM ref
LOAD_BYTES_XMM ref, 0
pxor m0, m2
pxor m1, m2
por m4, m0
por m5, m1
 
; Check mvs
%if cpuflag(ssse3)
%if cpuflag(ssse3) && notcpuflag(avx)
mova m0, [mv+4*8*0]
mova m1, [mv+4*8*1]
palignr m3, m0, [mv+4*8*0-16], 12
@@ -2483,11 +2391,11 @@ cglobal deblock_strength, 6,6,7
por m5, m0
add r1, 40
add r2, 4*8*5
dec r5d
dec t0d
jge .lists
 
; Check nnz
LOAD_BYTES_XMM nnz
LOAD_BYTES_XMM nnz, 1
por m0, m2
por m1, m2
mova m6, [pb_1]
@@ -2520,68 +2428,121 @@ INIT_XMM avx
DEBLOCK_STRENGTH_XMM
 
%macro LOAD_BYTES_YMM 1
movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
pshufb m0, [load_bytes_shuf] ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
mova m2, [insert_top_shuf]
vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2
vpermd m0, m2, m0 ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS
vpbroadcastd m2, [%1-8] ; ABCD ....
vpblendd m0, m0, m2, 00010000b ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
pshufb m0, m6 ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2
vpbroadcastd m2, [%1-8] ; ABCD ....
vpblendd m0, m0, m2, 0x80
vpermd m0, m7, m0 ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
%endmacro
 
INIT_YMM avx2
cglobal deblock_strength, 6,6,7
cglobal deblock_strength, 5,5,8
mova m6, [load_bytes_ymm_shuf]
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
movd xm6, r4d
vpbroadcastw m6, xm6
pxor m5, m5 ; bs0,bs1
shl r4d, 8
add r4d, 3 - (1<<8)
movd xm5, r4d
movifnidn t0d, r5m
vpbroadcastw m5, xm5
psrld m7, m6, 4
pxor m4, m4 ; bs0,bs1
 
.lists:
; Check refs
LOAD_BYTES_YMM ref
pxor m0, m1
por m5, m0
pxor m0, m1
por m4, m0
 
; Check mvs
movu xm0, [mv-4+4*8*0]
vinserti128 m0, m0, [mv+4*8*-1], 1
vbroadcasti128 m2, [mv+4*8* 0]
vinserti128 m1, m2, [mv-4+4*8*1], 0
vbroadcasti128 m3, [mv+4*8* 1]
psubw m0, m2
psubw m1, m3
vinserti128 m2, m3, [mv-4+4*8*2], 0
vbroadcasti128 m4, [mv+4*8* 2]
vinserti128 m3, m4, [mv-4+4*8*3], 0
psubw m2, m4
vbroadcasti128 m4, [mv+4*8* 3]
psubw m3, m4
packsswb m0, m1
packsswb m2, m3
pabsb m0, m0
pabsb m2, m2
psubusb m0, m6
psubusb m2, m6
packsswb m0, m2
por m5, m0
add r1, 40
add r2, 4*8*5
dec r5d
movu xm0, [mv+0*4*8-4]
vinserti128 m0, m0, [mv-1*4*8 ], 1
vbroadcasti128 m2, [mv+0*4*8 ]
vinserti128 m1, m2, [mv+1*4*8-4], 0
psubw m0, m2
vbroadcasti128 m2, [mv+1*4*8 ]
psubw m1, m2
packsswb m0, m1
vinserti128 m1, m2, [mv+2*4*8-4], 0
vbroadcasti128 m3, [mv+2*4*8 ]
vinserti128 m2, m3, [mv+3*4*8-4], 0
psubw m1, m3
vbroadcasti128 m3, [mv+3*4*8 ]
psubw m2, m3
packsswb m1, m2
pabsb m0, m0
pabsb m1, m1
psubusb m0, m5
psubusb m1, m5
packsswb m0, m1
por m4, m0
add r1, 40
add r2, 4*8*5
dec t0d
jge .lists
 
; Check nnz
LOAD_BYTES_YMM nnz
por m0, m1
mova m6, [pb_1]
pminub m0, m6
pminub m5, m6 ; mv ? 1 : 0
paddb m0, m0 ; nnz ? 2 : 0
pmaxub m5, m0
vextracti128 [bs1], m5, 1
pshufb xm5, [transpose_shuf]
mova [bs0], xm5
mova m2, [pb_1]
por m0, m1
pminub m0, m2
pminub m4, m2 ; mv ? 1 : 0
paddb m0, m0 ; nnz ? 2 : 0
pmaxub m0, m4
vextracti128 [bs1], m0, 1
pshufb xm0, [transpose_shuf]
mova [bs0], xm0
RET
%macro LOAD_BYTES_ZMM 1
vpermd m1, m6, [%1-12]
pshufb m1, m7 ; EF FG GH HI JK KL LM MN OP PQ QR RS TU UV VW WX
%endmacro ; AF BG CH DI FK GL HM IN KP LQ MR NS PU QV RW SX
INIT_ZMM avx512
cglobal deblock_strength, 5,5
mova m6, [load_bytes_zmm_shuf]
shl r4d, 8
add r4d, 3 - (1<<8)
vpbroadcastw m5, r4d
mov r4d, 0x34cc34cc ; {1,-1} * 11001100b
kmovb k1, r4d
vpbroadcastd m4, r4d
movifnidn t0d, r5m
psrld m7, m6, 4
pxor xm3, xm3
.lists:
vbroadcasti64x2 m2, [mv+32]
vinserti64x2 m0, m2, [mv-32], 2
vbroadcasti64x2 m1, [mv+ 0]
vinserti64x2 m0, m0, [mv- 4], 0
vbroadcasti64x2 m1 {k1}, [mv+64]
vinserti64x2 m0, m0, [mv+60], 1
psubw m0, m1
vinserti64x2 m1, m1, [mv+28], 0
vbroadcasti64x2 m2 {k1}, [mv+96]
vinserti64x2 m1, m1, [mv+92], 1
psubw m1, m2
packsswb m0, m1
pabsb m0, m0
psubusb m0, m5
LOAD_BYTES_ZMM ref
pmaddubsw m1, m4 ; E-F F-G G-H H-I ...
vpternlogd m3, m0, m1, 0xfe ; m3 | m0 | m1
add r1, 40
add r2, 4*8*5
dec t0d
jge .lists
LOAD_BYTES_ZMM nnz
mova ym2, [pb_1]
vptestmw k1, m1, m1
vptestmw k2, m3, m3
vpaddb ym0 {k1}{z}, ym2, ym2 ; nnz ? 2 : 0
vpmaxub ym0 {k2}, ym2 ; mv ? 1 : 0
vextracti128 [bs1], ym0, 1
pshufb xm0, [transpose_shuf]
mova [bs0], xm0
RET
@@ -83,11 +83,11 @@ cextern deinterleave_shufd
%endmacro
%endif
 
%macro AVG_END 0
lea t4, [t4+t5*2*SIZEOF_PIXEL]
%macro AVG_END 0-1 2 ; rows
lea t2, [t2+t3*2*SIZEOF_PIXEL]
lea t4, [t4+t5*2*SIZEOF_PIXEL]
lea t0, [t0+t1*2*SIZEOF_PIXEL]
sub eax, 2
sub eax, %1
jg .height_loop
RET
%endmacro
@@ -147,17 +147,24 @@ cextern deinterleave_shufd
%endmacro
 
%macro BIWEIGHT_START_SSSE3 0
movzx t6d, byte r6m ; FIXME x86_64
mov t7d, 64
sub t7d, t6d
shl t7d, 8
add t6d, t7d
mova m4, [pw_512]
movd xm3, t6d
movzx t6d, byte r6m ; FIXME x86_64
%if mmsize > 16
vbroadcasti128 m4, [pw_512]
%else
mova m4, [pw_512]
%endif
lea t7d, [t6+(64<<8)]
shl t6d, 8
sub t7d, t6d
%if cpuflag(avx512)
vpbroadcastw m3, t7d
%else
movd xm3, t7d
%if cpuflag(avx2)
vpbroadcastw m3, xm3
vpbroadcastw m3, xm3
%else
SPLATW m3, m3 ; weight_dst,src
SPLATW m3, m3 ; weight_dst,src
%endif
%endif
%endmacro
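The constant built above packs {weight, 64-weight} into adjacent byte lanes so that pmaddubsw followed by pmulhrsw with pw_512 yields a rounded 6-bit shift. A hedged scalar model of that arithmetic (function name illustrative, not x264 API):

#include <stdint.h>

/* pmulhrsw by 512 computes (x*512 + 16384) >> 15, i.e. a rounded >>6,
 * so the per-pixel result is (a*(64-w) + b*w + 32) >> 6, clamped like packuswb. */
static uint8_t biweight_px( uint8_t a, uint8_t b, int w /* 0..64 */ )
{
    int v = ( a*(64 - w) + b*w + 32 ) >> 6;
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}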
 
@@ -268,6 +275,66 @@ cglobal pixel_avg_weight_w16
mova [t0], xm0
vextracti128 [t0+t1], m0, 1
AVG_END
INIT_YMM avx512
cglobal pixel_avg_weight_w8
BIWEIGHT_START
kxnorb k1, k1, k1
kaddb k1, k1, k1
AVG_START 5
.height_loop:
movq xm0, [t2]
movq xm2, [t4]
movq xm1, [t2+t3]
movq xm5, [t4+t5]
lea t2, [t2+t3*2]
lea t4, [t4+t5*2]
vpbroadcastq m0 {k1}, [t2]
vpbroadcastq m2 {k1}, [t4]
vpbroadcastq m1 {k1}, [t2+t3]
vpbroadcastq m5 {k1}, [t4+t5]
punpcklbw m0, m2
punpcklbw m1, m5
pmaddubsw m0, m3
pmaddubsw m1, m3
pmulhrsw m0, m4
pmulhrsw m1, m4
packuswb m0, m1
vextracti128 xmm1, m0, 1
movq [t0], xm0
movhps [t0+t1], xm0
lea t0, [t0+t1*2]
movq [t0], xmm1
movhps [t0+t1], xmm1
AVG_END 4
INIT_ZMM avx512
cglobal pixel_avg_weight_w16
BIWEIGHT_START
AVG_START 5
.height_loop:
movu xm0, [t2]
movu xm1, [t4]
vinserti128 ym0, [t2+t3], 1
vinserti128 ym1, [t4+t5], 1
lea t2, [t2+t3*2]
lea t4, [t4+t5*2]
vinserti32x4 m0, [t2], 2
vinserti32x4 m1, [t4], 2
vinserti32x4 m0, [t2+t3], 3
vinserti32x4 m1, [t4+t5], 3
SBUTTERFLY bw, 0, 1, 2
pmaddubsw m0, m3
pmaddubsw m1, m3
pmulhrsw m0, m4
pmulhrsw m1, m4
packuswb m0, m1
mova [t0], xm0
vextracti128 [t0+t1], ym0, 1
lea t0, [t0+t1*2]
vextracti32x4 [t0], m0, 2
vextracti32x4 [t0+t1], m0, 3
AVG_END 4
%endif ;HIGH_BIT_DEPTH
 
;=============================================================================
@@ -738,6 +805,12 @@ INIT_XMM avx2
AVG_FUNC 16, movdqu, movdqa
AVGH 16, 16
AVGH 16, 8
INIT_XMM avx512
AVGH 16, 16
AVGH 16, 8
AVGH 8, 16
AVGH 8, 8
AVGH 8, 4
 
%endif ;HIGH_BIT_DEPTH
 
@@ -2125,7 +2198,7 @@ INIT_XMM sse2
MC_CHROMA
INIT_XMM ssse3
MC_CHROMA_SSSE3
INIT_XMM ssse3, cache64
INIT_XMM cache64, ssse3
MC_CHROMA_SSSE3
INIT_XMM avx
MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
@@ -30,18 +30,15 @@
%include "x86inc.asm"
%include "x86util.asm"
 
SECTION_RODATA 32
pw_1024: times 16 dw 1024
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
SECTION_RODATA 64
 
%if HIGH_BIT_DEPTH
v210_mask: times 4 dq 0xc00ffc003ff003ff
v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
v210_shuf_avx512: db 0, 0,34, 1,35,34, 4, 4,38, 5,39,38, 8, 8,42, 9, ; luma, chroma
db 43,42,12,12,46,13,47,46,16,16,50,17,51,50,20,20,
db 54,21,55,54,24,24,58,25,59,58,28,28,62,29,63,62
v210_mask: dd 0x3ff003ff, 0xc00ffc00, 0x3ff003ff, 0xc00ffc00
v210_luma_shuf: db 1, 2, 4, 5, 6, 7, 9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: db 0, 1, 2, 3, 5, 6, 8, 9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
@@ -58,6 +55,13 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
 
pw_1024: times 16 dw 1024
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
mbtree_prop_list_avx512_shuf: dw 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7
mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
@@ -1044,8 +1048,8 @@ PLANE_COPY_CORE 1
%endif ; HIGH_BIT_DEPTH
%endmacro
 
%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
mova m0, [%3]
%macro DEINTERLEAVE 6 ; dsta, dstb, src, dsta==dstb+8, shuffle constant, is aligned
mov%6 m0, [%3]
%if mmsize == 32
pshufb m0, %5
vpermq m0, m0, q3120
@@ -1056,7 +1060,7 @@ PLANE_COPY_CORE 1
vextracti128 [%2], m0, 1
%endif
%elif HIGH_BIT_DEPTH
mova m1, [%3+mmsize]
mov%6 m1, [%3+mmsize]
psrld m2, m0, 16
psrld m3, m1, 16
pand m0, %5
@@ -1181,8 +1185,8 @@ cglobal store_interleave_chroma, 5,5
 
%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
; pixel *dstv, intptr_t i_dstv,
; void plane_copy_deinterleave( pixel *dsta, intptr_t i_dsta,
; pixel *dstb, intptr_t i_dstb,
; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
%if ARCH_X86_64
@@ -1400,43 +1404,64 @@ cglobal plane_copy_deinterleave_v210, 7,7,7
%define org_w r6m
%define h dword r7m
%endif
FIX_STRIDES r1, r3, r6d
shl r5, 2
add r0, r6
add r2, r6
neg r6
mov src, r4
mov org_w, r6
mova m2, [v210_mask]
mova m3, [v210_luma_shuf]
mova m4, [v210_chroma_shuf]
mova m5, [v210_mult] ; also functions as vpermd index for avx2
pshufd m6, m5, q1102
FIX_STRIDES r1, r3, r6d
shl r5, 2
add r0, r6
add r2, r6
neg r6
mov src, r4
mov org_w, r6
%if cpuflag(avx512)
vpbroadcastd m2, [v210_mask]
vpbroadcastd m3, [v210_shuf_avx512]
psrlw m3, 6 ; dw 0, 4
mova m4, [v210_shuf_avx512] ; luma
psrlw m5, m4, 8 ; chroma
%else
%if mmsize == 32
vbroadcasti128 m2, [v210_mask]
vbroadcasti128 m3, [v210_luma_shuf]
vbroadcasti128 m4, [v210_chroma_shuf]
%else
mova m2, [v210_mask]
mova m3, [v210_luma_shuf]
mova m4, [v210_chroma_shuf]
%endif
mova m5, [v210_mult] ; also functions as vpermd index for avx2
pshufd m6, m5, q1102
%endif
ALIGN 16
.loop:
movu m1, [r4]
pandn m0, m2, m1
pand m1, m2
pshufb m0, m3
pshufb m1, m4
pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
movu m1, [r4]
pandn m0, m2, m1
pand m1, m2
%if cpuflag(avx512)
psrld m0, 10
vpsrlvw m1, m3
mova m6, m0
vpermt2w m0, m4, m1
vpermt2w m1, m5, m6
%else
pshufb m0, m3
pshufb m1, m4
pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
%if mmsize == 32
vpermd m0, m5, m0
vpermd m1, m5, m1
vpermd m0, m5, m0
vpermd m1, m5, m1
%endif
%endif
movu [r0+r6], m0
movu [r2+r6], m1
add r4, mmsize
add r6, 3*mmsize/4
movu [r0+r6], m0
movu [r2+r6], m1
add r4, mmsize
add r6, mmsize*3/4
jl .loop
add r0, r1
add r2, r3
add src, r5
mov r4, src
mov r6, org_w
dec h
add r0, r1
add r2, r3
add src, r5
mov r4, src
mov r6, org_w
dec h
jg .loop
RET
%endmacro ; PLANE_DEINTERLEAVE_V210
@@ -1461,6 +1486,8 @@ PLANE_DEINTERLEAVE_V210
INIT_YMM avx2
LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_V210
INIT_ZMM avx512
PLANE_DEINTERLEAVE_V210
%else
INIT_XMM sse2
PLANE_DEINTERLEAVE_RGB
@@ -1473,82 +1500,85 @@ LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
PLANE_DEINTERLEAVE_RGB
%endif
 
; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
; These functions are not general-use; not only do they require aligned input, but memcpy
; requires size to be a multiple of 16 and memzero requires size to be a multiple of 128.
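A hedged sketch of the caller-side contract this note describes (the helper name and the 64-byte alignment figure are assumptions for the widest AVX-512 path, not x264 API):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void check_aligned_fill_args( const void *buf, size_t n_copy, size_t n_zero )
{
    assert( ((uintptr_t)buf & 63) == 0 ); /* buffers aligned for the widest vector path */
    assert( (n_copy & 15) == 0 );         /* memcpy_aligned: size must be a multiple of 16  */
    assert( (n_zero & 127) == 0 );        /* memzero_aligned: size must be a multiple of 128 */
}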
 
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
%macro MEMCPY 0
cglobal memcpy_aligned, 3,3
%if mmsize == 16
%if mmsize == 32
test r2d, 16
jz .copy2
mova m0, [r1+r2-16]
mova [r0+r2-16], m0
jz .copy32
mova xm0, [r1+r2-16]
mova [r0+r2-16], xm0
sub r2d, 16
.copy2:
%endif
test r2d, 2*mmsize
jz .copy4start
jle .ret
.copy32:
%endif
test r2d, mmsize
jz .loop
mova m0, [r1+r2-mmsize]
mova [r0+r2-mmsize], m0
sub r2d, mmsize
jle .ret
.loop:
mova m0, [r1+r2-1*mmsize]
mova m1, [r1+r2-2*mmsize]
mova [r0+r2-1*mmsize], m0
mova [r0+r2-2*mmsize], m1
sub r2d, 2*mmsize
.copy4start:
test r2d, r2d
jz .ret
.copy4:
mova m0, [r1+r2-1*mmsize]
mova m1, [r1+r2-2*mmsize]
mova m2, [r1+r2-3*mmsize]
mova m3, [r1+r2-4*mmsize]
mova [r0+r2-1*mmsize], m0
mova [r0+r2-2*mmsize], m1
mova [r0+r2-3*mmsize], m2
mova [r0+r2-4*mmsize], m3
sub r2d, 4*mmsize
jg .copy4
jg .loop
.ret:
REP_RET
RET
%endmacro
 
INIT_MMX mmx
MEMCPY
INIT_XMM sse
MEMCPY
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
%macro MEMZERO 1
%macro MEMZERO 0
cglobal memzero_aligned, 2,2
add r0, r1
neg r1
%if mmsize == 8
pxor m0, m0
%else
xorps m0, m0
%endif
.loop:
%assign i 0
%rep %1
mova [r0 + r1 + i], m0
%assign i i+mmsize
%assign %%i mmsize
%rep 128 / mmsize
movaps [r0 + r1 - %%i], m0
%assign %%i %%i+mmsize
%endrep
add r1, mmsize*%1
jl .loop
sub r1d, 128
jg .loop
RET
%endmacro
 
INIT_MMX mmx
MEMZERO 8
INIT_XMM sse
MEMZERO 8
MEMCPY
MEMZERO
INIT_YMM avx
MEMZERO 4
MEMCPY
MEMZERO
INIT_ZMM avx512
MEMZERO
cglobal memcpy_aligned, 3,4
dec r2d ; offset of the last byte
rorx r3d, r2d, 2
and r2d, ~63
and r3d, 15 ; n = number of dwords minus one to copy in the tail
mova m0, [r1+r2]
not r3d ; bits 0-4: (n^15)+16, bits 16-31: 0xffff
shrx r3d, r3d, r3d ; 0xffff >> (n^15)
kmovw k1, r3d ; (1 << (n+1)) - 1
vmovdqa32 [r0+r2] {k1}, m0
sub r2d, 64
jl .ret
.loop:
mova m0, [r1+r2]
mova [r0+r2], m0
sub r2d, 64
jge .loop
.ret:
RET
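The tail handling above folds the non-multiple-of-64 remainder of the copy into one masked store. A hedged C model of the mask computation (assuming n is a positive multiple of 16, per the contract above; the function name is illustrative):

#include <stdint.h>

/* k-mask for vmovdqa32 covering the dwords of the final partial 64-byte block. */
static uint16_t memcpy_tail_mask( uint32_t n )
{
    uint32_t last    = n - 1;             /* offset of the last byte            */
    uint32_t tail_dw = (last >> 2) & 15;  /* dwords-1 to store in the tail      */
    return 0xffffu >> (tail_dw ^ 15);     /* == (1u << (tail_dw + 1)) - 1       */
}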
 
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
@@ -2147,13 +2177,13 @@ MBTREE
cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
vbroadcastss m5, [r5]
mov r5d, r6m
lea r0, [r0+r5*2]
lea r2, [r2+r5*2]
add r5d, r5d
add r1, r5
add r2, r5
add r3, r5
add r4, r5
neg r5
sub r1, r5
sub r3, r5
sub r0, r5
mova xm4, [pw_3fff]
%if notcpuflag(avx2)
pxor xm7, xm7
@@ -2165,9 +2195,8 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
pmovzxwd m2, [r1+r5] ; prop
pand xm3, xm4, [r3+r5] ; inter
pmovzxwd m3, xm3
pminsd m3, m0
pmaddwd m1, m0
psubd m3, m0, m3
psubusw m3, m0, m3
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
@@ -2184,7 +2213,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
movu xm1, [r4+r5]
movu xm2, [r1+r5]
pand xm3, xm4, [r3+r5]
pminsw xm3, xm0
psubusw xm3, xm0, xm3
INT16_UNPACK 0
INT16_UNPACK 1
INT16_UNPACK 2
@@ -2194,7 +2223,6 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m1, m0
subps m3, m0, m3
mulps m1, m5 ; intra*invq*fps_factor>>8
addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
rcpps m2, m0 ; 1 / intra 1st approximation
@@ -2205,7 +2233,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
subps m2, m0 ; 2nd approximation for 1/intra
mulps m1, m2 ; / intra
%endif
vcvtps2dq m1, m1
cvtps2dq m1, m1
vextractf128 xm2, m1, 1
packssdw xm1, xm2
mova [r0+r5], xm1
@@ -2219,6 +2247,39 @@ MBTREE_AVX
INIT_YMM avx2
MBTREE_AVX
 
INIT_ZMM avx512
cglobal mbtree_propagate_cost, 6,6
vbroadcastss m5, [r5]
mov r5d, 0x3fff3fff
vpbroadcastd ym4, r5d
mov r5d, r6m
lea r2, [r2+r5*2]
add r5d, r5d
add r1, r5
neg r5
sub r4, r5
sub r3, r5
sub r0, r5
.loop:
pmovzxwd m0, [r2+r5] ; intra
pmovzxwd m1, [r1+r5] ; prop
pmovzxwd m2, [r4+r5] ; invq
pand ym3, ym4, [r3+r5] ; inter
pmovzxwd m3, ym3
psubusw m3, m0, m3
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
vdivps m1, m0, {rn-sae}
fmaddps m1, m2, m5, m1
mulps m1, m3
cvtps2dq m1, m1
vpmovsdw [r0+r5], m1
add r5, 32
jl .loop
RET
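Per macroblock this evaluates (prop + intra*invq*fps) * (intra - inter) / intra, reassociated as (prop/intra + invq*fps) * (intra - inter) so only one division is needed. A hedged scalar sketch with the constants visible in the asm (inter costs masked with 0x3fff, result saturated to int16); it is a model, not the x264 C fallback verbatim:

#include <stdint.h>

static void propagate_cost_sketch( int16_t *dst, const uint16_t *prop, const uint16_t *intra,
                                   const uint16_t *inter, const uint16_t *invq,
                                   float fps, int len )
{
    for( int i = 0; i < len; i++ )
    {
        float intra_cost = intra[i];
        float inter_cost = inter[i] & 0x3fff;
        if( inter_cost > intra_cost )
            inter_cost = intra_cost;   /* psubusw clamps intra-inter at 0 */
        if( intra_cost == 0 )
        {
            dst[i] = 0;                /* guard for the sketch; the SIMD code relies on FP semantics */
            continue;
        }
        float amount = prop[i] + intra_cost * invq[i] * fps;
        int v = (int)( amount * (intra_cost - inter_cost) / intra_cost );
        dst[i] = v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }
}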
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
@@ -2372,6 +2433,112 @@ cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8
jl .loop
RET
 
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
; uint16_t *lowres_costs, int bipred_weight, int mb_y,
; int width, int height, int stride, int list_mask );
;-----------------------------------------------------------------------------
INIT_ZMM avx512
cglobal mbtree_propagate_list_internal, 5,7,21
mova xm16, [pw_0xc000]
vpbroadcastw xm17, r5m ; bipred_weight << 9
vpbroadcastw ym18, r10m ; 1 << (list+LOWRES_COST_SHIFT)
vbroadcasti32x8 m5, [mbtree_prop_list_avx512_shuf]
vbroadcasti32x8 m6, [pd_0123]
vpord m6, r6m {1to16} ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y
vbroadcasti128 m7, [pd_8]
vbroadcasti128 m8, [pw_31]
vbroadcasti128 m9, [pw_32]
psllw m10, m9, 4
pcmpeqw ym19, ym19 ; pw_m1
vpbroadcastw ym20, r7m ; width
psrld m11, m7, 3 ; pd_1
psrld m12, m8, 16 ; pd_31
vpbroadcastd m13, r8m ; height
vpbroadcastd m14, r9m ; stride
pslld m15, m14, 16
por m15, m11 ; {1, stride, 1, stride} ...
lea r4, [r4+2*r0] ; lowres_costs
lea r3, [r3+2*r0] ; propagate_amount
lea r2, [r2+4*r0] ; mvs
neg r0
mov r6d, 0x5555ffff
kmovd k4, r6d
kshiftrd k5, k4, 16 ; 0x5555
kshiftlw k6, k4, 8 ; 0xff00
.loop:
vbroadcasti128 ym1, [r4+2*r0]
mova xm4, [r3+2*r0]
vpcmpuw k1, xm1, xm16, 5 ; if (lists_used == 3)
vpmulhrsw xm4 {k1}, xm17 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
vptestmw k1, ym1, ym18
vpermw m4, m5, m4
vbroadcasti32x8 m3, [r2+4*r0] ; {mvx, mvy}
psraw m0, m3, 5
paddw m0, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
paddd m6, m7 ; i_mb_x += 8
pand m3, m8 ; {x, y}
vprold m1, m3, 20 ; {y, x} << 4
psubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y}
psubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4
pmullw m3, m1
paddsw m3, m3 ; prevent signed overflow in idx0 (32*32<<5 == 0x8000)
pmulhrsw m2, m3, m4 ; idx01weight idx23weightp
pslld ym1, ym0, 16
psubw ym1, ym19
vmovdqu16 ym1 {k5}, ym0
vpcmpuw k2, ym1, ym20, 1 ; {mbx, mbx+1} < width
kunpckwd k2, k2, k2
psrad m1, m0, 16
paddd m1 {k6}, m11
vpcmpud k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height
pmaddwd m0, m15
paddd m0 {k6}, m14 ; idx0 | idx2
vmovdqu16 m2 {k2}{z}, m2 ; idx01weight | idx23weight
vptestmd k1 {k1}, m2, m2 ; mask out offsets with no changes
; We're handling dwords, but the offsets are in words so there may be partial overlaps.
; We can work around this by handling dword-aligned and -unaligned offsets separately.
vptestmd k0, m0, m11
kandnw k2, k0, k1 ; dword-aligned offsets
kmovw k3, k2
vpgatherdd m3 {k2}, [r1+2*m0]
; If there are conflicts in the offsets we have to handle them before storing the results.
; By creating a permutation index using vplzcntd we can resolve all conflicts in parallel
; in ceil(log2(n)) iterations where n is the largest number of duplicate offsets.
vpconflictd m4, m0
vpbroadcastmw2d m1, k1
vptestmd k2, m1, m4
ktestw k2, k2
jz .no_conflicts
pand m1, m4 ; mask away unused offsets to avoid false positives
vplzcntd m1, m1
pxor m1, m12 ; lzcnt gives us the distance from the msb, we want it from the lsb
.conflict_loop:
vpermd m4 {k2}{z}, m1, m2
vpermd m1 {k2}, m1, m1 ; shift the index one step forward
paddsw m2, m4 ; add the weights of conflicting offsets
vpcmpd k2, m1, m12, 2
ktestw k2, k2
jnz .conflict_loop
.no_conflicts:
paddsw m3, m2
vpscatterdd [r1+2*m0] {k3}, m3
kandw k1, k0, k1 ; dword-unaligned offsets
kmovw k2, k1
vpgatherdd m1 {k1}, [r1+2*m0]
paddsw m1, m2 ; all conflicts have already been resolved
vpscatterdd [r1+2*m0] {k2}, m1
add r0, 8
jl .loop
RET
%endif
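The AVX-512 kernel above is, at heart, a masked scatter-add into ref_costs where several lanes can target the same 16-bit slot; vpconflictd/vplzcntd merge such duplicates in parallel before the scatter. A hedged scalar equivalent of one iteration's update (array and variable names are illustrative; the SIMD code uses saturating adds):

#include <stdint.h>

/* Each live dword lane carries two packed 16-bit weights that go to two
 * adjacent ref_costs entries starting at its word offset. */
static void scatter_add_sketch( uint16_t *ref_costs, const uint32_t *offset,
                                const uint32_t *weight, const uint8_t *live, int lanes )
{
    for( int l = 0; l < lanes; l++ )
        if( live[l] )
        {
            ref_costs[offset[l] + 0] += (uint16_t)weight[l];          /* idx0/idx2 weight */
            ref_costs[offset[l] + 1] += (uint16_t)(weight[l] >> 16);  /* idx1/idx3 weight */
        }
}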
%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
@@ -32,7 +32,8 @@
void func##_mmx2 args;\
void func##_sse2 args;\
void func##_ssse3 args;\
void func##_avx2 args;
void func##_avx2 args;\
void func##_avx512 args;
 
DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
@@ -99,17 +100,17 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst,
void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
void x264_plane_copy_deinterleave_sse2( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
uint8_t *dstv, intptr_t i_dstv,
void x264_plane_copy_deinterleave_ssse3( uint8_t *dsta, intptr_t i_dsta,
uint8_t *dstb, intptr_t i_dstb,
uint8_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
void x264_plane_copy_deinterleave_avx( uint16_t *dsta, intptr_t i_dsta,
uint16_t *dstb, intptr_t i_dstb,
uint16_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_avx2( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
void x264_plane_copy_deinterleave_avx2( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
@@ -123,15 +124,18 @@ void x264_plane_copy_deinterleave_rgb_avx2 ( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_ssse3 ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx512( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
@@ -143,11 +147,12 @@ void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
void x264_memzero_aligned_mmx( void *dst, size_t n );
void x264_memzero_aligned_sse( void *dst, size_t n );
void x264_memzero_aligned_avx( void *dst, size_t n );
void *x264_memcpy_aligned_sse ( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_avx ( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n );
void x264_memzero_aligned_sse ( void *dst, size_t n );
void x264_memzero_aligned_avx ( void *dst, size_t n );
void x264_memzero_aligned_avx512( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
@@ -160,14 +165,16 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
@@ -179,7 +186,7 @@ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src,
MC_CHROMA(mmx2)
MC_CHROMA(sse2)
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
MC_CHROMA(cache64_ssse3)
MC_CHROMA(avx)
MC_CHROMA(avx2)
 
@@ -498,6 +505,15 @@ PLANE_COPY(32, avx)
PLANE_COPY_SWAP(16, ssse3)
PLANE_COPY_SWAP(32, avx2)
 
#if HIGH_BIT_DEPTH
PLANE_COPY_YUYV(64, sse2)
PLANE_COPY_YUYV(64, avx)
#else
PLANE_COPY_YUYV(32, sse2)
PLANE_COPY_YUYV(32, ssse3)
#endif
PLANE_COPY_YUYV(64, avx2)
PLANE_INTERLEAVE(mmx2)
PLANE_INTERLEAVE(sse2)
#if HIGH_BIT_DEPTH
@@ -538,6 +554,21 @@ PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)
PROPAGATE_LIST(avx2)
 
#if ARCH_X86_64
void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
uint16_t *lowres_costs, int bipred_weight, int mb_y,
int width, int height, int stride, int list_mask );
static void x264_mbtree_propagate_list_avx512( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
int16_t *propagate_amount, uint16_t *lowres_costs,
int bipred_weight, int mb_y, int len, int list )
{
x264_mbtree_propagate_list_internal_avx512( len, ref_costs, mvs, propagate_amount, lowres_costs, bipred_weight << 9,
mb_y << 16, h->mb.i_mb_width, h->mb.i_mb_height, h->mb.i_mb_stride,
(1 << LOWRES_COST_SHIFT) << list );
}
#endif
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
@@ -547,8 +578,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->memcpy_aligned = x264_memcpy_aligned_mmx;
pf->memzero_aligned = x264_memzero_aligned_mmx;
pf->integral_init4v = x264_integral_init4v_mmx;
pf->integral_init8v = x264_integral_init8v_mmx;
 
@@ -606,6 +635,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 
pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2;
 
if( cpu&X264_CPU_SSE2_IS_FAST )
{
@@ -661,6 +691,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;
@@ -677,6 +708,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx512;
}
#else // !HIGH_BIT_DEPTH
 
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
@@ -702,6 +738,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->hpel_filter = x264_hpel_filter_sse2_amd;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2;
@@ -763,6 +800,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_ssse3;
}
 
if( !(cpu&X264_CPU_SLOW_PALIGNR) )
@@ -779,7 +817,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( cpu&X264_CPU_CACHELINE_64 )
{
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
pf->mc_chroma = x264_mc_chroma_cache64_ssse3;
pf->mc_luma = mc_luma_cache64_ssse3;
pf->get_ref = get_ref_cache64_ssse3;
if( cpu&X264_CPU_SLOW_ATOM )
@@ -828,10 +866,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx512;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512;
}
#endif // HIGH_BIT_DEPTH
 
if( !(cpu&X264_CPU_AVX) )
return;
pf->memcpy_aligned = x264_memcpy_aligned_avx;
pf->memzero_aligned = x264_memzero_aligned_avx;
pf->plane_copy = x264_plane_copy_avx;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
@@ -844,10 +892,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->plane_copy_swap = x264_plane_copy_swap_avx2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx2;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
if( !(cpu&X264_CPU_AVX512) )
return;
pf->memcpy_aligned = x264_memcpy_aligned_avx512;
pf->memzero_aligned = x264_memzero_aligned_avx512;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
#if ARCH_X86_64
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx512;
#endif
}
@@ -52,6 +52,7 @@ DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
DECL_X1( sad, ssse3_aligned )
DECL_X1( sad, avx2 )
DECL_X1( sad, avx512 )
DECL_X4( sad, mmx2 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
@@ -59,6 +60,7 @@ DECL_X4( sad, ssse3 )
DECL_X4( sad, xop )
DECL_X4( sad, avx )
DECL_X4( sad, avx2 )
DECL_X4( sad, avx512 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
DECL_X1( ssd, sse2slow )
@@ -75,6 +77,7 @@ DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
DECL_X1( satd, xop )
DECL_X1( satd, avx2 )
DECL_X1( satd, avx512 )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
@@ -83,6 +86,7 @@ DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
DECL_X1( sa8d, xop )
DECL_X1( sa8d, avx2 )
DECL_X1( sa8d, avx512 )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
@@ -92,11 +96,10 @@ DECL_X4( sad, cache64_mmx2 );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
 
DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx512, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
@@ -165,16 +168,14 @@ void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
const pixel *pix2, intptr_t stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
@@ -468,7 +468,7 @@ PREDICT_4x4 w, wd, dq, qdq
INIT_MMX mmx2
PREDICT_4x4 b, bw, wd, dq
INIT_MMX ssse3
%define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64
%define predict_4x4_vr_ssse3 predict_4x4_vr_cache64_ssse3
PREDICT_4x4 b, bw, wd, dq
%endif
 
@@ -940,7 +940,7 @@ INIT_XMM sse2
PREDICT_8x8_DDLR
INIT_XMM ssse3
PREDICT_8x8_DDLR
INIT_XMM ssse3, cache64
INIT_XMM cache64, ssse3
PREDICT_8x8_DDLR
%elif ARCH_X86_64 == 0
INIT_MMX mmx2