Commit d8de3cd6 authored by Leo Ma

Upgrade libx264


Signed-off-by: Leo Ma <begeekmyfriend@gmail.com>
parent 0984dd59
Showing 1305 additions and 593 deletions
@@ -511,8 +511,8 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
*predict_8x8_filter = x264_predict_8x8_filter_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_ssse3_cache64;
pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_ssse3_cache64;
pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_cache64_ssse3;
pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_cache64_ssse3;
}
if( !(cpu&X264_CPU_AVX) )
return;
@@ -604,6 +604,6 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3_cache64;
pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_cache64_ssse3;
#endif // HIGH_BIT_DEPTH
}
@@ -93,12 +93,12 @@ void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_cache64_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_cache64_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] );
@@ -129,7 +129,7 @@ void x264_predict_4x4_vl_avx( uint16_t *src );
void x264_predict_4x4_vr_mmx2( uint8_t *src );
void x264_predict_4x4_vr_sse2( uint16_t *src );
void x264_predict_4x4_vr_ssse3( pixel *src );
void x264_predict_4x4_vr_ssse3_cache64( uint8_t *src );
void x264_predict_4x4_vr_cache64_ssse3( uint8_t *src );
void x264_predict_4x4_vr_avx( uint16_t *src );
void x264_predict_4x4_hd_mmx2( pixel *src );
void x264_predict_4x4_hd_sse2( uint16_t *src );
@@ -30,7 +30,14 @@
%include "x86inc.asm"
%include "x86util.asm"
 
SECTION_RODATA 32
SECTION_RODATA 64
%if HIGH_BIT_DEPTH
decimate_shuf_avx512: dd 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15
%else
dequant_shuf_avx512: dw 0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30
dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62
%endif
 
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
@@ -42,14 +49,6 @@ SECTION_RODATA 32
dw %4, %2, %6, %2, %4, %2, %6, %2
%endmacro
 
dequant4_scale:
DQM4 10, 13, 16
DQM4 11, 14, 18
DQM4 13, 16, 20
DQM4 14, 18, 23
DQM4 16, 20, 25
DQM4 18, 23, 29
dequant8_scale:
DQM8 20, 18, 32, 19, 25, 24
DQM8 22, 19, 35, 21, 28, 26
@@ -58,6 +57,14 @@ dequant8_scale:
DQM8 32, 28, 51, 30, 40, 38
DQM8 36, 32, 58, 34, 46, 43
 
dequant4_scale:
DQM4 10, 13, 16
DQM4 11, 14, 18
DQM4 13, 16, 20
DQM4 14, 18, 23
DQM4 16, 20, 25
DQM4 18, 23, 29
decimate_mask_table4:
db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
@@ -743,6 +750,163 @@ DEQUANT 4, 4, 4
DEQUANT 8, 6, 4
%endif
 
%macro DEQUANT_START_AVX512 1-2 0 ; shift, flat
%if %2 == 0
movifnidn t2d, r2m
%endif
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
lea t1d, [t0*5]
sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %1
%if %2
%ifdef PIC
%define dmf r1+t2
lea r1, [dequant8_scale]
%else
%define dmf t2+dequant8_scale
%endif
%elif ARCH_X86_64
%define dmf r1+t2
%else
%define dmf r1
add r1, r1mp ; dequant_mf[i_mf]
%endif
movifnidn r0, r0mp
%endmacro
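
The imul/shr pair in DEQUANT_START_AVX512 divides by 6 with a reciprocal multiply: (i_qp*0x2b)>>8 equals i_qp/6 for every QP x264 can use (the approximation first diverges at qp=131), and the lea/sub/sub sequence then recovers i_mf = i_qp - 6*i_qbits. A standalone C check of that arithmetic (a sketch, not code from this diff):

    #include <assert.h>

    int main(void)
    {
        for( int qp = 0; qp <= 130; qp++ )
        {
            int qbits = (qp * 0x2b) >> 8; /* i_qbits = i_qp / 6 via multiply + shift */
            int mf    = qp - 6 * qbits;   /* i_mf = i_qp % 6, as lea [t0*5] + two subs */
            assert( qbits == qp / 6 && mf == qp % 6 );
        }
        return 0;
    }
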
INIT_ZMM avx512
cglobal dequant_4x4, 0,3
DEQUANT_START_AVX512 6
mova m0, [dmf]
%if HIGH_BIT_DEPTH
pmaddwd m0, [r0]
%endif
sub t0d, 4
jl .rshift
%if HIGH_BIT_DEPTH
vpbroadcastd m1, t0d
vpsllvd m0, m1
mova [r0], m0
%else
vpbroadcastw ym1, t0d
vpmovsdw ym0, m0
pmullw ym0, [r0]
vpsllvw ym0, ym1
mova [r0], ym0
%endif
RET
.rshift:
%if HIGH_BIT_DEPTH == 0
pmovzxwd m1, [r0]
pmaddwd m0, m1
%endif
mov r1d, 1<<31
shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
neg t0d
vpbroadcastd m1, r1d
vpbroadcastd m2, t0d
paddd m0, m1
vpsravd m0, m2
%if HIGH_BIT_DEPTH
mova [r0], m0
%else
vpmovsdw [r0], m0
%endif
RET
cglobal dequant_8x8, 0,3
DEQUANT_START_AVX512 8
mova m0, [dmf+0*64]
mova m1, [dmf+1*64]
mova m2, [dmf+2*64]
mova m3, [dmf+3*64]
%if HIGH_BIT_DEPTH
pmaddwd m0, [r0+0*64]
pmaddwd m1, [r0+1*64]
pmaddwd m2, [r0+2*64]
pmaddwd m3, [r0+3*64]
%else
mova m6, [dequant_shuf_avx512]
%endif
sub t0d, 6
jl .rshift
%if HIGH_BIT_DEPTH
vpbroadcastd m4, t0d
vpsllvd m0, m4
vpsllvd m1, m4
vpsllvd m2, m4
vpsllvd m3, m4
jmp .end
.rshift:
%else
vpbroadcastw m4, t0d
vpermt2w m0, m6, m1
vpermt2w m2, m6, m3
pmullw m0, [r0]
pmullw m2, [r0+64]
vpsllvw m0, m4
vpsllvw m2, m4
mova [r0], m0
mova [r0+64], m2
RET
.rshift:
pmovzxwd m4, [r0+0*32]
pmovzxwd m5, [r0+1*32]
pmaddwd m0, m4
pmaddwd m1, m5
pmovzxwd m4, [r0+2*32]
pmovzxwd m5, [r0+3*32]
pmaddwd m2, m4
pmaddwd m3, m5
%endif
mov r1d, 1<<31
shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
neg t0d
vpbroadcastd m4, r1d
vpbroadcastd m5, t0d
paddd m0, m4
paddd m1, m4
vpsravd m0, m5
vpsravd m1, m5
paddd m2, m4
paddd m3, m4
vpsravd m2, m5
vpsravd m3, m5
%if HIGH_BIT_DEPTH
.end:
mova [r0+0*64], m0
mova [r0+1*64], m1
mova [r0+2*64], m2
mova [r0+3*64], m3
%else
vpermt2w m0, m6, m1
vpermt2w m2, m6, m3
mova [r0], m0
mova [r0+64], m2
%endif
RET
%if HIGH_BIT_DEPTH == 0
cglobal dequant_8x8_flat16, 0,3
movifnidn t2d, r2m
cmp t2d, 12
jl dequant_8x8_avx512
sub t2d, 12
DEQUANT_START_AVX512 6, 1
vpbroadcastw m0, t0d
mova m1, [dmf]
vpsllvw m1, m0
pmullw m0, m1, [r0]
pmullw m1, [r0+64]
mova [r0], m0
mova [r0+64], m1
RET
%endif
%undef dmf
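
These AVX-512 kernels follow the same arithmetic as the scalar dequant path: each coefficient is multiplied by dequant_mf[i_mf][i] and shifted left by i_qbits, or, once the `sub t0d, 4` (4x4) / `sub t0d, 6` (8x8) drives i_qbits negative, biased by 1 << (-i_qbits-1) and shifted right (the .rshift branches above). A rough C sketch of the 8-bit-depth 4x4 case, with dequant_4x4_ref a hypothetical name used for illustration:

    #include <stdint.h>

    static void dequant_4x4_ref( int16_t dct[16], int dequant_mf[6][16], int i_qp )
    {
        int i_mf    = i_qp % 6;
        int i_qbits = i_qp / 6 - 4;            /* mirrors the "sub t0d, 4" above */
        if( i_qbits >= 0 )
            for( int i = 0; i < 16; i++ )
                dct[i] = (int16_t)( (dct[i] * dequant_mf[i_mf][i]) << i_qbits );
        else
        {
            int f = 1 << (-i_qbits - 1);       /* rounding bias before the right shift */
            for( int i = 0; i < 16; i++ )
                dct[i] = (int16_t)( (dct[i] * dequant_mf[i_mf][i] + f) >> (-i_qbits) );
        }
    }
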
%macro DEQUANT_DC 2
cglobal dequant_4x4dc, 0,3,6
DEQUANT_START 6, 6
@@ -1208,13 +1372,12 @@ cglobal denoise_dct, 4,4,4
; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
 
%macro DECIMATE_MASK 5
%if mmsize==16
%macro DECIMATE_MASK 4
%if HIGH_BIT_DEPTH
movdqa m0, [%3+ 0]
movdqa m1, [%3+32]
packssdw m0, [%3+16]
packssdw m1, [%3+48]
mova m0, [%3+0*16]
packssdw m0, [%3+1*16]
mova m1, [%3+2*16]
packssdw m1, [%3+3*16]
ABSW2 m0, m1, m0, m1, m3, m4
%else
ABSW m0, [%3+ 0], m3
@@ -1226,40 +1389,35 @@ cglobal denoise_dct, 4,4,4
pcmpgtb m0, %4
pmovmskb %1, m2
pmovmskb %2, m0
%else ; mmsize==8
%endmacro
%macro DECIMATE_MASK16_AVX512 0
mova m0, [r0]
%if HIGH_BIT_DEPTH
movq m0, [%3+ 0]
movq m1, [%3+16]
movq m2, [%3+32]
movq m3, [%3+48]
packssdw m0, [%3+ 8]
packssdw m1, [%3+24]
packssdw m2, [%3+40]
packssdw m3, [%3+56]
%else
movq m0, [%3+ 0]
movq m1, [%3+ 8]
movq m2, [%3+16]
movq m3, [%3+24]
%endif
ABSW2 m0, m1, m0, m1, m6, m7
ABSW2 m2, m3, m2, m3, m6, m7
packsswb m0, m1
packsswb m2, m3
pxor m4, m4
pxor m6, m6
pcmpeqb m4, m0
pcmpeqb m6, m2
pcmpgtb m0, %4
pcmpgtb m2, %4
pmovmskb %5, m4
pmovmskb %1, m6
shl %1, 8
or %1, %5
pmovmskb %5, m0
pmovmskb %2, m2
shl %2, 8
or %2, %5
vptestmd k0, m0, m0
pabsd m0, m0
vpcmpud k1, m0, [pd_1] {1to16}, 6
%else
vptestmw k0, m0, m0
pabsw m0, m0
vpcmpuw k1, m0, [pw_1], 6
%endif
%endmacro
%macro SHRX 2
%if cpuflag(bmi2)
shrx %1, %1, %2
%else
shr %1, %2b ; %2 has to be rcx/ecx
%endif
%endmacro
%macro BLSR 2
%if cpuflag(bmi1)
blsr %1, %2
%else
lea %1, [%2-1]
and %1, %2
%endif
%endmacro
 
@@ -1269,33 +1427,60 @@ cextern decimate_table8
%macro DECIMATE4x4 1
 
cglobal decimate_score%1, 1,3
%ifdef PIC
lea r4, [decimate_table4]
lea r5, [decimate_mask_table4]
%define table r4
%define mask_table r5
%if cpuflag(avx512)
DECIMATE_MASK16_AVX512
xor eax, eax
kmovw edx, k0
%if %1 == 15
shr edx, 1
%else
%define table decimate_table4
%define mask_table decimate_mask_table4
test edx, edx
%endif
DECIMATE_MASK edx, eax, r0, [pb_1], ecx
jz .ret
ktestw k1, k1
jnz .ret9
%else
DECIMATE_MASK edx, eax, r0, [pb_1]
xor edx, 0xffff
je .ret
jz .ret
test eax, eax
jne .ret9
%if %1==15
jnz .ret9
%if %1 == 15
shr edx, 1
%endif
%endif
%ifdef PIC
lea r4, [decimate_mask_table4]
%define mask_table r4
%else
%define mask_table decimate_mask_table4
%endif
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
%if ARCH_X86_64
xor edx, ecx
jz .ret
%if cpuflag(lzcnt)
lzcnt ecx, ecx
lea r5, [decimate_table4-32]
add r5, rcx
%else
bsr ecx, ecx
lea r5, [decimate_table4-1]
sub r5, rcx
%endif
%define table r5
%else
cmp edx, ecx
je .ret
jz .ret
bsr ecx, ecx
shr edx, 1
shr edx, cl
SHRX edx, ecx
%define table decimate_table4
%endif
tzcnt ecx, edx
shr edx, 1
shr edx, cl
SHRX edx, ecx
add al, byte [table + rcx]
add al, byte [mask_table + rdx]
.ret:
@@ -1303,175 +1488,224 @@ cglobal decimate_score%1, 1,3
.ret9:
mov eax, 9
RET
%endmacro
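
decimate_score scans the nonzero coefficients from the end of the block: any |level| > 1 forces a score of 9 (the .ret9 path), while each ±1 contributes decimate_table[run] for the run of zeros preceding it. The AVX-512 variant derives all of that from two bitmasks, k0 (nonzero lanes) and k1 (|level| > 1 lanes), instead of a scalar walk. A scalar sketch of the score being reproduced, assuming the usual x264 semantics; decimate_score_ref is a hypothetical helper and `table` would be one of the decimate_table4/decimate_table8 arrays referenced above:

    #include <stdint.h>

    static int decimate_score_ref( const int16_t *dct, int n, const uint8_t *table )
    {
        int score = 0, idx = n - 1;
        while( idx >= 0 && dct[idx] == 0 )
            idx--;                               /* skip trailing zeros */
        while( idx >= 0 )
        {
            if( dct[idx] < -1 || dct[idx] > 1 )  /* any |level| > 1: give up with 9 */
                return 9;
            idx--;
            int run = 0;
            while( idx >= 0 && dct[idx] == 0 )
            {
                idx--;
                run++;                           /* zeros preceding this level */
            }
            score += table[run];
        }
        return score;
    }
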
 
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE4x4 15
DECIMATE4x4 16
%endif
INIT_XMM sse2
DECIMATE4x4 15
DECIMATE4x4 16
INIT_XMM ssse3
DECIMATE4x4 15
DECIMATE4x4 16
; 2x gt1 output, 2x nz output, 1x mask
%macro DECIMATE_MASK64_AVX2 5
pabsw m0, [r0+ 0]
pabsw m2, [r0+32]
pabsw m1, [r0+64]
pabsw m3, [r0+96]
packsswb m0, m2
packsswb m1, m3
pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so
pcmpgtb m3, m1, %5 ; we can save latency by doing them here
pmovmskb %1, m2
pmovmskb %2, m3
or %1, %2
jne .ret9
%macro DECIMATE_MASK64_AVX2 2 ; nz_low, nz_high
mova m0, [r0+0*32]
packsswb m0, [r0+1*32]
mova m1, [r0+2*32]
packsswb m1, [r0+3*32]
mova m4, [pb_1]
pabsb m2, m0
pabsb m3, m1
por m2, m3 ; the > 1 checks don't care about order, so
ptest m4, m2 ; we can save latency by doing them here
jnc .ret9
vpermq m0, m0, q3120
vpermq m1, m1, q3120
pxor m4, m4
pcmpeqb m0, m4
pcmpeqb m1, m4
pmovmskb %3, m0
pmovmskb %4, m1
pmovmskb %1, m0
pmovmskb %2, m1
%endmacro
 
%macro DECIMATE8x8 0
%macro DECIMATE_MASK64_AVX512 0
mova m0, [r0]
%if HIGH_BIT_DEPTH
packssdw m0, [r0+1*64]
mova m1, [r0+2*64]
packssdw m1, [r0+3*64]
packsswb m0, m1
vbroadcasti32x4 m1, [pb_1]
pabsb m2, m0
vpcmpub k0, m2, m1, 6
ktestq k0, k0
jnz .ret9
mova m1, [decimate_shuf_avx512]
vpermd m0, m1, m0
vptestmb k1, m0, m0
%else
mova m1, [r0+64]
vbroadcasti32x4 m3, [pb_1]
packsswb m2, m0, m1
pabsb m2, m2
vpcmpub k0, m2, m3, 6
ktestq k0, k0
jnz .ret9
vptestmw k1, m0, m0
vptestmw k2, m1, m1
%endif
%endmacro
 
%macro DECIMATE8x8 0
%if ARCH_X86_64
cglobal decimate_score64, 1,5
%if mmsize == 64
DECIMATE_MASK64_AVX512
xor eax, eax
%if HIGH_BIT_DEPTH
kmovq r1, k1
test r1, r1
jz .ret
%else
kortestd k1, k2
jz .ret
kunpckdq k1, k2, k1
kmovq r1, k1
%endif
%elif mmsize == 32
DECIMATE_MASK64_AVX2 r1d, eax
not r1
shl rax, 32
xor r1, rax
jz .ret
%else
mova m5, [pb_1]
DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5
test eax, eax
jnz .ret9
DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5
shl r2d, 16
or r1d, r2d
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5
shl r2, 32
or eax, r3d
or r1, r2
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5
not r1
shl r2, 48
xor r1, r2
jz .ret
add eax, r3d
jnz .ret9
%endif
%ifdef PIC
lea r4, [decimate_table8]
%define table r4
%else
%define table decimate_table8
%endif
mova m5, [pb_1]
%if mmsize==32
DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5
shl r3, 32
or r1, r3
xor r1, -1
je .ret
%else
DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
test eax, eax
jne .ret9
DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
shl r2d, 16
or r1d, r2d
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
shl r2, 32
or eax, r3d
or r1, r2
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
shl r2, 48
or r1, r2
xor r1, -1
je .ret
add eax, r3d
jne .ret9
%endif
mov al, -6
mov al, -6
.loop:
tzcnt rcx, r1
shr r1, cl
add al, byte [table + rcx]
jge .ret9
shr r1, 1
jne .loop
add al, 6
add al, byte [table + rcx]
jge .ret9
shr r1, 1
SHRX r1, rcx
%if cpuflag(bmi2)
test r1, r1
%endif
jnz .loop
add al, 6
.ret:
REP_RET
.ret9:
mov eax, 9
mov eax, 9
RET
 
%else ; ARCH
%if mmsize == 8
cglobal decimate_score64, 1,6
cglobal decimate_score64, 1,4
%if mmsize == 64
DECIMATE_MASK64_AVX512
xor eax, eax
%if HIGH_BIT_DEPTH
kshiftrq k2, k1, 32
%endif
kmovd r2, k1
kmovd r3, k2
test r2, r2
jz .tryret
%elif mmsize == 32
DECIMATE_MASK64_AVX2 r2, r3
xor eax, eax
not r3
xor r2, -1
jz .tryret
%else
cglobal decimate_score64, 1,5
%endif
mova m5, [pb_1]
%if mmsize==32
DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5
xor r3, -1
je .tryret
xor r4, -1
.cont:
%else
DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
test r2, r2
jne .ret9
DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
shl r4, 16
or r3, r4
DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
or r2, r1
DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
shl r1, 16
or r4, r1
xor r3, -1
je .tryret
xor r4, -1
.cont:
add r0, r2
jne .ret9
%endif
mov al, -6
mova m5, [pb_1]
DECIMATE_MASK r2, r1, r0+SIZEOF_DCTCOEF* 0, m5
test r1, r1
jnz .ret9
DECIMATE_MASK r3, r1, r0+SIZEOF_DCTCOEF*16, m5
not r2
shl r3, 16
xor r2, r3
mov r0m, r2
DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF*32, m5
or r2, r1
DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5
add r0, r2
jnz .ret9
mov r2, r0m
not r3
shl r1, 16
xor r3, r1
test r2, r2
jz .tryret
%endif
mov al, -6
.loop:
tzcnt ecx, r2
add al, byte [decimate_table8 + ecx]
jge .ret9
sub ecx, 31 ; increase the shift count by one to shift away the lowest set bit as well
jz .run31 ; only bits 0-4 are used so we have to explicitly handle the case of 1<<31
shrd r2, r3, cl
SHRX r3, ecx
%if notcpuflag(bmi2)
test r2, r2
%endif
jnz .loop
BLSR r2, r3
jz .end
.largerun:
tzcnt ecx, r3
test r3, r3
je .largerun
shrd r3, r4, cl
shr r4, cl
add al, byte [decimate_table8 + ecx]
jge .ret9
shrd r3, r4, 1
shr r4, 1
test r3, r3
jne .loop
test r4, r4
jne .loop
add al, 6
.ret:
REP_RET
.tryret:
xor r4, -1
jne .cont
shr r3, 1
SHRX r3, ecx
.loop2:
tzcnt ecx, r3
add al, byte [decimate_table8 + ecx]
jge .ret9
shr r3, 1
SHRX r3, ecx
.run31:
test r3, r3
jnz .loop2
.end:
add al, 6
RET
.tryret:
BLSR r2, r3
jz .ret
mov al, -6
jmp .largerun
.ret9:
mov eax, 9
RET
.largerun:
mov r3, r4
xor r4, r4
tzcnt ecx, r3
shr r3, cl
shr r3, 1
jne .loop
add al, 6
RET
.ret:
REP_RET
%endif ; ARCH
%endmacro
 
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE8x8
%endif
INIT_XMM sse2
DECIMATE4x4 15
DECIMATE4x4 16
DECIMATE8x8
INIT_XMM ssse3
DECIMATE4x4 15
DECIMATE4x4 16
DECIMATE8x8
%if HIGH_BIT_DEPTH
INIT_ZMM avx512
%else
INIT_YMM avx2
DECIMATE8x8
INIT_YMM avx512
%endif
DECIMATE4x4 15
DECIMATE4x4 16
INIT_ZMM avx512
DECIMATE8x8
 
;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
@@ -1556,7 +1790,7 @@ cglobal coeff_last4, 1,3
 
INIT_MMX mmx2
COEFF_LAST4
INIT_MMX mmx2, lzcnt
INIT_MMX lzcnt
COEFF_LAST4
 
%macro COEFF_LAST8 0
@@ -1579,7 +1813,7 @@ COEFF_LAST8
%endif
INIT_XMM sse2
COEFF_LAST8
INIT_XMM sse2, lzcnt
INIT_XMM lzcnt
COEFF_LAST8
 
%else ; !HIGH_BIT_DEPTH
@@ -1642,7 +1876,7 @@ cglobal coeff_last8, 1,3
 
INIT_MMX mmx2
COEFF_LAST48
INIT_MMX mmx2, lzcnt
INIT_MMX lzcnt
COEFF_LAST48
%endif ; HIGH_BIT_DEPTH
 
@@ -1707,7 +1941,7 @@ COEFF_LAST
%endif
INIT_XMM sse2
COEFF_LAST
INIT_XMM sse2, lzcnt
INIT_XMM lzcnt
COEFF_LAST
 
%macro LAST_MASK_AVX2 2
@@ -1729,7 +1963,7 @@ COEFF_LAST
%endmacro
 
%if ARCH_X86_64 == 0
INIT_YMM avx2,lzcnt
INIT_YMM avx2
cglobal coeff_last64, 1,2
pxor m2, m2
LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
@@ -1744,7 +1978,7 @@ cglobal coeff_last64, 1,2
add eax, 32
RET
%else
INIT_YMM avx2,lzcnt
INIT_YMM avx2
cglobal coeff_last64, 1,3
pxor m2, m2
LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
@@ -1756,6 +1990,70 @@ cglobal coeff_last64, 1,3
RET
%endif
 
%macro COEFF_LAST_AVX512 2 ; num, w/d
cglobal coeff_last%1, 1,2
mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF]
vptestm%2 k0, m0, m0
%if %1 == 15
mov eax, 30
kmovw r1d, k0
lzcnt r1d, r1d
sub eax, r1d
%else
kmovw eax, k0
lzcnt eax, eax
xor eax, 31
%endif
RET
%endmacro
%macro COEFF_LAST64_AVX512 1 ; w/d
cglobal coeff_last64, 1,2
pxor xm0, xm0
vpcmp%1 k0, m0, [r0+0*64], 4
vpcmp%1 k1, m0, [r0+1*64], 4
%if HIGH_BIT_DEPTH
vpcmp%1 k2, m0, [r0+2*64], 4
vpcmp%1 k3, m0, [r0+3*64], 4
kunpckwd k0, k1, k0
kunpckwd k1, k3, k2
%endif
%if ARCH_X86_64
kunpckdq k0, k1, k0
kmovq rax, k0
lzcnt rax, rax
xor eax, 63
%else
kmovd r1d, k1
kmovd eax, k0
lzcnt r1d, r1d
lzcnt eax, eax
xor r1d, 32
cmovnz eax, r1d
xor eax, 31
%endif
RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM avx512
COEFF_LAST_AVX512 4, d
INIT_YMM avx512
COEFF_LAST_AVX512 8, d
INIT_ZMM avx512
COEFF_LAST_AVX512 15, d
COEFF_LAST_AVX512 16, d
COEFF_LAST64_AVX512 d
%else ; !HIGH_BIT_DEPTH
INIT_XMM avx512
COEFF_LAST_AVX512 8, w
INIT_YMM avx512
COEFF_LAST_AVX512 15, w
COEFF_LAST_AVX512 16, w
INIT_ZMM avx512
COEFF_LAST64_AVX512 w
%endif ; !HIGH_BIT_DEPTH
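
The COEFF_LAST_AVX512 kernels reduce a block to a bitmask of nonzero lanes (vptestmw/vptestmd into k0) and locate its highest set bit with lzcnt; the scalar equivalent is simply a backwards scan for the last nonzero coefficient. A sketch of that reference behaviour for 16-bit coefficients (coeff_last_ref is a hypothetical name; returns -1 for an all-zero block):

    #include <stdint.h>

    static int coeff_last_ref( const int16_t *dct, int n )
    {
        int i = n - 1;
        while( i >= 0 && dct[i] == 0 )   /* walk back over trailing zeros */
            i--;
        return i;                        /* index of the last nonzero coefficient */
    }
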
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
@@ -1833,15 +2131,17 @@ COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_XMM sse2, lzcnt
INIT_MMX lzcnt
COEFF_LEVELRUN 4
%if HIGH_BIT_DEPTH == 0
COEFF_LEVELRUN 8
%endif
INIT_XMM lzcnt
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_MMX mmx2, lzcnt
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
 
; Similar to the one above, but saves the DCT
; coefficients in m0/m1 so we don't have to load
@@ -1968,7 +2268,7 @@ INIT_XMM ssse3, lzcnt
COEFF_LEVELRUN_LUT 8
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
INIT_XMM avx2, lzcnt
INIT_XMM avx2
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
%endif
@@ -66,12 +66,15 @@ void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_8x8_flat16_avx512( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
@@ -85,16 +88,16 @@ void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int x264_decimate_score15_mmx2( dctcoef *dct );
int x264_decimate_score15_sse2( dctcoef *dct );
int x264_decimate_score15_ssse3( dctcoef *dct );
int x264_decimate_score16_mmx2( dctcoef *dct );
int x264_decimate_score15_avx512( dctcoef *dct );
int x264_decimate_score16_sse2( dctcoef *dct );
int x264_decimate_score16_ssse3( dctcoef *dct );
int x264_decimate_score64_mmx2( dctcoef *dct );
int x264_decimate_score16_avx512( dctcoef *dct );
int x264_decimate_score64_sse2( dctcoef *dct );
int x264_decimate_score64_ssse3( dctcoef *dct );
int x264_decimate_score64_avx2( int16_t *dct );
int x264_decimate_score64_avx512( dctcoef *dct );
int x264_coeff_last4_mmx2( dctcoef *dct );
int x264_coeff_last8_mmx2( dctcoef *dct );
int x264_coeff_last15_mmx2( dctcoef *dct );
@@ -104,33 +107,37 @@ int x264_coeff_last8_sse2( dctcoef *dct );
int x264_coeff_last15_sse2( dctcoef *dct );
int x264_coeff_last16_sse2( dctcoef *dct );
int x264_coeff_last64_sse2( dctcoef *dct );
int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct );
int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct );
int x264_coeff_last8_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last64_avx2_lzcnt( dctcoef *dct );
int x264_coeff_last4_lzcnt( dctcoef *dct );
int x264_coeff_last8_lzcnt( dctcoef *dct );
int x264_coeff_last15_lzcnt( dctcoef *dct );
int x264_coeff_last16_lzcnt( dctcoef *dct );
int x264_coeff_last64_lzcnt( dctcoef *dct );
int x264_coeff_last64_avx2 ( dctcoef *dct );
int x264_coeff_last4_avx512( int32_t *dct );
int x264_coeff_last8_avx512( dctcoef *dct );
int x264_coeff_last15_avx512( dctcoef *dct );
int x264_coeff_last16_avx512( dctcoef *dct );
int x264_coeff_last64_avx512( dctcoef *dct );
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_avx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_avx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
@@ -106,8 +106,6 @@ SAD 4, 16
SAD 4, 8
SAD 4, 4
 
;=============================================================================
; SAD XMM
;=============================================================================
@@ -119,118 +117,64 @@ SAD 4, 4
RET
%endmacro
 
%macro SAD_W16 0
;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16, 4,4,8
movu m0, [r2]
movu m1, [r2+r3]
lea r2, [r2+2*r3]
movu m2, [r2]
movu m3, [r2+r3]
lea r2, [r2+2*r3]
psadbw m0, [r0]
psadbw m1, [r0+r1]
lea r0, [r0+2*r1]
movu m4, [r2]
paddw m0, m1
psadbw m2, [r0]
psadbw m3, [r0+r1]
lea r0, [r0+2*r1]
movu m5, [r2+r3]
lea r2, [r2+2*r3]
paddw m2, m3
movu m6, [r2]
movu m7, [r2+r3]
lea r2, [r2+2*r3]
paddw m0, m2
psadbw m4, [r0]
psadbw m5, [r0+r1]
lea r0, [r0+2*r1]
movu m1, [r2]
paddw m4, m5
psadbw m6, [r0]
psadbw m7, [r0+r1]
lea r0, [r0+2*r1]
movu m2, [r2+r3]
lea r2, [r2+2*r3]
paddw m6, m7
movu m3, [r2]
paddw m0, m4
movu m4, [r2+r3]
lea r2, [r2+2*r3]
paddw m0, m6
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
movu m5, [r2]
paddw m1, m2
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
movu m6, [r2+r3]
lea r2, [r2+2*r3]
paddw m3, m4
movu m7, [r2]
paddw m0, m1
movu m1, [r2+r3]
paddw m0, m3
psadbw m5, [r0]
psadbw m6, [r0+r1]
lea r0, [r0+2*r1]
paddw m5, m6
psadbw m7, [r0]
psadbw m1, [r0+r1]
paddw m7, m1
paddw m0, m5
paddw m0, m7
SAD_END_SSE2
;-----------------------------------------------------------------------------
; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8, 4,4
movu m0, [r2]
movu m2, [r2+r3]
lea r2, [r2+2*r3]
movu m3, [r2]
movu m4, [r2+r3]
psadbw m0, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
paddw m0, m2
paddw m3, m4
paddw m0, m3
movu m1, [r2]
movu m2, [r2+r3]
lea r2, [r2+2*r3]
movu m3, [r2]
movu m4, [r2+r3]
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
paddw m1, m2
paddw m3, m4
paddw m0, m1
paddw m0, m3
%macro SAD_W16 1 ; h
cglobal pixel_sad_16x%1, 4,4
%ifidn cpuname, sse2
.skip_prologue:
%endif
%assign %%i 0
%if ARCH_X86_64
lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
lea r5, [3*r3]
%rep %1/4
movu m1, [r2]
psadbw m1, [r0]
movu m3, [r2+r3]
psadbw m3, [r0+r1]
movu m2, [r2+2*r3]
psadbw m2, [r0+2*r1]
movu m4, [r2+r5]
psadbw m4, [r0+r6]
%if %%i != %1/4-1
lea r2, [r2+4*r3]
lea r0, [r0+4*r1]
%endif
paddw m1, m3
paddw m2, m4
ACCUM paddw, 0, 1, %%i
paddw m0, m2
%assign %%i %%i+1
%endrep
%else ; The cost of having to save and restore registers on x86-32
%rep %1/2 ; nullifies the benefit of having 3*stride in registers.
movu m1, [r2]
psadbw m1, [r0]
movu m2, [r2+r3]
psadbw m2, [r0+r1]
%if %%i != %1/2-1
lea r2, [r2+2*r3]
lea r0, [r0+2*r1]
%endif
ACCUM paddw, 0, 1, %%i
paddw m0, m2
%assign %%i %%i+1
%endrep
%endif
SAD_END_SSE2
%endmacro
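
SAD_W16 is now parameterized by height and, on x86-64, unrolls four rows per iteration with 3*stride kept in spare registers; on x86-32 it stays at two rows per iteration, as the comment notes. Each psadbw/paddw pair accumulates the usual sum of absolute differences; a scalar sketch of what one 16-wide SAD computes (sad_16xh_ref is a hypothetical name for illustration):

    #include <stdint.h>
    #include <stdlib.h>

    static int sad_16xh_ref( const uint8_t *pix1, intptr_t stride1,
                             const uint8_t *pix2, intptr_t stride2, int h )
    {
        int sum = 0;
        for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
            for( int x = 0; x < 16; x++ )
                sum += abs( pix1[x] - pix2[x] );   /* SAD over a 16-pixel row */
        return sum;
    }
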
 
INIT_XMM sse2
SAD_W16
SAD_W16 16
SAD_W16 8
INIT_XMM sse3
SAD_W16
SAD_W16 16
SAD_W16 8
INIT_XMM sse2, aligned
SAD_W16
SAD_W16 16
SAD_W16 8
 
%macro SAD_INC_4x8P_SSE 1
movq m1, [r0]
@@ -259,7 +203,132 @@ cglobal pixel_sad_8x16_sse2, 4,4
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
SAD_END_SSE2
%macro SAD_W48_AVX512 3 ; w, h, d/q
cglobal pixel_sad_%1x%2, 4,4
kxnorb k1, k1, k1
kaddb k1, k1, k1
%assign %%i 0
%if ARCH_X86_64 && %2 != 4
lea r6, [3*r1]
lea r5, [3*r3]
%rep %2/4
mov%3 m1, [r0]
vpbroadcast%3 m1 {k1}, [r0+r1]
mov%3 m3, [r2]
vpbroadcast%3 m3 {k1}, [r2+r3]
mov%3 m2, [r0+2*r1]
vpbroadcast%3 m2 {k1}, [r0+r6]
mov%3 m4, [r2+2*r3]
vpbroadcast%3 m4 {k1}, [r2+r5]
%if %%i != %2/4-1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
psadbw m1, m3
psadbw m2, m4
ACCUM paddd, 0, 1, %%i
paddd m0, m2
%assign %%i %%i+1
%endrep
%else
%rep %2/2
mov%3 m1, [r0]
vpbroadcast%3 m1 {k1}, [r0+r1]
mov%3 m2, [r2]
vpbroadcast%3 m2 {k1}, [r2+r3]
%if %%i != %2/2-1
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
%endif
psadbw m1, m2
ACCUM paddd, 0, 1, %%i
%assign %%i %%i+1
%endrep
%endif
%if %1 == 8
punpckhqdq m1, m0, m0
paddd m0, m1
%endif
movd eax, m0
RET
%endmacro
INIT_XMM avx512
SAD_W48_AVX512 4, 4, d
SAD_W48_AVX512 4, 8, d
SAD_W48_AVX512 4, 16, d
SAD_W48_AVX512 8, 4, q
SAD_W48_AVX512 8, 8, q
SAD_W48_AVX512 8, 16, q
%macro SAD_W16_AVX512_START 1 ; h
cmp r1d, FENC_STRIDE ; optimized for the most common fenc case, which
jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory
lea r1, [3*r3]
%endmacro
%macro SAD_W16_AVX512_END 0
paddd m0, m1
paddd m0, m2
paddd m0, m3
%if mmsize == 64
vextracti32x8 ym1, m0, 1
paddd ym0, ym1
%endif
vextracti128 xm1, ym0, 1
paddd xmm0, xm0, xm1
punpckhqdq xmm1, xmm0, xmm0
paddd xmm0, xmm1
movd eax, xmm0
RET
%endmacro
INIT_YMM avx512
cglobal pixel_sad_16x8, 4,4
SAD_W16_AVX512_START 8
movu xm0, [r2]
vinserti128 m0, [r2+r3], 1
psadbw m0, [r0+0*32]
movu xm1, [r2+2*r3]
vinserti128 m1, [r2+r1], 1
lea r2, [r2+4*r3]
psadbw m1, [r0+1*32]
movu xm2, [r2]
vinserti128 m2, [r2+r3], 1
psadbw m2, [r0+2*32]
movu xm3, [r2+2*r3]
vinserti128 m3, [r2+r1], 1
psadbw m3, [r0+3*32]
SAD_W16_AVX512_END
INIT_ZMM avx512
cglobal pixel_sad_16x16, 4,4
SAD_W16_AVX512_START 16
movu xm0, [r2]
vinserti128 ym0, [r2+r3], 1
movu xm1, [r2+4*r3]
vinserti32x4 m0, [r2+2*r3], 2
vinserti32x4 m1, [r2+2*r1], 2
vinserti32x4 m0, [r2+r1], 3
lea r2, [r2+4*r3]
vinserti32x4 m1, [r2+r3], 1
psadbw m0, [r0+0*64]
vinserti32x4 m1, [r2+r1], 3
lea r2, [r2+4*r3]
psadbw m1, [r0+1*64]
movu xm2, [r2]
vinserti128 ym2, [r2+r3], 1
movu xm3, [r2+4*r3]
vinserti32x4 m2, [r2+2*r3], 2
vinserti32x4 m3, [r2+2*r1], 2
vinserti32x4 m2, [r2+r1], 3
lea r2, [r2+4*r3]
vinserti32x4 m3, [r2+r3], 1
psadbw m2, [r0+2*64]
vinserti32x4 m3, [r2+r1], 3
psadbw m3, [r0+3*64]
SAD_W16_AVX512_END
 
;-----------------------------------------------------------------------------
; void pixel_vsad( pixel *src, intptr_t stride );
@@ -1548,6 +1617,225 @@ SAD_X_AVX2 3, 16, 8, 7
SAD_X_AVX2 4, 16, 16, 8
SAD_X_AVX2 4, 16, 8, 8
 
%macro SAD_X_W4_AVX512 2 ; x, h
cglobal pixel_sad_x%1_4x%2, %1+2,%1+3
mov t1d, 0xa
kmovb k1, t1d
lea t1, [3*t0]
kaddb k2, k1, k1
kshiftlb k3, k1, 2
%assign %%i 0
%rep %2/4
movu m6, [r0+%%i*64]
vmovddup m6 {k1}, [r0+%%i*64+32]
movd xmm2, [r1]
movd xmm4, [r1+t0]
vpbroadcastd xmm2 {k1}, [r1+2*t0]
vpbroadcastd xmm4 {k1}, [r1+t1]
vpbroadcastd xmm2 {k2}, [r2+t0]
vpbroadcastd xmm4 {k2}, [r2]
vpbroadcastd xmm2 {k3}, [r2+t1] ; a0 a2 b1 b3
vpbroadcastd xmm4 {k3}, [r2+2*t0] ; a1 a3 b0 b2
vpmovqd s1, m6 ; s0 s2 s1 s3
movd xmm3, [r3]
movd xmm5, [r3+t0]
vpbroadcastd xmm3 {k1}, [r3+2*t0]
vpbroadcastd xmm5 {k1}, [r3+t1]
%if %1 == 4
vpbroadcastd xmm3 {k2}, [r4+t0]
vpbroadcastd xmm5 {k2}, [r4]
vpbroadcastd xmm3 {k3}, [r4+t1] ; c0 c2 d1 d3
vpbroadcastd xmm5 {k3}, [r4+2*t0] ; c1 c3 d0 d2
%endif
%if %%i != %2/4-1
%assign %%j 1
%rep %1
lea r%+%%j, [r%+%%j+4*t0]
%assign %%j %%j+1
%endrep
%endif
pshufd s2, s1, q1032
psadbw xmm2, s1
psadbw xmm4, s2
psadbw xmm3, s1
psadbw xmm5, s2
%if %%i
paddd xmm0, xmm2
paddd xmm1, xmm3
paddd xmm0, xmm4
paddd xmm1, xmm5
%else
paddd xmm0, xmm2, xmm4
paddd xmm1, xmm3, xmm5
%endif
%assign %%i %%i+1
%endrep
%if %1 == 4
movifnidn t2, r6mp
%else
movifnidn t2, r5mp
%endif
packusdw xmm0, xmm1
mova [t2], xmm0
RET
%endmacro
%macro SAD_X_W8_AVX512 2 ; x, h
cglobal pixel_sad_x%1_8x%2, %1+2,%1+3
kxnorb k3, k3, k3
lea t1, [3*t0]
kaddb k1, k3, k3
kshiftlb k2, k3, 2
kshiftlb k3, k3, 3
%assign %%i 0
%rep %2/4
movddup m6, [r0+%%i*64] ; s0 s0 s1 s1
movq xm2, [r1]
movq xm4, [r1+2*t0]
vpbroadcastq xm2 {k1}, [r2]
vpbroadcastq xm4 {k1}, [r2+2*t0]
vpbroadcastq m2 {k2}, [r1+t0]
vpbroadcastq m4 {k2}, [r1+t1]
vpbroadcastq m2 {k3}, [r2+t0] ; a0 b0 a1 b1
vpbroadcastq m4 {k3}, [r2+t1] ; a2 b2 a3 b3
movddup m7, [r0+%%i*64+32] ; s2 s2 s3 s3
movq xm3, [r3]
movq xm5, [r3+2*t0]
%if %1 == 4
vpbroadcastq xm3 {k1}, [r4]
vpbroadcastq xm5 {k1}, [r4+2*t0]
%endif
vpbroadcastq m3 {k2}, [r3+t0]
vpbroadcastq m5 {k2}, [r3+t1]
%if %1 == 4
vpbroadcastq m3 {k3}, [r4+t0] ; c0 d0 c1 d1
vpbroadcastq m5 {k3}, [r4+t1] ; c2 d2 c3 d3
%endif
%if %%i != %2/4-1
%assign %%j 1
%rep %1
lea r%+%%j, [r%+%%j+4*t0]
%assign %%j %%j+1
%endrep
%endif
psadbw m2, m6
psadbw m4, m7
psadbw m3, m6
psadbw m5, m7
ACCUM paddd, 0, 2, %%i
ACCUM paddd, 1, 3, %%i
paddd m0, m4
paddd m1, m5
%assign %%i %%i+1
%endrep
%if %1 == 4
movifnidn t2, r6mp
%else
movifnidn t2, r5mp
%endif
packusdw m0, m1
vextracti128 xm1, m0, 1
paddd xm0, xm1
mova [t2], xm0
RET
%endmacro
%macro SAD_X_W16_AVX512 2 ; x, h
cglobal pixel_sad_x%1_16x%2, %1+2,%1+3
lea t1, [3*t0]
%assign %%i 0
%rep %2/4
mova m6, [r0+%%i*64] ; s0 s1 s2 s3
movu xm2, [r3]
movu xm4, [r3+t0]
%if %1 == 4
vinserti128 ym2, [r4+t0], 1
vinserti128 ym4, [r4], 1
%endif
vinserti32x4 m2, [r1+2*t0], 2
vinserti32x4 m4, [r1+t1], 2
vinserti32x4 m2, [r2+t1], 3 ; c0 d1 a2 b3
vinserti32x4 m4, [r2+2*t0], 3 ; c1 d0 a3 b2
vpermq m7, m6, q1032 ; s1 s0 s3 s2
movu xm3, [r1]
movu xm5, [r1+t0]
vinserti128 ym3, [r2+t0], 1
vinserti128 ym5, [r2], 1
vinserti32x4 m3, [r3+2*t0], 2
vinserti32x4 m5, [r3+t1], 2
%if %1 == 4
vinserti32x4 m3, [r4+t1], 3 ; a0 b1 c2 d3
vinserti32x4 m5, [r4+2*t0], 3 ; a1 b0 c3 d2
%endif
%if %%i != %2/4-1
%assign %%j 1
%rep %1
lea r%+%%j, [r%+%%j+4*t0]
%assign %%j %%j+1
%endrep
%endif
psadbw m2, m6
psadbw m4, m7
psadbw m3, m6
psadbw m5, m7
ACCUM paddd, 0, 2, %%i
ACCUM paddd, 1, 3, %%i
paddd m0, m4
paddd m1, m5
%assign %%i %%i+1
%endrep
%if %1 == 4
movifnidn t2, r6mp
%else
movifnidn t2, r5mp
%endif
mov t1d, 0x1111
kmovw k1, t1d
vshufi32x4 m0, m0, q1032
paddd m0, m1
punpckhqdq m1, m0, m0
paddd m0, m1
vpcompressd m0 {k1}{z}, m0
mova [t2], xm0
RET
%endmacro
; t0 = stride, t1 = tmp/stride3, t2 = scores
%if WIN64
%define s1 xmm16 ; xmm6 and xmm7 reduces code size, but
%define s2 xmm17 ; they're callee-saved on win64
DECLARE_REG_TMP 4, 6, 0
%else
%define s1 xmm6
%define s2 xmm7
%if ARCH_X86_64
DECLARE_REG_TMP 4, 6, 5 ; scores is passed in a register on unix64
%else
DECLARE_REG_TMP 4, 5, 0
%endif
%endif
INIT_YMM avx512
SAD_X_W4_AVX512 3, 4 ; x3_4x4
SAD_X_W4_AVX512 3, 8 ; x3_4x8
SAD_X_W8_AVX512 3, 4 ; x3_8x4
SAD_X_W8_AVX512 3, 8 ; x3_8x8
SAD_X_W8_AVX512 3, 16 ; x3_8x16
INIT_ZMM avx512
SAD_X_W16_AVX512 3, 8 ; x3_16x8
SAD_X_W16_AVX512 3, 16 ; x3_16x16
DECLARE_REG_TMP 5, 6, 0
INIT_YMM avx512
SAD_X_W4_AVX512 4, 4 ; x4_4x4
SAD_X_W4_AVX512 4, 8 ; x4_4x8
SAD_X_W8_AVX512 4, 4 ; x4_8x4
SAD_X_W8_AVX512 4, 8 ; x4_8x8
SAD_X_W8_AVX512 4, 16 ; x4_8x16
INIT_ZMM avx512
SAD_X_W16_AVX512 4, 8 ; x4_16x8
SAD_X_W16_AVX512 4, 16 ; x4_16x16
;=============================================================================
; SAD cacheline split
;=============================================================================
@@ -323,6 +323,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endmacro
 
%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))
 
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
@@ -414,10 +416,10 @@ DECLARE_REG 7, rdi, 64
DECLARE_REG 8, rsi, 72
DECLARE_REG 9, rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120
 
%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
@@ -436,15 +438,16 @@ DECLARE_REG 14, R15, 120
 
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
%if xmm_regs_used > 6
%if xmm_regs_used > 6 + high_mm_regs
movaps [rstk + stack_offset + 8], xmm6
%endif
%if xmm_regs_used > 7
%if xmm_regs_used > 7 + high_mm_regs
movaps [rstk + stack_offset + 24], xmm7
%endif
%if xmm_regs_used > 8
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
%assign %%i 8
%rep xmm_regs_used-8
%rep %%xmm_regs_on_stack
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
@@ -453,53 +456,56 @@ DECLARE_REG 14, R15, 120
 
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 8
ASSERT xmm_regs_used <= 16 + high_mm_regs
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
%assign %%pad (xmm_regs_used-8)*16 + 32
%assign %%pad %%xmm_regs_on_stack*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%endif
WIN64_PUSH_XMM
%endmacro
 
%macro WIN64_RESTORE_XMM_INTERNAL 1
%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
%if xmm_regs_used > 8
%assign %%i xmm_regs_used
%rep xmm_regs_used-8
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
%assign %%i xmm_regs_used - high_mm_regs
%rep %%xmm_regs_on_stack
%assign %%i %%i-1
movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
%endif
%if stack_size_padded > 0
%if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
add %1, stack_size_padded
add rsp, stack_size_padded
%assign %%pad_size stack_size_padded
%endif
%endif
%if xmm_regs_used > 7
movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
%if xmm_regs_used > 7 + high_mm_regs
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
%if xmm_regs_used > 6
movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
%if xmm_regs_used > 6 + high_mm_regs
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
 
%macro WIN64_RESTORE_XMM 1
WIN64_RESTORE_XMM_INTERNAL %1
%macro WIN64_RESTORE_XMM 0
WIN64_RESTORE_XMM_INTERNAL
%assign stack_offset (stack_offset-stack_size_padded)
%assign stack_size_padded 0
%assign xmm_regs_used 0
%endmacro
 
%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
 
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -518,14 +524,15 @@ DECLARE_REG 7, R10, 16
DECLARE_REG 8, R11, 24
DECLARE_REG 9, rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72
 
%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
%assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
@@ -535,7 +542,7 @@ DECLARE_REG 14, R15, 72
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
 
%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
 
%macro RET 0
%if stack_size_padded > 0
@@ -546,7 +553,7 @@ DECLARE_REG 14, R15, 72
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -591,7 +598,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
 
%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
 
%macro RET 0
%if stack_size_padded > 0
@@ -602,7 +609,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif
%endif
POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -613,7 +620,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%macro WIN64_RESTORE_XMM 0
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
@@ -624,7 +631,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
%if has_epilogue
%if has_epilogue || cpuflag(ssse3)
RET
%else
rep ret
@@ -712,7 +719,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
@@ -775,24 +782,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
%assign cpuflags_avx (1<<11)| cpuflags_sse42
%assign cpuflags_xop (1<<12)| cpuflags_avx
%assign cpuflags_fma4 (1<<13)| cpuflags_avx
%assign cpuflags_fma3 (1<<14)| cpuflags_avx
%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
%assign cpuflags_slowctz (1<<18)
%assign cpuflags_lzcnt (1<<19)
%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<21)
%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
%assign cpuflags_aesni (1<<12)| cpuflags_sse42
%assign cpuflags_avx (1<<13)| cpuflags_sse42
%assign cpuflags_xop (1<<14)| cpuflags_avx
%assign cpuflags_fma4 (1<<15)| cpuflags_avx
%assign cpuflags_fma3 (1<<16)| cpuflags_avx
%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1
%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_cache32 (1<<21)
%assign cpuflags_cache64 (1<<22)
%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<24)
 
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
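
The cpuflag(x) expression is branch-free preprocessor arithmetic: it evaluates to 1 exactly when every bit of cpuflags_x is set in cpuflags, i.e. (cpuflags & mask) == mask, and to 0 otherwise. The same trick written in C, valid while the flag constants stay below bit 31 (they top out at bit 24 here); has_all_flags is a hypothetical helper for illustration:

    static int has_all_flags( unsigned flags, unsigned mask )
    {
        /* (flags & mask) ^ mask is zero iff every bit of mask is present;
           subtracting 1 then turns that zero test into bit 31 */
        return ((((flags & mask) ^ mask) - 1) >> 31) & 1;
    }
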
@@ -835,7 +843,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 
%if ARCH_X86_64 || cpuflag(sse2)
%ifdef __NASM_VER__
ALIGNMODE k8
ALIGNMODE p6
%else
CPU amdnop
%endif
@@ -848,11 +856,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endif
%endmacro
 
; Merge mmx and sse*
; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
 
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
@@ -862,6 +871,18 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%undef %1%2
%endmacro
 
; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
%if ARCH_X86_64 && cpuflag(avx512)
%assign %%i %1
%rep 16-%1
%assign %%i_high %%i+16
SWAP %%i, %%i_high
%assign %%i %%i+1
%endrep
%endif
%endmacro
%macro INIT_MMX 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX %1
@@ -877,7 +898,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
CAT_XDEFINE nnmm, %%i, %%i
%assign %%i %%i+1
%endrep
%rep 8
%rep 24
CAT_UNDEF m, %%i
CAT_UNDEF nnmm, %%i
%assign %%i %%i+1
@@ -891,7 +912,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 16
%define num_mmregs 8
%if ARCH_X86_64
%define num_mmregs 16
%define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@@ -904,6 +925,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
%if WIN64
; Swap callee-saved registers with volatile registers
AVX512_MM_PERMUTATION 6
%endif
%endmacro
 
%macro INIT_YMM 0-1+
@@ -912,7 +937,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 32
%define num_mmregs 8
%if ARCH_X86_64
%define num_mmregs 16
%define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@@ -925,6 +950,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
AVX512_MM_PERMUTATION
%endmacro
%macro INIT_ZMM 0-1+
%assign avx_enabled 1
%define RESET_MM_PERMUTATION INIT_ZMM %1
%define mmsize 64
%define num_mmregs 8
%if ARCH_X86_64
%define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
%undef movh
%define movnta movntdq
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, zmm %+ %%i
CAT_XDEFINE nnzmm, %%i, %%i
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
AVX512_MM_PERMUTATION
%endmacro
 
INIT_XMM
@@ -933,18 +981,26 @@ INIT_XMM
%define mmmm%1 mm%1
%define mmxmm%1 mm%1
%define mmymm%1 mm%1
%define mmzmm%1 mm%1
%define xmmmm%1 mm%1
%define xmmxmm%1 xmm%1
%define xmmymm%1 xmm%1
%define xmmzmm%1 xmm%1
%define ymmmm%1 mm%1
%define ymmxmm%1 xmm%1
%define ymmymm%1 ymm%1
%define ymmzmm%1 ymm%1
%define zmmmm%1 mm%1
%define zmmxmm%1 xmm%1
%define zmmymm%1 ymm%1
%define zmmzmm%1 zmm%1
%define xm%1 xmm %+ m%1
%define ym%1 ymm %+ m%1
%define zm%1 zmm %+ m%1
%endmacro
 
%assign i 0
%rep 16
%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
@@ -1032,7 +1088,11 @@ INIT_XMM
 
; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
call_internal %1 %+ SUFFIX, %1
%ifid %1
call_internal %1 %+ SUFFIX, %1
%else
call %1
%endif
%endmacro
%macro call_internal 2
%xdefine %%i %2
@@ -1075,12 +1135,17 @@ INIT_XMM
;=============================================================================
 
%assign i 0
%rep 16
%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
CAT_XDEFINE sizeofzmm, i, 64
CAT_XDEFINE regnumofxmm, i, i
CAT_XDEFINE regnumofymm, i, i
CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
@@ -1197,7 +1262,7 @@ INIT_XMM
%endmacro
%endmacro
 
; Instructions with both VEX and non-VEX encodings
; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
@@ -1529,15 +1594,48 @@ FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
 
; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
%ifdef __YASM_VER__
%if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
%macro vpbroadcastq 2
%if sizeof%1 == 16
movddup %1, %2
%else
vbroadcastsd %1, %2
; Macros for converting VEX instructions to equivalent EVEX ones.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
%macro %1 2-7 fnord, fnord, %1, %2, %3
%ifidn %3, fnord
%define %%args %1, %2
%elifidn %4, fnord
%define %%args %1, %2, %3
%else
%define %%args %1, %2, %3, %4
%endif
%assign %%evex_required cpuflag(avx512) & %7
%ifnum regnumof%1
%if regnumof%1 >= 16 || sizeof%1 > 32
%assign %%evex_required 1
%endif
%endmacro
%endif
%endif
%endif
%ifnum regnumof%2
%if regnumof%2 >= 16 || sizeof%2 > 32
%assign %%evex_required 1
%endif
%endif
%if %%evex_required
%6 %%args
%else
%5 %%args ; Prefer VEX over EVEX due to shorter instruction length
%endif
%endmacro
%endmacro
EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128, vextractf32x4
EVEX_INSTR vextracti128, vextracti32x4
EVEX_INSTR vinsertf128, vinsertf32x4
EVEX_INSTR vinserti128, vinserti32x4
EVEX_INSTR vmovdqa, vmovdqa32
EVEX_INSTR vmovdqu, vmovdqu32
EVEX_INSTR vpand, vpandd
EVEX_INSTR vpandn, vpandnd
EVEX_INSTR vpor, vpord
EVEX_INSTR vpxor, vpxord
EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss, vrcp14ss, 1
EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
@@ -303,24 +303,24 @@
%endmacro
 
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
%define %2 xmm%2
vextracti128 %2, %1, 1
%define %1 xmm%1
paddd %1, %2
%if sizeof%1 >= 64
vextracti32x8 ymm%2, zmm%1, 1
paddd ymm%1, ymm%2
%endif
%if mmsize >= 16
MOVHL %2, %1
paddd %1, %2
%if sizeof%1 >= 32
vextracti128 xmm%2, ymm%1, 1
paddd xmm%1, xmm%2
%endif
%if sizeof%1 >= 16
MOVHL xmm%2, xmm%1
paddd xmm%1, xmm%2
%endif
%if cpuflag(xop) && sizeof%1 == 16
vphadddq %1, %1
vphadddq xmm%1, xmm%1
%else
PSHUFLW %2, %1, q0032
paddd %1, %2
PSHUFLW xmm%2, xmm%1, q1032
paddd xmm%1, xmm%2
%endif
%undef %1
%undef %2
%endmacro
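
HADDD now folds the upper half of the register into the lower half at each width (zmm to ymm via vextracti32x8, ymm to xmm via vextracti128, then MOVHL and PSHUFLW within the xmm) until the sum of all 32-bit lanes sits in the low dword. The reduction is just a horizontal add; a scalar sketch of the same halving order (haddd_ref is a hypothetical helper, lanes = mmsize/4):

    #include <stdint.h>

    static int32_t haddd_ref( int32_t v[], int lanes )
    {
        while( lanes > 1 )                /* halve the active width each pass */
        {
            lanes >>= 1;
            for( int i = 0; i < lanes; i++ )
                v[i] += v[i + lanes];     /* fold upper half onto lower half */
        }
        return v[0];                      /* total ends up in the low lane */
    }
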
 
%macro HADDW 2 ; reg, tmp
@@ -34,37 +34,23 @@
 
typedef struct
{
/* 16x16 */
int i_rd16x16;
x264_me_t me16x16;
x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
/* 8x8 */
int i_cost8x8;
/* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
ALIGNED_4( int16_t mvc[32][5][2] );
x264_me_t me8x8[4];
/* Sub 4x4 */
int i_cost4x4[4]; /* cost per 8x8 partition */
x264_me_t me4x4[4][4];
/* Sub 8x4 */
int i_cost8x4[4]; /* cost per 8x8 partition */
x264_me_t me8x4[4][2];
/* Sub 4x8 */
int i_cost4x8[4]; /* cost per 8x8 partition */
x264_me_t me4x8[4][2];
/* 16x8 */
int i_cost16x8;
x264_me_t me16x8[2];
/* 8x16 */
int i_cost8x16;
x264_me_t me8x16[2];
int i_rd16x16;
int i_cost8x8;
int i_cost4x4[4]; /* cost per 8x8 partition */
int i_cost8x4[4]; /* cost per 8x8 partition */
int i_cost4x8[4]; /* cost per 8x8 partition */
int i_cost16x8;
int i_cost8x16;
/* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
ALIGNED_4( int16_t mvc[32][5][2] );
} x264_mb_analysis_list_t;
 
typedef struct
@@ -278,29 +264,31 @@ static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
 
static int init_costs( x264_t *h, float *logs, int qp )
{
int lambda = x264_lambda_tab[qp];
if( h->cost_mv[qp] )
return 0;
int mv_range = h->param.analyse.i_mv_range;
int lambda = x264_lambda_tab[qp];
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
h->cost_mv[qp] += 2*4*2048;
for( int i = 0; i <= 2*4*2048; i++ )
CHECKED_MALLOC( h->cost_mv[qp], (4*4*mv_range + 1) * sizeof(uint16_t) );
h->cost_mv[qp] += 2*4*mv_range;
for( int i = 0; i <= 2*4*mv_range; i++ )
{
h->cost_mv[qp][-i] =
h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
h->cost_mv[qp][i] = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX );
}
x264_pthread_mutex_lock( &cost_ref_mutex );
for( int i = 0; i < 3; i++ )
for( int j = 0; j < 33; j++ )
x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
x264_cost_ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0;
x264_pthread_mutex_unlock( &cost_ref_mutex );
if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
{
for( int j = 0; j < 4; j++ )
{
CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
h->cost_mv_fpel[qp][j] += 2*2048;
for( int i = -2*2048; i < 2*2048; i++ )
CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*mv_range + 1) * sizeof(uint16_t) );
h->cost_mv_fpel[qp][j] += 2*mv_range;
for( int i = -2*mv_range; i < 2*mv_range; i++ )
h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
}
}
@@ -314,12 +302,13 @@ fail:
 
int x264_analyse_init_costs( x264_t *h )
{
float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) );
int mv_range = h->param.analyse.i_mv_range;
float *logs = x264_malloc( (2*4*mv_range+1) * sizeof(float) );
if( !logs )
return -1;
 
logs[0] = 0.718f;
for( int i = 1; i <= 2*4*2048; i++ )
for( int i = 1; i <= 2*4*mv_range; i++ )
logs[i] = log2f( i+1 ) * 2.0f + 1.718f;
 
for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
@@ -338,13 +327,14 @@ fail:
 
void x264_analyse_free_costs( x264_t *h )
{
int mv_range = h->param.analyse.i_mv_range;
for( int i = 0; i < QP_MAX+1; i++ )
{
if( h->cost_mv[i] )
x264_free( h->cost_mv[i] - 2*4*2048 );
x264_free( h->cost_mv[i] - 2*4*mv_range );
if( h->cost_mv_fpel[i][0] )
for( int j = 0; j < 4; j++ )
x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
x264_free( h->cost_mv_fpel[i][j] - 2*mv_range );
}
}
 
@@ -465,11 +455,10 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
 
/* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
h->mb.mv_min_spel[0] = X264_MAX( h->mb.mv_min[0], -i_fmv_range );
h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max[0], i_fmv_range-1 );
if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
{
int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
@@ -513,9 +502,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
mb_y = (h->mb.i_mb_y >> j) + (i == 1);
h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
h->mb.mv_miny_spel_row[i] = X264_MAX( h->mb.mv_miny_row[i], -i_fmv_range );
h->mb.mv_maxy_spel_row[i] = X264_MIN3( h->mb.mv_maxy_row[i], i_fmv_range-1, 4*thread_mvy_range );
h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
}
@@ -524,9 +512,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
h->mb.mv_min_spel[1] = X264_MAX( h->mb.mv_min[1], -i_fmv_range );
h->mb.mv_max_spel[1] = X264_MIN3( h->mb.mv_max[1], i_fmv_range-1, 4*thread_mvy_range );
h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
}
@@ -541,7 +528,6 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
}
#undef CLIP_FMV
 
a->l0.me16x16.cost =
a->l0.i_rd16x16 =
@@ -713,8 +699,12 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
if( !h->mb.i_psy_rd )
return;
/* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO;
M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO;
M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO;
M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO;
h->mb.pic.fenc_hadamard_cache[8] = 0;
if( b_satd )
h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}
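The blanket memzero_aligned call, which deliberately spilled past the end of fenc_hadamard_cache, is replaced by four 16-byte stores plus one scalar store that stay inside the array. A rough equivalent of what those M128 stores accomplish, assuming the cache is nine 8-byte entries (the exact type lives in common/common.h):

#include <stdint.h>
#include <string.h>

/* Zero entries 0..7 with wide stores (each 16-byte store covers two entries)
 * and entry 8 separately, never writing past the end of the array. */
static void zero_hadamard_cache( uint64_t cache[9] )
{
    memset( cache, 0, 8 * sizeof(uint64_t) );  /* stands in for the four M128 stores */
    cache[8] = 0;
}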
@@ -743,8 +733,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
}
a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
+ h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
+ h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
return;
}
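All mbcmp calls in this file are reordered so the encode-side plane (p_fenc, FENC_STRIDE) comes first and the reconstruction (p_fdec, FDEC_STRIDE) second. A hedged sketch of the call shape this assumes; the typedef name below is illustrative rather than the exact one from common/pixel.h:

#include <stdint.h>

typedef uint8_t pixel;                        /* 8-bit-depth assumption */
typedef int (*pixel_cmp_fn)( pixel *pix1, intptr_t stride1,
                             pixel *pix2, intptr_t stride2 );

enum { FENC_STRIDE = 16, FDEC_STRIDE = 32 };  /* x264's fixed MB-cache strides, in pixels */

/* Convention after this change: source block first, prediction/reconstruction second. */
static int satd_fenc_vs_fdec( pixel_cmp_fn cmp, pixel *fenc, pixel *fdec )
{
    return cmp( fenc, FENC_STRIDE, fdec, FDEC_STRIDE );
}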
 
@@ -759,8 +749,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE );
satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
 
for( ; *predict_mode >= 0; predict_mode++ )
{
@@ -788,8 +778,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
}
 
/* we calculate the cost */
i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) +
h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ) +
a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
 
a->i_satd_chroma_dir[i_mode] = i_satd;
@@ -845,7 +835,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
if( a->i_satd_i16x16 <= i16x16_thresh )
{
h->predict_16x16[I_PRED_16x16_P]( p_dst );
a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
}
@@ -862,7 +852,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
else
h->predict_16x16[i_mode]( p_dst );
 
i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ) +
lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
a->i_satd_i16x16_dir[i_mode] = i_satd;
@@ -1065,7 +1055,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
else
h->predict_4x4[i_mode]( p_dst_by );
 
i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_src_by, FENC_STRIDE, p_dst_by, FDEC_STRIDE );
if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
{
i_satd -= lambda * 3;
@@ -1735,7 +1725,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i
static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
pixel **p_fref, int i8x8, int size, int chroma )
{
ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
pixel *pix2 = pix1+8;
int i_stride = h->mb.pic.i_stride[1];
int chroma_h_shift = chroma <= CHROMA_422;
@@ -1919,8 +1909,8 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
 
static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
{
ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] );
ALIGNED_ARRAY_32( pixel, bi, [2],[16*16] );
int i_chroma_cost = 0;
int chromapix = h->luma2chroma_pixel[i_pixel];
 
@@ -2013,8 +2003,8 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
 
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
ALIGNED_ARRAY_32( pixel, pix0,[16*16] );
ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
pixel *src0, *src1;
intptr_t stride0 = 16, stride1 = 16;
int i_ref, i_mvc;
@@ -2147,7 +2137,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
}
else
{
ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] );
ALIGNED_ARRAY_32( pixel, pixuv, [2],[16*FENC_STRIDE] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
int v_shift = CHROMA_V_SHIFT;
 
@@ -2483,7 +2473,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
 
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] );
ALIGNED_4( int16_t mvc[3][2] );
 
h->mb.i_partition = D_16x8;
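The ALIGNED_ARRAY_N declarations throughout these hunks become explicit ALIGNED_ARRAY_32 / ALIGNED_ARRAY_64, so each buffer states the alignment it actually needs (64 bytes where AVX-512 kernels may touch it, 32 bytes otherwise) instead of a build-dependent default. A minimal C11 sketch of what such a declaration boils down to; the real macros in common/osdep.h are compiler-specific:

#include <stdalign.h>
#include <stdint.h>

typedef int16_t dctcoef;   /* 8-bit-depth assumption; high-bit-depth builds use a wider type */

void dct_buffer_example( void )
{
    /* Roughly what ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] ) provides:
     * a stack array whose first element sits on a 64-byte boundary. */
    alignas(64) dctcoef dct8x8[4][64] = {{0}};
    (void)dct8x8;
}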
@@ -801,7 +801,7 @@ void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat
 
static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX
#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_c( h, cb, ctx_block_cat, l );
@@ -915,7 +915,7 @@ void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_
 
static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX
#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l );
@@ -923,7 +923,7 @@ static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t
}
static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX
#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l );
@@ -1057,29 +1057,29 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_
src = dst;
 
#define MUNGE_8x8_NNZ( MUNGE )\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] )\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] && !(h->mb.cbp[h->mb.i_mb_left_xy[0]] & 0x1000) )\
{\
MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x80 )\
MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x80 )\
MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x80 )\
MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x80 )\
MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x80 )\
MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x80 )\
MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x00 )\
MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x00 )\
MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x00 )\
MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x00 )\
MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x00 )\
MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x00 )\
}\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] )\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] && !(h->mb.cbp[h->mb.i_mb_left_xy[1]] & 0x1000) )\
{\
MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x80 )\
MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x80 )\
MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x80 )\
MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x80 )\
MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x80 )\
MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x80 )\
MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x00 )\
MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x00 )\
MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x00 )\
MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x00 )\
MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x00 )\
MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x00 )\
}\
if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] )\
if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] && !(h->mb.cbp[h->mb.i_mb_top_xy] & 0x1000) )\
{\
MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x80808080U )\
MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x80808080U )\
MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x80808080U )\
MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x00000000U )\
MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x00000000U )\
MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x00000000U )\
}
 
MUNGE_8x8_NNZ( BACKUP )
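MUNGE_8x8_NNZ temporarily rewrites the cached non-zero counts of 8x8-transform neighbours around the CABAC trial encode and restores them afterwards; the hunk above swaps the 0x80 filler for 0x00 and additionally skips neighbours whose cbp carries the 0x1000 marker. A hypothetical illustration of the BACKUP/RESTORE pattern the macro is instantiated with (these two helpers are assumptions for illustration, not the exact x264 definitions):

/* Saves the cached value into the backup slot and substitutes the munged
 * value; the RESTORE form writes the backup straight back. Usage brackets
 * the trial encode:  MUNGE_8x8_NNZ( BACKUP ) ... MUNGE_8x8_NNZ( RESTORE ) */
#define BACKUP( bak, cached, munged )  (bak) = (cached), (cached) = (munged);
#define RESTORE( bak, cached, munged ) (cached) = (bak);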
@@ -444,11 +444,6 @@ static int x264_validate_parameters( x264_t *h, int b_open )
fail = 1;
}
#endif
if( !fail && !(cpuflags & X264_CPU_CMOV) )
{
x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n");
fail = 1;
}
if( fail )
{
x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n");
@@ -494,7 +489,8 @@ static int x264_validate_parameters( x264_t *h, int b_open )
#endif
if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
{
x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" );
x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/YUYV/UYVY/"
"I444/YV24/BGR/BGRA/RGB supported)\n" );
return -1;
}
 
@@ -859,6 +855,11 @@ static int x264_validate_parameters( x264_t *h, int b_open )
h->param.analyse.inter &= ~X264_ANALYSE_I8x8;
h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
}
if( i_csp >= X264_CSP_I444 && h->param.b_cabac )
{
/* Disable 8x8dct during 4:4:4+CABAC encoding for compatibility with libavcodec */
h->param.analyse.b_transform_8x8 = 0;
}
if( h->param.rc.i_rc_method == X264_RC_CQP )
{
float qp_p = h->param.rc.i_qp_constant;
@@ -1170,7 +1171,7 @@ static int x264_validate_parameters( x264_t *h, int b_open )
if( h->param.analyse.i_mv_range <= 0 )
h->param.analyse.i_mv_range = l->mv_range >> PARAM_INTERLACED;
else
h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> PARAM_INTERLACED);
h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 8192 >> PARAM_INTERLACED);
}
 
h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART );
@@ -1530,6 +1531,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
x264_rdo_init();
 
/* init CPU functions */
#if (ARCH_X86 || ARCH_X86_64) && HIGH_BIT_DEPTH
/* FIXME: Only 8-bit has been optimized for AVX-512 so far. The few AVX-512 functions
* enabled in high bit-depth are insignificant and just cause potential issues with
* unnecessary thermal throttling and whatnot, so keep it disabled for now. */
h->param.cpu &= ~X264_CPU_AVX512;
#endif
x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
@@ -1566,9 +1573,15 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
&& (h->param.cpu & X264_CPU_SSE42) )
continue;
if( !strcmp(x264_cpu_names[i].name, "LZCNT")
&& (h->param.cpu & X264_CPU_BMI1) )
continue;
if( !strcmp(x264_cpu_names[i].name, "BMI1")
&& (h->param.cpu & X264_CPU_BMI2) )
continue;
if( !strcmp(x264_cpu_names[i].name, "FMA4")
&& (h->param.cpu & X264_CPU_FMA3) )
continue;
if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
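The extra continue statements keep the capability string short by hiding a feature name whenever another flag that implies it is also set (LZCNT when BMI1 is present, BMI1 when BMI2 is present, FMA4 when FMA3 is present, alongside the existing SSE4.1/SSE4.2 case). A generic sketch of that pattern with hypothetical table and field names:

#include <stdio.h>

typedef struct { const char *name; unsigned flags; unsigned implied_by; } cpu_name_sketch;

/* Print each supported feature unless a flag that subsumes it is also present. */
static void print_cpu_names( const cpu_name_sketch *tab, int n, unsigned cpu )
{
    for( int i = 0; i < n; i++ )
        if( (cpu & tab[i].flags) == tab[i].flags && !(cpu & tab[i].implied_by) )
            printf( " %s", tab[i].name );
}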
@@ -1580,14 +1593,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( x264_analyse_init_costs( h ) )
goto fail;
 
static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 };
/* Checks for known miscompilation issues. */
if( h->cost_mv[X264_LOOKAHEAD_QP][2013] != cost_mv_correct[BIT_DEPTH-8] )
{
x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
goto fail;
}
/* Must be volatile or else GCC will optimize it out. */
volatile int temp = 392;
if( x264_clz( temp ) != 23 )
@@ -128,8 +128,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
pixel *p_src = h->mb.pic.p_fenc[p];
pixel *p_dst = h->mb.pic.p_fdec[p];
 
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] );
 
int nz, block_cbp = 0;
int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
@@ -283,13 +283,10 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
{
int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
int ssd[2];
ALIGNED_ARRAY_8( int, ssd,[2] );
int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;
 
int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
if( score < thresh*4 )
score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
if( score < thresh*4 )
if( h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], ssd ) < thresh*4 )
{
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
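The var2 call above reflects a changed helper: the explicit strides are gone (the fixed FENC/FDEC cache strides are implied) and both chroma planes are scored in one call, with the per-plane sums of squared differences returned through the small aligned array. A sketch of the call shape inferred from this site alone, so treat the signature as an assumption:

#include <stdint.h>

typedef uint8_t pixel;

/* Inferred shape: returns the combined variance score and fills ssd[0]/ssd[1]. */
typedef int (*var2_fn)( pixel *fenc, pixel *fdec, int ssd[2] );

static int chroma_can_skip_dc( var2_fn var2, pixel *fenc, pixel *fdec, int thresh )
{
    int ssd[2];
    return var2( fenc, fdec, ssd ) < thresh * 4;
}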
@@ -350,7 +347,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
int i_decimate_score = b_decimate ? 0 : 7;
int nz_ac = 0;
 
ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
 
if( h->mb.b_lossless )
{
@@ -561,9 +558,16 @@ void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_m
pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
 
if( i_mode == I_PRED_4x4_V )
{
h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
memcpy( p_dst, p_dst-FDEC_STRIDE, 4*sizeof(pixel) );
}
else if( i_mode == I_PRED_4x4_H )
{
h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
for( int i = 0; i < 4; i++ )
p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1];
}
else
h->predict_4x4[i_mode]( p_dst );
}
@@ -574,9 +578,16 @@ void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_m
pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
 
if( i_mode == I_PRED_8x8_V )
{
h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
memcpy( p_dst, &edge[16], 8*sizeof(pixel) );
}
else if( i_mode == I_PRED_8x8_H )
{
h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
for( int i = 0; i < 8; i++ )
p_dst[i*FDEC_STRIDE] = edge[14-i];
}
else
h->predict_8x8[i_mode]( p_dst, edge );
}
@@ -584,12 +595,21 @@ void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_m
void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
{
int stride = h->fenc->i_stride[p] << MB_INTERLACED;
pixel *p_dst = h->mb.pic.p_fdec[p];
if( i_mode == I_PRED_16x16_V )
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
{
h->mc.copy[PIXEL_16x16]( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
memcpy( p_dst, p_dst-FDEC_STRIDE, 16*sizeof(pixel) );
}
else if( i_mode == I_PRED_16x16_H )
h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
{
h->mc.copy_16x16_unaligned( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
for( int i = 0; i < 16; i++ )
p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1];
}
else
h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
h->predict_16x16[i_mode]( p_dst );
}
 
/*****************************************************************************
@@ -780,7 +800,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
}
else if( h->mb.b_transform_8x8 )
{
ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] );
b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
 
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
@@ -824,7 +844,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
}
else
{
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
int quant_cat = p ? CQM_4PC : CQM_4PY;
@@ -965,8 +985,8 @@ void x264_macroblock_encode( x264_t *h )
*****************************************************************************/
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
ALIGNED_ARRAY_64( dctcoef, dctscan,[16] );
ALIGNED_4( int16_t mvp[2] );
int i_qp = h->mb.i_qp;
 
@@ -1219,7 +1239,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
int quant_cat = p ? CQM_8PC : CQM_8PY;
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
 
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
@@ -1252,7 +1272,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
int i_decimate_8x8 = b_decimate ? 0 : 4;
ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] );
int nnz8x8 = 0;
 
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
@@ -1311,7 +1331,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
i_qp = h->mb.i_chroma_qp;
for( int ch = 0; ch < 2; ch++ )
{
ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] );
pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
 
@@ -1376,7 +1396,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i
}
else
{
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
@@ -55,6 +55,9 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );
 
void x264_cabac_mb_skip( x264_t *h, int b_skip );
void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
 
int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp,
int ctx_block_cat, int b_intra, int idx );
@@ -113,7 +116,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
 
if( b_predict )
{
@@ -151,7 +154,7 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
 
if( b_predict )
@@ -191,7 +191,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
int omx, omy, pmx, pmy;
pixel *p_fenc = m->p_fenc[0];
pixel *p_fref_w = m->p_fref_w;
ALIGNED_ARRAY_N( pixel, pix,[16*16] );
ALIGNED_ARRAY_32( pixel, pix,[16*16] );
ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
 
ALIGNED_ARRAY_16( int, costs,[16] );
@@ -875,7 +875,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
int chroma_v_shift = CHROMA_V_SHIFT;
int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
 
ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
ALIGNED_ARRAY_32( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
ALIGNED_ARRAY_16( int, costs,[4] );
 
int bmx = m->mv[0];
@@ -1034,9 +1034,9 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] );
ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] );
ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] );
ALIGNED_ARRAY_32( pixel, pixy_buf,[2],[9][16*16] );
ALIGNED_ARRAY_32( pixel, pixu_buf,[2],[9][16*16] );
ALIGNED_ARRAY_32( pixel, pixv_buf,[2],[9][16*16] );
pixel *src[3][2][9];
int chromapix = h->luma2chroma_pixel[i_pixel];
int chroma_v_shift = CHROMA_V_SHIFT;
@@ -1059,7 +1059,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
uint64_t bcostrd = COST_MAX64;
uint16_t amvd;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
ALIGNED_ARRAY_64( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
ALIGNED_4( static const int8_t dia4d[33][4] ) =
{
@@ -32,10 +32,10 @@
 
typedef struct
{
/* aligning the first member is a gcc hack to force the struct to be
* 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
/* aligning the first member is a gcc hack to force the struct to be aligned,
* as well as force sizeof(struct) to be a multiple of the alignment. */
/* input */
ALIGNED_16( int i_pixel ); /* PIXEL_WxH */
ALIGNED_64( int i_pixel ); /* PIXEL_WxH */
uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
int i_ref_cost;
int i_ref;
@@ -53,7 +53,7 @@ typedef struct
int cost_mv; /* lambda * nbits for the chosen mv */
int cost; /* satd + lambda * nbits */
ALIGNED_4( int16_t mv[2] );
} ALIGNED_16( x264_me_t );
} ALIGNED_64( x264_me_t );
 
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
#define x264_me_search( h, m, mvc, i_mvc )\
@@ -66,8 +66,6 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
 
extern uint16_t *x264_cost_mv_fpel[QP_MAX+1][4];
#define COPY1_IF_LT(x,y)\
if( (y) < (x) )\
(x) = (y);
@@ -243,7 +243,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
stride <<= b_field;
if( b_chroma )
{
ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] );
ALIGNED_ARRAY_32( pixel, pix,[FENC_STRIDE*16] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
int shift = 7 - CHROMA_V_SHIFT;
 
@@ -420,7 +420,7 @@ static int x264_macroblock_tree_rescale_init( x264_t *h, x264_ratecontrol_t *rc
float dstdim[2] = { h->param.i_width / 16.f, h->param.i_height / 16.f};
int srcdimi[2] = {ceil(srcdim[0]), ceil(srcdim[1])};
int dstdimi[2] = {ceil(dstdim[0]), ceil(dstdim[1])};
if( PARAM_INTERLACED )
if( h->param.b_interlaced || h->param.b_fake_interlaced )
{
srcdimi[1] = (srcdimi[1]+1)&~1;
dstdimi[1] = (dstdimi[1]+1)&~1;
@@ -1469,7 +1469,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
if( h->i_frame == 0 )
{
//384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR
double fr = 1. / 172;
double fr = 1. / (h->param.i_level_idc >= 60 ? 300 : 172);
int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height;
rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr;
}
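The only functional change here is fR, which drops to 1/300 for the new level 6.x entries, matching the spec table. For orientation, a worked instance of the formula under assumed settings (8-bit, 1920x1088, level 4.1: MaxMBPS 245760, MinCR 2):

#include <stdio.h>

int main( void )
{
    int bit_depth       = 8;
    int pic_size_in_mbs = (1920/16) * (1088/16);        /* 120 * 68 = 8160 */
    double fr           = 1.0 / 172;                    /* 1.0 / 300 from level 6.0 up */
    double mbs          = pic_size_in_mbs > fr*245760 ? pic_size_in_mbs : fr*245760;
    printf( "frame_size_maximum = %.0f bits\n", 384.0 * bit_depth * mbs / 2 );  /* 12533760 */
    return 0;
}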
@@ -58,8 +58,6 @@ int x264_ratecontrol_qp( x264_t * );
int x264_ratecontrol_mb_qp( x264_t *h );
int x264_ratecontrol_end( x264_t *, int bits, int *filler );
void x264_ratecontrol_summary( x264_t * );
void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
int x264_ratecontrol_get_estimated_size( x264_t const *);
int x264_rc_analyse_slice( x264_t *h );
void x264_threads_distribute_ratecontrol( x264_t *h );
void x264_threads_merge_ratecontrol( x264_t *h );
@@ -64,9 +64,8 @@ static uint16_t cabac_size_5ones[128];
#include "cabac.c"
 
#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) - (CHROMA444 ? 0 : (1024+12)-460) )
#define COPY_CABAC_PART( pos, size )\
memcpy( &cb->state[pos], &h->cabac.state[pos], size )
sizeof(int) + (CHROMA444 ? 1024+12 : 460) )
#define COPY_CABAC_PART( pos, size ) memcpy( &cb->state[pos], &h->cabac.state[pos], size )
 
static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y )
{
@@ -634,8 +633,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
int b_chroma, int dc, int num_coefs, int idx )
{
ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
ALIGNED_ARRAY_64( dctcoef, orig_coefs, [64] );
ALIGNED_ARRAY_64( dctcoef, quant_coefs, [64] );
const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const int b_interlaced = MB_INTERLACED;
@@ -695,7 +694,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
return !!dct[0];
}
 
#if HAVE_MMX && ARCH_X86_64
#if HAVE_MMX && ARCH_X86_64 && !defined( __MACH__ )
#define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\
cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8)
if( num_coefs == 16 && !dc )
@@ -783,23 +783,26 @@ int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len )
 
const x264_level_t x264_levels[] =
{
{ 10, 1485, 99, 396, 64, 175, 64, 64, 0, 2, 0, 0, 1 },
{ 9, 1485, 99, 396, 128, 350, 64, 64, 0, 2, 0, 0, 1 }, /* "1b" */
{ 11, 3000, 396, 900, 192, 500, 128, 64, 0, 2, 0, 0, 1 },
{ 12, 6000, 396, 2376, 384, 1000, 128, 64, 0, 2, 0, 0, 1 },
{ 13, 11880, 396, 2376, 768, 2000, 128, 64, 0, 2, 0, 0, 1 },
{ 20, 11880, 396, 2376, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 },
{ 21, 19800, 792, 4752, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
{ 22, 20250, 1620, 8100, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
{ 30, 40500, 1620, 8100, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 },
{ 31, 108000, 3600, 18000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 },
{ 32, 216000, 5120, 20480, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 },
{ 40, 245760, 8192, 32768, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 },
{ 41, 245760, 8192, 32768, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 },
{ 42, 522240, 8704, 34816, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 },
{ 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
{ 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
{ 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
{ 10, 1485, 99, 396, 64, 175, 64, 64, 0, 2, 0, 0, 1 },
{ 9, 1485, 99, 396, 128, 350, 64, 64, 0, 2, 0, 0, 1 }, /* "1b" */
{ 11, 3000, 396, 900, 192, 500, 128, 64, 0, 2, 0, 0, 1 },
{ 12, 6000, 396, 2376, 384, 1000, 128, 64, 0, 2, 0, 0, 1 },
{ 13, 11880, 396, 2376, 768, 2000, 128, 64, 0, 2, 0, 0, 1 },
{ 20, 11880, 396, 2376, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 },
{ 21, 19800, 792, 4752, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
{ 22, 20250, 1620, 8100, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
{ 30, 40500, 1620, 8100, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 },
{ 31, 108000, 3600, 18000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 },
{ 32, 216000, 5120, 20480, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 },
{ 40, 245760, 8192, 32768, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 },
{ 41, 245760, 8192, 32768, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 },
{ 42, 522240, 8704, 34816, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 },
{ 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
{ 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
{ 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
{ 60, 4177920, 139264, 696320, 240000, 240000, 8192, 16, 24, 2, 1, 1, 1 },
{ 61, 8355840, 139264, 696320, 480000, 480000, 8192, 16, 24, 2, 1, 1, 1 },
{ 62, 16711680, 139264, 696320, 800000, 800000, 8192, 16, 24, 2, 1, 1, 1 },
{ 0 }
};
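The table gains the level 6.0/6.1/6.2 rows and is re-padded so the wider MaxMBPS/MaxFS/MaxDpbMbs values stay aligned. For readability, the columns correspond to the fields of x264_level_t roughly as sketched below (field names follow x264.h; the integer types are paraphrased):

typedef struct
{
    int level_idc;    /* level number * 10; the value 9 encodes level "1b" */
    int mbps;         /* MaxMBPS: macroblocks per second */
    int frame_size;   /* MaxFS: macroblocks per frame */
    int dpb;          /* MaxDpbMbs: decoded picture buffer size, in macroblocks */
    int bitrate;      /* MaxBR, kbit/s */
    int cpb;          /* MaxCPB, kbit */
    int mv_range;     /* MaxVmvR: vertical MV range, in full pels */
    int mvs_per_2mb;  /* MaxMvsPer2Mb: motion vectors per two consecutive MBs */
    int slice_rate;   /* SliceRate */
    int mincr;        /* MinCR: minimum compression ratio */
    int bipred8x8;    /* restrict bi-prediction to partitions >= 8x8 */
    int direct8x8;    /* restrict direct modes to >= 8x8 */
    int frame_only;   /* interlaced coding not allowed */
} x264_level_row_sketch;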
 
@@ -267,7 +267,7 @@ static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t
int i_lines = fenc->i_lines[p];
int i_width = fenc->i_width[p];
pixel *src = fenc->plane[p];
ALIGNED_ARRAY_16( pixel, buf, [16*16] );
ALIGNED_ARRAY_64( pixel, buf, [16*16] );
int pixoff = 0;
if( w )
{
@@ -544,17 +544,18 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
if( p0 == p1 )
goto lowres_intra_mb;
 
int mv_range = 2 * h->param.analyse.i_mv_range;
// no need for h->mb.mv_min[]
h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4;
h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4;
h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 );
h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 );
h->mb.mv_min_spel[0] = X264_MAX( 4*(-8*h->mb.i_mb_x - 12), -mv_range );
h->mb.mv_max_spel[0] = X264_MIN( 4*(8*(h->mb.i_mb_width - h->mb.i_mb_x - 1) + 12), mv_range-1 );
h->mb.mv_limit_fpel[0][0] = h->mb.mv_min_spel[0] >> 2;
h->mb.mv_limit_fpel[1][0] = h->mb.mv_max_spel[0] >> 2;
if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 )
{
h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4;
h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4;
h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 );
h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 );
h->mb.mv_min_spel[1] = X264_MAX( 4*(-8*h->mb.i_mb_y - 12), -mv_range );
h->mb.mv_max_spel[1] = X264_MIN( 4*(8*( h->mb.i_mb_height - h->mb.i_mb_y - 1) + 12), mv_range-1 );
h->mb.mv_limit_fpel[0][1] = h->mb.mv_min_spel[1] >> 2;
h->mb.mv_limit_fpel[1][1] = h->mb.mv_max_spel[1] >> 2;
}
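The lookahead runs on a half-resolution image, so the configured i_mv_range (full pels at full resolution) corresponds to i_mv_range/2 lowres pels, i.e. 2*i_mv_range in lowres quarter-pel units, which is the mv_range bound applied above. A tiny sketch of the unit conversion; the helper name is illustrative:

/* i_mv_range is given in full-resolution luma pixels; lowres frames are
 * downscaled by 2 and MVs are stored in quarter-pel units. */
static inline int lowres_qpel_mv_range( int i_mv_range )
{
    return 4 * (i_mv_range / 2);   /* == 2 * i_mv_range for even ranges */
}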
 
#define LOAD_HPELS_LUMA(dst, src) \
@@ -728,13 +729,13 @@ lowres_intra_mb:
if( h->param.analyse.i_subpel_refine > 1 )
{
h->predict_8x8c[I_PRED_CHROMA_P]( pix );
int satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
int satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
for( int i = 3; i < 9; i++ )
{
h->predict_8x8[i]( pix, edge );
satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
}
}
@@ -154,10 +154,12 @@ static int convert_csp_to_pix_fmt( int csp )
case X264_CSP_RGB: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_RGB48 : AV_PIX_FMT_RGB24;
case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGR48 : AV_PIX_FMT_BGR24;
case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA;
/* the next csp has no equivalent 16bit depth in swscale */
/* the following has no equivalent 16-bit depth in swscale */
case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12;
case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21;
/* the next csp is no supported by swscale at all */
case X264_CSP_YUYV: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_YUYV422;
case X264_CSP_UYVY: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_UYVY422;
/* the following is not supported by swscale at all */
case X264_CSP_NV16:
default: return AV_PIX_FMT_NONE;
}