Commit d8de3cd6 authored by Leo Ma

Upgrade libx264


Signed-off-by: Leo Ma <begeekmyfriend@gmail.com>
parent 0984dd59
Showing 302 additions and 213 deletions
@@ -278,7 +278,8 @@ clean:
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
 
distclean: clean
rm -f config.mak x264_config.h config.h config.log x264.pc x264.def conftest*
rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
rm -rf conftest*
 
install-cli: cli
$(INSTALL) -d $(DESTDIR)$(bindir)
@@ -569,57 +569,65 @@ endfunc
 
.macro pixel_var2_8 h
function x264_pixel_var2_8x\h\()_neon, export=1
ld1 {v16.8b}, [x0], x1
ld1 {v18.8b}, [x2], x3
ld1 {v17.8b}, [x0], x1
ld1 {v19.8b}, [x2], x3
mov x5, \h - 4
usubl v6.8h, v16.8b, v18.8b
usubl v7.8h, v17.8b, v19.8b
ld1 {v16.8b}, [x0], x1
ld1 {v18.8b}, [x2], x3
smull v2.4s, v6.4h, v6.4h
smull2 v3.4s, v6.8h, v6.8h
add v0.8h, v6.8h, v7.8h
smlal v2.4s, v7.4h, v7.4h
smlal2 v3.4s, v7.8h, v7.8h
mov x3, #16
ld1 {v16.8b}, [x0], #8
ld1 {v18.8b}, [x1], x3
ld1 {v17.8b}, [x0], #8
ld1 {v19.8b}, [x1], x3
mov x5, \h - 2
usubl v0.8h, v16.8b, v18.8b
usubl v1.8h, v17.8b, v19.8b
ld1 {v16.8b}, [x0], #8
ld1 {v18.8b}, [x1], x3
smull v2.4s, v0.4h, v0.4h
smull2 v3.4s, v0.8h, v0.8h
smull v4.4s, v1.4h, v1.4h
smull2 v5.4s, v1.8h, v1.8h
 
usubl v6.8h, v16.8b, v18.8b
 
1: subs x5, x5, #2
ld1 {v17.8b}, [x0], x1
ld1 {v19.8b}, [x2], x3
1: subs x5, x5, #1
ld1 {v17.8b}, [x0], #8
ld1 {v19.8b}, [x1], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
ld1 {v16.8b}, [x0], x1
ld1 {v18.8b}, [x2], x3
smlal v2.4s, v7.4h, v7.4h
smlal2 v3.4s, v7.8h, v7.8h
ld1 {v16.8b}, [x0], #8
ld1 {v18.8b}, [x1], x3
smlal v4.4s, v7.4h, v7.4h
smlal2 v5.4s, v7.8h, v7.8h
usubl v6.8h, v16.8b, v18.8b
add v0.8h, v0.8h, v7.8h
add v1.8h, v1.8h, v7.8h
b.gt 1b
 
ld1 {v17.8b}, [x0], x1
ld1 {v19.8b}, [x2], x3
ld1 {v17.8b}, [x0], #8
ld1 {v19.8b}, [x1], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
smlal v2.4s, v7.4h, v7.4h
add v0.8h, v0.8h, v7.8h
smlal2 v3.4s, v7.8h, v7.8h
smlal v4.4s, v7.4h, v7.4h
add v1.8h, v1.8h, v7.8h
smlal2 v5.4s, v7.8h, v7.8h
 
saddlv s0, v0.8h
saddlv s1, v1.8h
add v2.4s, v2.4s, v3.4s
add v4.4s, v4.4s, v5.4s
mov w0, v0.s[0]
addv s1, v2.4s
sxtw x0, w0
mov w1, v1.s[0]
mul x0, x0, x0
str w1, [x4]
sub x0, x1, x0, lsr # 6 + (\h >> 4)
addv s2, v2.4s
addv s4, v4.4s
mul w0, w0, w0
mul w1, w1, w1
mov w3, v2.s[0]
mov w4, v4.s[0]
sub w0, w3, w0, lsr # 6 + (\h >> 4)
sub w1, w4, w1, lsr # 6 + (\h >> 4)
str w3, [x2]
add w0, w0, w1
str w4, [x2, #4]
 
ret
endfunc
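
The rewritten var2 kernel above drops the stride arguments: it now assumes the encoder's fixed FENC_STRIDE/FDEC_STRIDE layouts, processes the two 8-wide chroma halves in one pass, writes both sums of squared differences through the pointer in x2, and returns the combined variance. A rough C sketch of the 8x8 case under that interface (the actual reference implementation lives in common/pixel.c and may differ in detail):

    #include <stdint.h>

    #define FENC_STRIDE 16
    #define FDEC_STRIDE 32

    /* Sketch of what x264_pixel_var2_8x8_neon appears to compute: the variance
     * of the prediction error for two side-by-side 8x8 chroma blocks (U at
     * fenc+0 / fdec+0, V at fenc+8 / fdec+16), with the per-block SSDs returned
     * via ssd[]. The 8x16 variant uses h = 16 and a shift of 7 instead of 6. */
    static int var2_8x8_ref( const uint8_t *fenc, const uint8_t *fdec, int ssd[2] )
    {
        int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0;
        for( int y = 0; y < 8; y++ )
        {
            for( int x = 0; x < 8; x++ )
            {
                int du = fenc[x]   - fdec[x];
                int dv = fenc[x+8] - fdec[x+16];
                sum_u += du; sqr_u += du * du;
                sum_v += dv; sqr_v += dv * dv;
            }
            fenc += FENC_STRIDE;
            fdec += FDEC_STRIDE;
        }
        ssd[0] = sqr_u;
        ssd[1] = sqr_v;
        return sqr_u - (sum_u * sum_u >> 6) + sqr_v - (sum_v * sum_v >> 6);
    }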
@@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
 
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
@@ -28,15 +28,10 @@
 
.syntax unified
 
#if HAVE_NEON
.arch armv7-a
#elif HAVE_ARMV6T2
.arch armv6t2
#elif HAVE_ARMV6
.arch armv6
#endif
#ifndef __APPLE__
.arch armv7-a
.fpu neon
#endif
 
#ifdef PREFIX
# define EXTERN_ASM _
@@ -50,6 +45,14 @@
# define ELF @
#endif
 
#ifdef __MACH__
# define MACH
# define NONMACH @
#else
# define MACH @
# define NONMACH
#endif
#if HAVE_AS_FUNC
# define FUNC
#else
@@ -76,6 +79,7 @@ ELF .size \name, . - \name
FUNC .endfunc
.purgem endfunc
.endm
.text
.align 2
.if \export == 1
.global EXTERN_ASM\name
@@ -99,7 +103,8 @@ ELF .size \name, . - \name
.if HAVE_SECTION_DATA_REL_RO && \relocate
.section .data.rel.ro
.else
.section .rodata
NONMACH .section .rodata
MACH .const_data
.endif
.align \align
\name:
@@ -26,14 +26,12 @@
 
#include "asm.S"
 
.section .rodata
.align 4
scan4x4_frame:
const scan4x4_frame, align=4
.byte 0,1, 8,9, 2,3, 4,5
.byte 2,3, 8,9, 16,17, 10,11
.byte 12,13, 6,7, 14,15, 20,21
.byte 10,11, 12,13, 6,7, 14,15
endconst
 
.text
 
@@ -28,10 +28,9 @@
 
#include "asm.S"
 
.section .rodata
.align 4
pw_0to15:
const pw_0to15, align=4
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
endconst
 
.text
 
@@ -140,7 +139,7 @@ MEMCPY_ALIGNED 16, 8
MEMCPY_ALIGNED 8, 16
MEMCPY_ALIGNED 8, 8
 
const memcpy_table align=2, relocate=1
const memcpy_table, align=2, relocate=1
.word memcpy_aligned_16_16_neon
.word memcpy_aligned_16_8_neon
.word memcpy_aligned_8_16_neon
@@ -26,9 +26,7 @@
 
#include "asm.S"
 
.section .rodata
.align 4
const mask_array, align=4
.rept 16
.byte 0xff
.endr
@@ -36,11 +34,14 @@ mask_ff:
.rept 16
.byte 0
.endr
endconst
 
mask_ac4:
const mask_ac4, align=4
.short 0, -1, -1, -1, 0, -1, -1, -1
mask_ac8:
endconst
const mask_ac8, align=4
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
 
.text
 
@@ -718,13 +719,24 @@ function x264_var_end, export=0
bx lr
endfunc
 
.macro DIFF_SUM diff da db lastdiff
vld1.64 {\da}, [r0,:64], r1
vld1.64 {\db}, [r2,:64], r3
.ifnb \lastdiff
vadd.s16 q0, q0, \lastdiff
.macro DIFF_SUM diff1 diff2 da1 db1 da2 db2 lastdiff1 lastdiff2 acc1 acc2
vld1.64 {\da1}, [r0,:64]!
vld1.64 {\db1}, [r1,:64], r3
.ifnb \lastdiff1
vadd.s16 \acc1, \acc1, \lastdiff1
vadd.s16 \acc2, \acc2, \lastdiff2
.endif
vsubl.u8 \diff, \da, \db
vld1.64 {\da2}, [r0,:64]!
vld1.64 {\db2}, [r1,:64], r3
vsubl.u8 \diff1, \da1, \db1
vsubl.u8 \diff2, \da2, \db2
.endm
.macro SQR_ACC_DOUBLE acc1 acc2 d0 d1 d2 d3 vmlal=vmlal.s16
\vmlal \acc1, \d0, \d0
vmlal.s16 \acc1, \d1, \d1
\vmlal \acc2, \d2, \d2
vmlal.s16 \acc2, \d3, \d3
.endm
 
.macro SQR_ACC acc d0 d1 vmlal=vmlal.s16
@@ -733,77 +745,89 @@ endfunc
.endm
 
function x264_pixel_var2_8x8_neon
DIFF_SUM q0, d0, d1
DIFF_SUM q8, d16, d17
SQR_ACC q1, d0, d1, vmull.s16
DIFF_SUM q9, d18, d19, q8
SQR_ACC q2, d16, d17, vmull.s16
mov r3, #16
DIFF_SUM q0, q10, d0, d1, d20, d21
DIFF_SUM q8, q11, d16, d17, d22, d23
SQR_ACC_DOUBLE q1, q13, d0, d1, d20, d21, vmull.s16
DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10
SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23, vmull.s16
.rept 2
DIFF_SUM q8, d16, d17, q9
SQR_ACC q1, d18, d19
DIFF_SUM q9, d18, d19, q8
SQR_ACC q2, d16, d17
DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10
SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25
DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10
SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23
.endr
DIFF_SUM q8, d16, d17, q9
SQR_ACC q1, d18, d19
DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10
SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25
vadd.s16 q0, q0, q8
SQR_ACC q2, d16, d17
vadd.s16 q10, q10, q11
SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23
 
ldr ip, [sp]
vadd.s16 d0, d0, d1
vadd.s16 d20, d20, d21
vadd.s32 q1, q1, q2
vadd.s32 q13, q13, q14
vpaddl.s16 d0, d0
vpaddl.s16 d20, d20
vadd.s32 d1, d2, d3
vpadd.s32 d0, d0, d1
vadd.s32 d26, d26, d27
vpadd.s32 d0, d0, d20 @ sum
vpadd.s32 d1, d1, d26 @ sqr
vmul.s32 d0, d0, d0 @ sum*sum
vshr.s32 d0, d0, #6
vsub.s32 d0, d1, d0
vpadd.s32 d0, d0, d0
 
vmov r0, r1, d0
vst1.32 {d0[1]}, [ip,:32]
mul r0, r0, r0
sub r0, r1, r0, lsr #6
vst1.32 {d1}, [r2,:64]
bx lr
endfunc
 
function x264_pixel_var2_8x16_neon
vld1.64 {d16}, [r0,:64], r1
vld1.64 {d17}, [r2,:64], r3
vld1.64 {d18}, [r0,:64], r1
vld1.64 {d19}, [r2,:64], r3
vsubl.u8 q10, d16, d17
vsubl.u8 q11, d18, d19
SQR_ACC q1, d20, d21, vmull.s16
vld1.64 {d16}, [r0,:64], r1
vadd.s16 q0, q10, q11
vld1.64 {d17}, [r2,:64], r3
SQR_ACC q2, d22, d23, vmull.s16
mov ip, #14
1: subs ip, ip, #2
vld1.64 {d18}, [r0,:64], r1
mov r3, #16
vld1.64 {d16}, [r0,:64]!
vld1.64 {d17}, [r1,:64], r3
vld1.64 {d18}, [r0,:64]!
vld1.64 {d19}, [r1,:64], r3
vsubl.u8 q0, d16, d17
vsubl.u8 q3, d18, d19
SQR_ACC q1, d0, d1, vmull.s16
vld1.64 {d16}, [r0,:64]!
mov ip, #15
vld1.64 {d17}, [r1,:64], r3
SQR_ACC q2, d6, d7, vmull.s16
1: subs ip, ip, #1
vld1.64 {d18}, [r0,:64]!
vsubl.u8 q10, d16, d17
vld1.64 {d19}, [r2,:64], r3
vld1.64 {d19}, [r1,:64], r3
vadd.s16 q0, q0, q10
SQR_ACC q1, d20, d21
vsubl.u8 q11, d18, d19
beq 2f
vld1.64 {d16}, [r0,:64], r1
vadd.s16 q0, q0, q11
vld1.64 {d17}, [r2,:64], r3
vld1.64 {d16}, [r0,:64]!
vadd.s16 q3, q3, q11
vld1.64 {d17}, [r1,:64], r3
SQR_ACC q2, d22, d23
b 1b
2:
vadd.s16 q0, q0, q11
vadd.s16 q3, q3, q11
SQR_ACC q2, d22, d23
 
ldr ip, [sp]
vadd.s16 d0, d0, d1
vadd.s32 q1, q1, q2
vadd.s16 d6, d6, d7
vpaddl.s16 d0, d0
vadd.s32 d1, d2, d3
vpadd.s32 d0, d0, d1
vpaddl.s16 d6, d6
vadd.s32 d2, d2, d3
vadd.s32 d4, d4, d5
vpadd.s32 d0, d0, d6 @ sum
vpadd.s32 d2, d2, d4 @ sqr
vmul.s32 d0, d0, d0 @ sum*sum
vshr.s32 d0, d0, #7
vsub.s32 d0, d2, d0
vpadd.s32 d0, d0, d0
 
vmov r0, r1, d0
vst1.32 {d0[1]}, [ip,:32]
mul r0, r0, r0
sub r0, r1, r0, lsr #7
vst1.32 {d2}, [r2,:64]
bx lr
endfunc
 
@@ -63,8 +63,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
 
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
@@ -27,10 +27,9 @@
 
#include "asm.S"
 
.section .rodata
.align 4
p16weight: .short 1,2,3,4,5,6,7,8
const p16weight, align=4
.short 1,2,3,4,5,6,7,8
endconst
 
.text
 
@@ -26,19 +26,20 @@
 
#include "asm.S"
 
.section .rodata
.align 4
pmovmskb_byte:
const pmovmskb_byte, align=4
.byte 1,2,4,8,16,32,64,128
.byte 1,2,4,8,16,32,64,128
endconst
 
mask_2bit:
const mask_2bit, align=4
.byte 3,12,48,192,3,12,48,192
.byte 3,12,48,192,3,12,48,192
endconst
 
mask_1bit:
const mask_1bit, align=4
.byte 128,64,32,16,8,4,2,1
.byte 128,64,32,16,8,4,2,1
endconst
 
.text
 
@@ -43,16 +43,19 @@ uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
 
@@ -116,7 +119,7 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
 
pf->nal_escape = x264_nal_escape_c;
#if HAVE_MMX
#if ARCH_X86_64
#if ARCH_X86_64 && !defined( __MACH__ )
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
@@ -126,18 +129,17 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
pf->nal_escape = x264_nal_escape_mmx2;
if( cpu&X264_CPU_SSE2 )
{
#if ARCH_X86_64
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt;
}
#endif
if( cpu&X264_CPU_SSE2_IS_FAST )
pf->nal_escape = x264_nal_escape_sse2;
}
#if ARCH_X86_64
#if ARCH_X86_64 && !defined( __MACH__ )
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt;
}
if( cpu&X264_CPU_SSSE3 )
{
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
@@ -152,8 +154,14 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
if( cpu&X264_CPU_AVX2 )
{
pf->nal_escape = x264_nal_escape_avx2;
if( cpu&X264_CPU_BMI2 )
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2;
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512;
}
#endif
#endif
@@ -42,7 +42,7 @@ typedef struct
uint8_t *p_end;
 
/* aligned for memcpy_aligned starting here */
ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
 
/* context */
uint8_t state[1024];
@@ -669,7 +669,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
{
if( !strcmp(value, "1b") )
p->i_level_idc = 9;
else if( atof(value) < 6 )
else if( atof(value) < 7 )
p->i_level_idc = (int)(10*atof(value)+.5);
else
p->i_level_idc = atoi(value);
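
Context for the threshold change above: values below the cutoff take the decimal path (ten times the level, rounded), so raising the cutoff from 6 to 7 presumably lets the newer 6.x levels be given as decimals rather than raw level_idc numbers. A quick check of the arithmetic, assuming the surrounding code as shown:

    "6.2" -> (int)(10*6.2 + .5) = 62
    "6"   -> (int)(10*6.0 + .5) = 60
    "62"  -> atoi("62")         = 62   (values >= 7 are treated as raw level_idc)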
@@ -1143,6 +1143,8 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
[X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, },
[X264_CSP_YUYV] = { 1, { 256*2 }, { 256*1 }, },
[X264_CSP_UYVY] = { 1, { 256*2 }, { 256*1 }, },
[X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_BGR] = { 1, { 256*3 }, { 256*1 }, },
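
The two packed 4:2:2 entries added above follow the table's apparent convention of { plane count, width scale, height scale } in 8.8 fixed point: one plane, twice the picture width, full height. A hypothetical sizing check under that assumption (values and layout inferred, not taken from the source):

    #include <stdio.h>

    int main( void )
    {
        int width = 1280, height = 720;                  /* example picture */
        int stride = width * (256*2) / 256;              /* 2560 bytes/row  */
        int size   = stride * (height * (256*1) / 256);  /* 1843200 bytes   */
        printf( "YUYV %dx%d: stride %d, plane size %d\n", width, height, stride, size );
        return 0;
    }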
@@ -635,11 +635,11 @@ struct x264_t
/* Current MB DCT coeffs */
struct
{
ALIGNED_N( dctcoef luma16x16_dc[3][16] );
ALIGNED_64( dctcoef luma16x16_dc[3][16] );
ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
ALIGNED_N( dctcoef luma8x8[12][64] );
ALIGNED_N( dctcoef luma4x4[16*3][16] );
ALIGNED_64( dctcoef luma8x8[12][64] );
ALIGNED_64( dctcoef luma4x4[16*3][16] );
} dct;
 
/* MB table and cache for current frame/mb */
@@ -729,7 +729,7 @@ struct x264_t
int8_t *type; /* mb type */
uint8_t *partition; /* mb partition */
int8_t *qp; /* mb qp */
int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc (all set for PCM)*/
int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x200 and 0x400: chroma dc, 0x1000 PCM (all set for PCM) */
int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
/* actually has only 7 entries; set to 8 for write-combining optimizations */
uint8_t (*non_zero_count)[16*3]; /* nzc. for I_PCM set to 16 */
@@ -740,8 +740,7 @@ struct x264_t
int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
uint16_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of
* NOTE: this will fail on resolutions above 2^16 MBs... */
uint32_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of */
uint8_t *field;
 
/* buffer for weighted versions of the reference frames */
@@ -778,26 +777,27 @@ struct x264_t
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] );
ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
 
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
ALIGNED_16( pixel i8x8_fdec_buf[16*16] );
ALIGNED_16( dctcoef i8x8_dct_buf[3][64] );
ALIGNED_16( dctcoef i4x4_dct_buf[15][16] );
ALIGNED_64( dctcoef i8x8_dct_buf[3][64] );
ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
uint32_t i8x8_nnz_buf[4];
int i4x4_cbp;
int i8x8_cbp;
 
/* Psy trellis DCT data */
ALIGNED_16( dctcoef fenc_dct8[4][64] );
ALIGNED_16( dctcoef fenc_dct4[16][16] );
 
/* Psy RD SATD/SA8D scores cache */
ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
ALIGNED_N( uint32_t fenc_satd_cache[32] );
ALIGNED_64( uint32_t fenc_satd_cache[32] );
ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
int i4x4_cbp;
int i8x8_cbp;
 
/* pointer over mb of the frame to be compressed */
pixel *p_fenc[3]; /* y,u,v */
@@ -822,10 +822,10 @@ struct x264_t
struct
{
/* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
 
/* i_non_zero_count if available else 0x80 */
ALIGNED_16( uint8_t non_zero_count[X264_SCAN8_SIZE] );
/* i_non_zero_count if available else 0x80. intentionally misaligned by 8 for asm */
ALIGNED_8( uint8_t non_zero_count[X264_SCAN8_SIZE] );
 
/* -1 if unused, -2 if unavailable */
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
@@ -930,8 +930,8 @@ struct x264_t
uint32_t (*nr_residual_sum)[64];
uint32_t *nr_count;
 
ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
uint32_t nr_count_buf[2][4];
 
uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
@@ -47,8 +47,7 @@ const x264_cpu_name_t x264_cpu_names[] =
{
#if HAVE_MMX
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2
{"MMX2", MMX2},
{"MMXEXT", MMX2},
{"SSE", MMX2|X264_CPU_SSE},
@@ -56,6 +55,7 @@ const x264_cpu_name_t x264_cpu_names[] =
{"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
{"SSE2", SSE2},
{"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
{"LZCNT", SSE2|X264_CPU_LZCNT},
{"SSE3", SSE2|X264_CPU_SSE3},
{"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
@@ -66,16 +66,17 @@ const x264_cpu_name_t x264_cpu_names[] =
{"XOP", AVX|X264_CPU_XOP},
{"FMA4", AVX|X264_CPU_FMA4},
{"FMA3", AVX|X264_CPU_FMA3},
{"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2},
{"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1},
{"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
{"AVX2", AVX2},
{"AVX512", AVX2|X264_CPU_AVX512},
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"LZCNT", X264_CPU_LZCNT},
{"BMI1", X264_CPU_BMI1},
{"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
{"SlowCTZ", X264_CPU_SLOW_CTZ},
{"SlowAtom", X264_CPU_SLOW_ATOM},
{"SlowPshufb", X264_CPU_SLOW_PSHUFB},
{"SlowPalignr", X264_CPU_SLOW_PALIGNR},
@@ -118,7 +119,7 @@ static void sigill_handler( int sig )
#if HAVE_MMX
int x264_cpu_cpuid_test( void );
void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );
uint64_t x264_cpu_xgetbv( int xcr );
 
uint32_t x264_cpu_detect( void )
{
@@ -126,15 +127,14 @@ uint32_t x264_cpu_detect( void )
uint32_t eax, ebx, ecx, edx;
uint32_t vendor[4] = {0};
uint32_t max_extended_cap, max_basic_cap;
int cache;
uint64_t xcr0 = 0;
 
#if !ARCH_X86_64
if( !x264_cpu_cpuid_test() )
return 0;
#endif
 
x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
max_basic_cap = eax;
x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
if( max_basic_cap == 0 )
return 0;
 
@@ -145,28 +145,24 @@ uint32_t x264_cpu_detect( void )
return cpu;
if( edx&0x02000000 )
cpu |= X264_CPU_MMX2|X264_CPU_SSE;
if( edx&0x00008000 )
cpu |= X264_CPU_CMOV;
else
return cpu;
if( edx&0x04000000 )
cpu |= X264_CPU_SSE2;
if( ecx&0x00000001 )
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
cpu |= X264_CPU_SSSE3;
cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
if( ecx&0x00100000 )
cpu |= X264_CPU_SSE42;
/* Check OXSAVE and AVX bits */
if( (ecx&0x18000000) == 0x18000000 )
if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
{
/* Check for OS support */
x264_cpu_xgetbv( 0, &eax, &edx );
if( (eax&0x6) == 0x6 )
xcr0 = x264_cpu_xgetbv( 0 );
if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
{
cpu |= X264_CPU_AVX;
if( ecx&0x10000000 )
cpu |= X264_CPU_AVX;
if( ecx&0x00001000 )
cpu |= X264_CPU_FMA3;
}
@@ -175,20 +171,25 @@ uint32_t x264_cpu_detect( void )
if( max_basic_cap >= 7 )
{
x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
/* AVX2 requires OS support, but BMI1/2 don't. */
if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
cpu |= X264_CPU_AVX2;
if( ebx&0x00000008 )
{
cpu |= X264_CPU_BMI1;
if( ebx&0x00000100 )
cpu |= X264_CPU_BMI2;
if( ebx&0x00000100 )
cpu |= X264_CPU_BMI2;
if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
{
if( ebx&0x00000020 )
cpu |= X264_CPU_AVX2;
if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
{
if( (ebx&0xD0030000) == 0xD0030000 )
cpu |= X264_CPU_AVX512;
}
}
}
 
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
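
Taken together, the detection changes above gate every AVX-family flag on what the OS actually saves via XSAVE: XCR0 bits 1-2 (XMM/YMM state) for AVX/FMA3/AVX2, plus XCR0 bits 5-7 (opmask/ZMM state) and the CPUID.7:EBX mask 0xD0030000 (AVX512 F/DQ/CD/BW/VL) for AVX-512. A condensed C sketch of that gating, with placeholder flag values rather than the real X264_CPU_* constants:

    #include <stdint.h>

    enum { AVX = 1, FMA3 = 2, AVX2 = 4, AVX512 = 8 };   /* placeholders */

    /* ecx1 = CPUID.1:ECX, ebx7 = CPUID.7:EBX, xcr0 = XGETBV(0); xcr0 is only
     * read when CPUID.1:ECX bit 27 (OSXSAVE) is set, otherwise it stays 0. */
    static uint32_t avx_flags( uint32_t ecx1, uint32_t ebx7, uint64_t xcr0 )
    {
        uint32_t cpu = 0;
        if( (xcr0 & 0x6) != 0x6 )                 /* OS must save XMM/YMM */
            return 0;
        if( ecx1 & 0x10000000 ) cpu |= AVX;
        if( ecx1 & 0x00001000 ) cpu |= FMA3;
        if( ebx7 & 0x00000020 ) cpu |= AVX2;
        if( (xcr0 & 0xE0) == 0xE0 &&              /* OS must save opmask/ZMM */
            (ebx7 & 0xD0030000) == 0xD0030000 )   /* AVX512 F/DQ/CD/BW/VL    */
            cpu |= AVX512;
        return cpu;
    }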
 
@@ -228,8 +229,6 @@ uint32_t x264_cpu_detect( void )
{
if( edx&0x00400000 )
cpu |= X264_CPU_MMX2;
if( !(cpu&X264_CPU_LZCNT) )
cpu |= X264_CPU_SLOW_CTZ;
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
}
@@ -254,7 +253,6 @@ uint32_t x264_cpu_detect( void )
else if( model == 28 )
{
cpu |= X264_CPU_SLOW_ATOM;
cpu |= X264_CPU_SLOW_CTZ;
cpu |= X264_CPU_SLOW_PSHUFB;
}
/* Conroe has a slow shuffle unit. Check the model number to make sure not
@@ -268,7 +266,7 @@ uint32_t x264_cpu_detect( void )
{
/* cacheline size is specified in 3 places, any of which may be missing */
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
cache = (ebx&0xff00)>>5; // cflush size
int cache = (ebx&0xff00)>>5; // cflush size
if( !cache && max_extended_cap >= 0x80000006 )
{
x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
@@ -56,7 +56,7 @@ void x264_cpu_sfence( void );
* alignment between functions (osdep.h handles manual alignment of arrays
* if it doesn't).
*/
#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX
#if HAVE_MMX && (STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4))
intptr_t x264_stack_align( void (*func)(), ... );
#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
#else
@@ -65,7 +65,7 @@ intptr_t x264_stack_align( void (*func)(), ... );
 
typedef struct
{
const char name[16];
const char *name;
uint32_t flags;
} x264_cpu_name_t;
extern const x264_cpu_name_t x264_cpu_names[];
@@ -711,6 +711,16 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
#endif
}
if( cpu&X264_CPU_AVX512 )
{
dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512;
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512;
dctf->add8x8_idct = x264_add8x8_idct_avx512;
}
#endif //HAVE_MMX
 
#if HAVE_ALTIVEC
@@ -986,6 +996,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
}
#endif // ARCH_X86_64
if( cpu&X264_CPU_AVX512 )
{
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512;
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
}
#endif // HAVE_MMX
#else
#if HAVE_MMX
@@ -1026,6 +1043,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
}
if( cpu&X264_CPU_AVX512 )
{
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512;
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
@@ -1068,6 +1092,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
}
if( cpu&X264_CPU_AVX512 )
{
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
}
#else
if( cpu&X264_CPU_MMX )
{
@@ -1091,6 +1120,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
}
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH
@@ -75,7 +75,6 @@ typedef struct
} x264_zigzag_function_t;
 
void x264_dct_init( int cpu, x264_dct_function_t *dctf );
void x264_dct_init_weights( void );
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced );
 
#endif
@@ -676,21 +676,21 @@ void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, i
void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_avx512( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
 
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
@@ -803,7 +803,6 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
#if !HIGH_BIT_DEPTH
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
pf->deblock_strength = x264_deblock_strength_mmx2;
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
@@ -852,6 +851,10 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
{
pf->deblock_strength = x264_deblock_strength_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->deblock_strength = x264_deblock_strength_avx512;
}
}
#endif
 
@@ -54,6 +54,8 @@ static int x264_frame_internal_csp( int external_csp )
case X264_CSP_NV16:
case X264_CSP_I422:
case X264_CSP_YV16:
case X264_CSP_YUYV:
case X264_CSP_UYVY:
case X264_CSP_V210:
return X264_CSP_NV16;
case X264_CSP_I444:
@@ -76,7 +78,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
int i_padv = PADV << PARAM_INTERLACED;
int align = 16;
#if ARCH_X86 || ARCH_X86_64
if( h->param.cpu&X264_CPU_CACHELINE_64 )
if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
align = 64;
else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
align = 32;
@@ -221,11 +223,13 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
 
/* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */
prealloc_size += NATIVE_ALIGN;
}
if( h->param.rc.i_aq_mode )
{
@@ -408,7 +412,13 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 
uint8_t *pix[3];
int stride[3];
if( i_csp == X264_CSP_V210 )
if( i_csp == X264_CSP_YUYV || i_csp == X264_CSP_UYVY )
{
int p = i_csp == X264_CSP_UYVY;
h->mc.plane_copy_deinterleave_yuyv( dst->plane[p], dst->i_stride[p], dst->plane[p^1], dst->i_stride[p^1],
(pixel*)src->img.plane[0], src->img.i_stride[0], h->param.i_width, h->param.i_height );
}
else if( i_csp == X264_CSP_V210 )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
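
The new YUYV/UYVY branch above hands the packed buffer to plane_copy_deinterleave_yuyv, with p swapping the two destination planes so that luma always lands in plane 0; the result is the NV16 layout that x264_frame_internal_csp maps these formats to earlier in this commit. A scalar sketch of such a deinterleave, with illustrative parameter names (the real routine is an x264 mc function pointer; this version is only a reference for the data movement):

    #include <stdint.h>

    static void deinterleave_yuyv( uint8_t *dsta, intptr_t stridea,
                                   uint8_t *dstb, intptr_t strideb,
                                   const uint8_t *src, intptr_t src_stride,
                                   int width, int height )
    {
        for( int y = 0; y < height; y++ )
        {
            for( int x = 0; x < width; x++ )
            {
                dsta[x] = src[2*x+0];   /* even bytes: Y for YUYV, UV for UYVY */
                dstb[x] = src[2*x+1];   /* odd bytes:  UV for YUYV, Y for UYVY */
            }
            dsta += stridea;
            dstb += strideb;
            src  += src_stride;
        }
    }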