FFmpeg
mlpdsp_init.c
Go to the documentation of this file.
1 /*
2  * MLP DSP functions x86-optimized
3  * Copyright (c) 2009 Ramiro Polla
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <stdint.h>
23 #include "config.h"
24 #include "libavutil/attributes.h"
25 #include "libavutil/cpu.h"
26 #include "libavutil/macros.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/mlpdsp.h"
30 #include "libavcodec/mlp.h"
31 
/*
 * Declare the prototype of one externally implemented rematrix routine,
 * ff_mlp_rematrix_channel_<opt>.  The expansion already ends in ';', so
 * invocations are written without a trailing semicolon.
 */
#define REMATRIX_CHANNEL_FUNC(opt) \
void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
                                   const int32_t *coeffs, \
                                   const uint8_t *bypassed_lsbs, \
                                   const int8_t *noise_buffer, \
                                   int index, \
                                   unsigned int dest_ch, \
                                   uint16_t blockpos, \
                                   unsigned int maxchan, \
                                   int matrix_noise_shift, \
                                   int access_unit_size_pow2, \
                                   int32_t mask);

/* Both variants are assigned to MLPDSPContext.mlp_rematrix_channel by the
 * init function in this file, so both need a visible declaration. */
REMATRIX_CHANNEL_FUNC(sse4)
REMATRIX_CHANNEL_FUNC(avx2_bmi2)

47 
48 #if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
49 
/*
 * Entry labels inside the filter loop's inline asm.  Each symbol is emitted
 * as a label by the FIRMUL/FIRMULREG/IIRMUL fragments (or directly via
 * LABEL_MANGLE) in mlp_filter_channel_x86; only their addresses are used.
 */
extern char ff_mlp_firorder_0;
extern char ff_mlp_firorder_1;
extern char ff_mlp_firorder_2;
extern char ff_mlp_firorder_3;
extern char ff_mlp_firorder_4;
extern char ff_mlp_firorder_5;
extern char ff_mlp_firorder_6;
extern char ff_mlp_firorder_7;
extern char ff_mlp_firorder_8;

extern char ff_mlp_iirorder_0;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_3;
extern char ff_mlp_iirorder_4;

/* Jump target for a given FIR order: firtable[n] is the label for order n. */
static const void * const firtable[9] = {
    [0] = &ff_mlp_firorder_0,
    [1] = &ff_mlp_firorder_1,
    [2] = &ff_mlp_firorder_2,
    [3] = &ff_mlp_firorder_3,
    [4] = &ff_mlp_firorder_4,
    [5] = &ff_mlp_firorder_5,
    [6] = &ff_mlp_firorder_6,
    [7] = &ff_mlp_firorder_7,
    [8] = &ff_mlp_firorder_8,
};

/* Jump target for a given IIR order: iirtable[n] is the label for order n. */
static const void * const iirtable[5] = {
    [0] = &ff_mlp_iirorder_0,
    [1] = &ff_mlp_iirorder_1,
    [2] = &ff_mlp_iirorder_2,
    [3] = &ff_mlp_iirorder_3,
    [4] = &ff_mlp_iirorder_4,
};
74 
/*
 * Architecture-specific asm text fragments used to build the filter loop.
 * In all of them %0 and %1 are asm operands used as base addresses for the
 * two multiplicands.
 */
75 #if ARCH_X86_64
76 
/*
 * x86-64: emit `label`, load the 32-bit values at offset+offs(%0) and
 * offset+offc(%1) sign-extended into %rax and %rdx, multiply them and add
 * the product to %rsi.  offset/offs/offc must already be string literals.
 */
77 #define MLPMUL(label, offset, offs, offc) \
78  LABEL_MANGLE(label)": \n\t" \
79  "movslq "offset"+"offs"(%0), %%rax\n\t" \
80  "movslq "offset"+"offc"(%1), %%rdx\n\t" \
81  "imul %%rdx, %%rax\n\t" \
82  "add %%rax, %%rsi\n\t"
83 
/*
 * x86-64: like MLPMUL, but the second multiplicand is asm operand number
 * `firc` instead of a load through %1.  `offset` is stringified here.
 */
84 #define FIRMULREG(label, offset, firc)\
85  LABEL_MANGLE(label)": \n\t" \
86  "movslq "#offset"(%0), %%rax\n\t" \
87  "imul %"#firc", %%rax\n\t" \
88  "add %%rax, %%rsi\n\t"
89 
/* x86-64: zero %rsi, the register the multiply fragments add into. */
90 #define CLEAR_ACCUM \
91  "xor %%rsi, %%rsi\n\t"
92 
/* x86-64: logical right shift of %rsi by the count in %cl. */
93 #define SHIFT_ACCUM \
94  "shr %%cl, %%rsi\n\t"
95 
/*
 * Register names used by the filter loop after SHIFT_ACCUM: RESULT holds
 * the shifted sum, RESULT32 is the 32-bit register used for stores, and
 * ACCUM receives a copy of RESULT before the input sample is added.
 */
96 #define ACCUM "%%rdx"
97 #define RESULT "%%rsi"
98 #define RESULT32 "%%esi"
99 
100 #else /* if ARCH_X86_32 */
101 
/*
 * x86-32: emit `label`, load offset+offs(%0) into %eax, multiply it by the
 * memory operand offset+offc(%1) with one-operand imull (product in
 * %edx:%eax) and add the product into %ecx:%esi with add/adc.
 */
102 #define MLPMUL(label, offset, offs, offc) \
103  LABEL_MANGLE(label)": \n\t" \
104  "mov "offset"+"offs"(%0), %%eax\n\t" \
105  "imull "offset"+"offc"(%1) \n\t" \
106  "add %%eax , %%esi\n\t" \
107  "adc %%edx , %%ecx\n\t"
108 
/* x86-32: `firc` is ignored; this is a plain MLPMUL with zero extra offsets. */
109 #define FIRMULREG(label, offset, firc) \
110  MLPMUL(label, #offset, "0", "0")
111 
/* x86-32: zero both %esi and %ecx, the registers MLPMUL adds into. */
112 #define CLEAR_ACCUM \
113  "xor %%esi, %%esi\n\t" \
114  "xor %%ecx, %%ecx\n\t"
115 
/*
 * x86-32: copy %ecx:%esi into %edx:%eax, load the low byte of asm operand 7
 * into %ecx as the shift count, then shrd so %eax holds the shifted low
 * word.  The final line-continuation backslash joins the blank line that
 * follows, so that blank line must stay blank.
 */
116 #define SHIFT_ACCUM \
117  "mov %%ecx, %%edx\n\t" \
118  "mov %%esi, %%eax\n\t" \
119  "movzbl %7 , %%ecx\n\t" \
120  "shrd %%cl, %%edx, %%eax\n\t" \
121 
/* x86-32 counterparts of the register names; RESULT and RESULT32 are both %eax. */
122 #define ACCUM "%%edx"
123 #define RESULT "%%eax"
124 #define RESULT32 "%%eax"
125 
126 #endif /* !ARCH_X86_64 */
127 
/*
 * Byte quantities stringified so they can be pasted into the asm text; the
 * filter loop uses BINC as the per-iteration increment of operand %2 and
 * IOFFS as a displacement from operand %0.  The multiplications are left
 * for the assembler to evaluate.
 */
128 #define BINC AV_STRINGIFY(4* MAX_CHANNELS)
129 #define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
130 #define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
131 
/*
 * One multiply-accumulate step at `label`: FIRMUL reads both multiplicands
 * at `offset` from %0 and %1; IIRMUL additionally displaces the %0 access
 * by IOFFS and the %1 access by IOFFC.
 */
132 #define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
133 #define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
134 
135 static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
136  int firorder, int iirorder,
137  unsigned int filter_shift, int32_t mask,
138  int blocksize, int32_t *sample_buffer)
139 {
140  const void *firjump = firtable[firorder];
141  const void *iirjump = iirtable[iirorder];
142 
143  blocksize = -blocksize;
144 
145  __asm__ volatile(
146  "1: \n\t"
147  CLEAR_ACCUM
148  "jmp *%5 \n\t"
149  FIRMUL (ff_mlp_firorder_8, 0x1c )
150  FIRMUL (ff_mlp_firorder_7, 0x18 )
151  FIRMUL (ff_mlp_firorder_6, 0x14 )
152  FIRMUL (ff_mlp_firorder_5, 0x10 )
153  FIRMUL (ff_mlp_firorder_4, 0x0c )
154  FIRMUL (ff_mlp_firorder_3, 0x08 )
155  FIRMUL (ff_mlp_firorder_2, 0x04 )
156  FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
157  LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
158  "jmp *%6 \n\t"
159  IIRMUL (ff_mlp_iirorder_4, 0x0c )
160  IIRMUL (ff_mlp_iirorder_3, 0x08 )
161  IIRMUL (ff_mlp_iirorder_2, 0x04 )
162  IIRMUL (ff_mlp_iirorder_1, 0x00 )
163  LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
164  SHIFT_ACCUM
165  "mov "RESULT" ,"ACCUM" \n\t"
166  "add (%2) ,"RESULT" \n\t"
167  "and %4 ,"RESULT" \n\t"
168  "sub $4 , %0 \n\t"
169  "mov "RESULT32", (%0) \n\t"
170  "mov "RESULT32", (%2) \n\t"
171  "add $"BINC" , %2 \n\t"
172  "sub "ACCUM" ,"RESULT" \n\t"
173  "mov "RESULT32","IOFFS"(%0) \n\t"
174  "incl %3 \n\t"
175  "js 1b \n\t"
176  : /* 0*/"+r"(state),
177  /* 1*/"+r"(coeff),
178  /* 2*/"+r"(sample_buffer),
179 #if ARCH_X86_64
180  /* 3*/"+r"(blocksize)
181  : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
182  /* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
183  , /* 8*/"r"((int64_t)coeff[0])
184  : "rax", "rdx", "rsi"
185 #else /* ARCH_X86_32 */
186  /* 3*/"+m"(blocksize)
187  : /* 4*/"m"( mask), /* 5*/"m"(firjump),
188  /* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
189  : "eax", "edx", "esi", "ecx"
190 #endif /* !ARCH_X86_64 */
191  );
192 }
193 
194 #endif /* HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS */
195 
197 {
198  int cpu_flags = av_get_cpu_flags();
199 #if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
200  if (INLINE_MMX(cpu_flags))
201  c->mlp_filter_channel = mlp_filter_channel_x86;
202 #endif
203  if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
204  c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
205  if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
206  c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
207 }
INLINE_MMX
#define INLINE_MMX(flags)
Definition: cpu.h:87
cpu.h
r
const char * r
Definition: vf_curves.c:116
sub
static float sub(float src0, float src1)
Definition: dnn_backend_native_layer_mathbinary.c:31
LABEL_MANGLE
#define LABEL_MANGLE(a)
Definition: asm.h:103
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:79
b
#define b
Definition: input.c:34
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:101
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:52
REMATRIX_CHANNEL_FUNC
#define REMATRIX_CHANNEL_FUNC(opt)
Definition: mlpdsp_init.c:32
macros.h
av_cold
#define av_cold
Definition: attributes.h:90
mask
static const uint16_t mask[17]
Definition: lzw.c:38
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
cpu.h
mlpdsp.h
state
static struct @327 state
asm.h
attributes.h
ff_mlpdsp_init_x86
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
Definition: mlpdsp_init.c:196
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
ACCUM
#define ACCUM(k, x, d)
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:68
add
static float add(float src0, float src1)
Definition: dnn_backend_native_layer_mathbinary.c:35
AV_CPU_FLAG_BMI2
#define AV_CPU_FLAG_BMI2
Bit Manipulation Instruction Set 2.
Definition: cpu.h:55
mlp.h
x86_reg
int x86_reg
Definition: asm.h:72
int32_t
int32_t
Definition: audioconvert.c:56
coeff
static const double coeff[2][5]
Definition: vf_owdenoise.c:78
MLPDSPContext
Definition: mlpdsp.h:49