FFmpeg
celp_filters_mips.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2012
3  * MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author: Nedeljko Babic (nbabic@mips.com)
30  *
31  * various filters for CELP-based codecs optimized for MIPS
32  *
33  * This file is part of FFmpeg.
34  *
35  * FFmpeg is free software; you can redistribute it and/or
36  * modify it under the terms of the GNU Lesser General Public
37  * License as published by the Free Software Foundation; either
38  * version 2.1 of the License, or (at your option) any later version.
39  *
40  * FFmpeg is distributed in the hope that it will be useful,
41  * but WITHOUT ANY WARRANTY; without even the implied warranty of
42  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
43  * Lesser General Public License for more details.
44  *
45  * You should have received a copy of the GNU Lesser General Public
46  * License along with FFmpeg; if not, write to the Free Software
47  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
48  */
49 
50 /**
51  * @file
52  * Reference: libavcodec/celp_filters.c
53  */
54 #include "config.h"
55 #include "libavutil/attributes.h"
56 #include "libavutil/common.h"
58 #include "libavutil/mips/asmdefs.h"
59 
60 #if HAVE_INLINE_ASM
61 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
/**
 * LP synthesis filter on floats, written with MIPS FPU inline assembly.
 *
 * Each output sample is the matching input sample minus a weighted sum of
 * previously produced output samples:
 *     out[n] = in[n] - sum(filter_coeffs[i-1] * out[n-i]), i = 1..filter_length
 * (this is literally what the scalar tail loop at the end computes; the
 * unrolled main loop produces four samples per iteration).
 *
 * @param out           output buffer; samples before out[0] are read as
 *                      filter history (out[-4]..out[-1] are read
 *                      unconditionally, older ones by the inner loops)
 * @param filter_coeffs filter coefficients; indices 0..3 are read
 *                      unconditionally
 * @param in            input samples
 * @param buffer_length number of samples to produce
 * @param filter_length number of filter coefficients
 *
 * NOTE(review): the unrolled path reads filter_coeffs[0..3] without checking
 * filter_length and consumes the remaining coefficients two at a time, so it
 * appears to assume an even filter_length of at least 4 -- confirm against
 * the callers.
 */
62 static void ff_celp_lp_synthesis_filterf_mips(float *out,
63  const float *filter_coeffs,
64  const float* in, int buffer_length,
65  int filter_length)
66 {
67  int i,n;
68 
69  float out0, out1, out2, out3;
70  float old_out0, old_out1, old_out2, old_out3;
71  float a,b,c;
72  const float *p_filter_coeffs;
73  float *p_out;
74 
 /* a, b, c are derived once from the first three coefficients; they are
  * used after the inner loop to account for the dependency of each output
  * in a group of four on the earlier outputs of the same group. */
75  a = filter_coeffs[0];
76  b = filter_coeffs[1];
77  c = filter_coeffs[2];
78  b -= filter_coeffs[0] * filter_coeffs[0];
79  c -= filter_coeffs[1] * filter_coeffs[0];
80  c -= filter_coeffs[0] * b;
81 
 /* Seed the history registers with the four samples preceding out[0]. */
82  old_out0 = out[-4];
83  old_out1 = out[-3];
84  old_out2 = out[-2];
85  old_out3 = out[-1];
 /* Main loop: four output samples per iteration. */
86  for (n = 0; n <= buffer_length - 4; n+=4) {
87  p_filter_coeffs = filter_coeffs;
88  p_out = out;
89 
90  out0 = in[0];
91  out1 = in[1];
92  out2 = in[2];
93  out3 = in[3];
94 
 /* Load filter_coeffs[0..3] into $f0..$f3 and subtract their products
  * with the four history samples from out0..out3.
  * nmsub.s d, r, s, t leaves r - s * t in d.
  * NOTE(review): "$f4" is in the clobber list although this statement
  * never references it. */
95  __asm__ volatile(
96  "lwc1 $f2, 8(%[filter_coeffs]) \n\t"
97  "lwc1 $f1, 4(%[filter_coeffs]) \n\t"
98  "lwc1 $f0, 0(%[filter_coeffs]) \n\t"
99  "nmsub.s %[out0], %[out0], $f2, %[old_out1] \n\t"
100  "nmsub.s %[out1], %[out1], $f2, %[old_out2] \n\t"
101  "nmsub.s %[out2], %[out2], $f2, %[old_out3] \n\t"
102  "lwc1 $f3, 12(%[filter_coeffs]) \n\t"
103  "nmsub.s %[out0], %[out0], $f1, %[old_out2] \n\t"
104  "nmsub.s %[out1], %[out1], $f1, %[old_out3] \n\t"
105  "nmsub.s %[out2], %[out2], $f3, %[old_out2] \n\t"
106  "nmsub.s %[out0], %[out0], $f0, %[old_out3] \n\t"
107  "nmsub.s %[out3], %[out3], $f3, %[old_out3] \n\t"
108  "nmsub.s %[out1], %[out1], $f3, %[old_out1] \n\t"
109  "nmsub.s %[out0], %[out0], $f3, %[old_out0] \n\t"
110 
111  : [out0]"+f"(out0), [out1]"+f"(out1),
112  [out2]"+f"(out2), [out3]"+f"(out3)
113  : [old_out0]"f"(old_out0), [old_out1]"f"(old_out1),
114  [old_out2]"f"(old_out2), [old_out3]"f"(old_out3),
115  [filter_coeffs]"r"(filter_coeffs)
116  : "$f0", "$f1", "$f2", "$f3", "$f4", "memory"
117  );
118 
 /* Remaining coefficients, two per iteration: each pass loads two more
  * coefficients ($f5, $f4) and two older output samples from memory
  * below p_out, then moves p_filter_coeffs forward and p_out backward
  * by two floats. */
119  for (i = 5; i <= filter_length; i += 2) {
120  __asm__ volatile(
121  "lwc1 %[old_out3], -20(%[p_out]) \n\t"
122  "lwc1 $f5, 16(%[p_filter_coeffs]) \n\t"
123  PTR_ADDIU "%[p_out], -8 \n\t"
124  PTR_ADDIU "%[p_filter_coeffs], 8 \n\t"
125  "nmsub.s %[out1], %[out1], $f5, %[old_out0] \n\t"
126  "nmsub.s %[out3], %[out3], $f5, %[old_out2] \n\t"
127  "lwc1 $f4, 12(%[p_filter_coeffs]) \n\t"
128  "lwc1 %[old_out2], -16(%[p_out]) \n\t"
129  "nmsub.s %[out0], %[out0], $f5, %[old_out3] \n\t"
130  "nmsub.s %[out2], %[out2], $f5, %[old_out1] \n\t"
131  "nmsub.s %[out1], %[out1], $f4, %[old_out3] \n\t"
132  "nmsub.s %[out3], %[out3], $f4, %[old_out1] \n\t"
133  "mov.s %[old_out1], %[old_out3] \n\t"
134  "nmsub.s %[out0], %[out0], $f4, %[old_out2] \n\t"
135  "nmsub.s %[out2], %[out2], $f4, %[old_out0] \n\t"
136 
137  : [out0]"+f"(out0), [out1]"+f"(out1),
138  [out2]"+f"(out2), [out3]"+f"(out3), [old_out0]"+f"(old_out0),
139  [old_out1]"+f"(old_out1), [old_out2]"+f"(old_out2),
140  [old_out3]"+f"(old_out3),[p_filter_coeffs]"+r"(p_filter_coeffs),
141  [p_out]"+r"(p_out)
142  :
143  : "$f4", "$f5", "memory"
144  );
 /* Exchange the two history values so the register roles match what
  * the next pass of the asm above expects. */
145  FFSWAP(float, old_out0, old_out2);
146  }
147 
 /* Fold in the contribution of the samples produced in this group of
  * four using the precomputed a, b, c. The instruction order matters:
  * out3 and out2 consume out2/out1 before those are updated. */
148  __asm__ volatile(
149  "nmsub.s %[out3], %[out3], %[a], %[out2] \n\t"
150  "nmsub.s %[out2], %[out2], %[a], %[out1] \n\t"
151  "nmsub.s %[out3], %[out3], %[b], %[out1] \n\t"
152  "nmsub.s %[out1], %[out1], %[a], %[out0] \n\t"
153  "nmsub.s %[out2], %[out2], %[b], %[out0] \n\t"
154  "nmsub.s %[out3], %[out3], %[c], %[out0] \n\t"
155 
156  : [out0]"+f"(out0), [out1]"+f"(out1),
157  [out2]"+f"(out2), [out3]"+f"(out3)
158  : [a]"f"(a), [b]"f"(b), [c]"f"(c)
159  );
160 
161  out[0] = out0;
162  out[1] = out1;
163  out[2] = out2;
164  out[3] = out3;
165 
 /* The samples just written become the history for the next group. */
166  old_out0 = out0;
167  old_out1 = out1;
168  old_out2 = out2;
169  old_out3 = out3;
170 
171  out += 4;
172  in += 4;
173  }
174 
 /* Undo the pointer advances of the main loop so that the tail loop can
  * index both buffers with n directly. */
175  out -= n;
176  in -= n;
 /* Tail: remaining (buffer_length % 4) samples, one at a time, walking all
  * filter_length coefficients forward and the output history backward. */
177  for (; n < buffer_length; n++) {
178  float out_val, out_val_i, fc_val;
179  p_filter_coeffs = filter_coeffs;
180  p_out = &out[n];
181  out_val = in[n];
182  for (i = 1; i <= filter_length; i++) {
183  __asm__ volatile(
184  "lwc1 %[fc_val], 0(%[p_filter_coeffs]) \n\t"
185  "lwc1 %[out_val_i], -4(%[p_out]) \n\t"
186  PTR_ADDIU "%[p_filter_coeffs], 4 \n\t"
187  PTR_ADDIU "%[p_out], -4 \n\t"
188  "nmsub.s %[out_val], %[out_val], %[fc_val], %[out_val_i] \n\t"
189 
190  : [fc_val]"=&f"(fc_val), [out_val]"+f"(out_val),
191  [out_val_i]"=&f"(out_val_i), [p_out]"+r"(p_out),
192  [p_filter_coeffs]"+r"(p_filter_coeffs)
193  :
194  : "memory"
195  );
196  }
197  out[n] = out_val;
198  }
199 }
200 
/**
 * LP zero-synthesis filter on floats, written with MIPS FPU inline assembly.
 *
 * Each output sample is the matching input sample plus a weighted sum of
 * earlier input samples: the asm loop accumulates
 * filter_coeffs[j] * in[n + k - 1 - j] into the accumulator for out[n + k],
 * taking two consecutive values of j per pass.
 *
 * @param out           output buffer
 * @param filter_coeffs filter coefficients
 * @param in            input samples; samples before in[0] are read as
 *                      filter history
 * @param buffer_length number of samples to produce
 * @param filter_length number of filter coefficients
 *
 * NOTE(review): samples are always read and written in whole groups of
 * eight (in[n]..in[n+7], out[n]..out[n+7]) and there is no tail loop, so
 * this appears to require buffer_length to be a multiple of 8 -- confirm
 * against the callers.
 * NOTE(review): coefficients are consumed in pairs and the asm loop body
 * runs at least once, so an even, positive filter_length appears to be
 * assumed -- confirm against the callers.
 */
201 static void ff_celp_lp_zero_synthesis_filterf_mips(float *out,
202  const float *filter_coeffs,
203  const float *in, int buffer_length,
204  int filter_length)
205 {
206  int i,n;
207  float sum_out8, sum_out7, sum_out6, sum_out5, sum_out4, fc_val;
208  float sum_out3, sum_out2, sum_out1;
209  const float *p_filter_coeffs, *p_in;
210 
211  for (n = 0; n < buffer_length; n+=8) {
212  p_in = &in[n];
213  p_filter_coeffs = filter_coeffs;
 /* Each accumulator starts from its own input sample;
  * sum_out1 corresponds to out[n], sum_out8 to out[n+7]. */
214  sum_out8 = in[n+7];
215  sum_out7 = in[n+6];
216  sum_out6 = in[n+5];
217  sum_out5 = in[n+4];
218  sum_out4 = in[n+3];
219  sum_out3 = in[n+2];
220  sum_out2 = in[n+1];
221  sum_out1 = in[n];
 /* i counts the coefficients still to be applied; the asm loop below
  * decrements it by 2 per pass and repeats while it is positive. */
222  i = filter_length;
223 
224  /* i is always greater than 0
225  * outer loop is unrolled eight times so there is less memory access
226  * inner loop is unrolled two times
227  */
 /* First half of a pass: load nine consecutive inputs in[-1..7]
  * relative to p_in into $f0..$f7 and apply the coefficient at
  * p_filter_coeffs[0]. Second half: reuse $f0..$f6 shifted by one
  * sample, load the next older input into $f7, and apply
  * p_filter_coeffs[1]. Both pointers then move by two floats
  * (coefficients forward, input backward). The "%=" suffix makes the
  * label unique per asm instance. */
228  __asm__ volatile(
229  "filt_lp_inner%=: \n\t"
230  "lwc1 %[fc_val], 0(%[p_filter_coeffs]) \n\t"
231  "lwc1 $f7, 6*4(%[p_in]) \n\t"
232  "lwc1 $f6, 5*4(%[p_in]) \n\t"
233  "lwc1 $f5, 4*4(%[p_in]) \n\t"
234  "lwc1 $f4, 3*4(%[p_in]) \n\t"
235  "lwc1 $f3, 2*4(%[p_in]) \n\t"
236  "lwc1 $f2, 4(%[p_in]) \n\t"
237  "lwc1 $f1, 0(%[p_in]) \n\t"
238  "lwc1 $f0, -4(%[p_in]) \n\t"
239  "addiu %[i], -2 \n\t"
240  "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f7 \n\t"
241  "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f6 \n\t"
242  "madd.s %[sum_out6], %[sum_out6], %[fc_val], $f5 \n\t"
243  "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f4 \n\t"
244  "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f3 \n\t"
245  "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f2 \n\t"
246  "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f1 \n\t"
247  "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f0 \n\t"
248  "lwc1 %[fc_val], 4(%[p_filter_coeffs]) \n\t"
249  "lwc1 $f7, -8(%[p_in]) \n\t"
250  PTR_ADDIU "%[p_filter_coeffs], 8 \n\t"
251  PTR_ADDIU "%[p_in], -8 \n\t"
252  "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f6 \n\t"
253  "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f5 \n\t"
254  "madd.s %[sum_out6], %[sum_out6], %[fc_val], $f4 \n\t"
255  "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f3 \n\t"
256  "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f2 \n\t"
257  "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f1 \n\t"
258  "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f0 \n\t"
259  "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f7 \n\t"
260  "bgtz %[i], filt_lp_inner%= \n\t"
261 
262  : [sum_out8]"+f"(sum_out8), [sum_out7]"+f"(sum_out7),
263  [sum_out6]"+f"(sum_out6), [sum_out5]"+f"(sum_out5),
264  [sum_out4]"+f"(sum_out4), [sum_out3]"+f"(sum_out3),
265  [sum_out2]"+f"(sum_out2), [sum_out1]"+f"(sum_out1),
266  [fc_val]"=&f"(fc_val), [p_filter_coeffs]"+r"(p_filter_coeffs),
267  [p_in]"+r"(p_in), [i]"+r"(i)
268  :
269  : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "memory"
270  );
271 
 /* Store the eight finished samples of this group. */
272  out[n+7] = sum_out8;
273  out[n+6] = sum_out7;
274  out[n+5] = sum_out6;
275  out[n+4] = sum_out5;
276  out[n+3] = sum_out4;
277  out[n+2] = sum_out3;
278  out[n+1] = sum_out2;
279  out[n] = sum_out1;
280  }
281 }
282 #endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
283 #endif /* HAVE_INLINE_ASM */
284 
/*
 * Body of the MIPS CELP filter init routine: when inline assembly is
 * available and the target is neither MIPS32R6 nor MIPS64R6, it points the
 * context's celp_lp_synthesis_filterf and celp_lp_zero_synthesis_filterf
 * members at the MIPS implementations defined in this file; otherwise it
 * leaves the context untouched.
 *
 * NOTE(review): the function header is not present in this listing
 * (presumably `void ff_celp_filter_init_mips(CELPFContext *c)`) -- confirm
 * against the original file.
 */
286 {
287 #if HAVE_INLINE_ASM
288 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
289  c->celp_lp_synthesis_filterf = ff_celp_lp_synthesis_filterf_mips;
290  c->celp_lp_zero_synthesis_filterf = ff_celp_lp_zero_synthesis_filterf_mips;
291 #endif
292 #endif
293 }
out
FILE * out
Definition: movenc.c:54
b
#define b
Definition: input.c:34
asmdefs.h
celp_filters.h
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
CELPFContext
Definition: celp_filters.h:28
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
attributes.h
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
common.h
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
PTR_ADDIU
#define PTR_ADDIU
Definition: asmdefs.h:50
ff_celp_filter_init_mips
void ff_celp_filter_init_mips(CELPFContext *c)
Definition: celp_filters_mips.c:285