FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
aaccoder_mips.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2012
3  * MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author: Stanislav Ocovaj (socovaj@mips.com)
30  * Szabolcs Pal (sabolc@mips.com)
31  *
32  * AAC coefficients encoder optimized for MIPS floating-point architecture
33  *
34  * This file is part of FFmpeg.
35  *
36  * FFmpeg is free software; you can redistribute it and/or
37  * modify it under the terms of the GNU Lesser General Public
38  * License as published by the Free Software Foundation; either
39  * version 2.1 of the License, or (at your option) any later version.
40  *
41  * FFmpeg is distributed in the hope that it will be useful,
42  * but WITHOUT ANY WARRANTY; without even the implied warranty of
43  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
44  * Lesser General Public License for more details.
45  *
46  * You should have received a copy of the GNU Lesser General Public
47  * License along with FFmpeg; if not, write to the Free Software
48  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49  */
50 
51 /**
52  * @file
53  * Reference: libavcodec/aaccoder.c
54  */
55 
56 #include "libavutil/libm.h"
57 
58 #include <float.h>
59 #include "libavutil/mathematics.h"
60 #include "libavcodec/avcodec.h"
61 #include "libavcodec/put_bits.h"
62 #include "libavcodec/aac.h"
63 #include "libavcodec/aacenc.h"
64 #include "libavcodec/aactab.h"
65 
66 #if HAVE_INLINE_ASM
/**
 * One entry of a band coding path.
 *
 * NOTE(review): the code that fills and walks this structure is not visible
 * in this part of the file; the per-field notes below are inferred from the
 * field names only and should be confirmed against the users of the type.
 */
typedef struct BandCodingPath {
    int   prev_idx; /* presumably the index of the preceding path entry */
    float cost;     /* presumably the accumulated cost of this path     */
    int   run;      /* presumably the run length associated with it     */
} BandCodingPath;
72 
/**
 * Per-run-length bit counts for long windows (64 entries).
 * Entries 0..30 are 5, entries 31..62 are 10 and entry 63 is 15.
 */
static const uint8_t run_value_bits_long[64] = {
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
};

/**
 * Per-run-length bit counts for short windows (16 entries).
 * Entries 0..6 are 3, entries 7..14 are 6 and entry 15 is 9.
 */
static const uint8_t run_value_bits_short[16] = {
    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
};

/**
 * Selector for the two tables above: index 0 is the long-window table,
 * index 1 the short-window table.
 *
 * NOTE(review): the initializer line was missing from this copy of the file
 * and has been restored in the order used by the reference encoder; confirm
 * against the code that indexes this table.
 */
static const uint8_t * const run_value_bits[2] = {
    run_value_bits_long, run_value_bits_short
};
87 
/**
 * Number of sign bits for an unsigned 4-tuple with components in 0..2.
 *
 * The index is the base-3 packing ((q1 * 3 + q2) * 3 + q3) * 3 + q4 used by
 * the UQUAD functions in this file; each entry equals the number of non-zero
 * components of that tuple.
 */
static const uint8_t uquad_sign_bits[81] = {
    0, 1, 1, 1, 2, 2, 1, 2, 2,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4
};
99 
/**
 * Number of sign bits for an unsigned pair with components in 0..7.
 *
 * Laid out as an 8x8 grid (index = 8 * q1 + q2); each entry equals the
 * number of non-zero components of the pair.
 */
static const uint8_t upair7_sign_bits[64] = {
    0, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
};
110 
/**
 * Number of sign bits for an unsigned pair with components in 0..12.
 *
 * Laid out as a 13x13 grid (index = 13 * q1 + q2); each entry equals the
 * number of non-zero components of the pair.
 */
static const uint8_t upair12_sign_bits[169] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
126 
/**
 * Number of sign bits for an unsigned pair with components in 0..16.
 *
 * Laid out as a 17x17 grid (index = 17 * q1 + q2, the same packing the ESC
 * functions in this file use); each entry equals the number of non-zero
 * components of the pair.
 */
static const uint8_t esc_sign_bits[289] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
146 
147 static void abs_pow34_v(float *out, const float *in, const int size) {
148 #ifndef USE_REALLY_FULL_SEARCH
149  int i;
150  float a, b, c, d;
151  float ax, bx, cx, dx;
152 
153  for (i = 0; i < size; i += 4) {
154  a = fabsf(in[i ]);
155  b = fabsf(in[i+1]);
156  c = fabsf(in[i+2]);
157  d = fabsf(in[i+3]);
158 
159  ax = sqrtf(a);
160  bx = sqrtf(b);
161  cx = sqrtf(c);
162  dx = sqrtf(d);
163 
164  a = a * ax;
165  b = b * bx;
166  c = c * cx;
167  d = d * dx;
168 
169  out[i ] = sqrtf(a);
170  out[i+1] = sqrtf(b);
171  out[i+2] = sqrtf(c);
172  out[i+3] = sqrtf(d);
173  }
174 #endif /* USE_REALLY_FULL_SEARCH */
175 }
176 
/**
 * Return the largest value found in the first swb_size entries of each of
 * group_len rows of scaled, where consecutive rows are 128 floats apart.
 *
 * The running maximum starts at 0.0f, so the result is never negative and
 * is 0.0f when nothing is scanned.
 *
 * @param group_len number of rows to scan
 * @param swb_size  number of entries scanned per row
 * @param scaled    base pointer; row w2 starts at scaled[w2 * 128]
 * @return the maximum value seen, or 0.0f
 */
static float find_max_val(int group_len, int swb_size, const float *scaled) {
    float maxval = 0.0f;
    int w2, i;

    for (w2 = 0; w2 < group_len; w2++) {
        for (i = 0; i < swb_size; i++) {
            const float v = scaled[w2 * 128 + i];

            /* Same selection as FFMAX(maxval, v). */
            maxval = maxval > v ? maxval : v;
        }
    }
    return maxval;
}
187 
188 static int find_min_book(float maxval, int sf) {
190  float Q34 = sqrtf(Q * sqrtf(Q));
191  int qmaxval, cb;
192  qmaxval = maxval * Q34 + 0.4054f;
193  if (qmaxval == 0) cb = 0;
194  else if (qmaxval == 1) cb = 1;
195  else if (qmaxval == 2) cb = 3;
196  else if (qmaxval <= 4) cb = 5;
197  else if (qmaxval <= 7) cb = 7;
198  else if (qmaxval <= 12) cb = 9;
199  else cb = 11;
200  return cb;
201 }
202 
/**
 * Functions derived from the template function and optimized for quantizing and encoding a band
 */
206 static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
207  PutBitContext *pb, const float *in,
208  const float *scaled, int size, int scale_idx,
209  int cb, const float lambda, const float uplim,
210  int *bits)
211 {
212  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
213  int i;
214  int qc1, qc2, qc3, qc4;
215 
216  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
217  uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
218 
219  abs_pow34_v(s->scoefs, in, size);
220  scaled = s->scoefs;
221  for (i = 0; i < size; i += 4) {
222  int curidx;
223  int *in_int = (int *)&in[i];
224 
225  qc1 = scaled[i ] * Q34 + 0.4054f;
226  qc2 = scaled[i+1] * Q34 + 0.4054f;
227  qc3 = scaled[i+2] * Q34 + 0.4054f;
228  qc4 = scaled[i+3] * Q34 + 0.4054f;
229 
230  __asm__ volatile (
231  ".set push \n\t"
232  ".set noreorder \n\t"
233 
234  "slt %[qc1], $zero, %[qc1] \n\t"
235  "slt %[qc2], $zero, %[qc2] \n\t"
236  "slt %[qc3], $zero, %[qc3] \n\t"
237  "slt %[qc4], $zero, %[qc4] \n\t"
238  "lw $t0, 0(%[in_int]) \n\t"
239  "lw $t1, 4(%[in_int]) \n\t"
240  "lw $t2, 8(%[in_int]) \n\t"
241  "lw $t3, 12(%[in_int]) \n\t"
242  "srl $t0, $t0, 31 \n\t"
243  "srl $t1, $t1, 31 \n\t"
244  "srl $t2, $t2, 31 \n\t"
245  "srl $t3, $t3, 31 \n\t"
246  "subu $t4, $zero, %[qc1] \n\t"
247  "subu $t5, $zero, %[qc2] \n\t"
248  "subu $t6, $zero, %[qc3] \n\t"
249  "subu $t7, $zero, %[qc4] \n\t"
250  "movn %[qc1], $t4, $t0 \n\t"
251  "movn %[qc2], $t5, $t1 \n\t"
252  "movn %[qc3], $t6, $t2 \n\t"
253  "movn %[qc4], $t7, $t3 \n\t"
254 
255  ".set pop \n\t"
256 
257  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
258  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
259  : [in_int]"r"(in_int)
260  : "t0", "t1", "t2", "t3",
261  "t4", "t5", "t6", "t7",
262  "memory"
263  );
264 
265  curidx = qc1;
266  curidx *= 3;
267  curidx += qc2;
268  curidx *= 3;
269  curidx += qc3;
270  curidx *= 3;
271  curidx += qc4;
272  curidx += 40;
273 
274  put_bits(pb, p_bits[curidx], p_codes[curidx]);
275  }
276 }
277 
278 static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
279  PutBitContext *pb, const float *in,
280  const float *scaled, int size, int scale_idx,
281  int cb, const float lambda, const float uplim,
282  int *bits)
283 {
284  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
285  int i;
286  int qc1, qc2, qc3, qc4;
287 
288  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
289  uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
290 
291  abs_pow34_v(s->scoefs, in, size);
292  scaled = s->scoefs;
293  for (i = 0; i < size; i += 4) {
294  int curidx, sign, count;
295  int *in_int = (int *)&in[i];
296  uint8_t v_bits;
297  unsigned int v_codes;
298 
299  qc1 = scaled[i ] * Q34 + 0.4054f;
300  qc2 = scaled[i+1] * Q34 + 0.4054f;
301  qc3 = scaled[i+2] * Q34 + 0.4054f;
302  qc4 = scaled[i+3] * Q34 + 0.4054f;
303 
304  __asm__ volatile (
305  ".set push \n\t"
306  ".set noreorder \n\t"
307 
308  "ori $t4, $zero, 2 \n\t"
309  "ori %[sign], $zero, 0 \n\t"
310  "slt $t0, $t4, %[qc1] \n\t"
311  "slt $t1, $t4, %[qc2] \n\t"
312  "slt $t2, $t4, %[qc3] \n\t"
313  "slt $t3, $t4, %[qc4] \n\t"
314  "movn %[qc1], $t4, $t0 \n\t"
315  "movn %[qc2], $t4, $t1 \n\t"
316  "movn %[qc3], $t4, $t2 \n\t"
317  "movn %[qc4], $t4, $t3 \n\t"
318  "lw $t0, 0(%[in_int]) \n\t"
319  "lw $t1, 4(%[in_int]) \n\t"
320  "lw $t2, 8(%[in_int]) \n\t"
321  "lw $t3, 12(%[in_int]) \n\t"
322  "slt $t0, $t0, $zero \n\t"
323  "movn %[sign], $t0, %[qc1] \n\t"
324  "slt $t1, $t1, $zero \n\t"
325  "slt $t2, $t2, $zero \n\t"
326  "slt $t3, $t3, $zero \n\t"
327  "sll $t0, %[sign], 1 \n\t"
328  "or $t0, $t0, $t1 \n\t"
329  "movn %[sign], $t0, %[qc2] \n\t"
330  "slt $t4, $zero, %[qc1] \n\t"
331  "slt $t1, $zero, %[qc2] \n\t"
332  "slt %[count], $zero, %[qc3] \n\t"
333  "sll $t0, %[sign], 1 \n\t"
334  "or $t0, $t0, $t2 \n\t"
335  "movn %[sign], $t0, %[qc3] \n\t"
336  "slt $t2, $zero, %[qc4] \n\t"
337  "addu %[count], %[count], $t4 \n\t"
338  "addu %[count], %[count], $t1 \n\t"
339  "sll $t0, %[sign], 1 \n\t"
340  "or $t0, $t0, $t3 \n\t"
341  "movn %[sign], $t0, %[qc4] \n\t"
342  "addu %[count], %[count], $t2 \n\t"
343 
344  ".set pop \n\t"
345 
346  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
347  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
348  [sign]"=&r"(sign), [count]"=&r"(count)
349  : [in_int]"r"(in_int)
350  : "t0", "t1", "t2", "t3", "t4",
351  "memory"
352  );
353 
354  curidx = qc1;
355  curidx *= 3;
356  curidx += qc2;
357  curidx *= 3;
358  curidx += qc3;
359  curidx *= 3;
360  curidx += qc4;
361 
362  v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
363  v_bits = p_bits[curidx] + count;
364  put_bits(pb, v_bits, v_codes);
365  }
366 }
367 
368 static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
369  PutBitContext *pb, const float *in,
370  const float *scaled, int size, int scale_idx,
371  int cb, const float lambda, const float uplim,
372  int *bits)
373 {
374  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
375  int i;
376  int qc1, qc2, qc3, qc4;
377 
378  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
379  uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
380 
381  abs_pow34_v(s->scoefs, in, size);
382  scaled = s->scoefs;
383  for (i = 0; i < size; i += 4) {
384  int curidx, curidx2;
385  int *in_int = (int *)&in[i];
386  uint8_t v_bits;
387  unsigned int v_codes;
388 
389  qc1 = scaled[i ] * Q34 + 0.4054f;
390  qc2 = scaled[i+1] * Q34 + 0.4054f;
391  qc3 = scaled[i+2] * Q34 + 0.4054f;
392  qc4 = scaled[i+3] * Q34 + 0.4054f;
393 
394  __asm__ volatile (
395  ".set push \n\t"
396  ".set noreorder \n\t"
397 
398  "ori $t4, $zero, 4 \n\t"
399  "slt $t0, $t4, %[qc1] \n\t"
400  "slt $t1, $t4, %[qc2] \n\t"
401  "slt $t2, $t4, %[qc3] \n\t"
402  "slt $t3, $t4, %[qc4] \n\t"
403  "movn %[qc1], $t4, $t0 \n\t"
404  "movn %[qc2], $t4, $t1 \n\t"
405  "movn %[qc3], $t4, $t2 \n\t"
406  "movn %[qc4], $t4, $t3 \n\t"
407  "lw $t0, 0(%[in_int]) \n\t"
408  "lw $t1, 4(%[in_int]) \n\t"
409  "lw $t2, 8(%[in_int]) \n\t"
410  "lw $t3, 12(%[in_int]) \n\t"
411  "srl $t0, $t0, 31 \n\t"
412  "srl $t1, $t1, 31 \n\t"
413  "srl $t2, $t2, 31 \n\t"
414  "srl $t3, $t3, 31 \n\t"
415  "subu $t4, $zero, %[qc1] \n\t"
416  "subu $t5, $zero, %[qc2] \n\t"
417  "subu $t6, $zero, %[qc3] \n\t"
418  "subu $t7, $zero, %[qc4] \n\t"
419  "movn %[qc1], $t4, $t0 \n\t"
420  "movn %[qc2], $t5, $t1 \n\t"
421  "movn %[qc3], $t6, $t2 \n\t"
422  "movn %[qc4], $t7, $t3 \n\t"
423 
424  ".set pop \n\t"
425 
426  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
427  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
428  : [in_int]"r"(in_int)
429  : "t0", "t1", "t2", "t3",
430  "t4", "t5", "t6", "t7",
431  "memory"
432  );
433 
434  curidx = 9 * qc1;
435  curidx += qc2 + 40;
436 
437  curidx2 = 9 * qc3;
438  curidx2 += qc4 + 40;
439 
440  v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
441  v_bits = p_bits[curidx] + p_bits[curidx2];
442  put_bits(pb, v_bits, v_codes);
443  }
444 }
445 
446 static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
447  PutBitContext *pb, const float *in,
448  const float *scaled, int size, int scale_idx,
449  int cb, const float lambda, const float uplim,
450  int *bits)
451 {
452  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
453  int i;
454  int qc1, qc2, qc3, qc4;
455 
456  uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
457  uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
458 
459  abs_pow34_v(s->scoefs, in, size);
460  scaled = s->scoefs;
461  for (i = 0; i < size; i += 4) {
462  int curidx, sign1, count1, sign2, count2;
463  int *in_int = (int *)&in[i];
464  uint8_t v_bits;
465  unsigned int v_codes;
466 
467  qc1 = scaled[i ] * Q34 + 0.4054f;
468  qc2 = scaled[i+1] * Q34 + 0.4054f;
469  qc3 = scaled[i+2] * Q34 + 0.4054f;
470  qc4 = scaled[i+3] * Q34 + 0.4054f;
471 
472  __asm__ volatile (
473  ".set push \n\t"
474  ".set noreorder \n\t"
475 
476  "ori $t4, $zero, 7 \n\t"
477  "ori %[sign1], $zero, 0 \n\t"
478  "ori %[sign2], $zero, 0 \n\t"
479  "slt $t0, $t4, %[qc1] \n\t"
480  "slt $t1, $t4, %[qc2] \n\t"
481  "slt $t2, $t4, %[qc3] \n\t"
482  "slt $t3, $t4, %[qc4] \n\t"
483  "movn %[qc1], $t4, $t0 \n\t"
484  "movn %[qc2], $t4, $t1 \n\t"
485  "movn %[qc3], $t4, $t2 \n\t"
486  "movn %[qc4], $t4, $t3 \n\t"
487  "lw $t0, 0(%[in_int]) \n\t"
488  "lw $t1, 4(%[in_int]) \n\t"
489  "lw $t2, 8(%[in_int]) \n\t"
490  "lw $t3, 12(%[in_int]) \n\t"
491  "slt $t0, $t0, $zero \n\t"
492  "movn %[sign1], $t0, %[qc1] \n\t"
493  "slt $t2, $t2, $zero \n\t"
494  "movn %[sign2], $t2, %[qc3] \n\t"
495  "slt $t1, $t1, $zero \n\t"
496  "sll $t0, %[sign1], 1 \n\t"
497  "or $t0, $t0, $t1 \n\t"
498  "movn %[sign1], $t0, %[qc2] \n\t"
499  "slt $t3, $t3, $zero \n\t"
500  "sll $t0, %[sign2], 1 \n\t"
501  "or $t0, $t0, $t3 \n\t"
502  "movn %[sign2], $t0, %[qc4] \n\t"
503  "slt %[count1], $zero, %[qc1] \n\t"
504  "slt $t1, $zero, %[qc2] \n\t"
505  "slt %[count2], $zero, %[qc3] \n\t"
506  "slt $t2, $zero, %[qc4] \n\t"
507  "addu %[count1], %[count1], $t1 \n\t"
508  "addu %[count2], %[count2], $t2 \n\t"
509 
510  ".set pop \n\t"
511 
512  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
513  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
514  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
515  [sign2]"=&r"(sign2), [count2]"=&r"(count2)
516  : [in_int]"r"(in_int)
517  : "t0", "t1", "t2", "t3", "t4",
518  "memory"
519  );
520 
521  curidx = 8 * qc1;
522  curidx += qc2;
523 
524  v_codes = (p_codes[curidx] << count1) | sign1;
525  v_bits = p_bits[curidx] + count1;
526  put_bits(pb, v_bits, v_codes);
527 
528  curidx = 8 * qc3;
529  curidx += qc4;
530 
531  v_codes = (p_codes[curidx] << count2) | sign2;
532  v_bits = p_bits[curidx] + count2;
533  put_bits(pb, v_bits, v_codes);
534  }
535 }
536 
537 static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
538  PutBitContext *pb, const float *in,
539  const float *scaled, int size, int scale_idx,
540  int cb, const float lambda, const float uplim,
541  int *bits)
542 {
543  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
544  int i;
545  int qc1, qc2, qc3, qc4;
546 
547  uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
548  uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
549 
550  abs_pow34_v(s->scoefs, in, size);
551  scaled = s->scoefs;
552  for (i = 0; i < size; i += 4) {
553  int curidx, sign1, count1, sign2, count2;
554  int *in_int = (int *)&in[i];
555  uint8_t v_bits;
556  unsigned int v_codes;
557 
558  qc1 = scaled[i ] * Q34 + 0.4054f;
559  qc2 = scaled[i+1] * Q34 + 0.4054f;
560  qc3 = scaled[i+2] * Q34 + 0.4054f;
561  qc4 = scaled[i+3] * Q34 + 0.4054f;
562 
563  __asm__ volatile (
564  ".set push \n\t"
565  ".set noreorder \n\t"
566 
567  "ori $t4, $zero, 12 \n\t"
568  "ori %[sign1], $zero, 0 \n\t"
569  "ori %[sign2], $zero, 0 \n\t"
570  "slt $t0, $t4, %[qc1] \n\t"
571  "slt $t1, $t4, %[qc2] \n\t"
572  "slt $t2, $t4, %[qc3] \n\t"
573  "slt $t3, $t4, %[qc4] \n\t"
574  "movn %[qc1], $t4, $t0 \n\t"
575  "movn %[qc2], $t4, $t1 \n\t"
576  "movn %[qc3], $t4, $t2 \n\t"
577  "movn %[qc4], $t4, $t3 \n\t"
578  "lw $t0, 0(%[in_int]) \n\t"
579  "lw $t1, 4(%[in_int]) \n\t"
580  "lw $t2, 8(%[in_int]) \n\t"
581  "lw $t3, 12(%[in_int]) \n\t"
582  "slt $t0, $t0, $zero \n\t"
583  "movn %[sign1], $t0, %[qc1] \n\t"
584  "slt $t2, $t2, $zero \n\t"
585  "movn %[sign2], $t2, %[qc3] \n\t"
586  "slt $t1, $t1, $zero \n\t"
587  "sll $t0, %[sign1], 1 \n\t"
588  "or $t0, $t0, $t1 \n\t"
589  "movn %[sign1], $t0, %[qc2] \n\t"
590  "slt $t3, $t3, $zero \n\t"
591  "sll $t0, %[sign2], 1 \n\t"
592  "or $t0, $t0, $t3 \n\t"
593  "movn %[sign2], $t0, %[qc4] \n\t"
594  "slt %[count1], $zero, %[qc1] \n\t"
595  "slt $t1, $zero, %[qc2] \n\t"
596  "slt %[count2], $zero, %[qc3] \n\t"
597  "slt $t2, $zero, %[qc4] \n\t"
598  "addu %[count1], %[count1], $t1 \n\t"
599  "addu %[count2], %[count2], $t2 \n\t"
600 
601  ".set pop \n\t"
602 
603  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
604  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
605  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
606  [sign2]"=&r"(sign2), [count2]"=&r"(count2)
607  : [in_int]"r"(in_int)
608  : "t0", "t1", "t2", "t3", "t4",
609  "memory"
610  );
611 
612  curidx = 13 * qc1;
613  curidx += qc2;
614 
615  v_codes = (p_codes[curidx] << count1) | sign1;
616  v_bits = p_bits[curidx] + count1;
617  put_bits(pb, v_bits, v_codes);
618 
619  curidx = 13 * qc3;
620  curidx += qc4;
621 
622  v_codes = (p_codes[curidx] << count2) | sign2;
623  v_bits = p_bits[curidx] + count2;
624  put_bits(pb, v_bits, v_codes);
625  }
626 }
627 
628 static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
629  PutBitContext *pb, const float *in,
630  const float *scaled, int size, int scale_idx,
631  int cb, const float lambda, const float uplim,
632  int *bits)
633 {
634  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
635  int i;
636  int qc1, qc2, qc3, qc4;
637 
638  uint8_t *p_bits = (uint8_t* )ff_aac_spectral_bits[cb-1];
639  uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
640  float *p_vectors = (float* )ff_aac_codebook_vectors[cb-1];
641 
642  abs_pow34_v(s->scoefs, in, size);
643  scaled = s->scoefs;
644 
645  if (cb < 11) {
646  for (i = 0; i < size; i += 4) {
647  int curidx, curidx2, sign1, count1, sign2, count2;
648  int *in_int = (int *)&in[i];
649  uint8_t v_bits;
650  unsigned int v_codes;
651 
652  qc1 = scaled[i ] * Q34 + 0.4054f;
653  qc2 = scaled[i+1] * Q34 + 0.4054f;
654  qc3 = scaled[i+2] * Q34 + 0.4054f;
655  qc4 = scaled[i+3] * Q34 + 0.4054f;
656 
657  __asm__ volatile (
658  ".set push \n\t"
659  ".set noreorder \n\t"
660 
661  "ori $t4, $zero, 16 \n\t"
662  "ori %[sign1], $zero, 0 \n\t"
663  "ori %[sign2], $zero, 0 \n\t"
664  "slt $t0, $t4, %[qc1] \n\t"
665  "slt $t1, $t4, %[qc2] \n\t"
666  "slt $t2, $t4, %[qc3] \n\t"
667  "slt $t3, $t4, %[qc4] \n\t"
668  "movn %[qc1], $t4, $t0 \n\t"
669  "movn %[qc2], $t4, $t1 \n\t"
670  "movn %[qc3], $t4, $t2 \n\t"
671  "movn %[qc4], $t4, $t3 \n\t"
672  "lw $t0, 0(%[in_int]) \n\t"
673  "lw $t1, 4(%[in_int]) \n\t"
674  "lw $t2, 8(%[in_int]) \n\t"
675  "lw $t3, 12(%[in_int]) \n\t"
676  "slt $t0, $t0, $zero \n\t"
677  "movn %[sign1], $t0, %[qc1] \n\t"
678  "slt $t2, $t2, $zero \n\t"
679  "movn %[sign2], $t2, %[qc3] \n\t"
680  "slt $t1, $t1, $zero \n\t"
681  "sll $t0, %[sign1], 1 \n\t"
682  "or $t0, $t0, $t1 \n\t"
683  "movn %[sign1], $t0, %[qc2] \n\t"
684  "slt $t3, $t3, $zero \n\t"
685  "sll $t0, %[sign2], 1 \n\t"
686  "or $t0, $t0, $t3 \n\t"
687  "movn %[sign2], $t0, %[qc4] \n\t"
688  "slt %[count1], $zero, %[qc1] \n\t"
689  "slt $t1, $zero, %[qc2] \n\t"
690  "slt %[count2], $zero, %[qc3] \n\t"
691  "slt $t2, $zero, %[qc4] \n\t"
692  "addu %[count1], %[count1], $t1 \n\t"
693  "addu %[count2], %[count2], $t2 \n\t"
694 
695  ".set pop \n\t"
696 
697  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
698  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
699  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
700  [sign2]"=&r"(sign2), [count2]"=&r"(count2)
701  : [in_int]"r"(in_int)
702  : "t0", "t1", "t2", "t3", "t4",
703  "memory"
704  );
705 
706  curidx = 17 * qc1;
707  curidx += qc2;
708  curidx2 = 17 * qc3;
709  curidx2 += qc4;
710 
711  v_codes = (p_codes[curidx] << count1) | sign1;
712  v_bits = p_bits[curidx] + count1;
713  put_bits(pb, v_bits, v_codes);
714 
715  v_codes = (p_codes[curidx2] << count2) | sign2;
716  v_bits = p_bits[curidx2] + count2;
717  put_bits(pb, v_bits, v_codes);
718  }
719  } else {
720  for (i = 0; i < size; i += 4) {
721  int curidx, curidx2, sign1, count1, sign2, count2;
722  int *in_int = (int *)&in[i];
723  uint8_t v_bits;
724  unsigned int v_codes;
725  int c1, c2, c3, c4;
726 
727  qc1 = scaled[i ] * Q34 + 0.4054f;
728  qc2 = scaled[i+1] * Q34 + 0.4054f;
729  qc3 = scaled[i+2] * Q34 + 0.4054f;
730  qc4 = scaled[i+3] * Q34 + 0.4054f;
731 
732  __asm__ volatile (
733  ".set push \n\t"
734  ".set noreorder \n\t"
735 
736  "ori $t4, $zero, 16 \n\t"
737  "ori %[sign1], $zero, 0 \n\t"
738  "ori %[sign2], $zero, 0 \n\t"
739  "shll_s.w %[c1], %[qc1], 18 \n\t"
740  "shll_s.w %[c2], %[qc2], 18 \n\t"
741  "shll_s.w %[c3], %[qc3], 18 \n\t"
742  "shll_s.w %[c4], %[qc4], 18 \n\t"
743  "srl %[c1], %[c1], 18 \n\t"
744  "srl %[c2], %[c2], 18 \n\t"
745  "srl %[c3], %[c3], 18 \n\t"
746  "srl %[c4], %[c4], 18 \n\t"
747  "slt $t0, $t4, %[qc1] \n\t"
748  "slt $t1, $t4, %[qc2] \n\t"
749  "slt $t2, $t4, %[qc3] \n\t"
750  "slt $t3, $t4, %[qc4] \n\t"
751  "movn %[qc1], $t4, $t0 \n\t"
752  "movn %[qc2], $t4, $t1 \n\t"
753  "movn %[qc3], $t4, $t2 \n\t"
754  "movn %[qc4], $t4, $t3 \n\t"
755  "lw $t0, 0(%[in_int]) \n\t"
756  "lw $t1, 4(%[in_int]) \n\t"
757  "lw $t2, 8(%[in_int]) \n\t"
758  "lw $t3, 12(%[in_int]) \n\t"
759  "slt $t0, $t0, $zero \n\t"
760  "movn %[sign1], $t0, %[qc1] \n\t"
761  "slt $t2, $t2, $zero \n\t"
762  "movn %[sign2], $t2, %[qc3] \n\t"
763  "slt $t1, $t1, $zero \n\t"
764  "sll $t0, %[sign1], 1 \n\t"
765  "or $t0, $t0, $t1 \n\t"
766  "movn %[sign1], $t0, %[qc2] \n\t"
767  "slt $t3, $t3, $zero \n\t"
768  "sll $t0, %[sign2], 1 \n\t"
769  "or $t0, $t0, $t3 \n\t"
770  "movn %[sign2], $t0, %[qc4] \n\t"
771  "slt %[count1], $zero, %[qc1] \n\t"
772  "slt $t1, $zero, %[qc2] \n\t"
773  "slt %[count2], $zero, %[qc3] \n\t"
774  "slt $t2, $zero, %[qc4] \n\t"
775  "addu %[count1], %[count1], $t1 \n\t"
776  "addu %[count2], %[count2], $t2 \n\t"
777 
778  ".set pop \n\t"
779 
780  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
781  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
782  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
783  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
784  [c1]"=&r"(c1), [c2]"=&r"(c2),
785  [c3]"=&r"(c3), [c4]"=&r"(c4)
786  : [in_int]"r"(in_int)
787  : "t0", "t1", "t2", "t3", "t4",
788  "memory"
789  );
790 
791  curidx = 17 * qc1;
792  curidx += qc2;
793 
794  curidx2 = 17 * qc3;
795  curidx2 += qc4;
796 
797  v_codes = (p_codes[curidx] << count1) | sign1;
798  v_bits = p_bits[curidx] + count1;
799  put_bits(pb, v_bits, v_codes);
800 
801  if (p_vectors[curidx*2 ] == 64.0f) {
802  int len = av_log2(c1);
803  v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
804  put_bits(pb, len * 2 - 3, v_codes);
805  }
806  if (p_vectors[curidx*2+1] == 64.0f) {
807  int len = av_log2(c2);
808  v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
809  put_bits(pb, len*2-3, v_codes);
810  }
811 
812  v_codes = (p_codes[curidx2] << count2) | sign2;
813  v_bits = p_bits[curidx2] + count2;
814  put_bits(pb, v_bits, v_codes);
815 
816  if (p_vectors[curidx2*2 ] == 64.0f) {
817  int len = av_log2(c3);
818  v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
819  put_bits(pb, len* 2 - 3, v_codes);
820  }
821  if (p_vectors[curidx2*2+1] == 64.0f) {
822  int len = av_log2(c4);
823  v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
824  put_bits(pb, len * 2 - 3, v_codes);
825  }
826  }
827  }
828 }
829 
830 static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
831  PutBitContext *pb, const float *in,
832  const float *scaled, int size, int scale_idx,
833  int cb, const float lambda, const float uplim,
834  int *bits) = {
835  NULL,
836  quantize_and_encode_band_cost_SQUAD_mips,
837  quantize_and_encode_band_cost_SQUAD_mips,
838  quantize_and_encode_band_cost_UQUAD_mips,
839  quantize_and_encode_band_cost_UQUAD_mips,
840  quantize_and_encode_band_cost_SPAIR_mips,
841  quantize_and_encode_band_cost_SPAIR_mips,
842  quantize_and_encode_band_cost_UPAIR7_mips,
843  quantize_and_encode_band_cost_UPAIR7_mips,
844  quantize_and_encode_band_cost_UPAIR12_mips,
845  quantize_and_encode_band_cost_UPAIR12_mips,
846  quantize_and_encode_band_cost_ESC_mips,
847 };
848 
/*
 * Call the encoder registered for codebook cb in
 * quantize_and_encode_band_cost_arr, forwarding every argument unchanged.
 * cb is evaluated twice (once as the table index, once as an argument) and
 * is not range-checked here; entry 0 of the table is NULL.
 */
#define quantize_and_encode_band_cost(                                  \
                                s, pb, in, scaled, size, scale_idx, cb, \
                                lambda, uplim, bits)                    \
    quantize_and_encode_band_cost_arr[cb](                              \
                                s, pb, in, scaled, size, scale_idx, cb, \
                                lambda, uplim, bits)
855 
856 static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
857  const float *in, int size, int scale_idx,
858  int cb, const float lambda)
859 {
860  quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
861  INFINITY, NULL);
862 }
863 
/**
 * Functions derived from the template function and optimized for computing the number of bits of a band
 */
867 static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
868  PutBitContext *pb, const float *in,
869  const float *scaled, int size, int scale_idx,
870  int cb, const float lambda, const float uplim,
871  int *bits)
872 {
873  return 0;
874 }
875 
876 static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
877  PutBitContext *pb, const float *in,
878  const float *scaled, int size, int scale_idx,
879  int cb, const float lambda, const float uplim,
880  int *bits)
881 {
882  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
883  int i;
884  int qc1, qc2, qc3, qc4;
885  int curbits = 0;
886 
887  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
888 
889  for (i = 0; i < size; i += 4) {
890  int curidx;
891  int *in_int = (int *)&in[i];
892 
893  qc1 = scaled[i ] * Q34 + 0.4054f;
894  qc2 = scaled[i+1] * Q34 + 0.4054f;
895  qc3 = scaled[i+2] * Q34 + 0.4054f;
896  qc4 = scaled[i+3] * Q34 + 0.4054f;
897 
898  __asm__ volatile (
899  ".set push \n\t"
900  ".set noreorder \n\t"
901 
902  "slt %[qc1], $zero, %[qc1] \n\t"
903  "slt %[qc2], $zero, %[qc2] \n\t"
904  "slt %[qc3], $zero, %[qc3] \n\t"
905  "slt %[qc4], $zero, %[qc4] \n\t"
906  "lw $t0, 0(%[in_int]) \n\t"
907  "lw $t1, 4(%[in_int]) \n\t"
908  "lw $t2, 8(%[in_int]) \n\t"
909  "lw $t3, 12(%[in_int]) \n\t"
910  "srl $t0, $t0, 31 \n\t"
911  "srl $t1, $t1, 31 \n\t"
912  "srl $t2, $t2, 31 \n\t"
913  "srl $t3, $t3, 31 \n\t"
914  "subu $t4, $zero, %[qc1] \n\t"
915  "subu $t5, $zero, %[qc2] \n\t"
916  "subu $t6, $zero, %[qc3] \n\t"
917  "subu $t7, $zero, %[qc4] \n\t"
918  "movn %[qc1], $t4, $t0 \n\t"
919  "movn %[qc2], $t5, $t1 \n\t"
920  "movn %[qc3], $t6, $t2 \n\t"
921  "movn %[qc4], $t7, $t3 \n\t"
922 
923  ".set pop \n\t"
924 
925  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
926  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
927  : [in_int]"r"(in_int)
928  : "t0", "t1", "t2", "t3",
929  "t4", "t5", "t6", "t7",
930  "memory"
931  );
932 
933  curidx = qc1;
934  curidx *= 3;
935  curidx += qc2;
936  curidx *= 3;
937  curidx += qc3;
938  curidx *= 3;
939  curidx += qc4;
940  curidx += 40;
941 
942  curbits += p_bits[curidx];
943  }
944  return curbits;
945 }
946 
947 static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
948  PutBitContext *pb, const float *in,
949  const float *scaled, int size, int scale_idx,
950  int cb, const float lambda, const float uplim,
951  int *bits)
952 {
953  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
954  int i;
955  int curbits = 0;
956  int qc1, qc2, qc3, qc4;
957 
958  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
959 
960  for (i = 0; i < size; i += 4) {
961  int curidx;
962 
963  qc1 = scaled[i ] * Q34 + 0.4054f;
964  qc2 = scaled[i+1] * Q34 + 0.4054f;
965  qc3 = scaled[i+2] * Q34 + 0.4054f;
966  qc4 = scaled[i+3] * Q34 + 0.4054f;
967 
968  __asm__ volatile (
969  ".set push \n\t"
970  ".set noreorder \n\t"
971 
972  "ori $t4, $zero, 2 \n\t"
973  "slt $t0, $t4, %[qc1] \n\t"
974  "slt $t1, $t4, %[qc2] \n\t"
975  "slt $t2, $t4, %[qc3] \n\t"
976  "slt $t3, $t4, %[qc4] \n\t"
977  "movn %[qc1], $t4, $t0 \n\t"
978  "movn %[qc2], $t4, $t1 \n\t"
979  "movn %[qc3], $t4, $t2 \n\t"
980  "movn %[qc4], $t4, $t3 \n\t"
981 
982  ".set pop \n\t"
983 
984  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
985  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
986  :
987  : "t0", "t1", "t2", "t3", "t4"
988  );
989 
990  curidx = qc1;
991  curidx *= 3;
992  curidx += qc2;
993  curidx *= 3;
994  curidx += qc3;
995  curidx *= 3;
996  curidx += qc4;
997 
998  curbits += p_bits[curidx];
999  curbits += uquad_sign_bits[curidx];
1000  }
1001  return curbits;
1002 }
1003 
/**
 * Estimate the bit count of a band using the codebook selected by cb,
 * handling two coefficient pairs (four coefficients) per iteration.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int. The inline asm limits every value to at most 4 and then
 * negates it when the sign bit (bit 31) of the matching word of in[] is set.
 * Each pair is mapped to the index 9*a + b + 40, and the two entries of
 * ff_aac_spectral_bits[cb-1] are summed.
 *
 * @param in        original coefficients; only their sign bits are read
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects the Q34 factor from ff_aac_pow34sf_tab
 * @param cb        codebook index; cb-1 selects the bit-length table
 * @return accumulated bit count (int converted to float)
 *
 * s, pb, lambda, uplim and bits are not used by this function.
 */
1004 static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
1005  PutBitContext *pb, const float *in,
1006  const float *scaled, int size, int scale_idx,
1007  int cb, const float lambda, const float uplim,
1008  int *bits)
1009 {
1010  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1011  int i;
1012  int qc1, qc2, qc3, qc4;
1013  int curbits = 0;
1014 
1015  uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1016 
1017  for (i = 0; i < size; i += 4) {
1018  int curidx, curidx2;
    /* Integer view of in[i..i+3]; dereferenced only inside the asm (lw). */
1019  int *in_int = (int *)&in[i];
1020 
1021  qc1 = scaled[i ] * Q34 + 0.4054f;
1022  qc2 = scaled[i+1] * Q34 + 0.4054f;
1023  qc3 = scaled[i+2] * Q34 + 0.4054f;
1024  qc4 = scaled[i+3] * Q34 + 0.4054f;
1025 
    /*
     * Step 1: clamp each qcN to at most 4 ($t4 = 4, slt + movn).
     * Step 2: load the raw words of in[], shift the sign bit down (srl 31),
     *         and replace qcN with -qcN when that bit is set ($t4 is reused).
     */
1026  __asm__ volatile (
1027  ".set push \n\t"
1028  ".set noreorder \n\t"
1029 
1030  "ori $t4, $zero, 4 \n\t"
1031  "slt $t0, $t4, %[qc1] \n\t"
1032  "slt $t1, $t4, %[qc2] \n\t"
1033  "slt $t2, $t4, %[qc3] \n\t"
1034  "slt $t3, $t4, %[qc4] \n\t"
1035  "movn %[qc1], $t4, $t0 \n\t"
1036  "movn %[qc2], $t4, $t1 \n\t"
1037  "movn %[qc3], $t4, $t2 \n\t"
1038  "movn %[qc4], $t4, $t3 \n\t"
1039  "lw $t0, 0(%[in_int]) \n\t"
1040  "lw $t1, 4(%[in_int]) \n\t"
1041  "lw $t2, 8(%[in_int]) \n\t"
1042  "lw $t3, 12(%[in_int]) \n\t"
1043  "srl $t0, $t0, 31 \n\t"
1044  "srl $t1, $t1, 31 \n\t"
1045  "srl $t2, $t2, 31 \n\t"
1046  "srl $t3, $t3, 31 \n\t"
1047  "subu $t4, $zero, %[qc1] \n\t"
1048  "subu $t5, $zero, %[qc2] \n\t"
1049  "subu $t6, $zero, %[qc3] \n\t"
1050  "subu $t7, $zero, %[qc4] \n\t"
1051  "movn %[qc1], $t4, $t0 \n\t"
1052  "movn %[qc2], $t5, $t1 \n\t"
1053  "movn %[qc3], $t6, $t2 \n\t"
1054  "movn %[qc4], $t7, $t3 \n\t"
1055 
1056  ".set pop \n\t"
1057 
1058  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1059  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1060  : [in_int]"r"(in_int)
1061  : "t0", "t1", "t2", "t3",
1062  "t4", "t5", "t6", "t7",
1063  "memory"
1064  );
1065 
    /* With each value in [-4, 4], 9*a + b lies in [-40, 40]; +40 makes the index non-negative. */
1066  curidx = 9 * qc1;
1067  curidx += qc2 + 40;
1068 
1069  curidx2 = 9 * qc3;
1070  curidx2 += qc4 + 40;
1071 
1072  curbits += p_bits[curidx] + p_bits[curidx2];
1073  }
1074  return curbits;
1075 }
1076 
/**
 * Estimate the bit count of a band using the codebook selected by cb,
 * handling two coefficient pairs (four coefficients) per iteration.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int; the inline asm limits every value to at most 7.
 * Each pair is mapped to the index 8*a + b, and the entries of
 * ff_aac_spectral_bits[cb-1] and upair7_sign_bits for both pairs are summed.
 *
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects the Q34 factor from ff_aac_pow34sf_tab
 * @param cb        codebook index; cb-1 selects the bit-length table
 * @return accumulated bit count (int converted to float)
 *
 * s, pb, in, lambda, uplim and bits are not used by this function.
 */
1077 static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
1078  PutBitContext *pb, const float *in,
1079  const float *scaled, int size, int scale_idx,
1080  int cb, const float lambda, const float uplim,
1081  int *bits)
1082 {
1083  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1084  int i;
1085  int qc1, qc2, qc3, qc4;
1086  int curbits = 0;
1087 
1088  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1089 
1090  for (i = 0; i < size; i += 4) {
1091  int curidx, curidx2;
1092 
1093  qc1 = scaled[i ] * Q34 + 0.4054f;
1094  qc2 = scaled[i+1] * Q34 + 0.4054f;
1095  qc3 = scaled[i+2] * Q34 + 0.4054f;
1096  qc4 = scaled[i+3] * Q34 + 0.4054f;
1097 
    /* Clamp: $t4 = 7; slt sets a flag when 7 < qcN; movn then overwrites qcN with 7. */
1098  __asm__ volatile (
1099  ".set push \n\t"
1100  ".set noreorder \n\t"
1101 
1102  "ori $t4, $zero, 7 \n\t"
1103  "slt $t0, $t4, %[qc1] \n\t"
1104  "slt $t1, $t4, %[qc2] \n\t"
1105  "slt $t2, $t4, %[qc3] \n\t"
1106  "slt $t3, $t4, %[qc4] \n\t"
1107  "movn %[qc1], $t4, $t0 \n\t"
1108  "movn %[qc2], $t4, $t1 \n\t"
1109  "movn %[qc3], $t4, $t2 \n\t"
1110  "movn %[qc4], $t4, $t3 \n\t"
1111 
1112  ".set pop \n\t"
1113 
1114  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1115  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1116  :
1117  : "t0", "t1", "t2", "t3", "t4"
1118  );
1119 
1120  curidx = 8 * qc1;
1121  curidx += qc2;
1122 
1123  curidx2 = 8 * qc3;
1124  curidx2 += qc4;
1125 
    /* upair7_sign_bits is defined outside this chunk; presumably the sign-bit count per index. */
1126  curbits += p_bits[curidx] +
1127  upair7_sign_bits[curidx] +
1128  p_bits[curidx2] +
1129  upair7_sign_bits[curidx2];
1130  }
1131  return curbits;
1132 }
1133 
/**
 * Estimate the bit count of a band using the codebook selected by cb,
 * handling two coefficient pairs (four coefficients) per iteration.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int; the inline asm limits every value to at most 12.
 * Each pair is mapped to the index 13*a + b, and the entries of
 * ff_aac_spectral_bits[cb-1] and upair12_sign_bits for both pairs are summed.
 *
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects the Q34 factor from ff_aac_pow34sf_tab
 * @param cb        codebook index; cb-1 selects the bit-length table
 * @return accumulated bit count (int converted to float)
 *
 * s, pb, in, lambda, uplim and bits are not used by this function.
 */
1134 static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
1135  PutBitContext *pb, const float *in,
1136  const float *scaled, int size, int scale_idx,
1137  int cb, const float lambda, const float uplim,
1138  int *bits)
1139 {
1140  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1141  int i;
1142  int qc1, qc2, qc3, qc4;
1143  int curbits = 0;
1144 
1145  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1146 
1147  for (i = 0; i < size; i += 4) {
1148  int curidx, curidx2;
1149 
1150  qc1 = scaled[i ] * Q34 + 0.4054f;
1151  qc2 = scaled[i+1] * Q34 + 0.4054f;
1152  qc3 = scaled[i+2] * Q34 + 0.4054f;
1153  qc4 = scaled[i+3] * Q34 + 0.4054f;
1154 
    /* Clamp: $t4 = 12; slt sets a flag when 12 < qcN; movn then overwrites qcN with 12. */
1155  __asm__ volatile (
1156  ".set push \n\t"
1157  ".set noreorder \n\t"
1158 
1159  "ori $t4, $zero, 12 \n\t"
1160  "slt $t0, $t4, %[qc1] \n\t"
1161  "slt $t1, $t4, %[qc2] \n\t"
1162  "slt $t2, $t4, %[qc3] \n\t"
1163  "slt $t3, $t4, %[qc4] \n\t"
1164  "movn %[qc1], $t4, $t0 \n\t"
1165  "movn %[qc2], $t4, $t1 \n\t"
1166  "movn %[qc3], $t4, $t2 \n\t"
1167  "movn %[qc4], $t4, $t3 \n\t"
1168 
1169  ".set pop \n\t"
1170 
1171  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1172  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1173  :
1174  : "t0", "t1", "t2", "t3", "t4"
1175  );
1176 
1177  curidx = 13 * qc1;
1178  curidx += qc2;
1179 
1180  curidx2 = 13 * qc3;
1181  curidx2 += qc4;
1182 
    /* upair12_sign_bits is defined outside this chunk; presumably the sign-bit count per index. */
1183  curbits += p_bits[curidx] +
1184  p_bits[curidx2] +
1185  upair12_sign_bits[curidx] +
1186  upair12_sign_bits[curidx2];
1187  }
1188  return curbits;
1189 }
1190 
/**
 * Estimate the bit count of a band using the codebook selected by cb,
 * handling two coefficient pairs (four coefficients) per iteration and
 * adding extra bits for values above 15.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int. For every value the inline asm produces:
 *  - qcN: replaced by 16 when the value exceeds 15, otherwise unchanged;
 *  - cN:  (2 * floor(log2(v)) - 3) when the value exceeds 15, otherwise 0,
 *         where v is the value after the shll_s.w / srl pair.
 * Each pair is mapped to the index 17*a + b; the entries of
 * ff_aac_spectral_bits[cb-1] and esc_sign_bits for both pairs plus the four
 * cN terms are summed.
 *
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects the Q34 factor from ff_aac_pow34sf_tab
 * @param cb        codebook index; cb-1 selects the bit-length table
 * @return accumulated bit count (int converted to float)
 *
 * s, pb, in, lambda, uplim and bits are not used by this function.
 */
1191 static float get_band_numbits_ESC_mips(struct AACEncContext *s,
1192  PutBitContext *pb, const float *in,
1193  const float *scaled, int size, int scale_idx,
1194  int cb, const float lambda, const float uplim,
1195  int *bits)
1196 {
1197  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1198  int i;
1199  int qc1, qc2, qc3, qc4;
1200  int curbits = 0;
1201 
1202  uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1203 
1204  for (i = 0; i < size; i += 4) {
1205  int curidx, curidx2;
1206  int cond0, cond1, cond2, cond3;
1207  int c1, c2, c3, c4;
1208 
1209  qc1 = scaled[i ] * Q34 + 0.4054f;
1210  qc2 = scaled[i+1] * Q34 + 0.4054f;
1211  qc3 = scaled[i+2] * Q34 + 0.4054f;
1212  qc4 = scaled[i+3] * Q34 + 0.4054f;
1213 
1214  __asm__ volatile (
1215  ".set push \n\t"
1216  ".set noreorder \n\t"
1217 
1218  "ori $t4, $zero, 15 \n\t"
1219  "ori $t5, $zero, 16 \n\t"
    /* NOTE(review): shll_s.w (saturating left shift) followed by srl 18 appears to
     * cap cN at 13 bits for non-negative input -- confirm against the MIPS DSP ASE manual. */
1220  "shll_s.w %[c1], %[qc1], 18 \n\t"
1221  "shll_s.w %[c2], %[qc2], 18 \n\t"
1222  "shll_s.w %[c3], %[qc3], 18 \n\t"
1223  "shll_s.w %[c4], %[qc4], 18 \n\t"
1224  "srl %[c1], %[c1], 18 \n\t"
1225  "srl %[c2], %[c2], 18 \n\t"
1226  "srl %[c3], %[c3], 18 \n\t"
1227  "srl %[c4], %[c4], 18 \n\t"
    /* condN = (15 < qcN); when set, qcN becomes 16. */
1228  "slt %[cond0], $t4, %[qc1] \n\t"
1229  "slt %[cond1], $t4, %[qc2] \n\t"
1230  "slt %[cond2], $t4, %[qc3] \n\t"
1231  "slt %[cond3], $t4, %[qc4] \n\t"
1232  "movn %[qc1], $t5, %[cond0] \n\t"
1233  "movn %[qc2], $t5, %[cond1] \n\t"
1234  "movn %[qc3], $t5, %[cond2] \n\t"
1235  "movn %[qc4], $t5, %[cond3] \n\t"
    /* cN = 2 * (31 - clz(cN)) - 3, i.e. twice the index of the highest set bit, minus 3. */
1236  "ori $t5, $zero, 31 \n\t"
1237  "clz %[c1], %[c1] \n\t"
1238  "clz %[c2], %[c2] \n\t"
1239  "clz %[c3], %[c3] \n\t"
1240  "clz %[c4], %[c4] \n\t"
1241  "subu %[c1], $t5, %[c1] \n\t"
1242  "subu %[c2], $t5, %[c2] \n\t"
1243  "subu %[c3], $t5, %[c3] \n\t"
1244  "subu %[c4], $t5, %[c4] \n\t"
1245  "sll %[c1], %[c1], 1 \n\t"
1246  "sll %[c2], %[c2], 1 \n\t"
1247  "sll %[c3], %[c3], 1 \n\t"
1248  "sll %[c4], %[c4], 1 \n\t"
1249  "addiu %[c1], %[c1], -3 \n\t"
1250  "addiu %[c2], %[c2], -3 \n\t"
1251  "addiu %[c3], %[c3], -3 \n\t"
1252  "addiu %[c4], %[c4], -3 \n\t"
    /* Turn condN (0 or 1) into a mask (0 or all ones) and zero cN when the value was <= 15. */
1253  "subu %[cond0], $zero, %[cond0] \n\t"
1254  "subu %[cond1], $zero, %[cond1] \n\t"
1255  "subu %[cond2], $zero, %[cond2] \n\t"
1256  "subu %[cond3], $zero, %[cond3] \n\t"
1257  "and %[c1], %[c1], %[cond0] \n\t"
1258  "and %[c2], %[c2], %[cond1] \n\t"
1259  "and %[c3], %[c3], %[cond2] \n\t"
1260  "and %[c4], %[c4], %[cond3] \n\t"
1261 
1262  ".set pop \n\t"
1263 
1264  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1265  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1266  [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
1267  [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
1268  [c1]"=&r"(c1), [c2]"=&r"(c2),
1269  [c3]"=&r"(c3), [c4]"=&r"(c4)
1270  :
1271  : "t4", "t5"
1272  );
1273 
1274  curidx = 17 * qc1;
1275  curidx += qc2;
1276 
1277  curidx2 = 17 * qc3;
1278  curidx2 += qc4;
1279 
1280  curbits += p_bits[curidx];
1281  curbits += esc_sign_bits[curidx];
1282  curbits += p_bits[curidx2];
1283  curbits += esc_sign_bits[curidx2];
1284 
    /* Extra bits for the values that exceeded 15 (0 for the others). */
1285  curbits += c1;
1286  curbits += c2;
1287  curbits += c3;
1288  curbits += c4;
1289  }
1290  return curbits;
1291 }
1292 
1293 static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
1294  PutBitContext *pb, const float *in,
1295  const float *scaled, int size, int scale_idx,
1296  int cb, const float lambda, const float uplim,
1297  int *bits) = {
1298  get_band_numbits_ZERO_mips,
1299  get_band_numbits_SQUAD_mips,
1300  get_band_numbits_SQUAD_mips,
1301  get_band_numbits_UQUAD_mips,
1302  get_band_numbits_UQUAD_mips,
1303  get_band_numbits_SPAIR_mips,
1304  get_band_numbits_SPAIR_mips,
1305  get_band_numbits_UPAIR7_mips,
1306  get_band_numbits_UPAIR7_mips,
1307  get_band_numbits_UPAIR12_mips,
1308  get_band_numbits_UPAIR12_mips,
1309  get_band_numbits_ESC_mips,
1310 };
1311 
/* Call the bit-count estimator stored in get_band_numbits_arr for codebook cb. */
#define get_band_numbits(s, pb, in, scaled, size, scale_idx, cb, lambda, uplim, bits) \
    get_band_numbits_arr[cb](s, pb, in, scaled, size, scale_idx, cb, lambda, uplim, bits)
1318 
/**
 * Return the bit-count estimate for a band by dispatching on cb through
 * get_band_numbits(); no PutBitContext is supplied (NULL is passed).
 */
static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
                                     int *bits)
{
    const float estimate = get_band_numbits(s, NULL, in, scaled, size, scale_idx,
                                            cb, lambda, uplim, bits);

    return estimate;
}
1326 
1327 /**
1328  * Functions developed from template function and optimized for getting the band cost
1329  */
1330 #if HAVE_MIPSFPU
1331 static float get_band_cost_ZERO_mips(struct AACEncContext *s,
1332  PutBitContext *pb, const float *in,
1333  const float *scaled, int size, int scale_idx,
1334  int cb, const float lambda, const float uplim,
1335  int *bits)
1336 {
1337  int i;
1338  float cost = 0;
1339 
1340  for (i = 0; i < size; i += 4) {
1341  cost += in[i ] * in[i ];
1342  cost += in[i+1] * in[i+1];
1343  cost += in[i+2] * in[i+2];
1344  cost += in[i+3] * in[i+3];
1345  }
1346  if (bits)
1347  *bits = 0;
1348  return cost * lambda;
1349 }
1350 
/**
 * Compute the cost (distortion * lambda + bits) of coding a band with the
 * codebook selected by cb, processing four coefficients per iteration.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int. The first asm block reduces every value to 0 or 1
 * (1 when positive) and negates it when the sign bit of the matching word of
 * in[] is set. The four values form one base-3 index (plus 40) into
 * ff_aac_spectral_bits[cb-1] and ff_aac_codebook_vectors[cb-1]. The second
 * asm block computes the per-coefficient error in[k] - vec[k] * IQ, and the
 * squared errors are accumulated.
 *
 * @param in        original coefficients
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects Q34 and IQ from their lookup tables
 * @param cb        codebook index; cb-1 selects the tables
 * @param lambda    weight applied to the accumulated squared error
 * @param bits      if non-NULL, receives the accumulated bit count
 * @return cost * lambda + curbits
 *
 * s, pb and uplim are not used by this function.
 */
1351 static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
1352  PutBitContext *pb, const float *in,
1353  const float *scaled, int size, int scale_idx,
1354  int cb, const float lambda, const float uplim,
1355  int *bits)
1356 {
1357  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1358  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1359  int i;
1360  float cost = 0;
1361  int qc1, qc2, qc3, qc4;
1362  int curbits = 0;
1363 
1364  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1365  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1366 
1367  for (i = 0; i < size; i += 4) {
1368  const float *vec;
1369  int curidx;
    /* in_int and in_pos both point at in[i]; they are dereferenced only inside the asm blocks. */
1370  int *in_int = (int *)&in[i];
1371  float *in_pos = (float *)&in[i];
1372  float di0, di1, di2, di3;
1373 
1374  qc1 = scaled[i ] * Q34 + 0.4054f;
1375  qc2 = scaled[i+1] * Q34 + 0.4054f;
1376  qc3 = scaled[i+2] * Q34 + 0.4054f;
1377  qc4 = scaled[i+3] * Q34 + 0.4054f;
1378 
    /*
     * qcN = (0 < qcN), then qcN = -qcN when bit 31 of the matching in[] word is set,
     * giving a value in {-1, 0, 1}.
     */
1379  __asm__ volatile (
1380  ".set push \n\t"
1381  ".set noreorder \n\t"
1382 
1383  "slt %[qc1], $zero, %[qc1] \n\t"
1384  "slt %[qc2], $zero, %[qc2] \n\t"
1385  "slt %[qc3], $zero, %[qc3] \n\t"
1386  "slt %[qc4], $zero, %[qc4] \n\t"
1387  "lw $t0, 0(%[in_int]) \n\t"
1388  "lw $t1, 4(%[in_int]) \n\t"
1389  "lw $t2, 8(%[in_int]) \n\t"
1390  "lw $t3, 12(%[in_int]) \n\t"
1391  "srl $t0, $t0, 31 \n\t"
1392  "srl $t1, $t1, 31 \n\t"
1393  "srl $t2, $t2, 31 \n\t"
1394  "srl $t3, $t3, 31 \n\t"
1395  "subu $t4, $zero, %[qc1] \n\t"
1396  "subu $t5, $zero, %[qc2] \n\t"
1397  "subu $t6, $zero, %[qc3] \n\t"
1398  "subu $t7, $zero, %[qc4] \n\t"
1399  "movn %[qc1], $t4, $t0 \n\t"
1400  "movn %[qc2], $t5, $t1 \n\t"
1401  "movn %[qc3], $t6, $t2 \n\t"
1402  "movn %[qc4], $t7, $t3 \n\t"
1403 
1404  ".set pop \n\t"
1405 
1406  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1407  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1408  : [in_int]"r"(in_int)
1409  : "t0", "t1", "t2", "t3",
1410  "t4", "t5", "t6", "t7",
1411  "memory"
1412  );
1413 
    /* curidx = 27*qc1 + 9*qc2 + 3*qc3 + qc4 + 40; the +40 keeps the index non-negative. */
1414  curidx = qc1;
1415  curidx *= 3;
1416  curidx += qc2;
1417  curidx *= 3;
1418  curidx += qc3;
1419  curidx *= 3;
1420  curidx += qc4;
1421  curidx += 40;
1422 
1423  curbits += p_bits[curidx];
    /* Four floats per codebook entry. */
1424  vec = &p_codes[curidx*4];
1425 
    /* diN = in[N] - vec[N] * IQ (MIPS nmsub.s fd, fr, fs, ft yields fr - fs * ft). */
1426  __asm__ volatile (
1427  ".set push \n\t"
1428  ".set noreorder \n\t"
1429 
1430  "lwc1 $f0, 0(%[in_pos]) \n\t"
1431  "lwc1 $f1, 0(%[vec]) \n\t"
1432  "lwc1 $f2, 4(%[in_pos]) \n\t"
1433  "lwc1 $f3, 4(%[vec]) \n\t"
1434  "lwc1 $f4, 8(%[in_pos]) \n\t"
1435  "lwc1 $f5, 8(%[vec]) \n\t"
1436  "lwc1 $f6, 12(%[in_pos]) \n\t"
1437  "lwc1 $f7, 12(%[vec]) \n\t"
1438  "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
1439  "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
1440  "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
1441  "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
1442 
1443  ".set pop \n\t"
1444 
1445  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1446  [di2]"=&f"(di2), [di3]"=&f"(di3)
1447  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1448  [IQ]"f"(IQ)
1449  : "$f0", "$f1", "$f2", "$f3",
1450  "$f4", "$f5", "$f6", "$f7",
1451  "memory"
1452  );
1453 
1454  cost += di0 * di0 + di1 * di1
1455  + di2 * di2 + di3 * di3;
1456  }
1457 
1458  if (bits)
1459  *bits = curbits;
1460  return cost * lambda + curbits;
1461 }
1462 
/**
 * Compute the cost (distortion * lambda + bits) of coding a band with the
 * codebook selected by cb, processing four coefficients per iteration.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int; the first asm block limits every value to at most 2.
 * The four values form one base-3 index into ff_aac_spectral_bits[cb-1],
 * uquad_sign_bits and ff_aac_codebook_vectors[cb-1]. The second asm block
 * computes the per-coefficient error |in[k]| - vec[k] * IQ, and the squared
 * errors are accumulated.
 *
 * @param in        original coefficients
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects Q34 and IQ from their lookup tables
 * @param cb        codebook index; cb-1 selects the tables
 * @param lambda    weight applied to the accumulated squared error
 * @param bits      if non-NULL, receives the accumulated bit count
 * @return cost * lambda + curbits
 *
 * s, pb and uplim are not used by this function.
 */
1463 static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
1464  PutBitContext *pb, const float *in,
1465  const float *scaled, int size, int scale_idx,
1466  int cb, const float lambda, const float uplim,
1467  int *bits)
1468 {
1469  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1470  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1471  int i;
1472  float cost = 0;
1473  int curbits = 0;
1474  int qc1, qc2, qc3, qc4;
1475 
1476  uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1477  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1478 
1479  for (i = 0; i < size; i += 4) {
1480  const float *vec;
1481  int curidx;
1482  float *in_pos = (float *)&in[i];
1483  float di0, di1, di2, di3;
1484 
1485  qc1 = scaled[i ] * Q34 + 0.4054f;
1486  qc2 = scaled[i+1] * Q34 + 0.4054f;
1487  qc3 = scaled[i+2] * Q34 + 0.4054f;
1488  qc4 = scaled[i+3] * Q34 + 0.4054f;
1489 
    /* Clamp: $t4 = 2; slt sets a flag when 2 < qcN; movn then overwrites qcN with 2. */
1490  __asm__ volatile (
1491  ".set push \n\t"
1492  ".set noreorder \n\t"
1493 
1494  "ori $t4, $zero, 2 \n\t"
1495  "slt $t0, $t4, %[qc1] \n\t"
1496  "slt $t1, $t4, %[qc2] \n\t"
1497  "slt $t2, $t4, %[qc3] \n\t"
1498  "slt $t3, $t4, %[qc4] \n\t"
1499  "movn %[qc1], $t4, $t0 \n\t"
1500  "movn %[qc2], $t4, $t1 \n\t"
1501  "movn %[qc3], $t4, $t2 \n\t"
1502  "movn %[qc4], $t4, $t3 \n\t"
1503 
1504  ".set pop \n\t"
1505 
1506  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1507  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1508  :
1509  : "t0", "t1", "t2", "t3", "t4"
1510  );
1511 
    /* curidx = 27*qc1 + 9*qc2 + 3*qc3 + qc4 */
1512  curidx = qc1;
1513  curidx *= 3;
1514  curidx += qc2;
1515  curidx *= 3;
1516  curidx += qc3;
1517  curidx *= 3;
1518  curidx += qc4;
1519 
1520  curbits += p_bits[curidx];
1521  curbits += uquad_sign_bits[curidx];
    /* Four floats per codebook entry. */
1522  vec = &p_codes[curidx*4];
1523 
    /* diN = |in[N]| - vec[N] * IQ (MIPS nmsub.s fd, fr, fs, ft yields fr - fs * ft). */
1524  __asm__ volatile (
1525  ".set push \n\t"
1526  ".set noreorder \n\t"
1527 
1528  "lwc1 %[di0], 0(%[in_pos]) \n\t"
1529  "lwc1 %[di1], 4(%[in_pos]) \n\t"
1530  "lwc1 %[di2], 8(%[in_pos]) \n\t"
1531  "lwc1 %[di3], 12(%[in_pos]) \n\t"
1532  "abs.s %[di0], %[di0] \n\t"
1533  "abs.s %[di1], %[di1] \n\t"
1534  "abs.s %[di2], %[di2] \n\t"
1535  "abs.s %[di3], %[di3] \n\t"
1536  "lwc1 $f0, 0(%[vec]) \n\t"
1537  "lwc1 $f1, 4(%[vec]) \n\t"
1538  "lwc1 $f2, 8(%[vec]) \n\t"
1539  "lwc1 $f3, 12(%[vec]) \n\t"
1540  "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
1541  "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
1542  "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
1543  "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
1544 
1545  ".set pop \n\t"
1546 
1547  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1548  [di2]"=&f"(di2), [di3]"=&f"(di3)
1549  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1550  [IQ]"f"(IQ)
1551  : "$f0", "$f1", "$f2", "$f3",
1552  "memory"
1553  );
1554 
1555  cost += di0 * di0 + di1 * di1
1556  + di2 * di2 + di3 * di3;
1557  }
1558 
1559  if (bits)
1560  *bits = curbits;
1561  return cost * lambda + curbits;
1562 }
1563 
/**
 * Compute the cost (distortion * lambda + bits) of coding a band with the
 * codebook selected by cb, handling two coefficient pairs per iteration.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int. The first asm block limits every value to at most 4 and
 * negates it when the sign bit of the matching word of in[] is set. Each pair
 * is mapped to the index 9*a + b + 40 into ff_aac_spectral_bits[cb-1] and
 * ff_aac_codebook_vectors[cb-1]. The second asm block computes the
 * per-coefficient error in[k] - vec[k] * IQ, and the squared errors are
 * accumulated.
 *
 * @param in        original coefficients
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects Q34 and IQ from their lookup tables
 * @param cb        codebook index; cb-1 selects the tables
 * @param lambda    weight applied to the accumulated squared error
 * @param bits      if non-NULL, receives the accumulated bit count
 * @return cost * lambda + curbits
 *
 * s, pb and uplim are not used by this function.
 */
1564 static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
1565  PutBitContext *pb, const float *in,
1566  const float *scaled, int size, int scale_idx,
1567  int cb, const float lambda, const float uplim,
1568  int *bits)
1569 {
1570  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1571  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1572  int i;
1573  float cost = 0;
1574  int qc1, qc2, qc3, qc4;
1575  int curbits = 0;
1576 
1577  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1578  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1579 
1580  for (i = 0; i < size; i += 4) {
1581  const float *vec, *vec2;
1582  int curidx, curidx2;
    /* in_int and in_pos both point at in[i]; they are dereferenced only inside the asm blocks. */
1583  int *in_int = (int *)&in[i];
1584  float *in_pos = (float *)&in[i];
1585  float di0, di1, di2, di3;
1586 
1587  qc1 = scaled[i ] * Q34 + 0.4054f;
1588  qc2 = scaled[i+1] * Q34 + 0.4054f;
1589  qc3 = scaled[i+2] * Q34 + 0.4054f;
1590  qc4 = scaled[i+3] * Q34 + 0.4054f;
1591 
    /*
     * Step 1: clamp each qcN to at most 4 ($t4 = 4, slt + movn).
     * Step 2: load the raw words of in[], shift the sign bit down (srl 31),
     *         and replace qcN with -qcN when that bit is set ($t4 is reused).
     */
1592  __asm__ volatile (
1593  ".set push \n\t"
1594  ".set noreorder \n\t"
1595 
1596  "ori $t4, $zero, 4 \n\t"
1597  "slt $t0, $t4, %[qc1] \n\t"
1598  "slt $t1, $t4, %[qc2] \n\t"
1599  "slt $t2, $t4, %[qc3] \n\t"
1600  "slt $t3, $t4, %[qc4] \n\t"
1601  "movn %[qc1], $t4, $t0 \n\t"
1602  "movn %[qc2], $t4, $t1 \n\t"
1603  "movn %[qc3], $t4, $t2 \n\t"
1604  "movn %[qc4], $t4, $t3 \n\t"
1605  "lw $t0, 0(%[in_int]) \n\t"
1606  "lw $t1, 4(%[in_int]) \n\t"
1607  "lw $t2, 8(%[in_int]) \n\t"
1608  "lw $t3, 12(%[in_int]) \n\t"
1609  "srl $t0, $t0, 31 \n\t"
1610  "srl $t1, $t1, 31 \n\t"
1611  "srl $t2, $t2, 31 \n\t"
1612  "srl $t3, $t3, 31 \n\t"
1613  "subu $t4, $zero, %[qc1] \n\t"
1614  "subu $t5, $zero, %[qc2] \n\t"
1615  "subu $t6, $zero, %[qc3] \n\t"
1616  "subu $t7, $zero, %[qc4] \n\t"
1617  "movn %[qc1], $t4, $t0 \n\t"
1618  "movn %[qc2], $t5, $t1 \n\t"
1619  "movn %[qc3], $t6, $t2 \n\t"
1620  "movn %[qc4], $t7, $t3 \n\t"
1621 
1622  ".set pop \n\t"
1623 
1624  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1625  [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1626  : [in_int]"r"(in_int)
1627  : "t0", "t1", "t2", "t3",
1628  "t4", "t5", "t6", "t7",
1629  "memory"
1630  );
1631 
    /* With each value in [-4, 4], 9*a + b lies in [-40, 40]; +40 makes the index non-negative. */
1632  curidx = 9 * qc1;
1633  curidx += qc2 + 40;
1634 
1635  curidx2 = 9 * qc3;
1636  curidx2 += qc4 + 40;
1637 
1638  curbits += p_bits[curidx];
1639  curbits += p_bits[curidx2];
1640 
    /* Two floats per codebook entry. */
1641  vec = &p_codes[curidx*2];
1642  vec2 = &p_codes[curidx2*2];
1643 
    /* di0/di1 use vec, di2/di3 use vec2: diN = in[N] - code * IQ (nmsub.s yields fr - fs * ft). */
1644  __asm__ volatile (
1645  ".set push \n\t"
1646  ".set noreorder \n\t"
1647 
1648  "lwc1 $f0, 0(%[in_pos]) \n\t"
1649  "lwc1 $f1, 0(%[vec]) \n\t"
1650  "lwc1 $f2, 4(%[in_pos]) \n\t"
1651  "lwc1 $f3, 4(%[vec]) \n\t"
1652  "lwc1 $f4, 8(%[in_pos]) \n\t"
1653  "lwc1 $f5, 0(%[vec2]) \n\t"
1654  "lwc1 $f6, 12(%[in_pos]) \n\t"
1655  "lwc1 $f7, 4(%[vec2]) \n\t"
1656  "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
1657  "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
1658  "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
1659  "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
1660 
1661  ".set pop \n\t"
1662 
1663  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1664  [di2]"=&f"(di2), [di3]"=&f"(di3)
1665  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1666  [vec2]"r"(vec2), [IQ]"f"(IQ)
1667  : "$f0", "$f1", "$f2", "$f3",
1668  "$f4", "$f5", "$f6", "$f7",
1669  "memory"
1670  );
1671 
1672  cost += di0 * di0 + di1 * di1
1673  + di2 * di2 + di3 * di3;
1674  }
1675 
1676  if (bits)
1677  *bits = curbits;
1678  return cost * lambda + curbits;
1679 }
1680 
/**
 * Compute the cost (distortion * lambda + bits) of coding a band with the
 * codebook selected by cb, handling two coefficient pairs per iteration.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int; the first asm block limits every value to at most 7.
 * Each pair is mapped to the index 8*a + b into ff_aac_spectral_bits[cb-1],
 * upair7_sign_bits and ff_aac_codebook_vectors[cb-1]. The second asm block
 * computes the per-coefficient error |in[k]| - vec[k] * IQ, and the squared
 * errors are accumulated.
 *
 * @param in        original coefficients
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects Q34 and IQ from their lookup tables
 * @param cb        codebook index; cb-1 selects the tables
 * @param lambda    weight applied to the accumulated squared error
 * @param bits      if non-NULL, receives the accumulated bit count
 * @return cost * lambda + curbits
 *
 * s, pb and uplim are not used by this function.
 */
1681 static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
1682  PutBitContext *pb, const float *in,
1683  const float *scaled, int size, int scale_idx,
1684  int cb, const float lambda, const float uplim,
1685  int *bits)
1686 {
1687  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1688  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1689  int i;
1690  float cost = 0;
1691  int qc1, qc2, qc3, qc4;
1692  int curbits = 0;
1693 
1694  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1695  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1696 
1697  for (i = 0; i < size; i += 4) {
1698  const float *vec, *vec2;
    /* NOTE(review): sign1, count1, sign2 and count2 are written by the asm below
     * but never read afterwards in this function -- the sign/count part of the
     * asm looks like dead work here; confirm before removing. */
1699  int curidx, curidx2, sign1, count1, sign2, count2;
1700  int *in_int = (int *)&in[i];
1701  float *in_pos = (float *)&in[i];
1702  float di0, di1, di2, di3;
1703 
1704  qc1 = scaled[i ] * Q34 + 0.4054f;
1705  qc2 = scaled[i+1] * Q34 + 0.4054f;
1706  qc3 = scaled[i+2] * Q34 + 0.4054f;
1707  qc4 = scaled[i+3] * Q34 + 0.4054f;
1708 
    /*
     * Clamp each qcN to at most 7, then build per-pair sign words from the
     * sign of in[] for non-zero qcN (signN) and count the positive qcN
     * per pair (countN).
     */
1709  __asm__ volatile (
1710  ".set push \n\t"
1711  ".set noreorder \n\t"
1712 
1713  "ori $t4, $zero, 7 \n\t"
1714  "ori %[sign1], $zero, 0 \n\t"
1715  "ori %[sign2], $zero, 0 \n\t"
1716  "slt $t0, $t4, %[qc1] \n\t"
1717  "slt $t1, $t4, %[qc2] \n\t"
1718  "slt $t2, $t4, %[qc3] \n\t"
1719  "slt $t3, $t4, %[qc4] \n\t"
1720  "movn %[qc1], $t4, $t0 \n\t"
1721  "movn %[qc2], $t4, $t1 \n\t"
1722  "movn %[qc3], $t4, $t2 \n\t"
1723  "movn %[qc4], $t4, $t3 \n\t"
1724  "lw $t0, 0(%[in_int]) \n\t"
1725  "lw $t1, 4(%[in_int]) \n\t"
1726  "lw $t2, 8(%[in_int]) \n\t"
1727  "lw $t3, 12(%[in_int]) \n\t"
1728  "slt $t0, $t0, $zero \n\t"
1729  "movn %[sign1], $t0, %[qc1] \n\t"
1730  "slt $t2, $t2, $zero \n\t"
1731  "movn %[sign2], $t2, %[qc3] \n\t"
1732  "slt $t1, $t1, $zero \n\t"
1733  "sll $t0, %[sign1], 1 \n\t"
1734  "or $t0, $t0, $t1 \n\t"
1735  "movn %[sign1], $t0, %[qc2] \n\t"
1736  "slt $t3, $t3, $zero \n\t"
1737  "sll $t0, %[sign2], 1 \n\t"
1738  "or $t0, $t0, $t3 \n\t"
1739  "movn %[sign2], $t0, %[qc4] \n\t"
1740  "slt %[count1], $zero, %[qc1] \n\t"
1741  "slt $t1, $zero, %[qc2] \n\t"
1742  "slt %[count2], $zero, %[qc3] \n\t"
1743  "slt $t2, $zero, %[qc4] \n\t"
1744  "addu %[count1], %[count1], $t1 \n\t"
1745  "addu %[count2], %[count2], $t2 \n\t"
1746 
1747  ".set pop \n\t"
1748 
1749  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1750  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1751  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1752  [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1753  : [in_int]"r"(in_int)
1754  : "t0", "t1", "t2", "t3", "t4",
1755  "memory"
1756  );
1757 
1758  curidx = 8 * qc1;
1759  curidx += qc2;
1760 
1761  curidx2 = 8 * qc3;
1762  curidx2 += qc4;
1763 
1764  curbits += p_bits[curidx];
1765  curbits += upair7_sign_bits[curidx];
    /* Two floats per codebook entry. */
1766  vec = &p_codes[curidx*2];
1767 
1768  curbits += p_bits[curidx2];
1769  curbits += upair7_sign_bits[curidx2];
1770  vec2 = &p_codes[curidx2*2];
1771 
    /* di0/di1 use vec, di2/di3 use vec2: diN = |in[N]| - code * IQ (nmsub.s yields fr - fs * ft). */
1772  __asm__ volatile (
1773  ".set push \n\t"
1774  ".set noreorder \n\t"
1775 
1776  "lwc1 %[di0], 0(%[in_pos]) \n\t"
1777  "lwc1 %[di1], 4(%[in_pos]) \n\t"
1778  "lwc1 %[di2], 8(%[in_pos]) \n\t"
1779  "lwc1 %[di3], 12(%[in_pos]) \n\t"
1780  "abs.s %[di0], %[di0] \n\t"
1781  "abs.s %[di1], %[di1] \n\t"
1782  "abs.s %[di2], %[di2] \n\t"
1783  "abs.s %[di3], %[di3] \n\t"
1784  "lwc1 $f0, 0(%[vec]) \n\t"
1785  "lwc1 $f1, 4(%[vec]) \n\t"
1786  "lwc1 $f2, 0(%[vec2]) \n\t"
1787  "lwc1 $f3, 4(%[vec2]) \n\t"
1788  "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
1789  "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
1790  "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
1791  "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
1792 
1793  ".set pop \n\t"
1794 
1795  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1796  [di2]"=&f"(di2), [di3]"=&f"(di3)
1797  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1798  [vec2]"r"(vec2), [IQ]"f"(IQ)
1799  : "$f0", "$f1", "$f2", "$f3",
1800  "memory"
1801  );
1802 
1803  cost += di0 * di0 + di1 * di1
1804  + di2 * di2 + di3 * di3;
1805  }
1806 
1807  if (bits)
1808  *bits = curbits;
1809  return cost * lambda + curbits;
1810 }
1811 
/**
 * Compute the cost (distortion * lambda + bits) of coding a band with the
 * codebook selected by cb, handling two coefficient pairs per iteration.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int; the first asm block limits every value to at most 12.
 * Each pair is mapped to the index 13*a + b into ff_aac_spectral_bits[cb-1],
 * upair12_sign_bits and ff_aac_codebook_vectors[cb-1]. The second asm block
 * computes the per-coefficient error |in[k]| - vec[k] * IQ, and the squared
 * errors are accumulated.
 *
 * @param in        original coefficients
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects Q34 and IQ from their lookup tables
 * @param cb        codebook index; cb-1 selects the tables
 * @param lambda    weight applied to the accumulated squared error
 * @param bits      if non-NULL, receives the accumulated bit count
 * @return cost * lambda + curbits
 *
 * s, pb and uplim are not used by this function.
 */
1812 static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
1813  PutBitContext *pb, const float *in,
1814  const float *scaled, int size, int scale_idx,
1815  int cb, const float lambda, const float uplim,
1816  int *bits)
1817 {
1818  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1819  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1820  int i;
1821  float cost = 0;
1822  int qc1, qc2, qc3, qc4;
1823  int curbits = 0;
1824 
1825  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1826  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1827 
1828  for (i = 0; i < size; i += 4) {
1829  const float *vec, *vec2;
1830  int curidx, curidx2;
    /* NOTE(review): sign1, count1, sign2 and count2 are written by the asm below
     * but never read afterwards in this function -- the sign/count part of the
     * asm looks like dead work here; confirm before removing. */
1831  int sign1, count1, sign2, count2;
1832  int *in_int = (int *)&in[i];
1833  float *in_pos = (float *)&in[i];
1834  float di0, di1, di2, di3;
1835 
1836  qc1 = scaled[i ] * Q34 + 0.4054f;
1837  qc2 = scaled[i+1] * Q34 + 0.4054f;
1838  qc3 = scaled[i+2] * Q34 + 0.4054f;
1839  qc4 = scaled[i+3] * Q34 + 0.4054f;
1840 
    /*
     * Clamp each qcN to at most 12, then build per-pair sign words from the
     * sign of in[] for non-zero qcN (signN) and count the positive qcN
     * per pair (countN).
     */
1841  __asm__ volatile (
1842  ".set push \n\t"
1843  ".set noreorder \n\t"
1844 
1845  "ori $t4, $zero, 12 \n\t"
1846  "ori %[sign1], $zero, 0 \n\t"
1847  "ori %[sign2], $zero, 0 \n\t"
1848  "slt $t0, $t4, %[qc1] \n\t"
1849  "slt $t1, $t4, %[qc2] \n\t"
1850  "slt $t2, $t4, %[qc3] \n\t"
1851  "slt $t3, $t4, %[qc4] \n\t"
1852  "movn %[qc1], $t4, $t0 \n\t"
1853  "movn %[qc2], $t4, $t1 \n\t"
1854  "movn %[qc3], $t4, $t2 \n\t"
1855  "movn %[qc4], $t4, $t3 \n\t"
1856  "lw $t0, 0(%[in_int]) \n\t"
1857  "lw $t1, 4(%[in_int]) \n\t"
1858  "lw $t2, 8(%[in_int]) \n\t"
1859  "lw $t3, 12(%[in_int]) \n\t"
1860  "slt $t0, $t0, $zero \n\t"
1861  "movn %[sign1], $t0, %[qc1] \n\t"
1862  "slt $t2, $t2, $zero \n\t"
1863  "movn %[sign2], $t2, %[qc3] \n\t"
1864  "slt $t1, $t1, $zero \n\t"
1865  "sll $t0, %[sign1], 1 \n\t"
1866  "or $t0, $t0, $t1 \n\t"
1867  "movn %[sign1], $t0, %[qc2] \n\t"
1868  "slt $t3, $t3, $zero \n\t"
1869  "sll $t0, %[sign2], 1 \n\t"
1870  "or $t0, $t0, $t3 \n\t"
1871  "movn %[sign2], $t0, %[qc4] \n\t"
1872  "slt %[count1], $zero, %[qc1] \n\t"
1873  "slt $t1, $zero, %[qc2] \n\t"
1874  "slt %[count2], $zero, %[qc3] \n\t"
1875  "slt $t2, $zero, %[qc4] \n\t"
1876  "addu %[count1], %[count1], $t1 \n\t"
1877  "addu %[count2], %[count2], $t2 \n\t"
1878 
1879  ".set pop \n\t"
1880 
1881  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1882  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1883  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1884  [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1885  : [in_int]"r"(in_int)
1886  : "t0", "t1", "t2", "t3", "t4",
1887  "memory"
1888  );
1889 
1890  curidx = 13 * qc1;
1891  curidx += qc2;
1892 
1893  curidx2 = 13 * qc3;
1894  curidx2 += qc4;
1895 
1896  curbits += p_bits[curidx];
1897  curbits += p_bits[curidx2];
1898  curbits += upair12_sign_bits[curidx];
1899  curbits += upair12_sign_bits[curidx2];
    /* Two floats per codebook entry. */
1900  vec = &p_codes[curidx*2];
1901  vec2 = &p_codes[curidx2*2];
1902 
    /* di0/di1 use vec, di2/di3 use vec2: diN = |in[N]| - code * IQ (nmsub.s yields fr - fs * ft). */
1903  __asm__ volatile (
1904  ".set push \n\t"
1905  ".set noreorder \n\t"
1906 
1907  "lwc1 %[di0], 0(%[in_pos]) \n\t"
1908  "lwc1 %[di1], 4(%[in_pos]) \n\t"
1909  "lwc1 %[di2], 8(%[in_pos]) \n\t"
1910  "lwc1 %[di3], 12(%[in_pos]) \n\t"
1911  "abs.s %[di0], %[di0] \n\t"
1912  "abs.s %[di1], %[di1] \n\t"
1913  "abs.s %[di2], %[di2] \n\t"
1914  "abs.s %[di3], %[di3] \n\t"
1915  "lwc1 $f0, 0(%[vec]) \n\t"
1916  "lwc1 $f1, 4(%[vec]) \n\t"
1917  "lwc1 $f2, 0(%[vec2]) \n\t"
1918  "lwc1 $f3, 4(%[vec2]) \n\t"
1919  "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
1920  "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
1921  "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
1922  "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
1923 
1924  ".set pop \n\t"
1925 
1926  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1927  [di2]"=&f"(di2), [di3]"=&f"(di3)
1928  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1929  [vec2]"r"(vec2), [IQ]"f"(IQ)
1930  : "$f0", "$f1", "$f2", "$f3",
1931  "memory"
1932  );
1933 
1934  cost += di0 * di0 + di1 * di1
1935  + di2 * di2 + di3 * di3;
1936  }
1937 
1938  if (bits)
1939  *bits = curbits;
1940  return cost * lambda + curbits;
1941 }
1942 
/**
 * Compute the cost (distortion * lambda + bits) of coding a band with the
 * codebook selected by cb, handling two coefficient pairs per iteration and
 * treating values above 15 separately.
 *
 * Each scaled coefficient is multiplied by Q34, offset by 0.4054f and
 * converted to int. The asm block produces, per value:
 *  - cN:    the value after the shll_s.w / srl pair;
 *  - condN: 1 when the value exceeds 15, else 0;
 *  - qcN:   replaced by 16 when condN is set.
 * Each pair is mapped to the index 17*a + b into ff_aac_spectral_bits[cb-1],
 * esc_sign_bits and ff_aac_codebook_vectors[cb-1]. Values with condN set add
 * (2 * av_log2(cN) - 3) bits, and their error is measured against
 * cN * cbrtf(cN) * IQ, or against CLIPPED_ESCAPE when |in| is at least that
 * large. All other values are measured against the codebook entry times IQ.
 *
 * @param in        original coefficients
 * @param scaled    coefficients read in groups of four
 * @param size      number of coefficients; the loop steps by 4
 * @param scale_idx selects Q34 and IQ from their lookup tables
 * @param cb        codebook index; cb-1 selects the tables
 * @param lambda    weight applied to the accumulated squared error
 * @param bits      if non-NULL, receives the accumulated bit count
 * @return cost * lambda + curbits
 *
 * s, pb and uplim are not used by this function.
 */
1943 static float get_band_cost_ESC_mips(struct AACEncContext *s,
1944  PutBitContext *pb, const float *in,
1945  const float *scaled, int size, int scale_idx,
1946  int cb, const float lambda, const float uplim,
1947  int *bits)
1948 {
1949  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1950  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    /* NOTE(review): 165140.0f is presumably the dequantized magnitude of the largest
     * value the escape path can represent -- confirm against the non-MIPS coder. */
1951  const float CLIPPED_ESCAPE = 165140.0f * IQ;
1952  int i;
1953  float cost = 0;
1954  int qc1, qc2, qc3, qc4;
1955  int curbits = 0;
1956 
1957  uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1958  float *p_codes = (float* )ff_aac_codebook_vectors[cb-1];
1959 
1960  for (i = 0; i < size; i += 4) {
1961  const float *vec, *vec2;
1962  int curidx, curidx2;
1963  float t1, t2, t3, t4;
1964  float di1, di2, di3, di4;
1965  int cond0, cond1, cond2, cond3;
1966  int c1, c2, c3, c4;
1967 
1968  qc1 = scaled[i ] * Q34 + 0.4054f;
1969  qc2 = scaled[i+1] * Q34 + 0.4054f;
1970  qc3 = scaled[i+2] * Q34 + 0.4054f;
1971  qc4 = scaled[i+3] * Q34 + 0.4054f;
1972 
    /*
     * NOTE(review): shll_s.w (saturating left shift) followed by srl 18 appears to
     * cap cN at 13 bits for non-negative input -- confirm against the MIPS DSP ASE manual.
     * condN = (15 < qcN); when set, qcN becomes 16.
     */
1973  __asm__ volatile (
1974  ".set push \n\t"
1975  ".set noreorder \n\t"
1976 
1977  "ori $t4, $zero, 15 \n\t"
1978  "ori $t5, $zero, 16 \n\t"
1979  "shll_s.w %[c1], %[qc1], 18 \n\t"
1980  "shll_s.w %[c2], %[qc2], 18 \n\t"
1981  "shll_s.w %[c3], %[qc3], 18 \n\t"
1982  "shll_s.w %[c4], %[qc4], 18 \n\t"
1983  "srl %[c1], %[c1], 18 \n\t"
1984  "srl %[c2], %[c2], 18 \n\t"
1985  "srl %[c3], %[c3], 18 \n\t"
1986  "srl %[c4], %[c4], 18 \n\t"
1987  "slt %[cond0], $t4, %[qc1] \n\t"
1988  "slt %[cond1], $t4, %[qc2] \n\t"
1989  "slt %[cond2], $t4, %[qc3] \n\t"
1990  "slt %[cond3], $t4, %[qc4] \n\t"
1991  "movn %[qc1], $t5, %[cond0] \n\t"
1992  "movn %[qc2], $t5, %[cond1] \n\t"
1993  "movn %[qc3], $t5, %[cond2] \n\t"
1994  "movn %[qc4], $t5, %[cond3] \n\t"
1995 
1996  ".set pop \n\t"
1997 
1998  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1999  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2000  [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
2001  [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
2002  [c1]"=&r"(c1), [c2]"=&r"(c2),
2003  [c3]"=&r"(c3), [c4]"=&r"(c4)
2004  :
2005  : "t4", "t5"
2006  );
2007 
2008  curidx = 17 * qc1;
2009  curidx += qc2;
2010 
2011  curidx2 = 17 * qc3;
2012  curidx2 += qc4;
2013 
2014  curbits += p_bits[curidx];
2015  curbits += esc_sign_bits[curidx];
    /* Two floats per codebook entry. */
2016  vec = &p_codes[curidx*2];
2017 
2018  curbits += p_bits[curidx2];
2019  curbits += esc_sign_bits[curidx2];
2020  vec2 = &p_codes[curidx2*2];
2021 
    /* -condN is 0 or all ones, so the extra bits are added only when condN is set. */
2022  curbits += (av_log2(c1) * 2 - 3) & (-cond0);
2023  curbits += (av_log2(c2) * 2 - 3) & (-cond1);
2024  curbits += (av_log2(c3) * 2 - 3) & (-cond2);
2025  curbits += (av_log2(c4) * 2 - 3) & (-cond3);
2026 
2027  t1 = fabsf(in[i ]);
2028  t2 = fabsf(in[i+1]);
2029  t3 = fabsf(in[i+2]);
2030  t4 = fabsf(in[i+3]);
2031 
    /* c * cbrtf(c) equals c^(4/3); it is scaled by IQ before being compared with |in|. */
2032  if (cond0) {
2033  if (t1 >= CLIPPED_ESCAPE) {
2034  di1 = t1 - CLIPPED_ESCAPE;
2035  } else {
2036  di1 = t1 - c1 * cbrtf(c1) * IQ;
2037  }
2038  } else
2039  di1 = t1 - vec[0] * IQ;
2040 
2041  if (cond1) {
2042  if (t2 >= CLIPPED_ESCAPE) {
2043  di2 = t2 - CLIPPED_ESCAPE;
2044  } else {
2045  di2 = t2 - c2 * cbrtf(c2) * IQ;
2046  }
2047  } else
2048  di2 = t2 - vec[1] * IQ;
2049 
2050  if (cond2) {
2051  if (t3 >= CLIPPED_ESCAPE) {
2052  di3 = t3 - CLIPPED_ESCAPE;
2053  } else {
2054  di3 = t3 - c3 * cbrtf(c3) * IQ;
2055  }
2056  } else
2057  di3 = t3 - vec2[0] * IQ;
2058 
2059  if (cond3) {
2060  if (t4 >= CLIPPED_ESCAPE) {
2061  di4 = t4 - CLIPPED_ESCAPE;
2062  } else {
2063  di4 = t4 - c4 * cbrtf(c4) * IQ;
2064  }
2065  } else
2066  di4 = t4 - vec2[1]*IQ;
2067 
2068  cost += di1 * di1 + di2 * di2
2069  + di3 * di3 + di4 * di4;
2070  }
2071 
2072  if (bits)
2073  *bits = curbits;
2074  return cost * lambda + curbits;
2075 }
2076 
2077 static float (*const get_band_cost_arr[])(struct AACEncContext *s,
2078  PutBitContext *pb, const float *in,
2079  const float *scaled, int size, int scale_idx,
2080  int cb, const float lambda, const float uplim,
2081  int *bits) = {
2082  get_band_cost_ZERO_mips,
2083  get_band_cost_SQUAD_mips,
2084  get_band_cost_SQUAD_mips,
2085  get_band_cost_UQUAD_mips,
2086  get_band_cost_UQUAD_mips,
2087  get_band_cost_SPAIR_mips,
2088  get_band_cost_SPAIR_mips,
2089  get_band_cost_UPAIR7_mips,
2090  get_band_cost_UPAIR7_mips,
2091  get_band_cost_UPAIR12_mips,
2092  get_band_cost_UPAIR12_mips,
2093  get_band_cost_ESC_mips,
2094 };
2095 
/*
 * Call the band-cost routine registered for codebook cb in
 * get_band_cost_arr, forwarding every argument unchanged.
 * Note that cb is expanded twice (table index and call argument), so it
 * must be an expression without side effects.
 */
#define get_band_cost(s, pb, in, scaled, size, scale_idx, cb, lambda, uplim, bits) \
    get_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, cb, lambda, uplim, bits)
2102 
2103 static float quantize_band_cost(struct AACEncContext *s, const float *in,
2104  const float *scaled, int size, int scale_idx,
2105  int cb, const float lambda, const float uplim,
2106  int *bits)
2107 {
2108  return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
2109 }
2110 
2111 static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
2112  AACEncContext *s,
2113  SingleChannelElement *sce,
2114  const float lambda)
2115 {
2116  int start = 0, i, w, w2, g;
2117  int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels;
2118  float dists[128] = { 0 }, uplims[128];
2119  float maxvals[128];
2120  int fflag, minscaler;
2121  int its = 0;
2122  int allz = 0;
2123  float minthr = INFINITY;
2124 
2125  destbits = FFMIN(destbits, 5800);
2126  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2127  for (g = 0; g < sce->ics.num_swb; g++) {
2128  int nz = 0;
2129  float uplim = 0.0f;
2130  for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2131  FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
2132  uplim += band->threshold;
2133  if (band->energy <= band->threshold || band->threshold == 0.0f) {
2134  sce->zeroes[(w+w2)*16+g] = 1;
2135  continue;
2136  }
2137  nz = 1;
2138  }
2139  uplims[w*16+g] = uplim *512;
2140  sce->zeroes[w*16+g] = !nz;
2141  if (nz)
2142  minthr = FFMIN(minthr, uplim);
2143  allz |= nz;
2144  }
2145  }
2146  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2147  for (g = 0; g < sce->ics.num_swb; g++) {
2148  if (sce->zeroes[w*16+g]) {
2149  sce->sf_idx[w*16+g] = SCALE_ONE_POS;
2150  continue;
2151  }
2152  sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
2153  }
2154  }
2155 
2156  if (!allz)
2157  return;
2158  abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2159 
2160  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2161  start = w*128;
2162  for (g = 0; g < sce->ics.num_swb; g++) {
2163  const float *scaled = s->scoefs + start;
2164  maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
2165  start += sce->ics.swb_sizes[g];
2166  }
2167  }
2168 
2169  do {
2170  int tbits, qstep;
2171  minscaler = sce->sf_idx[0];
2172  qstep = its ? 1 : 32;
2173  do {
2174  int prev = -1;
2175  tbits = 0;
2176  fflag = 0;
2177 
2178  if (qstep > 1) {
2179  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2180  start = w*128;
2181  for (g = 0; g < sce->ics.num_swb; g++) {
2182  const float *coefs = sce->coeffs + start;
2183  const float *scaled = s->scoefs + start;
2184  int bits = 0;
2185  int cb;
2186 
2187  if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2188  start += sce->ics.swb_sizes[g];
2189  continue;
2190  }
2191  minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2192  cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2193  for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2194  int b;
2195  bits += quantize_band_cost_bits(s, coefs + w2*128,
2196  scaled + w2*128,
2197  sce->ics.swb_sizes[g],
2198  sce->sf_idx[w*16+g],
2199  cb,
2200  1.0f,
2201  INFINITY,
2202  &b);
2203  }
2204  if (prev != -1) {
2205  bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2206  }
2207  tbits += bits;
2208  start += sce->ics.swb_sizes[g];
2209  prev = sce->sf_idx[w*16+g];
2210  }
2211  }
2212  }
2213  else {
2214  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2215  start = w*128;
2216  for (g = 0; g < sce->ics.num_swb; g++) {
2217  const float *coefs = sce->coeffs + start;
2218  const float *scaled = s->scoefs + start;
2219  int bits = 0;
2220  int cb;
2221  float dist = 0.0f;
2222 
2223  if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2224  start += sce->ics.swb_sizes[g];
2225  continue;
2226  }
2227  minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2228  cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2229  for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2230  int b;
2231  dist += quantize_band_cost(s, coefs + w2*128,
2232  scaled + w2*128,
2233  sce->ics.swb_sizes[g],
2234  sce->sf_idx[w*16+g],
2235  cb,
2236  1.0f,
2237  INFINITY,
2238  &b);
2239  bits += b;
2240  }
2241  dists[w*16+g] = dist - bits;
2242  if (prev != -1) {
2243  bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2244  }
2245  tbits += bits;
2246  start += sce->ics.swb_sizes[g];
2247  prev = sce->sf_idx[w*16+g];
2248  }
2249  }
2250  }
2251  if (tbits > destbits) {
2252  for (i = 0; i < 128; i++)
2253  if (sce->sf_idx[i] < 218 - qstep)
2254  sce->sf_idx[i] += qstep;
2255  } else {
2256  for (i = 0; i < 128; i++)
2257  if (sce->sf_idx[i] > 60 - qstep)
2258  sce->sf_idx[i] -= qstep;
2259  }
2260  qstep >>= 1;
2261  if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
2262  qstep = 1;
2263  } while (qstep);
2264 
2265  fflag = 0;
2266  minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
2267  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2268  for (g = 0; g < sce->ics.num_swb; g++) {
2269  int prevsc = sce->sf_idx[w*16+g];
2270  if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
2271  if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
2272  sce->sf_idx[w*16+g]--;
2273  else
2274  sce->sf_idx[w*16+g]-=2;
2275  }
2276  sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
2277  sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
2278  if (sce->sf_idx[w*16+g] != prevsc)
2279  fflag = 1;
2280  sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2281  }
2282  }
2283  its++;
2284  } while (fflag && its < 10);
2285 }
2286 
2287 static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe,
2288  const float lambda)
2289 {
2290  int start = 0, i, w, w2, g;
2291  float M[128], S[128];
2292  float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
2293  SingleChannelElement *sce0 = &cpe->ch[0];
2294  SingleChannelElement *sce1 = &cpe->ch[1];
2295  if (!cpe->common_window)
2296  return;
2297  for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
2298  for (g = 0; g < sce0->ics.num_swb; g++) {
2299  if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
2300  float dist1 = 0.0f, dist2 = 0.0f;
2301  for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2302  FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
2303  FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
2304  float minthr = FFMIN(band0->threshold, band1->threshold);
2305  float maxthr = FFMAX(band0->threshold, band1->threshold);
2306  for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
2307  M[i ] = (sce0->coeffs[start+w2*128+i ]
2308  + sce1->coeffs[start+w2*128+i ]) * 0.5;
2309  M[i+1] = (sce0->coeffs[start+w2*128+i+1]
2310  + sce1->coeffs[start+w2*128+i+1]) * 0.5;
2311  M[i+2] = (sce0->coeffs[start+w2*128+i+2]
2312  + sce1->coeffs[start+w2*128+i+2]) * 0.5;
2313  M[i+3] = (sce0->coeffs[start+w2*128+i+3]
2314  + sce1->coeffs[start+w2*128+i+3]) * 0.5;
2315 
2316  S[i ] = M[i ]
2317  - sce1->coeffs[start+w2*128+i ];
2318  S[i+1] = M[i+1]
2319  - sce1->coeffs[start+w2*128+i+1];
2320  S[i+2] = M[i+2]
2321  - sce1->coeffs[start+w2*128+i+2];
2322  S[i+3] = M[i+3]
2323  - sce1->coeffs[start+w2*128+i+3];
2324  }
2325  abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2326  abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2327  abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
2328  abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
2329  dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
2330  L34,
2331  sce0->ics.swb_sizes[g],
2332  sce0->sf_idx[(w+w2)*16+g],
2333  sce0->band_type[(w+w2)*16+g],
2334  lambda / band0->threshold, INFINITY, NULL);
2335  dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
2336  R34,
2337  sce1->ics.swb_sizes[g],
2338  sce1->sf_idx[(w+w2)*16+g],
2339  sce1->band_type[(w+w2)*16+g],
2340  lambda / band1->threshold, INFINITY, NULL);
2341  dist2 += quantize_band_cost(s, M,
2342  M34,
2343  sce0->ics.swb_sizes[g],
2344  sce0->sf_idx[(w+w2)*16+g],
2345  sce0->band_type[(w+w2)*16+g],
2346  lambda / maxthr, INFINITY, NULL);
2347  dist2 += quantize_band_cost(s, S,
2348  S34,
2349  sce1->ics.swb_sizes[g],
2350  sce1->sf_idx[(w+w2)*16+g],
2351  sce1->band_type[(w+w2)*16+g],
2352  lambda / minthr, INFINITY, NULL);
2353  }
2354  cpe->ms_mask[w*16+g] = dist2 < dist1;
2355  }
2356  start += sce0->ics.swb_sizes[g];
2357  }
2358  }
2359 }
2360 #endif /*HAVE_MIPSFPU */
2361 
2362 static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
2363  int win, int group_len, const float lambda)
2364 {
2365  BandCodingPath path[120][12];
2366  int w, swb, cb, start, size;
2367  int i, j;
2368  const int max_sfb = sce->ics.max_sfb;
2369  const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
2370  const int run_esc = (1 << run_bits) - 1;
2371  int idx, ppos, count;
2372  int stackrun[120], stackcb[120], stack_len;
2373  float next_minbits = INFINITY;
2374  int next_mincb = 0;
2375 
2376  abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2377  start = win*128;
2378  for (cb = 0; cb < 12; cb++) {
2379  path[0][cb].cost = run_bits+4;
2380  path[0][cb].prev_idx = -1;
2381  path[0][cb].run = 0;
2382  }
2383  for (swb = 0; swb < max_sfb; swb++) {
2384  size = sce->ics.swb_sizes[swb];
2385  if (sce->zeroes[win*16 + swb]) {
2386  float cost_stay_here = path[swb][0].cost;
2387  float cost_get_here = next_minbits + run_bits + 4;
2388  if ( run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
2389  != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
2390  cost_stay_here += run_bits;
2391  if (cost_get_here < cost_stay_here) {
2392  path[swb+1][0].prev_idx = next_mincb;
2393  path[swb+1][0].cost = cost_get_here;
2394  path[swb+1][0].run = 1;
2395  } else {
2396  path[swb+1][0].prev_idx = 0;
2397  path[swb+1][0].cost = cost_stay_here;
2398  path[swb+1][0].run = path[swb][0].run + 1;
2399  }
2400  next_minbits = path[swb+1][0].cost;
2401  next_mincb = 0;
2402  for (cb = 1; cb < 12; cb++) {
2403  path[swb+1][cb].cost = 61450;
2404  path[swb+1][cb].prev_idx = -1;
2405  path[swb+1][cb].run = 0;
2406  }
2407  } else {
2408  float minbits = next_minbits;
2409  int mincb = next_mincb;
2410  int startcb = sce->band_type[win*16+swb];
2411  next_minbits = INFINITY;
2412  next_mincb = 0;
2413  for (cb = 0; cb < startcb; cb++) {
2414  path[swb+1][cb].cost = 61450;
2415  path[swb+1][cb].prev_idx = -1;
2416  path[swb+1][cb].run = 0;
2417  }
2418  for (cb = startcb; cb < 12; cb++) {
2419  float cost_stay_here, cost_get_here;
2420  float bits = 0.0f;
2421  for (w = 0; w < group_len; w++) {
2422  bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
2423  s->scoefs + start + w*128, size,
2424  sce->sf_idx[(win+w)*16+swb], cb,
2425  0, INFINITY, NULL);
2426  }
2427  cost_stay_here = path[swb][cb].cost + bits;
2428  cost_get_here = minbits + bits + run_bits + 4;
2429  if ( run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
2430  != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
2431  cost_stay_here += run_bits;
2432  if (cost_get_here < cost_stay_here) {
2433  path[swb+1][cb].prev_idx = mincb;
2434  path[swb+1][cb].cost = cost_get_here;
2435  path[swb+1][cb].run = 1;
2436  } else {
2437  path[swb+1][cb].prev_idx = cb;
2438  path[swb+1][cb].cost = cost_stay_here;
2439  path[swb+1][cb].run = path[swb][cb].run + 1;
2440  }
2441  if (path[swb+1][cb].cost < next_minbits) {
2442  next_minbits = path[swb+1][cb].cost;
2443  next_mincb = cb;
2444  }
2445  }
2446  }
2447  start += sce->ics.swb_sizes[swb];
2448  }
2449 
2450  stack_len = 0;
2451  idx = 0;
2452  for (cb = 1; cb < 12; cb++)
2453  if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
2454  idx = cb;
2455  ppos = max_sfb;
2456  while (ppos > 0) {
2457  av_assert1(idx >= 0);
2458  cb = idx;
2459  stackrun[stack_len] = path[ppos][cb].run;
2460  stackcb [stack_len] = cb;
2461  idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
2462  ppos -= path[ppos][cb].run;
2463  stack_len++;
2464  }
2465 
2466  start = 0;
2467  for (i = stack_len - 1; i >= 0; i--) {
2468  put_bits(&s->pb, 4, stackcb[i]);
2469  count = stackrun[i];
2470  memset(sce->zeroes + win*16 + start, !stackcb[i], count);
2471  for (j = 0; j < count; j++) {
2472  sce->band_type[win*16 + start] = stackcb[i];
2473  start++;
2474  }
2475  while (count >= run_esc) {
2476  put_bits(&s->pb, run_bits, run_esc);
2477  count -= run_esc;
2478  }
2479  put_bits(&s->pb, run_bits, count);
2480  }
2481 }
2482 #endif /* HAVE_INLINE_ASM */
2483 
2485 #if HAVE_INLINE_ASM
2486  AACCoefficientsEncoder *e = c->coder;
2487  int option = c->options.aac_coder;
2488 
2489  if (option == 2) {
2490  e->quantize_and_encode_band = quantize_and_encode_band_mips;
2491  e->encode_window_bands_info = codebook_trellis_rate_mips;
2492 #if HAVE_MIPSFPU
2493  e->search_for_quantizers = search_for_quantizers_twoloop_mips;
2494  e->search_for_ms = search_for_ms_mips;
2495 #endif /* HAVE_MIPSFPU */
2496  }
2497 #endif /* HAVE_INLINE_ASM */
2498 }