32 "xor $f0, $f0, $f0 \r\n"
33 "ldc1 $f2, 0(%[src]) \r\n"
34 "ldc1 $f4, 8(%[src]) \r\n"
35 "ldc1 $f6, 16(%[src]) \r\n"
36 "ldc1 $f8, 24(%[src]) \r\n"
37 "lwc1 $f10, 0(%[dst0]) \r\n"
38 "lwc1 $f12, 0(%[dst1]) \r\n"
39 "lwc1 $f14, 0(%[dst2]) \r\n"
40 "lwc1 $f16, 0(%[dst3]) \r\n"
41 "punpcklbh $f10, $f10, $f0 \r\n"
42 "punpcklbh $f12, $f12, $f0 \r\n"
43 "punpcklbh $f14, $f14, $f0 \r\n"
44 "punpcklbh $f16, $f16, $f0 \r\n"
45 "paddh $f2, $f2, $f10 \r\n"
46 "paddh $f4, $f4, $f12 \r\n"
47 "paddh $f6, $f6, $f14 \r\n"
48 "paddh $f8, $f8, $f16 \r\n"
49 "packushb $f2, $f2, $f0 \r\n"
50 "packushb $f4, $f4, $f0 \r\n"
51 "packushb $f6, $f6, $f0 \r\n"
52 "packushb $f8, $f8, $f0 \r\n"
53 "swc1 $f2, 0(%[dst0]) \r\n"
54 "swc1 $f4, 0(%[dst1]) \r\n"
55 "swc1 $f6, 0(%[dst2]) \r\n"
56 "swc1 $f8, 0(%[dst3]) \r\n"
57 ::[dst0]
"r"(dst),[dst1]
"r"(dst+stride),[dst2]
"r"(dst+2*
stride),
58 [dst3]
"r"(dst+3*stride),[
src]
"r"(
src)
59 :
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10",
"$f12",
"$f14",
"$f16"
69 "ldc1 $f0, 0(%[block]) \r\n"
71 "ldc1 $f2, 8(%[block]) \r\n"
73 "ldc1 $f4, 16(%[block]) \r\n"
75 "psrah $f8, $f2, $f16 \r\n"
76 "ldc1 $f6, 24(%[block]) \r\n"
77 "psrah $f10, $f6, $f16 \r\n"
78 "psubh $f8, $f8, $f6 \r\n"
79 "paddh $f10, $f10, $f2 \r\n"
80 "paddh $f20, $f4, $f0 \r\n"
81 "psubh $f0, $f0, $f4 \r\n"
82 "paddh $f22, $f10, $f20 \r\n"
83 "psubh $f4, $f20, $f10 \r\n"
84 "paddh $f20, $f8, $f0 \r\n"
85 "psubh $f0, $f0, $f8 \r\n"
86 "punpckhhw $f2, $f22, $f20 \r\n"
87 "punpcklhw $f10, $f22, $f20 \r\n"
88 "punpckhhw $f8, $f0, $f4 \r\n"
89 "punpcklhw $f0, $f0, $f4 \r\n"
90 "punpckhwd $f4, $f10, $f0 \r\n"
91 "punpcklwd $f10, $f10, $f0 \r\n"
92 "punpcklwd $f20, $f2, $f8 \r\n"
93 "punpckhwd $f0, $f2, $f8 \r\n"
94 "paddh $f10, $f10, %[ff_pw_32] \r\n"
95 "psrah $f8, $f4, $f16 \r\n"
96 "psrah $f6, $f0, $f16 \r\n"
97 "psubh $f8, $f8, $f0 \r\n"
98 "paddh $f6, $f6, $f4 \r\n"
99 "paddh $f2, $f20, $f10 \r\n"
100 "psubh $f10, $f10, $f20 \r\n"
101 "paddh $f20, $f6, $f2 \r\n"
102 "psubh $f2, $f2, $f6 \r\n"
103 "paddh $f22, $f8, $f10 \r\n"
104 "xor $f14, $f14, $f14 \r\n"
105 "psubh $f10, $f10, $f8 \r\n"
106 "sdc1 $f14, 0(%[block]) \r\n"
107 "sdc1 $f14, 8(%[block]) \r\n"
108 "sdc1 $f14, 16(%[block]) \r\n"
109 "sdc1 $f14, 24(%[block]) \r\n"
110 "lwc1 $f4, 0(%[dst]) \r\n"
111 "psrah $f6, $f20, $f18 \r\n"
112 "gslwxc1 $f0, 0(%[dst], %[stride]) \r\n"
113 "psrah $f8, $f22, $f18 \r\n"
114 "punpcklbh $f4, $f4, $f14 \r\n"
115 "punpcklbh $f0, $f0, $f14 \r\n"
116 "paddh $f4, $f4, $f6 \r\n"
117 "paddh $f0, $f0, $f8 \r\n"
118 "packushb $f4, $f4, $f14 \r\n"
119 "packushb $f0, $f0, $f14 \r\n"
120 "swc1 $f4, 0(%[dst]) \r\n"
121 "gsswxc1 $f0, 0(%[dst], %[stride]) \r\n"
122 "daddu %[dst], %[dst], %[stride] \r\n"
123 "daddu %[dst], %[dst], %[stride] \r\n"
124 "lwc1 $f4, 0(%[dst]) \r\n"
125 "psrah $f10, $f10, $f18 \r\n"
126 "gslwxc1 $f0, 0(%[dst], %[stride]) \r\n"
127 "psrah $f2, $f2, $f18 \r\n"
128 "punpcklbh $f4, $f4, $f14 \r\n"
129 "punpcklbh $f0, $f0, $f14 \r\n"
130 "paddh $f4, $f4, $f10 \r\n"
131 "paddh $f0, $f0, $f2 \r\n"
132 "packushb $f4, $f4, $f14 \r\n"
133 "swc1 $f4, 0(%[dst]) \r\n"
134 "packushb $f0, $f0, $f14 \r\n"
135 "gsswxc1 $f0, 0(%[dst], %[stride]) \r\n"
136 ::[dst]
"r"(dst),[block]
"r"(block),[
stride]
"r"((uint64_t)stride),
138 :
"$8",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10",
"$f12",
"$f14",
"$f16",
142 memset(block, 0, 32);
148 "lhu $10, 0x0(%[block]) \r\n"
149 "daddiu $29, $29, -0x20 \r\n"
150 "daddiu $10, $10, 0x20 \r\n"
151 "ldc1 $f2, 0x10(%[block]) \r\n"
152 "sh $10, 0x0(%[block]) \r\n"
153 "ldc1 $f4, 0x20(%[block]) \r\n"
155 "ldc1 $f6, 0x30(%[block]) \r\n"
156 "dmtc1 $10, $f16 \r\n"
157 "ldc1 $f10, 0x50(%[block]) \r\n"
158 "ldc1 $f12, 0x60(%[block]) \r\n"
159 "ldc1 $f14, 0x70(%[block]) \r\n"
160 "mov.d $f0, $f2 \r\n"
161 "psrah $f2, $f2, $f16 \r\n"
162 "psrah $f8, $f10, $f16 \r\n"
163 "paddh $f2, $f2, $f0 \r\n"
164 "paddh $f8, $f8, $f10 \r\n"
165 "paddh $f2, $f2, $f10 \r\n"
166 "paddh $f8, $f8, $f14 \r\n"
167 "paddh $f2, $f2, $f6 \r\n"
168 "psubh $f8, $f8, $f0 \r\n"
169 "psubh $f0, $f0, $f6 \r\n"
170 "psubh $f10, $f10, $f6 \r\n"
171 "psrah $f6, $f6, $f16 \r\n"
172 "paddh $f0, $f0, $f14 \r\n"
173 "psubh $f10, $f10, $f14 \r\n"
174 "psrah $f14, $f14, $f16 \r\n"
175 "psubh $f0, $f0, $f6 \r\n"
177 "psubh $f10, $f10, $f14 \r\n"
178 "dmtc1 $10, $f18 \r\n"
179 "mov.d $f14, $f2 \r\n"
180 "psrah $f2, $f2, $f18 \r\n"
181 "psrah $f6, $f8, $f18 \r\n"
182 "paddh $f6, $f6, $f0 \r\n"
183 "psrah $f0, $f0, $f18 \r\n"
184 "paddh $f2, $f2, $f10 \r\n"
185 "psrah $f10, $f10, $f18 \r\n"
186 "psubh $f0, $f0, $f8 \r\n"
187 "psubh $f14, $f14, $f10 \r\n"
188 "mov.d $f10, $f12 \r\n"
189 "psrah $f12, $f12, $f16 \r\n"
190 "psrah $f8, $f4, $f16 \r\n"
191 "paddh $f12, $f12, $f4 \r\n"
192 "psubh $f8, $f8, $f10 \r\n"
193 "ldc1 $f4, 0x0(%[block]) \r\n"
194 "ldc1 $f10, 0x40(%[block]) \r\n"
195 "paddh $f10, $f10, $f4 \r\n"
196 "paddh $f4, $f4, $f4 \r\n"
197 "paddh $f12, $f12, $f10 \r\n"
198 "psubh $f4, $f4, $f10 \r\n"
199 "paddh $f10, $f10, $f10 \r\n"
200 "paddh $f8, $f8, $f4 \r\n"
201 "psubh $f10, $f10, $f12 \r\n"
202 "paddh $f4, $f4, $f4 \r\n"
203 "paddh $f14, $f14, $f12 \r\n"
204 "psubh $f4, $f4, $f8 \r\n"
205 "paddh $f12, $f12, $f12 \r\n"
206 "paddh $f0, $f0, $f8 \r\n"
207 "psubh $f12, $f12, $f14 \r\n"
208 "paddh $f8, $f8, $f8 \r\n"
209 "paddh $f6, $f6, $f4 \r\n"
210 "psubh $f8, $f8, $f0 \r\n"
211 "paddh $f4, $f4, $f4 \r\n"
212 "paddh $f2, $f2, $f10 \r\n"
213 "psubh $f4, $f4, $f6 \r\n"
214 "paddh $f10, $f10, $f10 \r\n"
215 "sdc1 $f12, 0x0(%[block]) \r\n"
216 "psubh $f10, $f10, $f2 \r\n"
217 "punpckhhw $f12, $f14, $f0 \r\n"
218 "punpcklhw $f14, $f14, $f0 \r\n"
219 "punpckhhw $f0, $f6, $f2 \r\n"
220 "punpcklhw $f6, $f6, $f2 \r\n"
221 "punpckhwd $f2, $f14, $f6 \r\n"
222 "punpcklwd $f14, $f14, $f6 \r\n"
223 "punpckhwd $f6, $f12, $f0 \r\n"
224 "punpcklwd $f12, $f12, $f0 \r\n"
225 "ldc1 $f0, 0x0(%[block]) \r\n"
226 "sdc1 $f14, 0x0($29) \r\n"
227 "sdc1 $f2, 0x10($29) \r\n"
228 "dmfc1 $8, $f12 \r\n"
229 "dmfc1 $11, $f6 \r\n"
230 "punpckhhw $f6, $f10, $f4 \r\n"
231 "punpcklhw $f10, $f10, $f4 \r\n"
232 "punpckhhw $f4, $f8, $f0 \r\n"
233 "punpcklhw $f8, $f8, $f0 \r\n"
234 "punpckhwd $f0, $f10, $f8 \r\n"
235 "punpcklwd $f10, $f10, $f8 \r\n"
236 "punpckhwd $f8, $f6, $f4 \r\n"
237 "punpcklwd $f6, $f6, $f4 \r\n"
238 "sdc1 $f10, 0x8($29) \r\n"
239 "sdc1 $f0, 0x18($29) \r\n"
241 "dmfc1 $12, $f8 \r\n"
242 "ldc1 $f2, 0x18(%[block]) \r\n"
243 "ldc1 $f12, 0x28(%[block]) \r\n"
244 "ldc1 $f4, 0x38(%[block]) \r\n"
245 "ldc1 $f0, 0x58(%[block]) \r\n"
246 "ldc1 $f6, 0x68(%[block]) \r\n"
247 "ldc1 $f8, 0x78(%[block]) \r\n"
248 "mov.d $f14, $f2 \r\n"
249 "psrah $f10, $f0, $f16 \r\n"
250 "psrah $f2, $f2, $f16 \r\n"
251 "paddh $f10, $f10, $f0 \r\n"
252 "paddh $f2, $f2, $f14 \r\n"
253 "paddh $f10, $f10, $f8 \r\n"
254 "paddh $f2, $f2, $f0 \r\n"
255 "psubh $f10, $f10, $f14 \r\n"
256 "paddh $f2, $f2, $f4 \r\n"
257 "psubh $f14, $f14, $f4 \r\n"
258 "psubh $f0, $f0, $f4 \r\n"
259 "psrah $f4, $f4, $f16 \r\n"
260 "paddh $f14, $f14, $f8 \r\n"
261 "psubh $f0, $f0, $f8 \r\n"
262 "psrah $f8, $f8, $f16 \r\n"
263 "psubh $f14, $f14, $f4 \r\n"
264 "psubh $f0, $f0, $f8 \r\n"
265 "mov.d $f8, $f2 \r\n"
266 "psrah $f4, $f10, $f18 \r\n"
267 "psrah $f2, $f2, $f18 \r\n"
268 "paddh $f4, $f4, $f14 \r\n"
269 "psrah $f14, $f14, $f18 \r\n"
270 "paddh $f2, $f2, $f0 \r\n"
271 "psrah $f0, $f0, $f18 \r\n"
272 "psubh $f14, $f14, $f10 \r\n"
273 "psubh $f8, $f8, $f0 \r\n"
274 "mov.d $f0, $f6 \r\n"
275 "psrah $f6, $f6, $f16 \r\n"
276 "psrah $f10, $f12, $f16 \r\n"
277 "paddh $f6, $f6, $f12 \r\n"
278 "psubh $f10, $f10, $f0 \r\n"
279 "ldc1 $f12, 0x8(%[block]) \r\n"
280 "ldc1 $f0, 0x48(%[block]) \r\n"
281 "paddh $f0, $f0, $f12 \r\n"
282 "paddh $f12, $f12, $f12 \r\n"
283 "paddh $f6, $f6, $f0 \r\n"
284 "psubh $f12, $f12, $f0 \r\n"
285 "paddh $f0, $f0, $f0 \r\n"
286 "paddh $f10, $f10, $f12 \r\n"
287 "psubh $f0, $f0, $f6 \r\n"
288 "paddh $f12, $f12, $f12 \r\n"
289 "paddh $f8, $f8, $f6 \r\n"
290 "psubh $f12, $f12, $f10 \r\n"
291 "paddh $f6, $f6, $f6 \r\n"
292 "paddh $f14, $f14, $f10 \r\n"
293 "psubh $f6, $f6, $f8 \r\n"
294 "paddh $f10, $f10, $f10 \r\n"
295 "paddh $f4, $f4, $f12 \r\n"
296 "psubh $f10, $f10, $f14 \r\n"
297 "paddh $f12, $f12, $f12 \r\n"
298 "paddh $f2, $f2, $f0 \r\n"
299 "psubh $f12, $f12, $f4 \r\n"
300 "paddh $f0, $f0, $f0 \r\n"
301 "sdc1 $f6, 0x8(%[block]) \r\n"
302 "psubh $f0, $f0, $f2 \r\n"
303 "punpckhhw $f6, $f8, $f14 \r\n"
304 "punpcklhw $f8, $f8, $f14 \r\n"
305 "punpckhhw $f14, $f4, $f2 \r\n"
306 "punpcklhw $f4, $f4, $f2 \r\n"
307 "punpckhwd $f2, $f8, $f4 \r\n"
308 "punpcklwd $f8, $f8, $f4 \r\n"
309 "punpckhwd $f4, $f6, $f14 \r\n"
310 "punpcklwd $f6, $f6, $f14 \r\n"
311 "ldc1 $f14, 0x8(%[block]) \r\n"
312 "dmfc1 $13, $f8 \r\n"
313 "dmfc1 $15, $f2 \r\n"
314 "mov.d $f24, $f6 \r\n"
315 "mov.d $f28, $f4 \r\n"
316 "punpckhhw $f4, $f0, $f12 \r\n"
317 "punpcklhw $f0, $f0, $f12 \r\n"
318 "punpckhhw $f12, $f10, $f14 \r\n"
319 "punpcklhw $f10, $f10, $f14 \r\n"
320 "punpckhwd $f14, $f0, $f10 \r\n"
321 "punpcklwd $f0, $f0, $f10 \r\n"
322 "punpckhwd $f10, $f4, $f12 \r\n"
323 "punpcklwd $f4, $f4, $f12 \r\n"
324 "dmfc1 $14, $f0 \r\n"
325 "mov.d $f22, $f14 \r\n"
326 "mov.d $f26, $f4 \r\n"
327 "mov.d $f30, $f10 \r\n"
328 "daddiu $10, %[dst], 0x4 \r\n"
329 "dmtc1 $15, $f14 \r\n"
330 "dmtc1 $11, $f12 \r\n"
331 "ldc1 $f2, 0x10($29) \r\n"
333 "mov.d $f8, $f2 \r\n"
334 "psrah $f2, $f2, $f16 \r\n"
335 "psrah $f0, $f14, $f16 \r\n"
336 "paddh $f2, $f2, $f8 \r\n"
337 "paddh $f0, $f0, $f14 \r\n"
338 "paddh $f2, $f2, $f14 \r\n"
339 "paddh $f0, $f0, $f28 \r\n"
340 "paddh $f2, $f2, $f12 \r\n"
341 "psubh $f0, $f0, $f8 \r\n"
342 "psubh $f8, $f8, $f12 \r\n"
343 "psubh $f14, $f14, $f12 \r\n"
344 "psrah $f12, $f12, $f16 \r\n"
345 "paddh $f8, $f8, $f28 \r\n"
346 "psubh $f14, $f14, $f28 \r\n"
347 "psrah $f10, $f28, $f16 \r\n"
348 "psubh $f8, $f8, $f12 \r\n"
349 "psubh $f14, $f14, $f10 \r\n"
350 "mov.d $f10, $f2 \r\n"
351 "psrah $f2, $f2, $f18 \r\n"
352 "psrah $f12, $f0, $f18 \r\n"
353 "paddh $f2, $f2, $f14 \r\n"
354 "paddh $f12, $f12, $f8 \r\n"
355 "psrah $f8, $f8, $f18 \r\n"
356 "psrah $f14, $f14, $f18 \r\n"
357 "psubh $f8, $f8, $f0 \r\n"
358 "psubh $f10, $f10, $f14 \r\n"
359 "mov.d $f14, $f24 \r\n"
360 "psrah $f4, $f24, $f16 \r\n"
361 "psrah $f0, $f6, $f16 \r\n"
362 "paddh $f4, $f4, $f6 \r\n"
363 "psubh $f0, $f0, $f14 \r\n"
364 "ldc1 $f6, 0x0($29) \r\n"
365 "dmtc1 $13, $f14 \r\n"
366 "paddh $f14, $f14, $f6 \r\n"
367 "paddh $f6, $f6, $f6 \r\n"
368 "paddh $f4, $f4, $f14 \r\n"
369 "psubh $f6, $f6, $f14 \r\n"
370 "paddh $f14, $f14, $f14 \r\n"
371 "paddh $f0, $f0, $f6 \r\n"
372 "psubh $f14, $f14, $f4 \r\n"
373 "paddh $f6, $f6, $f6 \r\n"
374 "paddh $f10, $f10, $f4 \r\n"
375 "psubh $f6, $f6, $f0 \r\n"
376 "paddh $f4, $f4, $f4 \r\n"
377 "paddh $f8, $f8, $f0 \r\n"
378 "psubh $f4, $f4, $f10 \r\n"
379 "paddh $f0, $f0, $f0 \r\n"
380 "paddh $f12, $f12, $f6 \r\n"
381 "psubh $f0, $f0, $f8 \r\n"
382 "paddh $f6, $f6, $f6 \r\n"
383 "paddh $f2, $f2, $f14 \r\n"
384 "psubh $f6, $f6, $f12 \r\n"
385 "paddh $f14, $f14, $f14 \r\n"
386 "sdc1 $f6, 0x0($29) \r\n"
387 "psubh $f14, $f14, $f2 \r\n"
388 "sdc1 $f0, 0x10($29) \r\n"
390 "xor $f4, $f4, $f4 \r\n"
391 "sdc1 $f4, 0x0(%[block]) \r\n"
392 "sdc1 $f4, 0x8(%[block]) \r\n"
393 "sdc1 $f4, 0x10(%[block]) \r\n"
394 "sdc1 $f4, 0x18(%[block]) \r\n"
395 "sdc1 $f4, 0x20(%[block]) \r\n"
396 "sdc1 $f4, 0x28(%[block]) \r\n"
397 "sdc1 $f4, 0x30(%[block]) \r\n"
398 "sdc1 $f4, 0x38(%[block]) \r\n"
399 "sdc1 $f4, 0x40(%[block]) \r\n"
400 "sdc1 $f4, 0x48(%[block]) \r\n"
401 "sdc1 $f4, 0x50(%[block]) \r\n"
402 "sdc1 $f4, 0x58(%[block]) \r\n"
403 "sdc1 $f4, 0x60(%[block]) \r\n"
404 "sdc1 $f4, 0x68(%[block]) \r\n"
405 "sdc1 $f4, 0x70(%[block]) \r\n"
406 "sdc1 $f4, 0x78(%[block]) \r\n"
408 "lwc1 $f6, 0x0(%[dst]) \r\n"
409 "dmtc1 $11, $f20 \r\n"
410 "gslwxc1 $f0, 0x0(%[dst], %[stride]) \r\n"
411 "psrah $f10, $f10, $f20 \r\n"
412 "psrah $f8, $f8, $f20 \r\n"
413 "punpcklbh $f6, $f6, $f4 \r\n"
414 "punpcklbh $f0, $f0, $f4 \r\n"
415 "paddh $f6, $f6, $f10 \r\n"
416 "paddh $f0, $f0, $f8 \r\n"
417 "packushb $f6, $f6, $f4 \r\n"
418 "packushb $f0, $f0, $f4 \r\n"
419 "swc1 $f6, 0x0(%[dst]) \r\n"
420 "gsswxc1 $f0, 0x0(%[dst], %[stride]) \r\n"
421 "daddu %[dst], %[dst], %[stride] \r\n"
422 "daddu %[dst], %[dst], %[stride] \r\n"
423 "lwc1 $f6, 0x0(%[dst]) \r\n"
424 "gslwxc1 $f0, 0x0(%[dst], %[stride]) \r\n"
425 "psrah $f12, $f12, $f20 \r\n"
426 "psrah $f2, $f2, $f20 \r\n"
427 "punpcklbh $f6, $f6, $f4 \r\n"
428 "punpcklbh $f0, $f0, $f4 \r\n"
429 "paddh $f6, $f6, $f12 \r\n"
430 "paddh $f0, $f0, $f2 \r\n"
431 "packushb $f6, $f6, $f4 \r\n"
432 "packushb $f0, $f0, $f4 \r\n"
433 "swc1 $f6, 0x0(%[dst]) \r\n"
434 "gsswxc1 $f0, 0x0(%[dst], %[stride]) \r\n"
435 "ldc1 $f10, 0x0($29) \r\n"
436 "ldc1 $f8, 0x10($29) \r\n"
437 "dmtc1 $8, $f12 \r\n"
438 "daddu %[dst], %[dst], %[stride] \r\n"
439 "daddu %[dst], %[dst], %[stride] \r\n"
440 "lwc1 $f6, 0x0(%[dst]) \r\n"
441 "gslwxc1 $f0, 0x0(%[dst], %[stride]) \r\n"
442 "psrah $f14, $f14, $f20 \r\n"
443 "psrah $f10, $f10, $f20 \r\n"
444 "punpcklbh $f6, $f6, $f4 \r\n"
445 "punpcklbh $f0, $f0, $f4 \r\n"
446 "paddh $f6, $f6, $f14 \r\n"
447 "paddh $f0, $f0, $f10 \r\n"
448 "packushb $f6, $f6, $f4 \r\n"
449 "packushb $f0, $f0, $f4 \r\n"
450 "swc1 $f6, 0x0(%[dst]) \r\n"
451 "gsswxc1 $f0, 0x0(%[dst], %[stride]) \r\n"
452 "daddu %[dst], %[dst], %[stride] \r\n"
453 "daddu %[dst], %[dst], %[stride] \r\n"
454 "lwc1 $f6, 0x0(%[dst]) \r\n"
455 "gslwxc1 $f0, 0x0(%[dst], %[stride]) \r\n"
456 "psrah $f8, $f8, $f20 \r\n"
457 "psrah $f12, $f12, $f20 \r\n"
458 "punpcklbh $f6, $f6, $f4 \r\n"
459 "punpcklbh $f0, $f0, $f4 \r\n"
460 "paddh $f6, $f6, $f8 \r\n"
461 "paddh $f0, $f0, $f12 \r\n"
462 "packushb $f6, $f6, $f4 \r\n"
463 "packushb $f0, $f0, $f4 \r\n"
464 "swc1 $f6, 0x0(%[dst]) \r\n"
465 "gsswxc1 $f0, 0x0(%[dst], %[stride]) \r\n"
466 "dmtc1 $12, $f2 \r\n"
467 "dmtc1 $9, $f12 \r\n"
468 "ldc1 $f8, 0x18($29) \r\n"
469 "mov.d $f10, $f8 \r\n"
470 "psrah $f8, $f8, $f16 \r\n"
471 "psrah $f14, $f22, $f16 \r\n"
472 "paddh $f14, $f14, $f22 \r\n"
473 "paddh $f8, $f8, $f10 \r\n"
474 "paddh $f14, $f14, $f30 \r\n"
475 "paddh $f8, $f8, $f22 \r\n"
476 "psubh $f14, $f14, $f10 \r\n"
477 "paddh $f8, $f8, $f2 \r\n"
478 "psubh $f10, $f10, $f2 \r\n"
479 "psubh $f6, $f22, $f2 \r\n"
480 "psrah $f2, $f2, $f16 \r\n"
481 "paddh $f10, $f10, $f30 \r\n"
482 "psubh $f6, $f6, $f30 \r\n"
483 "psrah $f4, $f30, $f16 \r\n"
484 "psubh $f10, $f10, $f2 \r\n"
485 "psubh $f6, $f6, $f4 \r\n"
486 "mov.d $f4, $f8 \r\n"
487 "psrah $f8, $f8, $f18 \r\n"
488 "psrah $f2, $f14, $f18 \r\n"
489 "paddh $f8, $f8, $f6 \r\n"
490 "paddh $f2, $f2, $f10 \r\n"
491 "psrah $f10, $f10, $f18 \r\n"
492 "psrah $f6, $f6, $f18 \r\n"
493 "psubh $f10, $f10, $f14 \r\n"
494 "psubh $f4, $f4, $f6 \r\n"
495 "mov.d $f6, $f26 \r\n"
496 "psrah $f0, $f26, $f16 \r\n"
497 "psrah $f14, $f12, $f16 \r\n"
498 "paddh $f0, $f0, $f12 \r\n"
499 "psubh $f14, $f14, $f6 \r\n"
500 "ldc1 $f12, 0x8($29) \r\n"
501 "dmtc1 $14, $f6 \r\n"
502 "paddh $f6, $f6, $f12 \r\n"
503 "paddh $f12, $f12, $f12 \r\n"
504 "paddh $f0, $f0, $f6 \r\n"
505 "psubh $f12, $f12, $f6 \r\n"
506 "paddh $f6, $f6, $f6 \r\n"
507 "paddh $f14, $f14, $f12 \r\n"
508 "psubh $f6, $f6, $f0 \r\n"
509 "paddh $f12, $f12, $f12 \r\n"
510 "paddh $f4, $f4, $f0 \r\n"
511 "psubh $f12, $f12, $f14 \r\n"
512 "paddh $f0, $f0, $f0 \r\n"
513 "paddh $f10, $f10, $f14 \r\n"
514 "psubh $f0, $f0, $f4 \r\n"
515 "paddh $f14, $f14, $f14 \r\n"
516 "paddh $f2, $f2, $f12 \r\n"
517 "psubh $f14, $f14, $f10 \r\n"
518 "paddh $f12, $f12, $f12 \r\n"
519 "paddh $f8, $f8, $f6 \r\n"
520 "psubh $f12, $f12, $f2 \r\n"
521 "paddh $f6, $f6, $f6 \r\n"
522 "sdc1 $f12, 0x8($29) \r\n"
523 "psubh $f6, $f6, $f8 \r\n"
524 "sdc1 $f14, 0x18($29) \r\n"
526 "xor $f0, $f0, $f0 \r\n"
527 "lwc1 $f12, 0x0($10) \r\n"
528 "gslwxc1 $f14, 0x0($10, %[stride]) \r\n"
529 "psrah $f4, $f4, $f20 \r\n"
530 "psrah $f10, $f10, $f20 \r\n"
531 "punpcklbh $f12, $f12, $f0 \r\n"
532 "punpcklbh $f14, $f14, $f0 \r\n"
533 "paddh $f12, $f12, $f4 \r\n"
534 "paddh $f14, $f14, $f10 \r\n"
535 "packushb $f12, $f12, $f0 \r\n"
536 "packushb $f14, $f14, $f0 \r\n"
537 "swc1 $f12, 0x0($10) \r\n"
538 "gsswxc1 $f14, 0x0($10, %[stride]) \r\n"
539 "daddu $10, $10, %[stride] \r\n"
540 "daddu $10, $10, %[stride] \r\n"
541 "lwc1 $f12, 0x0($10) \r\n"
542 "gslwxc1 $f14, 0x0($10, %[stride]) \r\n"
543 "psrah $f2, $f2, $f20 \r\n"
544 "psrah $f8, $f8, $f20 \r\n"
545 "punpcklbh $f12, $f12, $f0 \r\n"
546 "punpcklbh $f14, $f14, $f0 \r\n"
547 "paddh $f12, $f12, $f2 \r\n"
548 "paddh $f14, $f14, $f8 \r\n"
549 "packushb $f12, $f12, $f0 \r\n"
550 "packushb $f14, $f14, $f0 \r\n"
551 "swc1 $f12, 0x0($10) \r\n"
552 "gsswxc1 $f14, 0x0($10, %[stride]) \r\n"
553 "ldc1 $f4, 0x8($29) \r\n"
554 "ldc1 $f10, 0x18($29) \r\n"
555 "daddu $10, $10, %[stride] \r\n"
557 "daddu $10, $10, %[stride] \r\n"
558 "lwc1 $f12, 0x0($10) \r\n"
559 "gslwxc1 $f14, 0x0($10, %[stride]) \r\n"
560 "psrah $f6, $f6, $f20 \r\n"
561 "psrah $f4, $f4, $f20 \r\n"
562 "punpcklbh $f12, $f12, $f0 \r\n"
563 "punpcklbh $f14, $f14, $f0 \r\n"
564 "paddh $f12, $f12, $f6 \r\n"
565 "paddh $f14, $f14, $f4 \r\n"
566 "packushb $f12, $f12, $f0 \r\n"
567 "packushb $f14, $f14, $f0 \r\n"
568 "swc1 $f12, 0x0($10) \r\n"
569 "gsswxc1 $f14, 0x0($10, %[stride]) \r\n"
570 "daddu $10, $10, %[stride] \r\n"
571 "daddu $10, $10, %[stride] \r\n"
572 "lwc1 $f12, 0x0($10) \r\n"
573 "gslwxc1 $f14, 0x0($10, %[stride]) \r\n"
574 "psrah $f10, $f10, $f20 \r\n"
575 "psrah $f2, $f2, $f20 \r\n"
576 "punpcklbh $f12, $f12, $f0 \r\n"
577 "punpcklbh $f14, $f14, $f0 \r\n"
578 "paddh $f12, $f12, $f10 \r\n"
579 "paddh $f14, $f14, $f2 \r\n"
580 "packushb $f12, $f12, $f0 \r\n"
581 "packushb $f14, $f14, $f0 \r\n"
582 "swc1 $f12, 0x0($10) \r\n"
583 "gsswxc1 $f14, 0x0($10, %[stride]) \r\n"
584 "daddiu $29, $29, 0x20 \r\n"
585 ::[dst]
"r"(dst),[block]
"r"(block),[
stride]
"r"((uint64_t)stride)
586 :
"$8",
"$9",
"$10",
"$11",
"$12",
"$13",
"$14",
"$15",
"$29",
"$f0",
"$f2",
"$f4",
587 "$f8",
"$f10",
"$f12",
"$f14",
"$f16",
"$f18",
"$f20",
"$f22",
"$f24",
"$f26",
591 memset(block, 0, 128);
597 "lh $8, 0x0(%[block]) \r\n"
598 "sd $0, 0x0(%[block]) \r\n"
599 "daddiu $8, $8, 0x20 \r\n"
600 "daddu $10, %[stride], %[stride] \r\n"
601 "dsra $8, $8, 0x6 \r\n"
602 "xor $f2, $f2, $f2 \r\n"
604 "pshufh $f0, $f0, $f2 \r\n"
605 "daddu $8, $10, %[stride] \r\n"
606 "psubh $f2, $f2, $f0 \r\n"
607 "packushb $f0, $f0, $f0 \r\n"
608 "packushb $f2, $f2, $f2 \r\n"
609 "lwc1 $f4, 0x0(%[dst]) \r\n"
610 "gslwxc1 $f6, 0x0(%[dst], %[stride]) \r\n"
611 "gslwxc1 $f8, 0x0(%[dst], $10) \r\n"
612 "gslwxc1 $f10, 0x0(%[dst], $8) \r\n"
613 "paddusb $f4, $f4, $f0 \r\n"
614 "paddusb $f6, $f6, $f0 \r\n"
615 "paddusb $f8, $f8, $f0 \r\n"
616 "paddusb $f10, $f10, $f0 \r\n"
617 "psubusb $f4, $f4, $f2 \r\n"
618 "psubusb $f6, $f6, $f2 \r\n"
619 "psubusb $f8, $f8, $f2 \r\n"
620 "psubusb $f10, $f10, $f2 \r\n"
621 "swc1 $f4, 0x0(%[dst]) \r\n"
622 "gsswxc1 $f6, 0x0(%[dst], %[stride]) \r\n"
623 "gsswxc1 $f8, 0x0(%[dst], $10) \r\n"
624 "gsswxc1 $f10, 0x0(%[dst], $8) \r\n"
625 ::[dst]
"r"(dst),[block]
"r"(block),[
stride]
"r"((uint64_t)stride)
626 :
"$8",
"$10",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10"
633 "lh $8, 0x0(%[block]) \r\n"
634 "sd $0, 0x0(%[block]) \r\n"
635 "daddiu $8, $8, 0x20 \r\n"
636 "daddu $10, %[stride], %[stride] \r\n"
637 "dsra $8, $8, 0x6 \r\n"
638 "xor $f2, $f2, $f2 \r\n"
640 "pshufh $f0, $f0, $f2 \r\n"
641 "daddu $8, $10, %[stride] \r\n"
642 "psubh $f2, $f2, $f0 \r\n"
643 "packushb $f0, $f0, $f0 \r\n"
644 "packushb $f2, $f2, $f2 \r\n"
645 "ldc1 $f4, 0x0(%[dst]) \r\n"
646 "gsldxc1 $f6, 0x0(%[dst], %[stride]) \r\n"
647 "gsldxc1 $f8, 0x0(%[dst], $10) \r\n"
648 "gsldxc1 $f10, 0x0(%[dst], $8) \r\n"
649 "paddusb $f4, $f4, $f0 \r\n"
650 "paddusb $f6, $f6, $f0 \r\n"
651 "paddusb $f8, $f8, $f0 \r\n"
652 "paddusb $f10, $f10, $f0 \r\n"
653 "psubusb $f4, $f4, $f2 \r\n"
654 "psubusb $f6, $f6, $f2 \r\n"
655 "psubusb $f8, $f8, $f2 \r\n"
656 "psubusb $f10, $f10, $f2 \r\n"
657 "sdc1 $f4, 0x0(%[dst]) \r\n"
658 "gssdxc1 $f6, 0x0(%[dst], %[stride]) \r\n"
659 "gssdxc1 $f8, 0x0(%[dst], $10) \r\n"
660 "daddu $9, $10, $10 \r\n"
661 "gssdxc1 $f10, 0x0(%[dst], $8) \r\n"
662 "daddu %[dst], %[dst], $9 \r\n"
663 "ldc1 $f4, 0x0(%[dst]) \r\n"
664 "gsldxc1 $f6, 0x0(%[dst], %[stride]) \r\n"
665 "gsldxc1 $f8, 0x0(%[dst], $10) \r\n"
666 "gsldxc1 $f10, 0x0(%[dst], $8) \r\n"
667 "paddusb $f4, $f4, $f0 \r\n"
668 "paddusb $f6, $f6, $f0 \r\n"
669 "paddusb $f8, $f8, $f0 \r\n"
670 "paddusb $f10, $f10, $f0 \r\n"
671 "psubusb $f4, $f4, $f2 \r\n"
672 "psubusb $f6, $f6, $f2 \r\n"
673 "psubusb $f8, $f8, $f2 \r\n"
674 "psubusb $f10, $f10, $f2 \r\n"
675 "sdc1 $f4, 0x0(%[dst]) \r\n"
676 "gssdxc1 $f6, 0x0(%[dst], %[stride]) \r\n"
677 "gssdxc1 $f8, 0x0(%[dst], $10) \r\n"
678 "gssdxc1 $f10, 0x0(%[dst], $8) \r\n"
679 ::[dst]
"r"(dst),[block]
"r"(block),[
stride]
"r"((uint64_t)stride)
680 :
"$8",
"$9",
"$10",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10"
689 int nnz = nnzc[
scan8[i] ];
691 if(nnz==1 && ((int16_t*)block)[i*16])
708 else if(((int16_t*)block)[i*16])
718 for(i=0; i<16; i+=4){
719 int nnz = nnzc[
scan8[i] ];
721 if(nnz==1 && ((int16_t*)block)[i*16])
723 block + i*16, stride);
736 for(i=j*16; i<j*16+4; i++){
739 block + i*16, stride);
740 else if(((int16_t*)block)[i*16])
742 block + i*16, stride);
753 for(i=j*16; i<j*16+4; i++){
756 block + i*16, stride);
757 else if(((int16_t*)block)[i*16])
759 block + i*16, stride);
764 for(i=j*16+4; i<j*16+8; i++){
765 if(nnzc[
scan8[i+4] ])
767 block + i*16, stride);
768 else if(((int16_t*)block)[i*16])
770 block + i*16, stride);
779 ".set noreorder \r\n"
781 "ldc1 $f6, 0x18(%[input]) \r\n"
782 "dmtc1 $10, $f16 \r\n"
783 "ldc1 $f4, 0x10(%[input]) \r\n"
785 "ldc1 $f2, 0x8(%[input]) \r\n"
786 "dmtc1 $10, $f18 \r\n"
787 "ldc1 $f0, 0x0(%[input]) \r\n"
788 "mov.d $f8, $f6 \r\n"
789 "paddh $f6, $f6, $f4 \r\n"
790 "psubh $f4, $f4, $f8 \r\n"
791 "mov.d $f8, $f2 \r\n"
792 "paddh $f2, $f2, $f0 \r\n"
793 "psubh $f0, $f0, $f8 \r\n"
794 "mov.d $f8, $f6 \r\n"
795 "paddh $f6, $f6, $f2 \r\n"
796 "psubh $f2, $f2, $f8 \r\n"
797 "mov.d $f8, $f4 \r\n"
798 "paddh $f4, $f4, $f0 \r\n"
799 "psubh $f0, $f0, $f8 \r\n"
800 "mov.d $f8, $f6 \r\n"
801 "punpcklhw $f6, $f6, $f2 \r\n"
802 "punpckhhw $f8, $f8, $f2 \r\n"
803 "punpckhhw $f2, $f0, $f4 \r\n"
804 "punpcklhw $f0, $f0, $f4 \r\n"
805 "punpckhwd $f4, $f6, $f0 \r\n"
806 "punpcklwd $f6, $f6, $f0 \r\n"
807 "mov.d $f0, $f8 \r\n"
808 "punpcklwd $f8, $f8, $f2 \r\n"
809 "punpckhwd $f0, $f0, $f2 \r\n"
810 "mov.d $f2, $f0 \r\n"
811 "paddh $f0, $f0, $f8 \r\n"
812 "psubh $f8, $f8, $f2 \r\n"
813 "mov.d $f2, $f4 \r\n"
814 "paddh $f4, $f4, $f6 \r\n"
815 "psubh $f6, $f6, $f2 \r\n"
816 "mov.d $f2, $f0 \r\n"
817 "paddh $f0, $f0, $f4 \r\n"
818 "psubh $f4, $f4, $f2 \r\n"
819 "mov.d $f2, $f8 \r\n"
820 "daddiu $10, %[qmul], -0x7fff \r\n"
821 "paddh $f8, $f8, $f6 \r\n"
823 "psubh $f6, $f6, $f2 \r\n"
824 "ori $10, $0, 0x80 \r\n"
825 "dsll $10, $10, 0x10 \r\n"
826 "punpckhhw $f2, $f0, %[ff_pw_1] \r\n"
827 "daddu %[qmul], %[qmul], $10 \r\n"
828 "punpcklhw $f0, $f0, %[ff_pw_1] \r\n"
829 "punpckhhw $f10, $f4, %[ff_pw_1] \r\n"
830 "punpcklhw $f4, $f4, %[ff_pw_1] \r\n"
831 "mtc1 %[qmul], $f14 \r\n"
832 "punpcklwd $f14, $f14, $f14 \r\n"
833 "pmaddhw $f0, $f0, $f14 \r\n"
834 "pmaddhw $f4, $f4, $f14 \r\n"
835 "pmaddhw $f2, $f2, $f14 \r\n"
836 "pmaddhw $f10, $f10, $f14 \r\n"
837 "psraw $f0, $f0, $f16 \r\n"
838 "psraw $f4, $f4, $f16 \r\n"
839 "psraw $f2, $f2, $f16 \r\n"
840 "psraw $f10, $f10, $f16 \r\n"
841 "packsswh $f0, $f0, $f2 \r\n"
842 "packsswh $f4, $f4, $f10 \r\n"
844 "dsrl $f0, $f0, $f18 \r\n"
845 "mfc1 %[input], $f0 \r\n"
846 "sh $9, 0x0(%[output]) \r\n"
847 "sh %[input], 0x80(%[output]) \r\n"
848 "dsrl $9, $9, 0x10 \r\n"
849 "dsrl %[input], %[input], 0x10 \r\n"
850 "sh $9, 0x20(%[output]) \r\n"
851 "sh %[input], 0xa0(%[output]) \r\n"
853 "dsrl $f4, $f4, $f18 \r\n"
854 "mfc1 %[input], $f4 \r\n"
855 "sh $9, 0x40(%[output]) \r\n"
856 "sh %[input], 0xc0(%[output]) \r\n"
857 "dsrl $9, $9, 0x10 \r\n"
858 "dsrl %[input], %[input], 0x10 \r\n"
859 "sh $9, 0x60(%[output]) \r\n"
860 "sh %[input], 0xe0(%[output]) \r\n"
861 "punpckhhw $f2, $f6, %[ff_pw_1] \r\n"
862 "punpcklhw $f6, $f6, %[ff_pw_1] \r\n"
863 "punpckhhw $f10, $f8, %[ff_pw_1] \r\n"
864 "punpcklhw $f8, $f8, %[ff_pw_1] \r\n"
865 "mtc1 %[qmul], $f14 \r\n"
866 "punpcklwd $f14, $f14, $f14 \r\n"
867 "pmaddhw $f6, $f6, $f14 \r\n"
868 "pmaddhw $f8, $f8, $f14 \r\n"
869 "pmaddhw $f2, $f2, $f14 \r\n"
870 "pmaddhw $f10, $f10, $f14 \r\n"
871 "psraw $f6, $f6, $f16 \r\n"
872 "psraw $f8, $f8, $f16 \r\n"
873 "psraw $f2, $f2, $f16 \r\n"
874 "psraw $f10, $f10, $f16 \r\n"
875 "packsswh $f6, $f6, $f2 \r\n"
876 "packsswh $f8, $f8, $f10 \r\n"
878 "dsrl $f6, $f6, $f18 \r\n"
879 "mfc1 %[input], $f6 \r\n"
880 "sh $9, 0x100(%[output]) \r\n"
881 "sh %[input], 0x180(%[output]) \r\n"
882 "dsrl $9, $9, 0x10 \r\n"
883 "dsrl %[input], %[input], 0x10 \r\n"
884 "sh $9, 0x120(%[output]) \r\n"
885 "sh %[input], 0x1a0(%[output]) \r\n"
887 "dsrl $f8, $f8, $f18 \r\n"
888 "mfc1 %[input], $f8 \r\n"
889 "sh $9, 0x140(%[output]) \r\n"
890 "sh %[input], 0x1c0(%[output]) \r\n"
891 "dsrl $9, $9, 0x10 \r\n"
892 "dsrl %[input], %[input], 0x10 \r\n"
893 "sh $9, 0x160(%[output]) \r\n"
895 "sh %[input], 0x1e0(%[output]) \r\n"
897 "ori $10, $0, 0x1f \r\n"
898 "clz $9, %[qmul] \r\n"
899 "ori %[input], $0, 0x7 \r\n"
900 "dsubu $9, $10, $9 \r\n"
901 "ori $10, $0, 0x80 \r\n"
902 "dsll $10, $10, 0x10 \r\n"
903 "daddu %[qmul], %[qmul], $10 \r\n"
904 "dsubu $10, $9, %[input] \r\n"
905 "movn $9, %[input], $10 \r\n"
906 "daddiu %[input], %[input], 0x1 \r\n"
907 "andi $10, $9, 0xff \r\n"
908 "dsrlv %[qmul], %[qmul], $10 \r\n"
909 "dsubu %[input], %[input], $9 \r\n"
910 "mtc1 %[input], $f12 \r\n"
911 "punpckhhw $f2, $f0, %[ff_pw_1] \r\n"
912 "punpcklhw $f0, $f0, %[ff_pw_1] \r\n"
913 "punpckhhw $f10, $f4, %[ff_pw_1] \r\n"
914 "punpcklhw $f4, $f4, %[ff_pw_1] \r\n"
915 "mtc1 %[qmul], $f14 \r\n"
916 "punpcklwd $f14, $f14, $f14 \r\n"
917 "pmaddhw $f0, $f0, $f14 \r\n"
918 "pmaddhw $f4, $f4, $f14 \r\n"
919 "pmaddhw $f2, $f2, $f14 \r\n"
920 "pmaddhw $f10, $f10, $f14 \r\n"
921 "psraw $f0, $f0, $f12 \r\n"
922 "psraw $f4, $f4, $f12 \r\n"
923 "psraw $f2, $f2, $f12 \r\n"
924 "psraw $f10, $f10, $f12 \r\n"
925 "packsswh $f0, $f0, $f2 \r\n"
926 "packsswh $f4, $f4, $f10 \r\n"
928 "dsrl $f0, $f0, $f18 \r\n"
929 "sh $9, 0x0(%[output]) \r\n"
930 "mfc1 %[input], $f0 \r\n"
931 "dsrl $9, $9, 0x10 \r\n"
932 "sh %[input], 0x80(%[output]) \r\n"
933 "sh $9, 0x20(%[output]) \r\n"
934 "dsrl %[input], %[input], 0x10 \r\n"
936 "sh %[input], 0xa0(%[output]) \r\n"
937 "dsrl $f4, $f4, $f18 \r\n"
938 "sh $9, 0x40(%[output]) \r\n"
939 "mfc1 %[input], $f4 \r\n"
940 "dsrl $9, $9, 0x10 \r\n"
941 "sh %[input], 0xc0(%[output]) \r\n"
942 "sh $9, 0x60(%[output]) \r\n"
943 "dsrl %[input], %[input], 0x10 \r\n"
944 "sh %[input], 0xe0(%[output]) \r\n"
945 "punpckhhw $f2, $f6, %[ff_pw_1] \r\n"
946 "punpcklhw $f6, $f6, %[ff_pw_1] \r\n"
947 "punpckhhw $f10, $f8, %[ff_pw_1] \r\n"
948 "punpcklhw $f8, $f8, %[ff_pw_1] \r\n"
949 "mtc1 %[qmul], $f14 \r\n"
950 "punpcklwd $f14, $f14, $f14 \r\n"
951 "pmaddhw $f6, $f6, $f14 \r\n"
952 "pmaddhw $f8, $f8, $f14 \r\n"
953 "pmaddhw $f2, $f2, $f14 \r\n"
954 "pmaddhw $f10, $f10, $f14 \r\n"
955 "psraw $f6, $f6, $f12 \r\n"
956 "psraw $f8, $f8, $f12 \r\n"
957 "psraw $f2, $f2, $f12 \r\n"
958 "psraw $f10, $f10, $f12 \r\n"
959 "packsswh $f6, $f6, $f2 \r\n"
960 "packsswh $f8, $f8, $f10 \r\n"
962 "dsrl $f6, $f6, $f18 \r\n"
963 "mfc1 %[input], $f6 \r\n"
964 "sh $9, 0x100(%[output]) \r\n"
965 "sh %[input], 0x180(%[output]) \r\n"
966 "dsrl $9, $9, 0x10 \r\n"
967 "dsrl %[input], %[input], 0x10 \r\n"
968 "sh $9, 0x120(%[output]) \r\n"
969 "sh %[input], 0x1a0(%[output]) \r\n"
971 "dsrl $f8, $f8, $f18 \r\n"
972 "mfc1 %[input], $f8 \r\n"
973 "sh $9, 0x140(%[output]) \r\n"
974 "sh %[input], 0x1c0(%[output]) \r\n"
975 "dsrl $9, $9, 0x10 \r\n"
976 "dsrl %[input], %[input], 0x10 \r\n"
977 "sh $9, 0x160(%[output]) \r\n"
978 "sh %[input], 0x1e0(%[output]) \r\n"
980 ::[output]
"r"(output),[input]
"r"(input),[qmul]
"r"((uint64_t)qmul),
982 :
"$9",
"$10",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10",
"$f12",
"$f14",
"$f16",
992 temp[0] = block[0] + block[16];
993 temp[1] = block[0] - block[16];
994 temp[2] = block[32] + block[48];
995 temp[3] = block[32] - block[48];
996 temp[4] = block[64] + block[80];
997 temp[5] = block[64] - block[80];
998 temp[6] = block[96] + block[112];
999 temp[7] = block[96] - block[112];
1001 t[0] = temp[0] + temp[4] + temp[2] + temp[6];
1002 t[1] = temp[0] - temp[4] + temp[2] - temp[6];
1003 t[2] = temp[0] - temp[4] - temp[2] + temp[6];
1004 t[3] = temp[0] + temp[4] - temp[2] - temp[6];
1005 t[4] = temp[1] + temp[5] + temp[3] + temp[7];
1006 t[5] = temp[1] - temp[5] + temp[3] - temp[7];
1007 t[6] = temp[1] - temp[5] - temp[3] + temp[7];
1008 t[7] = temp[1] + temp[5] - temp[3] - temp[7];
1010 block[ 0]= (t[0]*qmul + 128) >> 8;
1011 block[ 32]= (t[1]*qmul + 128) >> 8;
1012 block[ 64]= (t[2]*qmul + 128) >> 8;
1013 block[ 96]= (t[3]*qmul + 128) >> 8;
1014 block[ 16]= (t[4]*qmul + 128) >> 8;
1015 block[ 48]= (t[5]*qmul + 128) >> 8;
1016 block[ 80]= (t[6]*qmul + 128) >> 8;
1017 block[112]= (t[7]*qmul + 128) >> 8;
1024 d = block[0] - block[16];
1025 a = block[0] + block[16];
1026 b = block[32] - block[48];
1027 c = block[32] + block[48];
1028 block[0] = ((a+
c)*qmul) >> 7;
1029 block[16]= ((d+
b)*qmul) >> 7;
1030 block[32]= ((a-
c)*qmul) >> 7;
1031 block[48]= ((d-
b)*qmul) >> 7;
1039 offset <<= log2_denom;
1042 offset += 1 << (log2_denom - 1);
1048 "dmtc1 $0, $f20 \r\n"
1051 "mtc1 %4, $f10 \r\n"
1052 "pshufh $f6, $f6, $f20 \r\n"
1053 "pshufh $f8, $f8, $f20 \r\n"
1054 "punpckhbh $f14, $f2, $f20 \r\n"
1055 "punpckhbh $f16, $f4, $f20 \r\n"
1056 "punpcklbh $f2, $f2, $f20 \r\n"
1057 "punpcklbh $f4, $f4, $f20 \r\n"
1058 "pmullh $f14, $f14, $f6 \r\n"
1059 "pmullh $f16, $f16, $f6 \r\n"
1060 "pmullh $f2, $f2, $f6 \r\n"
1061 "pmullh $f4, $f4, $f6 \r\n"
1062 "paddsh $f14, $f14, $f8 \r\n"
1063 "paddsh $f16, $f16, $f8 \r\n"
1064 "paddsh $f2, $f2, $f8 \r\n"
1065 "paddsh $f4, $f4, $f8 \r\n"
1066 "psrah $f14, $f14, $f10 \r\n"
1067 "psrah $f16, $f16, $f10 \r\n"
1068 "psrah $f2, $f2, $f10 \r\n"
1069 "psrah $f4, $f4, $f10 \r\n"
1070 "packushb $f2, $f2, $f14 \r\n"
1071 "packushb $f4, $f4, $f16 \r\n"
1074 :
"=m"(*block),
"=m"(*(block + 8))
1075 :
"r"(
weight),
"r"(offset),
"r"(log2_denom)
1081 int stride,
int height,
int log2_denom,
int weightd,
int weights,
1086 offset = ((offset + 1) | 1) << log2_denom;
1092 "dmtc1 $0, $f20 \r\n"
1095 "mtc1 %8, $f10 \r\n"
1096 "mtc1 %9, $f12 \r\n"
1097 "pshufh $f6, $f6, $f20 \r\n"
1098 "pshufh $f8, $f8, $f20 \r\n"
1099 "pshufh $f10, $f10, $f20 \r\n"
1100 "punpckhbh $f14, $f2, $f20 \r\n"
1101 "punpckhbh $f16, $f4, $f20 \r\n"
1102 "punpcklbh $f2, $f2, $f20 \r\n"
1103 "punpcklbh $f4, $f4, $f20 \r\n"
1104 "pmullh $f14, $f14, $f6 \r\n"
1105 "pmullh $f16, $f16, $f8 \r\n"
1106 "pmullh $f2, $f2, $f6 \r\n"
1107 "pmullh $f4, $f4, $f8 \r\n"
1108 "paddsh $f14, $f14, $f10 \r\n"
1109 "paddsh $f2, $f2, $f10 \r\n"
1110 "paddsh $f14, $f14, $f16 \r\n"
1111 "paddsh $f2, $f2, $f4 \r\n"
1112 "psrah $f14, $f14, $f12 \r\n"
1113 "psrah $f2, $f2, $f12 \r\n"
1114 "packushb $f2, $f2, $f14 \r\n"
1118 "punpckhbh $f14, $f2, $f20 \r\n"
1119 "punpckhbh $f16, $f4, $f20 \r\n"
1120 "punpcklbh $f2, $f2, $f20 \r\n"
1121 "punpcklbh $f4, $f4, $f20 \r\n"
1122 "pmullh $f14, $f14, $f6 \r\n"
1123 "pmullh $f16, $f16, $f8 \r\n"
1124 "pmullh $f2, $f2, $f6 \r\n"
1125 "pmullh $f4, $f4, $f8 \r\n"
1126 "paddsh $f14, $f14, $f10 \r\n"
1127 "paddsh $f2, $f2, $f10 \r\n"
1128 "paddsh $f14, $f14, $f16 \r\n"
1129 "paddsh $f2, $f2, $f4 \r\n"
1130 "psrah $f14, $f14, $f12 \r\n"
1131 "psrah $f2, $f2, $f12 \r\n"
1132 "packushb $f2, $f2, $f14 \r\n"
1134 :
"=m"(*dst),
"=m"(*(dst+8))
1135 :
"m"(*src),
"m"(*dst),
"m"(*(src+8)),
"m"(*(dst+8)),
1136 "r"(weights),
"r"(weightd),
"r"(
offset),
"r"(log2_denom+1)
1146 offset <<= log2_denom;
1149 offset += 1 << (log2_denom - 1);
1156 "mtc1 %3, $f10 \r\n"
1157 "dmtc1 $0, $f20 \r\n"
1158 "pshufh $f6, $f6, $f20 \r\n"
1159 "pshufh $f8, $f8, $f20 \r\n"
1160 "punpckhbh $f14, $f2, $f20 \r\n"
1161 "punpcklbh $f2, $f2, $f20 \r\n"
1162 "pmullh $f14, $f14, $f6 \r\n"
1163 "pmullh $f2, $f2, $f6 \r\n"
1164 "paddsh $f14, $f14, $f8 \r\n"
1165 "paddsh $f2, $f2, $f8 \r\n"
1166 "psrah $f14, $f14, $f10 \r\n"
1167 "psrah $f2, $f2, $f10 \r\n"
1168 "packushb $f2, $f2, $f14 \r\n"
1171 :
"r"(
weight),
"r"(offset),
"r"(log2_denom)
/*
 * Fragment of an 8-byte-wide bi-weighted prediction row kernel.  Only the
 * tail of the parameter list is visible; the function name, the row loop and
 * the instructions that fill $f2, $f4, $f6 and $f8 are not part of this
 * excerpt.
 */
1177 int stride,
int height,
int log2_denom,
int weightd,
int weights,
/* Force the rounded offset odd, then scale it by 2^log2_denom. */
1182 offset = ((offset + 1) | 1) << log2_denom;
/* $f20 = 0: zero source for byte widening and all-lane-0 pshufh selector. */
1188 "dmtc1 $0, $f20 \r\n"
/* $f10 and $f12 come from operands %5 and %6.  These are offset and
 * log2_denom+1 only if exactly one operand precedes the list shown below
 * -- TODO confirm against the full statement. */
1191 "mtc1 %5, $f10 \r\n"
1192 "mtc1 %6, $f12 \r\n"
/* Replicate halfword 0 of each scalar into all four 16-bit lanes. */
1193 "pshufh $f6, $f6, $f20 \r\n"
1194 "pshufh $f8, $f8, $f20 \r\n"
1195 "pshufh $f10, $f10, $f20 \r\n"
/* Widen the bytes of $f2 and $f4 to 16 bits (high halves in $f14/$f16). */
1196 "punpckhbh $f14, $f2, $f20 \r\n"
1197 "punpckhbh $f16, $f4, $f20 \r\n"
1198 "punpcklbh $f2, $f2, $f20 \r\n"
1199 "punpcklbh $f4, $f4, $f20 \r\n"
/* Per lane: ($f2 * $f6 + $f10 + $f4 * $f8) >> $f12, saturating adds. */
1200 "pmullh $f14, $f14, $f6 \r\n"
1201 "pmullh $f16, $f16, $f8 \r\n"
1202 "pmullh $f2, $f2, $f6 \r\n"
1203 "pmullh $f4, $f4, $f8 \r\n"
1204 "paddsh $f14, $f14, $f10 \r\n"
1205 "paddsh $f2, $f2, $f10 \r\n"
1206 "paddsh $f14, $f14, $f16 \r\n"
1207 "paddsh $f2, $f2, $f4 \r\n"
1208 "psrah $f14, $f14, $f12 \r\n"
1209 "psrah $f2, $f2, $f12 \r\n"
/* Repack both halves into 8 bytes with unsigned saturation. */
1210 "packushb $f2, $f2, $f14 \r\n"
/* Operand list; anything preceding this colon is not visible here. */
1213 :
"m"(*src),
"m"(*dst),
"r"(weights),
1214 "r"(weightd),
"r"(
offset),
"r"(log2_denom+1)
/*
 * Fragment of a 4-byte-wide weighted-prediction row kernel.  The function
 * header, the row loop and the instructions that fill $f2, $f6 and $f8 are
 * not part of this excerpt.
 */
/* Scale the additive offset by 2^log2_denom. */
1224 offset <<= log2_denom;
/* Add the rounding term 1 << (log2_denom - 1).
 * NOTE(review): this shift is undefined for log2_denom == 0; any guard
 * around this statement is not visible here -- confirm. */
1227 offset += 1 << (log2_denom - 1);
/* $f10 = operand %3, later used as the psrah shift count (presumably
 * log2_denom; the operand numbering depends on operands not shown). */
1234 "mtc1 %3, $f10 \r\n"
/* $f20 = 0: zero source for byte widening and all-lane-0 pshufh selector. */
1235 "dmtc1 $0, $f20 \r\n"
/* Replicate halfword 0 of $f6 and of $f8 into all four 16-bit lanes. */
1236 "pshufh $f6, $f6, $f20 \r\n"
1237 "pshufh $f8, $f8, $f20 \r\n"
/* Only the low four bytes of $f2 are processed. */
1238 "punpcklbh $f2, $f2, $f20 \r\n"
/* Per lane: (value * $f6 + $f8) >> $f10, with a signed-saturating add. */
1239 "pmullh $f2, $f2, $f6 \r\n"
1240 "paddsh $f2, $f2, $f8 \r\n"
1241 "psrah $f2, $f2, $f10 \r\n"
/* Pack to bytes with unsigned saturation; the upper four bytes come from
 * the zero register $f20. */
1242 "packushb $f2, $f2, $f20 \r\n"
/* Operand list; any operands preceding this colon are not visible here. */
1245 :
"r"(
weight),
"r"(offset),
"r"(log2_denom)
/*
 * Fragment of a 4-byte-wide bi-weighted prediction row kernel.  Only the
 * tail of the parameter list is visible; the function name, the row loop and
 * the instructions that fill $f2, $f4, $f6 and $f8 are not part of this
 * excerpt.
 */
1251 int stride,
int height,
int log2_denom,
int weightd,
int weights,
/* Force the rounded offset odd, then scale it by 2^log2_denom. */
1256 offset = ((offset + 1) | 1) << log2_denom;
/* $f20 = 0: zero source for byte widening and all-lane-0 pshufh selector. */
1262 "dmtc1 $0, $f20 \r\n"
/* $f10 and $f12 come from operands %5 and %6.  These are offset and
 * log2_denom+1 only if exactly one operand precedes the list shown below
 * -- TODO confirm against the full statement. */
1265 "mtc1 %5, $f10 \r\n"
1266 "mtc1 %6, $f12 \r\n"
/* Replicate halfword 0 of each scalar into all four 16-bit lanes. */
1267 "pshufh $f6, $f6, $f20 \r\n"
1268 "pshufh $f8, $f8, $f20 \r\n"
1269 "pshufh $f10, $f10, $f20 \r\n"
/* Only the low four bytes of $f2 and $f4 are processed. */
1270 "punpcklbh $f2, $f2, $f20 \r\n"
1271 "punpcklbh $f4, $f4, $f20 \r\n"
/* Per lane: ($f2 * $f6 + $f10 + $f4 * $f8) >> $f12, saturating adds. */
1272 "pmullh $f2, $f2, $f6 \r\n"
1273 "pmullh $f4, $f4, $f8 \r\n"
1274 "paddsh $f2, $f2, $f10 \r\n"
1275 "paddsh $f2, $f2, $f4 \r\n"
1276 "psrah $f2, $f2, $f12 \r\n"
/* Pack to bytes with unsigned saturation; the upper four bytes come from
 * the zero register $f20. */
1277 "packushb $f2, $f2, $f20 \r\n"
/* Operand list; anything preceding this colon is not visible here. */
1280 :
"m"(*src),
"m"(*dst),
"r"(weights),
1281 "r"(weightd),
"r"(
offset),
"r"(log2_denom+1)
/*
 * Fragment of a register-only edge-filter body taking alpha, beta and tc0
 * (presumably an H.264 deblocking helper -- the function name is not
 * visible).  $f0, $f2, $f4 and $f6 are consumed without being loaded by any
 * visible instruction, and the results are left in $f2 and $f4 without a
 * visible store.
 * NOTE(review): this relies on floating-point register contents surviving
 * between separate asm statements, which the compiler does not guarantee;
 * $f0..$f6 are even listed as clobbers below -- confirm against the callers.
 */
1287 int alpha,
int beta, int8_t *tc0)
/* $f16 = 0, used as the all-lane-0 pshufh selector. */
1290 "xor $f16, $f16, $f16 \r\n"
/* Broadcast alpha and beta to all lanes, then narrow to 8 identical bytes
 * each (unsigned saturation). */
1291 "mtc1 %[alpha], $f8 \r\n"
1292 "mtc1 %[beta], $f10 \r\n"
1293 "pshufh $f8, $f8, $f16 \r\n"
1294 "pshufh $f10, $f10, $f16 \r\n"
1295 "packushb $f8, $f8, $f8 \r\n"
1296 "packushb $f10, $f10, $f10 \r\n"
/* $f14 = max(|$f4 - $f2| - alpha, 0) per byte (two saturating subtracts
 * or-ed together form the absolute difference). */
1297 "psubusb $f12, $f4, $f2 \r\n"
1298 "psubusb $f14, $f2, $f4 \r\n"
1299 "or $f14, $f14, $f12 \r\n"
1300 "psubusb $f14, $f14, $f8 \r\n"
/* Accumulate max(|$f2 - $f0| - beta, 0). */
1301 "psubusb $f12, $f2, $f0 \r\n"
1302 "psubusb $f8, $f0, $f2 \r\n"
1303 "or $f8, $f8, $f12 \r\n"
1304 "psubusb $f8, $f8, $f10 \r\n"
1305 "or $f14, $f14, $f8 \r\n"
/* Accumulate max(|$f4 - $f6| - beta, 0). */
1306 "psubusb $f12, $f4, $f6 \r\n"
1307 "psubusb $f8, $f6, $f4 \r\n"
1308 "or $f8, $f8, $f12 \r\n"
1309 "psubusb $f8, $f8, $f10 \r\n"
1310 "or $f14, $f14, $f8 \r\n"
/* $f14 = 0xff in every byte where all three differences are within their
 * thresholds, 0 elsewhere. */
1311 "xor $f12, $f12, $f12 \r\n"
1312 "pcmpeqb $f14, $f14, $f12 \r\n"
/* Load four tc0 bytes, duplicate each into a byte pair, and keep them only
 * where the mask is set. */
1313 "lwc1 $f12, 0x0(%[tc0]) \r\n"
1314 "punpcklbh $f12, $f12, $f12 \r\n"
1315 "and $f14, $f14, $f12 \r\n"
/* Biased delta built from byte averages of $f0, $f2, $f4 and ~$f6
 * (presumably the usual ((q0-p0)*4 + (p1-q1) + 4) >> 3 term -- confirm).
 * The ff_pb_* operands are declared on lines not visible here. */
1316 "pcmpeqb $f8, $f8, $f8 \r\n"
1317 "xor $f10, $f2, $f4 \r\n"
1318 "xor $f6, $f6, $f8 \r\n"
1319 "and $f10, $f10, %[ff_pb_1] \r\n"
1320 "pavgb $f6, $f6, $f0 \r\n"
1321 "xor $f8, $f8, $f2 \r\n"
1322 "pavgb $f6, $f6, %[ff_pb_3] \r\n"
1323 "pavgb $f8, $f8, $f4 \r\n"
1324 "pavgb $f6, $f6, $f10 \r\n"
1325 "paddusb $f6, $f6, $f8 \r\n"
/* Split around the ff_pb_A1 bias: $f12 = amount below it, $f6 = amount
 * above it; each is limited to the masked tc0 bytes in $f14. */
1326 "psubusb $f12, %[ff_pb_A1], $f6 \r\n"
1327 "psubusb $f6, $f6, %[ff_pb_A1] \r\n"
1328 "pminub $f12, $f12, $f14 \r\n"
1329 "pminub $f6, $f6, $f14 \r\n"
/* $f2 = $f2 - $f12 + $f6 and $f4 = $f4 - $f6 + $f12, saturating. */
1330 "psubusb $f2, $f2, $f12 \r\n"
1331 "psubusb $f4, $f4, $f6 \r\n"
1332 "paddusb $f2, $f2, $f6 \r\n"
1333 "paddusb $f4, $f4, $f12 \r\n"
/* Input operands (no outputs).  The list ends with a trailing comma, so at
 * least one further operand line is missing from this excerpt. */
1334 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride),
1335 [alpha]
"r"((int64_t)
alpha),[beta]
"r"((int64_t)beta),[tc0]
"r"(tc0),
/* Clobbered registers. */
1337 :
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10",
"$f12",
"$f14",
"$f16"
/*
 * Fragment of a register-only edge-filter body taking alpha and beta but no
 * tc0 (presumably the intra variant of an H.264 deblocking helper -- the
 * function name is not visible).  $f0, $f2, $f4 and $f6 are consumed without
 * being loaded by any visible instruction, and the results are left in $f2
 * and $f4 without a visible store.
 * NOTE(review): this relies on floating-point register contents surviving
 * between separate asm statements, which the compiler does not guarantee
 * -- confirm against the callers.
 */
1342 int alpha,
int beta)
/* $f16 = 0, used as the all-lane-0 pshufh selector. */
1345 "xor $f16, $f16, $f16 \r\n"
/* Broadcast alpha and beta to all lanes, then narrow to 8 identical bytes
 * each (unsigned saturation). */
1346 "mtc1 %[alpha], $f8 \r\n"
1347 "mtc1 %[beta], $f10 \r\n"
1348 "pshufh $f8, $f8, $f16 \r\n"
1349 "pshufh $f10, $f10, $f16 \r\n"
1350 "packushb $f8, $f8, $f8 \r\n"
1351 "packushb $f10, $f10, $f10 \r\n"
/* $f14 = max(|$f4 - $f2| - alpha, 0) per byte. */
1352 "psubusb $f12, $f4, $f2 \r\n"
1353 "psubusb $f14, $f2, $f4 \r\n"
1354 "or $f14, $f14, $f12 \r\n"
1355 "psubusb $f14, $f14, $f8 \r\n"
/* Accumulate max(|$f2 - $f0| - beta, 0). */
1356 "psubusb $f12, $f2, $f0 \r\n"
1357 "psubusb $f8, $f0, $f2 \r\n"
1358 "or $f8, $f8, $f12 \r\n"
1359 "psubusb $f8, $f8, $f10 \r\n"
1360 "or $f14, $f14, $f8 \r\n"
/* Accumulate max(|$f4 - $f6| - beta, 0). */
1361 "psubusb $f12, $f4, $f6 \r\n"
1362 "psubusb $f8, $f6, $f4 \r\n"
1363 "or $f8, $f8, $f12 \r\n"
1364 "psubusb $f8, $f8, $f10 \r\n"
1365 "or $f14, $f14, $f8 \r\n"
/* $f14 = 0xff in every byte where all three differences are within their
 * thresholds, 0 elsewhere. */
1366 "xor $f12, $f12, $f12 \r\n"
1367 "pcmpeqb $f14, $f14, $f12 \r\n"
/* Keep the unfiltered $f2 and $f4 in $f10 and $f12. */
1368 "mov.d $f10, $f2 \r\n"
1369 "mov.d $f12, $f4 \r\n"
/* $f2 = ($f2 + $f6 + 2*$f0 + 2) >> 2: the rounded-up average of $f2 and
 * $f6 is corrected to a floor by subtracting the xor low bit, then averaged
 * with $f0. */
1370 "xor $f8, $f2, $f6 \r\n"
1371 "and $f8, $f8, %[ff_pb_1] \r\n"
1372 "pavgb $f2, $f2, $f6 \r\n"
1373 "psubusb $f2, $f2, $f8 \r\n"
1374 "pavgb $f2, $f2, $f0 \r\n"
/* $f4 = ($f4 + $f0 + 2*$f6 + 2) >> 2, same construction. */
1375 "xor $f8, $f4, $f0 \r\n"
1376 "and $f8, $f8, %[ff_pb_1] \r\n"
1377 "pavgb $f4, $f4, $f0 \r\n"
1378 "psubusb $f4, $f4, $f8 \r\n"
1379 "pavgb $f4, $f4, $f6 \r\n"
/* Masked blend: result = original + ((filtered - original) & mask). */
1380 "psubb $f2, $f2, $f10 \r\n"
1381 "psubb $f4, $f4, $f12 \r\n"
1382 "and $f2, $f2, $f14 \r\n"
1383 "and $f4, $f4, $f14 \r\n"
1384 "paddb $f2, $f2, $f10 \r\n"
1385 "paddb $f4, $f4, $f12 \r\n"
/* Input operands (no outputs).  The list ends with a trailing comma, so at
 * least one further operand line (e.g. ff_pb_1) is missing from this
 * excerpt. */
1386 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride),
1387 [alpha]
"r"((int64_t)
alpha),[beta]
"r"((int64_t)beta),
/* Clobbered registers ($f6 is only read above and is not listed). */
1389 :
"$f0",
"$f2",
"$f4",
"$f8",
"$f10",
"$f12",
"$f14",
"$f16"
/*
 * Inline-asm body that filters one 8-byte-wide horizontal edge in place
 * (presumably H.264 luma deblocking with tc0 -- the function header and the
 * asm statement opener are not part of this excerpt).  It reads the rows
 * pix-3*stride .. pix+2*stride and writes the rows pix-2*stride ..
 * pix+stride.
 * NOTE(review): %[alpha] and %[beta] are decremented by the asm but appear
 * in the input-only operand list below; GCC requires operands that are
 * modified to be declared as outputs -- confirm and fix in the full source.
 * NOTE(review): the asm stores to memory, yet no memory clobber or memory
 * output is visible in this excerpt -- confirm.
 */
/* $8 = 2*stride, $9 = pix - 3*stride; alpha and beta are each reduced by 1. */
1397 "daddu $8, %[stride], %[stride] \r\n"
1398 "xor $f16, $f16, $f16 \r\n"
1399 "daddu $9, %[stride], $8 \r\n"
1400 "daddiu %[alpha], %[alpha], -0x1 \r\n"
1401 "dsubu $9, $0, $9 \r\n"
1402 "daddiu %[beta], %[beta], -0x1 \r\n"
1403 "daddu $9, $9, %[pix] \r\n"
/* Rows: $f0 = pix-2*stride, $f2 = pix-stride, $f4 = pix, $f6 = pix+stride. */
1404 "ldc1 $f4, 0x0(%[pix]) \r\n"
1405 "gsldxc1 $f0, 0x0($9, %[stride]) \r\n"
1406 "gsldxc1 $f2, 0x0($9, $8) \r\n"
1407 "gsldxc1 $f6, 0x0(%[pix], %[stride]) \r\n"
/* Broadcast alpha-1 and beta-1 to 8 identical bytes each. */
1408 "mtc1 %[alpha], $f8 \r\n"
1409 "mtc1 %[beta], $f10 \r\n"
1410 "pshufh $f8, $f8, $f16 \r\n"
1411 "pshufh $f10, $f10, $f16 \r\n"
1412 "packushb $f8, $f8, $f8 \r\n"
1413 "packushb $f10, $f10, $f10 \r\n"
/* Accumulate in $f14 the saturated excess of |$f4-$f2| over alpha-1 and of
 * |$f2-$f0| and |$f4-$f6| over beta-1. */
1414 "psubusb $f12, $f4, $f2 \r\n"
1415 "psubusb $f14, $f2, $f4 \r\n"
1416 "or $f14, $f14, $f12 \r\n"
1417 "psubusb $f12, $f2, $f0 \r\n"
1418 "psubusb $f14, $f14, $f8 \r\n"
1419 "psubusb $f8, $f0, $f2 \r\n"
1420 "or $f8, $f8, $f12 \r\n"
1421 "psubusb $f12, $f4, $f6 \r\n"
1422 "psubusb $f8, $f8, $f10 \r\n"
1423 "or $f14, $f14, $f8 \r\n"
1424 "psubusb $f8, $f6, $f4 \r\n"
1425 "or $f8, $f8, $f12 \r\n"
1426 "psubusb $f8, $f8, $f10 \r\n"
1427 "or $f14, $f14, $f8 \r\n"
/* $f14 = 0xff where every difference is within its threshold. */
1428 "pcmpeqb $f14, $f14, $f16 \r\n"
/* $f6 = all ones (-1 per byte) for the signed tc0 > -1 test below. */
1429 "pcmpeqb $f6, $f6, $f6 \r\n"
/* Unaligned load of four tc0 bytes; the two unpack steps leave tc0[0] in
 * bytes 0-3 and tc0[1] in bytes 4-7 of $f18. */
1430 "gslwlc1 $f8, 0x3(%[tc0]) \r\n"
1431 "gslwrc1 $f8, 0x0(%[tc0]) \r\n"
1432 "punpcklbh $f8, $f8, $f8 \r\n"
1433 "punpcklbh $f18, $f8, $f8 \r\n"
/* $f20 = filter mask restricted to bytes whose tc0 is >= 0. */
1434 "pcmpgtb $f8, $f18, $f6 \r\n"
1435 "ldc1 $f6, 0x0($9) \r\n"
1436 "and $f20, $f8, $f14 \r\n"
/* Upper side: $f6 = row pix-3*stride.  $f12 = mask where |$f6-$f2| is within
 * beta-1; $f14 = masked tc0 plus 1 where that holds (subtracting a 0xff
 * mask byte adds one). */
1437 "psubusb $f14, $f6, $f2 \r\n"
1438 "psubusb $f12, $f2, $f6 \r\n"
1439 "psubusb $f14, $f14, $f10 \r\n"
1440 "psubusb $f12, $f12, $f10 \r\n"
1441 "pcmpeqb $f12, $f12, $f14 \r\n"
1442 "and $f12, $f12, $f20 \r\n"
1443 "and $f8, $f20, $f18 \r\n"
1444 "psubb $f14, $f8, $f12 \r\n"
1445 "and $f12, $f12, $f8 \r\n"
/* New row pix-2*stride: an average-based candidate clamped to
 * [$f0 - $f12, $f0 + $f12] by pmaxub/pminub, then stored. */
1446 "pavgb $f8, $f2, $f4 \r\n"
1447 "ldc1 $f22, 0x0($9) \r\n"
1448 "pavgb $f6, $f6, $f8 \r\n"
1449 "xor $f8, $f8, $f22 \r\n"
1450 "and $f8, $f8, %[ff_pb_1] \r\n"
1451 "psubusb $f6, $f6, $f8 \r\n"
1452 "psubusb $f8, $f0, $f12 \r\n"
1453 "paddusb $f12, $f12, $f0 \r\n"
1454 "pmaxub $f6, $f6, $f8 \r\n"
1455 "pminub $f6, $f6, $f12 \r\n"
1456 "gssdxc1 $f6, 0x0($9, %[stride]) \r\n"
/* Lower side: $f8 = row pix+2*stride; same test against $f4, and $f14 gains
 * another +1 where it holds. */
1457 "gsldxc1 $f8, 0x0(%[pix], $8) \r\n"
1458 "psubusb $f6, $f8, $f4 \r\n"
1459 "psubusb $f12, $f4, $f8 \r\n"
1460 "psubusb $f6, $f6, $f10 \r\n"
1461 "psubusb $f12, $f12, $f10 \r\n"
1462 "pcmpeqb $f12, $f12, $f6 \r\n"
1463 "and $f12, $f12, $f20 \r\n"
1464 "psubb $f14, $f14, $f12 \r\n"
1465 "and $f10, $f18, $f12 \r\n"
/* New row pix+stride: candidate clamped to [$f6 - $f10, $f6 + $f10] where
 * $f6 is the reloaded row pix+stride, then stored. */
1466 "gsldxc1 $f6, 0x0(%[pix], %[stride]) \r\n"
1467 "pavgb $f12, $f2, $f4 \r\n"
1468 "gsldxc1 $f22, 0x0(%[pix], $8) \r\n"
1469 "pavgb $f8, $f8, $f12 \r\n"
1470 "xor $f12, $f12, $f22 \r\n"
1471 "and $f12, $f12, %[ff_pb_1] \r\n"
1472 "psubusb $f8, $f8, $f12 \r\n"
1473 "psubusb $f12, $f6, $f10 \r\n"
1474 "paddusb $f10, $f10, $f6 \r\n"
1475 "pmaxub $f8, $f8, $f12 \r\n"
1476 "pminub $f8, $f8, $f10 \r\n"
1477 "gssdxc1 $f8, 0x0(%[pix], %[stride]) \r\n"
/* Centre rows: biased delta built from byte averages of $f0, $f2, $f4 and
 * ~$f6, split around ff_pb_A1 and limited to the bytes in $f14. */
1478 "xor $f10, $f2, $f4 \r\n"
1479 "pcmpeqb $f8, $f8, $f8 \r\n"
1480 "and $f10, $f10, %[ff_pb_1] \r\n"
1481 "xor $f6, $f6, $f8 \r\n"
1482 "xor $f8, $f8, $f2 \r\n"
1483 "pavgb $f6, $f6, $f0 \r\n"
1484 "pavgb $f6, $f6, %[ff_pb_3] \r\n"
1485 "pavgb $f8, $f8, $f4 \r\n"
1486 "pavgb $f6, $f6, $f10 \r\n"
1487 "paddusb $f6, $f6, $f8 \r\n"
1488 "psubusb $f12, %[ff_pb_A1], $f6 \r\n"
1489 "psubusb $f6, $f6, %[ff_pb_A1] \r\n"
1490 "pminub $f12, $f12, $f14 \r\n"
1491 "pminub $f6, $f6, $f14 \r\n"
/* $f2 = $f2 - $f12 + $f6 and $f4 = $f4 - $f6 + $f12, saturating. */
1492 "psubusb $f2, $f2, $f12 \r\n"
1493 "psubusb $f4, $f4, $f6 \r\n"
1494 "paddusb $f2, $f2, $f6 \r\n"
1495 "paddusb $f4, $f4, $f12 \r\n"
/* Store the rows pix-stride and pix. */
1496 "gssdxc1 $f2, 0x0($9, $8) \r\n"
1497 "sdc1 $f4, 0x0(%[pix]) \r\n"
/* Input operands (no outputs).  The list ends with a trailing comma, so at
 * least one further operand line (the ff_pb_* constants) is missing from
 * this excerpt. */
1498 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride),
1499 [alpha]
"r"((int64_t)
alpha),[beta]
"r"((int64_t)beta),[tc0]
"r"(tc0),
/* Clobbered registers. */
1501 :
"$8",
"$9",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10",
"$f12",
"$f14",
"$f16",
1502 "$f18",
"$f20",
"$f22"
/*
 * Inline-asm body that filters one 8-byte-wide horizontal edge in place
 * without tc0 (presumably the H.264 luma intra / strong filter -- the
 * function header and the asm statement opener are not part of this
 * excerpt).  It reads the rows pix-4*stride .. pix+3*stride and writes the
 * rows pix-3*stride .. pix+2*stride.
 * NOTE(review): %[alpha] and %[beta] are decremented by the asm but appear
 * in the input-only operand list below; GCC requires operands that are
 * modified to be declared as outputs -- confirm and fix in the full source.
 * NOTE(review): the %[stack] operand declaration and the target of the
 * forward branches to local label 1 are on lines missing from this excerpt.
 */
/* Scratch area used through %[stack]; the asm touches byte offsets 0x0,
 * 0x10, 0x20, 0x30 and 0x40 (8 bytes each), within the 0xa * 8 bytes here. */
1509 uint64_t stack[0xa];
/* $f16 = 1 and $f30 = 0; $10 = 2*stride, $9 = 3*stride, $8 = pix - 4*stride.
 * Both thresholds are reduced by 1, branching to label 1 if negative. */
1512 "ori $8, $0, 0x1 \r\n"
1513 "xor $f30, $f30, $f30 \r\n"
1514 "dmtc1 $8, $f16 \r\n"
1515 "dsll $8, %[stride], 2 \r\n"
1516 "daddu $10, %[stride], %[stride] \r\n"
1517 "daddiu %[alpha], %[alpha], -0x1 \r\n"
/* Presumably the Loongson 64-bit FPR shift: $f20 = 1 << 1 = 2, used later
 * as a psrlh shift count -- confirm. */
1518 "dsll $f20, $f16, $f16 \r\n"
1519 "bltz %[alpha], 1f \r\n"
1520 "daddu $9, $10, %[stride] \r\n"
1521 "daddiu %[beta], %[beta], -0x1 \r\n"
1522 "bltz %[beta], 1f \r\n"
1523 "dsubu $8, $0, $8 \r\n"
1524 "daddu $8, $8, %[pix] \r\n"
/* Rows: $f0 = pix-2*stride, $f2 = pix-stride, $f4 = pix, $f6 = pix+stride. */
1525 "ldc1 $f4, 0x0(%[pix]) \r\n"
1526 "gsldxc1 $f0, 0x0($8, $10) \r\n"
1527 "gsldxc1 $f2, 0x0($8, $9) \r\n"
1528 "gsldxc1 $f6, 0x0(%[pix], %[stride]) \r\n"
/* Broadcast alpha-1 and beta-1 to 8 identical bytes each; the alpha bytes
 * are also saved at stack+0x10. */
1529 "mtc1 %[alpha], $f8 \r\n"
1530 "mtc1 %[beta], $f10 \r\n"
1531 "pshufh $f8, $f8, $f30 \r\n"
1532 "pshufh $f10, $f10, $f30 \r\n"
1533 "packushb $f8, $f8, $f8 \r\n"
1534 "psubusb $f12, $f4, $f2 \r\n"
1535 "psubusb $f14, $f2, $f4 \r\n"
1536 "packushb $f10, $f10, $f10 \r\n"
1537 "or $f14, $f14, $f12 \r\n"
1538 "sdc1 $f8, 0x10+%[stack] \r\n"
/* Base mask: accumulate the saturated excess of |$f4-$f2| over alpha-1 and
 * of |$f2-$f0| and |$f4-$f6| over beta-1, then compare with zero.  The mask
 * is saved at stack+0x20. */
1539 "psubusb $f14, $f14, $f8 \r\n"
1540 "psubusb $f12, $f2, $f0 \r\n"
1541 "psubusb $f8, $f0, $f2 \r\n"
1542 "or $f8, $f8, $f12 \r\n"
1543 "psubusb $f8, $f8, $f10 \r\n"
1544 "or $f14, $f14, $f8 \r\n"
1545 "psubusb $f12, $f4, $f6 \r\n"
1546 "psubusb $f8, $f6, $f4 \r\n"
1547 "or $f8, $f8, $f12 \r\n"
1548 "psubusb $f8, $f8, $f10 \r\n"
1549 "or $f14, $f14, $f8 \r\n"
1550 "xor $f12, $f12, $f12 \r\n"
1551 "ldc1 $f8, 0x10+%[stack] \r\n"
1552 "pcmpeqb $f14, $f14, $f12 \r\n"
1553 "sdc1 $f14, 0x20+%[stack] \r\n"
/* Tighter test on |$f4-$f2| against a threshold derived from the alpha
 * bytes by two pavgb steps (with zero, then with ff_pb_1); the result in
 * $f12 is combined with the base mask. */
1554 "pavgb $f8, $f8, $f30 \r\n"
1555 "psubusb $f14, $f4, $f2 \r\n"
1556 "pavgb $f8, $f8, %[ff_pb_1] \r\n"
1557 "psubusb $f12, $f2, $f4 \r\n"
1558 "psubusb $f14, $f14, $f8 \r\n"
1559 "psubusb $f12, $f12, $f8 \r\n"
1560 "ldc1 $f28, 0x20+%[stack] \r\n"
1561 "pcmpeqb $f12, $f12, $f14 \r\n"
1562 "and $f12, $f12, $f28 \r\n"
/* $f28 = row pix-3*stride.  Mask for the upper side (|$f28-$f2| within
 * beta-1, and $f12) is saved at stack+0x30. */
1563 "gsldxc1 $f28, 0x0($8, %[stride]) \r\n"
1564 "psubusb $f14, $f28, $f2 \r\n"
1565 "psubusb $f8, $f2, $f28 \r\n"
1566 "psubusb $f14, $f14, $f10 \r\n"
1567 "psubusb $f8, $f8, $f10 \r\n"
1568 "pcmpeqb $f8, $f8, $f14 \r\n"
1569 "and $f8, $f8, $f12 \r\n"
/* $f26 = row pix+2*stride.  Mask for the lower side (|$f26-$f4| within
 * beta-1, and $f12) is saved at stack+0x40. */
1570 "gsldxc1 $f26, 0x0(%[pix], $10) \r\n"
1571 "sdc1 $f8, 0x30+%[stack] \r\n"
1572 "psubusb $f14, $f26, $f4 \r\n"
1573 "psubusb $f8, $f4, $f26 \r\n"
1574 "psubusb $f14, $f14, $f10 \r\n"
1575 "psubusb $f8, $f8, $f10 \r\n"
1576 "pcmpeqb $f8, $f8, $f14 \r\n"
1577 "and $f8, $f8, $f12 \r\n"
1578 "sdc1 $f8, 0x40+%[stack] \r\n"
/* Upper side: build filtered candidates from byte averages and byte sums of
 * $f28, $f0, $f2, $f4 and $f6.  stack+0x10 is reused for avg($f2,$f4) and
 * stack+0x0 holds the byte sum $f28+$f0+$f2+$f4. */
1579 "pavgb $f8, $f28, $f0 \r\n"
1580 "pavgb $f10, $f2, $f4 \r\n"
1581 "pavgb $f8, $f8, $f10 \r\n"
1582 "sdc1 $f10, 0x10+%[stack] \r\n"
1583 "paddb $f12, $f28, $f0 \r\n"
1584 "paddb $f14, $f2, $f4 \r\n"
1585 "paddb $f12, $f12, $f14 \r\n"
1586 "mov.d $f14, $f12 \r\n"
1587 "sdc1 $f12, 0x0+%[stack] \r\n"
1588 "psrlh $f12, $f12, $f16 \r\n"
1589 "pavgb $f12, $f12, $f30 \r\n"
1590 "xor $f12, $f12, $f8 \r\n"
1591 "and $f12, $f12, %[ff_pb_1] \r\n"
1592 "psubb $f8, $f8, $f12 \r\n"
1593 "pavgb $f10, $f28, $f6 \r\n"
1594 "psubb $f12, $f28, $f6 \r\n"
1595 "paddb $f14, $f14, $f14 \r\n"
1596 "psubb $f14, $f14, $f12 \r\n"
1597 "and $f12, $f12, %[ff_pb_1] \r\n"
1598 "psubb $f10, $f10, $f12 \r\n"
1599 "ldc1 $f24, 0x10+%[stack] \r\n"
1600 "pavgb $f10, $f10, $f0 \r\n"
1601 "psrlh $f14, $f14, $f20 \r\n"
1602 "pavgb $f10, $f10, $f24 \r\n"
1603 "pavgb $f14, $f14, $f30 \r\n"
1604 "xor $f14, $f14, $f10 \r\n"
1605 "and $f14, $f14, %[ff_pb_1] \r\n"
1606 "psubb $f10, $f10, $f14 \r\n"
/* $f12 = ($f2 + $f6 + 2*$f0 + 2) >> 2, built from floor and rounded
 * averages. */
1607 "xor $f14, $f2, $f6 \r\n"
1608 "pavgb $f12, $f2, $f6 \r\n"
1609 "and $f14, $f14, %[ff_pb_1] \r\n"
1610 "psubb $f12, $f12, $f14 \r\n"
1611 "ldc1 $f24, 0x30+%[stack] \r\n"
1612 "pavgb $f12, $f12, $f0 \r\n"
1613 "ldc1 $f22, 0x20+%[stack] \r\n"
/* Per-byte select by xor/and between the original row $f2 and the two
 * candidates, controlled by the masks from stack+0x20 and stack+0x30; the
 * result is stored to row pix-stride. */
1614 "xor $f10, $f10, $f12 \r\n"
1615 "xor $f12, $f12, $f2 \r\n"
1616 "and $f10, $f10, $f24 \r\n"
1617 "and $f12, $f12, $f22 \r\n"
1618 "xor $f10, $f10, $f12 \r\n"
1619 "xor $f10, $f10, $f2 \r\n"
1620 "gssdxc1 $f10, 0x0($8, $9) \r\n"
/* Row pix-4*stride is loaded to build the candidate for row pix-3*stride. */
1621 "ldc1 $f10, 0x0($8) \r\n"
1622 "paddb $f12, $f28, $f10 \r\n"
1623 "pavgb $f10, $f10, $f28 \r\n"
1624 "ldc1 $f22, 0x0+%[stack] \r\n"
1625 "pavgb $f10, $f10, $f8 \r\n"
1626 "paddb $f12, $f12, $f12 \r\n"
1627 "paddb $f12, $f12, $f22 \r\n"
1628 "psrlh $f12, $f12, $f20 \r\n"
1629 "pavgb $f12, $f12, $f30 \r\n"
1630 "xor $f12, $f12, $f10 \r\n"
1631 "and $f12, $f12, %[ff_pb_1] \r\n"
1632 "ldc1 $f22, 0x30+%[stack] \r\n"
1633 "psubb $f10, $f10, $f12 \r\n"
/* Under the stack+0x30 mask, replace rows pix-2*stride ($f0) and
 * pix-3*stride ($f28) by their candidates, then store both. */
1634 "xor $f8, $f8, $f0 \r\n"
1635 "xor $f10, $f10, $f28 \r\n"
1636 "and $f8, $f8, $f22 \r\n"
1637 "and $f10, $f10, $f22 \r\n"
1638 "xor $f8, $f8, $f0 \r\n"
1639 "xor $f10, $f10, $f28 \r\n"
1640 "gssdxc1 $f8, 0x0($8, $10) \r\n"
1641 "gssdxc1 $f10, 0x0($8, %[stride]) \r\n"
/* Lower side: the same sequence mirrored, with $f26/$f6/$f4/$f2/$f0 taking
 * the roles of $f28/$f0/$f2/$f4/$f6 and the mask from stack+0x40. */
1642 "pavgb $f8, $f26, $f6 \r\n"
1643 "pavgb $f10, $f4, $f2 \r\n"
1644 "pavgb $f8, $f8, $f10 \r\n"
1645 "sdc1 $f10, 0x10+%[stack] \r\n"
1646 "paddb $f12, $f26, $f6 \r\n"
1647 "paddb $f14, $f4, $f2 \r\n"
1648 "paddb $f12, $f12, $f14 \r\n"
1649 "mov.d $f14, $f12 \r\n"
1650 "sdc1 $f12, 0x0+%[stack] \r\n"
1651 "psrlh $f12, $f12, $f16 \r\n"
1652 "pavgb $f12, $f12, $f30 \r\n"
1653 "xor $f12, $f12, $f8 \r\n"
1654 "and $f12, $f12, %[ff_pb_1] \r\n"
1655 "psubb $f8, $f8, $f12 \r\n"
1656 "pavgb $f10, $f26, $f0 \r\n"
1657 "paddb $f14, $f14, $f14 \r\n"
1658 "psubb $f12, $f26, $f0 \r\n"
1659 "psubb $f14, $f14, $f12 \r\n"
1660 "and $f12, $f12, %[ff_pb_1] \r\n"
1661 "psubb $f10, $f10, $f12 \r\n"
1662 "ldc1 $f22, 0x10+%[stack] \r\n"
1663 "pavgb $f10, $f10, $f6 \r\n"
1664 "pavgb $f10, $f10, $f22 \r\n"
1665 "psrlh $f14, $f14, $f20 \r\n"
1666 "pavgb $f14, $f14, $f30 \r\n"
1667 "xor $f14, $f14, $f10 \r\n"
1668 "and $f14, $f14, %[ff_pb_1] \r\n"
1669 "psubb $f10, $f10, $f14 \r\n"
/* $f12 = ($f4 + $f0 + 2*$f6 + 2) >> 2. */
1670 "xor $f14, $f4, $f0 \r\n"
1671 "pavgb $f12, $f4, $f0 \r\n"
1672 "and $f14, $f14, %[ff_pb_1] \r\n"
1673 "ldc1 $f22, 0x40+%[stack] \r\n"
1674 "psubb $f12, $f12, $f14 \r\n"
1675 "ldc1 $f24, 0x20+%[stack] \r\n"
1676 "pavgb $f12, $f12, $f6 \r\n"
/* Select and store row pix. */
1677 "xor $f10, $f10, $f12 \r\n"
1678 "xor $f12, $f12, $f4 \r\n"
1679 "and $f10, $f10, $f22 \r\n"
1680 "and $f12, $f12, $f24 \r\n"
1681 "xor $f10, $f10, $f12 \r\n"
1682 "xor $f10, $f10, $f4 \r\n"
1683 "sdc1 $f10, 0x0(%[pix]) \r\n"
/* Row pix+3*stride is loaded to build the candidate for row pix+2*stride. */
1684 "gsldxc1 $f10, 0x0(%[pix], $9) \r\n"
1685 "paddb $f12, $f26, $f10 \r\n"
1686 "pavgb $f10, $f10, $f26 \r\n"
1687 "ldc1 $f22, 0x0+%[stack] \r\n"
1688 "pavgb $f10, $f10, $f8 \r\n"
1689 "paddb $f12, $f12, $f12 \r\n"
1690 "paddb $f12, $f12, $f22 \r\n"
1691 "psrlh $f12, $f12, $f20 \r\n"
1692 "pavgb $f12, $f12, $f30 \r\n"
1693 "xor $f12, $f12, $f10 \r\n"
1694 "and $f12, $f12, %[ff_pb_1] \r\n"
1695 "ldc1 $f22, 0x40+%[stack] \r\n"
1696 "psubb $f10, $f10, $f12 \r\n"
/* Under the stack+0x40 mask, replace rows pix+stride ($f6) and
 * pix+2*stride ($f26) by their candidates, then store both. */
1697 "xor $f8, $f8, $f6 \r\n"
1698 "xor $f10, $f10, $f26 \r\n"
1699 "and $f8, $f8, $f22 \r\n"
1700 "and $f10, $f10, $f22 \r\n"
1701 "xor $f8, $f8, $f6 \r\n"
1702 "xor $f10, $f10, $f26 \r\n"
1703 "gssdxc1 $f8, 0x0(%[pix], %[stride]) \r\n"
1704 "gssdxc1 $f10, 0x0(%[pix], $10) \r\n"
/* Input operands (no outputs).  The list ends with a trailing comma, so at
 * least one further operand line is missing from this excerpt. */
1706 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride),
1707 [alpha]
"r"((int64_t)
alpha),[beta]
"r"((int64_t)beta),
/* Clobbered registers. */
1709 :
"$8",
"$9",
"$10",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10",
"$f12",
"$f14",
1710 "$f16",
"$f18",
"$f20",
"$f22",
"$f24",
"$f26",
"$f28",
"$f30"
/*
 * Two asm statements that bracket a step not visible in this excerpt.  The
 * first decrements alpha and beta and loads four 8-byte rows around pix; the
 * second stores two rows back.
 * NOTE(review): the second statement depends on $16, $f2 and $f4 still
 * holding values set outside it; the compiler does not guarantee register
 * contents between separate asm statements -- confirm.
 * NOTE(review): $16 is modified here, but no clobber list for the first
 * statement is visible in this excerpt -- confirm it is declared.
 */
/* alpha -= 1, beta -= 1 (both declared read-write below). */
1718 "daddiu %[alpha], %[alpha], -0x1 \r\n"
1719 "daddiu %[beta], %[beta], -0x1 \r\n"
/* $16 = pix - 2*stride. */
1720 "or $16, $0, %[pix] \r\n"
1721 "dsubu $16, $16, %[stride] \r\n"
1722 "dsubu $16, $16, %[stride] \r\n"
/* Rows: $f0 = pix-2*stride, $f2 = pix-stride, $f4 = pix, $f6 = pix+stride. */
1723 "ldc1 $f0, 0x0($16) \r\n"
1724 "gsldxc1 $f2, 0x0($16, %[stride]) \r\n"
1725 "ldc1 $f4, 0x0(%[pix]) \r\n"
1726 "gsldxc1 $f6, 0x0(%[pix], %[stride]) \r\n"
/* Read-write operands; the list continues on lines not visible here. */
1727 : [pix]
"+r"(pix),[stride]
"+r"(stride),[
alpha]
"+r"(
alpha),
/* Second statement: store $f2 to row pix-stride and $f4 to row pix. */
1736 "gssdxc1 $f2, 0x0($16, %[stride]) \r\n"
1737 "sdc1 $f4, 0x0(%[pix]) \r\n"
1738 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride)
/*
 * Two asm statements that bracket a step not visible in this excerpt.  The
 * first decrements alpha and beta and loads four 8-byte rows around pix; the
 * second stores two rows back.
 * NOTE(review): the second statement depends on $16, $f2 and $f4 still
 * holding values set outside it; the compiler does not guarantee register
 * contents between separate asm statements -- confirm.
 */
/* alpha -= 1, beta -= 1 (both declared read-write below). */
1747 "daddiu %[alpha], %[alpha], -0x1 \r\n"
1748 "daddiu %[beta], %[beta], -0x1 \r\n"
/* $16 = pix - 2*stride. */
1749 "or $16, $0, %[pix] \r\n"
1750 "dsubu $16, $16, %[stride] \r\n"
1751 "dsubu $16, $16, %[stride] \r\n"
/* Rows: $f0 = pix-2*stride, $f2 = pix-stride, $f4 = pix, $f6 = pix+stride. */
1752 "ldc1 $f0, 0x0($16) \r\n"
1753 "gsldxc1 $f2, 0x0($16, %[stride]) \r\n"
1754 "ldc1 $f4, 0x0(%[pix]) \r\n"
1755 "gsldxc1 $f6, 0x0(%[pix], %[stride]) \r\n"
/* Read-write operands; the list continues on a line not visible here. */
1756 : [pix]
"+r"(pix),[stride]
"+r"(stride),[
alpha]
"+r"(
alpha),
/* No input operands; clobbered registers of the first statement. */
1758 ::
"$16",
"$f0",
"$f2",
"$f4",
"$f6"
/* Second statement: store $f2 to row pix-stride and $f4 to row pix. */
1764 "gssdxc1 $f2, 0x0($16, %[stride]) \r\n"
1765 "sdc1 $f4, 0x0(%[pix]) \r\n"
1766 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride)
/*
 * Asm statement that gathers 4 bytes from each of 8 consecutive rows,
 * starting 2 bytes to the left of pix, and transposes them into four 8-byte
 * column vectors: $f0, $f2, $f4 and $f6 hold columns 0..3.  Copies of
 * columns 0 and 3 are kept in $f20 and $f22.  The enclosing function header
 * is not part of this excerpt.
 * Side effect on the C variable: pix is left at (pix - 2) + 3*stride because
 * it is a read-write operand.
 */
/* alpha -= 1, beta -= 1. */
1775 "daddiu %[alpha], %[alpha], -0x1 \r\n"
1776 "daddiu %[beta], %[beta], -0x1 \r\n"
/* $16 = 2*stride, $17 = 3*stride, $19 = 4*stride; $18 = address of row 0
 * (pix - 2); pix then advances to row 3. */
1777 "daddu $16, %[stride], %[stride] \r\n"
1778 "daddiu %[pix], %[pix], -0x2 \r\n"
1779 "daddu $17, $16, %[stride] \r\n"
1780 "daddu $19, $16, $16 \r\n"
1781 "or $18, $0, %[pix] \r\n"
1782 "daddu %[pix], %[pix], $17 \r\n"
/* Unaligned 4-byte loads: row 0 -> $f0, row 1 -> $f4, row 2 -> $f2,
 * row 3 -> $f6. */
1783 "gslwlc1 $f0, 0x3($18) \r\n"
1784 "daddu $12, $18, %[stride] \r\n"
1785 "gslwrc1 $f0, 0x0($18) \r\n"
1786 "gslwlc1 $f4, 0x3($12) \r\n"
1787 "daddu $13, $18, $16 \r\n"
1788 "gslwrc1 $f4, 0x0($12) \r\n"
1789 "gslwlc1 $f2, 0x3($13) \r\n"
1790 "gslwrc1 $f2, 0x0($13) \r\n"
1791 "gslwlc1 $f6, 0x3(%[pix]) \r\n"
1792 "gslwrc1 $f6, 0x0(%[pix]) \r\n"
/* Interleave rows 0..3: $f0 = columns 0,1 and $f4 = columns 2,3. */
1793 "punpcklbh $f0, $f0, $f4 \r\n"
1794 "punpcklbh $f2, $f2, $f6 \r\n"
1795 "daddu $12, %[pix], %[stride] \r\n"
1796 "punpckhhw $f4, $f0, $f2 \r\n"
1797 "punpcklhw $f0, $f0, $f2 \r\n"
/* Rows 4..7: row 4 -> $f8, row 5 -> $f12, row 6 -> $f10, row 7 -> $f14. */
1798 "gslwlc1 $f8, 0x3($12) \r\n"
1799 "daddu $13, %[pix], $16 \r\n"
1800 "gslwrc1 $f8, 0x0($12) \r\n"
1801 "gslwlc1 $f12, 0x3($13) \r\n"
1802 "daddu $12, %[pix], $17 \r\n"
1803 "gslwrc1 $f12, 0x0($13) \r\n"
1804 "gslwlc1 $f10, 0x3($12) \r\n"
1805 "daddu $13, %[pix], $19 \r\n"
1806 "gslwrc1 $f10, 0x0($12) \r\n"
1807 "gslwlc1 $f14, 0x3($13) \r\n"
1808 "gslwrc1 $f14, 0x0($13) \r\n"
/* Interleave rows 4..7: $f8 = columns 0,1 and $f12 = columns 2,3. */
1809 "punpcklbh $f8, $f8, $f12 \r\n"
1810 "punpcklbh $f10, $f10, $f14 \r\n"
1811 "mov.d $f12, $f8 \r\n"
1812 "punpcklhw $f8, $f8, $f10 \r\n"
1813 "punpckhhw $f12, $f12, $f10 \r\n"
/* Merge both halves: $f0 = column 0, $f2 = column 1, $f4 = column 2,
 * $f6 = column 3 (8 bytes each, row 0 in the lowest byte). */
1814 "punpckhwd $f2, $f0, $f8 \r\n"
1815 "punpckhwd $f6, $f4, $f12 \r\n"
1816 "punpcklwd $f0, $f0, $f8 \r\n"
1817 "punpcklwd $f4, $f4, $f12 \r\n"
/* Keep copies of the outer columns. */
1818 "mov.d $f20, $f0 \r\n"
1819 "mov.d $f22, $f6 \r\n"
/* Read-write operands; the list continues on a line not visible here. */
1820 : [pix]
"+r"(pix),[stride]
"+r"(stride),[
alpha]
"+r"(
alpha),
/* No input operands; clobbered registers. */
1822 ::
"$12",
"$13",
"$16",
"$17",
"$18",
"$19",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
1823 "$f10",
"$f12",
"$f14",
"$f20",
"$f22"
/*
 * Asm statement that transposes four 8-byte column vectors ($f20, $f2, $f4,
 * $f22) back into rows and writes 4 bytes to each of 8 rows with unaligned
 * stores.
 * NOTE(review): $16, $17, $18, $19, $f2, $f4, $f20 and $f22 are read here
 * without being set in this statement, so it depends on register contents
 * left by code outside it; the compiler does not guarantee that between
 * separate asm statements -- confirm.  %[pix] is likewise expected to be the
 * address of row 3, not the original pointer.
 */
/* Split off the high halves (rows 4..7) of columns 0, 1 and 2. */
1829 "punpckhwd $f8, $f20, $f20 \r\n"
1830 "punpckhwd $f10, $f2, $f2 \r\n"
1831 "punpckhwd $f12, $f4, $f4 \r\n"
/* Rows 0..3: interleave columns 0/1 and 2/3, then pairs into 4-byte rows. */
1832 "punpcklbh $f0, $f20, $f2 \r\n"
1833 "punpcklbh $f4, $f4, $f22 \r\n"
1834 "punpcklhw $f2, $f0, $f4 \r\n"
1835 "punpckhhw $f0, $f0, $f4 \r\n"
/* Store rows 0, 1, 2 (relative to $18) and row 3 (at %[pix]). */
1836 "gsswlc1 $f2, 0x3($18) \r\n"
1837 "gsswrc1 $f2, 0x0($18) \r\n"
1838 "daddu $12, $18, %[stride] \r\n"
1839 "punpckhwd $f2, $f2, $f2 \r\n"
1840 "gsswlc1 $f2, 0x3($12) \r\n"
1841 "daddu $13, $18, $16 \r\n"
1842 "gsswrc1 $f2, 0x0($12) \r\n"
1843 "gsswlc1 $f0, 0x3($13) \r\n"
1844 "gsswrc1 $f0, 0x0($13) \r\n"
1845 "punpckhwd $f0, $f0, $f0 \r\n"
1846 "punpckhwd $f6, $f22, $f22 \r\n"
1847 "gsswlc1 $f0, 0x3(%[pix]) \r\n"
1848 "gsswrc1 $f0, 0x0(%[pix]) \r\n"
/* Rows 4..7: same interleave on the high halves. */
1849 "punpcklbh $f8, $f8, $f10 \r\n"
1850 "punpcklbh $f12, $f12, $f6 \r\n"
1851 "daddu $12, %[pix], %[stride] \r\n"
1852 "punpcklhw $f10, $f8, $f12 \r\n"
1853 "punpckhhw $f8, $f8, $f12 \r\n"
/* Store rows 4..7 at %[pix] + stride, + $16, + $17 and + $19. */
1854 "gsswlc1 $f10, 0x3($12) \r\n"
1855 "gsswrc1 $f10, 0x0($12) \r\n"
1856 "punpckhwd $f10, $f10, $f10 \r\n"
1857 "daddu $12, %[pix], $16 \r\n"
1858 "daddu $13, %[pix], $17 \r\n"
1859 "gsswlc1 $f10, 0x3($12) \r\n"
1860 "gsswrc1 $f10, 0x0($12) \r\n"
1861 "gsswlc1 $f8, 0x3($13) \r\n"
1862 "daddu $12, %[pix], $19 \r\n"
1863 "punpckhwd $f20, $f8, $f8 \r\n"
1864 "gsswrc1 $f8, 0x0($13) \r\n"
1865 "gsswlc1 $f20, 0x3($12) \r\n"
1866 "gsswrc1 $f20, 0x0($12) \r\n"
/* Input operands (no outputs). */
1867 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride)
/* Clobbered registers. */
1868 :
"$12",
"$13",
"$16",
"$17",
"$18",
"$19",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
1869 "$f10",
"$f12",
"$f20"
/*
 * Asm statement that gathers 4 bytes from each of 8 consecutive rows,
 * starting 2 bytes to the left of pix, and transposes them into four 8-byte
 * column vectors: $f0, $f2, $f4 and $f6 hold columns 0..3.  The enclosing
 * function header is not part of this excerpt.
 * Side effect on the C variable: pix is left at (pix - 2) + 3*stride because
 * it is a read-write operand.
 */
/* alpha -= 1, beta -= 1. */
1877 "daddiu %[alpha], %[alpha], -0x1 \r\n"
1878 "daddiu %[beta], %[beta], -0x1 \r\n"
/* $16 = 2*stride, $17 = 3*stride, $19 = 4*stride; $18 = address of row 0
 * (pix - 2); pix then advances to row 3. */
1879 "daddu $16, %[stride], %[stride] \r\n"
1880 "daddiu %[pix], %[pix], -0x2 \r\n"
1881 "daddu $17, $16, %[stride] \r\n"
1882 "daddu $19, $16, $16 \r\n"
1883 "or $18, $0, %[pix] \r\n"
1884 "daddu %[pix], %[pix], $17 \r\n"
/* Unaligned 4-byte loads: row 0 -> $f0, row 1 -> $f4, row 2 -> $f2,
 * row 3 -> $f6. */
1885 "gslwlc1 $f0, 0x3($18) \r\n"
1886 "daddu $12, $18, %[stride] \r\n"
1887 "gslwrc1 $f0, 0x0($18) \r\n"
1888 "gslwlc1 $f4, 0x3($12) \r\n"
1889 "daddu $13, $18, $16 \r\n"
1890 "gslwrc1 $f4, 0x0($12) \r\n"
1891 "gslwlc1 $f2, 0x3($13) \r\n"
1892 "gslwrc1 $f2, 0x0($13) \r\n"
1893 "gslwlc1 $f6, 0x3(%[pix]) \r\n"
1894 "gslwrc1 $f6, 0x0(%[pix]) \r\n"
/* Interleave rows 0..3: $f0 = columns 0,1 and $f4 = columns 2,3. */
1895 "punpcklbh $f0, $f0, $f4 \r\n"
1896 "punpcklbh $f2, $f2, $f6 \r\n"
1897 "daddu $12, %[pix], %[stride] \r\n"
1898 "punpckhhw $f4, $f0, $f2 \r\n"
1899 "punpcklhw $f0, $f0, $f2 \r\n"
/* Rows 4..7: row 4 -> $f8, row 5 -> $f12, row 6 -> $f10, row 7 -> $f14. */
1900 "gslwlc1 $f8, 0x3($12) \r\n"
1901 "daddu $13, %[pix], $16 \r\n"
1902 "gslwrc1 $f8, 0x0($12) \r\n"
1903 "gslwlc1 $f12, 0x3($13) \r\n"
1904 "daddu $12, %[pix], $17 \r\n"
1905 "gslwrc1 $f12, 0x0($13) \r\n"
1906 "gslwlc1 $f10, 0x3($12) \r\n"
1907 "daddu $13, %[pix], $19 \r\n"
1908 "gslwrc1 $f10, 0x0($12) \r\n"
1909 "gslwlc1 $f14, 0x3($13) \r\n"
1910 "gslwrc1 $f14, 0x0($13) \r\n"
/* Interleave rows 4..7: $f8 = columns 0,1 and $f12 = columns 2,3. */
1911 "punpcklbh $f8, $f8, $f12 \r\n"
1912 "punpcklbh $f10, $f10, $f14 \r\n"
1913 "mov.d $f12, $f8 \r\n"
1914 "punpcklhw $f8, $f8, $f10 \r\n"
1915 "punpckhhw $f12, $f12, $f10 \r\n"
/* Merge both halves: $f0 = column 0, $f2 = column 1, $f4 = column 2,
 * $f6 = column 3 (8 bytes each, row 0 in the lowest byte). */
1916 "punpckhwd $f2, $f0, $f8 \r\n"
1917 "punpckhwd $f6, $f4, $f12 \r\n"
1918 "punpcklwd $f0, $f0, $f8 \r\n"
1919 "punpcklwd $f4, $f4, $f12 \r\n"
/* Read-write operands; the list continues on a line not visible here. */
1920 : [pix]
"+r"(pix),[stride]
"+r"(stride),[
alpha]
"+r"(
alpha),
/* No input operands; clobbered registers ($f20 and $f22 are listed although
 * no visible instruction in this statement writes them). */
1922 ::
"$12",
"$13",
"$16",
"$17",
"$18",
"$19",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
1923 "$f10",
"$f12",
"$f14",
"$f20",
"$f22"
/*
 * Asm statement that transposes four 8-byte column vectors ($f0, $f2, $f4,
 * $f6) back into rows and writes 4 bytes to each of 8 rows with unaligned
 * stores.
 * NOTE(review): $16, $17, $18, $19, $f0, $f2, $f4 and $f6 are read here
 * without being set in this statement, so it depends on register contents
 * left by code outside it; the compiler does not guarantee that between
 * separate asm statements -- confirm.  %[pix] is likewise expected to be the
 * address of row 3, not the original pointer.
 */
/* Split off the high halves (rows 4..7) of columns 0, 1 and 2. */
1929 "punpckhwd $f8, $f0, $f0 \r\n"
1930 "punpckhwd $f10, $f2, $f2 \r\n"
1931 "punpckhwd $f12, $f4, $f4 \r\n"
/* Rows 0..3: interleave columns 0/1 and 2/3, then pairs into 4-byte rows. */
1932 "punpcklbh $f0, $f0, $f2 \r\n"
1933 "punpcklbh $f4, $f4, $f6 \r\n"
1934 "punpcklhw $f2, $f0, $f4 \r\n"
1935 "punpckhhw $f0, $f0, $f4 \r\n"
/* Store rows 0, 1, 2 (relative to $18) and row 3 (at %[pix]). */
1936 "gsswlc1 $f2, 0x3($18) \r\n"
1937 "gsswrc1 $f2, 0x0($18) \r\n"
1938 "daddu $12, $18, %[stride] \r\n"
1939 "punpckhwd $f2, $f2, $f2 \r\n"
1940 "gsswlc1 $f2, 0x3($12) \r\n"
1941 "daddu $13, $18, $16 \r\n"
1942 "gsswrc1 $f2, 0x0($12) \r\n"
1943 "gsswlc1 $f0, 0x3($13) \r\n"
1944 "gsswrc1 $f0, 0x0($13) \r\n"
1945 "punpckhwd $f0, $f0, $f0 \r\n"
1946 "punpckhwd $f6, $f6, $f6 \r\n"
1947 "gsswlc1 $f0, 0x3(%[pix]) \r\n"
1948 "gsswrc1 $f0, 0x0(%[pix]) \r\n"
/* Rows 4..7: same interleave on the high halves. */
1949 "punpcklbh $f8, $f8, $f10 \r\n"
1950 "punpcklbh $f12, $f12, $f6 \r\n"
1951 "daddu $12, %[pix], %[stride] \r\n"
1952 "punpcklhw $f10, $f8, $f12 \r\n"
1953 "punpckhhw $f8, $f8, $f12 \r\n"
/* Store rows 4..7 at %[pix] + stride, + $16, + $17 and + $19. */
1954 "gsswlc1 $f10, 0x3($12) \r\n"
1955 "gsswrc1 $f10, 0x0($12) \r\n"
1956 "punpckhwd $f10, $f10, $f10 \r\n"
1957 "daddu $12, %[pix], $16 \r\n"
1958 "daddu $13, %[pix], $17 \r\n"
1959 "gsswlc1 $f10, 0x3($12) \r\n"
1960 "gsswrc1 $f10, 0x0($12) \r\n"
1961 "gsswlc1 $f8, 0x3($13) \r\n"
1962 "daddu $12, %[pix], $19 \r\n"
1963 "punpckhwd $f20, $f8, $f8 \r\n"
1964 "gsswrc1 $f8, 0x0($13) \r\n"
1965 "gsswlc1 $f20, 0x3($12) \r\n"
1966 "gsswrc1 $f20, 0x0($12) \r\n"
/* Input operands (no outputs). */
1967 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride)
/* Clobbered registers. */
1968 :
"$12",
"$13",
"$16",
"$17",
"$18",
"$19",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
1969 "$f10",
"$f12",
"$f20"
/* Each guard is true unless BOTH tc0 values of the pair are negative: the
 * bitwise AND keeps the sign bit only when it is set in both operands.  The
 * guarded statements are on lines not visible in this excerpt. */
1976 if ((tc0[0] & tc0[1]) >= 0)
1978 if ((tc0[2] & tc0[3]) >= 0)
/*
 * Asm statement that reads 8 bytes from each of 16 consecutive rows,
 * starting 4 bytes to the left of pix, transposes them, and saves columns
 * 1..6 into the local scratch buffer as six 16-byte lines at offsets 0x00,
 * 0x10, ..., 0x50 (rows 0..7 in the first 8 bytes, rows 8..15 at +8).
 * The enclosing function header is not part of this excerpt.
 * NOTE(review): the buffer is written with sdc1 although it is passed only
 * as an input memory operand and no outputs are declared; the compiler may
 * therefore assume the buffer is unchanged -- confirm and fix in the full
 * source.
 */
/* Scratch buffer: 0xd * 8 = 104 bytes, of which 0x60 are written below. */
1992 uint64_t stack[0xd];
/* $15 = 2*stride, $9 = 3*stride, $8 = row 0 (pix - 4), $10 = row 3. */
1995 "daddu $15, %[stride], %[stride] \r\n"
1996 "daddiu $8, %[pix], -0x4 \r\n"
1997 "daddu $9, %[stride], $15 \r\n"
/* Unaligned 8-byte loads of rows 0..6 into $f0, $f2, ..., $f12. */
1998 "gsldlc1 $f0, 0x7($8) \r\n"
1999 "gsldrc1 $f0, 0x0($8) \r\n"
2000 "daddu $12, $8, %[stride] \r\n"
2001 "daddu $10, $8, $9 \r\n"
2002 "gsldlc1 $f2, 0x7($12) \r\n"
2003 "daddu $11, $8, $15 \r\n"
2004 "gsldrc1 $f2, 0x0($12) \r\n"
2005 "gsldlc1 $f4, 0x7($11) \r\n"
2006 "gsldrc1 $f4, 0x0($11) \r\n"
2007 "gsldlc1 $f6, 0x7($10) \r\n"
2008 "daddu $12, $10, %[stride] \r\n"
2009 "gsldrc1 $f6, 0x0($10) \r\n"
2010 "gsldlc1 $f8, 0x7($12) \r\n"
2011 "daddu $11, $10, $15 \r\n"
2012 "gsldrc1 $f8, 0x0($12) \r\n"
2013 "gsldlc1 $f10, 0x7($11) \r\n"
2014 "daddu $12, $10, $9 \r\n"
2015 "gsldrc1 $f10, 0x0($11) \r\n"
2016 "gsldlc1 $f12, 0x7($12) \r\n"
2017 "gsldrc1 $f12, 0x0($12) \r\n"
/* $14 = 4*stride. */
2018 "daddu $14, $15, $15 \r\n"
/* Byte-interleave row pairs (0,1), (2,3), (4,5); the stack+0x10 slot is
 * used as a temporary spill because row 7 still has to be loaded. */
2019 "punpckhbh $f14, $f0, $f2 \r\n"
2020 "punpcklbh $f0, $f0, $f2 \r\n"
2021 "punpckhbh $f2, $f4, $f6 \r\n"
2022 "punpcklbh $f4, $f4, $f6 \r\n"
2023 "punpckhbh $f6, $f8, $f10 \r\n"
2024 "punpcklbh $f8, $f8, $f10 \r\n"
2025 "daddu $12, $10, $14 \r\n"
2026 "sdc1 $f2, 0x10+%[stack] \r\n"
/* Row 7 -> $f16. */
2027 "gsldlc1 $f16, 0x7($12) \r\n"
2028 "gsldrc1 $f16, 0x0($12) \r\n"
/* $13 = 8*stride. */
2029 "daddu $13, $14, $14 \r\n"
/* Halfword and word interleaves complete the transpose. */
2030 "punpckhbh $f10, $f12, $f16 \r\n"
2031 "punpcklbh $f12, $f12, $f16 \r\n"
2032 "punpckhhw $f2, $f0, $f4 \r\n"
2033 "punpcklhw $f0, $f0, $f4 \r\n"
2034 "punpckhhw $f4, $f8, $f12 \r\n"
2035 "punpcklhw $f8, $f8, $f12 \r\n"
2036 "ldc1 $f16, 0x10+%[stack] \r\n"
/* Column 1 -> stack+0x0. */
2037 "punpckhwd $f0, $f0, $f8 \r\n"
2038 "sdc1 $f0, 0x0+%[stack] \r\n"
2039 "punpckhhw $f12, $f14, $f16 \r\n"
2040 "punpcklhw $f14, $f14, $f16 \r\n"
2041 "punpckhhw $f0, $f6, $f10 \r\n"
2042 "punpcklhw $f6, $f6, $f10 \r\n"
2043 "punpcklwd $f12, $f12, $f0 \r\n"
2044 "punpckhwd $f10, $f14, $f6 \r\n"
2045 "punpcklwd $f14, $f14, $f6 \r\n"
2046 "punpckhwd $f6, $f2, $f4 \r\n"
2047 "punpcklwd $f2, $f2, $f4 \r\n"
/* Columns 2, 3, 4, 5, 6 -> stack+0x10 .. stack+0x50. */
2048 "sdc1 $f2, 0x10+%[stack] \r\n"
2049 "sdc1 $f6, 0x20+%[stack] \r\n"
2050 "sdc1 $f14, 0x30+%[stack] \r\n"
2051 "sdc1 $f10, 0x40+%[stack] \r\n"
2052 "sdc1 $f12, 0x50+%[stack] \r\n"
/* Advance both row pointers by 8 rows and repeat for rows 8..15. */
2053 "daddu $8, $8, $13 \r\n"
2054 "daddu $10, $10, $13 \r\n"
2055 "gsldlc1 $f0, 0x7($8) \r\n"
2056 "daddu $12, $8, %[stride] \r\n"
2057 "gsldrc1 $f0, 0x0($8) \r\n"
2058 "gsldlc1 $f2, 0x7($12) \r\n"
2059 "daddu $11, $8, $15 \r\n"
2060 "gsldrc1 $f2, 0x0($12) \r\n"
2061 "gsldlc1 $f4, 0x7($11) \r\n"
2062 "gsldrc1 $f4, 0x0($11) \r\n"
2063 "gsldlc1 $f6, 0x7($10) \r\n"
2064 "daddu $12, $10, %[stride] \r\n"
2065 "gsldrc1 $f6, 0x0($10) \r\n"
2066 "gsldlc1 $f8, 0x7($12) \r\n"
2067 "daddu $11, $10, $15 \r\n"
2068 "gsldrc1 $f8, 0x0($12) \r\n"
2069 "gsldlc1 $f10, 0x7($11) \r\n"
2070 "daddu $12, $10, $9 \r\n"
2071 "gsldrc1 $f10, 0x0($11) \r\n"
2072 "gsldlc1 $f12, 0x7($12) \r\n"
2073 "gsldrc1 $f12, 0x0($12) \r\n"
2074 "punpckhbh $f14, $f0, $f2 \r\n"
2075 "punpcklbh $f0, $f0, $f2 \r\n"
2076 "punpckhbh $f2, $f4, $f6 \r\n"
2077 "punpcklbh $f4, $f4, $f6 \r\n"
2078 "punpckhbh $f6, $f8, $f10 \r\n"
2079 "punpcklbh $f8, $f8, $f10 \r\n"
2080 "daddu $12, $10, $14 \r\n"
/* Temporary spill into the second half of the 0x10 line. */
2081 "sdc1 $f2, 0x18+%[stack] \r\n"
2082 "gsldlc1 $f16, 0x7($12) \r\n"
2083 "gsldrc1 $f16, 0x0($12) \r\n"
2084 "punpckhhw $f2, $f0, $f4 \r\n"
2085 "punpckhbh $f10, $f12, $f16 \r\n"
2086 "punpcklbh $f12, $f12, $f16 \r\n"
2087 "punpcklhw $f0, $f0, $f4 \r\n"
2088 "punpckhhw $f4, $f8, $f12 \r\n"
2089 "punpcklhw $f8, $f8, $f12 \r\n"
2090 "punpckhwd $f0, $f0, $f8 \r\n"
2091 "ldc1 $f16, 0x18+%[stack] \r\n"
/* Column 1 of rows 8..15 -> stack+0x8. */
2092 "sdc1 $f0, 0x8+%[stack] \r\n"
2093 "punpckhhw $f12, $f14, $f16 \r\n"
2094 "punpcklhw $f14, $f14, $f16 \r\n"
2095 "punpckhhw $f0, $f6, $f10 \r\n"
2096 "punpcklhw $f6, $f6, $f10 \r\n"
2097 "punpckhwd $f10, $f14, $f6 \r\n"
2098 "punpcklwd $f14, $f14, $f6 \r\n"
2099 "punpckhwd $f6, $f2, $f4 \r\n"
2100 "punpcklwd $f2, $f2, $f4 \r\n"
2101 "punpcklwd $f12, $f12, $f0 \r\n"
/* Columns 2..6 of rows 8..15 -> stack+0x18 .. stack+0x58. */
2102 "sdc1 $f2, 0x18+%[stack] \r\n"
2103 "sdc1 $f6, 0x28+%[stack] \r\n"
2104 "sdc1 $f14, 0x38+%[stack] \r\n"
2105 "sdc1 $f10, 0x48+%[stack] \r\n"
2106 "sdc1 $f12, 0x58+%[stack] \r\n"
/* Input operands (no outputs); see the review note above about stack. */
2107 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride),[stack]
"m"(stack[0])
/* Clobbered registers. */
2108 :
"$8",
"$9",
"$10",
"$11",
"$12",
"$13",
"$14",
"$15",
"$f0",
"$f2",
"$f4",
2109 "$f6",
"$f8",
"$f10",
"$f12",
"$f14",
"$f16"
/*
 * Asm statement that reads four 16-byte lines from the scratch buffer
 * (offsets 0x10, 0x20, 0x30 and 0x40), transposes them back into rows and
 * writes 4 bytes to each of 16 consecutive rows, starting 2 bytes to the
 * left of pix.  Rows 0..7 use the first 8 bytes of each line and rows 8..15
 * the bytes at +8.  The code that updates the buffer between the gather and
 * this statement is not part of this excerpt.
 */
/* $15 = 2*stride, $14 = 4*stride, $9 = 3*stride, $13 = 8*stride;
 * $8 = row 0 (pix - 2), $10 = row 3. */
2115 "daddu $15, %[stride], %[stride] \r\n"
2116 "daddiu $8, %[pix], -0x2 \r\n"
2117 "daddu $14, $15, $15 \r\n"
2118 "daddu $9, $15, %[stride] \r\n"
2119 "daddu $13, $14, $14 \r\n"
2120 "daddu $10, $8, $9 \r\n"
/* Load the four column vectors for rows 0..7. */
2121 "ldc1 $f0, 0x10+%[stack] \r\n"
2122 "ldc1 $f2, 0x20+%[stack] \r\n"
2123 "ldc1 $f4, 0x30+%[stack] \r\n"
2124 "ldc1 $f6, 0x40+%[stack] \r\n"
/* Split off the high halves (rows 4..7) of the first three columns. */
2125 "punpckhwd $f8, $f0, $f0 \r\n"
2126 "punpckhwd $f10, $f2, $f2 \r\n"
2127 "punpckhwd $f12, $f4, $f4 \r\n"
/* Rows 0..3: interleave column pairs, then pairs into 4-byte rows. */
2128 "punpcklbh $f0, $f0, $f2 \r\n"
2129 "punpcklbh $f4, $f4, $f6 \r\n"
2130 "punpcklhw $f2, $f0, $f4 \r\n"
2131 "punpckhhw $f0, $f0, $f4 \r\n"
/* Unaligned 4-byte stores of rows 0..3. */
2132 "gsswlc1 $f2, 0x3($8) \r\n"
2133 "gsswrc1 $f2, 0x0($8) \r\n"
2134 "daddu $12, $8, %[stride] \r\n"
2135 "punpckhwd $f2, $f2, $f2 \r\n"
2136 "daddu $11, $8, $15 \r\n"
2137 "gsswlc1 $f2, 0x3($12) \r\n"
2138 "gsswrc1 $f2, 0x0($12) \r\n"
2139 "gsswlc1 $f0, 0x3($11) \r\n"
2140 "gsswrc1 $f0, 0x0($11) \r\n"
2141 "punpckhwd $f0, $f0, $f0 \r\n"
2142 "punpckhwd $f6, $f6, $f6 \r\n"
2143 "gsswlc1 $f0, 0x3($10) \r\n"
2144 "gsswrc1 $f0, 0x0($10) \r\n"
/* Rows 4..7. */
2145 "punpcklbh $f8, $f8, $f10 \r\n"
2146 "punpcklbh $f12, $f12, $f6 \r\n"
2147 "punpcklhw $f10, $f8, $f12 \r\n"
2148 "daddu $12, $10, %[stride] \r\n"
2149 "punpckhhw $f8, $f8, $f12 \r\n"
2150 "gsswlc1 $f10, 0x3($12) \r\n"
2151 "gsswrc1 $f10, 0x0($12) \r\n"
2152 "daddu $12, $10, $15 \r\n"
2153 "punpckhwd $f10, $f10, $f10 \r\n"
2154 "daddu $11, $10, $9 \r\n"
2155 "gsswlc1 $f10, 0x3($12) \r\n"
2156 "gsswrc1 $f10, 0x0($12) \r\n"
2157 "gsswlc1 $f8, 0x3($11) \r\n"
2158 "gsswrc1 $f8, 0x0($11) \r\n"
2159 "daddu $12, $10, $14 \r\n"
2160 "punpckhwd $f8, $f8, $f8 \r\n"
/* Advance both row pointers by 8 rows. */
2161 "daddu $8, $8, $13 \r\n"
2162 "gsswlc1 $f8, 0x3($12) \r\n"
2163 "gsswrc1 $f8, 0x0($12) \r\n"
2164 "daddu $10, $10, $13 \r\n"
/* Load the four column vectors for rows 8..15. */
2165 "ldc1 $f0, 0x18+%[stack] \r\n"
2166 "ldc1 $f2, 0x28+%[stack] \r\n"
2167 "ldc1 $f4, 0x38+%[stack] \r\n"
2168 "ldc1 $f6, 0x48+%[stack] \r\n"
/* $15 and $14 are recomputed to the same values as above. */
2169 "daddu $15, %[stride], %[stride] \r\n"
2170 "punpckhwd $f8, $f0, $f0 \r\n"
2171 "daddu $14, $15, $15 \r\n"
2172 "punpckhwd $f10, $f2, $f2 \r\n"
2173 "punpckhwd $f12, $f4, $f4 \r\n"
2174 "punpcklbh $f0, $f0, $f2 \r\n"
2175 "punpcklbh $f4, $f4, $f6 \r\n"
2176 "daddu $12, $8, %[stride] \r\n"
2177 "punpcklhw $f2, $f0, $f4 \r\n"
2178 "punpckhhw $f0, $f0, $f4 \r\n"
/* Stores of rows 8..11. */
2179 "gsswlc1 $f2, 0x3($8) \r\n"
2180 "gsswrc1 $f2, 0x0($8) \r\n"
2181 "punpckhwd $f2, $f2, $f2 \r\n"
2182 "daddu $11, $8, $15 \r\n"
2183 "gsswlc1 $f2, 0x3($12) \r\n"
2184 "gsswrc1 $f2, 0x0($12) \r\n"
2185 "gsswlc1 $f0, 0x3($11) \r\n"
2186 "gsswrc1 $f0, 0x0($11) \r\n"
2187 "punpckhwd $f0, $f0, $f0 \r\n"
2188 "punpckhwd $f6, $f6, $f6 \r\n"
2189 "gsswlc1 $f0, 0x3($10) \r\n"
2190 "gsswrc1 $f0, 0x0($10) \r\n"
/* Rows 12..15. */
2191 "punpcklbh $f8, $f8, $f10 \r\n"
2192 "punpcklbh $f12, $f12, $f6 \r\n"
2193 "daddu $12, $10, %[stride] \r\n"
2194 "punpcklhw $f10, $f8, $f12 \r\n"
2195 "punpckhhw $f8, $f8, $f12 \r\n"
2196 "gsswlc1 $f10, 0x3($12) \r\n"
2197 "gsswrc1 $f10, 0x0($12) \r\n"
2198 "daddu $12, $10, $15 \r\n"
2199 "punpckhwd $f10, $f10, $f10 \r\n"
2200 "daddu $11, $10, $9 \r\n"
2201 "gsswlc1 $f10, 0x3($12) \r\n"
2202 "gsswrc1 $f10, 0x0($12) \r\n"
2203 "gsswlc1 $f8, 0x3($11) \r\n"
2204 "gsswrc1 $f8, 0x0($11) \r\n"
2205 "daddu $12, $10, $14 \r\n"
2206 "punpckhwd $f8, $f8, $f8 \r\n"
2207 "gsswlc1 $f8, 0x3($12) \r\n"
2208 "gsswrc1 $f8, 0x0($12) \r\n"
/* Input operands (no outputs). */
2209 ::[pix]
"r"(pix),[stride]
"r"((int64_t)
stride),[stack]
"m"(stack[0])
/* Clobbered registers ($f14 and $f16 are listed although no visible
 * instruction in this statement writes them). */
2210 :
"$8",
"$9",
"$10",
"$11",
"$12",
"$13",
"$14",
"$15",
"$f0",
"$f2",
"$f4",
2211 "$f6",
"$f8",
"$f10",
"$f12",
"$f14",
"$f16"
/*
 * Scratch buffer plus the asm statement that fills it.
 *
 * The asm reads a window of 16 rows x 8 bytes starting at pix - 4 and writes
 * its byte transpose into ptmp: source column k ends up at byte offset
 * 0x10 * k, with source rows 0..7 in bytes 0..7 of that slot and source rows
 * 8..15 in bytes 8..15.  Bytes 0x00..0x7f of ptmp are written.
 *
 * On exit $10 (pix - 4 + 8*stride), $11 (3*stride), $12 (2*stride) and
 * $13 (4*stride) are saved to pdat at offsets 0x00/0x08/0x10/0x18.
 *
 * Fix: ptmp and pdat are listed as *input* operands although the template
 * stores to them, so a "memory" clobber is added to tell the compiler that
 * memory is modified by this statement.
 *
 * NOTE(review): the template references %[pdat], but no pdat operand is
 * visible in this listing after the trailing comma on the ptmp operand --
 * confirm against the real file.
 */
2218 uint64_t ptmp[0x11];
/* Address setup: $12 = 2*stride, $10 = pix - 4 (row 0), $11 = 3*stride,
 * $13 = 4*stride, $9 = row 3, $8 = row 1. */
2222 "daddu $12, %[stride], %[stride] \r\n"
2223 "daddiu $10, %[pix], -0x4 \r\n"
2224 "daddu $11, $12, %[stride] \r\n"
2225 "daddu $13, $12, $12 \r\n"
2226 "daddu $9, $10, $11 \r\n"
2227 "daddu $8, $10, %[stride] \r\n"
/* Pass 1: load rows 0..7.  Each gsldlc1/gsldrc1 pair (offsets 7 and 0)
 * fetches 8 bytes from an address that need not be aligned. */
2228 "gsldlc1 $f0, 0x7($10) \r\n"
2229 "gsldrc1 $f0, 0x0($10) \r\n"
2230 "daddu $14, $10, $12 \r\n"
2231 "gsldlc1 $f2, 0x7($8) \r\n"
2232 "gsldrc1 $f2, 0x0($8) \r\n"
2233 "gsldlc1 $f4, 0x7($14) \r\n"
2234 "gsldrc1 $f4, 0x0($14) \r\n"
2235 "daddu $8, $9, %[stride] \r\n"
2236 "gsldlc1 $f6, 0x7($9) \r\n"
2237 "gsldrc1 $f6, 0x0($9) \r\n"
2238 "daddu $14, $9, $12 \r\n"
2239 "gsldlc1 $f8, 0x7($8) \r\n"
2240 "gsldrc1 $f8, 0x0($8) \r\n"
2241 "daddu $8, $9, $11 \r\n"
2242 "gsldlc1 $f10, 0x7($14) \r\n"
2243 "gsldrc1 $f10, 0x0($14) \r\n"
2244 "gsldlc1 $f12, 0x7($8) \r\n"
2245 "gsldrc1 $f12, 0x0($8) \r\n"
2246 "daddu $8, $9, $13 \r\n"
/* Transpose step 1: interleave bytes of row pairs (0,1) (2,3) (4,5) (6,7). */
2247 "punpckhbh $f14, $f0, $f2 \r\n"
2248 "punpcklbh $f0, $f0, $f2 \r\n"
2249 "punpckhbh $f2, $f4, $f6 \r\n"
2250 "punpcklbh $f4, $f4, $f6 \r\n"
2251 "punpckhbh $f6, $f8, $f10 \r\n"
2252 "punpcklbh $f8, $f8, $f10 \r\n"
2253 "gsldlc1 $f16, 0x7($8) \r\n"
2254 "gsldrc1 $f16, 0x0($8) \r\n"
2255 "punpckhbh $f10, $f12, $f16 \r\n"
2256 "punpcklbh $f12, $f12, $f16 \r\n"
/* Transpose step 2: interleave halfwords.  ptmp+0x0 and ptmp+0x20 are used
 * as temporary spill slots here; both are overwritten with final data below. */
2257 "sdc1 $f6, 0x0+%[ptmp] \r\n"
2258 "punpckhhw $f6, $f0, $f4 \r\n"
2259 "punpcklhw $f0, $f0, $f4 \r\n"
2260 "punpckhhw $f4, $f8, $f12 \r\n"
2261 "punpcklhw $f8, $f8, $f12 \r\n"
2262 "punpckhhw $f12, $f14, $f2 \r\n"
2263 "punpcklhw $f14, $f14, $f2 \r\n"
2264 "sdc1 $f4, 0x20+%[ptmp] \r\n"
2265 "ldc1 $f4, 0x0+%[ptmp] \r\n"
2266 "punpckhhw $f2, $f4, $f10 \r\n"
2267 "punpcklhw $f4, $f4, $f10 \r\n"
/* Transpose step 3: interleave words, then store columns 0, 1, 4, 5. */
2268 "punpckhwd $f10, $f0, $f8 \r\n"
2269 "punpcklwd $f0, $f0, $f8 \r\n"
2270 "punpckhwd $f8, $f14, $f4 \r\n"
2271 "punpcklwd $f14, $f14, $f4 \r\n"
2272 "sdc1 $f0, 0x0+%[ptmp] \r\n"
2273 "sdc1 $f10, 0x10+%[ptmp] \r\n"
2274 "sdc1 $f14, 0x40+%[ptmp] \r\n"
2275 "sdc1 $f8, 0x50+%[ptmp] \r\n"
2276 "ldc1 $f16, 0x20+%[ptmp] \r\n"
2277 "punpckhwd $f0, $f6, $f16 \r\n"
2278 "punpcklwd $f6, $f6, $f16 \r\n"
2279 "punpckhwd $f10, $f12, $f2 \r\n"
2280 "punpcklwd $f12, $f12, $f2 \r\n"
/* $8 = 8*stride; store columns 2, 3, 6, 7, then advance both row pointers
 * by eight rows for the second pass. */
2281 "daddu $8, $13, $13 \r\n"
2282 "sdc1 $f6, 0x20+%[ptmp] \r\n"
2283 "sdc1 $f0, 0x30+%[ptmp] \r\n"
2284 "sdc1 $f12, 0x60+%[ptmp] \r\n"
2285 "sdc1 $f10, 0x70+%[ptmp] \r\n"
2286 "daddu $10, $10, $8 \r\n"
2287 "daddu $9, $9, $8 \r\n"
/* Pass 2: same load/transpose sequence for rows 8..15; results go to the
 * upper 8 bytes of each 16-byte slot (offsets 0x8, 0x18, ..., 0x78). */
2288 "daddu $8, $10, %[stride] \r\n"
2289 "gsldlc1 $f0, 0x7($10) \r\n"
2290 "gsldrc1 $f0, 0x0($10) \r\n"
2291 "daddu $14, $10, $12 \r\n"
2292 "gsldlc1 $f2, 0x7($8) \r\n"
2293 "gsldrc1 $f2, 0x0($8) \r\n"
2294 "gsldlc1 $f4, 0x7($14) \r\n"
2295 "gsldrc1 $f4, 0x0($14) \r\n"
2296 "daddu $8, $9, %[stride] \r\n"
2297 "gsldlc1 $f6, 0x7($9) \r\n"
2298 "gsldrc1 $f6, 0x0($9) \r\n"
2299 "daddu $14, $9, $12 \r\n"
2300 "gsldlc1 $f8, 0x7($8) \r\n"
2301 "gsldrc1 $f8, 0x0($8) \r\n"
2302 "daddu $8, $9, $11 \r\n"
2303 "gsldlc1 $f10, 0x7($14) \r\n"
2304 "gsldrc1 $f10, 0x0($14) \r\n"
2305 "gsldlc1 $f12, 0x7($8) \r\n"
2306 "gsldrc1 $f12, 0x0($8) \r\n"
2307 "daddu $8, $9, $13 \r\n"
2308 "punpckhbh $f14, $f0, $f2 \r\n"
2309 "punpcklbh $f0, $f0, $f2 \r\n"
2310 "punpckhbh $f2, $f4, $f6 \r\n"
2311 "punpcklbh $f4, $f4, $f6 \r\n"
2312 "punpckhbh $f6, $f8, $f10 \r\n"
2313 "punpcklbh $f8, $f8, $f10 \r\n"
2314 "gsldlc1 $f16, 0x7($8) \r\n"
2315 "gsldrc1 $f16, 0x0($8) \r\n"
2316 "punpckhbh $f10, $f12, $f16 \r\n"
2317 "punpcklbh $f12, $f12, $f16 \r\n"
/* Spill slots for this pass are ptmp+0x8 and ptmp+0x28. */
2318 "sdc1 $f6, 0x8+%[ptmp] \r\n"
2319 "punpckhhw $f6, $f0, $f4 \r\n"
2320 "punpcklhw $f0, $f0, $f4 \r\n"
2321 "punpckhhw $f4, $f8, $f12 \r\n"
2322 "punpcklhw $f8, $f8, $f12 \r\n"
2323 "punpckhhw $f12, $f14, $f2 \r\n"
2324 "punpcklhw $f14, $f14, $f2 \r\n"
2325 "sdc1 $f4, 0x28+%[ptmp] \r\n"
2326 "ldc1 $f4, 0x8+%[ptmp] \r\n"
2327 "punpckhhw $f2, $f4, $f10 \r\n"
2328 "punpcklhw $f4, $f4, $f10 \r\n"
2329 "punpckhwd $f10, $f0, $f8 \r\n"
2330 "punpcklwd $f0, $f0, $f8 \r\n"
2331 "punpckhwd $f8, $f14, $f4 \r\n"
2332 "punpcklwd $f14, $f14, $f4 \r\n"
2333 "sdc1 $f0, 0x8+%[ptmp] \r\n"
2334 "sdc1 $f10, 0x18+%[ptmp] \r\n"
2335 "sdc1 $f14, 0x48+%[ptmp] \r\n"
2336 "sdc1 $f8, 0x58+%[ptmp] \r\n"
2337 "ldc1 $f16, 0x28+%[ptmp] \r\n"
2338 "punpckhwd $f0, $f6, $f16 \r\n"
2339 "punpcklwd $f6, $f6, $f16 \r\n"
2340 "punpckhwd $f10, $f12, $f2 \r\n"
2341 "punpcklwd $f12, $f12, $f2 \r\n"
2342 "sdc1 $f6, 0x28+%[ptmp] \r\n"
2343 "sdc1 $f0, 0x38+%[ptmp] \r\n"
2344 "sdc1 $f12, 0x68+%[ptmp] \r\n"
2345 "sdc1 $f10, 0x78+%[ptmp] \r\n"
/* Save the row pointer and the stride multiples so a later asm statement
 * can reload them from pdat. */
2346 "sd $10, 0x00+%[pdat] \r\n"
2347 "sd $11, 0x08+%[pdat] \r\n"
2348 "sd $12, 0x10+%[pdat] \r\n"
2349 "sd $13, 0x18+%[pdat] \r\n"
2350 ::[pix]
"r"(pix),[stride]
"r"((uint64_t)
stride),[ptmp]
"m"(ptmp[0]),
2352 :
"$8",
"$9",
"$10",
"$11",
"$12",
"$13",
"$14",
"$f0",
"$f2",
"$f4",
"$f6",
2353 "$f8",
"$f10",
"$f12",
"$f14",
"$f16",
/* The template writes ptmp/pdat although they are input operands. */
"memory"
/*
 * Asm template that transposes the 8 x 16-byte contents of ptmp back into a
 * 16-row x 8-byte window of the picture.
 *
 * $10..$13 are reloaded from pdat (offsets 0x00/0x08/0x10/0x18); the code
 * uses $10 as the row base pointer and $11/$12/$13 as row offsets added to
 * it.  These are the values the earlier asm statement in this file saves
 * there: pix - 4 + 8*stride, 3*stride, 2*stride and 4*stride.
 *
 * Pass 1 reads the upper 8 bytes of each 16-byte ptmp slot (0x8, 0x18, ...,
 * 0x78) and stores eight 8-byte rows starting at $10.  Then $10/$9 are moved
 * back by 2*$13 and pass 2 does the same with the lower 8 bytes (0x0, 0x10,
 * ..., 0x70).
 *
 * Fix: the template stores through pointers held in $8/$9/$10/$14 and reads
 * ptmp elements beyond the single ptmp[0] operand, so a "memory" clobber is
 * added to make those accesses visible to the compiler.
 *
 * NOTE(review): the template references %[pdat], but no pdat operand is
 * visible in this listing after the trailing comma on the ptmp operand --
 * confirm against the real file.
 */
2359 "ld $10, 0x00+%[pdat] \r\n"
2360 "ld $11, 0x08+%[pdat] \r\n"
2361 "ld $12, 0x10+%[pdat] \r\n"
2362 "ld $13, 0x18+%[pdat] \r\n"
/* $9 = $10 + $11: fourth row of the group addressed by $10. */
2363 "daddu $9, $10, $11 \r\n"
/* Pass 1: fetch the upper halves of the eight ptmp slots. */
2364 "ldc1 $f0, 0x8+%[ptmp] \r\n"
2365 "ldc1 $f2, 0x18+%[ptmp] \r\n"
2366 "ldc1 $f4, 0x28+%[ptmp] \r\n"
2367 "ldc1 $f6, 0x38+%[ptmp] \r\n"
2368 "ldc1 $f8, 0x48+%[ptmp] \r\n"
2369 "ldc1 $f10, 0x58+%[ptmp] \r\n"
2370 "ldc1 $f12, 0x68+%[ptmp] \r\n"
/* Byte interleave of slot pairs (0,1) (2,3) (4,5) (6,7). */
2371 "punpckhbh $f14, $f0, $f2 \r\n"
2372 "punpcklbh $f0, $f0, $f2 \r\n"
2373 "punpckhbh $f2, $f4, $f6 \r\n"
2374 "punpcklbh $f4, $f4, $f6 \r\n"
2375 "punpckhbh $f6, $f8, $f10 \r\n"
2376 "punpcklbh $f8, $f8, $f10 \r\n"
2377 "ldc1 $f16, 0x78+%[ptmp] \r\n"
2378 "punpckhbh $f10, $f12, $f16 \r\n"
2379 "punpcklbh $f12, $f12, $f16 \r\n"
/* Destination row at $10 doubles as a spill slot for $f6; it is reloaded
 * below and the row is later overwritten with its final contents. */
2380 "gssdlc1 $f6, 0x7($10) \r\n"
2381 "gssdrc1 $f6, 0x0($10) \r\n"
2382 "daddu $8, $10, $12 \r\n"
/* Halfword interleave. */
2383 "punpckhhw $f6, $f0, $f4 \r\n"
2384 "punpcklhw $f0, $f0, $f4 \r\n"
2385 "punpckhhw $f4, $f8, $f12 \r\n"
2386 "punpcklhw $f8, $f8, $f12 \r\n"
2387 "punpckhhw $f12, $f14, $f2 \r\n"
2388 "punpcklhw $f14, $f14, $f2 \r\n"
/* Second spill: $f4 goes to the row at $10 + $12, and the first spill is
 * read back into $f4. */
2389 "gssdlc1 $f4, 0x7($8) \r\n"
2390 "gssdrc1 $f4, 0x0($8) \r\n"
2391 "gsldlc1 $f4, 0x7($10) \r\n"
2392 "gsldrc1 $f4, 0x0($10) \r\n"
2393 "punpckhhw $f2, $f4, $f10 \r\n"
2394 "punpcklhw $f4, $f4, $f10 \r\n"
/* Word interleave, then store the finished rows with unaligned 8-byte
 * gssdlc1/gssdrc1 pairs. */
2395 "punpckhwd $f10, $f0, $f8 \r\n"
2396 "punpcklwd $f0, $f0, $f8 \r\n"
2397 "punpckhwd $f8, $f14, $f4 \r\n"
2398 "punpcklwd $f14, $f14, $f4 \r\n"
2399 "daddu $8, $10, %[stride] \r\n"
2400 "gssdlc1 $f0, 0x7($10) \r\n"
2401 "gssdrc1 $f0, 0x0($10) \r\n"
2402 "daddu $14, $9, %[stride] \r\n"
2403 "gssdlc1 $f10, 0x7($8) \r\n"
2404 "gssdrc1 $f10, 0x0($8) \r\n"
2405 "daddu $8, $9, $12 \r\n"
2406 "gssdlc1 $f14, 0x7($14) \r\n"
2407 "gssdrc1 $f14, 0x0($14) \r\n"
2408 "daddu $14, $10, $12 \r\n"
2409 "gssdlc1 $f8, 0x7($8) \r\n"
2410 "gssdrc1 $f8, 0x0($8) \r\n"
/* Read back the second spill (row at $10 + $12) before overwriting it. */
2411 "gsldlc1 $f16, 0x7($14) \r\n"
2412 "gsldrc1 $f16, 0x0($14) \r\n"
2413 "daddu $8, $10, $12 \r\n"
2414 "punpckhwd $f0, $f6, $f16 \r\n"
2415 "punpcklwd $f6, $f6, $f16 \r\n"
2416 "punpckhwd $f10, $f12, $f2 \r\n"
2417 "punpcklwd $f12, $f12, $f2 \r\n"
2418 "gssdlc1 $f6, 0x7($8) \r\n"
2419 "gssdrc1 $f6, 0x0($8) \r\n"
2420 "daddu $8, $9, $11 \r\n"
2421 "gssdlc1 $f0, 0x7($9) \r\n"
2422 "gssdrc1 $f0, 0x0($9) \r\n"
2423 "daddu $14, $9, $13 \r\n"
2424 "gssdlc1 $f12, 0x7($8) \r\n"
2425 "gssdrc1 $f12, 0x0($8) \r\n"
/* $8 = 2*$13; step both row pointers back by that amount for pass 2. */
2426 "daddu $8, $13, $13 \r\n"
2427 "gssdlc1 $f10, 0x7($14) \r\n"
2428 "gssdrc1 $f10, 0x0($14) \r\n"
2429 "dsubu $10, $10, $8 \r\n"
2430 "dsubu $9, $9, $8 \r\n"
/* Pass 2: identical sequence on the lower halves of the ptmp slots. */
2431 "ldc1 $f0, 0x0+%[ptmp] \r\n"
2432 "ldc1 $f2, 0x10+%[ptmp] \r\n"
2433 "ldc1 $f4, 0x20+%[ptmp] \r\n"
2434 "ldc1 $f6, 0x30+%[ptmp] \r\n"
2435 "ldc1 $f8, 0x40+%[ptmp] \r\n"
2436 "ldc1 $f10, 0x50+%[ptmp] \r\n"
2437 "ldc1 $f12, 0x60+%[ptmp] \r\n"
2438 "punpckhbh $f14, $f0, $f2 \r\n"
2439 "punpcklbh $f0, $f0, $f2 \r\n"
2440 "punpckhbh $f2, $f4, $f6 \r\n"
2441 "punpcklbh $f4, $f4, $f6 \r\n"
2442 "punpckhbh $f6, $f8, $f10 \r\n"
2443 "punpcklbh $f8, $f8, $f10 \r\n"
2444 "ldc1 $f16, 0x70+%[ptmp] \r\n"
2445 "punpckhbh $f10, $f12, $f16 \r\n"
2446 "punpcklbh $f12, $f12, $f16 \r\n"
2447 "gssdlc1 $f6, 0x7($10) \r\n"
2448 "gssdrc1 $f6, 0x0($10) \r\n"
2449 "daddu $8, $10, $12 \r\n"
2450 "punpckhhw $f6, $f0, $f4 \r\n"
2451 "punpcklhw $f0, $f0, $f4 \r\n"
2452 "punpckhhw $f4, $f8, $f12 \r\n"
2453 "punpcklhw $f8, $f8, $f12 \r\n"
2454 "punpckhhw $f12, $f14, $f2 \r\n"
2455 "punpcklhw $f14, $f14, $f2 \r\n"
2456 "gssdlc1 $f4, 0x7($8) \r\n"
2457 "gssdrc1 $f4, 0x0($8) \r\n"
2458 "gsldlc1 $f4, 0x7($10) \r\n"
2459 "gsldrc1 $f4, 0x0($10) \r\n"
2460 "punpckhhw $f2, $f4, $f10 \r\n"
2461 "punpcklhw $f4, $f4, $f10 \r\n"
2462 "punpckhwd $f10, $f0, $f8 \r\n"
2463 "punpcklwd $f0, $f0, $f8 \r\n"
2464 "punpckhwd $f8, $f14, $f4 \r\n"
2465 "punpcklwd $f14, $f14, $f4 \r\n"
2466 "daddu $8, $10, %[stride] \r\n"
2467 "gssdlc1 $f0, 0x7($10) \r\n"
2468 "gssdrc1 $f0, 0x0($10) \r\n"
2469 "daddu $14, $9, %[stride] \r\n"
2470 "gssdlc1 $f10, 0x7($8) \r\n"
2471 "gssdrc1 $f10, 0x0($8) \r\n"
2472 "daddu $8, $9, $12 \r\n"
2473 "gssdlc1 $f14, 0x7($14) \r\n"
2474 "gssdrc1 $f14, 0x0($14) \r\n"
2475 "daddu $14, $10, $12 \r\n"
2476 "gssdlc1 $f8, 0x7($8) \r\n"
2477 "gssdrc1 $f8, 0x0($8) \r\n"
2478 "gsldlc1 $f16, 0x7($14) \r\n"
2479 "gsldrc1 $f16, 0x0($14) \r\n"
2480 "daddu $8, $10, $12 \r\n"
2481 "punpckhwd $f0, $f6, $f16 \r\n"
2482 "punpcklwd $f6, $f6, $f16 \r\n"
2483 "punpckhwd $f10, $f12, $f2 \r\n"
2484 "punpcklwd $f12, $f12, $f2 \r\n"
2485 "gssdlc1 $f6, 0x7($8) \r\n"
2486 "gssdrc1 $f6, 0x0($8) \r\n"
2487 "daddu $8, $9, $11 \r\n"
2488 "gssdlc1 $f0, 0x7($9) \r\n"
2489 "gssdrc1 $f0, 0x0($9) \r\n"
2490 "daddu $14, $9, $13 \r\n"
2491 "gssdlc1 $f12, 0x7($8) \r\n"
2492 "gssdrc1 $f12, 0x0($8) \r\n"
2493 "gssdlc1 $f10, 0x7($14) \r\n"
2494 "gssdrc1 $f10, 0x0($14) \r\n"
2495 ::[pix]
"r"(pix),[stride]
"r"((uint64_t)
stride),[ptmp]
"m"(ptmp[0]),
2497 :
"$8",
"$9",
"$10",
"$11",
"$12",
"$13",
"$14",
"$f0",
"$f2",
"$f4",
"$f6",
2498 "$f8",
"$f10",
"$f12",
"$f14",
"$f16",
/* Picture rows are written through computed pointers, not via operands. */
"memory"
void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height, int log2_denom, int weight, int offset)
void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, int weightd, int weights, int offset)
void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input, int qmul)
void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height, int log2_denom, int weight, int offset)
static void chroma_inter_body_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, int weightd, int weights, int offset)
void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
static double alpha(void *priv, double x, double y)
void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
static void chroma_intra_body_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
static const uint8_t offset[127][2]
static const uint8_t scan8[16 *3+3]
void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
void ff_deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, int weightd, int weights, int offset)
BYTE int const BYTE int int int height
static int weight(int i, int blen, int offset)
void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
GLint GLenum GLboolean GLsizei stride
void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride, int height, int log2_denom, int weight, int offset)