vc1dsp_mmi.c
/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"

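/* Note (editorial sketch, not part of the original file): the TYPE1/TYPE2
 * macros below receive their transform coefficients as 32-bit immediates
 * that pack two signed 16-bit values, low half first, so that pmaddhw can
 * multiply two sample/coefficient pairs at once after punpcklwd replicates
 * the pair across the 64-bit register:
 *
 *     0x0010000c -> {12, 16}      0xfff0fff4 -> {-12, -16}
 *     0x00160011 -> {17, 22}      0xffeaffef -> {-17, -22}
 *
 * These are the usual VC-1 inverse-transform coefficients (12, 16, 15, 9,
 * 6, 4 for the 8-point pass; 17, 22, 10 for the 4-point pass).
 */
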
#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp2], %[ftmp2], "#c0" \n\t" \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c1" \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c1" \n\t" \
    "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"
        "li %[count], 0x02 \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu %[count], %[count], -0x01 \n\t"
        PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count), [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}
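
/* For reference, a scalar sketch of what the MMI code above computes
 * (editorial illustration, not part of the original file):
 *
 *     dc = (3 * block[0] +  1) >> 1;
 *     dc = (3 * dc       + 16) >> 5;
 *     for (y = 0; y < 8; y++, dest += linesize)
 *         for (x = 0; x < 8; x++)
 *             dest[x] = av_clip_uint8(dest[x] + dc);
 *
 * paddsh/packushb perform the add and the unsigned-byte clamp eight pixels
 * at a time.
 */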

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    double ftmp[23];
    uint64_t tmp[1];

    __asm__ volatile (
        /* 1st loop: start */
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp11], %[block], 0x10)
        MMI_LDC1(%[ftmp2], %[block], 0x20)
        MMI_LDC1(%[ftmp12], %[block], 0x30)
        MMI_LDC1(%[ftmp3], %[block], 0x40)
        MMI_LDC1(%[ftmp13], %[block], 0x50)
        MMI_LDC1(%[ftmp4], %[block], 0x60)
        MMI_LDC1(%[ftmp14], %[block], 0x70)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp15], %[temp], 0x00)
        MMI_SDC1(%[ftmp19], %[temp], 0x08)
        MMI_SDC1(%[ftmp16], %[temp], 0x10)
        MMI_SDC1(%[ftmp20], %[temp], 0x18)
        MMI_SDC1(%[ftmp17], %[temp], 0x20)
        MMI_SDC1(%[ftmp21], %[temp], 0x28)
        MMI_SDC1(%[ftmp18], %[temp], 0x30)
        MMI_SDC1(%[ftmp22], %[temp], 0x38)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[block], 0x08)
        MMI_LDC1(%[ftmp11], %[block], 0x18)
        MMI_LDC1(%[ftmp2], %[block], 0x28)
        MMI_LDC1(%[ftmp12], %[block], 0x38)
        MMI_LDC1(%[ftmp3], %[block], 0x48)
        MMI_LDC1(%[ftmp13], %[block], 0x58)
        MMI_LDC1(%[ftmp4], %[block], 0x68)
        MMI_LDC1(%[ftmp14], %[block], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp19], %[temp], 0x48)
        MMI_SDC1(%[ftmp20], %[temp], 0x58)
        MMI_SDC1(%[ftmp21], %[temp], 0x68)
        MMI_SDC1(%[ftmp22], %[temp], 0x78)
        /* 1st loop: end */

        /* 2nd loop: start */
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[temp], 0x00)
        MMI_LDC1(%[ftmp11], %[temp], 0x10)
        MMI_LDC1(%[ftmp2], %[temp], 0x20)
        MMI_LDC1(%[ftmp12], %[temp], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp15], %[ftmp17] \n\t"
        "punpckhhw %[ftmp8], %[ftmp15], %[ftmp17] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp16], %[ftmp18] \n\t"
        "punpckhhw %[ftmp12], %[ftmp16], %[ftmp18] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x00)
        MMI_SDC1(%[ftmp16], %[block], 0x10)
        MMI_SDC1(%[ftmp17], %[block], 0x20)
        MMI_SDC1(%[ftmp18], %[block], 0x30)
        MMI_SDC1(%[ftmp19], %[block], 0x40)
        MMI_SDC1(%[ftmp20], %[block], 0x50)
        MMI_SDC1(%[ftmp21], %[block], 0x60)
        MMI_SDC1(%[ftmp22], %[block], 0x70)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[temp], 0x08)
        MMI_LDC1(%[ftmp11], %[temp], 0x18)
        MMI_LDC1(%[ftmp2], %[temp], 0x28)
        MMI_LDC1(%[ftmp12], %[temp], 0x38)
        MMI_LDC1(%[ftmp3], %[temp], 0x48)
        MMI_LDC1(%[ftmp13], %[temp], 0x58)
        MMI_LDC1(%[ftmp4], %[temp], 0x68)
        MMI_LDC1(%[ftmp14], %[temp], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x08)
        MMI_SDC1(%[ftmp16], %[block], 0x18)
        MMI_SDC1(%[ftmp17], %[block], 0x28)
        MMI_SDC1(%[ftmp18], %[block], 0x38)
        MMI_SDC1(%[ftmp19], %[block], 0x48)
        MMI_SDC1(%[ftmp20], %[block], 0x58)
        MMI_SDC1(%[ftmp21], %[block], 0x68)
        MMI_SDC1(%[ftmp22], %[block], 0x78)
        /* 2nd loop: end */
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [ff_pw_4]"f"(ff_pw_4_local), [block]"r"(block),
          [temp]"r"(temp)
        : "memory"
    );
}
#endif
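
/* The packed coefficients above implement the 8-point VC-1 inverse
 * transform. A scalar sketch of one pass, for comparison (editorial
 * illustration based on the generic C version, not part of this file):
 *
 *     t1 = 12 * (src[0] + src[4]) + 4;
 *     t2 = 12 * (src[0] - src[4]) + 4;
 *     t3 = 16 * src[2] +  6 * src[6];
 *     t4 =  6 * src[2] - 16 * src[6];
 *     t5 = t1 + t3;  t6 = t2 + t4;  t7 = t2 - t4;  t8 = t1 - t3;
 *     t1 = 16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7];
 *     t2 = 15 * src[1] -  4 * src[3] - 16 * src[5] -  9 * src[7];
 *     t3 =  9 * src[1] - 16 * src[3] +  4 * src[5] + 15 * src[7];
 *     t4 =  4 * src[1] -  9 * src[3] + 15 * src[5] - 16 * src[7];
 *     dst[0] = (t5 + t1) >> 3;    dst[7] = (t5 - t1) >> 3;
 *     dst[1] = (t6 + t2) >> 3;    dst[6] = (t6 - t2) >> 3;
 *     dst[2] = (t7 + t3) >> 3;    dst[5] = (t7 - t3) >> 3;
 *     dst[3] = (t8 + t4) >> 3;    dst[4] = (t8 - t4) >> 3;
 *
 * The column pass uses the same coefficients with +64 rounding and >>7.
 */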

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}
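
/* Editorial summary (not in the original file): the DC scaling differs per
 * block size, as used by the four *_dc functions in this file:
 *
 *     8x8: dc = ( 3 * dc +  1) >> 1;  dc = ( 3 * dc + 16) >> 5;
 *     8x4: dc = ( 3 * dc +  1) >> 1;  dc = (17 * dc + 64) >> 7;
 *     4x8: dc = (17 * dc +  4) >> 3;  dc = (12 * dc + 64) >> 7;
 *     4x4: dc = (17 * dc +  4) >> 3;  dc = (17 * dc + 64) >> 7;
 */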

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    int16_t count = 4;
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    int16_t coeff[64] = {12,  16,  16,  15,  12,   9,   6,   4,
                         12,  15,   6,  -4, -12, -16, -16,  -9,
                         12,   9,  -6, -16, -12,   4,  16,  15,
                         12,   4, -16,  -9,  12,  15,  -6, -16,
                         12,  -4, -16,   9,  12, -15,  -6,  16,
                         12,  -9,  -6,  16, -12,  -4,  16, -15,
                         12, -15,   6,   4, -12,  16, -16,   9,
                         12, -16,  16, -15,  12,  -9,   6,  -4};

    // 1st loop
    __asm__ volatile (
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)

        /* ftmp11: dst1,dst0 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t"

        /* ftmp12: dst3,dst2 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t"

        /* ftmp13: dst5,dst4 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t"

        /* ftmp14: dst7,dst6 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t"

        /* ftmp9: dst3,dst2,dst1,dst0 ftmp10: dst7,dst6,dst5,dst4 */
        "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
        "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
        "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t"
        "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
        "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t"
        "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        MMI_SDC1(%[ftmp9], %[dst], 0x00)
        MMI_SDC1(%[ftmp10], %[dst], 0x08)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"

        // 1st part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        // 2nd part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x18)
        MMI_LDC1(%[ftmp3], %[src], 0x28)
        MMI_LDC1(%[ftmp4], %[src], 0x38)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x8 parts of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize), [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize), [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[23];
    uint32_t count = 8, tmp[1];
    int16_t coeff[16] = {17,  22,  17,  10,
                         17,  10, -17, -22,
                         17, -10, -17,  22,
                         17, -22,  17, -10};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};

    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp11], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp12], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp15] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp16] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp17] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp18] \n\t"
        "paddh %[ftmp5], %[ftmp5], %[ftmp19] \n\t"
        "paddh %[ftmp6], %[ftmp6], %[ftmp20] \n\t"
        "paddh %[ftmp7], %[ftmp7], %[ftmp21] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp22] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    DECLARE_VAR_LOW32;

    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count = 4, tmp[1];
    int16_t coeff[16] = {17,  22,  17,  10,
                         17,  10, -17, -22,
                         17, -10, -17,  22,
                         17, -22,  17, -10};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};

    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
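
/* The coeff[] table above is the 4-point VC-1 inverse transform; a scalar
 * sketch of one row pass for comparison (editorial illustration, not part
 * of this file):
 *
 *     t1 = 17 * (src[0] + src[2]) + 4;
 *     t2 = 17 * (src[0] - src[2]) + 4;
 *     t3 = 22 * src[1] + 10 * src[3];
 *     t4 = 22 * src[3] - 10 * src[1];
 *     dst[0] = (t1 + t3) >> 3;
 *     dst[1] = (t2 - t4) >> 3;
 *     dst[2] = (t2 + t4) >> 3;
 *     dst[3] = (t1 - t3) >> 3;
 */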
1375 
1376 /* Apply overlap transform to horizontal edge */
1378 {
1379  int i;
1380  int a, b, c, d;
1381  int d1, d2;
1382  int rnd = 1;
1383  for (i = 0; i < 8; i++) {
1384  a = src[-2];
1385  b = src[-1];
1386  c = src[0];
1387  d = src[1];
1388  d1 = (a - d + 3 + rnd) >> 3;
1389  d2 = (a - d + b - c + 4 - rnd) >> 3;
1390 
1391  src[-2] = a - d1;
1392  src[-1] = av_clip_uint8(b - d2);
1393  src[0] = av_clip_uint8(c + d2);
1394  src[1] = d + d1;
1395  src += stride;
1396  rnd = !rnd;
1397  }
1398 }
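/*
 * A minimal usage sketch (the frame buffer, linesize and block coordinates
 * are hypothetical, not from this file): smoothing the vertical boundary
 * between two horizontally adjacent 8x8 blocks with the filter above.
 */
static void overlap_h_edge_sketch(uint8_t *frame, int linesize,
                                  int bx, int by)
{
    /* Point at the first pixel of the right-hand block; each of the
     * 8 rows then reads src[-2]..src[1] across the edge. */
    ff_vc1_h_overlap_mmi(frame + by * 8 * linesize + bx * 8, linesize);
}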
1399 
1400 void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
1401 {
1402  int i;
1403  int a, b, c, d;
1404  int d1, d2;
1405  int rnd1 = flags & 2 ? 3 : 4;
1406  int rnd2 = 7 - rnd1;
1407  for (i = 0; i < 8; i++) {
1408  a = left[6];
1409  b = left[7];
1410  c = right[0];
1411  d = right[1];
1412  d1 = a - d;
1413  d2 = a - d + b - c;
1414 
1415  left[6] = ((a << 3) - d1 + rnd1) >> 3;
1416  left[7] = ((b << 3) - d2 + rnd2) >> 3;
1417  right[0] = ((c << 3) + d2 + rnd1) >> 3;
1418  right[1] = ((d << 3) + d1 + rnd2) >> 3;
1419 
1420  right += right_stride;
1421  left += left_stride;
1422  if (flags & 1) {
1423  rnd2 = 7 - rnd2;
1424  rnd1 = 7 - rnd1;
1425  }
1426  }
1427 }
1428 
1429 /* Apply overlap transform to vertical edge */
1430 void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
1431 {
1432  int i;
1433  int a, b, c, d;
1434  int d1, d2;
1435  int rnd = 1;
1436  for (i = 0; i < 8; i++) {
1437  a = src[-2 * stride];
1438  b = src[-stride];
1439  c = src[0];
1440  d = src[stride];
1441  d1 = (a - d + 3 + rnd) >> 3;
1442  d2 = (a - d + b - c + 4 - rnd) >> 3;
1443 
1444  src[-2 * stride] = a - d1;
1445  src[-stride] = av_clip_uint8(b - d2);
1446  src[0] = av_clip_uint8(c + d2);
1447  src[stride] = d + d1;
1448  src++;
1449  rnd = !rnd;
1450  }
1451 }
1452 
1453 void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1454 {
1455  int i;
1456  int a, b, c, d;
1457  int d1, d2;
1458  int rnd1 = 4, rnd2 = 3;
1459  for (i = 0; i < 8; i++) {
1460  a = top[48];
1461  b = top[56];
1462  c = bottom[0];
1463  d = bottom[8];
1464  d1 = a - d;
1465  d2 = a - d + b - c;
1466 
1467  top[48] = ((a << 3) - d1 + rnd1) >> 3;
1468  top[56] = ((b << 3) - d2 + rnd2) >> 3;
1469  bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1470  bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1471 
1472  bottom++;
1473  top++;
1474  rnd2 = 7 - rnd2;
1475  rnd1 = 7 - rnd1;
1476  }
1477 }
1478 
1479 /**
1480  * VC-1 in-loop deblocking filter for one line
1481  * @param src source pixel pointer, positioned on the edge to filter
1482  * @param stride block stride
1483  * @param pq block quantizer
1484  * @return whether other 3 pairs should be filtered or not
1485  * @see 8.6
1486  */
1487 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1488 {
1489  int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1490  5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1491  int a0_sign = a0 >> 31; /* Store sign */
1492 
1493  a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1494  if (a0 < pq) {
1495  int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1496  5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1497  int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1498  5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1499  if (a1 < a0 || a2 < a0) {
1500  int clip = src[-1 * stride] - src[0 * stride];
1501  int clip_sign = clip >> 31;
1502 
1503  clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1504  if (clip) {
1505  int a3 = FFMIN(a1, a2);
1506  int d = 5 * (a3 - a0);
1507  int d_sign = (d >> 31);
1508 
1509  d = ((d ^ d_sign) - d_sign) >> 3;
1510  d_sign ^= a0_sign;
1511 
1512  if (d_sign ^ clip_sign)
1513  d = 0;
1514  else {
1515  d = FFMIN(d, clip);
1516  d = (d ^ d_sign) - d_sign; /* Restore sign */
1517  src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1518  src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1519  }
1520  return 1;
1521  }
1522  }
1523  }
1524  return 0;
1525 }
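/*
 * The branchless absolute-value idiom used above, shown in isolation as a
 * sketch: for a 32-bit int a, (a ^ (a >> 31)) - (a >> 31) equals FFABS(a),
 * assuming arithmetic right shift of negative values. The helper name is
 * hypothetical.
 */
static inline int abs_via_sign_mask(int a)
{
    int s = a >> 31;     /* 0 when a >= 0, all ones when a < 0 */
    return (a ^ s) - s;  /* complement and add one when negative */
}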
1526 
1527 /**
1528  * VC-1 in-loop deblocking filter
1529  * @param src source pixel pointer
1530  * @param step distance between horizontally adjacent elements
1531  * @param stride distance between vertically adjacent elements
1532  * @param len edge length to filter (4 or 8 pixels)
1533  * @param pq block quantizer
1534  * @see 8.6
1535  */
1536 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1537  int len, int pq)
1538 {
1539  int i;
1540  int filt3;
1541 
1542  for (i = 0; i < len; i += 4) {
1543  filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1544  if (filt3) {
1545  vc1_filter_line(src + 0 * step, stride, pq);
1546  vc1_filter_line(src + 1 * step, stride, pq);
1547  vc1_filter_line(src + 3 * step, stride, pq);
1548  }
1549  src += step * 4;
1550  }
1551 }
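/*
 * Sketch of how the wrappers below select the edge orientation (the wrapper
 * name here is hypothetical): the v_ variants step right one pixel at a
 * time and tap vertically across rows, while the h_ variants step down one
 * row at a time and tap horizontally along each row.
 */
static void loop_filter_usage_sketch(uint8_t *src, int linesize, int pq)
{
    vc1_loop_filter(src, 1, linesize, 8, pq);  /* as in ff_vc1_v_loop_filter8_mmi */
    vc1_loop_filter(src, linesize, 1, 8, pq);  /* as in ff_vc1_h_loop_filter8_mmi */
}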
1552 
1553 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1554 {
1555  vc1_loop_filter(src, 1, stride, 4, pq);
1556 }
1557 
1558 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1559 {
1560  vc1_loop_filter(src, stride, 1, 4, pq);
1561 }
1562 
1563 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1564 {
1565  vc1_loop_filter(src, 1, stride, 8, pq);
1566 }
1567 
1568 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1569 {
1570  vc1_loop_filter(src, stride, 1, 8, pq);
1571 }
1572 
1573 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1574 {
1575  vc1_loop_filter(src, 1, stride, 16, pq);
1576 }
1577 
1578 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1579 {
1580  vc1_loop_filter(src, stride, 1, 16, pq);
1581 }
1582 
1583 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1584  ptrdiff_t stride, int rnd)
1585 {
1586  ff_put_pixels8_8_mmi(dst, src, stride, 8);
1587 }
1588 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1589  ptrdiff_t stride, int rnd)
1590 {
1591  ff_put_pixels16_8_mmi(dst, src, stride, 16);
1592 }
1593 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1594  ptrdiff_t stride, int rnd)
1595 {
1596  ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1597 }
1598 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1599  ptrdiff_t stride, int rnd)
1600 {
1601  ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1602 }
1603 
1604 #define OP_PUT(S, D)
1605 #define OP_AVG(S, D) \
1606  "ldc1 $f16, "#S" \n\t" \
1607  "pavgb "#D", "#D", $f16 \n\t"
1608 
1609 /** Add the rounder in $f14 to $f6 and $f8, then shift both right by SHIFT */
1610 #define NORMALIZE_MMI(SHIFT) \
1611  "paddh $f6, $f6, $f14 \n\t" /* +bias-r */ \
1612  "paddh $f8, $f8, $f14 \n\t" /* +bias-r */ \
1613  "psrah $f6, $f6, "SHIFT" \n\t" \
1614  "psrah $f8, $f8, "SHIFT" \n\t"
1615 
1616 #define TRANSFER_DO_PACK(OP) \
1617  "packushb $f6, $f6, $f8 \n\t" \
1618  OP((%[dst]), $f6) \
1619  "sdc1 $f6, 0x00(%[dst]) \n\t"
1620 
1621 #define TRANSFER_DONT_PACK(OP) \
1622  OP(0(%[dst]), $f6) \
1623  OP(8(%[dst]), $f8) \
1624  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1625  "sdc1 $f8, 0x08(%[dst]) \n\t"
1626 
1627 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
1628 #define DO_UNPACK(reg) \
1629  "punpcklbh "reg", "reg", $f0 \n\t"
1630 #define DONT_UNPACK(reg)
1631 
1632 /** Load the rounder 32-r or 8-r and broadcast it across $f14 */
1633 #define LOAD_ROUNDER_MMI(ROUND) \
1634  "lwc1 $f14, "ROUND" \n\t" \
1635  "punpcklhw $f14, $f14, $f14 \n\t" \
1636  "punpcklwd $f14, $f14, $f14 \n\t"
1637 
1638 
1639 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
1640  "paddh "#R1", "#R1", "#R2" \n\t" \
1641  PTR_ADDU "$9, %[src], %[stride1] \n\t" \
1642  MMI_ULWC1(R0, $9, 0x00) \
1643  "pmullh "#R1", "#R1", $f6 \n\t" \
1644  "punpcklbh "#R0", "#R0", $f0 \n\t" \
1645  PTR_ADDU "$9, %[src], %[stride] \n\t" \
1646  MMI_ULWC1(R3, $9, 0x00) \
1647  "psubh "#R1", "#R1", "#R0" \n\t" \
1648  "punpcklbh "#R3", "#R3", $f0 \n\t" \
1649  "paddh "#R1", "#R1", $f14 \n\t" \
1650  "psubh "#R1", "#R1", "#R3" \n\t" \
1651  "psrah "#R1", "#R1", %[shift] \n\t" \
1652  MMI_SDC1(R1, %[dst], OFF) \
1653  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1654 
1655 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1656 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1657  const uint8_t *src, mips_reg stride,
1658  int rnd, int64_t shift)
1659 {
1660  DECLARE_VAR_LOW32;
1661  DECLARE_VAR_ADDRT;
1662 
1663  __asm__ volatile(
1664  "xor $f0, $f0, $f0 \n\t"
1665  "li $8, 0x03 \n\t"
1666  LOAD_ROUNDER_MMI("%[rnd]")
1667  "ldc1 $f12, %[ff_pw_9] \n\t"
1668  "1: \n\t"
1669  MMI_ULWC1($f4, %[src], 0x00)
1670  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1671  MMI_ULWC1($f6, %[src], 0x00)
1672  "punpcklbh $f4, $f4, $f0 \n\t"
1673  "punpcklbh $f6, $f6, $f0 \n\t"
1674  SHIFT2_LINE( 0, $f2, $f4, $f6, $f8)
1675  SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1676  SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1677  SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1678  SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1679  SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1680  SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1681  SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1682  PTR_SUBU "%[src], %[src], %[stride2] \n\t"
1683  PTR_ADDIU "%[dst], %[dst], 0x08 \n\t"
1684  "addiu $8, $8, -0x01 \n\t"
1685  "bnez $8, 1b \n\t"
1686  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1687  [src]"+r"(src), [dst]"+r"(dst)
1688  : [stride]"r"(stride), [stride1]"r"(-2*stride),
1689  [shift]"f"(shift), [rnd]"m"(rnd),
1690  [stride2]"r"(9*stride-4), [ff_pw_9]"m"(ff_pw_9)
1691  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1692  "$f14", "$f16", "memory"
1693  );
1694 }
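/*
 * Scalar sketch of one output of the vertical 1/2-pel pass above, assuming
 * the same (-1, 9, 9, -1) taps and the rounder/shift pair handed in as in
 * the MMI version (the helper name is hypothetical).
 */
static int16_t ver_shift2_tap_sketch(const uint8_t *src, int stride,
                                     int rnd, int shift)
{
    return (-src[-stride] + 9 * src[0] + 9 * src[stride]
            - src[2 * stride] + rnd) >> shift;
}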
1695 
1696 /**
1697  * Data is already unpacked, so some operations can be performed directly
1698  * from memory.
1699  */
1700 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
1701 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1702  const int16_t *src, int rnd) \
1703 { \
1704  int h = 8; \
1705  DECLARE_VAR_ALL64; \
1706  DECLARE_VAR_ADDRT; \
1707  \
1708  src -= 1; \
1709  rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \
1710  \
1711  __asm__ volatile( \
1712  LOAD_ROUNDER_MMI("%[rnd]") \
1713  "ldc1 $f12, %[ff_pw_128] \n\t" \
1714  "ldc1 $f10, %[ff_pw_9] \n\t" \
1715  "1: \n\t" \
1716  MMI_ULDC1($f2, %[src], 0x00) \
1717  MMI_ULDC1($f4, %[src], 0x08) \
1718  MMI_ULDC1($f6, %[src], 0x02) \
1719  MMI_ULDC1($f8, %[src], 0x0a) \
1720  MMI_ULDC1($f0, %[src], 0x06) \
1721  "paddh $f2, $f2, $f0 \n\t" \
1722  MMI_ULDC1($f0, %[src], 0x0e) \
1723  "paddh $f4, $f4, $f0 \n\t" \
1724  MMI_ULDC1($f0, %[src], 0x04) \
1725  "paddh $f6, $f6, $f0 \n\t" \
1726  MMI_ULDC1($f0, %[src], 0x0b) \
1727  "paddh $f8, $f8, $f0 \n\t" \
1728  "pmullh $f6, $f6, $f10 \n\t" \
1729  "pmullh $f8, $f8, $f10 \n\t" \
1730  "psubh $f6, $f6, $f2 \n\t" \
1731  "psubh $f8, $f8, $f4 \n\t" \
1732  "li $8, 0x07 \n\t" \
1733  "mtc1 $8, $f16 \n\t" \
1734  NORMALIZE_MMI("$f16") \
1735  /* Remove bias */ \
1736  "paddh $f6, $f6, $f12 \n\t" \
1737  "paddh $f8, $f8, $f12 \n\t" \
1738  TRANSFER_DO_PACK(OP) \
1739  "addiu %[h], %[h], -0x01 \n\t" \
1740  PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1741  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1742  "bnez %[h], 1b \n\t" \
1743  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1744  [h]"+r"(h), \
1745  [src]"+r"(src), [dst]"+r"(dst) \
1746  : [stride]"r"(stride), [rnd]"m"(rnd), \
1747  [ff_pw_9]"m"(ff_pw_9), [ff_pw_128]"m"(ff_pw_128) \
1748  : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \
1749  "$f16", "memory" \
1750  ); \
1751 }
1752 
1753 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1754 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1755 
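/*
 * Scalar sketch of what the horizontal pass above computes per output pixel
 * on the 16-bit intermediate rows, assuming `rnd` already folds in the bias
 * bookkeeping done by the caller (the helper name is hypothetical).
 */
static uint8_t hor_shift2_tap_sketch(const int16_t *src, int rnd)
{
    int v = (-src[0] + 9 * src[1] + 9 * src[2] - src[3] + rnd) >> 7;
    return av_clip_uint8(v + 128); /* undo the -128 bias, as ff_pw_128 does */
}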
1756 /**
1757  * Purely vertical or horizontal 1/2 shift interpolation.
1758  * Sacrifice $f12 for the *9 factor.
1759  */
1760 #define VC1_SHIFT2(OP, OPNAME)\
1761 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \
1762  mips_reg stride, int rnd, \
1763  mips_reg offset) \
1764 { \
1765  DECLARE_VAR_LOW32; \
1766  DECLARE_VAR_ADDRT; \
1767  \
1768  rnd = 8 - rnd; \
1769  \
1770  __asm__ volatile( \
1771  "xor $f0, $f0, $f0 \n\t" \
1772  "li $10, 0x08 \n\t" \
1773  LOAD_ROUNDER_MMI("%[rnd]") \
1774  "ldc1 $f12, %[ff_pw_9] \n\t" \
1775  "1: \n\t" \
1776  MMI_ULWC1($f6, %[src], 0x00) \
1777  MMI_ULWC1($f8, %[src], 0x04) \
1778  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1779  MMI_ULWC1($f2, $9, 0x00) \
1780  MMI_ULWC1($f4, $9, 0x04) \
1781  PTR_ADDU "%[src], %[src], %[offset] \n\t" \
1782  "punpcklbh $f6, $f6, $f0 \n\t" \
1783  "punpcklbh $f8, $f8, $f0 \n\t" \
1784  "punpcklbh $f2, $f2, $f0 \n\t" \
1785  "punpcklbh $f4, $f4, $f0 \n\t" \
1786  "paddh $f6, $f6, $f2 \n\t" \
1787  "paddh $f8, $f8, $f4 \n\t" \
1788  PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \
1789  MMI_ULWC1($f2, $9, 0x00) \
1790  MMI_ULWC1($f4, $9, 0x04) \
1791  "pmullh $f6, $f6, $f12 \n\t" /* 0,9,9,0*/ \
1792  "pmullh $f8, $f8, $f12 \n\t" /* 0,9,9,0*/ \
1793  "punpcklbh $f2, $f2, $f0 \n\t" \
1794  "punpcklbh $f4, $f4, $f0 \n\t" \
1795  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,0*/ \
1796  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,0*/ \
1797  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1798  MMI_ULWC1($f2, $9, 0x00) \
1799  MMI_ULWC1($f4, $9, 0x04) \
1800  "punpcklbh $f2, $f2, $f0 \n\t" \
1801  "punpcklbh $f4, $f4, $f0 \n\t" \
1802  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,-1*/ \
1803  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,-1*/ \
1804  "li $8, 0x04 \n\t" \
1805  "mtc1 $8, $f16 \n\t" \
1806  NORMALIZE_MMI("$f16") \
1807  "packushb $f6, $f6, $f8 \n\t" \
1808  OP((%[dst]), $f6) \
1809  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1810  "addiu $10, $10, -0x01 \n\t" \
1811  PTR_ADDU "%[src], %[src], %[stride1] \n\t" \
1812  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1813  "bnez $10, 1b \n\t" \
1814  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1815  [src]"+r"(src), [dst]"+r"(dst) \
1816  : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \
1817  [stride]"r"(stride), [rnd]"m"(rnd), \
1818  [stride1]"r"(stride-offset), \
1819  [ff_pw_9]"m"(ff_pw_9) \
1820  : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
1821  "$f12", "$f14", "$f16", "memory" \
1822  ); \
1823 }
1824 
1825 VC1_SHIFT2(OP_PUT, put_)
1826 VC1_SHIFT2(OP_AVG, avg_)
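/*
 * Scalar sketch of the direct 8-bit 1/2-pel path instantiated above: one
 * output pixel from taps (-1, 9, 9, -1) along `offset` (1 for horizontal,
 * stride for vertical), with the 8-rnd rounder and >>4 normalization
 * (the helper name is hypothetical).
 */
static uint8_t shift2_pixel_sketch(const uint8_t *src, int offset, int rnd)
{
    return av_clip_uint8((-src[-offset] + 9 * src[0] + 9 * src[offset]
                          - src[2 * offset] + 8 - rnd) >> 4);
}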
1827 
1828 /**
1829  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1830  *
1831  * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
1832  * @param LOAD "MMI_ULWC1", or "MMI_ULDC1" if the data read is already unpacked.
1833  * @param M "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1834  * @param A1 Stride address of 1st tap (beware of unpacked/packed).
1835  * @param A2 Stride address of 2nd tap
1836  * @param A3 Stride address of 3rd tap
1837  * @param A4 Stride address of 4th tap
1838  */
1839 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
1840  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1841  LOAD($f2, $9, M*0) \
1842  LOAD($f4, $9, M*4) \
1843  UNPACK("$f2") \
1844  UNPACK("$f4") \
1845  "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
1846  "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
1847  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1848  LOAD($f6, $9, M*0) \
1849  LOAD($f8, $9, M*4) \
1850  UNPACK("$f6") \
1851  UNPACK("$f8") \
1852  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1853  "pmullh $f8, $f8, $f12 \n\t" /* *18 */ \
1854  "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \
1855  "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \
1856  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1857  LOAD($f2, $9, M*0) \
1858  LOAD($f4, $9, M*4) \
1859  UNPACK("$f2") \
1860  UNPACK("$f4") \
1861  "li $8, 0x02 \n\t" \
1862  "mtc1 $8, $f16 \n\t" \
1863  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1864  "psllh $f4, $f4, $f16 \n\t" /* 4* */ \
1865  "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \
1866  "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \
1867  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1868  LOAD($f2, $9, M*0) \
1869  LOAD($f4, $9, M*4) \
1870  UNPACK("$f2") \
1871  UNPACK("$f4") \
1872  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1873  "pmullh $f4, $f4, $f10 \n\t" /* *53 */ \
1874  "paddh $f6, $f6, $f2 \n\t" /* 4,53,18,-3 */ \
1875  "paddh $f8, $f8, $f4 \n\t" /* 4,53,18,-3 */
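/*
 * Scalar sketch of the four-tap core above: coefficients (-3, 18, 53, -4)
 * applied to the samples selected by A1..A4; the shift1 and shift3 builds
 * differ only in the order they feed the taps (the helper name is
 * hypothetical).
 */
static int mspel_filter13_tap_sketch(int a1, int a2, int a3, int a4)
{
    return -3 * a1 + 18 * a2 + 53 * a3 - 4 * a4;
}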
1876 
1877 /**
1878  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
1879  * Here, offset=src_stride. Parameters passed as A1 to A4 must use
1880  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1881  *
1882  * @param NAME Either 1 or 3
1883  * @see MSPEL_FILTER13_CORE for information on A1->A4
1884  */
1885 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
1886 static void \
1887 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
1888  mips_reg src_stride, \
1889  int rnd, int64_t shift) \
1890 { \
1891  int h = 8; \
1892  DECLARE_VAR_LOW32; \
1893  DECLARE_VAR_ADDRT; \
1894  \
1895  src -= src_stride; \
1896  \
1897  __asm__ volatile( \
1898  "xor $f0, $f0, $f0 \n\t" \
1899  LOAD_ROUNDER_MMI("%[rnd]") \
1900  "ldc1 $f10, %[ff_pw_53] \n\t" \
1901  "ldc1 $f12, %[ff_pw_18] \n\t" \
1902  ".p2align 3 \n\t" \
1903  "1: \n\t" \
1904  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1905  NORMALIZE_MMI("%[shift]") \
1906  TRANSFER_DONT_PACK(OP_PUT) \
1907  /* Last 3 (in fact 4) bytes on the line */ \
1908  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1909  MMI_ULWC1($f2, $9, 0x08) \
1910  DO_UNPACK("$f2") \
1911  "mov.d $f6, $f2 \n\t" \
1912  "paddh $f2, $f2, $f2 \n\t" \
1913  "paddh $f2, $f2, $f6 \n\t" /* 3* */ \
1914  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1915  MMI_ULWC1($f6, $9, 0x08) \
1916  DO_UNPACK("$f6") \
1917  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1918  "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \
1919  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1920  MMI_ULWC1($f2, $9, 0x08) \
1921  DO_UNPACK("$f2") \
1922  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1923  "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \
1924  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1925  MMI_ULWC1($f2, $9, 0x08) \
1926  DO_UNPACK("$f2") \
1927  "li $8, 0x02 \n\t" \
1928  "mtc1 $8, $f16 \n\t" \
1929  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1930  "psubh $f6, $f6, $f2 \n\t" \
1931  "paddh $f6, $f6, $f14 \n\t" \
1932  "li $8, 0x06 \n\t" \
1933  "mtc1 $8, $f16 \n\t" \
1934  "psrah $f6, $f6, $f16 \n\t" \
1935  "sdc1 $f6, 0x10(%[dst]) \n\t" \
1936  "addiu %[h], %[h], -0x01 \n\t" \
1937  PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
1938  PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
1939  "bnez %[h], 1b \n\t" \
1940  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1941  [h]"+r"(h), \
1942  [src]"+r"(src), [dst]"+r"(dst) \
1943  : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
1944  [stride_x3]"r"(3*src_stride), \
1945  [rnd]"m"(rnd), [shift]"f"(shift), \
1946  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1947  [ff_pw_3]"f"(ff_pw_3) \
1948  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1949  "$f14", "$f16", "memory" \
1950  ); \
1951 }
1952 
1953 /**
1954  * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
1955  * Here the source holds 16-bit data, so the offsets passed as A1 to A4 are plain element indices.
1956  *
1957  * @param NAME Either 1 or 3
1958  * @see MSPEL_FILTER13_CORE for information on A1->A4
1959  */
1960 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1961 static void \
1962 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
1963  const int16_t *src, int rnd) \
1964 { \
1965  int h = 8; \
1966  DECLARE_VAR_ALL64; \
1967  DECLARE_VAR_ADDRT; \
1968  \
1969  src -= 1; \
1970  rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
1971  \
1972  __asm__ volatile( \
1973  "xor $f0, $f0, $f0 \n\t" \
1974  LOAD_ROUNDER_MMI("%[rnd]") \
1975  "ldc1 $f10, %[ff_pw_53] \n\t" \
1976  "ldc1 $f12, %[ff_pw_18] \n\t" \
1977  ".p2align 3 \n\t" \
1978  "1: \n\t" \
1979  MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
1980  "li $8, 0x07 \n\t" \
1981  "mtc1 $8, $f16 \n\t" \
1982  NORMALIZE_MMI("$f16") \
1983  /* Remove bias */ \
1984  "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1985  "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1986  TRANSFER_DO_PACK(OP) \
1987  "addiu %[h], %[h], -0x01 \n\t" \
1988  PTR_ADDU "%[src], %[src], 0x18 \n\t" \
1989  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1990  "bnez %[h], 1b \n\t" \
1991  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1992  [h]"+r"(h), \
1993  [src]"+r"(src), [dst]"+r"(dst) \
1994  : [stride]"r"(stride), [rnd]"m"(rnd), \
1995  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1996  [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128) \
1997  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1998  "$f14", "$f16", "memory" \
1999  ); \
2000 }
2001 
2002 /**
2003  * Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
2004  * Here, offset is 1 (horizontal) or src_stride (vertical). Parameters passed
2005  * as A1 to A4 must use %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
2006  *
2007  * @param NAME Either 1 or 3
2008  * @see MSPEL_FILTER13_CORE for information on A1->A4
2009  */
2010 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
2011 static void \
2012 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
2013  mips_reg stride, int rnd, mips_reg offset) \
2014 { \
2015  int h = 8; \
2016  DECLARE_VAR_LOW32; \
2017  DECLARE_VAR_ADDRT; \
2018  \
2019  src -= offset; \
2020  rnd = 32-rnd; \
2021  \
2022  __asm__ volatile ( \
2023  "xor $f0, $f0, $f0 \n\t" \
2024  LOAD_ROUNDER_MMI("%[rnd]") \
2025  "ldc1 $f10, %[ff_pw_53] \n\t" \
2026  "ldc1 $f12, %[ff_pw_18] \n\t" \
2027  ".p2align 3 \n\t" \
2028  "1: \n\t" \
2029  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
2030  "li $8, 0x06 \n\t" \
2031  "mtc1 $8, $f16 \n\t" \
2032  NORMALIZE_MMI("$f16") \
2033  TRANSFER_DO_PACK(OP) \
2034  "addiu %[h], %[h], -0x01 \n\t" \
2035  PTR_ADDU "%[src], %[src], %[stride] \n\t" \
2036  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
2037  "bnez %[h], 1b \n\t" \
2038  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
2039  [h]"+r"(h), \
2040  [src]"+r"(src), [dst]"+r"(dst) \
2041  : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
2042  [offset_x3]"r"(3*offset), [stride]"r"(stride), \
2043  [rnd]"m"(rnd), \
2044  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
2045  [ff_pw_3]"f"(ff_pw_3) \
2046  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
2047  "$f14", "$f16", "memory" \
2048  ); \
2049 }
2050 
2051 
2052 /** 1/4 shift bicubic interpolation */
2053 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2054 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2055 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2056 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2057 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2058 
2059 /** 3/4 shift bicubic interpolation */
2060 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2061 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2062 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2063 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2064 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2065 
2066 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2067  (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2068  int64_t shift);
2069 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2070  (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2071 typedef void (*vc1_mspel_mc_filter_8bits)
2072  (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2073  mips_reg offset);
2074 
2075 /**
2076  * Interpolate fractional pel values by applying proper vertical then
2077  * horizontal filter.
2078  *
2079  * @param dst Destination buffer for interpolated pels.
2080  * @param src Source buffer.
2081  * @param stride Stride for both src and dst buffers.
2082  * @param hmode Horizontal filter (expressed in quarter pixels shift).
2083  * @param vmode Vertical filter (expressed in quarter pixels shift).
2084  * @param rnd Rounding bias.
2085  */
2086 #define VC1_MSPEL_MC(OP) \
2087 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2088  int hmode, int vmode, int rnd) \
2089 { \
2090  static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2091  { NULL, vc1_put_ver_16b_shift1_mmi, \
2092  vc1_put_ver_16b_shift2_mmi, \
2093  vc1_put_ver_16b_shift3_mmi }; \
2094  static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2095  { NULL, OP ## vc1_hor_16b_shift1_mmi, \
2096  OP ## vc1_hor_16b_shift2_mmi, \
2097  OP ## vc1_hor_16b_shift3_mmi }; \
2098  static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
2099  { NULL, OP ## vc1_shift1_mmi, \
2100  OP ## vc1_shift2_mmi, \
2101  OP ## vc1_shift3_mmi }; \
2102  \
2103  if (vmode) { /* Vertical filter to apply */ \
2104  if (hmode) { /* Horizontal filter to apply, output to tmp */ \
2105  static const int shift_value[] = { 0, 5, 1, 5 }; \
2106  int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
2107  int r; \
2108  LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
2109  \
2110  r = (1<<(shift-1)) + rnd-1; \
2111  vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
2112  \
2113  vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
2114  return; \
2115  } \
2116  else { /* No horizontal filter, output 8 lines to dst */ \
2117  vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
2118  return; \
2119  } \
2120  } \
2121  \
2122  /* Horizontal mode with no vertical mode */ \
2123  vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
2124 } \
2125 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
2126  int stride, int hmode, int vmode, int rnd)\
2127 { \
2128  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2129  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2130  dst += 8*stride; src += 8*stride; \
2131  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2132  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2133 }
2134 
2135 VC1_MSPEL_MC(put_)
2136 VC1_MSPEL_MC(avg_)
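/*
 * A sketch of the two-pass flow the macro above encodes, written out for
 * one concrete case (hmode = 1, vmode = 2) and mirroring its table lookups
 * and rounder arithmetic; the wrapper name is hypothetical.
 */
static void put_mspel_mc12_sketch(uint8_t *dst, const uint8_t *src,
                                  int stride, int rnd)
{
    LOCAL_ALIGNED(16, int16_t, tmp, [12 * 8]);
    int shift = (5 + 1) >> 1;             /* shift_value[1] + shift_value[2] */
    int r = (1 << (shift - 1)) + rnd - 1; /* rounder for the vertical pass */

    vc1_put_ver_16b_shift2_mmi(tmp, src - 1, stride, r, shift); /* vmode = 2 */
    put_vc1_hor_16b_shift1_mmi(dst, stride, tmp + 1, 64 - rnd); /* hmode = 1 */
}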
2137 
2138 /** Macro to ease declaration of the bicubic interpolation functions */
2139 #define DECLARE_FUNCTION(a, b) \
2140 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2141  const uint8_t *src, \
2142  ptrdiff_t stride, \
2143  int rnd) \
2144 { \
2145  put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2146 } \
2147 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2148  const uint8_t *src, \
2149  ptrdiff_t stride, \
2150  int rnd) \
2151 { \
2152  avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2153 } \
2154 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2155  const uint8_t *src, \
2156  ptrdiff_t stride, \
2157  int rnd) \
2158 { \
2159  put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2160 } \
2161 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2162  const uint8_t *src, \
2163  ptrdiff_t stride, \
2164  int rnd) \
2165 { \
2166  avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2167 }
2168 
2169 DECLARE_FUNCTION(0, 1)
2170 DECLARE_FUNCTION(0, 2)
2171 DECLARE_FUNCTION(0, 3)
2172 
2173 DECLARE_FUNCTION(1, 0)
2174 DECLARE_FUNCTION(1, 1)
2175 DECLARE_FUNCTION(1, 2)
2176 DECLARE_FUNCTION(1, 3)
2177 
2178 DECLARE_FUNCTION(2, 0)
2179 DECLARE_FUNCTION(2, 1)
2180 DECLARE_FUNCTION(2, 2)
2181 DECLARE_FUNCTION(2, 3)
2182 
2183 DECLARE_FUNCTION(3, 0)
2184 DECLARE_FUNCTION(3, 1)
2185 DECLARE_FUNCTION(3, 2)
2186 DECLARE_FUNCTION(3, 3)
2187 
2188 #define CHROMA_MC_8_MMI \
2189  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
2190  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2191  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
2192  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2193  "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
2194  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2195  "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
2196  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2197  \
2198  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2199  "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
2200  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2201  "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
2202  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2203  "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
2204  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2205  "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
2206  \
2207  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2208  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2209  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2210  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2211  \
2212  "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
2213  "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
2214  "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
2215  "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
2216  \
2217  "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
2218  "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
2219  "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2220 
2221 
2222 #define CHROMA_MC_4_MMI \
2223  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2224  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2225  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2226  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2227  \
2228  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2229  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2230  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2231  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2232  \
2233  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2234  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2235  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2236  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2237  \
2238  "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
2239  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
2240 
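/*
 * Scalar sketch of one no-rounding chroma sample as computed by the macros
 * above: bilinear weights A..D derived from (x, y), bias 28 and shift 6
 * (the helper name is hypothetical). The weights sum to 64, so the result
 * already fits in a byte, matching the packushb saturation.
 */
static uint8_t chroma_mc_pixel_sketch(const uint8_t *src, int stride,
                                      int x, int y)
{
    const int A = (8 - x) * (8 - y), B = x * (8 - y);
    const int C = (8 - x) * y,       D = x * y;
    return (A * src[0]      + B * src[1] +
            C * src[stride] + D * src[stride + 1] + 28) >> 6;
}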
2241 
2242 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2243  uint8_t *src /* align 1 */,
2244  int stride, int h, int x, int y)
2245 {
2246  const int A = (8 - x) * (8 - y);
2247  const int B = (x) * (8 - y);
2248  const int C = (8 - x) * (y);
2249  const int D = (x) * (y);
2250  double ftmp[10];
2251  uint32_t tmp[1];
2252  DECLARE_VAR_ALL64;
2253  DECLARE_VAR_ADDRT;
2254 
2255  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2256 
2257  __asm__ volatile(
2258  "li %[tmp0], 0x06 \n\t"
2259  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2260  "mtc1 %[tmp0], %[ftmp9] \n\t"
2261  "pshufh %[A], %[A], %[ftmp0] \n\t"
2262  "pshufh %[B], %[B], %[ftmp0] \n\t"
2263  "pshufh %[C], %[C], %[ftmp0] \n\t"
2264  "pshufh %[D], %[D], %[ftmp0] \n\t"
2265 
2266  "1: \n\t"
2267  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2268  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2269  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2270  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2271  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2272 
2273  CHROMA_MC_8_MMI
2274 
2275  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2276  "addiu %[h], %[h], -0x01 \n\t"
2277  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2278  "bnez %[h], 1b \n\t"
2279  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2280  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2281  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2282  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2283  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2284  RESTRICT_ASM_ALL64
2285  RESTRICT_ASM_ADDRT
2286  [tmp0]"=&r"(tmp[0]),
2287  [src]"+&r"(src), [dst]"+&r"(dst),
2288  [h]"+&r"(h)
2289  : [stride]"r"((mips_reg)stride),
2290  [A]"f"(A), [B]"f"(B),
2291  [C]"f"(C), [D]"f"(D),
2292  [ff_pw_28]"f"(ff_pw_28)
2293  : "memory"
2294  );
2295 }
2296 
2297 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2298  uint8_t *src /* align 1 */,
2299  int stride, int h, int x, int y)
2300 {
2301  const int A = (8 - x) * (8 - y);
2302  const int B = (x) * (8 - y);
2303  const int C = (8 - x) * (y);
2304  const int D = (x) * (y);
2305  double ftmp[6];
2306  uint32_t tmp[1];
2307  DECLARE_VAR_LOW32;
2308  DECLARE_VAR_ADDRT;
2309 
2310  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2311 
2312  __asm__ volatile(
2313  "li %[tmp0], 0x06 \n\t"
2314  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2315  "mtc1 %[tmp0], %[ftmp5] \n\t"
2316  "pshufh %[A], %[A], %[ftmp0] \n\t"
2317  "pshufh %[B], %[B], %[ftmp0] \n\t"
2318  "pshufh %[C], %[C], %[ftmp0] \n\t"
2319  "pshufh %[D], %[D], %[ftmp0] \n\t"
2320 
2321  "1: \n\t"
2322  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2323  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2324  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2325  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2326  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2327 
2328  CHROMA_MC_4_MMI
2329 
2330  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2331  "addiu %[h], %[h], -0x01 \n\t"
2332  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2333  "bnez %[h], 1b \n\t"
2334  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2335  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2336  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2337  [tmp0]"=&r"(tmp[0]),
2338  RESTRICT_ASM_LOW32
2339  RESTRICT_ASM_ADDRT
2340  [src]"+&r"(src), [dst]"+&r"(dst),
2341  [h]"+&r"(h)
2342  : [stride]"r"((mips_reg)stride),
2343  [A]"f"(A), [B]"f"(B),
2344  [C]"f"(C), [D]"f"(D),
2345  [ff_pw_28]"f"(ff_pw_28)
2346  : "memory"
2347  );
2348 }
2349 
2350 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2351  uint8_t *src /* align 1 */,
2352  int stride, int h, int x, int y)
2353 {
2354  const int A = (8 - x) * (8 - y);
2355  const int B = (x) * (8 - y);
2356  const int C = (8 - x) * (y);
2357  const int D = (x) * (y);
2358  double ftmp[10];
2359  uint32_t tmp[1];
2360  DECLARE_VAR_ALL64;
2361  DECLARE_VAR_ADDRT;
2362 
2363  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2364 
2365  __asm__ volatile(
2366  "li %[tmp0], 0x06 \n\t"
2367  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2368  "mtc1 %[tmp0], %[ftmp9] \n\t"
2369  "pshufh %[A], %[A], %[ftmp0] \n\t"
2370  "pshufh %[B], %[B], %[ftmp0] \n\t"
2371  "pshufh %[C], %[C], %[ftmp0] \n\t"
2372  "pshufh %[D], %[D], %[ftmp0] \n\t"
2373 
2374  "1: \n\t"
2375  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2376  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2377  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2378  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2379  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2380 
2381  CHROMA_MC_8_MMI
2382 
2383  MMI_LDC1(%[ftmp2], %[dst], 0x00)
2384  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2385 
2386  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2387  "addiu %[h], %[h], -0x01 \n\t"
2388  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2389  "bnez %[h], 1b \n\t"
2390  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2391  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2392  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2393  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2394  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2395  [tmp0]"=&r"(tmp[0]),
2396  RESTRICT_ASM_ALL64
2397  RESTRICT_ASM_ADDRT
2398  [src]"+&r"(src), [dst]"+&r"(dst),
2399  [h]"+&r"(h)
2400  : [stride]"r"((mips_reg)stride),
2401  [A]"f"(A), [B]"f"(B),
2402  [C]"f"(C), [D]"f"(D),
2403  [ff_pw_28]"f"(ff_pw_28)
2404  : "memory"
2405  );
2406 }
2407 
2408 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2409  uint8_t *src /* align 1 */,
2410  int stride, int h, int x, int y)
2411 {
2412  const int A = (8 - x) * (8 - y);
2413  const int B = ( x) * (8 - y);
2414  const int C = (8 - x) * ( y);
2415  const int D = ( x) * ( y);
2416  double ftmp[6];
2417  uint32_t tmp[1];
2418  DECLARE_VAR_LOW32;
2419  DECLARE_VAR_ADDRT;
2420 
2421  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2422 
2423  __asm__ volatile(
2424  "li %[tmp0], 0x06 \n\t"
2425  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2426  "mtc1 %[tmp0], %[ftmp5] \n\t"
2427  "pshufh %[A], %[A], %[ftmp0] \n\t"
2428  "pshufh %[B], %[B], %[ftmp0] \n\t"
2429  "pshufh %[C], %[C], %[ftmp0] \n\t"
2430  "pshufh %[D], %[D], %[ftmp0] \n\t"
2431 
2432  "1: \n\t"
2433  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2434  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2435  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2436  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2437  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2438 
2439  CHROMA_MC_4_MMI
2440 
2441  MMI_LWC1(%[ftmp2], %[dst], 0x00)
2442  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2443 
2444  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2445  "addiu %[h], %[h], -0x01 \n\t"
2446  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2447  "bnez %[h], 1b \n\t"
2448  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2449  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2450  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2451  [tmp0]"=&r"(tmp[0]),
2452  RESTRICT_ASM_LOW32
2453  RESTRICT_ASM_ADDRT
2454  [src]"+&r"(src), [dst]"+&r"(dst),
2455  [h]"+&r"(h)
2456  : [stride]"r"((mips_reg)stride),
2457  [A]"f"(A), [B]"f"(B),
2458  [C]"f"(C), [D]"f"(D),
2459  [ff_pw_28]"f"(ff_pw_28)
2460  : "memory"
2461  );
2462 }