/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavutil/mem_internal.h"

#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"

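/*
 * The two macros below compute four outputs of the 8-point VC-1 inverse
 * transform at a time. Each rN immediate packs two signed 16-bit
 * coefficients, which are splatted across the register (mtc1 + punpcklwd)
 * and multiply-accumulated against the interleaved inputs with pmaddhw.
 * TYPE1 (row pass) adds the rounding constant c0 before the butterfly;
 * TYPE2 (column pass) additionally biases the two "subtract" outputs with
 * c1, so they round with c0+c1 in total. %[ftmp0] holds the right-shift
 * count for the pass.
 */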
#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp2], %[ftmp2], "#c0" \n\t" \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c1" \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c1" \n\t" \
    "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

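    /* DC-only case: the row pass scales dc by 12 with (+4 >> 3) rounding and
     * the column pass by 12 with (+64 >> 7); dividing the common factor of 4
     * out of both gives the two steps below. */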
    dc = (3 * dc + 1) >> 1;
    dc = (3 * dc + 16) >> 5;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"
        "li %[count], 0x02 \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu %[count], %[count], -0x01 \n\t"
        PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count), [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    double ftmp[23];
    uint64_t tmp[1];

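    /*
     * Two passes over the block: the first transforms the rows (shift 3,
     * +4 rounding) into temp[], the second transforms the columns (shift 7,
     * +64 rounding, +1 on the odd butterfly outputs) back into block[].
     * Each pass works on 8x4 halves because an MMI register holds four
     * 16-bit lanes.
     */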
    __asm__ volatile (
        /* 1st loop: start */
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp11], %[block], 0x10)
        MMI_LDC1(%[ftmp2], %[block], 0x20)
        MMI_LDC1(%[ftmp12], %[block], 0x30)
        MMI_LDC1(%[ftmp3], %[block], 0x40)
        MMI_LDC1(%[ftmp13], %[block], 0x50)
        MMI_LDC1(%[ftmp4], %[block], 0x60)
        MMI_LDC1(%[ftmp14], %[block], 0x70)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp15], %[temp], 0x00)
        MMI_SDC1(%[ftmp19], %[temp], 0x08)
        MMI_SDC1(%[ftmp16], %[temp], 0x10)
        MMI_SDC1(%[ftmp20], %[temp], 0x18)
        MMI_SDC1(%[ftmp17], %[temp], 0x20)
        MMI_SDC1(%[ftmp21], %[temp], 0x28)
        MMI_SDC1(%[ftmp18], %[temp], 0x30)
        MMI_SDC1(%[ftmp22], %[temp], 0x38)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[block], 0x08)
        MMI_LDC1(%[ftmp11], %[block], 0x18)
        MMI_LDC1(%[ftmp2], %[block], 0x28)
        MMI_LDC1(%[ftmp12], %[block], 0x38)
        MMI_LDC1(%[ftmp3], %[block], 0x48)
        MMI_LDC1(%[ftmp13], %[block], 0x58)
        MMI_LDC1(%[ftmp4], %[block], 0x68)
        MMI_LDC1(%[ftmp14], %[block], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

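        /* Only four of the eight transposed results go through temp[];
           ftmp15..ftmp18 stay live in registers and are consumed directly
           by the 2nd loop below. */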
        MMI_SDC1(%[ftmp19], %[temp], 0x48)
        MMI_SDC1(%[ftmp20], %[temp], 0x58)
        MMI_SDC1(%[ftmp21], %[temp], 0x68)
        MMI_SDC1(%[ftmp22], %[temp], 0x78)
        /* 1st loop: end */

        /* 2nd loop: start */
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[temp], 0x00)
        MMI_LDC1(%[ftmp11], %[temp], 0x10)
        MMI_LDC1(%[ftmp2], %[temp], 0x20)
        MMI_LDC1(%[ftmp12], %[temp], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp15], %[ftmp17] \n\t"
        "punpckhhw %[ftmp8], %[ftmp15], %[ftmp17] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp16], %[ftmp18] \n\t"
        "punpckhhw %[ftmp12], %[ftmp16], %[ftmp18] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x00)
        MMI_SDC1(%[ftmp16], %[block], 0x10)
        MMI_SDC1(%[ftmp17], %[block], 0x20)
        MMI_SDC1(%[ftmp18], %[block], 0x30)
        MMI_SDC1(%[ftmp19], %[block], 0x40)
        MMI_SDC1(%[ftmp20], %[block], 0x50)
        MMI_SDC1(%[ftmp21], %[block], 0x60)
        MMI_SDC1(%[ftmp22], %[block], 0x70)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[temp], 0x08)
        MMI_LDC1(%[ftmp11], %[temp], 0x18)
        MMI_LDC1(%[ftmp2], %[temp], 0x28)
        MMI_LDC1(%[ftmp12], %[temp], 0x38)
        MMI_LDC1(%[ftmp3], %[temp], 0x48)
        MMI_LDC1(%[ftmp13], %[temp], 0x58)
        MMI_LDC1(%[ftmp4], %[temp], 0x68)
        MMI_LDC1(%[ftmp14], %[temp], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x08)
        MMI_SDC1(%[ftmp16], %[block], 0x18)
        MMI_SDC1(%[ftmp17], %[block], 0x28)
        MMI_SDC1(%[ftmp18], %[block], 0x38)
        MMI_SDC1(%[ftmp19], %[block], 0x48)
        MMI_SDC1(%[ftmp20], %[block], 0x58)
        MMI_SDC1(%[ftmp21], %[block], 0x68)
        MMI_SDC1(%[ftmp22], %[block], 0x78)
        /* 2nd loop: end */
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [ff_pw_4]"f"(ff_pw_4_local), [block]"r"(block),
          [temp]"r"(temp)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

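    /* Row pass: 8-point DC gain 12, so (12*dc + 4) >> 3 == (3*dc + 1) >> 1;
       column pass: 4-point DC gain 17, hence (17*dc + 64) >> 7. */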
    dc = ( 3 * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    int16_t count = 4;
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    int16_t coeff[64] = {12,  16,  16,  15,  12,   9,   6,   4,
                         12,  15,   6,  -4, -12, -16, -16,  -9,
                         12,   9,  -6, -16, -12,   4,  16,  15,
                         12,   4, -16,  -9,  12,  15,  -6, -16,
                         12,  -4, -16,   9,  12, -15,  -6,  16,
                         12,  -9,  -6,  16, -12,  -4,  16, -15,
                         12, -15,   6,   4, -12,  16, -16,   9,
                         12, -16,  16, -15,  12,  -9,   6,  -4};
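    /* coeff[] is the 8-point inverse transform matrix, one output row per
       line; pmaddhw consumes coefficients in pairs, so each group of four
       maps to one 64-bit load below. */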

    // 1st loop
    __asm__ volatile (
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)

        /* ftmp11: dst1,dst0 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t"

        /* ftmp12: dst3,dst2 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t"

        /* ftmp13: dst5,dst4 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t"

        /* ftmp14: dst7,dst6 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t"

        /* ftmp9: dst3,dst2,dst1,dst0 ftmp10: dst7,dst6,dst5,dst4 */
        "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
        "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
        "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t"
        "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
        "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t"
        "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        MMI_SDC1(%[ftmp9], %[dst], 0x00)
        MMI_SDC1(%[ftmp10], %[dst], 0x08)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"
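        /* 0x44 is a pshufh control word that repeats halfwords 0 and 1,
           splatting a 32-bit pair of 4-point coefficients (e.g. 0x00160011
           = 22, 17) across the whole register. */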

        // 1st part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        // 2nd part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x18)
        MMI_LDC1(%[ftmp3], %[src], 0x28)
        MMI_LDC1(%[ftmp4], %[src], 0x38)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x8 parts of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

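    /* Row pass: 4-point DC gain 17; column pass: 8-point DC gain 12. */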
    dc = (17 * dc + 4) >> 3;
    dc = (12 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize), [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize), [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[23];
    uint32_t count = 8, tmp[1];
    int16_t coeff[16] = {17,  22,  17,  10,
                         17,  10, -17, -22,
                         17, -10, -17,  22,
                         17, -22,  17, -10};
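    /* coeff[] is the 4-point VC-1 inverse transform matrix, used by the
       4-wide row pass below. */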
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};

    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp11], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp12], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp15] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp16] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp17] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp18] \n\t"
        "paddh %[ftmp5], %[ftmp5], %[ftmp19] \n\t"
        "paddh %[ftmp6], %[ftmp6], %[ftmp20] \n\t"
        "paddh %[ftmp7], %[ftmp7], %[ftmp21] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp22] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    DECLARE_VAR_LOW32;

    dc = (17 * dc + 4) >> 3;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count = 4, tmp[1];
    int16_t coeff[16] = {17,  22,  17,  10,
                         17,  10, -17, -22,
                         17, -10, -17,  22,
                         17, -22,  17, -10};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};

    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}

1379 /* Apply overlap transform to horizontal edge */
1380 void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
1381 {
1382  int i;
1383  int a, b, c, d;
1384  int d1, d2;
1385  int rnd = 1;
1386  for (i = 0; i < 8; i++) {
1387  a = src[-2];
1388  b = src[-1];
1389  c = src[0];
1390  d = src[1];
1391  d1 = (a - d + 3 + rnd) >> 3;
1392  d2 = (a - d + b - c + 4 - rnd) >> 3;
1393 
1394  src[-2] = a - d1;
1395  src[-1] = av_clip_uint8(b - d2);
1396  src[0] = av_clip_uint8(c + d2);
1397  src[1] = d + d1;
1398  src += stride;
1399  rnd = !rnd;
1400  }
1401 }
1402 
1403 void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
1404 {
1405  int i;
1406  int a, b, c, d;
1407  int d1, d2;
1408  int rnd1 = flags & 2 ? 3 : 4;
1409  int rnd2 = 7 - rnd1;
1410  for (i = 0; i < 8; i++) {
1411  a = left[6];
1412  b = left[7];
1413  c = right[0];
1414  d = right[1];
1415  d1 = a - d;
1416  d2 = a - d + b - c;
1417 
1418  left[6] = ((a << 3) - d1 + rnd1) >> 3;
1419  left[7] = ((b << 3) - d2 + rnd2) >> 3;
1420  right[0] = ((c << 3) + d2 + rnd1) >> 3;
1421  right[1] = ((d << 3) + d1 + rnd2) >> 3;
1422 
1423  right += right_stride;
1424  left += left_stride;
1425  if (flags & 1) {
1426  rnd2 = 7 - rnd2;
1427  rnd1 = 7 - rnd1;
1428  }
1429  }
1430 }
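/*
 * Editorial note (a sketch, not in the original source): with a = left[6],
 * b = left[7], c = right[0], d = right[1], the loop above is exactly the
 * VC-1 overlap-smoothing matrix
 *
 *   [a']   ( [ 7  0  0  1 ] [a]   [rnd1] )
 *   [b'] = ( [-1  7  1  1 ] [b] + [rnd2] ) >> 3
 *   [c']   ( [ 1  1  7 -1 ] [c]   [rnd1] )
 *   [d']   ( [ 1  0  0  7 ] [d]   [rnd2] )
 *
 * where the rounders swap between 3 and 4 from one line to the next
 * (controlled by flags). The plain (non-_s) variants above and below
 * compute the same filter through the intermediate d1/d2 differences.
 */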
1431 
1432 /* Apply overlap transform to horizontal edge (vertical filtering) */
1433 void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
1434 {
1435  int i;
1436  int a, b, c, d;
1437  int d1, d2;
1438  int rnd = 1;
1439  for (i = 0; i < 8; i++) {
1440  a = src[-2 * stride];
1441  b = src[-stride];
1442  c = src[0];
1443  d = src[stride];
1444  d1 = (a - d + 3 + rnd) >> 3;
1445  d2 = (a - d + b - c + 4 - rnd) >> 3;
1446 
1447  src[-2 * stride] = a - d1;
1448  src[-stride] = av_clip_uint8(b - d2);
1449  src[0] = av_clip_uint8(c + d2);
1450  src[stride] = d + d1;
1451  src++;
1452  rnd = !rnd;
1453  }
1454 }
1455 
1456 void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1457 {
1458  int i;
1459  int a, b, c, d;
1460  int d1, d2;
1461  int rnd1 = 4, rnd2 = 3;
1462  for (i = 0; i < 8; i++) {
1463  a = top[48];
1464  b = top[56];
1465  c = bottom[0];
1466  d = bottom[8];
1467  d1 = a - d;
1468  d2 = a - d + b - c;
1469 
1470  top[48] = ((a << 3) - d1 + rnd1) >> 3;
1471  top[56] = ((b << 3) - d2 + rnd2) >> 3;
1472  bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1473  bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1474 
1475  bottom++;
1476  top++;
1477  rnd2 = 7 - rnd2;
1478  rnd1 = 7 - rnd1;
1479  }
1480 }
1481 
1482 /**
1483  * VC-1 in-loop deblocking filter for one line
1484  * @param src pointer to the pixel just past the edge to filter
1485  * @param stride block stride
1486  * @param pq block quantizer
1487  * @return whether other 3 pairs should be filtered or not
1488  * @see 8.6
1489  */
1490 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1491 {
1492  int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1493  5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1494  int a0_sign = a0 >> 31; /* Store sign */
1495 
1496  a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1497  if (a0 < pq) {
1498  int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1499  5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1500  int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1501  5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1502  if (a1 < a0 || a2 < a0) {
1503  int clip = src[-1 * stride] - src[0 * stride];
1504  int clip_sign = clip >> 31;
1505 
1506  clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1507  if (clip) {
1508  int a3 = FFMIN(a1, a2);
1509  int d = 5 * (a3 - a0);
1510  int d_sign = (d >> 31);
1511 
1512  d = ((d ^ d_sign) - d_sign) >> 3;
1513  d_sign ^= a0_sign;
1514 
1515  if (d_sign ^ clip_sign)
1516  d = 0;
1517  else {
1518  d = FFMIN(d, clip);
1519  d = (d ^ d_sign) - d_sign; /* Restore sign */
1520  src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1521  src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1522  }
1523  return 1;
1524  }
1525  }
1526  }
1527  return 0;
1528 }
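/*
 * Editorial note (sketch): with p1..p8 the eight pixels across the edge
 * (the boundary lies between p4 = src[-stride] and p5 = src[0]), the
 * branchless code above evaluates
 *
 *   a0 = (2*(p3 - p6) - 5*(p4 - p5) + 4) >> 3
 *   a1 = |(2*(p1 - p4) - 5*(p2 - p3) + 4) >> 3|
 *   a2 = |(2*(p5 - p8) - 5*(p6 - p7) + 4) >> 3|
 *
 * and, when |a0| < pq and min(a1, a2) < |a0|, corrects the pair by
 *
 *   d = min((5 * (|a0| - min(a1, a2))) >> 3, |p4 - p5| / 2)
 *   p4 -= d;  p5 += d;     (d sign-matched so the pixels move together)
 */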
1529 
1530 /**
1531  * VC-1 in-loop deblocking filter
1532  * @param src pointer to the first pixel of the edge to filter
1533  * @param step distance between horizontally adjacent elements
1534  * @param stride distance between vertically adjacent elements
1535  * @param len edge length to filter (4, 8 or 16 pixels)
1536  * @param pq block quantizer
1537  * @see 8.6
1538  */
1539 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1540  int len, int pq)
1541 {
1542  int i;
1543  int filt3;
1544 
1545  for (i = 0; i < len; i += 4) {
1546  filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1547  if (filt3) {
1548  vc1_filter_line(src + 0 * step, stride, pq);
1549  vc1_filter_line(src + 1 * step, stride, pq);
1550  vc1_filter_line(src + 3 * step, stride, pq);
1551  }
1552  src += step * 4;
1553  }
1554 }
1555 
1556 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1557 {
1558  vc1_loop_filter(src, 1, stride, 4, pq);
1559 }
1560 
1561 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1562 {
1563  vc1_loop_filter(src, stride, 1, 4, pq);
1564 }
1565 
1566 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1567 {
1568  vc1_loop_filter(src, 1, stride, 8, pq);
1569 }
1570 
1571 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1572 {
1573  vc1_loop_filter(src, stride, 1, 8, pq);
1574 }
1575 
1576 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1577 {
1578  vc1_loop_filter(src, 1, stride, 16, pq);
1579 }
1580 
1581 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1582 {
1583  vc1_loop_filter(src, stride, 1, 16, pq);
1584 }
1585 
1586 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1587  ptrdiff_t stride, int rnd)
1588 {
1589  ff_put_pixels8_8_mmi(dst, src, stride, 8);
1590 }
1591 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1592  ptrdiff_t stride, int rnd)
1593 {
1594  ff_put_pixels16_8_mmi(dst, src, stride, 16);
1595 }
1596 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1597  ptrdiff_t stride, int rnd)
1598 {
1599  ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1600 }
1601 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1602  ptrdiff_t stride, int rnd)
1603 {
1604  ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1605 }
1606 
1607 #define OP_PUT(S, D)
1608 #define OP_AVG(S, D) \
1609  "ldc1 $f16, "#S" \n\t" \
1610  "pavgb "#D", "#D", $f16 \n\t"
1611 
1612 /** Add the rounder from $f14 to $f6/$f8 and shift both right by SHIFT */
1613 #define NORMALIZE_MMI(SHIFT) \
1614  "paddh $f6, $f6, $f14 \n\t" /* +bias-r */ \
1615  "paddh $f8, $f8, $f14 \n\t" /* +bias-r */ \
1616  "psrah $f6, $f6, "SHIFT" \n\t" \
1617  "psrah $f8, $f8, "SHIFT" \n\t"
1618 
1619 #define TRANSFER_DO_PACK(OP) \
1620  "packushb $f6, $f6, $f8 \n\t" \
1621  OP((%[dst]), $f6) \
1622  "sdc1 $f6, 0x00(%[dst]) \n\t"
1623 
1624 #define TRANSFER_DONT_PACK(OP) \
1625  OP(0(%[dst]), $f6) \
1626  OP(8(%[dst]), $f8) \
1627  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1628  "sdc1 $f8, 0x08(%[dst]) \n\t"
1629 
1630 /** Unpack helpers passed as the UNPACK argument of MSPEL_FILTER13_CORE */
1631 #define DO_UNPACK(reg) \
1632  "punpcklbh "reg", "reg", $f0 \n\t"
1633 #define DONT_UNPACK(reg)
1634 
1635 /** Compute the rounder 32-r or 8-r and unpack it to all lanes of $f14 */
1636 #define LOAD_ROUNDER_MMI(ROUND) \
1637  "lwc1 $f14, "ROUND" \n\t" \
1638  "punpcklhw $f14, $f14, $f14 \n\t" \
1639  "punpcklwd $f14, $f14, $f14 \n\t"
1640 
1641 
1642 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
1643  "paddh "#R1", "#R1", "#R2" \n\t" \
1644  PTR_ADDU "$9, %[src], %[stride1] \n\t" \
1645  MMI_ULWC1(R0, $9, 0x00) \
1646  "pmullh "#R1", "#R1", $f6 \n\t" \
1647  "punpcklbh "#R0", "#R0", $f0 \n\t" \
1648  PTR_ADDU "$9, %[src], %[stride] \n\t" \
1649  MMI_ULWC1(R3, $9, 0x00) \
1650  "psubh "#R1", "#R1", "#R0" \n\t" \
1651  "punpcklbh "#R3", "#R3", $f0 \n\t" \
1652  "paddh "#R1", "#R1", $f14 \n\t" \
1653  "psubh "#R1", "#R1", "#R3" \n\t" \
1654  "psrah "#R1", "#R1", %[shift] \n\t" \
1655  MMI_SDC1(R1, %[dst], OFF) \
1656  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1657 
1658 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1659 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1660  const uint8_t *src, mips_reg stride,
1661  int rnd, int64_t shift)
1662 {
1663  DECLARE_VAR_LOW32;
1664  DECLARE_VAR_ADDRT;
1665 
1666  __asm__ volatile(
1667  "xor $f0, $f0, $f0 \n\t"
1668  "li $8, 0x03 \n\t"
1669  LOAD_ROUNDER_MMI("%[rnd]")
1670  "ldc1 $f12, %[ff_pw_9] \n\t"
1671  "1: \n\t"
1672  MMI_ULWC1($f4, %[src], 0x00)
1673  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1674  MMI_ULWC1($f6, %[src], 0x00)
1675  "punpcklbh $f4, $f4, $f0 \n\t"
1676  "punpcklbh $f6, $f6, $f0 \n\t"
1677  SHIFT2_LINE( 0, $f2, $f4, $f6, $f8)
1678  SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1679  SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1680  SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1681  SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1682  SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1683  SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1684  SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1685  PTR_SUBU "%[src], %[src], %[stride2] \n\t"
1686  PTR_ADDIU "%[dst], %[dst], 0x08 \n\t"
1687  "addiu $8, $8, -0x01 \n\t"
1688  "bnez $8, 1b \n\t"
1689  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1690  [src]"+r"(src), [dst]"+r"(dst)
1691  : [stride]"r"(stride), [stride1]"r"(-2*stride),
1692  [shift]"f"(shift), [rnd]"m"(rnd),
1693  [stride2]"r"(9*stride-4), [ff_pw_9]"m"(ff_pw_9)
1694  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1695  "$f14", "$f16", "memory"
1696  );
1697 }
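/*
 * Editorial sketch (not compiled; assumes the 12x8 int16 scratch layout
 * used by vc1_mspel_mc below): the function above is a vertical 1/2-pel
 * filter with taps (-1, 9, 9, -1), keeping 16-bit intermediates for the
 * horizontal pass.
 */
#if 0
static void put_ver_16b_shift2_ref(int16_t *dst, const uint8_t *src,
                                   int stride, int rnd, int shift)
{
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 12; x++)
            dst[y * 12 + x] =
                (9 * (src[y * stride + x] + src[(y + 1) * stride + x]) -
                 src[(y - 1) * stride + x] -
                 src[(y + 2) * stride + x] + rnd) >> shift;
}
#endif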
1698 
1699 /**
1700  * Data is already unpacked, so some operations can be performed directly
1701  * from memory.
1702  */
1703 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
1704 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1705  const int16_t *src, int rnd) \
1706 { \
1707  int h = 8; \
1708  DECLARE_VAR_ALL64; \
1709  DECLARE_VAR_ADDRT; \
1710  \
1711  src -= 1; \
1712  rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \
1713  \
1714  __asm__ volatile( \
1715  LOAD_ROUNDER_MMI("%[rnd]") \
1716  "ldc1 $f12, %[ff_pw_128] \n\t" \
1717  "ldc1 $f10, %[ff_pw_9] \n\t" \
1718  "1: \n\t" \
1719  MMI_ULDC1($f2, %[src], 0x00) \
1720  MMI_ULDC1($f4, %[src], 0x08) \
1721  MMI_ULDC1($f6, %[src], 0x02) \
1722  MMI_ULDC1($f8, %[src], 0x0a) \
1723  MMI_ULDC1($f0, %[src], 0x06) \
1724  "paddh $f2, $f2, $f0 \n\t" \
1725  MMI_ULDC1($f0, %[src], 0x0e) \
1726  "paddh $f4, $f4, $f0 \n\t" \
1727  MMI_ULDC1($f0, %[src], 0x04) \
1728  "paddh $f6, $f6, $f0 \n\t" \
1729  MMI_ULDC1($f0, %[src], 0x0b) \
1730  "paddh $f8, $f8, $f0 \n\t" \
1731  "pmullh $f6, $f6, $f10 \n\t" \
1732  "pmullh $f8, $f8, $f10 \n\t" \
1733  "psubh $f6, $f6, $f2 \n\t" \
1734  "psubh $f8, $f8, $f4 \n\t" \
1735  "li $8, 0x07 \n\t" \
1736  "mtc1 $8, $f16 \n\t" \
1737  NORMALIZE_MMI("$f16") \
1738  /* Remove bias */ \
1739  "paddh $f6, $f6, $f12 \n\t" \
1740  "paddh $f8, $f8, $f12 \n\t" \
1741  TRANSFER_DO_PACK(OP) \
1742  "addiu %[h], %[h], -0x01 \n\t" \
1743  PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1744  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1745  "bnez %[h], 1b \n\t" \
1746  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1747  [h]"+r"(h), \
1748  [src]"+r"(src), [dst]"+r"(dst) \
1749  : [stride]"r"(stride), [rnd]"m"(rnd), \
1750  [ff_pw_9]"m"(ff_pw_9), [ff_pw_128]"m"(ff_pw_128) \
1751  : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \
1752  "$f16", "memory" \
1753  ); \
1754 }
1755 
1755 
1756 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1757 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1758 
1759 /**
1760  * Purely vertical or horizontal 1/2 shift interpolation.
1761  * Sacrifice $f12 for the *9 factor.
1762  */
1763 #define VC1_SHIFT2(OP, OPNAME)\
1764 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \
1765  mips_reg stride, int rnd, \
1766  mips_reg offset) \
1767 { \
1768  DECLARE_VAR_LOW32; \
1769  DECLARE_VAR_ADDRT; \
1770  \
1771  rnd = 8 - rnd; \
1772  \
1773  __asm__ volatile( \
1774  "xor $f0, $f0, $f0 \n\t" \
1775  "li $10, 0x08 \n\t" \
1776  LOAD_ROUNDER_MMI("%[rnd]") \
1777  "ldc1 $f12, %[ff_pw_9] \n\t" \
1778  "1: \n\t" \
1779  MMI_ULWC1($f6, %[src], 0x00) \
1780  MMI_ULWC1($f8, %[src], 0x04) \
1781  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1782  MMI_ULWC1($f2, $9, 0x00) \
1783  MMI_ULWC1($f4, $9, 0x04) \
1784  PTR_ADDU "%[src], %[src], %[offset] \n\t" \
1785  "punpcklbh $f6, $f6, $f0 \n\t" \
1786  "punpcklbh $f8, $f8, $f0 \n\t" \
1787  "punpcklbh $f2, $f2, $f0 \n\t" \
1788  "punpcklbh $f4, $f4, $f0 \n\t" \
1789  "paddh $f6, $f6, $f2 \n\t" \
1790  "paddh $f8, $f8, $f4 \n\t" \
1791  PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \
1792  MMI_ULWC1($f2, $9, 0x00) \
1793  MMI_ULWC1($f4, $9, 0x04) \
1794  "pmullh $f6, $f6, $f12 \n\t" /* 0,9,9,0*/ \
1795  "pmullh $f8, $f8, $f12 \n\t" /* 0,9,9,0*/ \
1796  "punpcklbh $f2, $f2, $f0 \n\t" \
1797  "punpcklbh $f4, $f4, $f0 \n\t" \
1798  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,0*/ \
1799  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,0*/ \
1800  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1801  MMI_ULWC1($f2, $9, 0x00) \
1802  MMI_ULWC1($f4, $9, 0x04) \
1803  "punpcklbh $f2, $f2, $f0 \n\t" \
1804  "punpcklbh $f4, $f4, $f0 \n\t" \
1805  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,-1*/ \
1806  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,-1*/ \
1807  "li $8, 0x04 \n\t" \
1808  "mtc1 $8, $f16 \n\t" \
1809  NORMALIZE_MMI("$f16") \
1810  "packushb $f6, $f6, $f8 \n\t" \
1811  OP((%[dst]), $f6) \
1812  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1813  "addiu $10, $10, -0x01 \n\t" \
1814  PTR_ADDU "%[src], %[src], %[stride1] \n\t" \
1815  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1816  "bnez $10, 1b \n\t" \
1817  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1818  [src]"+r"(src), [dst]"+r"(dst) \
1819  : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \
1820  [stride]"r"(stride), [rnd]"m"(rnd), \
1821  [stride1]"r"(stride-offset), \
1822  [ff_pw_9]"m"(ff_pw_9) \
1823  : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
1824  "$f12", "$f14", "$f16", "memory" \
1825  ); \
1826 }
1827 
1828 VC1_SHIFT2(OP_PUT, put_)
1829 VC1_SHIFT2(OP_AVG, avg_)
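/*
 * Editorial sketch of the put_ variant generated above (not compiled):
 * `offset` selects the direction, 1 for horizontal and stride for vertical.
 */
#if 0
static void put_shift2_ref(uint8_t *dst, const uint8_t *src,
                           int stride, int rnd, int offset)
{
    rnd = 8 - rnd;
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = av_clip_uint8((9 * (src[x] + src[x + offset]) -
                                    src[x - offset] - src[x + 2 * offset] +
                                    rnd) >> 4);
        src += stride;
        dst += stride;
    }
}
#endif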
1830 
1831 /**
1832  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1833  *
1834  * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
1835  * @param LOAD "MMI_ULWC1", or "MMI_ULDC1" when the data to read is already unpacked.
1836  * @param M "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1837  * @param A1 Stride address of 1st tap (beware of unpacked/packed).
1838  * @param A2 Stride address of 2nd tap
1839  * @param A3 Stride address of 3rd tap
1840  * @param A4 Stride address of 4th tap
1841  */
1842 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
1843  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1844  LOAD($f2, $9, M*0) \
1845  LOAD($f4, $9, M*4) \
1846  UNPACK("$f2") \
1847  UNPACK("$f4") \
1848  "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
1849  "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
1850  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1851  LOAD($f6, $9, M*0) \
1852  LOAD($f8, $9, M*4) \
1853  UNPACK("$f6") \
1854  UNPACK("$f8") \
1855  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1856  "pmullh $f8, $f8, $f12 \n\t" /* *18 */ \
1857  "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \
1858  "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \
1859  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1860  LOAD($f2, $9, M*0) \
1861  LOAD($f4, $9, M*4) \
1862  UNPACK("$f2") \
1863  UNPACK("$f4") \
1864  "li $8, 0x02 \n\t" \
1865  "mtc1 $8, $f16 \n\t" \
1866  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1867  "psllh $f4, $f4, $f16 \n\t" /* 4* */ \
1868  "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \
1869  "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \
1870  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1871  LOAD($f2, $9, M*0) \
1872  LOAD($f4, $9, M*4) \
1873  UNPACK("$f2") \
1874  UNPACK("$f4") \
1875  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1876  "pmullh $f4, $f4, $f10 \n\t" /* *53 */ \
1877  "paddh $f6, $f6, $f2 \n\t" /* 4,53,18,-3 */ \
1878  "paddh $f8, $f8, $f4 \n\t" /* 4,53,18,-3 */
1879 
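/*
 * Editorial note: in the 8-bit shift1/shift3 instantiations at the end of
 * this file, the accumulation above amounts to the VC-1 bicubic
 * quarter-pel formulas
 *
 *   shift1: out = (-4*p[-1] + 53*p[0] + 18*p[1] - 3*p[2] + 32 - r) >> 6
 *   shift3: out = (-3*p[-1] + 18*p[0] + 53*p[1] - 4*p[2] + 32 - r) >> 6
 *
 * (the 16-bit vertical and horizontal passes use the same taps with a
 * different rounder and shift).
 */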
1880 /**
1881  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
1882  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1883  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1884  *
1885  * @param NAME Either 1 or 3
1886  * @see MSPEL_FILTER13_CORE for information on A1->A4
1887  */
1888 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
1889 static void \
1890 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
1891  mips_reg src_stride, \
1892  int rnd, int64_t shift) \
1893 { \
1894  int h = 8; \
1895  DECLARE_VAR_LOW32; \
1896  DECLARE_VAR_ADDRT; \
1897  \
1898  src -= src_stride; \
1899  \
1900  __asm__ volatile( \
1901  "xor $f0, $f0, $f0 \n\t" \
1902  LOAD_ROUNDER_MMI("%[rnd]") \
1903  "ldc1 $f10, %[ff_pw_53] \n\t" \
1904  "ldc1 $f12, %[ff_pw_18] \n\t" \
1905  ".p2align 3 \n\t" \
1906  "1: \n\t" \
1907  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1908  NORMALIZE_MMI("%[shift]") \
1909  TRANSFER_DONT_PACK(OP_PUT) \
1910  /* Last 3 (in fact 4) bytes on the line */ \
1911  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1912  MMI_ULWC1($f2, $9, 0x08) \
1913  DO_UNPACK("$f2") \
1914  "mov.d $f6, $f2 \n\t" \
1915  "paddh $f2, $f2, $f2 \n\t" \
1916  "paddh $f2, $f2, $f6 \n\t" /* 3* */ \
1917  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1918  MMI_ULWC1($f6, $9, 0x08) \
1919  DO_UNPACK("$f6") \
1920  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1921  "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \
1922  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1923  MMI_ULWC1($f2, $9, 0x08) \
1924  DO_UNPACK("$f2") \
1925  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1926  "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \
1927  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1928  MMI_ULWC1($f2, $9, 0x08) \
1929  DO_UNPACK("$f2") \
1930  "li $8, 0x02 \n\t" \
1931  "mtc1 $8, $f16 \n\t" \
1932  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1933  "psubh $f6, $f6, $f2 \n\t" \
1934  "paddh $f6, $f6, $f14 \n\t" \
1935  "li $8, 0x06 \n\t" \
1936  "mtc1 $8, $f16 \n\t" \
1937  "psrah $f6, $f6, $f16 \n\t" \
1938  "sdc1 $f6, 0x10(%[dst]) \n\t" \
1939  "addiu %[h], %[h], -0x01 \n\t" \
1940  PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
1941  PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
1942  "bnez %[h], 1b \n\t" \
1943  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1944  [h]"+r"(h), \
1945  [src]"+r"(src), [dst]"+r"(dst) \
1946  : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
1947  [stride_x3]"r"(3*src_stride), \
1948  [rnd]"m"(rnd), [shift]"f"(shift), \
1949  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1950  [ff_pw_3]"f"(ff_pw_3) \
1951  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1952  "$f14", "$f16", "memory" \
1953  ); \
1954 }
1955 
1956 /**
1957  * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
1958  * Here the offsets are in 16-bit elements, so the parameters A1 to A4 are plain constants.
1959  *
1960  * @param NAME Either 1 or 3
1961  * @see MSPEL_FILTER13_CORE for information on A1->A4
1962  */
1963 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1964 static void \
1965 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
1966  const int16_t *src, int rnd) \
1967 { \
1968  int h = 8; \
1969  DECLARE_VAR_ALL64; \
1970  DECLARE_VAR_ADDRT; \
1971  \
1972  src -= 1; \
1973  rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
1974  \
1975  __asm__ volatile( \
1976  "xor $f0, $f0, $f0 \n\t" \
1977  LOAD_ROUNDER_MMI("%[rnd]") \
1978  "ldc1 $f10, %[ff_pw_53] \n\t" \
1979  "ldc1 $f12, %[ff_pw_18] \n\t" \
1980  ".p2align 3 \n\t" \
1981  "1: \n\t" \
1982  MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
1983  "li $8, 0x07 \n\t" \
1984  "mtc1 $8, $f16 \n\t" \
1985  NORMALIZE_MMI("$f16") \
1986  /* Remove bias */ \
1987  "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1988  "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1989  TRANSFER_DO_PACK(OP) \
1990  "addiu %[h], %[h], -0x01 \n\t" \
1991  PTR_ADDU "%[src], %[src], 0x18 \n\t" \
1992  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1993  "bnez %[h], 1b \n\t" \
1994  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1995  [h]"+r"(h), \
1996  [src]"+r"(src), [dst]"+r"(dst) \
1997  : [stride]"r"(stride), [rnd]"m"(rnd), \
1998  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1999  [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128) \
2000  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
2001  "$f14", "$f16", "memory" \
2002  ); \
2003 }
2004 
2005 /**
2006  * Macro to build the 8-bit, any direction, version of vc1_put_shift[13].
2007  * Here, offset=src_stride. Parameters passed A1 to A4 must use
2008  * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
2009  *
2010  * @param NAME Either 1 or 3
2011  * @see MSPEL_FILTER13_CORE for information on A1->A4
2012  */
2013 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
2014 static void \
2015 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
2016  mips_reg stride, int rnd, mips_reg offset) \
2017 { \
2018  int h = 8; \
2019  DECLARE_VAR_LOW32; \
2020  DECLARE_VAR_ADDRT; \
2021  \
2022  src -= offset; \
2023  rnd = 32-rnd; \
2024  \
2025  __asm__ volatile ( \
2026  "xor $f0, $f0, $f0 \n\t" \
2027  LOAD_ROUNDER_MMI("%[rnd]") \
2028  "ldc1 $f10, %[ff_pw_53] \n\t" \
2029  "ldc1 $f12, %[ff_pw_18] \n\t" \
2030  ".p2align 3 \n\t" \
2031  "1: \n\t" \
2032  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
2033  "li $8, 0x06 \n\t" \
2034  "mtc1 $8, $f16 \n\t" \
2035  NORMALIZE_MMI("$f16") \
2036  TRANSFER_DO_PACK(OP) \
2037  "addiu %[h], %[h], -0x01 \n\t" \
2038  PTR_ADDU "%[src], %[src], %[stride] \n\t" \
2039  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
2040  "bnez %[h], 1b \n\t" \
2041  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
2042  [h]"+r"(h), \
2043  [src]"+r"(src), [dst]"+r"(dst) \
2044  : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
2045  [offset_x3]"r"(3*offset), [stride]"r"(stride), \
2046  [rnd]"m"(rnd), \
2047  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
2048  [ff_pw_3]"f"(ff_pw_3) \
2049  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
2050  "$f14", "$f16", "memory" \
2051  ); \
2052 }
2053 
2054 
2055 /** 1/4 shift bicubic interpolation */
2056 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2057 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2058 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2059 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2060 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2061 
2062 /** 3/4 shift bicubic interpolation */
2063 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2064 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2065 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2066 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2067 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2068 
2069 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2070  (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2071  int64_t shift);
2072 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2073  (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2074 typedef void (*vc1_mspel_mc_filter_8bits)
2075  (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2076  mips_reg offset);
2077 
2078 /**
2079  * Interpolate fractional pel values by applying proper vertical then
2080  * horizontal filter.
2081  *
2082  * @param dst Destination buffer for interpolated pels.
2083  * @param src Source buffer.
2084  * @param stride Stride for both src and dst buffers.
2085  * @param hmode Horizontal filter (expressed in quarter-pel shift).
2086  * @param vmode Vertical filter (expressed in quarter-pel shift).
2087  * @param rnd Rounding bias.
2088  */
2089 #define VC1_MSPEL_MC(OP) \
2090 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2091  int hmode, int vmode, int rnd) \
2092 { \
2093  static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2094  { NULL, vc1_put_ver_16b_shift1_mmi, \
2095  vc1_put_ver_16b_shift2_mmi, \
2096  vc1_put_ver_16b_shift3_mmi }; \
2097  static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2098  { NULL, OP ## vc1_hor_16b_shift1_mmi, \
2099  OP ## vc1_hor_16b_shift2_mmi, \
2100  OP ## vc1_hor_16b_shift3_mmi }; \
2101  static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
2102  { NULL, OP ## vc1_shift1_mmi, \
2103  OP ## vc1_shift2_mmi, \
2104  OP ## vc1_shift3_mmi }; \
2105  \
2106  if (vmode) { /* Vertical filter to apply */ \
2107  if (hmode) { /* Horizontal filter to apply, output to tmp */ \
2108  static const int shift_value[] = { 0, 5, 1, 5 }; \
2109  int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
2110  int r; \
2111  LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
2112  \
2113  r = (1<<(shift-1)) + rnd-1; \
2114  vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
2115  \
2116  vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
2117  return; \
2118  } \
2119  else { /* No horizontal filter, output 8 lines to dst */ \
2120  vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
2121  return; \
2122  } \
2123  } \
2124  \
2125  /* Horizontal mode with no vertical mode */ \
2126  vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
2127 } \
2128 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
2129  int stride, int hmode, int vmode, int rnd)\
2130 { \
2131  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2132  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2133  dst += 8*stride; src += 8*stride; \
2134  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2135  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2136 }
2137 
2138 VC1_MSPEL_MC(put_)
2139 VC1_MSPEL_MC(avg_)
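/*
 * Hypothetical usage sketch (editorial; names and the MV layout are
 * assumptions, not code from this file): hmode/vmode are the quarter-pel
 * fractions of the motion vector, and the full-pel case is served by the
 * mc00 copy wrappers above.
 */
#if 0
static void mspel_dispatch_example(uint8_t *dst, const uint8_t *src,
                                   int stride, int mvx, int mvy, int rnd)
{
    int hmode = mvx & 3;
    int vmode = mvy & 3;

    if (hmode || vmode)
        put_vc1_mspel_mc(dst, src, stride, hmode, vmode, rnd);
    else
        ff_put_vc1_mspel_mc00_mmi(dst, src, stride, rnd);
}
#endif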
2140 
2141 /** Macro to ease bicubic filter interpolation function declarations */
2142 #define DECLARE_FUNCTION(a, b) \
2143 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2144  const uint8_t *src, \
2145  ptrdiff_t stride, \
2146  int rnd) \
2147 { \
2148  put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2149 } \
2150 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2151  const uint8_t *src, \
2152  ptrdiff_t stride, \
2153  int rnd) \
2154 { \
2155  avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2156 } \
2157 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2158  const uint8_t *src, \
2159  ptrdiff_t stride, \
2160  int rnd) \
2161 { \
2162  put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2163 } \
2164 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2165  const uint8_t *src, \
2166  ptrdiff_t stride, \
2167  int rnd) \
2168 { \
2169  avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2170 }
2171 
2172 DECLARE_FUNCTION(0, 1)
2173 DECLARE_FUNCTION(0, 2)
2174 DECLARE_FUNCTION(0, 3)
2175 
2176 DECLARE_FUNCTION(1, 0)
2177 DECLARE_FUNCTION(1, 1)
2178 DECLARE_FUNCTION(1, 2)
2179 DECLARE_FUNCTION(1, 3)
2180 
2181 DECLARE_FUNCTION(2, 0)
2182 DECLARE_FUNCTION(2, 1)
2183 DECLARE_FUNCTION(2, 2)
2184 DECLARE_FUNCTION(2, 3)
2185 
2186 DECLARE_FUNCTION(3, 0)
2187 DECLARE_FUNCTION(3, 1)
2188 DECLARE_FUNCTION(3, 2)
2189 DECLARE_FUNCTION(3, 3)
2190 
2191 #define CHROMA_MC_8_MMI \
2192  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
2193  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2194  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
2195  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2196  "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
2197  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2198  "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
2199  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2200  \
2201  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2202  "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
2203  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2204  "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
2205  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2206  "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
2207  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2208  "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
2209  \
2210  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2211  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2212  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2213  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2214  \
2215  "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
2216  "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
2217  "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
2218  "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
2219  \
2220  "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
2221  "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
2222  "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2223 
2224 
2225 #define CHROMA_MC_4_MMI \
2226  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2227  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2228  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2229  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2230  \
2231  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2232  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2233  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2234  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2235  \
2236  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2237  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2238  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2239  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2240  \
2241  "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
2242  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
2243 
2244 
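/*
 * Editorial sketch of the bilinear kernel the two macros above implement
 * (mirrors the no-rnd C template in libavcodec/vc1dsp.c; the +28 bias,
 * ff_pw_28, is the "no rounding" 32-4 rounder):
 */
#if 0
static void chroma_mc_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                          int w /* 8 or 4 */, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y), B = x * (8 - y);
    const int C = (8 - x) * y,       D = x * y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[stride + j] + D * src[stride + j + 1] + 28) >> 6;
        dst += stride;
        src += stride;
    }
}
#endif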
2245 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2246  uint8_t *src /* align 1 */,
2247  ptrdiff_t stride, int h, int x, int y)
2248 {
2249  const int A = (8 - x) * (8 - y);
2250  const int B = (x) * (8 - y);
2251  const int C = (8 - x) * (y);
2252  const int D = (x) * (y);
2253  double ftmp[10];
2254  uint32_t tmp[1];
2255  DECLARE_VAR_ALL64;
2256  DECLARE_VAR_ADDRT;
2257 
2258  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2259 
2260  __asm__ volatile(
2261  "li %[tmp0], 0x06 \n\t"
2262  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2263  "mtc1 %[tmp0], %[ftmp9] \n\t"
2264  "pshufh %[A], %[A], %[ftmp0] \n\t"
2265  "pshufh %[B], %[B], %[ftmp0] \n\t"
2266  "pshufh %[C], %[C], %[ftmp0] \n\t"
2267  "pshufh %[D], %[D], %[ftmp0] \n\t"
2268 
2269  "1: \n\t"
2270  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2271  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2272  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2273  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2274  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2275 
2275 
2276  CHROMA_MC_8_MMI
2277 
2278  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2279  "addiu %[h], %[h], -0x01 \n\t"
2280  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2281  "bnez %[h], 1b \n\t"
2282  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2283  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2284  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2285  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2286  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2287  RESTRICT_ASM_ALL64
2288  RESTRICT_ASM_ADDRT
2289  [tmp0]"=&r"(tmp[0]),
2290  [src]"+&r"(src), [dst]"+&r"(dst),
2291  [h]"+&r"(h)
2292  : [stride]"r"((mips_reg)stride),
2293  [A]"f"(A), [B]"f"(B),
2294  [C]"f"(C), [D]"f"(D),
2295  [ff_pw_28]"f"(ff_pw_28)
2296  : "memory"
2297  );
2298 }
2299 
2300 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2301  uint8_t *src /* align 1 */,
2302  ptrdiff_t stride, int h, int x, int y)
2303 {
2304  const int A = (8 - x) * (8 - y);
2305  const int B = (x) * (8 - y);
2306  const int C = (8 - x) * (y);
2307  const int D = (x) * (y);
2308  double ftmp[6];
2309  uint32_t tmp[1];
2310  DECLARE_VAR_LOW32;
2311  DECLARE_VAR_ADDRT;
2312 
2313  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2314 
2315  __asm__ volatile(
2316  "li %[tmp0], 0x06 \n\t"
2317  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2318  "mtc1 %[tmp0], %[ftmp5] \n\t"
2319  "pshufh %[A], %[A], %[ftmp0] \n\t"
2320  "pshufh %[B], %[B], %[ftmp0] \n\t"
2321  "pshufh %[C], %[C], %[ftmp0] \n\t"
2322  "pshufh %[D], %[D], %[ftmp0] \n\t"
2323 
2324  "1: \n\t"
2325  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2326  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2327  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2328  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2329  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2330 
2330 
2331  CHROMA_MC_4_MMI
2332 
2333  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2334  "addiu %[h], %[h], -0x01 \n\t"
2335  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2336  "bnez %[h], 1b \n\t"
2337  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2338  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2339  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2340  [tmp0]"=&r"(tmp[0]),
2341  RESTRICT_ASM_LOW32
2342  RESTRICT_ASM_ADDRT
2343  [src]"+&r"(src), [dst]"+&r"(dst),
2344  [h]"+&r"(h)
2345  : [stride]"r"((mips_reg)stride),
2346  [A]"f"(A), [B]"f"(B),
2347  [C]"f"(C), [D]"f"(D),
2348  [ff_pw_28]"f"(ff_pw_28)
2349  : "memory"
2350  );
2351 }
2352 
2353 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2354  uint8_t *src /* align 1 */,
2355  ptrdiff_t stride, int h, int x, int y)
2356 {
2357  const int A = (8 - x) * (8 - y);
2358  const int B = (x) * (8 - y);
2359  const int C = (8 - x) * (y);
2360  const int D = (x) * (y);
2361  double ftmp[10];
2362  uint32_t tmp[1];
2363  DECLARE_VAR_ALL64;
2364  DECLARE_VAR_ADDRT;
2365 
2366  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2367 
2368  __asm__ volatile(
2369  "li %[tmp0], 0x06 \n\t"
2370  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2371  "mtc1 %[tmp0], %[ftmp9] \n\t"
2372  "pshufh %[A], %[A], %[ftmp0] \n\t"
2373  "pshufh %[B], %[B], %[ftmp0] \n\t"
2374  "pshufh %[C], %[C], %[ftmp0] \n\t"
2375  "pshufh %[D], %[D], %[ftmp0] \n\t"
2376 
2377  "1: \n\t"
2378  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2379  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2380  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2381  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2382  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2383 
2383 
2384  CHROMA_MC_8_MMI
2385 
2386  MMI_LDC1(%[ftmp2], %[dst], 0x00)
2387  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2388 
2389  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2390  "addiu %[h], %[h], -0x01 \n\t"
2391  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2392  "bnez %[h], 1b \n\t"
2393  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2394  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2395  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2396  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2397  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2398  [tmp0]"=&r"(tmp[0]),
2399  RESTRICT_ASM_ALL64
2400  RESTRICT_ASM_ADDRT
2401  [src]"+&r"(src), [dst]"+&r"(dst),
2402  [h]"+&r"(h)
2403  : [stride]"r"((mips_reg)stride),
2404  [A]"f"(A), [B]"f"(B),
2405  [C]"f"(C), [D]"f"(D),
2406  [ff_pw_28]"f"(ff_pw_28)
2407  : "memory"
2408  );
2409 }
2410 
2411 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2412  uint8_t *src /* align 1 */,
2413  ptrdiff_t stride, int h, int x, int y)
2414 {
2415  const int A = (8 - x) * (8 - y);
2416  const int B = ( x) * (8 - y);
2417  const int C = (8 - x) * ( y);
2418  const int D = ( x) * ( y);
2419  double ftmp[6];
2420  uint32_t tmp[1];
2421  DECLARE_VAR_LOW32;
2422  DECLARE_VAR_ADDRT;
2423 
2424  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2425 
2426  __asm__ volatile(
2427  "li %[tmp0], 0x06 \n\t"
2428  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2429  "mtc1 %[tmp0], %[ftmp5] \n\t"
2430  "pshufh %[A], %[A], %[ftmp0] \n\t"
2431  "pshufh %[B], %[B], %[ftmp0] \n\t"
2432  "pshufh %[C], %[C], %[ftmp0] \n\t"
2433  "pshufh %[D], %[D], %[ftmp0] \n\t"
2434 
2435  "1: \n\t"
2436  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2437  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2438  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2439  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2440  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2441 
2441 
2442  CHROMA_MC_4_MMI
2443 
2444  MMI_LWC1(%[ftmp2], %[dst], 0x00)
2445  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2446 
2447  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2448  "addiu %[h], %[h], -0x01 \n\t"
2449  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2450  "bnez %[h], 1b \n\t"
2451  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2452  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2453  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2454  [tmp0]"=&r"(tmp[0]),
2455  RESTRICT_ASM_LOW32
2456  RESTRICT_ASM_ADDRT
2457  [src]"+&r"(src), [dst]"+&r"(dst),
2458  [h]"+&r"(h)
2459  : [stride]"r"((mips_reg)stride),
2460  [A]"f"(A), [B]"f"(B),
2461  [C]"f"(C), [D]"f"(D),
2462  [ff_pw_28]"f"(ff_pw_28)
2463  : "memory"
2464  );
2465 }