FFmpeg
h264dsp_mmi.c
Go to the documentation of this file.
1 /*
2  * Loongson SIMD optimized h264dsp
3  *
4  * Copyright (c) 2015 Loongson Technology Corporation Limited
5  * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
7  * Heiher <r@hev.cc>
8  *
9  * This file is part of FFmpeg.
10  *
11  * FFmpeg is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * FFmpeg is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with FFmpeg; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24  */
25 
27 #include "h264dsp_mips.h"
29 
30 void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
31 {
32  double ftmp[9];
33  DECLARE_VAR_LOW32;
34 
35  __asm__ volatile (
36  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
37  MMI_LDC1(%[ftmp1], %[src], 0x00)
38  MMI_LDC1(%[ftmp2], %[src], 0x08)
39  MMI_LDC1(%[ftmp3], %[src], 0x10)
40  MMI_LDC1(%[ftmp4], %[src], 0x18)
41  /* memset(src, 0, 32); */
42  "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[src]) \n\t"
43  "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[src]) \n\t"
44  MMI_ULWC1(%[ftmp5], %[dst0], 0x00)
45  MMI_ULWC1(%[ftmp6], %[dst1], 0x00)
46  MMI_ULWC1(%[ftmp7], %[dst2], 0x00)
47  MMI_ULWC1(%[ftmp8], %[dst3], 0x00)
48  "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
49  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
50  "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
51  "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
52  "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
53  "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
54  "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
55  "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
56  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
57  "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
58  "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
59  "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
60  MMI_SWC1(%[ftmp1], %[dst0], 0x00)
61  MMI_SWC1(%[ftmp2], %[dst1], 0x00)
62  MMI_SWC1(%[ftmp3], %[dst2], 0x00)
63  MMI_SWC1(%[ftmp4], %[dst3], 0x00)
64  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
65  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
66  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
67  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
68  RESTRICT_ASM_LOW32
69  [ftmp8]"=&f"(ftmp[8])
70  : [dst0]"r"(dst), [dst1]"r"(dst+stride),
71  [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
72  [src]"r"(src)
73  : "memory"
74  );
75 
76 }
77 
78 void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
79 {
80  double ftmp[12];
81  uint64_t tmp[1];
82  DECLARE_VAR_LOW32;
83  DECLARE_VAR_ADDRT;
84 
85  __asm__ volatile (
86  MMI_LDC1(%[ftmp0], %[block], 0x00)
87  MMI_LDC1(%[ftmp1], %[block], 0x08)
88  MMI_LDC1(%[ftmp2], %[block], 0x10)
89  MMI_LDC1(%[ftmp3], %[block], 0x18)
90  /* memset(block, 0, 32) */
91  "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
92  "gssqc1 %[ftmp4], %[ftmp4], 0x00(%[block]) \n\t"
93  "gssqc1 %[ftmp4], %[ftmp4], 0x10(%[block]) \n\t"
94  "dli %[tmp0], 0x01 \n\t"
95  "mtc1 %[tmp0], %[ftmp8] \n\t"
96  "dli %[tmp0], 0x06 \n\t"
97  "mtc1 %[tmp0], %[ftmp9] \n\t"
98  "psrah %[ftmp4], %[ftmp1], %[ftmp8] \n\t"
99  "psrah %[ftmp5], %[ftmp3], %[ftmp8] \n\t"
100  "psubh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
101  "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
102  "paddh %[ftmp10], %[ftmp2], %[ftmp0] \n\t"
103  "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
104  "paddh %[ftmp11], %[ftmp5], %[ftmp10] \n\t"
105  "psubh %[ftmp2], %[ftmp10], %[ftmp5] \n\t"
106  "paddh %[ftmp10], %[ftmp4], %[ftmp0] \n\t"
107  "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
108  "punpckhhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
109  "punpcklhw %[ftmp5], %[ftmp11], %[ftmp10] \n\t"
110  "punpckhhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t"
111  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
112  "punpckhwd %[ftmp2], %[ftmp5], %[ftmp0] \n\t"
113  "punpcklwd %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
114  "punpcklwd %[ftmp10], %[ftmp1], %[ftmp4] \n\t"
115  "punpckhwd %[ftmp0], %[ftmp1], %[ftmp4] \n\t"
116  "paddh %[ftmp5], %[ftmp5], %[ff_pw_32] \n\t"
117  "psrah %[ftmp4], %[ftmp2], %[ftmp8] \n\t"
118  "psrah %[ftmp3], %[ftmp0], %[ftmp8] \n\t"
119  "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
120  "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
121  "paddh %[ftmp1], %[ftmp10], %[ftmp5] \n\t"
122  "psubh %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
123  "paddh %[ftmp10], %[ftmp3], %[ftmp1] \n\t"
124  "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
125  "paddh %[ftmp11], %[ftmp4], %[ftmp5] \n\t"
126  "psubh %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
127  MMI_ULWC1(%[ftmp2], %[dst], 0x00)
128  MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
129  "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
130  "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
131  "psrah %[ftmp4], %[ftmp11], %[ftmp9] \n\t"
132  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
133  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
134  "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
135  "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
136  "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
137  "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
138  MMI_SWC1(%[ftmp2], %[dst], 0x00)
139  MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
140  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
141  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
142  MMI_ULWC1(%[ftmp2], %[dst], 0x00)
143  "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
144  MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
145  "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
146  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
147  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
148  "paddh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
149  "paddh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
150  "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
151  MMI_SWC1(%[ftmp2], %[dst], 0x00)
152  "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
153  MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
154  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
155  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
156  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
157  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
158  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
159  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
160  RESTRICT_ASM_LOW32
161  RESTRICT_ASM_ADDRT
162  [tmp0]"=&r"(tmp[0])
163  : [dst]"r"(dst), [block]"r"(block),
164  [stride]"r"((mips_reg)stride), [ff_pw_32]"f"(ff_pw_32)
165  : "memory"
166  );
167 
168 }
169 
170 void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
171 {
172  double ftmp[16];
173  uint64_t tmp[7];
174  mips_reg addr[1];
175  DECLARE_VAR_LOW32;
176  DECLARE_VAR_ADDRT;
177 
178  __asm__ volatile (
179  "lhu %[tmp0], 0x00(%[block]) \n\t"
180  PTR_ADDI "$29, $29, -0x20 \n\t"
181  PTR_ADDIU "%[tmp0], %[tmp0], 0x20 \n\t"
182  MMI_LDC1(%[ftmp1], %[block], 0x10)
183  "sh %[tmp0], 0x00(%[block]) \n\t"
184  MMI_LDC1(%[ftmp2], %[block], 0x20)
185  "dli %[tmp0], 0x01 \n\t"
186  MMI_LDC1(%[ftmp3], %[block], 0x30)
187  "mtc1 %[tmp0], %[ftmp8] \n\t"
188  MMI_LDC1(%[ftmp5], %[block], 0x50)
189  MMI_LDC1(%[ftmp6], %[block], 0x60)
190  MMI_LDC1(%[ftmp7], %[block], 0x70)
191  "mov.d %[ftmp0], %[ftmp1] \n\t"
192  "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
193  "psrah %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
194  "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
195  "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
196  "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
197  "paddh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
198  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
199  "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
200  "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
201  "psubh %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
202  "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
203  "paddh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
204  "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
205  "psrah %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
206  "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
207  "dli %[tmp0], 0x02 \n\t"
208  "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
209  "mtc1 %[tmp0], %[ftmp9] \n\t"
210  "mov.d %[ftmp7], %[ftmp1] \n\t"
211  "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
212  "psrah %[ftmp3], %[ftmp4], %[ftmp9] \n\t"
213  "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
214  "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
215  "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
216  "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
217  "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
218  "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
219  "mov.d %[ftmp5], %[ftmp6] \n\t"
220  "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
221  "psrah %[ftmp4], %[ftmp2], %[ftmp8] \n\t"
222  "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
223  "psubh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
224  MMI_LDC1(%[ftmp2], %[block], 0x00)
225  MMI_LDC1(%[ftmp5], %[block], 0x40)
226  "paddh %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
227  "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
228  "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
229  "psubh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
230  "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
231  "paddh %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
232  "psubh %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
233  "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
234  "paddh %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
235  "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
236  "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
237  "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
238  "psubh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
239  "paddh %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
240  "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
241  "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
242  "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
243  "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
244  "psubh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
245  "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
246  MMI_SDC1(%[ftmp6], %[block], 0x00)
247  "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
248  "punpckhhw %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
249  "punpcklhw %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
250  "punpckhhw %[ftmp0], %[ftmp3], %[ftmp1] \n\t"
251  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
252  "punpckhwd %[ftmp1], %[ftmp7], %[ftmp3] \n\t"
253  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
254  "punpckhwd %[ftmp3], %[ftmp6], %[ftmp0] \n\t"
255  "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
256  MMI_LDC1(%[ftmp0], %[block], 0x00)
257  MMI_SDC1(%[ftmp7], $29, 0x00)
258  MMI_SDC1(%[ftmp1], $29, 0x10)
259  "dmfc1 %[tmp1], %[ftmp6] \n\t"
260  "dmfc1 %[tmp3], %[ftmp3] \n\t"
261  "punpckhhw %[ftmp3], %[ftmp5], %[ftmp2] \n\t"
262  "punpcklhw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
263  "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t"
264  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
265  "punpckhwd %[ftmp0], %[ftmp5], %[ftmp4] \n\t"
266  "punpcklwd %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
267  "punpckhwd %[ftmp4], %[ftmp3], %[ftmp2] \n\t"
268  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
269  MMI_SDC1(%[ftmp5], $29, 0x08)
270  MMI_SDC1(%[ftmp0], $29, 0x18)
271  "dmfc1 %[tmp2], %[ftmp3] \n\t"
272  "dmfc1 %[tmp4], %[ftmp4] \n\t"
273  MMI_LDC1(%[ftmp1], %[block], 0x18)
274  MMI_LDC1(%[ftmp6], %[block], 0x28)
275  MMI_LDC1(%[ftmp2], %[block], 0x38)
276  MMI_LDC1(%[ftmp0], %[block], 0x58)
277  MMI_LDC1(%[ftmp3], %[block], 0x68)
278  MMI_LDC1(%[ftmp4], %[block], 0x78)
279  "mov.d %[ftmp7], %[ftmp1] \n\t"
280  "psrah %[ftmp5], %[ftmp0], %[ftmp8] \n\t"
281  "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
282  "paddh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
283  "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
284  "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
285  "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
286  "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
287  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
288  "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
289  "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
290  "psrah %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
291  "paddh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
292  "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
293  "psrah %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
294  "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
295  "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
296  "mov.d %[ftmp4], %[ftmp1] \n\t"
297  "psrah %[ftmp2], %[ftmp5], %[ftmp9] \n\t"
298  "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
299  "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
300  "psrah %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
301  "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
302  "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
303  "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
304  "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
305  "mov.d %[ftmp0], %[ftmp3] \n\t"
306  "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
307  "psrah %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
308  "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
309  "psubh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
310  MMI_LDC1(%[ftmp6], %[block], 0x08)
311  MMI_LDC1(%[ftmp0], %[block], 0x48)
312  "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
313  "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
314  "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
315  "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
316  "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
317  "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
318  "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
319  "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
320  "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
321  "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
322  "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
323  "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
324  "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
325  "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
326  "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
327  "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
328  "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
329  "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
330  "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
331  "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
332  MMI_SDC1(%[ftmp3], %[block], 0x08)
333  "psubh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
334  "punpckhhw %[ftmp3], %[ftmp4], %[ftmp7] \n\t"
335  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
336  "punpckhhw %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
337  "punpcklhw %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
338  "punpckhwd %[ftmp1], %[ftmp4], %[ftmp2] \n\t"
339  "punpcklwd %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
340  "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
341  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
342  MMI_LDC1(%[ftmp7], %[block], 0x08)
343  "dmfc1 %[tmp5], %[ftmp4] \n\t"
344  "mov.d %[ftmp10], %[ftmp1] \n\t"
345  "mov.d %[ftmp12], %[ftmp3] \n\t"
346  "mov.d %[ftmp14], %[ftmp2] \n\t"
347  "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t"
348  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
349  "punpckhhw %[ftmp6], %[ftmp5], %[ftmp7] \n\t"
350  "punpcklhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
351  "punpckhwd %[ftmp7], %[ftmp0], %[ftmp5] \n\t"
352  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
353  "punpckhwd %[ftmp5], %[ftmp2], %[ftmp6] \n\t"
354  "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
355  "dmfc1 %[tmp6], %[ftmp0] \n\t"
356  "mov.d %[ftmp11], %[ftmp7] \n\t"
357  "mov.d %[ftmp13], %[ftmp2] \n\t"
358  "mov.d %[ftmp15], %[ftmp5] \n\t"
359  PTR_ADDIU "%[addr0], %[dst], 0x04 \n\t"
360  "mov.d %[ftmp7], %[ftmp10] \n\t"
361  "dmtc1 %[tmp3], %[ftmp6] \n\t"
362  MMI_LDC1(%[ftmp1], $29, 0x10)
363  "dmtc1 %[tmp1], %[ftmp3] \n\t"
364  "mov.d %[ftmp4], %[ftmp1] \n\t"
365  "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
366  "psrah %[ftmp0], %[ftmp7], %[ftmp8] \n\t"
367  "paddh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
368  "paddh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
369  "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
370  "paddh %[ftmp0], %[ftmp0], %[ftmp14] \n\t"
371  "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
372  "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
373  "psubh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
374  "psubh %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
375  "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
376  "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
377  "psubh %[ftmp7], %[ftmp7], %[ftmp14] \n\t"
378  "psrah %[ftmp5], %[ftmp14], %[ftmp8] \n\t"
379  "psubh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
380  "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
381  "mov.d %[ftmp5], %[ftmp1] \n\t"
382  "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
383  "psrah %[ftmp6], %[ftmp0], %[ftmp9] \n\t"
384  "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
385  "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
386  "psrah %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
387  "psrah %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
388  "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
389  "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
390  "mov.d %[ftmp7], %[ftmp12] \n\t"
391  "psrah %[ftmp2], %[ftmp12], %[ftmp8] \n\t"
392  "psrah %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
393  "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
394  "psubh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
395  MMI_LDC1(%[ftmp3], $29, 0x00)
396  "dmtc1 %[tmp5], %[ftmp7] \n\t"
397  "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
398  "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
399  "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
400  "psubh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
401  "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
402  "paddh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
403  "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
404  "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
405  "paddh %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
406  "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
407  "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
408  "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
409  "psubh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
410  "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
411  "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
412  "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
413  "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
414  "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
415  "psubh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
416  "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
417  MMI_SDC1(%[ftmp3], $29, 0x00)
418  "psubh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
419  MMI_SDC1(%[ftmp0], $29, 0x10)
420  "dmfc1 %[tmp1], %[ftmp2] \n\t"
421  "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
422  MMI_SDC1(%[ftmp2], %[block], 0x00)
423  MMI_SDC1(%[ftmp2], %[block], 0x08)
424  MMI_SDC1(%[ftmp2], %[block], 0x10)
425  MMI_SDC1(%[ftmp2], %[block], 0x18)
426  MMI_SDC1(%[ftmp2], %[block], 0x20)
427  MMI_SDC1(%[ftmp2], %[block], 0x28)
428  MMI_SDC1(%[ftmp2], %[block], 0x30)
429  MMI_SDC1(%[ftmp2], %[block], 0x38)
430  MMI_SDC1(%[ftmp2], %[block], 0x40)
431  MMI_SDC1(%[ftmp2], %[block], 0x48)
432  MMI_SDC1(%[ftmp2], %[block], 0x50)
433  MMI_SDC1(%[ftmp2], %[block], 0x58)
434  MMI_SDC1(%[ftmp2], %[block], 0x60)
435  MMI_SDC1(%[ftmp2], %[block], 0x68)
436  MMI_SDC1(%[ftmp2], %[block], 0x70)
437  MMI_SDC1(%[ftmp2], %[block], 0x78)
438  "dli %[tmp3], 0x06 \n\t"
439  "mtc1 %[tmp3], %[ftmp10] \n\t"
440  MMI_ULWC1(%[ftmp3], %[dst], 0x00)
441  MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
442  "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
443  "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
444  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
445  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
446  "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
447  "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
448  "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
449  "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
450  MMI_SWC1(%[ftmp3], %[dst], 0x00)
451  MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
452  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
453  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
454  MMI_ULWC1(%[ftmp3], %[dst], 0x00)
455  MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
456  "psrah %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
457  "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
458  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
459  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
460  "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
461  "paddh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
462  "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
463  "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
464  MMI_SWC1(%[ftmp3], %[dst], 0x00)
465  MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
466  MMI_LDC1(%[ftmp5], $29, 0x00)
467  MMI_LDC1(%[ftmp4], $29, 0x10)
468  "dmtc1 %[tmp1], %[ftmp6] \n\t"
469  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
470  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
471  MMI_ULWC1(%[ftmp3], %[dst], 0x00)
472  MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
473  "psrah %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
474  "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
475  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
476  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
477  "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
478  "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
479  "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
480  "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
481  MMI_SWC1(%[ftmp3], %[dst], 0x00)
482  MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
483  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
484  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
485  MMI_ULWC1(%[ftmp3], %[dst], 0x00)
486  MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
487  "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
488  "psrah %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
489  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
490  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
491  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
492  "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
493  "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
494  "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
495  MMI_SWC1(%[ftmp3], %[dst], 0x00)
496  MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
497  "dmtc1 %[tmp4], %[ftmp1] \n\t"
498  "dmtc1 %[tmp2], %[ftmp6] \n\t"
499  MMI_LDC1(%[ftmp4], $29, 0x18)
500  "mov.d %[ftmp5], %[ftmp4] \n\t"
501  "psrah %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
502  "psrah %[ftmp7], %[ftmp11], %[ftmp8] \n\t"
503  "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
504  "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
505  "paddh %[ftmp7], %[ftmp7], %[ftmp15] \n\t"
506  "paddh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
507  "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
508  "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
509  "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
510  "psubh %[ftmp3], %[ftmp11], %[ftmp1] \n\t"
511  "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
512  "paddh %[ftmp5], %[ftmp5], %[ftmp15] \n\t"
513  "psubh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
514  "psrah %[ftmp2], %[ftmp15], %[ftmp8] \n\t"
515  "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
516  "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
517  "mov.d %[ftmp2], %[ftmp4] \n\t"
518  "psrah %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
519  "psrah %[ftmp1], %[ftmp7], %[ftmp9] \n\t"
520  "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
521  "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
522  "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
523  "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
524  "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
525  "psubh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
526  "mov.d %[ftmp3], %[ftmp13] \n\t"
527  "psrah %[ftmp0], %[ftmp13], %[ftmp8] \n\t"
528  "psrah %[ftmp7], %[ftmp6], %[ftmp8] \n\t"
529  "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
530  "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
531  MMI_LDC1(%[ftmp6], $29, 0x08)
532  "dmtc1 %[tmp6], %[ftmp3] \n\t"
533  "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
534  "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
535  "paddh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
536  "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
537  "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
538  "paddh %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
539  "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
540  "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
541  "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
542  "psubh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
543  "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
544  "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
545  "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
546  "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
547  "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
548  "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
549  "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
550  "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
551  "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
552  "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
553  MMI_SDC1(%[ftmp6], $29, 0x08)
554  "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
555  MMI_SDC1(%[ftmp7], $29, 0x18)
556  "dmfc1 %[tmp2], %[ftmp0] \n\t"
557  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
558  MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
559  MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
560  "psrah %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
561  "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
562  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
563  "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
564  "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
565  "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
566  "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
567  "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
568  MMI_SWC1(%[ftmp6], %[addr0], 0x00)
569  MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
570  PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
571  PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
572  MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
573  MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
574  "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
575  "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
576  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
577  "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
578  "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
579  "paddh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
580  "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
581  "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
582  MMI_SWC1(%[ftmp6], %[addr0], 0x00)
583  MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
584  MMI_LDC1(%[ftmp2], $29, 0x08)
585  MMI_LDC1(%[ftmp5], $29, 0x18)
586  PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
587  "dmtc1 %[tmp2], %[ftmp1] \n\t"
588  PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
589  MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
590  MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
591  "psrah %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
592  "psrah %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
593  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
594  "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
595  "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
596  "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
597  "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
598  "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
599  MMI_SWC1(%[ftmp6], %[addr0], 0x00)
600  MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
601  PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
602  PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
603  MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
604  MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
605  "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
606  "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
607  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
608  "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
609  "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
610  "paddh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
611  "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
612  "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
613  MMI_SWC1(%[ftmp6], %[addr0], 0x00)
614  MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
615  PTR_ADDIU "$29, $29, 0x20 \n\t"
616  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
617  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
618  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
619  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
620  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
621  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
622  [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
623  [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
624  [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
625  [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
626  [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
627  [tmp6]"=&r"(tmp[6]),
628  RESTRICT_ASM_LOW32
629  RESTRICT_ASM_ADDRT
630  [addr0]"=&r"(addr[0])
631  : [dst]"r"(dst), [block]"r"(block),
632  [stride]"r"((mips_reg)stride)
633  : "$29","memory"
634  );
635 
636 }
637 
639 {
640  int dc = (block[0] + 32) >> 6;
641  double ftmp[6];
642  DECLARE_VAR_LOW32;
643 
644  block[0] = 0;
645 
646  __asm__ volatile (
647  "mtc1 %[dc], %[ftmp5] \n\t"
648  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
649  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
650  MMI_ULWC1(%[ftmp1], %[dst0], 0x00)
651  MMI_ULWC1(%[ftmp2], %[dst1], 0x00)
652  MMI_ULWC1(%[ftmp3], %[dst2], 0x00)
653  MMI_ULWC1(%[ftmp4], %[dst3], 0x00)
654  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
655  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
656  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
657  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
658  "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
659  "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
660  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
661  "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
662  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
663  "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
664  "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
665  "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
666  MMI_SWC1(%[ftmp1], %[dst0], 0x00)
667  MMI_SWC1(%[ftmp2], %[dst1], 0x00)
668  MMI_SWC1(%[ftmp3], %[dst2], 0x00)
669  MMI_SWC1(%[ftmp4], %[dst3], 0x00)
670  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
671  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
672  [ftmp4]"=&f"(ftmp[4]),
673  RESTRICT_ASM_LOW32
674  [ftmp5]"=&f"(ftmp[5])
675  : [dst0]"r"(dst), [dst1]"r"(dst+stride),
676  [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
677  [dc]"r"(dc)
678  : "memory"
679  );
680 }
681 
683 {
684  int dc = (block[0] + 32) >> 6;
685  double ftmp[10];
686  DECLARE_VAR_ALL64;
687 
688  block[0] = 0;
689 
690  __asm__ volatile (
691  "mtc1 %[dc], %[ftmp5] \n\t"
692  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
693  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
694  MMI_LDC1(%[ftmp1], %[dst0], 0x00)
695  MMI_LDC1(%[ftmp2], %[dst1], 0x00)
696  MMI_LDC1(%[ftmp3], %[dst2], 0x00)
697  MMI_LDC1(%[ftmp4], %[dst3], 0x00)
698  "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
699  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
700  "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
701  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
702  "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t"
703  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
704  "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t"
705  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
706  "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
707  "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
708  "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
709  "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
710  "paddsh %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
711  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
712  "paddsh %[ftmp9], %[ftmp9], %[ftmp5] \n\t"
713  "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
714  "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
715  "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
716  "packushb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
717  "packushb %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
718  MMI_SDC1(%[ftmp1], %[dst0], 0x00)
719  MMI_SDC1(%[ftmp2], %[dst1], 0x00)
720  MMI_SDC1(%[ftmp3], %[dst2], 0x00)
721  MMI_SDC1(%[ftmp4], %[dst3], 0x00)
722 
723  MMI_LDC1(%[ftmp1], %[dst4], 0x00)
724  MMI_LDC1(%[ftmp2], %[dst5], 0x00)
725  MMI_LDC1(%[ftmp3], %[dst6], 0x00)
726  MMI_LDC1(%[ftmp4], %[dst7], 0x00)
727  "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
728  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
729  "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
730  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
731  "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t"
732  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
733  "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t"
734  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
735  "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
736  "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
737  "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
738  "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
739  "paddsh %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
740  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
741  "paddsh %[ftmp9], %[ftmp9], %[ftmp5] \n\t"
742  "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
743  "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
744  "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
745  "packushb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
746  "packushb %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
747  MMI_SDC1(%[ftmp1], %[dst4], 0x00)
748  MMI_SDC1(%[ftmp2], %[dst5], 0x00)
749  MMI_SDC1(%[ftmp3], %[dst6], 0x00)
750  MMI_SDC1(%[ftmp4], %[dst7], 0x00)
751  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
752  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
753  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
754  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
755  [ftmp8]"=&f"(ftmp[8]),
756  RESTRICT_ASM_ALL64
757  [ftmp9]"=&f"(ftmp[9])
758  : [dst0]"r"(dst), [dst1]"r"(dst+stride),
759  [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
760  [dst4]"r"(dst+4*stride), [dst5]"r"(dst+5*stride),
761  [dst6]"r"(dst+6*stride), [dst7]"r"(dst+7*stride),
762  [dc]"r"(dc)
763  : "memory"
764  );
765 }
766 
767 void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
768  int16_t *block, int stride, const uint8_t nnzc[15*8])
769 {
770  int i;
771  for(i=0; i<16; i++){
772  int nnz = nnzc[ scan8[i] ];
773  if(nnz){
774  if(nnz==1 && ((int16_t*)block)[i*16])
775  ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
776  stride);
777  else
778  ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16,
779  stride);
780  }
781  }
782 }
783 
784 void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
785  int16_t *block, int stride, const uint8_t nnzc[15*8])
786 {
787  int i;
788  for(i=0; i<16; i++){
789  if(nnzc[ scan8[i] ])
790  ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16, stride);
791  else if(((int16_t*)block)[i*16])
792  ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
793  stride);
794  }
795 }
796 
797 void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
798  int16_t *block, int stride, const uint8_t nnzc[15*8])
799 {
800  int i;
801  for(i=0; i<16; i+=4){
802  int nnz = nnzc[ scan8[i] ];
803  if(nnz){
804  if(nnz==1 && ((int16_t*)block)[i*16])
805  ff_h264_idct8_dc_add_8_mmi(dst + block_offset[i],
806  block + i*16, stride);
807  else
808  ff_h264_idct8_add_8_mmi(dst + block_offset[i], block + i*16,
809  stride);
810  }
811  }
812 }
813 
814 void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
815  int16_t *block, int stride, const uint8_t nnzc[15*8])
816 {
817  int i, j;
818  for(j=1; j<3; j++){
819  for(i=j*16; i<j*16+4; i++){
820  if(nnzc[ scan8[i] ])
821  ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
822  block + i*16, stride);
823  else if(((int16_t*)block)[i*16])
824  ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
825  block + i*16, stride);
826  }
827  }
828 }
829 
830 void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
831  int16_t *block, int stride, const uint8_t nnzc[15*8])
832 {
833  int i, j;
834 
835  for(j=1; j<3; j++){
836  for(i=j*16; i<j*16+4; i++){
837  if(nnzc[ scan8[i] ])
838  ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
839  block + i*16, stride);
840  else if(((int16_t*)block)[i*16])
841  ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
842  block + i*16, stride);
843  }
844  }
845 
846  for(j=1; j<3; j++){
847  for(i=j*16+4; i<j*16+8; i++){
848  if(nnzc[ scan8[i+4] ])
849  ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i+4],
850  block + i*16, stride);
851  else if(((int16_t*)block)[i*16])
852  ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i+4],
853  block + i*16, stride);
854  }
855  }
856 }
857 
859  int qmul)
860 {
/*
 * The leading line of this signature is not part of this listing; the
 * operands used below are input, output and qmul.
 *
 * Loads sixteen halfwords from input (four 8-byte loads), runs two
 * add/subtract butterfly stages with halfword/word interleaves in between,
 * scales the results by qmul and writes sixteen halfwords to output at byte
 * offsets 0x00, 0x20, ..., 0x1e0.
 *
 * Two scaling paths exist, selected by comparing qmul with 0x7fff:
 *  - qmul <= 0x7fff: 0x80 << 16 is added to qmul, each value is interleaved
 *    with ff_pw_1 and fed to pmaddhw, then shifted right by 8. Assuming
 *    ff_pw_1 holds halfwords equal to 1 (defined elsewhere, TODO confirm),
 *    this evaluates (value * qmul + 128) >> 8 per element.
 *  - qmul >  0x7fff (label 1): qmul is shifted right first and the final
 *    arithmetic shift count is taken from ftmp6 instead of ftmp8.
 *
 * input and qmul are read-write operands: both registers are reused as
 * scratch inside the asm. The whole sequence runs under ".set noreorder",
 * so the instruction following each branch or jump is its delay slot.
 */
861  double ftmp[10];
862  uint64_t tmp[2];
863  DECLARE_VAR_ALL64;
864 
865  __asm__ volatile (
866  ".set noreorder \n\t"
/* ftmp8 = 8 and ftmp9 = 0x20 are shift counts used further down;
 * their setup is interleaved with the four row loads. */
867  "dli %[tmp0], 0x08 \n\t"
868  MMI_LDC1(%[ftmp3], %[input], 0x18)
869  "mtc1 %[tmp0], %[ftmp8] \n\t"
870  MMI_LDC1(%[ftmp2], %[input], 0x10)
871  "dli %[tmp0], 0x20 \n\t"
872  MMI_LDC1(%[ftmp1], %[input], 0x08)
873  "mtc1 %[tmp0], %[ftmp9] \n\t"
874  MMI_LDC1(%[ftmp0], %[input], 0x00)
/* First butterfly stage: sums and differences of the loaded rows. */
875  "mov.d %[ftmp4], %[ftmp3] \n\t"
876  "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
877  "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
878  "mov.d %[ftmp4], %[ftmp1] \n\t"
879  "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
880  "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
881  "mov.d %[ftmp4], %[ftmp3] \n\t"
882  "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
883  "psubh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
884  "mov.d %[ftmp4], %[ftmp2] \n\t"
885  "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
886  "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
/* Halfword then word interleaves (the usual 4x4 transpose pattern). */
887  "mov.d %[ftmp4], %[ftmp3] \n\t"
888  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
889  "punpckhhw %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
890  "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
891  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
892  "punpckhwd %[ftmp2], %[ftmp3], %[ftmp0] \n\t"
893  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
894  "mov.d %[ftmp0], %[ftmp4] \n\t"
895  "punpcklwd %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
896  "punpckhwd %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* Second butterfly stage on the interleaved data. */
897  "mov.d %[ftmp1], %[ftmp0] \n\t"
898  "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
899  "psubh %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
900  "mov.d %[ftmp1], %[ftmp2] \n\t"
901  "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
902  "psubh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
903  "mov.d %[ftmp1], %[ftmp0] \n\t"
904  "paddh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
905  "psubh %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
906  "mov.d %[ftmp1], %[ftmp4] \n\t"
/* tmp0 = qmul - 0x7fff; a positive result branches to label 1. The psubh
 * right after bgtz sits in the branch delay slot, so it runs on both
 * paths. */
907  "daddi %[tmp0], %[qmul], -0x7fff \n\t"
908  "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
909  "bgtz %[tmp0], 1f \n\t"
910  "psubh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
/* Small-qmul path: qmul += 0x80 << 16, then the word is duplicated into
 * both halves of ftmp7 as the pmaddhw multiplier. */
911  "ori %[tmp0], $0, 0x80 \n\t"
912  "dsll %[tmp0], %[tmp0], 0x10 \n\t"
913  "punpckhhw %[ftmp1], %[ftmp0], %[ff_pw_1] \n\t"
914  "daddu %[qmul], %[qmul], %[tmp0] \n\t"
915  "punpcklhw %[ftmp0], %[ftmp0], %[ff_pw_1] \n\t"
916  "punpckhhw %[ftmp5], %[ftmp2], %[ff_pw_1] \n\t"
917  "punpcklhw %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t"
918  "mtc1 %[qmul], %[ftmp7] \n\t"
919  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
920  "pmaddhw %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
921  "pmaddhw %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
922  "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
923  "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
/* Shift right by 8 (ftmp8) and pack the words back to halfwords. */
924  "psraw %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
925  "psraw %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
926  "psraw %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
927  "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
928  "packsswh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
929  "packsswh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
/* Move results to integer registers and store them one halfword at a
 * time; the input register is used as a second scratch register. */
930  "dmfc1 %[tmp1], %[ftmp0] \n\t"
931  "dsrl %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
932  "mfc1 %[input], %[ftmp0] \n\t"
933  "sh %[tmp1], 0x00(%[output]) \n\t"
934  "sh %[input], 0x80(%[output]) \n\t"
935  "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
936  PTR_SRL "%[input], %[input], 0x10 \n\t"
937  "sh %[tmp1], 0x20(%[output]) \n\t"
938  "sh %[input], 0xa0(%[output]) \n\t"
939  "dmfc1 %[tmp1], %[ftmp2] \n\t"
940  "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
941  "mfc1 %[input], %[ftmp2] \n\t"
942  "sh %[tmp1], 0x40(%[output]) \n\t"
943  "sh %[input], 0xc0(%[output]) \n\t"
944  "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
945  PTR_SRL "%[input], %[input], 0x10 \n\t"
946  "sh %[tmp1], 0x60(%[output]) \n\t"
947  "sh %[input], 0xe0(%[output]) \n\t"
/* Same multiply/shift/pack/store sequence for the remaining two rows;
 * these go to offsets 0x100..0x1e0. */
948  "punpckhhw %[ftmp1], %[ftmp3], %[ff_pw_1] \n\t"
949  "punpcklhw %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t"
950  "punpckhhw %[ftmp5], %[ftmp4], %[ff_pw_1] \n\t"
951  "punpcklhw %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t"
952  "mtc1 %[qmul], %[ftmp7] \n\t"
953  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
954  "pmaddhw %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
955  "pmaddhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
956  "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
957  "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
958  "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
959  "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
960  "psraw %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
961  "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
962  "packsswh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
963  "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
964  "dmfc1 %[tmp1], %[ftmp3] \n\t"
965  "dsrl %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
966  "mfc1 %[input], %[ftmp3] \n\t"
967  "sh %[tmp1], 0x100(%[output]) \n\t"
968  "sh %[input], 0x180(%[output]) \n\t"
969  "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
970  PTR_SRL "%[input], %[input], 0x10 \n\t"
971  "sh %[tmp1], 0x120(%[output]) \n\t"
972  "sh %[input], 0x1a0(%[output]) \n\t"
973  "dmfc1 %[tmp1], %[ftmp4] \n\t"
974  "dsrl %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
975  "mfc1 %[input], %[ftmp4] \n\t"
976  "sh %[tmp1], 0x140(%[output]) \n\t"
977  "sh %[input], 0x1c0(%[output]) \n\t"
978  "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
979  PTR_SRL "%[input], %[input], 0x10 \n\t"
980  "sh %[tmp1], 0x160(%[output]) \n\t"
/* Skip the large-qmul path; the last store runs in the jump delay slot. */
981  "j 2f \n\t"
982  "sh %[input], 0x1e0(%[output]) \n\t"
/* Large-qmul path (qmul > 0x7fff). tmp1 becomes 0x1f - clz(qmul); the
 * following instructions derive the right shift applied to
 * qmul + (0x80 << 16) and the final psraw count placed in ftmp6.
 * NOTE(review): when HAVE_LOONGSON3 is not set, the HAVE_LOONGSON2 branch
 * below is empty, so tmp1 is read by the dsubu without having been written
 * on this path -- confirm whether Loongson2 builds are expected to work. */
983  "1: \n\t"
984  "ori %[tmp0], $0, 0x1f \n\t"
985 #if HAVE_LOONGSON3
986  "clz %[tmp1], %[qmul] \n\t"
987 #elif HAVE_LOONGSON2
988 #endif
989  "ori %[input], $0, 0x07 \n\t"
990  "dsubu %[tmp1], %[tmp0], %[tmp1] \n\t"
991  "ori %[tmp0], $0, 0x80 \n\t"
992  "dsll %[tmp0], %[tmp0], 0x10 \n\t"
993  "daddu %[qmul], %[qmul], %[tmp0] \n\t"
994  "dsubu %[tmp0], %[tmp1], %[input] \n\t"
/* movn replaces tmp1 with 7 whenever tmp1 - 7 is non-zero. */
995  "movn %[tmp1], %[input], %[tmp0] \n\t"
996  PTR_ADDIU "%[input], %[input], 0x01 \n\t"
997  "andi %[tmp0], %[tmp1], 0xff \n\t"
998  "srlv %[qmul], %[qmul], %[tmp0] \n\t"
/* ftmp6 = 8 - tmp1: the remaining shift applied after the multiply. */
999  PTR_SUBU "%[input], %[input], %[tmp1] \n\t"
1000  "mtc1 %[input], %[ftmp6] \n\t"
1001  "punpckhhw %[ftmp1], %[ftmp0], %[ff_pw_1] \n\t"
1002  "punpcklhw %[ftmp0], %[ftmp0], %[ff_pw_1] \n\t"
1003  "punpckhhw %[ftmp5], %[ftmp2], %[ff_pw_1] \n\t"
1004  "punpcklhw %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t"
1005  "mtc1 %[qmul], %[ftmp7] \n\t"
1006  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1007  "pmaddhw %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
1008  "pmaddhw %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
1009  "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1010  "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1011  "psraw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
1012  "psraw %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1013  "psraw %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1014  "psraw %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1015  "packsswh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
1016  "packsswh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
/* Stores for the first two rows, same offsets as on the small-qmul path. */
1017  "dmfc1 %[tmp1], %[ftmp0] \n\t"
1018  "dsrl %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
1019  "sh %[tmp1], 0x00(%[output]) \n\t"
1020  "mfc1 %[input], %[ftmp0] \n\t"
1021  "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
1022  "sh %[input], 0x80(%[output]) \n\t"
1023  "sh %[tmp1], 0x20(%[output]) \n\t"
1024  PTR_SRL "%[input], %[input], 0x10 \n\t"
1025  "dmfc1 %[tmp1], %[ftmp2] \n\t"
1026  "sh %[input], 0xa0(%[output]) \n\t"
1027  "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
1028  "sh %[tmp1], 0x40(%[output]) \n\t"
1029  "mfc1 %[input], %[ftmp2] \n\t"
1030  "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
1031  "sh %[input], 0xc0(%[output]) \n\t"
1032  "sh %[tmp1], 0x60(%[output]) \n\t"
1033  PTR_SRL "%[input], %[input], 0x10 \n\t"
1034  "sh %[input], 0xe0(%[output]) \n\t"
/* Remaining two rows on the large-qmul path. */
1035  "punpckhhw %[ftmp1], %[ftmp3], %[ff_pw_1] \n\t"
1036  "punpcklhw %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t"
1037  "punpckhhw %[ftmp5], %[ftmp4], %[ff_pw_1] \n\t"
1038  "punpcklhw %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t"
1039  "mtc1 %[qmul], %[ftmp7] \n\t"
1040  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1041  "pmaddhw %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1042  "pmaddhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
1043  "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1044  "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1045  "psraw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
1046  "psraw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1047  "psraw %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1048  "psraw %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1049  "packsswh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
1050  "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1051  "dmfc1 %[tmp1], %[ftmp3] \n\t"
1052  "dsrl %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
1053  "mfc1 %[input], %[ftmp3] \n\t"
1054  "sh %[tmp1], 0x100(%[output]) \n\t"
1055  "sh %[input], 0x180(%[output]) \n\t"
1056  "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
1057  PTR_SRL "%[input], %[input], 0x10 \n\t"
1058  "sh %[tmp1], 0x120(%[output]) \n\t"
1059  "sh %[input], 0x1a0(%[output]) \n\t"
1060  "dmfc1 %[tmp1], %[ftmp4] \n\t"
1061  "dsrl %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
1062  "mfc1 %[input], %[ftmp4] \n\t"
1063  "sh %[tmp1], 0x140(%[output]) \n\t"
1064  "sh %[input], 0x1c0(%[output]) \n\t"
1065  "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
1066  PTR_SRL "%[input], %[input], 0x10 \n\t"
1067  "sh %[tmp1], 0x160(%[output]) \n\t"
1068  "sh %[input], 0x1e0(%[output]) \n\t"
/* Common exit for both paths. */
1069  "2: \n\t"
1070  ".set reorder \n\t"
1071  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1072  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1073  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1074  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1075  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1076  [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
1077  RESTRICT_ASM_ALL64
1078  [output]"+&r"(output), [input]"+&r"(input),
1079  [qmul]"+&r"(qmul)
1080  : [ff_pw_1]"f"(ff_pw_1)
1081  : "memory"
1082  );
1083 }
1084 
/**
 * In-place transform and dequantisation of eight DC coefficients stored at
 * block[0], block[16], ..., block[112].
 *
 * Stage one forms the sum and difference of each adjacent pair
 * (block[32*k] and block[32*k + 16]); stage two combines the four sums and
 * the four differences with add/subtract patterns. Every result is scaled
 * as (value * qmul + 128) >> 8 and written back.
 *
 * All intermediate arithmetic is unsigned so that large coefficients or a
 * large qmul wrap modulo 2^32 instead of triggering signed-overflow
 * undefined behaviour; the product is converted back to int only for the
 * final arithmetic shift.
 *
 * The signature line is absent from this listing and is restored here to
 * match the declaration in h264dsp_mips.h.
 *
 * @param block coefficient array; indices 0..112 in steps of 16 are
 *              read and overwritten
 * @param qmul  dequantisation multiplier
 */
void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
{
    /* Output slot for each t[] entry, in the order the results are formed. */
    static const uint8_t out_pos[8] = { 0, 32, 64, 96, 16, 48, 80, 112 };
    unsigned temp[8];
    unsigned t[8];
    int i;

    /* Pairwise sum (even slot) and difference (odd slot). */
    for (i = 0; i < 4; i++) {
        const unsigned lo = (unsigned)block[32 * i];
        const unsigned hi = (unsigned)block[32 * i + 16];

        temp[2 * i]     = lo + hi;
        temp[2 * i + 1] = lo - hi;
    }

    /* Four-point combination, first over the sums, then the differences. */
    for (i = 0; i < 2; i++) {
        const unsigned a = temp[0 + i];
        const unsigned b = temp[2 + i];
        const unsigned c = temp[4 + i];
        const unsigned d = temp[6 + i];

        t[4 * i + 0] = a + c + b + d;
        t[4 * i + 1] = a - c + b - d;
        t[4 * i + 2] = a - c - b + d;
        t[4 * i + 3] = a + c - b - d;
    }

    for (i = 0; i < 8; i++)
        block[out_pos[i]] = (int)(t[i] * (unsigned)qmul + 128U) >> 8;
}
1117 
/**
 * In-place 2x2 transform and dequantisation of the four DC coefficients
 * stored at block[0], block[16], block[32] and block[48].
 *
 * Sums and differences of the two pairs are combined and each result is
 * scaled as (value * qmul) >> 7.
 *
 * The arithmetic is done in unsigned so an oversized product wraps modulo
 * 2^32 rather than invoking signed-overflow undefined behaviour; the value
 * is converted back to int only for the arithmetic shift.
 *
 * The signature line is absent from this listing and is restored here to
 * match the declaration in h264dsp_mips.h.
 *
 * @param block coefficient array; indices 0, 16, 32 and 48 are read and
 *              overwritten
 * @param qmul  dequantisation multiplier
 */
void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
{
    const unsigned q   = (unsigned)qmul;
    const unsigned c0  = (unsigned)block[0];
    const unsigned c1  = (unsigned)block[16];
    const unsigned c2  = (unsigned)block[32];
    const unsigned c3  = (unsigned)block[48];
    const unsigned sum_top = c0 + c1;   /* a in the original naming */
    const unsigned dif_top = c0 - c1;   /* d */
    const unsigned sum_bot = c2 + c3;   /* c */
    const unsigned dif_bot = c2 - c3;   /* b */

    block[0]  = (int)((sum_top + sum_bot) * q) >> 7;
    block[16] = (int)((dif_top + dif_bot) * q) >> 7;
    block[32] = (int)((sum_top - sum_bot) * q) >> 7;
    block[48] = (int)((dif_top - dif_bot) * q) >> 7;
}
1131 
1133  int log2_denom, int weight, int offset)
1134 {
/*
 * The leading line of this signature is not part of this listing; the body
 * uses block, stride and height in addition to the parameters shown above.
 *
 * In-place weighting of rows that are 16 bytes wide. offset is pre-scaled
 * by log2_denom and, when log2_denom is non-zero, gets the rounding term
 * 1 << (log2_denom - 1) added. For each of the height rows the asm loads
 * 8 bytes at block and 8 bytes at block + 8, widens the bytes to halfwords,
 * multiplies by weight, adds offset with signed saturation, shifts right
 * arithmetically by log2_denom, packs back to bytes with unsigned
 * saturation and stores the result over the source bytes. block advances
 * by stride after every row.
 */
1135  int y;
1136  double ftmp[8];
1137  DECLARE_VAR_ALL64;
1138 
1139  offset <<= log2_denom;
1140 
1141  if (log2_denom)
1142  offset += 1 << (log2_denom - 1);
1143 
1144  for (y=0; y<height; y++, block+=stride) {
1145  __asm__ volatile (
/* ftmp0 = 0: zero source for byte widening and pshufh selector. */
1146  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1147  MMI_LDC1(%[ftmp1], %[block0], 0x00)
1148  MMI_LDC1(%[ftmp2], %[block1], 0x00)
1149  "mtc1 %[weight], %[ftmp3] \n\t"
1150  "mtc1 %[offset], %[ftmp4] \n\t"
1151  "mtc1 %[log2_denom], %[ftmp5] \n\t"
/* pshufh with a zero selector copies halfword 0 into all four lanes. */
1152  "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1153  "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
/* Widen the high and low four bytes of each 8-byte half to halfwords. */
1154  "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
1155  "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
1156  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1157  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
/* pixel * weight (low 16 bits of each product). */
1158  "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
1159  "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1160  "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1161  "pmullh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
/* + offset, saturating. */
1162  "paddsh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
1163  "paddsh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
1164  "paddsh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
1165  "paddsh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
/* >> log2_denom, arithmetic. */
1166  "psrah %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
1167  "psrah %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1168  "psrah %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1169  "psrah %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
/* Pack low/high halves back to 8 bytes each and store in place. */
1170  "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1171  "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
1172  MMI_SDC1(%[ftmp1], %[block0], 0x00)
1173  MMI_SDC1(%[ftmp2], %[block1], 0x00)
1174  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1175  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1176  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1177  [ftmp6]"=&f"(ftmp[6]),
1178  RESTRICT_ASM_ALL64
1179  [ftmp7]"=&f"(ftmp[7])
1180  : [block0]"r"(block), [block1]"r"(block+8),
1181  [weight]"r"(weight), [offset]"r"(offset),
1182  [log2_denom]"r"(log2_denom)
1183  : "memory"
1184  );
1185  }
1186 }
1187 
1189  ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
1190  int offset)
1191 {
/*
 * The leading line of this signature is not part of this listing; the body
 * uses dst and src in addition to the parameters shown above.
 *
 * Bi-directional weighting of rows that are 16 bytes wide, written to dst.
 * offset is replaced by ((offset + 1) | 1) << log2_denom and the shift
 * count handed to the asm is log2_denom + 1. For each of the height rows
 * and for each 8-byte half (offsets 0 and 8) the asm computes
 *     (src * weights + offset + dst * weightd) >> (log2_denom + 1)
 * on halfwords with saturating adds, packs the result to bytes with
 * unsigned saturation and stores it to dst. dst and src both advance by
 * stride after every row.
 */
1192  int y;
1193  double ftmp[9];
1194  DECLARE_VAR_ALL64;
1195 
1196  offset = ((offset + 1) | 1) << log2_denom;
1197 
1198  for (y=0; y<height; y++, dst+=stride, src+=stride) {
1199  __asm__ volatile (
/* ftmp0 = 0: zero source for byte widening and pshufh selector. */
1200  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1201  MMI_LDC1(%[ftmp1], %[src0], 0x00)
1202  MMI_LDC1(%[ftmp2], %[dst0], 0x00)
1203  "mtc1 %[weights], %[ftmp3] \n\t"
1204  "mtc1 %[weightd], %[ftmp4] \n\t"
1205  "mtc1 %[offset], %[ftmp5] \n\t"
1206  "mtc1 %[log2_denom], %[ftmp6] \n\t"
/* Replicate halfword 0 of each constant into all four lanes. */
1207  "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1208  "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1209  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
/* First 8-byte half: widen, weight, accumulate, shift, pack, store. */
1210  "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t"
1211  "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
1212  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1213  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1214  "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1215  "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
1216  "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1217  "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1218  "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1219  "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1220  "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1221  "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1222  "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1223  "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1224  "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1225  MMI_SDC1(%[ftmp1], %[dst0], 0x00)
/* Second 8-byte half (src + 8, dst + 8); the broadcast constants in
 * ftmp3..ftmp6 are reused. */
1226  MMI_LDC1(%[ftmp1], %[src1], 0x00)
1227  MMI_LDC1(%[ftmp2], %[dst1], 0x00)
1228  "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t"
1229  "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
1230  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1231  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1232  "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1233  "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
1234  "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1235  "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1236  "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1237  "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1238  "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1239  "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1240  "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1241  "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1242  "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1243  MMI_SDC1(%[ftmp1], %[dst1], 0x00)
1244  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1245  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1246  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1247  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1248  RESTRICT_ASM_ALL64
1249  [ftmp8]"=&f"(ftmp[8])
1250  : [dst0]"r"(dst), [dst1]"r"(dst+8),
1251  [src0]"r"(src), [src1]"r"(src+8),
1252  [weights]"r"(weights), [weightd]"r"(weightd),
/* The shift count operand is log2_denom + 1, not log2_denom. */
1253  [offset]"r"(offset), [log2_denom]"r"(log2_denom+1)
1254  : "memory"
1255  );
1256  }
1257 }
1258 
1260  int log2_denom, int weight, int offset)
1261 {
/*
 * The leading line of this signature is not part of this listing; the body
 * uses block, stride and height in addition to the parameters shown above.
 *
 * In-place weighting of rows that are 8 bytes wide. offset is pre-scaled
 * by log2_denom and, when log2_denom is non-zero, gets the rounding term
 * 1 << (log2_denom - 1) added. Per row: one 8-byte load from block, bytes
 * widened to halfwords, multiplied by weight, offset added with signed
 * saturation, arithmetic shift right by log2_denom, packed to bytes with
 * unsigned saturation and stored back to block. block advances by stride
 * after every row, for height rows.
 */
1262  int y;
1263  double ftmp[6];
1264  DECLARE_VAR_ALL64;
1265 
1266  offset <<= log2_denom;
1267 
1268  if (log2_denom)
1269  offset += 1 << (log2_denom - 1);
1270 
1271  for (y=0; y<height; y++, block+=stride) {
1272  __asm__ volatile (
/* ftmp0 = 0: zero source for byte widening and pshufh selector. */
1273  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1274  MMI_LDC1(%[ftmp1], %[block], 0x00)
1275  "mtc1 %[weight], %[ftmp2] \n\t"
1276  "mtc1 %[offset], %[ftmp3] \n\t"
1277  "mtc1 %[log2_denom], %[ftmp5] \n\t"
/* Replicate halfword 0 of weight and offset into all four lanes. */
1278  "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1279  "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
/* High four bytes go to ftmp4, low four stay in ftmp1. */
1280  "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
1281  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1282  "pmullh %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
1283  "pmullh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1284  "paddsh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
1285  "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1286  "psrah %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1287  "psrah %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1288  "packushb %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
1289  MMI_SDC1(%[ftmp1], %[block], 0x00)
1290  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1291  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1292  [ftmp4]"=&f"(ftmp[4]),
1293  RESTRICT_ASM_ALL64
1294  [ftmp5]"=&f"(ftmp[5])
1295  : [block]"r"(block), [weight]"r"(weight),
1296  [offset]"r"(offset), [log2_denom]"r"(log2_denom)
1297  : "memory"
1298  );
1299  }
1300 }
1301 
1303  ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
1304  int offset)
1305 {
/*
 * The leading line of this signature is not part of this listing; the body
 * uses dst and src in addition to the parameters shown above.
 *
 * Bi-directional weighting of rows that are 8 bytes wide, written to dst.
 * offset is replaced by ((offset + 1) | 1) << log2_denom and the shift
 * count handed to the asm is log2_denom + 1. Per row the asm computes
 *     (src * weights + offset + dst * weightd) >> (log2_denom + 1)
 * on halfwords with saturating adds, packs to bytes with unsigned
 * saturation and stores 8 bytes to dst. dst and src advance by stride
 * after every row, for height rows.
 */
1306  int y;
1307  double ftmp[9];
1308  DECLARE_VAR_ALL64;
1309 
1310  offset = ((offset + 1) | 1) << log2_denom;
1311 
1312  for (y=0; y<height; y++, dst+=stride, src+=stride) {
1313  __asm__ volatile (
/* ftmp0 = 0: zero source for byte widening and pshufh selector. */
1314  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1315  MMI_LDC1(%[ftmp1], %[src], 0x00)
1316  MMI_LDC1(%[ftmp2], %[dst], 0x00)
1317  "mtc1 %[weights], %[ftmp3] \n\t"
1318  "mtc1 %[weightd], %[ftmp4] \n\t"
1319  "mtc1 %[offset], %[ftmp5] \n\t"
1320  "mtc1 %[log2_denom], %[ftmp6] \n\t"
/* Replicate halfword 0 of each constant into all four lanes. */
1321  "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1322  "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1323  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
/* ftmp7/ftmp1 hold the high/low src halfwords, ftmp8/ftmp2 those of dst. */
1324  "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t"
1325  "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
1326  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1327  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1328  "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1329  "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
1330  "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1331  "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1332  "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1333  "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1334  "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1335  "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1336  "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1337  "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1338  "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1339  MMI_SDC1(%[ftmp1], %[dst], 0x00)
1340  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1341  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1342  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1343  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1344  RESTRICT_ASM_ALL64
1345  [ftmp8]"=&f"(ftmp[8])
1346  : [dst]"r"(dst), [src]"r"(src),
1347  [weights]"r"(weights), [weightd]"r"(weightd),
/* The shift count operand is log2_denom + 1, not log2_denom. */
1348  [offset]"r"(offset), [log2_denom]"r"(log2_denom+1)
1349  : "memory"
1350  );
1351  }
1352 }
1353 
1355  int log2_denom, int weight, int offset)
1356 {
/*
 * The leading line of this signature is not part of this listing; the body
 * uses block, stride and height in addition to the parameters shown above.
 *
 * In-place weighting of narrow rows. offset is pre-scaled by log2_denom
 * and, when log2_denom is non-zero, gets the rounding term
 * 1 << (log2_denom - 1) added. Per row the data is loaded with MMI_ULWC1
 * and stored with MMI_SWC1 (project macros defined elsewhere; presumably a
 * 32-bit unaligned load and a 32-bit store, i.e. 4 pixels -- TODO confirm).
 * Only the low four bytes are widened to halfwords; they are multiplied by
 * weight, offset is added with signed saturation, the sum is shifted right
 * arithmetically by log2_denom and packed back to bytes with unsigned
 * saturation. block advances by stride after every row, for height rows.
 */
1357  int y;
1358  double ftmp[5];
1359  DECLARE_VAR_LOW32;
1360 
1361  offset <<= log2_denom;
1362 
1363  if (log2_denom)
1364  offset += 1 << (log2_denom - 1);
1365 
1366  for (y=0; y<height; y++, block+=stride) {
1367  __asm__ volatile (
/* ftmp0 = 0: zero source for byte widening, pshufh selector and the
 * upper half of the final pack. */
1368  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1369  MMI_ULWC1(%[ftmp1], %[block], 0x00)
1370  "mtc1 %[weight], %[ftmp2] \n\t"
1371  "mtc1 %[offset], %[ftmp3] \n\t"
1372  "mtc1 %[log2_denom], %[ftmp4] \n\t"
/* Replicate halfword 0 of weight and offset into all four lanes. */
1373  "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1374  "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1375  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1376  "pmullh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1377  "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1378  "psrah %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
1379  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1380  MMI_SWC1(%[ftmp1], %[block], 0x00)
1381  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1382  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1383  RESTRICT_ASM_LOW32
1384  [ftmp4]"=&f"(ftmp[4])
1385  : [block]"r"(block), [weight]"r"(weight),
1386  [offset]"r"(offset), [log2_denom]"r"(log2_denom)
1387  : "memory"
1388  );
1389  }
1390 }
1391 
1393  ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
1394  int offset)
1395 {
/*
 * The leading line of this signature is not part of this listing; the body
 * uses dst and src in addition to the parameters shown above.
 *
 * Bi-directional weighting of narrow rows, written to dst. offset is
 * replaced by ((offset + 1) | 1) << log2_denom and the shift count handed
 * to the asm is log2_denom + 1. Per row src and dst are loaded with
 * MMI_ULWC1 and the result is stored with MMI_SWC1 (project macros defined
 * elsewhere; presumably 32-bit accesses, i.e. 4 pixels -- TODO confirm).
 * The low four bytes of each are widened to halfwords and combined as
 *     (src * weights + offset + dst * weightd) >> (log2_denom + 1)
 * with saturating adds, then packed to bytes with unsigned saturation.
 * dst and src advance by stride after every row, for height rows.
 *
 * Note that the asm operand named "weight" is bound to the C variable
 * weights in the input list.
 */
1396  int y;
1397  double ftmp[7];
1398  DECLARE_VAR_LOW32;
1399 
1400  offset = ((offset + 1) | 1) << log2_denom;
1401 
1402  for (y=0; y<height; y++, dst+=stride, src+=stride) {
1403  __asm__ volatile (
/* ftmp0 = 0: zero source for byte widening, pshufh selector and the
 * upper half of the final pack. */
1404  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1405  MMI_ULWC1(%[ftmp1], %[src], 0x00)
1406  MMI_ULWC1(%[ftmp2], %[dst], 0x00)
1407  "mtc1 %[weight], %[ftmp3] \n\t"
1408  "mtc1 %[weightd], %[ftmp4] \n\t"
1409  "mtc1 %[offset], %[ftmp5] \n\t"
1410  "mtc1 %[log2_denom], %[ftmp6] \n\t"
/* Replicate halfword 0 of each constant into all four lanes. */
1411  "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1412  "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1413  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1414  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1415  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1416  "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1417  "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1418  "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1419  "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1420  "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1421  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1422  MMI_SWC1(%[ftmp1], %[dst], 0x00)
1423  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1424  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1425  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1426  RESTRICT_ASM_LOW32
1427  [ftmp6]"=&f"(ftmp[6])
1428  : [dst]"r"(dst), [src]"r"(src),
1429  [weight]"r"(weights), [weightd]"r"(weightd),
/* The shift count operand is log2_denom + 1, not log2_denom. */
1430  [offset]"r"(offset), [log2_denom]"r"(log2_denom+1)
1431  : "memory"
1432  );
1433  }
1434 }
1435 
1436 void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
1437  int8_t *tc0)
1438 {
1439  double ftmp[12];
1440  mips_reg addr[2];
1441  DECLARE_VAR_LOW32;
1442  DECLARE_VAR_ALL64;
1443  DECLARE_VAR_ADDRT;
1444 
1445  __asm__ volatile (
1446  PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
1447  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1448  PTR_ADDU "%[addr1], %[stride], %[addr0] \n\t"
1449  "addi %[alpha], %[alpha], -0x01 \n\t"
1450  PTR_SUBU "%[addr1], $0, %[addr1] \n\t"
1451  "addi %[beta], %[beta], -0x01 \n\t"
1452  PTR_ADDU "%[addr1], %[addr1], %[pix] \n\t"
1453  MMI_LDC1(%[ftmp3], %[pix], 0x00)
1454  MMI_LDXC1(%[ftmp1], %[addr1], %[stride], 0x00)
1455  MMI_LDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
1456  MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
1457  "mtc1 %[alpha], %[ftmp5] \n\t"
1458  "mtc1 %[beta], %[ftmp6] \n\t"
1459  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1460  "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1461  "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1462  "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
1463  "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
1464  "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1465  "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1466  "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
1467  "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1468  "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1469  "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1470  "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1471  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1472  "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1473  "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
1474  "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1475  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1476  "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1477  "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1478  "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
1479  MMI_ULWC1(%[ftmp5], %[tc0], 0x00)
1480  "punpcklbh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1481  "punpcklbh %[ftmp9], %[ftmp5], %[ftmp5] \n\t"
1482  "pcmpgtb %[ftmp5], %[ftmp9], %[ftmp4] \n\t"
1483  MMI_LDC1(%[ftmp4], %[addr1], 0x00)
1484  "and %[ftmp10], %[ftmp5], %[ftmp8] \n\t"
1485  "psubusb %[ftmp8], %[ftmp4], %[ftmp2] \n\t"
1486  "psubusb %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
1487  "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1488  "psubusb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1489  "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1490  "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1491  "and %[ftmp5], %[ftmp10], %[ftmp9] \n\t"
1492  "psubb %[ftmp8], %[ftmp5], %[ftmp7] \n\t"
1493  "and %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1494  "pavgb %[ftmp5], %[ftmp2], %[ftmp3] \n\t"
1495  MMI_LDC1(%[ftmp11], %[addr1], 0x00)
1496  "pavgb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1497  "xor %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
1498  "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
1499  "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1500  "psubusb %[ftmp5], %[ftmp1], %[ftmp7] \n\t"
1501  "paddusb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
1502  "pmaxub %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1503  "pminub %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
1504  MMI_SDXC1(%[ftmp4], %[addr1], %[stride], 0x00)
1505  MMI_LDXC1(%[ftmp5], %[pix], %[addr0], 0x00)
1506  "psubusb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
1507  "psubusb %[ftmp7], %[ftmp3], %[ftmp5] \n\t"
1508  "psubusb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1509  "psubusb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1510  "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
1511  "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1512  "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1513  "and %[ftmp6], %[ftmp9], %[ftmp7] \n\t"
1514  MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
1515  "pavgb %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
1516  MMI_LDXC1(%[ftmp11], %[pix], %[addr0], 0x00)
1517  "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1518  "xor %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
1519  "and %[ftmp7], %[ftmp7], %[ff_pb_1] \n\t"
1520  "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1521  "psubusb %[ftmp7], %[ftmp4], %[ftmp6] \n\t"
1522  "paddusb %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
1523  "pmaxub %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1524  "pminub %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1525  MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
1526  "xor %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
1527  "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1528  "and %[ftmp6], %[ftmp6], %[ff_pb_1] \n\t"
1529  "xor %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1530  "xor %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
1531  "pavgb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
1532  "pavgb %[ftmp4], %[ftmp4], %[ff_pb_3] \n\t"
1533  "pavgb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
1534  "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1535  "paddusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1536  "psubusb %[ftmp7], %[ff_pb_A1], %[ftmp4] \n\t"
1537  "psubusb %[ftmp4], %[ftmp4], %[ff_pb_A1] \n\t"
1538  "pminub %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1539  "pminub %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
1540  "psubusb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
1541  "psubusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
1542  "paddusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1543  "paddusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1544  MMI_SDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
1545  MMI_SDC1(%[ftmp3], %[pix], 0x00)
1546  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1547  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1548  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1549  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1550  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1551  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
1552  RESTRICT_ASM_LOW32
1553  RESTRICT_ASM_ALL64
1554  RESTRICT_ASM_ADDRT
1555  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
1556  : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
1557  [alpha]"r"((mips_reg)alpha), [beta]"r"((mips_reg)beta),
1558  [tc0]"r"(tc0), [ff_pb_1]"f"(ff_pb_1),
1559  [ff_pb_3]"f"(ff_pb_3), [ff_pb_A1]"f"(ff_pb_A1)
1560  : "memory"
1561  );
1562 }
1563 
1565  int beta)
1566 {
 /*
  * Intra-style (no tc0 clipping) deblocking of 8 luma columns across the
  * boundary between row pix - stride and row pix.  The opening line of the
  * signature is not present in this listing; the callers further down in the
  * file invoke it as (pix, stride, alpha, beta).
  *
  * Row naming used in the comments below (H.264 convention, presumably):
  *   p3 = pix - 4*stride, p2 = pix - 3*stride, p1 = pix - 2*stride,
  *   p0 = pix - stride,   q0 = pix,            q1 = pix + stride,
  *   q2 = pix + 2*stride, q3 = pix + 3*stride.
  * Eight bytes are read from each of these rows; p2, p1, p0, q0, q1 and q2
  * are stored back (with their old value wherever the masks are clear).
  * The asm jumps straight to its end when alpha - 1 or beta - 1 is negative.
  *
  * NOTE(review): stack is declared const, yet the MMI_SDC1 lines below appear
  * to store to it through the stack operand (offsets 0x00 .. 0x40) -- confirm
  * that the qualifier is intended.
  */
1567  DECLARE_ALIGNED(8, const uint64_t, stack[0x0a]);
1568  double ftmp[16];
1569  uint64_t tmp[1];
1570  mips_reg addr[3];
1571  DECLARE_VAR_ALL64;
1572  DECLARE_VAR_ADDRT;
1573 
1574  __asm__ volatile (
 /* tmp0 = 1, ftmp0 = 0, ftmp9 = 1 (used later as the psrlh shift count) */
1575  "ori %[tmp0], $0, 0x01 \n\t"
1576  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1577  "mtc1 %[tmp0], %[ftmp9] \n\t"
 /* addr0 = 4 * stride, addr2 = 2 * stride; alpha is reduced by one in place */
1578  PTR_SLL "%[addr0], %[stride], 0x02 \n\t"
1579  PTR_ADDU "%[addr2], %[stride], %[stride] \n\t"
1580  PTR_ADDIU "%[alpha], %[alpha], -0x01 \n\t"
 /* ftmp11 = ftmp9 shifted left by ftmp9, presumably 2; it is the shift count
  * of the later psrlh steps -- TODO confirm PTR_SLL semantics on FP registers */
1581  PTR_SLL "%[ftmp11], %[ftmp9], %[ftmp9] \n\t"
 /* nothing to do when alpha - 1 < 0 */
1582  "bltz %[alpha], 1f \n\t"
 /* addr1 = 3 * stride; beta is reduced by one in place */
1583  PTR_ADDU "%[addr1], %[addr2], %[stride] \n\t"
1584  PTR_ADDIU "%[beta], %[beta], -0x01 \n\t"
 /* nothing to do when beta - 1 < 0 */
1585  "bltz %[beta], 1f \n\t"
 /* addr0 = pix - 4 * stride (row p3) */
1586  PTR_SUBU "%[addr0], $0, %[addr0] \n\t"
1587  PTR_ADDU "%[addr0], %[addr0], %[pix] \n\t"
 /* ftmp3 = q0, ftmp1 = p1, ftmp2 = p0, ftmp4 = q1 */
1588  MMI_LDC1(%[ftmp3], %[pix], 0x00)
1589  MMI_LDXC1(%[ftmp1], %[addr0], %[addr2], 0x00)
1590  MMI_LDXC1(%[ftmp2], %[addr0], %[addr1], 0x00)
1591  MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
 /* ftmp5 / ftmp6: alpha - 1 / beta - 1 replicated into every byte
  * (pshufh with a zero selector, then packushb; presumably the MMI
  * counterparts of a halfword broadcast and an unsigned saturating pack) */
1592  "mtc1 %[alpha], %[ftmp5] \n\t"
1593  "mtc1 %[beta], %[ftmp6] \n\t"
1594  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1595  "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1596  "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
 /* ftmp8 = |p0 - q0| built from the two one-sided saturating differences */
1597  "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
1598  "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1599  "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
1600  "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
 /* keep the alpha - 1 bytes at stack + 0x10 for the second threshold below */
1601  MMI_SDC1(%[ftmp5], %[stack], 0x10)
 /* ftmp8 accumulates the saturated excess of |p0 - q0| over alpha - 1,
  * of |p1 - p0| over beta - 1 and of |q1 - q0| over beta - 1 */
1602  "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1603  "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
1604  "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1605  "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1606  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1607  "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1608  "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1609  "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
1610  "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1611  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1612  "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1613  "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1614  MMI_LDC1(%[ftmp5], %[stack], 0x10)
 /* base mask: all-ones bytes where none of the three thresholds is exceeded;
  * saved at stack + 0x20.  ftmp10 = ff_pb_1, loaded from memory. */
1615  "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1616  "ldc1 %[ftmp10], %[ff_pb_1] \n\t"
1617  MMI_SDC1(%[ftmp8], %[stack], 0x20)
 /* second threshold derived from alpha - 1 by two pavgb steps (with 0, then
  * with ff_pb_1); assuming pavgb is a rounding byte average this is
  * (alpha >> 2) + 1.  ftmp7 becomes the mask of bytes where |p0 - q0| does
  * not exceed it, restricted to the base mask. */
1618  "pavgb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1619  "psubusb %[ftmp8], %[ftmp3], %[ftmp2] \n\t"
1620  "pavgb %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
1621  "psubusb %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
1622  "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1623  "psubusb %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1624  MMI_LDC1(%[ftmp15], %[stack], 0x20)
1625  "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1626  "and %[ftmp7], %[ftmp7], %[ftmp15] \n\t"
 /* ftmp15 = p2; p-side mask (|p2 - p0| within beta - 1, and ftmp7) is
  * saved at stack + 0x30 */
1627  MMI_LDXC1(%[ftmp15], %[addr0], %[stride], 0x00)
1628  "psubusb %[ftmp8], %[ftmp15], %[ftmp2] \n\t"
1629  "psubusb %[ftmp5], %[ftmp2], %[ftmp15] \n\t"
1630  "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1631  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1632  "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
1633  "and %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
 /* ftmp14 = q2; q-side mask (|q2 - q0| within beta - 1, and ftmp7) is
  * saved at stack + 0x40 */
1634  MMI_LDXC1(%[ftmp14], %[pix], %[addr2], 0x00)
1635  MMI_SDC1(%[ftmp5], %[stack], 0x30)
1636  "psubusb %[ftmp8], %[ftmp14], %[ftmp3] \n\t"
1637  "psubusb %[ftmp5], %[ftmp3], %[ftmp14] \n\t"
1638  "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1639  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1640  "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
1641  "and %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1642  MMI_SDC1(%[ftmp5], %[stack], 0x40)
 /*
  * ---- p side ----
  * ftmp5 = avg(avg(p2, p1), avg(p0, q0)); avg(p0, q0) is kept at
  * stack + 0x10 and the wrapping byte sum p2 + p1 + p0 + q0 at stack + 0x00.
  * The byte sums presumably only supply the low-order bits used to undo the
  * upward rounding of the pavgb cascade (the xor / and ff_pb_1 / psubb steps).
  */
1643  "pavgb %[ftmp5], %[ftmp15], %[ftmp1] \n\t"
1644  "pavgb %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
1645  "pavgb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1646  MMI_SDC1(%[ftmp6], %[stack], 0x10)
1647  "paddb %[ftmp7], %[ftmp15], %[ftmp1] \n\t"
1648  "paddb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1649  "paddb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1650  "mov.d %[ftmp8], %[ftmp7] \n\t"
1651  MMI_SDC1(%[ftmp7], %[stack], 0x00)
1652  "psrlh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
1653  "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1654  "xor %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1655  "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1656  "psubb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
 /* ftmp6: candidate for p0 built from p2, q1, p1 and avg(p0, q0), again
  * followed by a rounding correction */
1657  "pavgb %[ftmp6], %[ftmp15], %[ftmp4] \n\t"
1658  "psubb %[ftmp7], %[ftmp15], %[ftmp4] \n\t"
1659  "paddb %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
1660  "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1661  "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1662  "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1663  MMI_LDC1(%[ftmp13], %[stack], 0x10)
1664  "pavgb %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
1665  "psrlh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
1666  "pavgb %[ftmp6], %[ftmp6], %[ftmp13] \n\t"
1667  "pavgb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1668  "xor %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1669  "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1670  "psubb %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
 /* ftmp7: the alternative p0 candidate, avg(floor-avg(p0, q1), p1) */
1671  "xor %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
1672  "pavgb %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
1673  "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1674  "psubb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1675  MMI_LDC1(%[ftmp13], %[stack], 0x30)
1676  "pavgb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
1677  MMI_LDC1(%[ftmp12], %[stack], 0x20)
 /* xor / and / xor selection: ftmp6 where the p-side mask is set, else ftmp7
  * where only the base mask is set, else the original p0; stored to row p0 */
1678  "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1679  "xor %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
1680  "and %[ftmp6], %[ftmp6], %[ftmp13] \n\t"
1681  "and %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
1682  "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1683  "xor %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
1684  MMI_SDXC1(%[ftmp6], %[addr0], %[addr1], 0x00)
 /* ftmp6 = p3; build the p2 candidate from p3, p2, ftmp5 and the saved sum */
1685  MMI_LDC1(%[ftmp6], %[addr0], 0x00)
1686  "paddb %[ftmp7], %[ftmp15], %[ftmp6] \n\t"
1687  "pavgb %[ftmp6], %[ftmp6], %[ftmp15] \n\t"
1688  MMI_LDC1(%[ftmp12], %[stack], 0x00)
1689  "pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
1690  "paddb %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1691  "paddb %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
1692  "psrlh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
1693  "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1694  "xor %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1695  "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1696  MMI_LDC1(%[ftmp12], %[stack], 0x30)
1697  "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
 /* under the p-side mask replace p1 by ftmp5 and p2 by ftmp6, then store
  * them to rows p1 and p2 */
1698  "xor %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
1699  "xor %[ftmp6], %[ftmp6], %[ftmp15] \n\t"
1700  "and %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
1701  "and %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
1702  "xor %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
1703  "xor %[ftmp6], %[ftmp6], %[ftmp15] \n\t"
1704  MMI_SDXC1(%[ftmp5], %[addr0], %[addr2], 0x00)
1705  MMI_SDXC1(%[ftmp6], %[addr0], %[stride], 0x00)
 /*
  * ---- q side ----
  * Mirror image of the p side with the roles q2/q1/q0/p0/p1 and the q-side
  * mask from stack + 0x40; ftmp1..ftmp4 still hold the original p1, p0, q0
  * and q1 because the stores above went to memory only.
  */
1706  "pavgb %[ftmp5], %[ftmp14], %[ftmp4] \n\t"
1707  "pavgb %[ftmp6], %[ftmp3], %[ftmp2] \n\t"
1708  "pavgb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1709  MMI_SDC1(%[ftmp6], %[stack], 0x10)
1710  "paddb %[ftmp7], %[ftmp14], %[ftmp4] \n\t"
1711  "paddb %[ftmp8], %[ftmp3], %[ftmp2] \n\t"
1712  "paddb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1713  "mov.d %[ftmp8], %[ftmp7] \n\t"
1714  MMI_SDC1(%[ftmp7], %[stack], 0x00)
1715  "psrlh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
1716  "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1717  "xor %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1718  "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1719  "psubb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1720  "pavgb %[ftmp6], %[ftmp14], %[ftmp1] \n\t"
1721  "paddb %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
1722  "psubb %[ftmp7], %[ftmp14], %[ftmp1] \n\t"
1723  "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1724  "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1725  "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1726  MMI_LDC1(%[ftmp12], %[stack], 0x10)
1727  "pavgb %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
1728  "pavgb %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
1729  "psrlh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
1730  "pavgb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1731  "xor %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1732  "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1733  "psubb %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
1734  "xor %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
1735  "pavgb %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
1736  "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1737  MMI_LDC1(%[ftmp12], %[stack], 0x40)
1738  "psubb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1739  MMI_LDC1(%[ftmp13], %[stack], 0x20)
1740  "pavgb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
 /* select the new q0 (q-side mask, then base mask, else original) and
  * store it to row q0 */
1741  "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1742  "xor %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1743  "and %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
1744  "and %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
1745  "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1746  "xor %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
1747  MMI_SDC1(%[ftmp6], %[pix], 0x00)
 /* ftmp6 = q3; build the q2 candidate */
1748  MMI_LDXC1(%[ftmp6], %[pix], %[addr1], 0x00)
1749  "paddb %[ftmp7], %[ftmp14], %[ftmp6] \n\t"
1750  "pavgb %[ftmp6], %[ftmp6], %[ftmp14] \n\t"
1751  MMI_LDC1(%[ftmp12], %[stack], 0x00)
1752  "pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
1753  "paddb %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1754  "paddb %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
1755  "psrlh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
1756  "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1757  "xor %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1758  "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1759  MMI_LDC1(%[ftmp12], %[stack], 0x40)
1760  "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
 /* under the q-side mask replace q1 by ftmp5 and q2 by ftmp6, then store
  * them to rows q1 and q2 */
1761  "xor %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
1762  "xor %[ftmp6], %[ftmp6], %[ftmp14] \n\t"
1763  "and %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
1764  "and %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
1765  "xor %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
1766  "xor %[ftmp6], %[ftmp6], %[ftmp14] \n\t"
1767  MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
1768  MMI_SDXC1(%[ftmp6], %[pix], %[addr2], 0x00)
 /* early-exit target of the two bltz branches */
1769  "1: \n\t"
1770  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1771  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1772  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1773  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1774  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1775  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
1776  [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
1777  [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
1778  [tmp0]"=&r"(tmp[0]),
1779  RESTRICT_ASM_ALL64
1780  RESTRICT_ASM_ADDRT
1781  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1782  [addr2]"=&r"(addr[2]),
 /* alpha and beta are modified by the asm, hence read-write operands */
1783  [alpha]"+&r"(alpha), [beta]"+&r"(beta)
1784  : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
1785  [stack]"r"(stack), [ff_pb_1]"m"(ff_pb_1)
1786  : "memory"
1787  );
1788 }
1789 
/*
 * Deblock 8 chroma pixels across the boundary between row pix - stride and
 * row pix, with the correction limited per pixel pair by tc0.
 *
 * Row naming used below (H.264 convention, presumably):
 *   p1 = pix - 2*stride, p0 = pix - stride, q0 = pix, q1 = pix + stride.
 *
 * pix     first row on the far side of the edge; p1..q1 are read (8 bytes
 *         each), p0 and q0 are written back.
 * stride  distance in bytes between rows.
 * alpha   threshold on |p0 - q0| (compared against alpha - 1).
 * beta    threshold on |p1 - p0| and |q1 - q0| (compared against beta - 1).
 * tc0     four bytes read from here; each one bounds the correction of two
 *         neighbouring pixels.
 */
1790 void ff_deblock_v_chroma_8_mmi(uint8_t *pix, ptrdiff_t stride, int alpha,
1791  int beta, int8_t *tc0)
1792 {
1793  double ftmp[9];
1794  mips_reg addr[1];
1795  DECLARE_VAR_LOW32;
1796  DECLARE_VAR_ALL64;
1797  DECLARE_VAR_ADDRT;
1798 
1799  __asm__ volatile (
 /* thresholds are compared as alpha - 1 / beta - 1; both registers are
  * overwritten here, which is why they are read-write operands below */
1800  "addi %[alpha], %[alpha], -0x01 \n\t"
1801  "addi %[beta], %[beta], -0x01 \n\t"
 /* addr0 = pix - 2 * stride (row p1) */
1802  "or %[addr0], $0, %[pix] \n\t"
1803  PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
1804  PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
 /* ftmp1 = p1, ftmp2 = p0, ftmp3 = q0, ftmp4 = q1 */
1805  MMI_LDC1(%[ftmp1], %[addr0], 0x00)
1806  MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
1807  MMI_LDC1(%[ftmp3], %[pix], 0x00)
1808  MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
1809 
 /* ftmp5 / ftmp6: alpha - 1 / beta - 1 replicated into every byte */
1810  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1811  "mtc1 %[alpha], %[ftmp5] \n\t"
1812  "mtc1 %[beta], %[ftmp6] \n\t"
1813  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1814  "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1815  "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1816  "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
 /* ftmp8 accumulates the saturated excess of |p0 - q0| over alpha - 1,
  * of |p1 - p0| over beta - 1 and of |q1 - q0| over beta - 1 */
1817  "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
1818  "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1819  "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1820  "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1821  "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
1822  "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1823  "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1824  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1825  "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1826  "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1827  "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
1828  "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1829  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1830  "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
 /* ftmp8 = all-ones bytes where no threshold is exceeded */
1831  "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1832  "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
 /* load the tc0 bytes, double each one so that it covers two pixels, and
  * fold it into the mask: ftmp8 is now the per-byte correction limit */
1833  MMI_ULWC1(%[ftmp7], %[tc0], 0x00)
1834  "punpcklbh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1835  "and %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
 /* build the correction from p1, p0, q0, q1 in biased unsigned form using
  * complements and pavgb (ftmp5 starts as all ones) */
1836  "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1837  "xor %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
1838  "xor %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1839  "and %[ftmp6], %[ftmp6], %[ff_pb_1] \n\t"
1840  "pavgb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
1841  "xor %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
1842  "pavgb %[ftmp4], %[ftmp4], %[ff_pb_3] \n\t"
1843  "pavgb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
1844  "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1845  "paddusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
 /* split around the ff_pb_A1 bias into two one-sided magnitudes and clamp
  * each against the limit in ftmp8 */
1846  "psubusb %[ftmp7], %[ff_pb_A1], %[ftmp4] \n\t"
1847  "psubusb %[ftmp4], %[ftmp4], %[ff_pb_A1] \n\t"
1848  "pminub %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1849  "pminub %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
 /* apply with saturation: p0 gets -ftmp7 +ftmp4, q0 gets -ftmp4 +ftmp7 */
1850  "psubusb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
1851  "psubusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
1852  "paddusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1853  "paddusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1854 
 /* write back rows p0 and q0 */
1855  MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
1856  MMI_SDC1(%[ftmp3], %[pix], 0x00)
1857  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1858  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1859  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1860  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1861  [ftmp8]"=&f"(ftmp[8]),
1862  RESTRICT_ASM_LOW32
1863  RESTRICT_ASM_ALL64
1864  RESTRICT_ASM_ADDRT
1865  [addr0]"=&r"(addr[0]),
 /* alpha and beta are decremented inside the asm, so they must be
  * read-write operands: the compiler assumes input-only operands still
  * hold their original value afterwards and may even share one register
  * between two inputs that happen to be equal */
1866  [alpha]"+&r"(alpha), [beta]"+&r"(beta)
1867  : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
1868  [tc0]"r"(tc0), [ff_pb_1]"f"(ff_pb_1),
1869  [ff_pb_3]"f"(ff_pb_3), [ff_pb_A1]"f"(ff_pb_A1)
1870  : "memory"
1871  );
1872 }
1873 
1875  int beta)
1876 {
 /*
  * Chroma deblocking without tc0 clipping across the boundary between row
  * pix - stride and row pix, 8 pixels wide.  The opening line of the
  * signature is not present in this listing; the body uses pix, stride,
  * alpha and beta.
  *
  * Row naming used below (H.264 convention, presumably):
  *   p1 = pix - 2*stride, p0 = pix - stride, q0 = pix, q1 = pix + stride.
  * p1..q1 are read (8 bytes each); p0 and q0 are written back.
  *
  * NOTE(review): alpha and beta are decremented in place by the two addi
  * lines although they are bound as plain input operands in the constraint
  * list, whereas the luma intra routine in this file binds them as
  * read-write operands -- confirm and align.
  */
1877  double ftmp[9];
1878  mips_reg addr[1];
1879  DECLARE_VAR_ALL64;
1880  DECLARE_VAR_ADDRT;
1881 
1882  __asm__ volatile (
 /* thresholds are compared as alpha - 1 / beta - 1 */
1883  "addi %[alpha], %[alpha], -0x01 \n\t"
1884  "addi %[beta], %[beta], -0x01 \n\t"
 /* addr0 = pix - 2 * stride (row p1) */
1885  "or %[addr0], $0, %[pix] \n\t"
1886  PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
1887  PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
 /* ftmp1 = p1, ftmp2 = p0, ftmp3 = q0, ftmp4 = q1 */
1888  MMI_LDC1(%[ftmp1], %[addr0], 0x00)
1889  MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
1890  MMI_LDC1(%[ftmp3], %[pix], 0x00)
1891  MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
1892 
 /* ftmp5 / ftmp6: alpha - 1 / beta - 1 replicated into every byte */
1893  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1894  "mtc1 %[alpha], %[ftmp5] \n\t"
1895  "mtc1 %[beta], %[ftmp6] \n\t"
1896  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1897  "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1898  "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1899  "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
 /* ftmp8 accumulates the saturated excess of |p0 - q0| over alpha - 1,
  * of |p1 - p0| over beta - 1 and of |q1 - q0| over beta - 1 */
1900  "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
1901  "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1902  "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1903  "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1904  "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
1905  "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1906  "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1907  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1908  "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1909  "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1910  "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
1911  "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1912  "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1913  "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
 /* ftmp8 = all-ones bytes where no threshold is exceeded */
1914  "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1915  "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
 /* keep the unfiltered p0 / q0 in ftmp6 / ftmp7 */
1916  "mov.d %[ftmp6], %[ftmp2] \n\t"
1917  "mov.d %[ftmp7], %[ftmp3] \n\t"
 /* new p0 = avg(avg(p0, q1) - ((p0 ^ q1) & 1), p1); assuming pavgb rounds
  * up this presumably equals (2*p1 + p0 + q1 + 2) >> 2 */
1918  "xor %[ftmp5], %[ftmp2], %[ftmp4] \n\t"
1919  "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
1920  "pavgb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1921  "psubusb %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1922  "pavgb %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
 /* new q0 = avg(avg(q0, p1) - ((q0 ^ p1) & 1), q1), the mirrored form */
1923  "xor %[ftmp5], %[ftmp3], %[ftmp1] \n\t"
1924  "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
1925  "pavgb %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
1926  "psubusb %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
1927  "pavgb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
 /* select per byte: old + ((new - old) & mask) */
1928  "psubb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1929  "psubb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1930  "and %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
1931  "and %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
1932  "paddb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1933  "paddb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1934 
 /* write back rows p0 and q0 */
1935  MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
1936  MMI_SDC1(%[ftmp3], %[pix], 0x00)
1937  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1938  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1939  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1940  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1941  [ftmp8]"=&f"(ftmp[8]),
1942  RESTRICT_ASM_ALL64
1943  RESTRICT_ASM_ADDRT
1944  [addr0]"=&r"(addr[0])
1945  : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
1946  [alpha]"r"(alpha), [beta]"r"(beta),
1947  [ff_pb_1]"f"(ff_pb_1)
1948  : "memory"
1949  );
1950 }
1951 
/*
 * Deblock 8 chroma rows across the column boundary just left of pix, with
 * the correction limited per row pair by tc0.
 *
 * Four bytes (columns pix - 2 .. pix + 1) are read from each of the eight
 * rows pix, pix + stride, .. pix + 7*stride and transposed into four column
 * vectors; the two inner columns are filtered, and all four bytes of every
 * row are stored back (the outer two from copies taken before filtering).
 *
 * pix     position right of the edge in the first row.
 * stride  distance in bytes between rows.
 * alpha   threshold on the step across the edge (compared as alpha - 1).
 * beta    threshold on the step next to the edge (compared as beta - 1).
 * tc0     four bytes read from here; each one bounds the correction of two
 *         neighbouring rows.
 */
1952 void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
1953  int8_t *tc0)
1954 {
1955  double ftmp[11];
1956  mips_reg addr[6];
1957  DECLARE_VAR_LOW32;
1958 
1959  __asm__ volatile (
 /* thresholds are compared as alpha - 1 / beta - 1; both registers are
  * overwritten here, which is why they are read-write operands below */
1960  "addi %[alpha], %[alpha], -0x01 \n\t"
1961  "addi %[beta], %[beta], -0x01 \n\t"
 /* addr0 = 2*stride, addr1 = 3*stride, addr2 = 4*stride;
  * addr5 = start of row 0 (pix - 2), pix = start of row 3 */
1962  PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
1963  PTR_ADDI "%[pix], %[pix], -0x02 \n\t"
1964  PTR_ADDU "%[addr1], %[addr0], %[stride] \n\t"
1965  PTR_ADDU "%[addr2], %[addr0], %[addr0] \n\t"
1966  "or %[addr5], $0, %[pix] \n\t"
1967  PTR_ADDU "%[pix], %[pix], %[addr1] \n\t"
 /* rows 0..3: load 4 bytes each and start the transpose */
1968  MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
1969  PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
1970  MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
1971  PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
1972  MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
1973  MMI_ULWC1(%[ftmp3], %[pix], 0x00)
1974  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
1975  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1976  PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
1977  "punpckhhw %[ftmp2], %[ftmp0], %[ftmp1] \n\t"
1978  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
 /* rows 4..7 */
1979  MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
1980  PTR_ADDU "%[addr4], %[pix], %[addr0] \n\t"
1981  MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
1982  PTR_ADDU "%[addr3], %[pix], %[addr1] \n\t"
1983  MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
1984  PTR_ADDU "%[addr4], %[pix], %[addr2] \n\t"
1985  MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
1986  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1987  "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1988  "mov.d %[ftmp6], %[ftmp4] \n\t"
1989  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1990  "punpckhhw %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
 /* ftmp0..ftmp3 = the four columns (p1, p0, q0, q1 presumably), one byte
  * per row */
1991  "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
1992  "punpckhwd %[ftmp3], %[ftmp2], %[ftmp6] \n\t"
1993  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
1994  "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
 /* keep the outer columns: the filter below uses ftmp3 as scratch */
1995  "mov.d %[ftmp9], %[ftmp0] \n\t"
1996  "mov.d %[ftmp10], %[ftmp3] \n\t"
1997 
 /* ftmp4 / ftmp5: alpha - 1 / beta - 1 replicated into every byte */
1998  "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
1999  "mtc1 %[alpha], %[ftmp4] \n\t"
2000  "mtc1 %[beta], %[ftmp5] \n\t"
2001  "pshufh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
2002  "pshufh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
2003  "packushb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
2004  "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
 /* ftmp7 accumulates the saturated excess of the three absolute
  * differences over their thresholds */
2005  "psubusb %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
2006  "psubusb %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
2007  "or %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
2008  "psubusb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2009  "psubusb %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
2010  "psubusb %[ftmp4], %[ftmp0], %[ftmp1] \n\t"
2011  "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2012  "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2013  "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2014  "psubusb %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
2015  "psubusb %[ftmp4], %[ftmp3], %[ftmp2] \n\t"
2016  "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2017  "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2018  "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
 /* ftmp7 = all-ones bytes where no threshold is exceeded */
2019  "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
2020  "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
 /* load the tc0 bytes, double each one so that it covers two rows, and
  * fold it into the mask: ftmp7 is now the per-byte correction limit */
2021  MMI_ULWC1(%[ftmp6], %[tc0], 0x00)
2022  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
2023  "and %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
 /* build the correction in biased unsigned form using complements and
  * pavgb (ftmp4 starts as all ones) */
2024  "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
2025  "xor %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
2026  "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
2027  "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
2028  "pavgb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
2029  "xor %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
2030  "pavgb %[ftmp3], %[ftmp3], %[ff_pb_3] \n\t"
2031  "pavgb %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
2032  "pavgb %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
2033  "paddusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
 /* split around the ff_pb_A1 bias into two one-sided magnitudes, clamp
  * each against the limit and apply with saturation */
2034  "psubusb %[ftmp6], %[ff_pb_A1], %[ftmp3] \n\t"
2035  "psubusb %[ftmp3], %[ftmp3], %[ff_pb_A1] \n\t"
2036  "pminub %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
2037  "pminub %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
2038  "psubusb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
2039  "psubusb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2040  "paddusb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
2041  "paddusb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
2042 
 /* transpose back (outer columns from ftmp9 / ftmp10) and store 4 bytes to
  * each of the eight rows */
2043  "punpckhwd %[ftmp4], %[ftmp9], %[ftmp9] \n\t"
2044  "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
2045  "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
2046  "punpcklbh %[ftmp0], %[ftmp9], %[ftmp1] \n\t"
2047  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
2048  "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2049  "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2050  MMI_USWC1(%[ftmp1], %[addr5], 0x00)
2051  PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
2052  "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
2053  MMI_USWC1(%[ftmp1], %[addr3], 0x00)
2054  PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
2055  MMI_USWC1(%[ftmp0], %[addr4], 0x00)
2056  "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2057  "punpckhwd %[ftmp3], %[ftmp10], %[ftmp10] \n\t"
2058  MMI_USWC1(%[ftmp0], %[pix], 0x00)
2059  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2060  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
2061  PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
2062  "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
2063  "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2064  MMI_USWC1(%[ftmp5], %[addr3], 0x00)
2065  "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
2066  PTR_ADDU "%[addr3], %[pix], %[addr0] \n\t"
2067  PTR_ADDU "%[addr4], %[pix], %[addr1] \n\t"
2068  MMI_USWC1(%[ftmp5], %[addr3], 0x00)
2069  MMI_USWC1(%[ftmp4], %[addr4], 0x00)
2070  PTR_ADDU "%[addr3], %[pix], %[addr2] \n\t"
2071  "punpckhwd %[ftmp9], %[ftmp4], %[ftmp4] \n\t"
2072  MMI_USWC1(%[ftmp9], %[addr3], 0x00)
2073  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2074  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2075  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2076  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2077  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2078  [ftmp10]"=&f"(ftmp[10]),
2079  RESTRICT_ASM_LOW32
2080  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
2081  [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
2082  [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
2083  [pix]"+&r"(pix),
 /* alpha and beta are decremented inside the asm, so like pix they must be
  * read-write operands: the compiler assumes input-only operands still
  * hold their original value afterwards and may even share one register
  * between two inputs that happen to be equal */
2084  [alpha]"+&r"(alpha), [beta]"+&r"(beta)
2085  : [stride]"r"((mips_reg)stride), [tc0]"r"(tc0),
2086  [ff_pb_1]"f"(ff_pb_1), [ff_pb_3]"f"(ff_pb_3),
2087  [ff_pb_A1]"f"(ff_pb_A1)
2088  : "memory"
2089  );
2090 }
2091 
2093  int beta)
2094 {
 /*
  * Chroma deblocking without tc0 clipping across the column boundary just
  * left of pix, 8 rows high.  The opening line of the signature is not
  * present in this listing; the body uses pix, stride, alpha and beta.
  *
  * Four bytes (columns pix - 2 .. pix + 1) are read from each of the eight
  * rows pix .. pix + 7*stride, transposed into four column vectors, the two
  * inner columns are filtered, and all four bytes of every row are stored
  * back.
  *
  * NOTE(review): alpha and beta are decremented in place by the two addi
  * lines although they are bound as plain input operands in the constraint
  * list (pix, which is also modified, is a read-write operand) -- confirm
  * and align.
  */
2095  double ftmp[11];
2096  mips_reg addr[6];
2097  DECLARE_VAR_LOW32;
2098 
2099  __asm__ volatile (
 /* thresholds are compared as alpha - 1 / beta - 1 */
2100  "addi %[alpha], %[alpha], -0x01 \n\t"
2101  "addi %[beta], %[beta], -0x01 \n\t"
 /* addr0 = 2*stride, addr1 = 3*stride, addr2 = 4*stride;
  * addr5 = start of row 0 (pix - 2), pix = start of row 3 */
2102  PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2103  PTR_ADDI "%[pix], %[pix], -0x02 \n\t"
2104  PTR_ADDU "%[addr1], %[addr0], %[stride] \n\t"
2105  PTR_ADDU "%[addr2], %[addr0], %[addr0] \n\t"
2106  "or %[addr5], $0, %[pix] \n\t"
2107  PTR_ADDU "%[pix], %[pix], %[addr1] \n\t"
 /* rows 0..3: load 4 bytes each and start the transpose */
2108  MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
2109  PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
2110  MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
2111  PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
2112  MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
2113  MMI_ULWC1(%[ftmp3], %[pix], 0x00)
2114  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2115  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
2116  PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
2117  "punpckhhw %[ftmp2], %[ftmp0], %[ftmp1] \n\t"
2118  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
 /* rows 4..7 */
2119  MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
2120  PTR_ADDU "%[addr4], %[pix], %[addr0] \n\t"
2121  MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
2122  PTR_ADDU "%[addr3], %[pix], %[addr1] \n\t"
2123  MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
2124  PTR_ADDU "%[addr4], %[pix], %[addr2] \n\t"
2125  MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
2126  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2127  "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
2128  "mov.d %[ftmp6], %[ftmp4] \n\t"
2129  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2130  "punpckhhw %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
 /* ftmp0..ftmp3 = the four columns (p1, p0, q0, q1 presumably), one byte
  * per row */
2131  "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
2132  "punpckhwd %[ftmp3], %[ftmp2], %[ftmp6] \n\t"
2133  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2134  "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
2135 
 /* ftmp4 / ftmp5: alpha - 1 / beta - 1 replicated into every byte */
2136  "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
2137  "mtc1 %[alpha], %[ftmp4] \n\t"
2138  "mtc1 %[beta], %[ftmp5] \n\t"
2139  "pshufh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
2140  "pshufh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
2141  "packushb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
2142  "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
 /* ftmp7 accumulates the saturated excess of the three absolute
  * differences over their thresholds */
2143  "psubusb %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
2144  "psubusb %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
2145  "or %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
2146  "psubusb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2147  "psubusb %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
2148  "psubusb %[ftmp4], %[ftmp0], %[ftmp1] \n\t"
2149  "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2150  "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2151  "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2152  "psubusb %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
2153  "psubusb %[ftmp4], %[ftmp3], %[ftmp2] \n\t"
2154  "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2155  "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2156  "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
 /* ftmp7 = all-ones bytes where no threshold is exceeded */
2157  "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
2158  "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
 /* keep the unfiltered inner columns in ftmp5 / ftmp6 */
2159  "mov.d %[ftmp5], %[ftmp1] \n\t"
2160  "mov.d %[ftmp6], %[ftmp2] \n\t"
 /* new ftmp1 = avg(avg(ftmp1, ftmp3) - lsb of their xor, ftmp0) and the
  * mirrored form for ftmp2; the outer columns are only read here */
2161  "xor %[ftmp4], %[ftmp1], %[ftmp3] \n\t"
2162  "and %[ftmp4], %[ftmp4], %[ff_pb_1] \n\t"
2163  "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
2164  "psubusb %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
2165  "pavgb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
2166  "xor %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
2167  "and %[ftmp4], %[ftmp4], %[ff_pb_1] \n\t"
2168  "pavgb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
2169  "psubusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
2170  "pavgb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
 /* select per byte: old + ((new - old) & mask) */
2171  "psubb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2172  "psubb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
2173  "and %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
2174  "and %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
2175  "paddb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2176  "paddb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
2177 
 /* transpose back and store 4 bytes to each of the eight rows */
2178  "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t"
2179  "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
2180  "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
2181  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2182  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2183  "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2184  "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2185  MMI_USWC1(%[ftmp1], %[addr5], 0x00)
2186  PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
2187  "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
2188  PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
2189  MMI_USWC1(%[ftmp1], %[addr3], 0x00)
2190  MMI_USWC1(%[ftmp0], %[addr4], 0x00)
2191  "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2192  "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
2193  MMI_USWC1(%[ftmp0], %[pix], 0x00)
2194  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2195  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
2196  PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
2197  "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
2198  "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2199  MMI_USWC1(%[ftmp5], %[addr3], 0x00)
2200  "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
2201  PTR_ADDU "%[addr3], %[pix], %[addr0] \n\t"
2202  PTR_ADDU "%[addr4], %[pix], %[addr1] \n\t"
2203  MMI_USWC1(%[ftmp5], %[addr3], 0x00)
2204  PTR_ADDU "%[addr3], %[pix], %[addr2] \n\t"
2205  MMI_USWC1(%[ftmp4], %[addr4], 0x00)
2206  "punpckhwd %[ftmp9], %[ftmp4], %[ftmp4] \n\t"
2207  MMI_USWC1(%[ftmp9], %[addr3], 0x00)
2208  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2209  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2210  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2211  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2212  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2213  [ftmp10]"=&f"(ftmp[10]),
2214  RESTRICT_ASM_LOW32
2215  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
2216  [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
2217  [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
2218  [pix]"+&r"(pix)
2219  : [alpha]"r"(alpha), [beta]"r"(beta),
2220  [stride]"r"((mips_reg)stride), [ff_pb_1]"f"(ff_pb_1)
2221  : "memory"
2222  );
2223 }
2224 
/*
 * Deblock a luma edge in two halves, the second one starting 8 pixels to
 * the right of the first (each helper call presumably covers 8 pixels).
 *
 * tc0 holds four clipping values, two per half.  A half is skipped when both
 * of its values are negative: the bitwise AND of two signed bytes is
 * negative only if both operands are.
 */
2225 void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2226 {
2227  int8_t *const tc_right = tc0 + 2;
 /* left half: pix, tc0[0..1] */
2228  if (!((tc0[0] & tc0[1]) < 0))
2229  ff_deblock_v8_luma_8_mmi(pix, stride, alpha, beta, tc0);
 /* right half: pix + 8, tc0[2..3] */
2230  if (!((tc_right[0] & tc_right[1]) < 0))
2231  ff_deblock_v8_luma_8_mmi(pix + 8, stride, alpha, beta, tc_right);
2232 }
2233 
2235  int beta)
2236 {
 /*
  * Runs the 8-pixel luma intra filter twice with identical stride, alpha
  * and beta: once at pix and once at pix + 8.  The opening line of the
  * signature is not present in this listing.
  */
2237  deblock_v8_luma_intra_8_mmi(pix + 0, stride, alpha, beta);
2238  deblock_v8_luma_intra_8_mmi(pix + 8, stride, alpha, beta);
2239 }
2240 
2241 void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
2242  int8_t *tc0)
2243 {
2244  DECLARE_ALIGNED(8, const uint64_t, stack[0x0d]);
2245  double ftmp[9];
2246  mips_reg addr[8];
2247  DECLARE_VAR_LOW32;
2248  DECLARE_VAR_ALL64;
2249 
2250  __asm__ volatile (
2251  PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2252  PTR_ADDI "%[addr1], %[pix], -0x4 \n\t"
2253  PTR_ADDU "%[addr2], %[stride], %[addr0] \n\t"
2254  MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
2255  PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
2256  PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
2257  MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
2258  PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2259  MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
2260  MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
2261  PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
2262  MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
2263  PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
2264  MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
2265  PTR_ADDU "%[addr3], %[addr4], %[addr2] \n\t"
2266  MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
2267  PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t"
2268  "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2269  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2270  "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2271  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2272  "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2273  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2274  PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
2275  MMI_SDC1(%[ftmp1], %[stack], 0x10)
2276  MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
2277  PTR_ADDU "%[addr7], %[addr6], %[addr6] \n\t"
2278  "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2279  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
2280  "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2281  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2282  "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2283  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2284  MMI_LDC1(%[ftmp8], %[stack], 0x10)
2285  "punpckhwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2286  MMI_SDC1(%[ftmp0], %[stack], 0x00)
2287  "punpckhhw %[ftmp6], %[ftmp7], %[ftmp8] \n\t"
2288  "punpcklhw %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
2289  "punpckhhw %[ftmp0], %[ftmp3], %[ftmp5] \n\t"
2290  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
2291  "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
2292  "punpckhwd %[ftmp5], %[ftmp7], %[ftmp3] \n\t"
2293  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
2294  "punpckhwd %[ftmp3], %[ftmp1], %[ftmp2] \n\t"
2295  "punpcklwd %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2296  MMI_SDC1(%[ftmp1], %[stack], 0x10)
2297  MMI_SDC1(%[ftmp3], %[stack], 0x20)
2298  MMI_SDC1(%[ftmp7], %[stack], 0x30)
2299  MMI_SDC1(%[ftmp5], %[stack], 0x40)
2300  MMI_SDC1(%[ftmp6], %[stack], 0x50)
2301  PTR_ADDU "%[addr1], %[addr1], %[addr7] \n\t"
2302  PTR_ADDU "%[addr4], %[addr4], %[addr7] \n\t"
2303  MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
2304  PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
2305  MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
2306  PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2307  MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
2308  MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
2309  PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
2310  MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
2311  PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
2312  MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
2313  PTR_ADDU "%[addr3], %[addr4], %[addr2] \n\t"
2314  MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
2315  "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2316  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2317  "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2318  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2319  "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2320  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2321  PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
2322  MMI_SDC1(%[ftmp1], %[stack], 0x18)
2323  MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
2324  "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2325  "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2326  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
2327  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2328  "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2329  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2330  "punpckhwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2331  MMI_LDC1(%[ftmp8], %[stack], 0x18)
2332  MMI_SDC1(%[ftmp0], %[stack], 0x08)
2333  "punpckhhw %[ftmp6], %[ftmp7], %[ftmp8] \n\t"
2334  "punpcklhw %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
2335  "punpckhhw %[ftmp0], %[ftmp3], %[ftmp5] \n\t"
2336  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
2337  "punpckhwd %[ftmp5], %[ftmp7], %[ftmp3] \n\t"
2338  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
2339  "punpckhwd %[ftmp3], %[ftmp1], %[ftmp2] \n\t"
2340  "punpcklwd %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2341  "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
2342  MMI_SDC1(%[ftmp1], %[stack], 0x18)
2343  MMI_SDC1(%[ftmp3], %[stack], 0x28)
2344  MMI_SDC1(%[ftmp7], %[stack], 0x38)
2345  MMI_SDC1(%[ftmp5], %[stack], 0x48)
2346  MMI_SDC1(%[ftmp6], %[stack], 0x58)
2347  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2348  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2349  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2350  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2351  [ftmp8]"=&f"(ftmp[8]),
2352  RESTRICT_ASM_ALL64
2353  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
2354  [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
2355  [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
2356  [addr6]"=&r"(addr[6]), [addr7]"=&r"(addr[7])
2357  : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
2358  [stack]"r"(stack)
2359  : "memory"
2360  );
2361 
2362  ff_deblock_v_luma_8_mmi((uint8_t *) &stack[6], 0x10, alpha, beta, tc0);
2363 
2364  __asm__ volatile (
2365  PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2366  PTR_ADDI "%[addr1], %[pix], -0x02 \n\t"
2367  PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t"
2368  PTR_ADDU "%[addr2], %[addr0], %[stride] \n\t"
2369  PTR_ADDU "%[addr7], %[addr6], %[addr6] \n\t"
2370  PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
2371  MMI_LDC1(%[ftmp0], %[stack], 0x10)
2372  MMI_LDC1(%[ftmp1], %[stack], 0x20)
2373  MMI_LDC1(%[ftmp2], %[stack], 0x30)
2374  MMI_LDC1(%[ftmp3], %[stack], 0x40)
2375  "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t"
2376  "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
2377  "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
2378  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2379  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2380  "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2381  "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2382  MMI_USWC1(%[ftmp1], %[addr1], 0x00)
2383  PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
2384  "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
2385  PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2386  MMI_USWC1(%[ftmp1], %[addr3], 0x00)
2387  MMI_USWC1(%[ftmp0], %[addr5], 0x00)
2388  "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2389  "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
2390  MMI_USWC1(%[ftmp0], %[addr4], 0x00)
2391  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2392  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
2393  "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
2394  PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
2395  "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2396  MMI_USWC1(%[ftmp5], %[addr3], 0x00)
2397  PTR_ADDU "%[addr3], %[addr4], %[addr0] \n\t"
2398  "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
2399  PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2400  MMI_USWC1(%[ftmp5], %[addr3], 0x00)
2401  MMI_USWC1(%[ftmp4], %[addr5], 0x00)
2402  PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
2403  "punpckhwd %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
2404  PTR_ADDU "%[addr1], %[addr1], %[addr7] \n\t"
2405  MMI_USWC1(%[ftmp4], %[addr3], 0x00)
2406  PTR_ADDU "%[addr4], %[addr4], %[addr7] \n\t"
2407  MMI_LDC1(%[ftmp0], %[stack], 0x18)
2408  MMI_LDC1(%[ftmp1], %[stack], 0x28)
2409  MMI_LDC1(%[ftmp2], %[stack], 0x38)
2410  MMI_LDC1(%[ftmp3], %[stack], 0x48)
2411  PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2412  "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t"
2413  PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t"
2414  "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
2415  "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
2416  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2417  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2418  PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
2419  "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2420  "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2421  MMI_USWC1(%[ftmp1], %[addr1], 0x00)
2422  "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
2423  PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2424  MMI_USWC1(%[ftmp1], %[addr3], 0x00)
2425  MMI_USWC1(%[ftmp0], %[addr5], 0x00)
2426  "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2427  "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
2428  MMI_USWC1(%[ftmp0], %[addr4], 0x00)
2429  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2430  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
2431  PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
2432  "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
2433  "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2434  MMI_USWC1(%[ftmp5], %[addr3], 0x00)
2435  PTR_ADDU "%[addr3], %[addr4], %[addr0] \n\t"
2436  "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
2437  PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2438  MMI_USWC1(%[ftmp5], %[addr3], 0x00)
2439  MMI_USWC1(%[ftmp4], %[addr5], 0x00)
2440  PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
2441  "punpckhwd %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
2442  MMI_USWC1(%[ftmp4], %[addr3], 0x00)
2443  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2444  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2445  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2446  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2447  [ftmp8]"=&f"(ftmp[8]),
2448  RESTRICT_ASM_LOW32
2449  RESTRICT_ASM_ALL64
2450  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
2451  [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
2452  [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
2453  [addr6]"=&r"(addr[6]), [addr7]"=&r"(addr[7])
2454  : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
2455  [stack]"r"(stack)
2456  : "memory"
2457  );
2458 }
2459 
2461  int beta)
2462 {
      /*
       * Intra luma deblocking across a vertical edge, implemented by
       * transposition:
       *
       *  1. The first asm statement reads 16 rows of 8 bytes starting at
       *     pix - 4 (two passes of 8 rows), transposes each 8x8 byte tile
       *     and stores it in ptmp.  Transposed row r lives at byte offset
       *     0x10 * r; source rows 0..7 fill the low 8 bytes (+0x00) and
       *     source rows 8..15 the high 8 bytes (+0x08).
       *  2. ff_deblock_v_luma_intra_8_mmi() is run on that buffer, pointed
       *     at transposed row 4 (&ptmp[8], byte offset 0x40) with a stride
       *     of 0x10.
       *  3. The second asm statement transposes ptmp back and writes the
       *     16 rows to the picture.
       *
       * pdat carries the row pointer and stride multiples computed by the
       * first asm statement over to the second one.
       *
       * Neither scratch buffer is const-qualified: both are written by the
       * asm statements below, and ptmp is additionally handed to the callee
       * through a non-const pointer.  Writing to an object that was defined
       * const is undefined behaviour.
       */
2463  DECLARE_ALIGNED(8, uint64_t, ptmp[0x11]);
2464  DECLARE_ALIGNED(8, uint64_t, pdat[0x04]);
2465  double ftmp[9];
2466  mips_reg addr[7];
2467  DECLARE_VAR_ALL64;
2468 
2469  __asm__ volatile (
      /* addr0 = 2*stride, addr2 = 3*stride, addr3 = 4*stride,
       * addr1 = pix - 4 (row 0), addr4 = addr1 + 3*stride (row 3). */
2470  PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2471  PTR_ADDI "%[addr1], %[pix], -0x04 \n\t"
2472  PTR_ADDU "%[addr2], %[addr0], %[stride] \n\t"
2473  PTR_ADDU "%[addr3], %[addr0], %[addr0] \n\t"
2474  PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
      /* Pass 1: load rows 0..7 (row 7 is fetched a little later). */
2475  PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
2476  MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
2477  PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
2478  MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
2479  MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
2480  PTR_ADDU "%[addr5], %[addr4], %[stride] \n\t"
2481  MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
2482  PTR_ADDU "%[addr6], %[addr4], %[addr0] \n\t"
2483  MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
2484  PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2485  MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
2486  MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
2487  PTR_ADDU "%[addr5], %[addr4], %[addr3] \n\t"
      /* Transpose: interleave bytes, then halfwords, then words. */
2488  "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2489  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2490  "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2491  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2492  "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2493  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2494  MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
2495  "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2496  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
      /* ptmp+0x00 and ptmp+0x20 double as spill slots until the final
       * values for those offsets are stored further down. */
2497  MMI_SDC1(%[ftmp3], %[ptmp], 0x00)
2498  "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
2499  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2500  "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2501  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2502  "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
2503  "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
2504  MMI_SDC1(%[ftmp2], %[ptmp], 0x20)
2505  MMI_LDC1(%[ftmp2], %[ptmp], 0x00)
2506  "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
2507  "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
2508  "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
2509  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2510  "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
2511  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
2512  MMI_SDC1(%[ftmp0], %[ptmp], 0x00)
2513  MMI_SDC1(%[ftmp5], %[ptmp], 0x10)
2514  MMI_SDC1(%[ftmp7], %[ptmp], 0x40)
2515  MMI_SDC1(%[ftmp4], %[ptmp], 0x50)
2516  MMI_LDC1(%[ftmp8], %[ptmp], 0x20)
2517  "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
2518  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
2519  "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
2520  "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
      /* addr5 = 8*stride: step both row pointers down to rows 8..15. */
2521  PTR_ADDU "%[addr5], %[addr3], %[addr3] \n\t"
2522  MMI_SDC1(%[ftmp3], %[ptmp], 0x20)
2523  MMI_SDC1(%[ftmp0], %[ptmp], 0x30)
2524  MMI_SDC1(%[ftmp6], %[ptmp], 0x60)
2525  MMI_SDC1(%[ftmp5], %[ptmp], 0x70)
2526  PTR_ADDU "%[addr1], %[addr1], %[addr5] \n\t"
2527  PTR_ADDU "%[addr4], %[addr4], %[addr5] \n\t"
      /* Pass 2: same load/transpose for rows 8..15, stored at +0x08. */
2528  PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
2529  MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
2530  PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
2531  MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
2532  MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
2533  PTR_ADDU "%[addr5], %[addr4], %[stride] \n\t"
2534  MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
2535  PTR_ADDU "%[addr6], %[addr4], %[addr0] \n\t"
2536  MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
2537  PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2538  MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
2539  MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
2540  PTR_ADDU "%[addr5], %[addr4], %[addr3] \n\t"
2541  "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2542  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2543  "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2544  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2545  "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2546  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2547  MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
2548  "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2549  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
2550  MMI_SDC1(%[ftmp3], %[ptmp], 0x08)
2551  "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
2552  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2553  "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2554  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2555  "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
2556  "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
2557  MMI_SDC1(%[ftmp2], %[ptmp], 0x28)
2558  MMI_LDC1(%[ftmp2], %[ptmp], 0x08)
2559  "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
2560  "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
2561  "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
2562  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2563  "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
2564  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
2565  MMI_SDC1(%[ftmp0], %[ptmp], 0x08)
2566  MMI_SDC1(%[ftmp5], %[ptmp], 0x18)
2567  MMI_SDC1(%[ftmp7], %[ptmp], 0x48)
2568  MMI_SDC1(%[ftmp4], %[ptmp], 0x58)
2569  MMI_LDC1(%[ftmp8], %[ptmp], 0x28)
2570  "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
2571  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
2572  "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
2573  "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
2574  MMI_SDC1(%[ftmp3], %[ptmp], 0x28)
2575  MMI_SDC1(%[ftmp0], %[ptmp], 0x38)
2576  MMI_SDC1(%[ftmp6], %[ptmp], 0x68)
2577  MMI_SDC1(%[ftmp5], %[ptmp], 0x78)
      /* Save the row-8 pointer and the 3x/2x/4x stride values for the
       * write-back asm statement. */
2578  PTR_S "%[addr1], 0x00(%[pdat]) \n\t"
2579  PTR_S "%[addr2], 0x08(%[pdat]) \n\t"
2580  PTR_S "%[addr0], 0x10(%[pdat]) \n\t"
2581  PTR_S "%[addr3], 0x18(%[pdat]) \n\t"
2582  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2583  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2584  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2585  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2586  [ftmp8]"=&f"(ftmp[8]),
2587  RESTRICT_ASM_ALL64
2588  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
2589  [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
2590  [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
2591  [addr6]"=&r"(addr[6])
2592  : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
2593  [ptmp]"r"(ptmp), [pdat]"r"(pdat)
2594  : "memory"
2595  );
2596 
      /* Filter the transposed block in place; the edge sits in front of
       * transposed row 4, i.e. byte offset 0x40 in ptmp. */
2597  ff_deblock_v_luma_intra_8_mmi((uint8_t *) &ptmp[8], 0x10, alpha, beta);
2598 
2599  __asm__ volatile (
      /* Restore addr1 (row 8), addr2 = 3*stride, addr0 = 2*stride,
       * addr3 = 4*stride; addr4 = addr1 + 3*stride. */
2600  PTR_L "%[addr1], 0x00(%[pdat]) \n\t"
2601  PTR_L "%[addr2], 0x08(%[pdat]) \n\t"
2602  PTR_L "%[addr0], 0x10(%[pdat]) \n\t"
2603  PTR_L "%[addr3], 0x18(%[pdat]) \n\t"
2604  PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
      /* Transpose the +0x08 halves back and write picture rows 8..15. */
2605  MMI_LDC1(%[ftmp0], %[ptmp], 0x08)
2606  MMI_LDC1(%[ftmp1], %[ptmp], 0x18)
2607  MMI_LDC1(%[ftmp2], %[ptmp], 0x28)
2608  MMI_LDC1(%[ftmp3], %[ptmp], 0x38)
2609  MMI_LDC1(%[ftmp4], %[ptmp], 0x48)
2610  MMI_LDC1(%[ftmp5], %[ptmp], 0x58)
2611  MMI_LDC1(%[ftmp6], %[ptmp], 0x68)
2612  "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2613  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2614  "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2615  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2616  "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2617  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2618  MMI_LDC1(%[ftmp8], %[ptmp], 0x78)
2619  "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2620  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
      /* The destination rows at addr1 and addr1 + 2*stride are used as
       * spill slots here; both are overwritten with final data below. */
2621  MMI_USDC1(%[ftmp3], %[addr1], 0x00)
2622  PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2623  "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
2624  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2625  "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2626  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2627  "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
2628  "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
2629  MMI_USDC1(%[ftmp2], %[addr5], 0x00)
2630  MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
2631  "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
2632  "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
2633  "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
2634  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2635  "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
2636  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
2637  PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
2638  MMI_USDC1(%[ftmp0], %[addr1], 0x00)
2639  PTR_ADDU "%[addr6], %[addr4], %[stride] \n\t"
2640  MMI_USDC1(%[ftmp5], %[addr5], 0x00)
2641  PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
2642  MMI_USDC1(%[ftmp7], %[addr6], 0x00)
2643  PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
2644  MMI_USDC1(%[ftmp4], %[addr5], 0x00)
2645  MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
2646  PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2647  "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
2648  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
2649  "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
2650  "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
2651  MMI_USDC1(%[ftmp3], %[addr5], 0x00)
2652  PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2653  MMI_USDC1(%[ftmp0], %[addr4], 0x00)
2654  PTR_ADDU "%[addr6], %[addr4], %[addr3] \n\t"
2655  MMI_USDC1(%[ftmp6], %[addr5], 0x00)
      /* addr5 = 8*stride: step both row pointers back up to rows 0..7. */
2656  PTR_ADDU "%[addr5], %[addr3], %[addr3] \n\t"
2657  MMI_USDC1(%[ftmp5], %[addr6], 0x00)
2658  PTR_SUBU "%[addr1], %[addr1], %[addr5] \n\t"
2659  PTR_SUBU "%[addr4], %[addr4], %[addr5] \n\t"
      /* Transpose the +0x00 halves back and write picture rows 0..7. */
2660  MMI_LDC1(%[ftmp0], %[ptmp], 0x00)
2661  MMI_LDC1(%[ftmp1], %[ptmp], 0x10)
2662  MMI_LDC1(%[ftmp2], %[ptmp], 0x20)
2663  MMI_LDC1(%[ftmp3], %[ptmp], 0x30)
2664  MMI_LDC1(%[ftmp4], %[ptmp], 0x40)
2665  MMI_LDC1(%[ftmp5], %[ptmp], 0x50)
2666  MMI_LDC1(%[ftmp6], %[ptmp], 0x60)
2667  "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2668  "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2669  "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2670  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2671  "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2672  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2673  MMI_LDC1(%[ftmp8], %[ptmp], 0x70)
2674  "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2675  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
2676  MMI_USDC1(%[ftmp3], %[addr1], 0x00)
2677  PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2678  "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
2679  "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2680  "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2681  "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2682  "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
2683  "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
2684  MMI_USDC1(%[ftmp2], %[addr5], 0x00)
2685  MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
2686  "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
2687  "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
2688  "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
2689  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2690  "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
2691  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
2692  PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
2693  MMI_USDC1(%[ftmp0], %[addr1], 0x00)
2694  PTR_ADDU "%[addr6], %[addr4], %[stride] \n\t"
2695  MMI_USDC1(%[ftmp5], %[addr5], 0x00)
2696  PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
2697  MMI_USDC1(%[ftmp7], %[addr6], 0x00)
2698  PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
2699  MMI_USDC1(%[ftmp4], %[addr5], 0x00)
2700  MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
2701  PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2702  "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
2703  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
2704  "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
2705  "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
2706  MMI_USDC1(%[ftmp3], %[addr5], 0x00)
2707  PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2708  MMI_USDC1(%[ftmp0], %[addr4], 0x00)
2709  PTR_ADDU "%[addr6], %[addr4], %[addr3] \n\t"
2710  MMI_USDC1(%[ftmp6], %[addr5], 0x00)
2711  MMI_USDC1(%[ftmp5], %[addr6], 0x00)
2712  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2713  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2714  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2715  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2716  [ftmp8]"=&f"(ftmp[8]),
2717  RESTRICT_ASM_ALL64
2718  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
2719  [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
2720  [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
2721  [addr6]"=&r"(addr[6])
2722  : [pix]"r"(pix), [stride]"r"((mips_reg)stride),
2723  [ptmp]"r"(ptmp), [pdat]"r"(pdat)
2724  : "memory"
2725  );
2726 }
stride
int stride
Definition: mace.c:144
ff_h264_idct8_add_8_mmi
void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
Definition: h264dsp_mmi.c:170
PTR_SLL
#define PTR_SLL
Definition: asmdefs.h:55
output
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output
Definition: filter_design.txt:225
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
ff_pb_3
const uint64_t ff_pb_3
Definition: constants.c:58
b
#define b
Definition: input.c:41
ff_h264_idct8_add4_8_mmi
void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
Definition: h264dsp_mmi.c:797
ff_h264_biweight_pixels4_8_mmi
void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weightd, int weights, int offset)
Definition: h264dsp_mmi.c:1392
mips_reg
#define mips_reg
Definition: asmdefs.h:44
ff_h264_add_pixels4_8_mmi
void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
Definition: h264dsp_mmi.c:30
ff_h264_weight_pixels8_8_mmi
void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height, int log2_denom, int weight, int offset)
Definition: h264dsp_mmi.c:1259
PTR_ADDI
#define PTR_ADDI
Definition: asmdefs.h:49
deblock_v8_luma_intra_8_mmi
static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
Definition: h264dsp_mmi.c:1564
ff_h264_biweight_pixels16_8_mmi
void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weightd, int weights, int offset)
Definition: h264dsp_mmi.c:1188
mmiutils.h
ff_h264_chroma_dc_dequant_idct_8_mmi
void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
Definition: h264dsp_mmi.c:1118
ff_deblock_v8_luma_8_mmi
void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
Definition: h264dsp_mmi.c:1436
ff_pb_A1
const uint64_t ff_pb_A1
Definition: constants.c:60
ff_pw_32
const uint64_t ff_pw_32
Definition: constants.c:43
h264dsp_mips.h
src
#define src
Definition: vp8dsp.c:254
bit_depth_template.c
ff_deblock_h_chroma_8_mmi
void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
Definition: h264dsp_mmi.c:1952
ff_deblock_v_luma_intra_8_mmi
void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
Definition: h264dsp_mmi.c:2234
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
weight
static int weight(int i, int blen, int offset)
Definition: diracdec.c:1560
ff_pw_1
const uint64_t ff_pw_1
Definition: constants.c:26
ff_deblock_h_luma_8_mmi
void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
Definition: h264dsp_mmi.c:2241
ff_h264_idct_add8_8_mmi
void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
Definition: h264dsp_mmi.c:814
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
ff_h264_biweight_pixels8_8_mmi
void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weightd, int weights, int offset)
Definition: h264dsp_mmi.c:1302
height
#define height
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
PTR_SUBU
#define PTR_SUBU
Definition: asmdefs.h:50
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
input
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input
Definition: filter_design.txt:172
src0
#define src0
Definition: h264pred.c:138
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:112
src1
#define src1
Definition: h264pred.c:139
ff_h264_idct_add16intra_8_mmi
void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
Definition: h264dsp_mmi.c:784
ff_h264_chroma422_dc_dequant_idct_8_mmi
void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
Definition: h264dsp_mmi.c:1085
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
ff_h264_idct_add_8_mmi
void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
Definition: h264dsp_mmi.c:78
ff_h264_idct_add8_422_8_mmi
void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
Definition: h264dsp_mmi.c:830
uint8_t
uint8_t
Definition: audio_convert.c:194
ff_deblock_v_luma_8_mmi
void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
Definition: h264dsp_mmi.c:2225
ff_deblock_v_chroma_8_mmi
void ff_deblock_v_chroma_8_mmi(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
Definition: h264dsp_mmi.c:1790
ff_h264_idct_add16_8_mmi
void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
Definition: h264dsp_mmi.c:767
ff_h264_weight_pixels4_8_mmi
void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height, int log2_denom, int weight, int offset)
Definition: h264dsp_mmi.c:1354
PTR_ADDU
#define PTR_ADDU
Definition: asmdefs.h:47
ff_deblock_h_luma_intra_8_mmi
void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
Definition: h264dsp_mmi.c:2460
ff_deblock_h_chroma_intra_8_mmi
void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
Definition: h264dsp_mmi.c:2092
scan8
static const uint8_t scan8[16 *3+3]
Definition: h264dec.h:650
temp
else temp
Definition: vf_mcdeint.c:256
PTR_S
#define PTR_S
Definition: asmdefs.h:52
PTR_ADDIU
#define PTR_ADDIU
Definition: asmdefs.h:48
ff_pb_1
const uint64_t ff_pb_1
Definition: constants.c:57
ff_deblock_v_chroma_intra_8_mmi
void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
Definition: h264dsp_mmi.c:1874
PTR_L
#define PTR_L
Definition: asmdefs.h:51
PTR_SRL
#define PTR_SRL
Definition: asmdefs.h:54
alpha
static const int16_t alpha[]
Definition: ilbcdata.h:55
ff_h264_weight_pixels16_8_mmi
void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height, int log2_denom, int weight, int offset)
Definition: h264dsp_mmi.c:1132
ff_h264_idct8_dc_add_8_mmi
void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
Definition: h264dsp_mmi.c:682
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
block1
static int16_t block1[64]
Definition: dct.c:116
ff_h264_idct_dc_add_8_mmi
void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
Definition: h264dsp_mmi.c:638
ff_h264_luma_dc_dequant_idct_8_mmi
void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input, int qmul)
Definition: h264dsp_mmi.c:858