/*
 * Loongson SIMD optimized vp8dsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp8dsp_mips.h"
#include "constants.h"
#include "libavutil/attributes.h"
#include "libavutil/mem_internal.h"
#include "libavutil/mips/mmiutils.h"

#define DECLARE_DOUBLE_1 double db_1
#define DECLARE_DOUBLE_2 double db_2
#define DECLARE_UINT32_T uint32_t it_1
#define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
#define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
#define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)

#define MMI_PCMPGTUB(dst, src1, src2) \
    "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
    "pmaxub %[db_2], "#src1", "#src2" \n\t" \
    "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
    "xor "#dst", %[db_2], %[db_1] \n\t"

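/*
 * What MMI_PCMPGTUB computes: Loongson MMI has no unsigned byte
 * "greater than" compare, so it is assembled from pcmpeqb and pmaxub.
 * Per byte, max(src1, src2) == src1 holds exactly when src1 >= src2,
 * and xoring out the equality mask leaves the strictly-greater case.
 * A scalar sketch of the per-byte result (illustration only, kept out
 * of the build like the other reference snippets in this file):
 */
#if 0
static uint8_t pcmpgtub_ref(uint8_t a, uint8_t b)
{
    uint8_t eq  = (a == b)           ? 0xff : 0x00; /* pcmpeqb          */
    uint8_t geq = (FFMAX(a, b) == a) ? 0xff : 0x00; /* pmaxub + pcmpeqb */
    return geq ^ eq;                                /* xor -> a > b     */
}
#endif
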
#define MMI_BTOH(dst_l, dst_r, src) \
    "xor %[db_1], %[db_1], %[db_1] \n\t" \
    "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
    "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
    "punpckhbh "#dst_l", "#src", %[db_2] \n\t"

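/*
 * MMI_BTOH widens one vector of 8 signed bytes into two vectors of 4
 * halfwords: pcmpgtb against zero yields 0xff for negative bytes,
 * which punpcklbh/punpckhbh then interleave in as the sign-extension
 * byte. Scalar sketch of the per-byte effect (illustration only):
 */
#if 0
static int16_t btoh_ref(int8_t b)
{
    uint8_t sign = (b < 0) ? 0xff : 0x00;     /* pcmpgtb vs zero */
    return (int16_t)(sign << 8 | (uint8_t)b); /* punpck{l,h}bh   */
}
#endif
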
#define MMI_VP8_LOOP_FILTER \
    /* Calculation of hev */ \
    "dmtc1 %[thresh], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
    "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
    "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
    MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
    /* Calculation of mask */ \
    "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
    "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
    "li %[tmp0], 0x09 \n\t" \
    "dmtc1 %[tmp0], %[ftmp3] \n\t" \
    PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
    "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "dmtc1 %[e], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
    "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
    "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
    "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
    "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
    "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
    "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
    "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
    "dmtc1 %[i], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
    "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "xor %[mask], %[mask], %[ftmp3] \n\t" \
    /* VP8_MBFILTER */ \
    "li %[tmp0], 0x80808080 \n\t" \
    "dmtc1 %[tmp0], %[ftmp7] \n\t" \
    "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
    "xor %[p2], %[p2], %[ftmp7] \n\t" \
    "xor %[p1], %[p1], %[ftmp7] \n\t" \
    "xor %[p0], %[p0], %[ftmp7] \n\t" \
    "xor %[q0], %[q0], %[ftmp7] \n\t" \
    "xor %[q1], %[q1], %[ftmp7] \n\t" \
    "xor %[q2], %[q2], %[ftmp7] \n\t" \
    "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
    "psubb %[ftmp5], %[q0], %[p0] \n\t" \
    MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
    MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
    /* Right part */ \
    "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
    "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
    "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
    /* Left part */ \
    "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
    "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
    "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
    "and %[ftmp1], %[ftmp1], %[mask] \n\t" \
    "and %[ftmp2], %[ftmp1], %[hev] \n\t" \
    "li %[tmp0], 0x04040404 \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
    "li %[tmp0], 0x0B \n\t" \
    "dmtc1 %[tmp0], %[ftmp4] \n\t" \
    PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
    "li %[tmp0], 0x03030303 \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
    "li %[tmp0], 0x0B \n\t" \
    "dmtc1 %[tmp0], %[ftmp2] \n\t" \
    PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
    "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
    "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
    /* filt_val &= ~hev */ \
    "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "xor %[hev], %[hev], %[ftmp0] \n\t" \
    "and %[ftmp1], %[ftmp1], %[hev] \n\t" \
    MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
    "li %[tmp0], 0x07 \n\t" \
    "dmtc1 %[tmp0], %[ftmp2] \n\t" \
    "li %[tmp0], 0x001b001b \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    "li %[tmp0], 0x003f003f \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    /* Right part */ \
    "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
    "xor %[q0], %[q0], %[ftmp7] \n\t" \
    "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
    "xor %[p0], %[p0], %[ftmp7] \n\t" \
    "li %[tmp0], 0x00120012 \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    /* Right part */ \
    "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
    "xor %[q1], %[q1], %[ftmp7] \n\t" \
    "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
    "xor %[p1], %[p1], %[ftmp7] \n\t" \
    "li %[tmp0], 0x03 \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    /* Right part */ \
    "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
    "xor %[q2], %[q2], %[ftmp7] \n\t" \
    "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
    "xor %[p2], %[p2], %[ftmp7] \n\t"

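/*
 * The hev/mask prologue of MMI_VP8_LOOP_FILTER vectorizes the same
 * predicates that the scalar helpers vp8_normal_limit() and hev()
 * below evaluate per pixel: mask selects pixels the filter may touch
 * at all (edge delta within E, inner deltas within I), while hev flags
 * high edge variance and switches between the 4-tap and the stronger
 * filter path. Scalar sketch of the two predicates (illustration only;
 * E, I, thresh correspond to the %[e], %[i], %[thresh] inputs):
 */
#if 0
static int vp8_mask_ref(int p3, int p2, int p1, int p0,
                        int q0, int q1, int q2, int q3, int E, int I)
{
    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= E &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
}

static int vp8_hev_ref(int p1, int p0, int q0, int q1, int thresh)
{
    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
}
#endif
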
#define PUT_VP8_EPEL4_H6_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)

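/*
 * PUT_VP8_EPEL4_H6_MMI (and the wider EPEL macros below) are the
 * vectorized form of the FILTER_6TAP/FILTER_4TAP references kept in
 * the #if 0 block further down: pixels are widened to halfwords, each
 * tap is applied with pmullh, and the ff_pw_64 bias plus the psrah by
 * %[ftmp4] implement the "+ 64) >> 7" rounding of the reference.
 * Scalar sketch for one row of four pixels (illustration only; F is
 * one row of the byte-valued subpel_filters table):
 */
#if 0
static void put_epel4_h6_ref(uint8_t *dst, const uint8_t *src,
                             const uint8_t F[6])
{
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x;

    for (x = 0; x < 4; x++)
        dst[x] = cm[(F[2] * src[x + 0] - F[1] * src[x - 1] +
                     F[0] * src[x - 2] + F[3] * src[x + 1] -
                     F[4] * src[x + 2] + F[5] * src[x + 3] + 64) >> 7];
}
#endif
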

#define PUT_VP8_EPEL4_H4_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)

#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)

#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)

#define PUT_VP8_EPEL8_H6_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)

#define PUT_VP8_EPEL8_H4_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)

#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)

#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)

#define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)

#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)

#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)

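/*
 * The BILINEAR macros implement VP8's 2-tap motion compensation with
 * "+ 4) >> 3" rounding (%[ftmp4] carries the shift of 3); in the
 * scalar reference the weights are a = 8 - mx, b = mx for the
 * horizontal pass and c = 8 - my, d = my for the vertical one.
 * Sketch of one horizontal row (illustration only; width is 4 or 8
 * for the macros above):
 */
#if 0
static void put_bilinear_h_ref(uint8_t *dst, const uint8_t *src,
                               int width, int mx)
{
    int a = 8 - mx, b = mx, x;

    for (x = 0; x < width; x++)
        dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
}
#endif
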

DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
    {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
     0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},

    {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
     0x0024002400240024, 0x0008000800080008, 0x0001000100010001},

    {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
     0x0032003200320032, 0x0006000600060006, 0x0000000000000000},

    {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
     0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},

    {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
     0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},

    {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
     0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},

    {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
     0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
};

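/*
 * Each 64-bit constant above is one tap of the byte-valued
 * subpel_filters table (kept in the #if 0 block below) replicated
 * into four int16 lanes, so a single pmullh applies that tap to four
 * pixels at once; e.g. tap 123 == 0x7b becomes 0x007b007b007b007b.
 * Sketch of how such an entry is formed (illustration only):
 */
#if 0
static uint64_t splat4_u16(uint16_t v)
{
    uint64_t x = v;
    return x | x << 16 | x << 32 | x << 48;
}
#endif
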
#if 0
#define FILTER_6TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]

#define FILTER_4TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]

static const uint8_t subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};

#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
#define MUL_35468(a) (((a) * 35468) >> 16)
#endif

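/*
 * The two fixed-point helpers above resurface in ff_vp8_idct_add_mmi
 * as the constants ff_ph_4e7b and ff_ph_22a3: 0x4e7b == 20091, and
 * since pmulhh keeps the high 16 bits of a signed 16x16 product,
 * MUL_20091 maps to pmulhh + paddh; 35468 does not fit in int16, so
 * it is applied as ((a << 2) * 0x22a3) >> 16, using 4 * 8867 == 35468.
 * Quick check of the integer identities (illustration only):
 */
#if 0
static void mul_const_check(int16_t a)
{
    av_assert0(MUL_20091(a) == ((a * 0x4e7b) >> 16) + a);
    av_assert0(MUL_35468(a) == ((a << 2) * 0x22a3) >> 16);
}
#endif
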
#define clip_int8(n) (cm[(n) + 0x80] - 0x80)

static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a += clip_int8(p1 - q1);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
}

static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
    a = (f1 + 1) >> 1;
    p[-2 * stride] = cm[p1 + a];
    p[ 1 * stride] = cm[q1 - a];
}

static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
        int flim)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
}

static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
}

static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
{
    int a0, a1, a2, w;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];

    w = clip_int8(p1 - q1);
    w = clip_int8(w + 3 * (q0 - p0));

    a0 = (27 * w + 63) >> 7;
    a1 = (18 * w + 63) >> 7;
    a2 = (9 * w + 63) >> 7;

    p[-3 * stride] = cm[p2 + a2];
    p[-2 * stride] = cm[p1 + a1];
    p[-1 * stride] = cm[p0 + a0];
    p[ 0 * stride] = cm[q0 - a0];
    p[ 1 * stride] = cm[q1 - a1];
    p[ 2 * stride] = cm[q2 - a2];
}

static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
        int E, int I)
{
    int av_unused p3 = p[-4 * stride];
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    int av_unused q3 = p[ 3 * stride];

    return vp8_simple_limit(p, stride, E) &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
}

static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;

    __asm__ volatile(
        /* Get data from dst */
        "gsldlc1 %[q0], 0x07(%[dst]) \n\t"
        "gsldrc1 %[q0], 0x00(%[dst]) \n\t"
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        "gsldlc1 %[p0], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p0], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p1], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p1], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p2], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p2], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p3], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p3], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        "gsldlc1 %[q1], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[q1], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q2], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[q2], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q3], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[q3], 0x00(%[tmp0]) \n\t"
        MMI_VP8_LOOP_FILTER
        /* Move to dst */
        "gssdlc1 %[q0], 0x07(%[dst]) \n\t"
        "gssdrc1 %[q0], 0x00(%[dst]) \n\t"
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        "gssdlc1 %[p0], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[p0], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gssdlc1 %[p1], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[p1], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gssdlc1 %[p2], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[p2], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        "gssdlc1 %[q1], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[q1], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gssdlc1 %[q2], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[q2], 0x00(%[tmp0]) \n\t"
        : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;

    __asm__ volatile(
        /* Get data from dst */
        "gsldlc1 %[p3], 0x03(%[dst]) \n\t"
        "gsldrc1 %[p3], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        "gsldlc1 %[p2], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[p2], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p1], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[p1], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p0], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[p0], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q0], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q0], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q1], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q1], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q2], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q2], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t"
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        MMI_VP8_LOOP_FILTER
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        /* Move to dst */
        "gssdlc1 %[p3], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p3], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[p2], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p2], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[p1], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p1], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[p0], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p0], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q0], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q0], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q1], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q1], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q2], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q2], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q3], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q3], -0x04(%[dst]) \n\t"
        : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
{
#if 1
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        MMI_LDC1(%[ftmp0], %[dc], 0x00)
        MMI_LDC1(%[ftmp1], %[dc], 0x08)
        MMI_LDC1(%[ftmp2], %[dc], 0x10)
        MMI_LDC1(%[ftmp3], %[dc], 0x18)
        "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
        "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
        "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
        "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
        "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
        "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp1], %[dc], 0x08)
        MMI_SDC1(%[ftmp2], %[dc], 0x10)
        MMI_SDC1(%[ftmp3], %[dc], 0x18)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          RESTRICT_ASM_ALL64
          [ftmp7]"=&f"(ftmp[7])
        : [dc]"r"((uint8_t*)dc)
        : "memory"
    );

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp0], %[dc], 0x08)
        MMI_SDC1(%[ftmp0], %[dc], 0x10)
        MMI_SDC1(%[ftmp0], %[dc], 0x18)
        : RESTRICT_ASM_ALL64
          [ftmp0]"=&f"(ftmp[0])
        : [dc]"r"((uint8_t *)dc)
        : "memory"
    );
#else
    int t00, t01, t02, t03, t10, t11, t12, t13;
    int t20, t21, t22, t23, t30, t31, t32, t33;

    t00 = dc[0] + dc[12];
    t10 = dc[1] + dc[13];
    t20 = dc[2] + dc[14];
    t30 = dc[3] + dc[15];

    t03 = dc[0] - dc[12];
    t13 = dc[1] - dc[13];
    t23 = dc[2] - dc[14];
    t33 = dc[3] - dc[15];

    t01 = dc[4] + dc[ 8];
    t11 = dc[5] + dc[ 9];
    t21 = dc[6] + dc[10];
    t31 = dc[7] + dc[11];

    t02 = dc[4] - dc[ 8];
    t12 = dc[5] - dc[ 9];
    t22 = dc[6] - dc[10];
    t32 = dc[7] - dc[11];

    dc[ 0] = t00 + t01;
    dc[ 1] = t10 + t11;
    dc[ 2] = t20 + t21;
    dc[ 3] = t30 + t31;

    dc[ 4] = t03 + t02;
    dc[ 5] = t13 + t12;
    dc[ 6] = t23 + t22;
    dc[ 7] = t33 + t32;

    dc[ 8] = t00 - t01;
    dc[ 9] = t10 - t11;
    dc[10] = t20 - t21;
    dc[11] = t30 - t31;

    dc[12] = t03 - t02;
    dc[13] = t13 - t12;
    dc[14] = t23 - t22;
    dc[15] = t33 - t32;

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    AV_ZERO64(dc + 0);
    AV_ZERO64(dc + 4);
    AV_ZERO64(dc + 8);
    AV_ZERO64(dc + 12);
#endif
}

void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
{
    int val = (dc[0] + 3) >> 3;

    dc[0] = 0;

    block[0][0][0] = val;
    block[0][1][0] = val;
    block[0][2][0] = val;
    block[0][3][0] = val;
    block[1][0][0] = val;
    block[1][1][0] = val;
    block[1][2][0] = val;
    block[1][3][0] = val;
    block[2][0][0] = val;
    block[2][1][0] = val;
    block[2][2][0] = val;
    block[2][3][0] = val;
    block[3][0][0] = val;
    block[3][1][0] = val;
    block[3][2][0] = val;
    block[3][3][0] = val;
}

void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
    double ftmp[12];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp2], %[block], 0x08)
        MMI_LDC1(%[ftmp3], %[block], 0x10)
        MMI_LDC1(%[ftmp4], %[block], 0x18)

        "li %[tmp0], 0x02 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"

        // block[0...3] + block[8...11]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // block[0...3] - block[8...11]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // MUL_35468(block[12...15])
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_35468(block[4...7])
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_20091(block[4...7])
        "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
        // MUL_20091(block[12...15])
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"

        // tmp[0 4 8 12]
        "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
        // tmp[1 5 9 13]
        "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
        // tmp[2 6 10 14]
        "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
        // tmp[3 7 11 15]
        "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
        "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp0], %[block], 0x08)
        MMI_SDC1(%[ftmp0], %[block], 0x10)
        MMI_SDC1(%[ftmp0], %[block], 0x18)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        // t[0 4 8 12]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // t[1 5 9 13]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // t[2 6 10 14]
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
        // t[3 7 11 15]
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
        "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"
        "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
        "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
        "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
        "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
        "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
        "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
        "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        MMI_LWC1(%[ftmp5], %[dst0], 0x00)
        MMI_LWC1(%[ftmp6], %[dst1], 0x00)
        MMI_LWC1(%[ftmp7], %[dst2], 0x00)
        MMI_LWC1(%[ftmp8], %[dst3], 0x00)

        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [tmp0]"=&r"(tmp[0])
        : [dst0]"r"(dst), [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
          [block]"r"(block), [ff_pw_4]"f"(ff_pw_4),
          [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_22a3]"f"(ff_ph_22a3)
        : "memory"
    );
#else
    int i, t0, t1, t2, t3;
    int16_t tmp[16];

    for (i = 0; i < 4; i++) {
        t0 = block[0 + i] + block[8 + i];
        t1 = block[0 + i] - block[8 + i];
        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
        block[ 0 + i] = 0;
        block[ 4 + i] = 0;
        block[ 8 + i] = 0;
        block[12 + i] = 0;

        tmp[i * 4 + 0] = t0 + t3;
        tmp[i * 4 + 1] = t1 + t2;
        tmp[i * 4 + 2] = t1 - t2;
        tmp[i * 4 + 3] = t0 - t3;
    }

    for (i = 0; i < 4; i++) {
        t0 = tmp[0 + i] + tmp[8 + i];
        t1 = tmp[0 + i] - tmp[8 + i];
        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);

        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    int dc = (block[0] + 4) >> 3;
    double ftmp[6];
    DECLARE_VAR_LOW32;

    block[0] = 0;

    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "mtc1 %[dc], %[ftmp5] \n\t"
        MMI_LWC1(%[ftmp1], %[dst0], 0x00)
        MMI_LWC1(%[ftmp2], %[dst1], 0x00)
        MMI_LWC1(%[ftmp3], %[dst2], 0x00)
        MMI_LWC1(%[ftmp4], %[dst3], 0x00)
        "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_LOW32
          [ftmp5]"=&f"(ftmp[5])
        : [dst0]"r"(dst), [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
          [dc]"r"(dc)
        : "memory"
    );
#else
    int i, dc = (block[0] + 4) >> 3;

    block[0] = 0;

    for (i = 0; i < 4; i++) {
        dst[0] = av_clip_uint8(dst[0] + dc);
        dst[1] = av_clip_uint8(dst[1] + dc);
        dst[2] = av_clip_uint8(dst[2] + dc);
        dst[3] = av_clip_uint8(dst[3] + dc);
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst + 0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + 4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst + 8, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
}

void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
}

// loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
                           hev_thresh);
}

void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

// loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i, stride, flim))
            vp8_filter_common_is4tap(dst + i, stride);
}

void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i * stride, 1, flim))
            vp8_filter_common_is4tap(dst + i * stride, 1);
}

void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[2];
    uint64_t tmp[2];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl %[tmp0], 0x0f(%[src]) \n\t"
        "ldr %[tmp0], 0x08(%[src]) \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
        "ldr %[tmp1], 0x08(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "sdl %[tmp0], 0x0f(%[dst]) \n\t"
        "sdr %[tmp0], 0x08(%[dst]) \n\t"
        "addiu %[h], %[h], -0x02 \n\t"
        MMI_SDC1(%[ftmp1], %[addr1], 0x00)
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
        "sdr %[tmp1], 0x08(%[addr1]) \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 16);
#endif
}

void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl %[tmp0], 0x07(%[addr0]) \n\t"
        "ldr %[tmp0], 0x00(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "addiu %[h], %[h], -0x02 \n\t"
        "sdl %[tmp0], 0x07(%[addr1]) \n\t"
        "sdr %[tmp0], 0x00(%[addr1]) \n\t"
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 8);
#endif
}
1550 
1551 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1552  ptrdiff_t srcstride, int h, int x, int y)
1553 {
1554 #if 1
1555  double ftmp[1];
1556  uint64_t tmp[1];
1557  mips_reg addr[2];
1558  DECLARE_VAR_LOW32;
1559 
1560  __asm__ volatile (
1561  "1: \n\t"
1562  PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1563  MMI_LWC1(%[ftmp0], %[src], 0x00)
1564  "lwl %[tmp0], 0x03(%[addr0]) \n\t"
1565  "lwr %[tmp0], 0x00(%[addr0]) \n\t"
1566  PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1567  MMI_SWC1(%[ftmp0], %[dst], 0x00)
1568  "addiu %[h], %[h], -0x02 \n\t"
1569  "swl %[tmp0], 0x03(%[addr1]) \n\t"
1570  "swr %[tmp0], 0x00(%[addr1]) \n\t"
1571  PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1572  PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1573  "bnez %[h], 1b \n\t"
1574  : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1575  RESTRICT_ASM_LOW32
1576  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1577  [dst]"+&r"(dst), [src]"+&r"(src),
1578  [h]"+&r"(h)
1579  : [dststride]"r"((mips_reg)dststride),
1580  [srcstride]"r"((mips_reg)srcstride)
1581  : "memory"
1582  );
1583 #else
1584  int i;
1585 
1586  for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1587  memcpy(dst, src, 4);
1588 #endif
1589 }
1590 
1591 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1592  ptrdiff_t srcstride, int h, int mx, int my)
1593 {
1594 #if 1
1595  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1596  double ftmp[9];
1597  uint32_t tmp[1];
1598  mips_reg src1, dst1;
1599  DECLARE_VAR_ALL64;
1600 
1601  /*
1602  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1603  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1604  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1605  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1606  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1607  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1608  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1609  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1610 
1611  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1612  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1613  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1614  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1615  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1616  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1617  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1618  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1619  */
1620  __asm__ volatile (
1621  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1622  "li %[tmp0], 0x07 \n\t"
1623  "mtc1 %[tmp0], %[ftmp4] \n\t"
1624 
1625  "1: \n\t"
1626  // 0 - 7
1627  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1628  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1629  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1630  // 8 - 15
1631  PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1632 
1633  "addiu %[h], %[h], -0x01 \n\t"
1634  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1635  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1636  "bnez %[h], 1b \n\t"
1637  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1638  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1639  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1640  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1641  [ftmp8]"=&f"(ftmp[8]),
1642  [tmp0]"=&r"(tmp[0]),
1643  RESTRICT_ASM_ALL64
1644  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1645  [h]"+&r"(h),
1646  [dst]"+&r"(dst), [src]"+&r"(src)
1647  : [ff_pw_64]"f"(ff_pw_64),
1648  [srcstride]"r"((mips_reg)srcstride),
1649  [dststride]"r"((mips_reg)dststride),
1650  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1651  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1652  : "memory"
1653  );
1654 #else
1655  const uint8_t *filter = subpel_filters[mx - 1];
1656  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1657  int x, y;
1658 
1659  for (y = 0; y < h; y++) {
1660  for (x = 0; x < 16; x++)
1661  dst[x] = FILTER_4TAP(src, filter, 1);
1662  dst += dststride;
1663  src += srcstride;
1664  }
1665 #endif
1666 }
1667 
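/* The 4-tap coefficients of every subpel filter sum to 128, so the +64
 * bias followed by >>7 is round-to-nearest with unity DC gain. Worked
 * example with the mx == 3 filter {0, 9, 93, 50, 6, 0} on a flat row of
 * 100s:
 *     (93*100 - 9*100 + 50*100 - 6*100 + 64) >> 7
 *   = (128*100 + 64) >> 7
 *   = 100
 * For non-flat input the cm[] table then clamps the result to 0..255. */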
1668 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1669  ptrdiff_t srcstride, int h, int mx, int my)
1670 {
1671 #if 1
1672  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1673  double ftmp[9];
1674  uint32_t tmp[1];
1675  DECLARE_VAR_ALL64;
1676 
1677  /*
1678  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1679  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1680  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1681  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1682  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1683  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1684  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1685  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1686  */
1687  __asm__ volatile (
1688  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1689  "li %[tmp0], 0x07 \n\t"
1690  "mtc1 %[tmp0], %[ftmp4] \n\t"
1691 
1692  "1: \n\t"
1693  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1694 
1695  "addiu %[h], %[h], -0x01 \n\t"
1696  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1697  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1698  "bnez %[h], 1b \n\t"
1699  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1700  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1701  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1702  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1703  [ftmp8]"=&f"(ftmp[8]),
1704  [tmp0]"=&r"(tmp[0]),
1705  RESTRICT_ASM_ALL64
1706  [h]"+&r"(h),
1707  [dst]"+&r"(dst), [src]"+&r"(src)
1708  : [ff_pw_64]"f"(ff_pw_64),
1709  [srcstride]"r"((mips_reg)srcstride),
1710  [dststride]"r"((mips_reg)dststride),
1711  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1712  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1713  : "memory"
1714  );
1715 #else
1716  const uint8_t *filter = subpel_filters[mx - 1];
1717  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1718  int x, y;
1719 
1720  for (y = 0; y < h; y++) {
1721  for (x = 0; x < 8; x++)
1722  dst[x] = FILTER_4TAP(src, filter, 1);
1723  dst += dststride;
1724  src += srcstride;
1725  }
1726 #endif
1727 }
1728 
1729 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1730  ptrdiff_t srcstride, int h, int mx, int my)
1731 {
1732 #if 1
1733  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1734  double ftmp[6];
1735  uint32_t tmp[1];
1736  DECLARE_VAR_LOW32;
1737 
1738  /*
1739  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1740  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1741  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1742  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1743  */
1744  __asm__ volatile (
1745  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1746  "li %[tmp0], 0x07 \n\t"
1747  "mtc1 %[tmp0], %[ftmp4] \n\t"
1748 
1749  "1: \n\t"
1750  PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1751 
1752  "addiu %[h], %[h], -0x01 \n\t"
1753  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1754  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1755  "bnez %[h], 1b \n\t"
1756  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1757  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1758  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1759  [tmp0]"=&r"(tmp[0]),
1760  RESTRICT_ASM_LOW32
1761  [h]"+&r"(h),
1762  [dst]"+&r"(dst), [src]"+&r"(src)
1763  : [ff_pw_64]"f"(ff_pw_64),
1764  [srcstride]"r"((mips_reg)srcstride),
1765  [dststride]"r"((mips_reg)dststride),
1766  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1767  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1768  : "memory"
1769  );
1770 #else
1771  const uint8_t *filter = subpel_filters[mx - 1];
1772  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1773  int x, y;
1774 
1775  for (y = 0; y < h; y++) {
1776  for (x = 0; x < 4; x++)
1777  dst[x] = FILTER_4TAP(src, filter, 1);
1778  dst += dststride;
1779  src += srcstride;
1780  }
1781 #endif
1782 }
1783 
1784 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1785  ptrdiff_t srcstride, int h, int mx, int my)
1786 {
1787 #if 1
1788  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1789  double ftmp[9];
1790  uint32_t tmp[1];
1791  mips_reg src1, dst1;
1792  DECLARE_VAR_ALL64;
1793 
1794  /*
1795  dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1796  dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1797  dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1798  dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1799  dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1800  dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1801  dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1802  dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1803 
1804  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1805  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1806  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1807  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1808  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1809  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1810  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1811  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1812  */
1813  __asm__ volatile (
1814  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1815  "li %[tmp0], 0x07 \n\t"
1816  "mtc1 %[tmp0], %[ftmp4] \n\t"
1817 
1818  "1: \n\t"
1819  // 0 - 7
1820  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1821  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1822  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1823  // 8 - 15
1824  PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1825 
1826  "addiu %[h], %[h], -0x01 \n\t"
1827  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1828  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1829  "bnez %[h], 1b \n\t"
1830  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1831  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1832  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1833  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1834  [ftmp8]"=&f"(ftmp[8]),
1835  [tmp0]"=&r"(tmp[0]),
1836  RESTRICT_ASM_ALL64
1837  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1838  [h]"+&r"(h),
1839  [dst]"+&r"(dst), [src]"+&r"(src)
1840  : [ff_pw_64]"f"(ff_pw_64),
1841  [srcstride]"r"((mips_reg)srcstride),
1842  [dststride]"r"((mips_reg)dststride),
1843  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1844  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1845  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1846  : "memory"
1847  );
1848 #else
1849  const uint8_t *filter = subpel_filters[mx - 1];
1850  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1851  int x, y;
1852 
1853  for (y = 0; y < h; y++) {
1854  for (x = 0; x < 16; x++)
1855  dst[x] = FILTER_6TAP(src, filter, 1);
1856  dst += dststride;
1857  src += srcstride;
1858  }
1859 #endif
1860 }
1861 
1862 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1863  ptrdiff_t srcstride, int h, int mx, int my)
1864 {
1865 #if 1
1866  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1867  double ftmp[9];
1868  uint32_t tmp[1];
1869  DECLARE_VAR_ALL64;
1870 
1871  /*
1872  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1873  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1874  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1875  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1876  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1877  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1878  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1879  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1880  */
1881  __asm__ volatile (
1882  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1883  "li %[tmp0], 0x07 \n\t"
1884  "mtc1 %[tmp0], %[ftmp4] \n\t"
1885 
1886  "1: \n\t"
1887  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1888 
1889  "addiu %[h], %[h], -0x01 \n\t"
1890  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1891  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1892  "bnez %[h], 1b \n\t"
1893  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1894  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1895  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1896  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1897  [ftmp8]"=&f"(ftmp[8]),
1898  [tmp0]"=&r"(tmp[0]),
1899  RESTRICT_ASM_ALL64
1900  [h]"+&r"(h),
1901  [dst]"+&r"(dst), [src]"+&r"(src)
1902  : [ff_pw_64]"f"(ff_pw_64),
1903  [srcstride]"r"((mips_reg)srcstride),
1904  [dststride]"r"((mips_reg)dststride),
1905  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1906  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1907  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1908  : "memory"
1909  );
1910 #else
1911  const uint8_t *filter = subpel_filters[mx - 1];
1912  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1913  int x, y;
1914 
1915  for (y = 0; y < h; y++) {
1916  for (x = 0; x < 8; x++)
1917  dst[x] = FILTER_6TAP(src, filter, 1);
1918  dst += dststride;
1919  src += srcstride;
1920  }
1921 #endif
1922 }
1923 
1924 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1925  ptrdiff_t srcstride, int h, int mx, int my)
1926 {
1927 #if 1
1928  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1929  double ftmp[6];
1930  uint32_t tmp[1];
1931  DECLARE_VAR_LOW32;
1932 
1933  /*
1934  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1935  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1936  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1937  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1938  */
1939  __asm__ volatile (
1940  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1941  "li %[tmp0], 0x07 \n\t"
1942  "mtc1 %[tmp0], %[ftmp4] \n\t"
1943 
1944  "1: \n\t"
1945  PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1946 
1947  "addiu %[h], %[h], -0x01 \n\t"
1948  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1949  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1950  "bnez %[h], 1b \n\t"
1951  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1952  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1953  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1954  [tmp0]"=&r"(tmp[0]),
1955  RESTRICT_ASM_LOW32
1956  [h]"+&r"(h),
1957  [dst]"+&r"(dst), [src]"+&r"(src)
1958  : [ff_pw_64]"f"(ff_pw_64),
1959  [srcstride]"r"((mips_reg)srcstride),
1960  [dststride]"r"((mips_reg)dststride),
1961  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1962  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1963  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1964  : "memory"
1965  );
1966 #else
1967  const uint8_t *filter = subpel_filters[mx - 1];
1968  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1969  int x, y;
1970 
1971  for (y = 0; y < h; y++) {
1972  for (x = 0; x < 4; x++)
1973  dst[x] = FILTER_6TAP(src, filter, 1);
1974  dst += dststride;
1975  src += srcstride;
1976  }
1977 #endif
1978 }
1979 
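/* A scalar sketch of the 6-tap kernel the three *_h6 loops above
 * vectorize, matching the commented reference code (the cm[] clamp to
 * 0..255 is omitted here); stride is 1 for the horizontal variants:
 */
static av_unused int vp8_filter_6tap_ref(const uint8_t *src,
                                         const uint8_t *F, int stride)
{
    return (F[2] * src[ 0 * stride] - F[1] * src[-1 * stride] +
            F[0] * src[-2 * stride] + F[3] * src[ 1 * stride] -
            F[4] * src[ 2 * stride] + F[5] * src[ 3 * stride] + 64) >> 7;
}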
1980 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1981  ptrdiff_t srcstride, int h, int mx, int my)
1982 {
1983 #if 1
1984  const uint64_t *filter = fourtap_subpel_filters[my - 1];
1985  double ftmp[9];
1986  uint32_t tmp[1];
1987  mips_reg src0, src1, dst0;
1988  DECLARE_VAR_ALL64;
1989 
1990  /*
1991  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
1992  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1993  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1994  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1995  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1996  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1997  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1998  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
1999 
2000  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2001  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2002  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2003  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2004  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2005  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2006  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2007  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2008  */
2009  __asm__ volatile (
2010  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2011  "li %[tmp0], 0x07 \n\t"
2012  "mtc1 %[tmp0], %[ftmp4] \n\t"
2013 
2014  "1: \n\t"
2015  // 0 - 7
2016  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2017  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2018  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2019  // 8 - 15
2020  PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2021 
2022  "addiu %[h], %[h], -0x01 \n\t"
2023  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2024  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2025  "bnez %[h], 1b \n\t"
2026  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2027  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2028  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2029  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2030  [ftmp8]"=&f"(ftmp[8]),
2031  [tmp0]"=&r"(tmp[0]),
2032  RESTRICT_ASM_ALL64
2033  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2034  [src1]"=&r"(src1),
2035  [h]"+&r"(h),
2036  [dst]"+&r"(dst), [src]"+&r"(src)
2037  : [ff_pw_64]"f"(ff_pw_64),
2038  [srcstride]"r"((mips_reg)srcstride),
2039  [dststride]"r"((mips_reg)dststride),
2040  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2041  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2042  : "memory"
2043  );
2044 #else
2045  const uint8_t *filter = subpel_filters[my - 1];
2046  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2047  int x, y;
2048 
2049  for (y = 0; y < h; y++) {
2050  for (x = 0; x < 16; x++)
2051  dst[x] = FILTER_4TAP(src, filter, srcstride);
2052  dst += dststride;
2053  src += srcstride;
2054  }
2055 #endif
2056 }
2057 
2058 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2059  ptrdiff_t srcstride, int h, int mx, int my)
2060 {
2061 #if 1
2062  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2063  double ftmp[9];
2064  uint32_t tmp[1];
2065  mips_reg src1;
2066  DECLARE_VAR_ALL64;
2067 
2068  /*
2069  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2070  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2071  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2072  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2073  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2074  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2075  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2076  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2077  */
2078  __asm__ volatile (
2079  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2080  "li %[tmp0], 0x07 \n\t"
2081  "mtc1 %[tmp0], %[ftmp4] \n\t"
2082 
2083  "1: \n\t"
2084  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2085 
2086  "addiu %[h], %[h], -0x01 \n\t"
2087  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2088  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2089  "bnez %[h], 1b \n\t"
2090  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2091  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2092  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2093  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2094  [ftmp8]"=&f"(ftmp[8]),
2095  [tmp0]"=&r"(tmp[0]),
2096  RESTRICT_ASM_ALL64
2097  [src1]"=&r"(src1),
2098  [h]"+&r"(h),
2099  [dst]"+&r"(dst), [src]"+&r"(src)
2100  : [ff_pw_64]"f"(ff_pw_64),
2101  [srcstride]"r"((mips_reg)srcstride),
2102  [dststride]"r"((mips_reg)dststride),
2103  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2104  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2105  : "memory"
2106  );
2107 #else
2108  const uint8_t *filter = subpel_filters[my - 1];
2109  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2110  int x, y;
2111 
2112  for (y = 0; y < h; y++) {
2113  for (x = 0; x < 8; x++)
2114  dst[x] = FILTER_4TAP(src, filter, srcstride);
2115  dst += dststride;
2116  src += srcstride;
2117  }
2118 #endif
2119 }
2120 
2121 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2122  ptrdiff_t srcstride, int h, int mx, int my)
2123 {
2124 #if 1
2125  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2126  double ftmp[6];
2127  uint32_t tmp[1];
2128  mips_reg src1;
2129  DECLARE_VAR_LOW32;
2130 
2131  /*
2132  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2133  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2134  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2135  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2136  */
2137  __asm__ volatile (
2138  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2139  "li %[tmp0], 0x07 \n\t"
2140  "mtc1 %[tmp0], %[ftmp4] \n\t"
2141 
2142  "1: \n\t"
2143  PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2144 
2145  "addiu %[h], %[h], -0x01 \n\t"
2146  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2147  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2148  "bnez %[h], 1b \n\t"
2149  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2150  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2151  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2152  [tmp0]"=&r"(tmp[0]),
2153  RESTRICT_ASM_LOW32
2154  [src1]"=&r"(src1),
2155  [h]"+&r"(h),
2156  [dst]"+&r"(dst), [src]"+&r"(src)
2157  : [ff_pw_64]"f"(ff_pw_64),
2158  [srcstride]"r"((mips_reg)srcstride),
2159  [dststride]"r"((mips_reg)dststride),
2160  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2161  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2162  : "memory"
2163  );
2164 #else
2165  const uint8_t *filter = subpel_filters[my - 1];
2166  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2167  int x, y;
2168 
2169  for (y = 0; y < h; y++) {
2170  for (x = 0; x < 4; x++)
2171  dst[x] = FILTER_4TAP(src, filter, srcstride);
2172  dst += dststride;
2173  src += srcstride;
2174  }
2175 #endif
2176 }
2177 
2178 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2179  ptrdiff_t srcstride, int h, int mx, int my)
2180 {
2181 #if 1
2182  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2183  double ftmp[9];
2184  uint32_t tmp[1];
2185  mips_reg src0, src1, dst0;
2186  DECLARE_VAR_ALL64;
2187 
2188  /*
2189  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2190  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2191  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2192  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2193  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2194  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2195  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2196  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2197 
2198  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2199  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2200  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2201  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2202  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2203  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2204  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2205  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2206  */
2207  __asm__ volatile (
2208  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2209  "li %[tmp0], 0x07 \n\t"
2210  "mtc1 %[tmp0], %[ftmp4] \n\t"
2211 
2212  "1: \n\t"
2213  // 0 - 7
2214  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2215  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2216  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2217  // 8 - 15
2218  PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2219 
2220  "addiu %[h], %[h], -0x01 \n\t"
2221  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2222  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2223  "bnez %[h], 1b \n\t"
2224  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2225  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2226  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2227  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2228  [ftmp8]"=&f"(ftmp[8]),
2229  [tmp0]"=&r"(tmp[0]),
2230  RESTRICT_ASM_ALL64
2231  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2232  [src1]"=&r"(src1),
2233  [h]"+&r"(h),
2234  [dst]"+&r"(dst), [src]"+&r"(src)
2235  : [ff_pw_64]"f"(ff_pw_64),
2236  [srcstride]"r"((mips_reg)srcstride),
2237  [dststride]"r"((mips_reg)dststride),
2238  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2239  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2240  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2241  : "memory"
2242  );
2243 #else
2244  const uint8_t *filter = subpel_filters[my - 1];
2245  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2246  int x, y;
2247 
2248  for (y = 0; y < h; y++) {
2249  for (x = 0; x < 16; x++)
2250  dst[x] = FILTER_6TAP(src, filter, srcstride);
2251  dst += dststride;
2252  src += srcstride;
2253  }
2254 #endif
2255 }
2256 
2257 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2258  ptrdiff_t srcstride, int h, int mx, int my)
2259 {
2260 #if 1
2261  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2262  double ftmp[9];
2263  uint32_t tmp[1];
2264  mips_reg src1;
2265  DECLARE_VAR_ALL64;
2266 
2267  /*
2268  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2269  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2270  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2271  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2272  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2273  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2274  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2275  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2276  */
2277  __asm__ volatile (
2278  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2279  "li %[tmp0], 0x07 \n\t"
2280  "mtc1 %[tmp0], %[ftmp4] \n\t"
2281 
2282  "1: \n\t"
2283  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2284 
2285  "addiu %[h], %[h], -0x01 \n\t"
2286  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2287  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2288  "bnez %[h], 1b \n\t"
2289  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2290  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2291  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2292  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2293  [ftmp8]"=&f"(ftmp[8]),
2294  [tmp0]"=&r"(tmp[0]),
2295  RESTRICT_ASM_ALL64
2296  [src1]"=&r"(src1),
2297  [h]"+&r"(h),
2298  [dst]"+&r"(dst), [src]"+&r"(src)
2299  : [ff_pw_64]"f"(ff_pw_64),
2300  [srcstride]"r"((mips_reg)srcstride),
2301  [dststride]"r"((mips_reg)dststride),
2302  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2303  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2304  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2305  : "memory"
2306  );
2307 #else
2308  const uint8_t *filter = subpel_filters[my - 1];
2309  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2310  int x, y;
2311 
2312  for (y = 0; y < h; y++) {
2313  for (x = 0; x < 8; x++)
2314  dst[x] = FILTER_6TAP(src, filter, srcstride);
2315  dst += dststride;
2316  src += srcstride;
2317  }
2318 #endif
2319 }
2320 
2321 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2322  ptrdiff_t srcstride, int h, int mx, int my)
2323 {
2324 #if 1
2325  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2326  double ftmp[6];
2327  uint32_t tmp[1];
2328  mips_reg src1;
2329  DECLARE_VAR_LOW32;
2330 
2331  /*
2332  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2333  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2334  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2335  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2336  */
2337  __asm__ volatile (
2338  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2339  "li %[tmp0], 0x07 \n\t"
2340  "mtc1 %[tmp0], %[ftmp4] \n\t"
2341 
2342  "1: \n\t"
2343  PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2344 
2345  "addiu %[h], %[h], -0x01 \n\t"
2346  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2347  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2348  "bnez %[h], 1b \n\t"
2349  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2350  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2351  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2352  [tmp0]"=&r"(tmp[0]),
2353  RESTRICT_ASM_LOW32
2354  [src1]"=&r"(src1),
2355  [h]"+&r"(h),
2356  [dst]"+&r"(dst), [src]"+&r"(src)
2357  : [ff_pw_64]"f"(ff_pw_64),
2358  [srcstride]"r"((mips_reg)srcstride),
2359  [dststride]"r"((mips_reg)dststride),
2360  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2361  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2362  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2363  : "memory"
2364  );
2365 #else
2366  const uint8_t *filter = subpel_filters[my - 1];
2367  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2368  int x, y;
2369 
2370  for (y = 0; y < h; y++) {
2371  for (x = 0; x < 4; x++)
2372  dst[x] = FILTER_6TAP(src, filter, srcstride);
2373  dst += dststride;
2374  src += srcstride;
2375  }
2376 #endif
2377 }
2378 
2379 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2380  ptrdiff_t srcstride, int h, int mx, int my)
2381 {
2382 #if 1
2383  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2384  uint8_t *tmp = tmp_array;
2385 
2386  src -= srcstride;
2387  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2388  tmp = tmp_array + 16;
2389  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2390 #else
2391  const uint8_t *filter = subpel_filters[mx - 1];
2392  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2393  int x, y;
2394  uint8_t tmp_array[560];
2395  uint8_t *tmp = tmp_array;
2396 
2397  src -= srcstride;
2398 
2399  for (y = 0; y < h + 3; y++) {
2400  for (x = 0; x < 16; x++)
2401  tmp[x] = FILTER_4TAP(src, filter, 1);
2402  tmp += 16;
2403  src += srcstride;
2404  }
2405 
2406  tmp = tmp_array + 16;
2407  filter = subpel_filters[my - 1];
2408 
2409  for (y = 0; y < h; y++) {
2410  for (x = 0; x < 16; x++)
2411  dst[x] = FILTER_4TAP(tmp, filter, 16);
2412  dst += dststride;
2413  tmp += 16;
2414  }
2415 #endif
2416 }
2417 
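/* Two-pass h4v4 layout: the horizontal pass runs on h + 3 rows starting
 * one row above the block (src -= srcstride), because the 4-tap vertical
 * pass reads one row above and two rows below each output row;
 * tmp_array + 16 then points the vertical pass one row into the scratch
 * buffer so its src[-srcstride] access stays inside it. Sizing: 16
 * columns * (16 + 3) rows = 304 bytes at most, well within the 560-byte
 * array. */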
2418 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2419  ptrdiff_t srcstride, int h, int mx, int my)
2420 {
2421 #if 1
2422  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2423  uint8_t *tmp = tmp_array;
2424 
2425  src -= srcstride;
2426  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2427  tmp = tmp_array + 8;
2428  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2429 #else
2430  const uint8_t *filter = subpel_filters[mx - 1];
2431  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2432  int x, y;
2433  uint8_t tmp_array[152];
2434  uint8_t *tmp = tmp_array;
2435 
2436  src -= srcstride;
2437 
2438  for (y = 0; y < h + 3; y++) {
2439  for (x = 0; x < 8; x++)
2440  tmp[x] = FILTER_4TAP(src, filter, 1);
2441  tmp += 8;
2442  src += srcstride;
2443  }
2444 
2445  tmp = tmp_array + 8;
2446  filter = subpel_filters[my - 1];
2447 
2448  for (y = 0; y < h; y++) {
2449  for (x = 0; x < 8; x++)
2450  dst[x] = FILTER_4TAP(tmp, filter, 8);
2451  dst += dststride;
2452  tmp += 8;
2453  }
2454 #endif
2455 }
2456 
2457 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2458  ptrdiff_t srcstride, int h, int mx, int my)
2459 {
2460 #if 1
2461  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2462  uint8_t *tmp = tmp_array;
2463 
2464  src -= srcstride;
2465  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2466  tmp = tmp_array + 4;
2467  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2468 #else
2469  const uint8_t *filter = subpel_filters[mx - 1];
2470  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2471  int x, y;
2472  uint8_t tmp_array[44];
2473  uint8_t *tmp = tmp_array;
2474 
2475  src -= srcstride;
2476 
2477  for (y = 0; y < h + 3; y++) {
2478  for (x = 0; x < 4; x++)
2479  tmp[x] = FILTER_4TAP(src, filter, 1);
2480  tmp += 4;
2481  src += srcstride;
2482  }
2483  tmp = tmp_array + 4;
2484  filter = subpel_filters[my - 1];
2485 
2486  for (y = 0; y < h; y++) {
2487  for (x = 0; x < 4; x++)
2488  dst[x] = FILTER_4TAP(tmp, filter, 4);
2489  dst += dststride;
2490  tmp += 4;
2491  }
2492 #endif
2493 }
2494 
2495 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2496  ptrdiff_t srcstride, int h, int mx, int my)
2497 {
2498 #if 1
2499  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2500  uint8_t *tmp = tmp_array;
2501 
2502  src -= 2 * srcstride;
2503  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2504  tmp = tmp_array + 32;
2505  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2506 #else
2507  const uint8_t *filter = subpel_filters[mx - 1];
2508  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2509  int x, y;
2510  uint8_t tmp_array[592];
2511  uint8_t *tmp = tmp_array;
2512 
2513  src -= 2 * srcstride;
2514 
2515  for (y = 0; y < h + 5; y++) {
2516  for (x = 0; x < 16; x++)
2517  tmp[x] = FILTER_4TAP(src, filter, 1);
2518  tmp += 16;
2519  src += srcstride;
2520  }
2521 
2522  tmp = tmp_array + 32;
2523  filter = subpel_filters[my - 1];
2524 
2525  for (y = 0; y < h; y++) {
2526  for (x = 0; x < 16; x++)
2527  dst[x] = FILTER_6TAP(tmp, filter, 16);
2528  dst += dststride;
2529  tmp += 16;
2530  }
2531 #endif
2532 }
2533 
2534 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2535  ptrdiff_t srcstride, int h, int mx, int my)
2536 {
2537 #if 1
2538  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2539  uint8_t *tmp = tmp_array;
2540 
2541  src -= 2 * srcstride;
2542  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2543  tmp = tmp_array + 16;
2544  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2545 #else
2546  const uint8_t *filter = subpel_filters[mx - 1];
2547  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2548  int x, y;
2549  uint8_t tmp_array[168];
2550  uint8_t *tmp = tmp_array;
2551 
2552  src -= 2 * srcstride;
2553 
2554  for (y = 0; y < h + 5; y++) {
2555  for (x = 0; x < 8; x++)
2556  tmp[x] = FILTER_4TAP(src, filter, 1);
2557  tmp += 8;
2558  src += srcstride;
2559  }
2560 
2561  tmp = tmp_array + 16;
2562  filter = subpel_filters[my - 1];
2563 
2564  for (y = 0; y < h; y++) {
2565  for (x = 0; x < 8; x++)
2566  dst[x] = FILTER_6TAP(tmp, filter, 8);
2567  dst += dststride;
2568  tmp += 8;
2569  }
2570 #endif
2571 }
2572 
2573 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2574  ptrdiff_t srcstride, int h, int mx, int my)
2575 {
2576 #if 1
2577  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2578  uint8_t *tmp = tmp_array;
2579 
2580  src -= 2 * srcstride;
2581  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2582  tmp = tmp_array + 8;
2583  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2584 #else
2585  const uint8_t *filter = subpel_filters[mx - 1];
2586  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2587  int x, y;
2588  uint8_t tmp_array[52];
2589  uint8_t *tmp = tmp_array;
2590 
2591  src -= 2 * srcstride;
2592 
2593  for (y = 0; y < h + 5; y++) {
2594  for (x = 0; x < 4; x++)
2595  tmp[x] = FILTER_4TAP(src, filter, 1);
2596  tmp += 4;
2597  src += srcstride;
2598  }
2599 
2600  tmp = tmp_array + 8;
2601  filter = subpel_filters[my - 1];
2602 
2603  for (y = 0; y < h; y++) {
2604  for (x = 0; x < 4; x++)
2605  dst[x] = FILTER_6TAP(tmp, filter, 4);
2606  dst += dststride;
2607  tmp += 4;
2608  }
2609 #endif
2610 }
2611 
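/* In the h4v6 variants the vertical 6-tap reaches two rows above and
 * three rows below each output row, so the first pass filters h + 5 rows
 * starting at src - 2*srcstride and the second pass starts two rows into
 * the scratch buffer (tmp_array + 32, + 16 and + 8 for widths 16, 8 and
 * 4 respectively). */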
2612 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2613  ptrdiff_t srcstride, int h, int mx, int my)
2614 {
2615 #if 1
2616  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2617  uint8_t *tmp = tmp_array;
2618 
2619  src -= srcstride;
2620  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2621  tmp = tmp_array + 16;
2622  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2623 #else
2624  const uint8_t *filter = subpel_filters[mx - 1];
2625  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2626  int x, y;
2627  uint8_t tmp_array[560];
2628  uint8_t *tmp = tmp_array;
2629 
2630  src -= srcstride;
2631 
2632  for (y = 0; y < h + 3; y++) {
2633  for (x = 0; x < 16; x++)
2634  tmp[x] = FILTER_6TAP(src, filter, 1);
2635  tmp += 16;
2636  src += srcstride;
2637  }
2638 
2639  tmp = tmp_array + 16;
2640  filter = subpel_filters[my - 1];
2641 
2642  for (y = 0; y < h; y++) {
2643  for (x = 0; x < 16; x++)
2644  dst[x] = FILTER_4TAP(tmp, filter, 16);
2645  dst += dststride;
2646  tmp += 16;
2647  }
2648 #endif
2649 }
2650 
2651 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2652  ptrdiff_t srcstride, int h, int mx, int my)
2653 {
2654 #if 1
2655  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2656  uint8_t *tmp = tmp_array;
2657 
2658  src -= srcstride;
2659  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2660  tmp = tmp_array + 8;
2661  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2662 #else
2663  const uint8_t *filter = subpel_filters[mx - 1];
2664  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2665  int x, y;
2666  uint8_t tmp_array[152];
2667  uint8_t *tmp = tmp_array;
2668 
2669  src -= srcstride;
2670 
2671  for (y = 0; y < h + 3; y++) {
2672  for (x = 0; x < 8; x++)
2673  tmp[x] = FILTER_6TAP(src, filter, 1);
2674  tmp += 8;
2675  src += srcstride;
2676  }
2677 
2678  tmp = tmp_array + 8;
2679  filter = subpel_filters[my - 1];
2680 
2681  for (y = 0; y < h; y++) {
2682  for (x = 0; x < 8; x++)
2683  dst[x] = FILTER_4TAP(tmp, filter, 8);
2684  dst += dststride;
2685  tmp += 8;
2686  }
2687 #endif
2688 }
2689 
2690 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2691  ptrdiff_t srcstride, int h, int mx, int my)
2692 {
2693 #if 1
2694  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2695  uint8_t *tmp = tmp_array;
2696 
2697  src -= srcstride;
2698  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2699  tmp = tmp_array + 4;
2700  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2701 #else
2702  const uint8_t *filter = subpel_filters[mx - 1];
2703  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2704  int x, y;
2705  uint8_t tmp_array[44];
2706  uint8_t *tmp = tmp_array;
2707 
2708  src -= srcstride;
2709 
2710  for (y = 0; y < h + 3; y++) {
2711  for (x = 0; x < 4; x++)
2712  tmp[x] = FILTER_6TAP(src, filter, 1);
2713  tmp += 4;
2714  src += srcstride;
2715  }
2716 
2717  tmp = tmp_array + 4;
2718  filter = subpel_filters[my - 1];
2719 
2720  for (y = 0; y < h; y++) {
2721  for (x = 0; x < 4; x++)
2722  dst[x] = FILTER_4TAP(tmp, filter, 4);
2723  dst += dststride;
2724  tmp += 4;
2725  }
2726 #endif
2727 }
2728 
2729 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2730  ptrdiff_t srcstride, int h, int mx, int my)
2731 {
2732 #if 1
2733  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2734  uint8_t *tmp = tmp_array;
2735 
2736  src -= 2 * srcstride;
2737  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2738  tmp = tmp_array + 32;
2739  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2740 #else
2741  const uint8_t *filter = subpel_filters[mx - 1];
2742  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2743  int x, y;
2744  uint8_t tmp_array[592];
2745  uint8_t *tmp = tmp_array;
2746 
2747  src -= 2 * srcstride;
2748 
2749  for (y = 0; y < h + 5; y++) {
2750  for (x = 0; x < 16; x++)
2751  tmp[x] = FILTER_6TAP(src, filter, 1);
2752  tmp += 16;
2753  src += srcstride;
2754  }
2755 
2756  tmp = tmp_array + 32;
2757  filter = subpel_filters[my - 1];
2758 
2759  for (y = 0; y < h; y++) {
2760  for (x = 0; x < 16; x++)
2761  dst[x] = FILTER_6TAP(tmp, filter, 16);
2762  dst += dststride;
2763  tmp += 16;
2764  }
2765 #endif
2766 }
2767 
2768 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2769  ptrdiff_t srcstride, int h, int mx, int my)
2770 {
2771 #if 1
2772  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2773  uint8_t *tmp = tmp_array;
2774 
2775  src -= 2 * srcstride;
2776  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2777  tmp = tmp_array + 16;
2778  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2779 #else
2780  const uint8_t *filter = subpel_filters[mx - 1];
2781  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2782  int x, y;
2783  uint8_t tmp_array[168];
2784  uint8_t *tmp = tmp_array;
2785 
2786  src -= 2 * srcstride;
2787 
2788  for (y = 0; y < h + 5; y++) {
2789  for (x = 0; x < 8; x++)
2790  tmp[x] = FILTER_6TAP(src, filter, 1);
2791  tmp += 8;
2792  src += srcstride;
2793  }
2794 
2795  tmp = tmp_array + 16;
2796  filter = subpel_filters[my - 1];
2797 
2798  for (y = 0; y < h; y++) {
2799  for (x = 0; x < 8; x++)
2800  dst[x] = FILTER_6TAP(tmp, filter, 8);
2801  dst += dststride;
2802  tmp += 8;
2803  }
2804 #endif
2805 }
2806 
2807 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2808  ptrdiff_t srcstride, int h, int mx, int my)
2809 {
2810 #if 1
2811  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2812  uint8_t *tmp = tmp_array;
2813 
2814  src -= 2 * srcstride;
2815  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2816  tmp = tmp_array + 8;
2817  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2818 #else
2819  const uint8_t *filter = subpel_filters[mx - 1];
2820  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2821  int x, y;
2822  uint8_t tmp_array[52];
2823  uint8_t *tmp = tmp_array;
2824 
2825  src -= 2 * srcstride;
2826 
2827  for (y = 0; y < h + 5; y++) {
2828  for (x = 0; x < 4; x++)
2829  tmp[x] = FILTER_6TAP(src, filter, 1);
2830  tmp += 4;
2831  src += srcstride;
2832  }
2833 
2834  tmp = tmp_array + 8;
2835  filter = subpel_filters[my - 1];
2836 
2837  for (y = 0; y < h; y++) {
2838  for (x = 0; x < 4; x++)
2839  dst[x] = FILTER_6TAP(tmp, filter, 4);
2840  dst += dststride;
2841  tmp += 4;
2842  }
2843 #endif
2844 }
2845 
2846 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2847  ptrdiff_t sstride, int h, int mx, int my)
2848 {
2849 #if 1
2850  int a = 8 - mx, b = mx;
2851  double ftmp[7];
2852  uint32_t tmp[1];
2853  mips_reg dst0, src0;
2854  DECLARE_VAR_ALL64;
2855 
2856  /*
2857  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2858  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2859  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2860  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2861  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2862  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2863  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2864  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2865 
2866  dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2867  dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2868  dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2869  dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2870  dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2871  dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2872  dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2873  dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2874  */
2875  __asm__ volatile (
2876  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2877  "li %[tmp0], 0x03 \n\t"
2878  "mtc1 %[tmp0], %[ftmp4] \n\t"
2879  "pshufh %[a], %[a], %[ftmp0] \n\t"
2880  "pshufh %[b], %[b], %[ftmp0] \n\t"
2881 
2882  "1: \n\t"
2883  // 0 - 7
2884  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2885  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2886  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2887  // 8 - 15
2888  PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2889 
2890  "addiu %[h], %[h], -0x01 \n\t"
2891  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2892  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2893  "bnez %[h], 1b \n\t"
2894  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2895  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2896  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2897  [ftmp6]"=&f"(ftmp[6]),
2898  [tmp0]"=&r"(tmp[0]),
2899  RESTRICT_ASM_ALL64
2900  [dst0]"=&r"(dst0), [src0]"=&r"(src0),
2901  [h]"+&r"(h),
2902  [dst]"+&r"(dst), [src]"+&r"(src),
2903  [a]"+&f"(a), [b]"+&f"(b)
2904  : [sstride]"r"((mips_reg)sstride),
2905  [dstride]"r"((mips_reg)dstride),
2906  [ff_pw_4]"f"(ff_pw_4)
2907  : "memory"
2908  );
2909 #else
2910  int a = 8 - mx, b = mx;
2911  int x, y;
2912 
2913  for (y = 0; y < h; y++) {
2914  for (x = 0; x < 16; x++)
2915  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
2916  dst += dstride;
2917  src += sstride;
2918  }
2919 #endif
2920 }
2921 
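/* Bilinear weighting: a = 8 - mx and b = mx, so a + b = 8 and the +4 bias
 * with >>3 rounds to nearest, e.g. mx == 2 gives
 * (6*src[x] + 2*src[x+1] + 4) >> 3. The two pshufh instructions broadcast
 * a and b into all four halfword lanes so the packed multiplies inside
 * PUT_VP8_BILINEAR8_H_MMI (defined earlier in this file) can weight four
 * pixels per operation. */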
2922 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2923  ptrdiff_t sstride, int h, int mx, int my)
2924 {
2925 #if 1
2926  int c = 8 - my, d = my;
2927  double ftmp[7];
2928  uint32_t tmp[1];
2929  mips_reg src0, src1, dst0;
2930  DECLARE_VAR_ALL64;
2931 
2932  /*
2933  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
2934  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
2935  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
2936  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
2937  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
2938  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
2939  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
2940  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
2941  */
2942  __asm__ volatile (
2943  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2944  "li %[tmp0], 0x03 \n\t"
2945  "mtc1 %[tmp0], %[ftmp4] \n\t"
2946  "pshufh %[c], %[c], %[ftmp0] \n\t"
2947  "pshufh %[d], %[d], %[ftmp0] \n\t"
2948 
2949  "1: \n\t"
2950  // 0 - 7
2951  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
2952  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2953  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2954  // 8 - 15
2955  PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
2956 
2957  "addiu %[h], %[h], -0x01 \n\t"
2958  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2959  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2960  "bnez %[h], 1b \n\t"
2961  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2962  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2963  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2964  [ftmp6]"=&f"(ftmp[6]),
2965  [tmp0]"=&r"(tmp[0]),
2966  RESTRICT_ASM_ALL64
2967  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2968  [src1]"=&r"(src1),
2969  [h]"+&r"(h),
2970  [dst]"+&r"(dst), [src]"+&r"(src),
2971  [c]"+&f"(c), [d]"+&f"(d)
2972  : [sstride]"r"((mips_reg)sstride),
2973  [dstride]"r"((mips_reg)dstride),
2974  [ff_pw_4]"f"(ff_pw_4)
2975  : "memory"
2976  );
2977 #else
2978  int c = 8 - my, d = my;
2979  int x, y;
2980 
2981  for (y = 0; y < h; y++) {
2982  for (x = 0; x < 16; x++)
2983  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
2984  dst += dstride;
2985  src += sstride;
2986  }
2987 #endif
2988 }
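
The vertical variant differs from the horizontal one only in where the second tap comes from: PUT_VP8_BILINEAR8_V_MMI uses the [src1] scratch register for the address one row below, so each macro invocation blends the same eight columns of two adjacent rows. A hedged scalar model of one such step (helper name hypothetical; the real work happens in the macro defined earlier in this file):

#include <stdint.h>
#include <stddef.h>

/* Hypothetical model of one 8-pixel vertical step: tap 0 is row y,
 * tap 1 is row y + 1 (src + sstride, the address kept in src1). */
static void bilinear8_v_step(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t sstride, int c, int d)
{
    for (int x = 0; x < 8; x++)
        dst[x] = (uint8_t)((c * src[x] + d * src[x + sstride] + 4) >> 3);
}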
2989 
2990 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2991  ptrdiff_t sstride, int h, int mx, int my)
2992 {
2993 #if 1
2994  DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
2995  uint8_t *tmp = tmp_array;
2996 
2997  ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
2998  ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
2999 #else
3000  int a = 8 - mx, b = mx;
3001  int c = 8 - my, d = my;
3002  int x, y;
3003  uint8_t tmp_array[528];
3004  uint8_t *tmp = tmp_array;
3005 
3006  for (y = 0; y < h + 1; y++) {
3007  for (x = 0; x < 16; x++)
3008  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3009  tmp += 16;
3010  src += sstride;
3011  }
3012 
3013  tmp = tmp_array;
3014 
3015  for (y = 0; y < h; y++) {
3016  for (x = 0; x < 16; x++)
3017  dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
3018  dst += dstride;
3019  tmp += 16;
3020  }
3021 #endif
3022 }
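
The hv wrapper chains the two passes through a 16-byte-stride scratch buffer, filtering h + 1 rows horizontally because the vertical pass combines scratch rows y and y + 1. The pass order is not merely an optimization detail: each pass rounds its intermediate result, so a fused 2-D weighting would not be bit-exact. A small self-contained demonstration with hand-picked half-pel values (mx = my = 4):

#include <stdio.h>

int main(void)
{
    int a = 4, b = 4, c = 4, d = 4;          /* mx = my = 4 */
    int s00 = 0, s01 = 0, s10 = 0, s11 = 1;  /* 2x2 source patch */

    int tmp0 = (a * s00 + b * s01 + 4) >> 3;        /* h pass, row 0: 0 */
    int tmp1 = (a * s10 + b * s11 + 4) >> 3;        /* h pass, row 1: 1 */
    int two_pass = (c * tmp0 + d * tmp1 + 4) >> 3;  /* v pass: 1 */
    int fused = (a * c * s00 + b * c * s01 +
                 a * d * s10 + b * d * s11 + 32) >> 6;  /* 48 >> 6 == 0 */

    printf("two-pass=%d fused=%d\n", two_pass, fused);  /* 1 vs 0 */
    return 0;
}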
3023 
3024 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3025  ptrdiff_t sstride, int h, int mx, int my)
3026 {
3027 #if 1
3028  int a = 8 - mx, b = mx;
3029  double ftmp[7];
3030  uint32_t tmp[1];
3031  DECLARE_VAR_ALL64;
3032 
3033  /*
3034  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3035  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3036  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3037  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3038  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
3039  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
3040  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
3041  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
3042  */
3043  __asm__ volatile (
3044  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3045  "li %[tmp0], 0x03 \n\t"
3046  "mtc1 %[tmp0], %[ftmp4] \n\t"
3047  "pshufh %[a], %[a], %[ftmp0] \n\t"
3048  "pshufh %[b], %[b], %[ftmp0] \n\t"
3049 
3050  "1: \n\t"
3051  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3052 
3053  "addiu %[h], %[h], -0x01 \n\t"
3054  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3055  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3056  "bnez %[h], 1b \n\t"
3057  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3058  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3059  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3060  [ftmp6]"=&f"(ftmp[6]),
3061  [tmp0]"=&r"(tmp[0]),
3062  RESTRICT_ASM_ALL64
3063  [h]"+&r"(h),
3064  [dst]"+&r"(dst), [src]"+&r"(src),
3065  [a]"+&f"(a), [b]"+&f"(b)
3066  : [sstride]"r"((mips_reg)sstride),
3067  [dstride]"r"((mips_reg)dstride),
3068  [ff_pw_4]"f"(ff_pw_4)
3069  : "memory"
3070  );
3071 #else
3072  int a = 8 - mx, b = mx;
3073  int x, y;
3074 
3075  for (y = 0; y < h; y++) {
3076  for (x = 0; x < 8; x++)
3077  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3078  dst += dstride;
3079  src += sstride;
3080  }
3081 #endif
3082 }
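
A note on the shared prologue visible in each of these functions: ftmp0 is zeroed and then used as the pshufh selector, which replicates the low 16-bit lane of the weight register into all four lanes, so one packed multiply applies the same weight to four widened pixels. A hedged C model of that broadcast, assuming pshufh behaves like the MMX-style halfword shuffle with an all-zero selector:

#include <stdint.h>

/* Model of "pshufh %[a], %[a], %[ftmp0]" with ftmp0 == 0: every 2-bit
 * selector field is 0, so lane 0 is copied to all four output lanes. */
static uint64_t pshufh_zero_selector(uint64_t weight)
{
    uint64_t lane0 = weight & 0xffff;
    return lane0 | lane0 << 16 | lane0 << 32 | lane0 << 48;
}

/* pshufh_zero_selector(0x0003) == 0x0003000300030003 */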
3083 
3084 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3085  ptrdiff_t sstride, int h, int mx, int my)
3086 {
3087 #if 1
3088  int c = 8 - my, d = my;
3089  double ftmp[7];
3090  uint32_t tmp[1];
3091  mips_reg src1;
3092  DECLARE_VAR_ALL64;
3093 
3094  /*
3095  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3096  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3097  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3098  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3099  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3100  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3101  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3102  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3103  */
3104  __asm__ volatile (
3105  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3106  "li %[tmp0], 0x03 \n\t"
3107  "mtc1 %[tmp0], %[ftmp4] \n\t"
3108  "pshufh %[c], %[c], %[ftmp0] \n\t"
3109  "pshufh %[d], %[d], %[ftmp0] \n\t"
3110 
3111  "1: \n\t"
3112  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3113 
3114  "addiu %[h], %[h], -0x01 \n\t"
3115  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3116  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3117  "bnez %[h], 1b \n\t"
3118  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3119  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3120  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3121  [ftmp6]"=&f"(ftmp[6]),
3122  [tmp0]"=&r"(tmp[0]),
3123  RESTRICT_ASM_ALL64
3124  [src1]"=&r"(src1),
3125  [h]"+&r"(h),
3126  [dst]"+&r"(dst), [src]"+&r"(src),
3127  [c]"+&f"(c), [d]"+&f"(d)
3128  : [sstride]"r"((mips_reg)sstride),
3129  [dstride]"r"((mips_reg)dstride),
3130  [ff_pw_4]"f"(ff_pw_4)
3131  : "memory"
3132  );
3133 #else
3134  int c = 8 - my, d = my;
3135  int x, y;
3136 
3137  for (y = 0; y < h; y++) {
3138  for (x = 0; x < 8; x++)
3139  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3140  dst += dstride;
3141  src += sstride;
3142  }
3143 #endif
3144 }
3145 
3146 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3147  ptrdiff_t sstride, int h, int mx, int my)
3148 {
3149 #if 1
3150  DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
3151  uint8_t *tmp = tmp_array;
3152 
3153  ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
3154  ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
3155 #else
3156  int a = 8 - mx, b = mx;
3157  int c = 8 - my, d = my;
3158  int x, y;
3159  uint8_t tmp_array[136];
3160  uint8_t *tmp = tmp_array;
3161 
3162  for (y = 0; y < h + 1; y++) {
3163  for (x = 0; x < 8; x++)
3164  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3165  tmp += 8;
3166  src += sstride;
3167  }
3168 
3169  tmp = tmp_array;
3170 
3171  for (y = 0; y < h; y++) {
3172  for (x = 0; x < 8; x++)
3173  dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
3174  dst += dstride;
3175  tmp += 8;
3176  }
3177 #endif
3178 }
3179 
3180 void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3181  ptrdiff_t sstride, int h, int mx, int my)
3182 {
3183 #if 1
3184  int a = 8 - mx, b = mx;
3185  double ftmp[5];
3186  uint32_t tmp[1];
3187  DECLARE_VAR_LOW32;
3188  DECLARE_VAR_ALL64;
3189 
3190  /*
3191  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3192  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3193  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3194  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3195  */
3196  __asm__ volatile (
3197  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3198  "li %[tmp0], 0x03 \n\t"
3199  "mtc1 %[tmp0], %[ftmp4] \n\t"
3200  "pshufh %[a], %[a], %[ftmp0] \n\t"
3201  "pshufh %[b], %[b], %[ftmp0] \n\t"
3202 
3203  "1: \n\t"
3204  PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
3205 
3206  "addiu %[h], %[h], -0x01 \n\t"
3207  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3208  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3209  "bnez %[h], 1b \n\t"
3210  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3211  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3212  [ftmp4]"=&f"(ftmp[4]),
3213  [tmp0]"=&r"(tmp[0]),
3214  RESTRICT_ASM_LOW32
3215  RESTRICT_ASM_ALL64
3216  [h]"+&r"(h),
3217  [dst]"+&r"(dst), [src]"+&r"(src),
3218  [a]"+&f"(a), [b]"+&f"(b)
3219  : [sstride]"r"((mips_reg)sstride),
3220  [dstride]"r"((mips_reg)dstride),
3221  [ff_pw_4]"f"(ff_pw_4)
3222  : "memory"
3223  );
3224 #else
3225  int a = 8 - mx, b = mx;
3226  int x, y;
3227 
3228  for (y = 0; y < h; y++) {
3229  for (x = 0; x < 4; x++)
3230  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3231  dst += dstride;
3232  src += sstride;
3233  }
3234 #endif
3235 }
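
Since the 4-, 8- and 16-wide variants share the same arithmetic, the natural way to validate any of them is to diff the MMI output against the scalar #else branch. A sketch of such a cross-check for the 4-wide horizontal filter, assuming a Loongson build where the MMI symbol is available (bilinear4_h_ref is a local copy of the fallback, not FFmpeg API):

#include <stdint.h>
#include <stddef.h>

static void bilinear4_h_ref(uint8_t *dst, ptrdiff_t dstride,
                            const uint8_t *src, ptrdiff_t sstride,
                            int h, int mx)
{
    int a = 8 - mx, b = mx;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 4; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
}

/* On a Loongson host one would fill src with pseudo-random bytes, call
 * ff_put_vp8_bilinear4_h_mmi() and bilinear4_h_ref() with identical
 * arguments, and assert the two dst buffers compare equal. */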
3236 
3237 void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3238  ptrdiff_t sstride, int h, int mx, int my)
3239 {
3240 #if 1
3241  int c = 8 - my, d = my;
3242  double ftmp[5];
3243  uint32_t tmp[1];
3244  mips_reg src1;
3245  DECLARE_VAR_LOW32;
3246  DECLARE_VAR_ALL64;
3247 
3248  /*
3249  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3250  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3251  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3252  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3253  */
3254  __asm__ volatile (
3255  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3256  "li %[tmp0], 0x03 \n\t"
3257  "mtc1 %[tmp0], %[ftmp4] \n\t"
3258  "pshufh %[c], %[c], %[ftmp0] \n\t"
3259  "pshufh %[d], %[d], %[ftmp0] \n\t"
3260 
3261  "1: \n\t"
3262  PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
3263 
3264  "addiu %[h], %[h], -0x01 \n\t"
3265  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3266  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3267  "bnez %[h], 1b \n\t"
3268  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3269  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3270  [ftmp4]"=&f"(ftmp[4]),
3271  [tmp0]"=&r"(tmp[0]),
3272  RESTRICT_ASM_LOW32
3273  RESTRICT_ASM_ALL64
3274  [src1]"=&r"(src1),
3275  [h]"+&r"(h),
3276  [dst]"+&r"(dst), [src]"+&r"(src),
3277  [c]"+&f"(c), [d]"+&f"(d)
3278  : [sstride]"r"((mips_reg)sstride),
3279  [dstride]"r"((mips_reg)dstride),
3280  [ff_pw_4]"f"(ff_pw_4)
3281  : "memory"
3282  );
3283 #else
3284  int c = 8 - my, d = my;
3285  int x, y;
3286 
3287  for (y = 0; y < h; y++) {
3288  for (x = 0; x < 4; x++)
3289  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3290  dst += dstride;
3291  src += sstride;
3292  }
3293 #endif
3294 }
3295 
3296 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3297  ptrdiff_t sstride, int h, int mx, int my)
3298 {
3299 #if 1
3300  DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3301  uint8_t *tmp = tmp_array;
3302 
3303  ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3304  ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3305 #else
3306  int a = 8 - mx, b = mx;
3307  int c = 8 - my, d = my;
3308  int x, y;
3309  uint8_t tmp_array[36];
3310  uint8_t *tmp = tmp_array;
3311 
3312  for (y = 0; y < h + 1; y++) {
3313  for (x = 0; x < 4; x++)
3314  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3315  tmp += 4;
3316  src += sstride;
3317  }
3318 
3319  tmp = tmp_array;
3320 
3321  for (y = 0; y < h; y++) {
3322  for (x = 0; x < 4; x++)
3323  dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
3324  dst += dstride;
3325  tmp += 4;
3326  }
3327 #endif
3328 }
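
Finally, a back-of-the-envelope check of the three scratch buffers declared above: each hv wrapper writes h + 1 rows at its fixed scratch stride, so the declared sizes bound the largest h each variant can accept (the 16-wide buffer has slack; the 8- and 4-wide ones are exact fits):

#include <assert.h>

static void check_bilinear_scratch_sizes(void)
{
    assert((16 + 1) * 16 <= 528); /* bilinear16_hv: 272 needed, 528 declared */
    assert((16 + 1) *  8 <= 136); /* bilinear8_hv: exact fit at h = 16 */
    assert(( 8 + 1) *  4 <=  36); /* bilinear4_hv: exact fit at h = 8 */
}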