/*
 * Loongson SIMD optimized vp8dsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp8dsp_mips.h"
#include "constants.h"
#include "libavutil/attributes.h"
#include "libavutil/mips/mmiutils.h"
#include "libavutil/mem_internal.h"

#define DECLARE_DOUBLE_1 double db_1
#define DECLARE_DOUBLE_2 double db_2
#define DECLARE_UINT32_T uint32_t it_1
#define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
#define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
#define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)
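
/* Note: the DECLARE_ and RESTRICT_ASM_ macro pairs above follow the usual
 * mmiutils.h pattern: DECLARE_DOUBLE_1 declares a C-side scratch variable
 * (db_1) and RESTRICT_ASM_DOUBLE_1 supplies the matching "=&f" output
 * constraint, so the compiler reserves that FPU register for asm blocks
 * that reference %[db_1]. */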

#define MMI_PCMPGTUB(dst, src1, src2) \
    "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
    "pmaxub %[db_2], "#src1", "#src2" \n\t" \
    "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
    "pxor "#dst", %[db_2], %[db_1] \n\t"

#define MMI_BTOH(dst_l, dst_r, src) \
    "pxor %[db_1], %[db_1], %[db_1] \n\t" \
    "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
    "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
    "punpckhbh "#dst_l", "#src", %[db_2] \n\t"

#define MMI_VP8_LOOP_FILTER \
    /* Calculation of hev */ \
    "dmtc1 %[thresh], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
    "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
    "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
    MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
    /* Calculation of mask */ \
    "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
    "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
    "li %[tmp0], 0x09 \n\t" \
    "dmtc1 %[tmp0], %[ftmp3] \n\t" \
    PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
    "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "dmtc1 %[e], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
    "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
    "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
    "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
    "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
    "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
    "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
    "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
    "dmtc1 %[i], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
    "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "pxor %[mask], %[mask], %[ftmp3] \n\t" \
    /* VP8_MBFILTER */ \
    "li %[tmp0], 0x80808080 \n\t" \
    "dmtc1 %[tmp0], %[ftmp7] \n\t" \
    "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
    "pxor %[p2], %[p2], %[ftmp7] \n\t" \
    "pxor %[p1], %[p1], %[ftmp7] \n\t" \
    "pxor %[p0], %[p0], %[ftmp7] \n\t" \
    "pxor %[q0], %[q0], %[ftmp7] \n\t" \
    "pxor %[q1], %[q1], %[ftmp7] \n\t" \
    "pxor %[q2], %[q2], %[ftmp7] \n\t" \
    "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
    "psubb %[ftmp5], %[q0], %[p0] \n\t" \
    MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
    MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
    /* Right part */ \
    "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
    "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
    "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
    /* Left part */ \
    "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
    "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
    "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
    "pand %[ftmp1], %[ftmp1], %[mask] \n\t" \
    "pand %[ftmp2], %[ftmp1], %[hev] \n\t" \
    "li %[tmp0], 0x04040404 \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
    "li %[tmp0], 0x0B \n\t" \
    "dmtc1 %[tmp0], %[ftmp4] \n\t" \
    PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
    "li %[tmp0], 0x03030303 \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
    "li %[tmp0], 0x0B \n\t" \
    "dmtc1 %[tmp0], %[ftmp2] \n\t" \
    PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
    "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
    "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
    /* filt_val &= ~hev */ \
    "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "pxor %[hev], %[hev], %[ftmp0] \n\t" \
    "pand %[ftmp1], %[ftmp1], %[hev] \n\t" \
    MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
    "li %[tmp0], 0x07 \n\t" \
    "dmtc1 %[tmp0], %[ftmp2] \n\t" \
    "li %[tmp0], 0x001b001b \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    "li %[tmp0], 0x003f003f \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    /* Right part */ \
    "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
    "pxor %[q0], %[q0], %[ftmp7] \n\t" \
    "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
    "pxor %[p0], %[p0], %[ftmp7] \n\t" \
    "li %[tmp0], 0x00120012 \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    /* Right part */ \
    "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
    "pxor %[q1], %[q1], %[ftmp7] \n\t" \
    "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
    "pxor %[p1], %[p1], %[ftmp7] \n\t" \
    "li %[tmp0], 0x03 \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    /* Right part */ \
    "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
    "pxor %[q2], %[q2], %[ftmp7] \n\t" \
    "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
    "pxor %[p2], %[p2], %[ftmp7] \n\t"

#define PUT_VP8_EPEL4_H6_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_H4_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_H6_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_H4_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)
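
/* In FFmpeg's VP8 code the a/b (horizontal) and c/d (vertical) operands of
 * the bilinear macros are the two-tap weights (8 - mx, mx) and (8 - my, my),
 * replicated across the four int16 lanes by the callers (not shown in this
 * excerpt); ff_pw_4 and the psrah by 3 (%[ftmp4]) implement the spec's
 * (... + 4) >> 3 rounding. */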

DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
    {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
     0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},

    {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
     0x0024002400240024, 0x0008000800080008, 0x0001000100010001},

    {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
     0x0032003200320032, 0x0006000600060006, 0x0000000000000000},

    {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
     0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},

    {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
     0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},

    {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
     0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},

    {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
     0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
};
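
#if 0
/* Each uint64_t above is one filter tap replicated into four int16 lanes
 * for pmullh, e.g. row 0 packs the taps {0, 6, 123, 12, 1, 0} of
 * subpel_filters[0] below, with 0x007b007b007b007b being four copies of
 * 123. Sketch of the packing (illustrative only, not compiled): */
static uint64_t replicate_tap16(uint16_t tap)
{
    uint64_t t = tap;
    return t | (t << 16) | (t << 32) | (t << 48);
}
#endif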

#if 0
#define FILTER_6TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]

#define FILTER_4TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]

static const uint8_t subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};

#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
#define MUL_35468(a) (((a) * 35468) >> 16)
#endif
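
/* For reference: 20091/65536 is sqrt(2) * cos(pi / 8) - 1 and 35468/65536
 * is sqrt(2) * sin(pi / 8), the two VP8 IDCT rotation constants in 16-bit
 * fixed point. The MMI code below evaluates MUL_35468(x) as
 * pmulhh(x << 2, 0x22a3), since 4 * 0x22a3 = 4 * 8867 = 35468, and
 * MUL_20091(x) as pmulhh(x, 0x4e7b) + x, since 0x4e7b = 20091. */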

#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
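
/* ff_crop_tab is a clamping lookup table centered at MAX_NEG_CROP, so
 * cm[x] clamps x to [0, 255]; cm[(n) + 0x80] - 0x80 therefore clips n to
 * the int8_t range [-128, 127] without a branch, e.g. clip_int8(200)
 * yields 127. */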
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a += clip_int8(p1 - q1);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
}

static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
    a = (f1 + 1) >> 1;
    p[-2 * stride] = cm[p1 + a];
    p[ 1 * stride] = cm[q1 - a];
}

static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
        int flim)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
}

static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
}

static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
{
    int a0, a1, a2, w;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];

    w = clip_int8(p1 - q1);
    w = clip_int8(w + 3 * (q0 - p0));

    a0 = (27 * w + 63) >> 7;
    a1 = (18 * w + 63) >> 7;
    a2 = ( 9 * w + 63) >> 7;

    p[-3 * stride] = cm[p2 + a2];
    p[-2 * stride] = cm[p1 + a1];
    p[-1 * stride] = cm[p0 + a0];
    p[ 0 * stride] = cm[q0 - a0];
    p[ 1 * stride] = cm[q1 - a1];
    p[ 2 * stride] = cm[q2 - a2];
}

static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
        int E, int I)
{
    int av_unused p3 = p[-4 * stride];
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    int av_unused q3 = p[ 3 * stride];

    return vp8_simple_limit(p, stride, E) &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
}
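
/* vp8_normal_limit() is the scalar form of the "normal" filter gate:
 * flim_E bounds the edge difference via vp8_simple_limit() and flim_I
 * bounds each interior pixel step, matching the E (edge) and I (interior)
 * limits of the VP8 loop filter. */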

static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;

    __asm__ volatile(
        /* Get data from dst */
        "gsldlc1 %[q0], 0x07(%[dst]) \n\t"
        "gsldrc1 %[q0], 0x00(%[dst]) \n\t"
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        "gsldlc1 %[p0], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p0], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p1], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p1], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p2], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p2], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p3], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p3], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        "gsldlc1 %[q1], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[q1], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q2], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[q2], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q3], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[q3], 0x00(%[tmp0]) \n\t"
        MMI_VP8_LOOP_FILTER
        /* Move to dst */
        "gssdlc1 %[q0], 0x07(%[dst]) \n\t"
        "gssdrc1 %[q0], 0x00(%[dst]) \n\t"
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        "gssdlc1 %[p0], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[p0], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gssdlc1 %[p1], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[p1], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gssdlc1 %[p2], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[p2], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        "gssdlc1 %[q1], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[q1], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gssdlc1 %[q2], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[q2], 0x00(%[tmp0]) \n\t"
        : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;

    __asm__ volatile(
        /* Get data from dst */
        "gsldlc1 %[p3], 0x03(%[dst]) \n\t"
        "gsldrc1 %[p3], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        "gsldlc1 %[p2], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[p2], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p1], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[p1], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p0], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[p0], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q0], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q0], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q1], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q1], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q2], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q2], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t"
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        MMI_VP8_LOOP_FILTER
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        /* Move to dst */
        "gssdlc1 %[p3], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p3], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[p2], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p2], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[p1], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p1], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[p0], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p0], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q0], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q0], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q1], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q1], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q2], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q2], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q3], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q3], -0x04(%[dst]) \n\t"
        : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
{
#if 1
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        MMI_LDC1(%[ftmp0], %[dc], 0x00)
        MMI_LDC1(%[ftmp1], %[dc], 0x08)
        MMI_LDC1(%[ftmp2], %[dc], 0x10)
        MMI_LDC1(%[ftmp3], %[dc], 0x18)
        "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
        "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
        "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
        "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
        "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
        "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp1], %[dc], 0x08)
        MMI_SDC1(%[ftmp2], %[dc], 0x10)
        MMI_SDC1(%[ftmp3], %[dc], 0x18)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          RESTRICT_ASM_ALL64
          [ftmp7]"=&f"(ftmp[7])
        : [dc]"r"((uint8_t *)dc)
        : "memory"
    );

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp0], %[dc], 0x08)
        MMI_SDC1(%[ftmp0], %[dc], 0x10)
        MMI_SDC1(%[ftmp0], %[dc], 0x18)
        : RESTRICT_ASM_ALL64
          [ftmp0]"=&f"(ftmp[0])
        : [dc]"r"((uint8_t *)dc)
        : "memory"
    );
#else
    int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;

    t00 = dc[0] + dc[12];
    t10 = dc[1] + dc[13];
    t20 = dc[2] + dc[14];
    t30 = dc[3] + dc[15];

    t03 = dc[0] - dc[12];
    t13 = dc[1] - dc[13];
    t23 = dc[2] - dc[14];
    t33 = dc[3] - dc[15];

    t01 = dc[4] + dc[ 8];
    t11 = dc[5] + dc[ 9];
    t21 = dc[6] + dc[10];
    t31 = dc[7] + dc[11];

    t02 = dc[4] - dc[ 8];
    t12 = dc[5] - dc[ 9];
    t22 = dc[6] - dc[10];
    t32 = dc[7] - dc[11];

    dc[ 0] = t00 + t01;
    dc[ 1] = t10 + t11;
    dc[ 2] = t20 + t21;
    dc[ 3] = t30 + t31;

    dc[ 4] = t03 + t02;
    dc[ 5] = t13 + t12;
    dc[ 6] = t23 + t22;
    dc[ 7] = t33 + t32;

    dc[ 8] = t00 - t01;
    dc[ 9] = t10 - t11;
    dc[10] = t20 - t21;
    dc[11] = t30 - t31;

    dc[12] = t03 - t02;
    dc[13] = t13 - t12;
    dc[14] = t23 - t22;
    dc[15] = t33 - t32;

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    AV_ZERO64(dc + 0);
    AV_ZERO64(dc + 4);
    AV_ZERO64(dc + 8);
    AV_ZERO64(dc + 12);
#endif
}

void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
{
    int val = (dc[0] + 3) >> 3;

    dc[0] = 0;

    block[0][0][0] = val;
    block[0][1][0] = val;
    block[0][2][0] = val;
    block[0][3][0] = val;
    block[1][0][0] = val;
    block[1][1][0] = val;
    block[1][2][0] = val;
    block[1][3][0] = val;
    block[2][0][0] = val;
    block[2][1][0] = val;
    block[2][2][0] = val;
    block[2][3][0] = val;
    block[3][0][0] = val;
    block[3][1][0] = val;
    block[3][2][0] = val;
    block[3][3][0] = val;
}
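
/* With only dc[0] nonzero, every surviving term of the 4x4 inverse
 * Walsh-Hadamard transform above equals dc[0], so the whole transform
 * collapses to one rounded shift: e.g. dc[0] == 21 gives
 * val = (21 + 3) >> 3 = 3 in all sixteen DC positions. */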

void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    double ftmp[12];
    uint32_t tmp[1];
    union av_intfloat64 ff_ph_4e7b_u;
    union av_intfloat64 ff_ph_22a3_u;
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;
    ff_ph_4e7b_u.i = 0x4e7b4e7b4e7b4e7bULL;
    ff_ph_22a3_u.i = 0x22a322a322a322a3ULL;

    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp2], %[block], 0x08)
        MMI_LDC1(%[ftmp3], %[block], 0x10)
        MMI_LDC1(%[ftmp4], %[block], 0x18)

        "li %[tmp0], 0x02 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"

        // block[0...3] + block[8...11]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // block[0...3] - block[8...11]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // MUL_35468(block[12...15])
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_35468(block[4...7])
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_20091(block[4...7])
        "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
        // MUL_20091(block[12...15])
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"

        // tmp[0 4 8 12]
        "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
        // tmp[1 5 9 13]
        "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
        // tmp[2 6 10 14]
        "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
        // tmp[3 7 11 15]
        "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
        "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp0], %[block], 0x08)
        MMI_SDC1(%[ftmp0], %[block], 0x10)
        MMI_SDC1(%[ftmp0], %[block], 0x18)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        // t[0 4 8 12]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // t[1 5 9 13]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // t[2 6 10 14]
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
        // t[3 7 11 15]
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
        "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"
        "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
        "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
        "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
        "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
        "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
        "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
        "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        MMI_LWC1(%[ftmp5], %[dst0], 0x00)
        MMI_LWC1(%[ftmp6], %[dst1], 0x00)
        MMI_LWC1(%[ftmp7], %[dst2], 0x00)
        MMI_LWC1(%[ftmp8], %[dst3], 0x00)

        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [tmp0]"=&r"(tmp[0])
        : [dst0]"r"(dst), [dst1]"r"(dst + stride),
          [dst2]"r"(dst + 2 * stride), [dst3]"r"(dst + 3 * stride),
          [block]"r"(block), [ff_pw_4]"f"(ff_pw_4.f),
          [ff_ph_4e7b]"f"(ff_ph_4e7b_u.f), [ff_ph_22a3]"f"(ff_ph_22a3_u.f)
        : "memory"
    );
#else
    int i, t0, t1, t2, t3;
    int16_t tmp[16];

    for (i = 0; i < 4; i++) {
        t0 = block[0 + i] + block[8 + i];
        t1 = block[0 + i] - block[8 + i];
        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
        block[ 0 + i] = 0;
        block[ 4 + i] = 0;
        block[ 8 + i] = 0;
        block[12 + i] = 0;

        tmp[i * 4 + 0] = t0 + t3;
        tmp[i * 4 + 1] = t1 + t2;
        tmp[i * 4 + 2] = t1 - t2;
        tmp[i * 4 + 3] = t0 - t3;
    }

    for (i = 0; i < 4; i++) {
        t0 = tmp[0 + i] + tmp[8 + i];
        t1 = tmp[0 + i] - tmp[8 + i];
        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);

        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    int dc = (block[0] + 4) >> 3;
    double ftmp[6];
    DECLARE_VAR_LOW32;

    block[0] = 0;

    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "mtc1 %[dc], %[ftmp5] \n\t"
        MMI_LWC1(%[ftmp1], %[dst0], 0x00)
        MMI_LWC1(%[ftmp2], %[dst1], 0x00)
        MMI_LWC1(%[ftmp3], %[dst2], 0x00)
        MMI_LWC1(%[ftmp4], %[dst3], 0x00)
        "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_LOW32
          [ftmp5]"=&f"(ftmp[5])
        : [dst0]"r"(dst), [dst1]"r"(dst + stride),
          [dst2]"r"(dst + 2 * stride), [dst3]"r"(dst + 3 * stride),
          [dc]"r"(dc)
        : "memory"
    );
#else
    int i, dc = (block[0] + 4) >> 3;

    block[0] = 0;

    for (i = 0; i < 4; i++) {
        dst[0] = av_clip_uint8(dst[0] + dc);
        dst[1] = av_clip_uint8(dst[1] + dc);
        dst[2] = av_clip_uint8(dst[2] + dc);
        dst[3] = av_clip_uint8(dst[3] + dc);
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
}

void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
}

// loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
                           hev_thresh);
}

void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

// loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i, stride, flim))
            vp8_filter_common_is4tap(dst + i, stride);
}

void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i * stride, 1, flim))
            vp8_filter_common_is4tap(dst + i * stride, 1);
}

void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[2];
    uint64_t tmp[2];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl %[tmp0], 0x0f(%[src]) \n\t"
        "ldr %[tmp0], 0x08(%[src]) \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
        "ldr %[tmp1], 0x08(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "sdl %[tmp0], 0x0f(%[dst]) \n\t"
        "sdr %[tmp0], 0x08(%[dst]) \n\t"
        "addiu %[h], %[h], -0x02 \n\t"
        MMI_SDC1(%[ftmp1], %[addr1], 0x00)
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
        "sdr %[tmp1], 0x08(%[addr1]) \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 16);
#endif
}

void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl %[tmp0], 0x07(%[addr0]) \n\t"
        "ldr %[tmp0], 0x00(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "addiu %[h], %[h], -0x02 \n\t"
        "sdl %[tmp0], 0x07(%[addr1]) \n\t"
        "sdr %[tmp0], 0x00(%[addr1]) \n\t"
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 8);
#endif
}

void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_LWC1(%[ftmp0], %[src], 0x00)
        "lwl %[tmp0], 0x03(%[addr0]) \n\t"
        "lwr %[tmp0], 0x00(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        "addiu %[h], %[h], -0x02 \n\t"
        "swl %[tmp0], 0x03(%[addr1]) \n\t"
        "swr %[tmp0], 0x00(%[addr1]) \n\t"
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 4);
#endif
}

void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[9];
    uint32_t tmp[1];
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    mips_reg src1, dst1;
    DECLARE_VAR_ALL64;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1620 
1621  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1622  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1623  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1624  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1625  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1626  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1627  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1628  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1629  */
1630  __asm__ volatile (
1631  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1632  "li %[tmp0], 0x07 \n\t"
1633  "mtc1 %[tmp0], %[ftmp4] \n\t"
1634 
1635  "1: \n\t"
1636  // 0 - 7
1637  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1638  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1639  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1640  // 8 - 15
1641  PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1642 
1643  "addiu %[h], %[h], -0x01 \n\t"
1644  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1645  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1646  "bnez %[h], 1b \n\t"
1647  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1648  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1649  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1650  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1651  [ftmp8]"=&f"(ftmp[8]),
1652  [tmp0]"=&r"(tmp[0]),
1653  RESTRICT_ASM_ALL64
1654  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1655  [h]"+&r"(h),
1656  [dst]"+&r"(dst), [src]"+&r"(src)
1657  : [ff_pw_64]"f"(ff_pw_64.f),
1658  [srcstride]"r"((mips_reg)srcstride),
1659  [dststride]"r"((mips_reg)dststride),
1660  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
1661  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
1662  : "memory"
1663  );
1664 #else
1665  const uint8_t *filter = subpel_filters[mx - 1];
1666  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1667  int x, y;
1668 
1669  for (y = 0; y < h; y++) {
1670  for (x = 0; x < 16; x++)
1671  dst[x] = FILTER_4TAP(src, filter, 1);
1672  dst += dststride;
1673  src += srcstride;
1674  }
1675 #endif
1676 }
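
/* A scalar sketch (not part of the build) of one output pixel of the
 * 4-tap horizontal filter, matching the commented reference formulas
 * above. filter points at a row of the 8-bit subpel_filters table used
 * by the #else path, and av_clip_uint8() stands in for the cm[] crop
 * table lookup. */
#if 0
static uint8_t epel_h4_one(const uint8_t *src, const uint8_t *filter)
{
    int v = (filter[2] * src[0] - filter[1] * src[-1] +
             filter[3] * src[1] - filter[4] * src[2] + 64) >> 7;
    return av_clip_uint8(v);
}
#endif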
1677 
1678 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1679  ptrdiff_t srcstride, int h, int mx, int my)
1680 {
1681 #if 1
1682  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1683  double ftmp[9];
1684  uint32_t tmp[1];
1685  union av_intfloat64 filter1;
1686  union av_intfloat64 filter2;
1687  union av_intfloat64 filter3;
1688  union av_intfloat64 filter4;
1689  DECLARE_VAR_ALL64;
1690  filter1.i = filter[1];
1691  filter2.i = filter[2];
1692  filter3.i = filter[3];
1693  filter4.i = filter[4];
1694 
1695 
1696  /*
1697  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1698  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1699  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1700  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1701  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1702  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1703  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1704  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1705  */
1706  __asm__ volatile (
1707  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1708  "li %[tmp0], 0x07 \n\t"
1709  "mtc1 %[tmp0], %[ftmp4] \n\t"
1710 
1711  "1: \n\t"
1712  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1713 
1714  "addiu %[h], %[h], -0x01 \n\t"
1715  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1716  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1717  "bnez %[h], 1b \n\t"
1718  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1719  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1720  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1721  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1722  [ftmp8]"=&f"(ftmp[8]),
1723  [tmp0]"=&r"(tmp[0]),
1724  RESTRICT_ASM_ALL64
1725  [h]"+&r"(h),
1726  [dst]"+&r"(dst), [src]"+&r"(src)
1727  : [ff_pw_64]"f"(ff_pw_64.f),
1728  [srcstride]"r"((mips_reg)srcstride),
1729  [dststride]"r"((mips_reg)dststride),
1730  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
1731  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
1732  : "memory"
1733  );
1734 #else
1735  const uint8_t *filter = subpel_filters[mx - 1];
1736  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1737  int x, y;
1738 
1739  for (y = 0; y < h; y++) {
1740  for (x = 0; x < 8; x++)
1741  dst[x] = FILTER_4TAP(src, filter, 1);
1742  dst += dststride;
1743  src += srcstride;
1744  }
1745 #endif
1746 }
1747 
1748 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1749  ptrdiff_t srcstride, int h, int mx, int my)
1750 {
1751 #if 1
1752  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1753  double ftmp[6];
1754  uint32_t tmp[1];
1755  union av_intfloat64 filter1;
1756  union av_intfloat64 filter2;
1757  union av_intfloat64 filter3;
1758  union av_intfloat64 filter4;
1759  DECLARE_VAR_LOW32;
1760  filter1.i = filter[1];
1761  filter2.i = filter[2];
1762  filter3.i = filter[3];
1763  filter4.i = filter[4];
1764 
1765  /*
1766  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1767  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1768  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1769  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1770  */
1771  __asm__ volatile (
1772  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1773  "li %[tmp0], 0x07 \n\t"
1774  "mtc1 %[tmp0], %[ftmp4] \n\t"
1775 
1776  "1: \n\t"
1777  PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1778 
1779  "addiu %[h], %[h], -0x01 \n\t"
1780  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1781  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1782  "bnez %[h], 1b \n\t"
1783  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1784  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1785  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1786  [tmp0]"=&r"(tmp[0]),
1787  RESTRICT_ASM_LOW32
1788  [h]"+&r"(h),
1789  [dst]"+&r"(dst), [src]"+&r"(src)
1790  : [ff_pw_64]"f"(ff_pw_64.f),
1791  [srcstride]"r"((mips_reg)srcstride),
1792  [dststride]"r"((mips_reg)dststride),
1793  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
1794  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
1795  : "memory"
1796  );
1797 #else
1798  const uint8_t *filter = subpel_filters[mx - 1];
1799  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1800  int x, y;
1801 
1802  for (y = 0; y < h; y++) {
1803  for (x = 0; x < 4; x++)
1804  dst[x] = FILTER_4TAP(src, filter, 1);
1805  dst += dststride;
1806  src += srcstride;
1807  }
1808 #endif
1809 }
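
/* Note: the 4-pixel-wide variants move one 32-bit word per row, hence
 * DECLARE_VAR_LOW32 with MMI_LWC1/MMI_SWC1 (and lwl/lwr in the copy
 * routines) instead of the 64-bit ALL64 forms used by the 8- and 16-wide
 * paths. */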
1810 
1811 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1812  ptrdiff_t srcstride, int h, int mx, int my)
1813 {
1814 #if 1
1815  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1816  double ftmp[9];
1817  uint32_t tmp[1];
1818  mips_reg src1, dst1;
1819  union av_intfloat64 filter0;
1820  union av_intfloat64 filter1;
1821  union av_intfloat64 filter2;
1822  union av_intfloat64 filter3;
1823  union av_intfloat64 filter4;
1824  union av_intfloat64 filter5;
1825  DECLARE_VAR_ALL64;
1826  filter0.i = filter[0];
1827  filter1.i = filter[1];
1828  filter2.i = filter[2];
1829  filter3.i = filter[3];
1830  filter4.i = filter[4];
1831  filter5.i = filter[5];
1832 
1833  /*
1834  dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1835  dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1836  dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1837  dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1838  dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1839  dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1840  dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1841  dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1842 
1843  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1844  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1845  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1846  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1847  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1848  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1849  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1850  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1851  */
1852  __asm__ volatile (
1853  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1854  "li %[tmp0], 0x07 \n\t"
1855  "mtc1 %[tmp0], %[ftmp4] \n\t"
1856 
1857  "1: \n\t"
1858  // 0 - 7
1859  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1860  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1861  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1862  // 8 - 15
1863  PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1864 
1865  "addiu %[h], %[h], -0x01 \n\t"
1866  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1867  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1868  "bnez %[h], 1b \n\t"
1869  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1870  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1871  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1872  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1873  [ftmp8]"=&f"(ftmp[8]),
1874  [tmp0]"=&r"(tmp[0]),
1875  RESTRICT_ASM_ALL64
1876  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1877  [h]"+&r"(h),
1878  [dst]"+&r"(dst), [src]"+&r"(src)
1879  : [ff_pw_64]"f"(ff_pw_64.f),
1880  [srcstride]"r"((mips_reg)srcstride),
1881  [dststride]"r"((mips_reg)dststride),
1882  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
1883  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
1884  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
1885  : "memory"
1886  );
1887 #else
1888  const uint8_t *filter = subpel_filters[mx - 1];
1889  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1890  int x, y;
1891 
1892  for (y = 0; y < h; y++) {
1893  for (x = 0; x < 16; x++)
1894  dst[x] = FILTER_6TAP(src, filter, 1);
1895  dst += dststride;
1896  src += srcstride;
1897  }
1898 #endif
1899 }
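
/* A sanity sketch (not part of the build): the six signed taps of every
 * VP8 six-tap filter sum to 128, so on flat input the filter is the
 * identity, (128 * v + 64) >> 7 == v. The coefficients below are the
 * half-pel row from the VP8 specification; assert() is from <assert.h>. */
#if 0
static void epel_h6_flat_check(void)
{
    static const int F[6] = { 3, 16, 77, 77, 16, 3 }; /* mx == 4 */
    int v   = 200;
    int out = (F[2] * v - F[1] * v + F[0] * v +
               F[3] * v - F[4] * v + F[5] * v + 64) >> 7;

    assert(out == v);   /* 3 - 16 + 77 + 77 - 16 + 3 == 128 */
}
#endif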
1900 
1901 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1902  ptrdiff_t srcstride, int h, int mx, int my)
1903 {
1904 #if 1
1905  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1906  double ftmp[9];
1907  uint32_t tmp[1];
1908  union av_intfloat64 filter0;
1909  union av_intfloat64 filter1;
1910  union av_intfloat64 filter2;
1911  union av_intfloat64 filter3;
1912  union av_intfloat64 filter4;
1913  union av_intfloat64 filter5;
1914  DECLARE_VAR_ALL64;
1915  filter0.i = filter[0];
1916  filter1.i = filter[1];
1917  filter2.i = filter[2];
1918  filter3.i = filter[3];
1919  filter4.i = filter[4];
1920  filter5.i = filter[5];
1921 
1922  /*
1923  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1924  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1925  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1926  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1927  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1928  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1929  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1930  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1931  */
1932  __asm__ volatile (
1933  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1934  "li %[tmp0], 0x07 \n\t"
1935  "mtc1 %[tmp0], %[ftmp4] \n\t"
1936 
1937  "1: \n\t"
1938  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1939 
1940  "addiu %[h], %[h], -0x01 \n\t"
1941  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1942  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1943  "bnez %[h], 1b \n\t"
1944  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1945  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1946  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1947  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1948  [ftmp8]"=&f"(ftmp[8]),
1949  [tmp0]"=&r"(tmp[0]),
1950  RESTRICT_ASM_ALL64
1951  [h]"+&r"(h),
1952  [dst]"+&r"(dst), [src]"+&r"(src)
1953  : [ff_pw_64]"f"(ff_pw_64.f),
1954  [srcstride]"r"((mips_reg)srcstride),
1955  [dststride]"r"((mips_reg)dststride),
1956  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
1957  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
1958  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
1959  : "memory"
1960  );
1961 #else
1962  const uint8_t *filter = subpel_filters[mx - 1];
1963  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1964  int x, y;
1965 
1966  for (y = 0; y < h; y++) {
1967  for (x = 0; x < 8; x++)
1968  dst[x] = FILTER_6TAP(src, filter, 1);
1969  dst += dststride;
1970  src += srcstride;
1971  }
1972 #endif
1973 }
1974 
1975 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1976  ptrdiff_t srcstride, int h, int mx, int my)
1977 {
1978 #if 1
1979  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1980  double ftmp[6];
1981  uint32_t tmp[1];
1982  union av_intfloat64 filter0;
1983  union av_intfloat64 filter1;
1984  union av_intfloat64 filter2;
1985  union av_intfloat64 filter3;
1986  union av_intfloat64 filter4;
1987  union av_intfloat64 filter5;
1988  DECLARE_VAR_LOW32;
1989  filter0.i = filter[0];
1990  filter1.i = filter[1];
1991  filter2.i = filter[2];
1992  filter3.i = filter[3];
1993  filter4.i = filter[4];
1994  filter5.i = filter[5];
1995 
1996  /*
1997  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1998  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1999  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
2000  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
2001  */
2002  __asm__ volatile (
2003  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2004  "li %[tmp0], 0x07 \n\t"
2005  "mtc1 %[tmp0], %[ftmp4] \n\t"
2006 
2007  "1: \n\t"
2008  PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
2009 
2010  "addiu %[h], %[h], -0x01 \n\t"
2011  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2012  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2013  "bnez %[h], 1b \n\t"
2014  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2015  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2016  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2017  [tmp0]"=&r"(tmp[0]),
2018  RESTRICT_ASM_LOW32
2019  [h]"+&r"(h),
2020  [dst]"+&r"(dst), [src]"+&r"(src)
2021  : [ff_pw_64]"f"(ff_pw_64.f),
2022  [srcstride]"r"((mips_reg)srcstride),
2023  [dststride]"r"((mips_reg)dststride),
2024  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2025  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2026  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2027  : "memory"
2028  );
2029 #else
2030  const uint8_t *filter = subpel_filters[mx - 1];
2031  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2032  int x, y;
2033 
2034  for (y = 0; y < h; y++) {
2035  for (x = 0; x < 4; x++)
2036  dst[x] = FILTER_6TAP(src, filter, 1);
2037  dst += dststride;
2038  src += srcstride;
2039  }
2040 #endif
2041 }
2042 
2043 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2044  ptrdiff_t srcstride, int h, int mx, int my)
2045 {
2046 #if 1
2047  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2048  double ftmp[9];
2049  uint32_t tmp[1];
2050  mips_reg src0, src1, dst0;
2051  union av_intfloat64 filter1;
2052  union av_intfloat64 filter2;
2053  union av_intfloat64 filter3;
2054  union av_intfloat64 filter4;
2055  DECLARE_VAR_ALL64;
2056  filter1.i = filter[1];
2057  filter2.i = filter[2];
2058  filter3.i = filter[3];
2059  filter4.i = filter[4];
2060 
2061  /*
2062  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2063  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2064  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2065  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2066  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2067  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2068  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2069  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2070 
2071  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2072  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2073  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2074  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2075  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2076  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2077  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2078  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2079  */
2080  __asm__ volatile (
2081  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2082  "li %[tmp0], 0x07 \n\t"
2083  "mtc1 %[tmp0], %[ftmp4] \n\t"
2084 
2085  "1: \n\t"
2086  // 0 - 7
2087  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2088  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2089  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2090  // 8 - 15
2091  PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2092 
2093  "addiu %[h], %[h], -0x01 \n\t"
2094  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2095  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2096  "bnez %[h], 1b \n\t"
2097  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2098  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2099  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2100  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2101  [ftmp8]"=&f"(ftmp[8]),
2102  [tmp0]"=&r"(tmp[0]),
2103  RESTRICT_ASM_ALL64
2104  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2105  [src1]"=&r"(src1),
2106  [h]"+&r"(h),
2107  [dst]"+&r"(dst), [src]"+&r"(src)
2108  : [ff_pw_64]"f"(ff_pw_64.f),
2109  [srcstride]"r"((mips_reg)srcstride),
2110  [dststride]"r"((mips_reg)dststride),
2111  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
2112  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
2113  : "memory"
2114  );
2115 #else
2116  const uint8_t *filter = subpel_filters[my - 1];
2117  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2118  int x, y;
2119 
2120  for (y = 0; y < h; y++) {
2121  for (x = 0; x < 16; x++)
2122  dst[x] = FILTER_4TAP(src, filter, srcstride);
2123  dst += dststride;
2124  src += srcstride;
2125  }
2126 #endif
2127 }
2128 
2129 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2130  ptrdiff_t srcstride, int h, int mx, int my)
2131 {
2132 #if 1
2133  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2134  double ftmp[9];
2135  uint32_t tmp[1];
2136  mips_reg src1;
2137  union av_intfloat64 filter1;
2138  union av_intfloat64 filter2;
2139  union av_intfloat64 filter3;
2140  union av_intfloat64 filter4;
2141  DECLARE_VAR_ALL64;
2142  filter1.i = filter[1];
2143  filter2.i = filter[2];
2144  filter3.i = filter[3];
2145  filter4.i = filter[4];
2146 
2147  /*
2148  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2149  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2150  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2151  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2152  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2153  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2154  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2155  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2156  */
2157  __asm__ volatile (
2158  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2159  "li %[tmp0], 0x07 \n\t"
2160  "mtc1 %[tmp0], %[ftmp4] \n\t"
2161 
2162  "1: \n\t"
2163  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2164 
2165  "addiu %[h], %[h], -0x01 \n\t"
2166  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2167  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2168  "bnez %[h], 1b \n\t"
2169  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2170  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2171  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2172  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2173  [ftmp8]"=&f"(ftmp[8]),
2174  [tmp0]"=&r"(tmp[0]),
2175  RESTRICT_ASM_ALL64
2176  [src1]"=&r"(src1),
2177  [h]"+&r"(h),
2178  [dst]"+&r"(dst), [src]"+&r"(src)
2179  : [ff_pw_64]"f"(ff_pw_64.f),
2180  [srcstride]"r"((mips_reg)srcstride),
2181  [dststride]"r"((mips_reg)dststride),
2182  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
2183  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
2184  : "memory"
2185  );
2186 #else
2187  const uint8_t *filter = subpel_filters[my - 1];
2188  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2189  int x, y;
2190 
2191  for (y = 0; y < h; y++) {
2192  for (x = 0; x < 8; x++)
2193  dst[x] = FILTER_4TAP(src, filter, srcstride);
2194  dst += dststride;
2195  src += srcstride;
2196  }
2197 #endif
2198 }
2199 
2200 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2201  ptrdiff_t srcstride, int h, int mx, int my)
2202 {
2203 #if 1
2204  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2205  double ftmp[6];
2206  uint32_t tmp[1];
2207  mips_reg src1;
2208  union av_intfloat64 filter1;
2209  union av_intfloat64 filter2;
2210  union av_intfloat64 filter3;
2211  union av_intfloat64 filter4;
2212  DECLARE_VAR_LOW32;
2213  filter1.i = filter[1];
2214  filter2.i = filter[2];
2215  filter3.i = filter[3];
2216  filter4.i = filter[4];
2217 
2218  /*
2219  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2220  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2221  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2222  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2223  */
2224  __asm__ volatile (
2225  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2226  "li %[tmp0], 0x07 \n\t"
2227  "mtc1 %[tmp0], %[ftmp4] \n\t"
2228 
2229  "1: \n\t"
2230  PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2231 
2232  "addiu %[h], %[h], -0x01 \n\t"
2233  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2234  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2235  "bnez %[h], 1b \n\t"
2236  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2237  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2238  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2239  [tmp0]"=&r"(tmp[0]),
2240  RESTRICT_ASM_LOW32
2241  [src1]"=&r"(src1),
2242  [h]"+&r"(h),
2243  [dst]"+&r"(dst), [src]"+&r"(src)
2244  : [ff_pw_64]"f"(ff_pw_64.f),
2245  [srcstride]"r"((mips_reg)srcstride),
2246  [dststride]"r"((mips_reg)dststride),
2247  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
2248  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
2249  : "memory"
2250  );
2251 #else
2252  const uint8_t *filter = subpel_filters[my - 1];
2253  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2254  int x, y;
2255 
2256  for (y = 0; y < h; y++) {
2257  for (x = 0; x < 4; x++)
2258  dst[x] = FILTER_4TAP(src, filter, srcstride);
2259  dst += dststride;
2260  src += srcstride;
2261  }
2262 #endif
2263 }
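
/* A scalar sketch (not part of the build): the vertical filters use the
 * same arithmetic as the horizontal ones with the tap distance widened
 * from 1 pixel to srcstride, as in the reference formulas above. */
#if 0
static uint8_t epel_v4_one(const uint8_t *src, ptrdiff_t stride,
                           const uint8_t *filter)
{
    int v = (filter[2] * src[0]      - filter[1] * src[-stride] +
             filter[3] * src[stride] - filter[4] * src[2 * stride] + 64) >> 7;
    return av_clip_uint8(v);
}
#endif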
2264 
2265 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2266  ptrdiff_t srcstride, int h, int mx, int my)
2267 {
2268 #if 1
2269  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2270  double ftmp[9];
2271  uint32_t tmp[1];
2272  mips_reg src0, src1, dst0;
2273  union av_intfloat64 filter0;
2274  union av_intfloat64 filter1;
2275  union av_intfloat64 filter2;
2276  union av_intfloat64 filter3;
2277  union av_intfloat64 filter4;
2278  union av_intfloat64 filter5;
2279  DECLARE_VAR_ALL64;
2280  filter0.i = filter[0];
2281  filter1.i = filter[1];
2282  filter2.i = filter[2];
2283  filter3.i = filter[3];
2284  filter4.i = filter[4];
2285  filter5.i = filter[5];
2286 
2287  /*
2288  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2289  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2290  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2291  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2292  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2293  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2294  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2295  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2296 
2297  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2298  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2299  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2300  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2301  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2302  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2303  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2304  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2305  */
2306  __asm__ volatile (
2307  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2308  "li %[tmp0], 0x07 \n\t"
2309  "mtc1 %[tmp0], %[ftmp4] \n\t"
2310 
2311  "1: \n\t"
2312  // 0 - 7
2313  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2314  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2315  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2316  // 8 - 15
2317  PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2318 
2319  "addiu %[h], %[h], -0x01 \n\t"
2320  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2321  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2322  "bnez %[h], 1b \n\t"
2323  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2324  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2325  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2326  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2327  [ftmp8]"=&f"(ftmp[8]),
2328  [tmp0]"=&r"(tmp[0]),
2329  RESTRICT_ASM_ALL64
2330  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2331  [src1]"=&r"(src1),
2332  [h]"+&r"(h),
2333  [dst]"+&r"(dst), [src]"+&r"(src)
2334  : [ff_pw_64]"f"(ff_pw_64.f),
2335  [srcstride]"r"((mips_reg)srcstride),
2336  [dststride]"r"((mips_reg)dststride),
2337  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2338  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2339  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2340  : "memory"
2341  );
2342 #else
2343  const uint8_t *filter = subpel_filters[my - 1];
2344  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2345  int x, y;
2346 
2347  for (y = 0; y < h; y++) {
2348  for (x = 0; x < 16; x++)
2349  dst[x] = FILTER_6TAP(src, filter, srcstride);
2350  dst += dststride;
2351  src += srcstride;
2352  }
2353 #endif
2354 }
2355 
2356 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2357  ptrdiff_t srcstride, int h, int mx, int my)
2358 {
2359 #if 1
2360  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2361  double ftmp[9];
2362  uint32_t tmp[1];
2363  mips_reg src1;
2364  union av_intfloat64 filter0;
2365  union av_intfloat64 filter1;
2366  union av_intfloat64 filter2;
2367  union av_intfloat64 filter3;
2368  union av_intfloat64 filter4;
2369  union av_intfloat64 filter5;
2370  DECLARE_VAR_ALL64;
2371  filter0.i = filter[0];
2372  filter1.i = filter[1];
2373  filter2.i = filter[2];
2374  filter3.i = filter[3];
2375  filter4.i = filter[4];
2376  filter5.i = filter[5];
2377 
2378  /*
2379  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2380  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2381  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2382  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2383  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2384  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2385  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2386  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2387  */
2388  __asm__ volatile (
2389  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2390  "li %[tmp0], 0x07 \n\t"
2391  "mtc1 %[tmp0], %[ftmp4] \n\t"
2392 
2393  "1: \n\t"
2394  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2395 
2396  "addiu %[h], %[h], -0x01 \n\t"
2397  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2398  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2399  "bnez %[h], 1b \n\t"
2400  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2401  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2402  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2403  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2404  [ftmp8]"=&f"(ftmp[8]),
2405  [tmp0]"=&r"(tmp[0]),
2406  RESTRICT_ASM_ALL64
2407  [src1]"=&r"(src1),
2408  [h]"+&r"(h),
2409  [dst]"+&r"(dst), [src]"+&r"(src)
2410  : [ff_pw_64]"f"(ff_pw_64.f),
2411  [srcstride]"r"((mips_reg)srcstride),
2412  [dststride]"r"((mips_reg)dststride),
2413  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2414  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2415  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2416  : "memory"
2417  );
2418 #else
2419  const uint8_t *filter = subpel_filters[my - 1];
2420  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2421  int x, y;
2422 
2423  for (y = 0; y < h; y++) {
2424  for (x = 0; x < 8; x++)
2425  dst[x] = FILTER_6TAP(src, filter, srcstride);
2426  dst += dststride;
2427  src += srcstride;
2428  }
2429 #endif
2430 }
2431 
2432 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2433  ptrdiff_t srcstride, int h, int mx, int my)
2434 {
2435 #if 1
2436  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2437  double ftmp[6];
2438  uint32_t tmp[1];
2439  mips_reg src1;
2440  union av_intfloat64 filter0;
2441  union av_intfloat64 filter1;
2442  union av_intfloat64 filter2;
2443  union av_intfloat64 filter3;
2444  union av_intfloat64 filter4;
2445  union av_intfloat64 filter5;
2446  DECLARE_VAR_LOW32;
2447  filter0.i = filter[0];
2448  filter1.i = filter[1];
2449  filter2.i = filter[2];
2450  filter3.i = filter[3];
2451  filter4.i = filter[4];
2452  filter5.i = filter[5];
2453 
2454  /*
2455  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2456  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2457  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2458  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2459  */
2460  __asm__ volatile (
2461  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2462  "li %[tmp0], 0x07 \n\t"
2463  "mtc1 %[tmp0], %[ftmp4] \n\t"
2464 
2465  "1: \n\t"
2466  PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2467 
2468  "addiu %[h], %[h], -0x01 \n\t"
2469  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2470  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2471  "bnez %[h], 1b \n\t"
2472  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2473  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2474  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2475  [tmp0]"=&r"(tmp[0]),
2476  RESTRICT_ASM_LOW32
2477  [src1]"=&r"(src1),
2478  [h]"+&r"(h),
2479  [dst]"+&r"(dst), [src]"+&r"(src)
2480  : [ff_pw_64]"f"(ff_pw_64.f),
2481  [srcstride]"r"((mips_reg)srcstride),
2482  [dststride]"r"((mips_reg)dststride),
2483  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2484  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2485  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2486  : "memory"
2487  );
2488 #else
2489  const uint8_t *filter = subpel_filters[my - 1];
2490  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2491  int x, y;
2492 
2493  for (y = 0; y < h; y++) {
2494  for (x = 0; x < 4; x++)
2495  dst[x] = FILTER_6TAP(src, filter, srcstride);
2496  dst += dststride;
2497  src += srcstride;
2498  }
2499 #endif
2500 }
2501 
2502 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2503  ptrdiff_t srcstride, int h, int mx, int my)
2504 {
2505 #if 1
2506  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2507  uint8_t *tmp = tmp_array;
2508 
2509  src -= srcstride;
2510  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2511  tmp = tmp_array + 16;
2512  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2513 #else
2514  const uint8_t *filter = subpel_filters[mx - 1];
2515  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2516  int x, y;
2517  uint8_t tmp_array[560];
2518  uint8_t *tmp = tmp_array;
2519 
2520  src -= srcstride;
2521 
2522  for (y = 0; y < h + 3; y++) {
2523  for (x = 0; x < 16; x++)
2524  tmp[x] = FILTER_4TAP(src, filter, 1);
2525  tmp += 16;
2526  src += srcstride;
2527  }
2528 
2529  tmp = tmp_array + 16;
2530  filter = subpel_filters[my - 1];
2531 
2532  for (y = 0; y < h; y++) {
2533  for (x = 0; x < 16; x++)
2534  dst[x] = FILTER_4TAP(tmp, filter, 16);
2535  dst += dststride;
2536  tmp += 16;
2537  }
2538 #endif
2539 }
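
/* All h*v* wrappers here share one two-pass shape: horizontally filter
 * (h + context) rows into a stack buffer, then run the vertical filter
 * over that buffer with a fixed stride. A 4-tap vertical pass needs 1 row
 * above and 2 below (h + 3 rows, start one row in); a 6-tap pass needs
 * 2 above and 3 below (h + 5 rows, start two rows in). A sketch (not part
 * of the build); hfilter/vfilter stand for the ff_put_vp8_epel16_h*_mmi /
 * _v*_mmi pairs actually called. */
#if 0
typedef void (*epel_fn)(uint8_t *, ptrdiff_t, uint8_t *, ptrdiff_t,
                        int, int, int);

static void epel16_hv(uint8_t *dst, ptrdiff_t dststride,
                      uint8_t *src, ptrdiff_t srcstride, int h,
                      int mx, int my,
                      epel_fn hfilter, int vtaps, epel_fn vfilter)
{
    DECLARE_ALIGNED(8, uint8_t, tmp[592]);
    int above = (vtaps == 6) ? 2 : 1;   /* rows of top context     */
    int extra = (vtaps == 6) ? 5 : 3;   /* total extra rows needed */

    hfilter(tmp, 16, src - above * srcstride, srcstride, h + extra, mx, my);
    vfilter(dst, dststride, tmp + above * 16, 16, h, mx, my);
}
#endif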
2540 
2541 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2542  ptrdiff_t srcstride, int h, int mx, int my)
2543 {
2544 #if 1
2545  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2546  uint8_t *tmp = tmp_array;
2547 
2548  src -= srcstride;
2549  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2550  tmp = tmp_array + 8;
2551  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2552 #else
2553  const uint8_t *filter = subpel_filters[mx - 1];
2554  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2555  int x, y;
2556  uint8_t tmp_array[152];
2557  uint8_t *tmp = tmp_array;
2558 
2559  src -= srcstride;
2560 
2561  for (y = 0; y < h + 3; y++) {
2562  for (x = 0; x < 8; x++)
2563  tmp[x] = FILTER_4TAP(src, filter, 1);
2564  tmp += 8;
2565  src += srcstride;
2566  }
2567 
2568  tmp = tmp_array + 8;
2569  filter = subpel_filters[my - 1];
2570 
2571  for (y = 0; y < h; y++) {
2572  for (x = 0; x < 8; x++)
2573  dst[x] = FILTER_4TAP(tmp, filter, 8);
2574  dst += dststride;
2575  tmp += 8;
2576  }
2577 #endif
2578 }
2579 
2580 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2581  ptrdiff_t srcstride, int h, int mx, int my)
2582 {
2583 #if 1
2584  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2585  uint8_t *tmp = tmp_array;
2586 
2587  src -= srcstride;
2588  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2589  tmp = tmp_array + 4;
2590  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2591 #else
2592  const uint8_t *filter = subpel_filters[mx - 1];
2593  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2594  int x, y;
2595  uint8_t tmp_array[44];
2596  uint8_t *tmp = tmp_array;
2597 
2598  src -= srcstride;
2599 
2600  for (y = 0; y < h + 3; y++) {
2601  for (x = 0; x < 4; x++)
2602  tmp[x] = FILTER_4TAP(src, filter, 1);
2603  tmp += 4;
2604  src += srcstride;
2605  }
2606  tmp = tmp_array + 4;
2607  filter = subpel_filters[my - 1];
2608 
2609  for (y = 0; y < h; y++) {
2610  for (x = 0; x < 4; x++)
2611  dst[x] = FILTER_4TAP(tmp, filter, 4);
2612  dst += dststride;
2613  tmp += 4;
2614  }
2615 #endif
2616 }
2617 
2618 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2619  ptrdiff_t srcstride, int h, int mx, int my)
2620 {
2621 #if 1
2622  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2623  uint8_t *tmp = tmp_array;
2624 
2625  src -= 2 * srcstride;
2626  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2627  tmp = tmp_array + 32;
2628  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2629 #else
2630  const uint8_t *filter = subpel_filters[mx - 1];
2631  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2632  int x, y;
2633  uint8_t tmp_array[592];
2634  uint8_t *tmp = tmp_array;
2635 
2636  src -= 2 * srcstride;
2637 
2638  for (y = 0; y < h + 5; y++) {
2639  for (x = 0; x < 16; x++)
2640  tmp[x] = FILTER_4TAP(src, filter, 1);
2641  tmp += 16;
2642  src += srcstride;
2643  }
2644 
2645  tmp = tmp_array + 32;
2646  filter = subpel_filters[my - 1];
2647 
2648  for (y = 0; y < h; y++) {
2649  for (x = 0; x < 16; x++)
2650  dst[x] = FILTER_6TAP(tmp, filter, 16);
2651  dst += dststride;
2652  tmp += 16;
2653  }
2654 #endif
2655 }
2656 
2657 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2658  ptrdiff_t srcstride, int h, int mx, int my)
2659 {
2660 #if 1
2661  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2662  uint8_t *tmp = tmp_array;
2663 
2664  src -= 2 * srcstride;
2665  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2666  tmp = tmp_array + 16;
2667  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2668 #else
2669  const uint8_t *filter = subpel_filters[mx - 1];
2670  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2671  int x, y;
2672  uint8_t tmp_array[168];
2673  uint8_t *tmp = tmp_array;
2674 
2675  src -= 2 * srcstride;
2676 
2677  for (y = 0; y < h + 5; y++) {
2678  for (x = 0; x < 8; x++)
2679  tmp[x] = FILTER_4TAP(src, filter, 1);
2680  tmp += 8;
2681  src += srcstride;
2682  }
2683 
2684  tmp = tmp_array + 16;
2685  filter = subpel_filters[my - 1];
2686 
2687  for (y = 0; y < h; y++) {
2688  for (x = 0; x < 8; x++)
2689  dst[x] = FILTER_6TAP(tmp, filter, 8);
2690  dst += dststride;
2691  tmp += 8;
2692  }
2693 #endif
2694 }
2695 
2696 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2697  ptrdiff_t srcstride, int h, int mx, int my)
2698 {
2699 #if 1
2700  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2701  uint8_t *tmp = tmp_array;
2702 
2703  src -= 2 * srcstride;
2704  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2705  tmp = tmp_array + 8;
2706  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2707 #else
2708  const uint8_t *filter = subpel_filters[mx - 1];
2709  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2710  int x, y;
2711  uint8_t tmp_array[52];
2712  uint8_t *tmp = tmp_array;
2713 
2714  src -= 2 * srcstride;
2715 
2716  for (y = 0; y < h + 5; y++) {
2717  for (x = 0; x < 4; x++)
2718  tmp[x] = FILTER_4TAP(src, filter, 1);
2719  tmp += 4;
2720  src += srcstride;
2721  }
2722 
2723  tmp = tmp_array + 8;
2724  filter = subpel_filters[my - 1];
2725 
2726  for (y = 0; y < h; y++) {
2727  for (x = 0; x < 4; x++)
2728  dst[x] = FILTER_6TAP(tmp, filter, 4);
2729  dst += dststride;
2730  tmp += 4;
2731  }
2732 #endif
2733 }
2734 
2735 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2736  ptrdiff_t srcstride, int h, int mx, int my)
2737 {
2738 #if 1
2739  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2740  uint8_t *tmp = tmp_array;
2741 
2742  src -= srcstride;
2743  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2744  tmp = tmp_array + 16;
2745  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2746 #else
2747  const uint8_t *filter = subpel_filters[mx - 1];
2748  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2749  int x, y;
2750  uint8_t tmp_array[560];
2751  uint8_t *tmp = tmp_array;
2752 
2753  src -= srcstride;
2754 
2755  for (y = 0; y < h + 3; y++) {
2756  for (x = 0; x < 16; x++)
2757  tmp[x] = FILTER_6TAP(src, filter, 1);
2758  tmp += 16;
2759  src += srcstride;
2760  }
2761 
2762  tmp = tmp_array + 16;
2763  filter = subpel_filters[my - 1];
2764 
2765  for (y = 0; y < h; y++) {
2766  for (x = 0; x < 16; x++)
2767  dst[x] = FILTER_4TAP(tmp, filter, 16);
2768  dst += dststride;
2769  tmp += 16;
2770  }
2771 #endif
2772 }
2773 
2774 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2775  ptrdiff_t srcstride, int h, int mx, int my)
2776 {
2777 #if 1
2778  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2779  uint8_t *tmp = tmp_array;
2780 
2781  src -= srcstride;
2782  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2783  tmp = tmp_array + 8;
2784  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2785 #else
2786  const uint8_t *filter = subpel_filters[mx - 1];
2787  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2788  int x, y;
2789  uint8_t tmp_array[152];
2790  uint8_t *tmp = tmp_array;
2791 
2792  src -= srcstride;
2793 
2794  for (y = 0; y < h + 3; y++) {
2795  for (x = 0; x < 8; x++)
2796  tmp[x] = FILTER_6TAP(src, filter, 1);
2797  tmp += 8;
2798  src += srcstride;
2799  }
2800 
2801  tmp = tmp_array + 8;
2802  filter = subpel_filters[my - 1];
2803 
2804  for (y = 0; y < h; y++) {
2805  for (x = 0; x < 8; x++)
2806  dst[x] = FILTER_4TAP(tmp, filter, 8);
2807  dst += dststride;
2808  tmp += 8;
2809  }
2810 #endif
2811 }
2812 
2813 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2814  ptrdiff_t srcstride, int h, int mx, int my)
2815 {
2816 #if 1
2817  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2818  uint8_t *tmp = tmp_array;
2819 
2820  src -= srcstride;
2821  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2822  tmp = tmp_array + 4;
2823  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2824 #else
2825  const uint8_t *filter = subpel_filters[mx - 1];
2826  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2827  int x, y;
2828  uint8_t tmp_array[44];
2829  uint8_t *tmp = tmp_array;
2830 
2831  src -= srcstride;
2832 
2833  for (y = 0; y < h + 3; y++) {
2834  for (x = 0; x < 4; x++)
2835  tmp[x] = FILTER_6TAP(src, filter, 1);
2836  tmp += 4;
2837  src += srcstride;
2838  }
2839 
2840  tmp = tmp_array + 4;
2841  filter = subpel_filters[my - 1];
2842 
2843  for (y = 0; y < h; y++) {
2844  for (x = 0; x < 4; x++)
2845  dst[x] = FILTER_4TAP(tmp, filter, 4);
2846  dst += dststride;
2847  tmp += 4;
2848  }
2849 #endif
2850 }
2851 
2852 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2853  ptrdiff_t srcstride, int h, int mx, int my)
2854 {
2855 #if 1
2856  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2857  uint8_t *tmp = tmp_array;
2858 
2859  src -= 2 * srcstride;
2860  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2861  tmp = tmp_array + 32;
2862  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2863 #else
2864  const uint8_t *filter = subpel_filters[mx - 1];
2865  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2866  int x, y;
2867  uint8_t tmp_array[592];
2868  uint8_t *tmp = tmp_array;
2869 
2870  src -= 2 * srcstride;
2871 
2872  for (y = 0; y < h + 5; y++) {
2873  for (x = 0; x < 16; x++)
2874  tmp[x] = FILTER_6TAP(src, filter, 1);
2875  tmp += 16;
2876  src += srcstride;
2877  }
2878 
2879  tmp = tmp_array + 32;
2880  filter = subpel_filters[my - 1];
2881 
2882  for (y = 0; y < h; y++) {
2883  for (x = 0; x < 16; x++)
2884  dst[x] = FILTER_6TAP(tmp, filter, 16);
2885  dst += dststride;
2886  tmp += 16;
2887  }
2888 #endif
2889 }
2890 
2891 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2892  ptrdiff_t srcstride, int h, int mx, int my)
2893 {
2894 #if 1
2895  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2896  uint8_t *tmp = tmp_array;
2897 
2898  src -= 2 * srcstride;
2899  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2900  tmp = tmp_array + 16;
2901  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2902 #else
2903  const uint8_t *filter = subpel_filters[mx - 1];
2904  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2905  int x, y;
2906  uint8_t tmp_array[168];
2907  uint8_t *tmp = tmp_array;
2908 
2909  src -= 2 * srcstride;
2910 
2911  for (y = 0; y < h + 5; y++) {
2912  for (x = 0; x < 8; x++)
2913  tmp[x] = FILTER_6TAP(src, filter, 1);
2914  tmp += 8;
2915  src += srcstride;
2916  }
2917 
2918  tmp = tmp_array + 16;
2919  filter = subpel_filters[my - 1];
2920 
2921  for (y = 0; y < h; y++) {
2922  for (x = 0; x < 8; x++)
2923  dst[x] = FILTER_6TAP(tmp, filter, 8);
2924  dst += dststride;
2925  tmp += 8;
2926  }
2927 #endif
2928 }
2929 
2930 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2931  ptrdiff_t srcstride, int h, int mx, int my)
2932 {
2933 #if 1
2934  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2935  uint8_t *tmp = tmp_array;
2936 
2937  src -= 2 * srcstride;
2938  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2939  tmp = tmp_array + 8;
2940  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2941 #else
2942  const uint8_t *filter = subpel_filters[mx - 1];
2943  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2944  int x, y;
2945  uint8_t tmp_array[52];
2946  uint8_t *tmp = tmp_array;
2947 
2948  src -= 2 * srcstride;
2949 
2950  for (y = 0; y < h + 5; y++) {
2951  for (x = 0; x < 4; x++)
2952  tmp[x] = FILTER_6TAP(src, filter, 1);
2953  tmp += 4;
2954  src += srcstride;
2955  }
2956 
2957  tmp = tmp_array + 8;
2958  filter = subpel_filters[my - 1];
2959 
2960  for (y = 0; y < h; y++) {
2961  for (x = 0; x < 4; x++)
2962  dst[x] = FILTER_6TAP(tmp, filter, 4);
2963  dst += dststride;
2964  tmp += 4;
2965  }
2966 #endif
2967 }
2968 
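/*
 * Editorial note: VP8 bilinear motion compensation computes each output
 * pixel as dst = (a * s0 + b * s1 + 4) >> 3 with a = 8 - mx and b = mx
 * (my for the vertical variants). For example, with mx = 3 and source
 * pixels 100 and 120: (5 * 100 + 3 * 120 + 4) >> 3 = 864 >> 3 = 108.
 * The asm broadcasts the two weights across all halfword lanes with
 * pshufh against a zero selector and uses ff_pw_4 as the rounding term;
 * PUT_VP8_BILINEAR8_H_MMI covers eight pixels, so the 16-wide function
 * invokes it twice per row.
 */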
2969 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2970  ptrdiff_t sstride, int h, int mx, int my)
2971 {
2972 #if 1
2973  union mmi_intfloat64 a, b;
2974  double ftmp[7];
2975  uint32_t tmp[1];
2976  mips_reg dst0, src0;
2977  DECLARE_VAR_ALL64;
2978  a.i = 8 - mx;
2979  b.i = mx;
2980 
2981  /*
2982  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2983  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2984  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2985  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2986  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2987  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2988  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2989  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2990 
2991  dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2992  dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2993  dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2994  dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2995  dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2996  dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2997  dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2998  dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2999  */
3000  __asm__ volatile (
3001  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3002  "li %[tmp0], 0x03 \n\t"
3003  "mtc1 %[tmp0], %[ftmp4] \n\t"
3004  "pshufh %[a], %[a], %[ftmp0] \n\t"
3005  "pshufh %[b], %[b], %[ftmp0] \n\t"
3006 
3007  "1: \n\t"
3008  // 0 - 7
3009  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3010  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
3011  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
3012  // 8 - 15
3013  PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
3014 
3015  "addiu %[h], %[h], -0x01 \n\t"
3016  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3017  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3018  "bnez %[h], 1b \n\t"
3019  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3020  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3021  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3022  [ftmp6]"=&f"(ftmp[6]),
3023  [tmp0]"=&r"(tmp[0]),
3024  RESTRICT_ASM_ALL64
3025  [dst0]"=&r"(dst0), [src0]"=&r"(src0),
3026  [h]"+&r"(h),
3027  [dst]"+&r"(dst), [src]"+&r"(src),
3028  [a]"+&f"(a.f), [b]"+&f"(b.f)
3029  : [sstride]"r"((mips_reg)sstride),
3030  [dstride]"r"((mips_reg)dstride),
3031  [ff_pw_4]"f"(ff_pw_4.f)
3032  : "memory"
3033  );
3034 #else
3035  int a = 8 - mx, b = mx;
3036  int x, y;
3037 
3038  for (y = 0; y < h; y++) {
3039  for (x = 0; x < 16; x++)
3040  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3041  dst += dstride;
3042  src += sstride;
3043  }
3044 #endif
3045 }
3046 
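/*
 * Editorial note: vertical counterpart of the function above; the
 * weights derive from my, and the second tap is the pixel one line
 * below, so the macro additionally receives a scratch register (src1),
 * presumably used to address src + sstride.
 */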
3047 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3048  ptrdiff_t sstride, int h, int mx, int my)
3049 {
3050 #if 1
3051  union mmi_intfloat64 c, d;
3052  double ftmp[7];
3053  uint32_t tmp[1];
3054  mips_reg src0, src1, dst0;
3055  DECLARE_VAR_ALL64;
3056  c.i = 8 - my;
3057  d.i = my;
3058 
3059  /*
3060  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3061  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3062  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3063  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3064  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3065  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3066  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3067  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3068  */
3069  __asm__ volatile (
3070  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3071  "li %[tmp0], 0x03 \n\t"
3072  "mtc1 %[tmp0], %[ftmp4] \n\t"
3073  "pshufh %[c], %[c], %[ftmp0] \n\t"
3074  "pshufh %[d], %[d], %[ftmp0] \n\t"
3075 
3076  "1: \n\t"
3077  // 0 - 7
3078  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3079  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
3080  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
3081  // 8 - 15
3082  PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
3083 
3084  "addiu %[h], %[h], -0x01 \n\t"
3085  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3086  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3087  "bnez %[h], 1b \n\t"
3088  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3089  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3090  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3091  [ftmp6]"=&f"(ftmp[6]),
3092  [tmp0]"=&r"(tmp[0]),
3093  RESTRICT_ASM_ALL64
3094  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
3095  [src1]"=&r"(src1),
3096  [h]"+&r"(h),
3097  [dst]"+&r"(dst), [src]"+&r"(src),
3098  [c]"+&f"(c.f), [d]"+&f"(d.f)
3099  : [sstride]"r"((mips_reg)sstride),
3100  [dstride]"r"((mips_reg)dstride),
3101  [ff_pw_4]"f"(ff_pw_4.f)
3102  : "memory"
3103  );
3104 #else
3105  int c = 8 - my, d = my;
3106  int x, y;
3107 
3108  for (y = 0; y < h; y++) {
3109  for (x = 0; x < 16; x++)
3110  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3111  dst += dstride;
3112  src += sstride;
3113  }
3114 #endif
3115 }
3116 
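/*
 * Editorial note: the hv variants chain the two passes through a
 * temporary of h + 1 rows (one extra row because the vertical blend
 * reads the line below each output line). The buffer sizes appear to
 * follow (2 * SIZE + 1) * SIZE, e.g. 33 * 16 = 528 below.
 */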
3117 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3118  ptrdiff_t sstride, int h, int mx, int my)
3119 {
3120 #if 1
3121  DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
3122  uint8_t *tmp = tmp_array;
3123 
3124  ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
3125  ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
3126 #else
3127  int a = 8 - mx, b = mx;
3128  int c = 8 - my, d = my;
3129  int x, y;
3130  uint8_t tmp_array[528];
3131  uint8_t *tmp = tmp_array;
3132 
3133  for (y = 0; y < h + 1; y++) {
3134  for (x = 0; x < 16; x++)
3135  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3136  tmp += 16;
3137  src += sstride;
3138  }
3139 
3140  tmp = tmp_array;
3141 
3142  for (y = 0; y < h; y++) {
3143  for (x = 0; x < 16; x++)
3144  dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
3145  dst += dstride;
3146  tmp += 16;
3147  }
3148 #endif
3149 }
3150 
3151 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3152  ptrdiff_t sstride, int h, int mx, int my)
3153 {
3154 #if 1
3155  union mmi_intfloat64 a, b;
3156  double ftmp[7];
3157  uint32_t tmp[1];
3158  DECLARE_VAR_ALL64;
3159  a.i = 8 - mx;
3160  b.i = mx;
3161 
3162  /*
3163  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3164  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3165  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3166  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3167  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
3168  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
3169  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
3170  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
3171  */
3172  __asm__ volatile (
3173  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3174  "li %[tmp0], 0x03 \n\t"
3175  "mtc1 %[tmp0], %[ftmp4] \n\t"
3176  "pshufh %[a], %[a], %[ftmp0] \n\t"
3177  "pshufh %[b], %[b], %[ftmp0] \n\t"
3178 
3179  "1: \n\t"
3180  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3181 
3182  "addiu %[h], %[h], -0x01 \n\t"
3183  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3184  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3185  "bnez %[h], 1b \n\t"
3186  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3187  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3188  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3189  [ftmp6]"=&f"(ftmp[6]),
3190  [tmp0]"=&r"(tmp[0]),
3191  RESTRICT_ASM_ALL64
3192  [h]"+&r"(h),
3193  [dst]"+&r"(dst), [src]"+&r"(src),
3194  [a]"+&f"(a.f), [b]"+&f"(b.f)
3195  : [sstride]"r"((mips_reg)sstride),
3196  [dstride]"r"((mips_reg)dstride),
3197  [ff_pw_4]"f"(ff_pw_4.f)
3198  : "memory"
3199  );
3200 #else
3201  int a = 8 - mx, b = mx;
3202  int x, y;
3203 
3204  for (y = 0; y < h; y++) {
3205  for (x = 0; x < 8; x++)
3206  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3207  dst += dstride;
3208  src += sstride;
3209  }
3210 #endif
3211 }
3212 
3213 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3214  ptrdiff_t sstride, int h, int mx, int my)
3215 {
3216 #if 1
3217  union mmi_intfloat64 c, d;
3218  double ftmp[7];
3219  uint32_t tmp[1];
3220  mips_reg src1;
3221  DECLARE_VAR_ALL64;
3222  c.i = 8 - my;
3223  d.i = my;
3224 
3225  /*
3226  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3227  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3228  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3229  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3230  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3231  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3232  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3233  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3234  */
3235  __asm__ volatile (
3236  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3237  "li %[tmp0], 0x03 \n\t"
3238  "mtc1 %[tmp0], %[ftmp4] \n\t"
3239  "pshufh %[c], %[c], %[ftmp0] \n\t"
3240  "pshufh %[d], %[d], %[ftmp0] \n\t"
3241 
3242  "1: \n\t"
3243  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3244 
3245  "addiu %[h], %[h], -0x01 \n\t"
3246  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3247  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3248  "bnez %[h], 1b \n\t"
3249  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3250  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3251  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3252  [ftmp6]"=&f"(ftmp[6]),
3253  [tmp0]"=&r"(tmp[0]),
3254  RESTRICT_ASM_ALL64
3255  [src1]"=&r"(src1),
3256  [h]"+&r"(h),
3257  [dst]"+&r"(dst), [src]"+&r"(src),
3258  [c]"+&f"(c.f), [d]"+&f"(d.f)
3259  : [sstride]"r"((mips_reg)sstride),
3260  [dstride]"r"((mips_reg)dstride),
3261  [ff_pw_4]"f"(ff_pw_4.f)
3262  : "memory"
3263  );
3264 #else
3265  int c = 8 - my, d = my;
3266  int x, y;
3267 
3268  for (y = 0; y < h; y++) {
3269  for (x = 0; x < 8; x++)
3270  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3271  dst += dstride;
3272  src += sstride;
3273  }
3274 #endif
3275 }
3276 
3277 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3278  ptrdiff_t sstride, int h, int mx, int my)
3279 {
3280 #if 1
3281  DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
3282  uint8_t *tmp = tmp_array;
3283 
3284  ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
3285  ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
3286 #else
3287  int a = 8 - mx, b = mx;
3288  int c = 8 - my, d = my;
3289  int x, y;
3290  uint8_t tmp_array[136];
3291  uint8_t *tmp = tmp_array;
3292 
3293  for (y = 0; y < h + 1; y++) {
3294  for (x = 0; x < 8; x++)
3295  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3296  tmp += 8;
3297  src += sstride;
3298  }
3299 
3300  tmp = tmp_array;
3301 
3302  for (y = 0; y < h; y++) {
3303  for (x = 0; x < 8; x++)
3304  dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
3305  dst += dstride;
3306  tmp += 8;
3307  }
3308 #endif
3309 }
3310 
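/*
 * Editorial note: the 4-pixel-wide variants move only 32 bits per row,
 * hence the additional DECLARE_VAR_LOW32 / RESTRICT_ASM_LOW32 dummies
 * alongside the 64-bit ones used elsewhere in this file.
 */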
3311 void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3312  ptrdiff_t sstride, int h, int mx, int my)
3313 {
3314 #if 1
3315  union mmi_intfloat64 a, b;
3316  double ftmp[5];
3317  uint32_t tmp[1];
3318  DECLARE_VAR_LOW32;
3319  DECLARE_VAR_ALL64;
3320  a.i = 8 - mx;
3321  b.i = mx;
3322 
3323  /*
3324  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3325  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3326  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3327  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3328  */
3329  __asm__ volatile (
3330  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3331  "li %[tmp0], 0x03 \n\t"
3332  "mtc1 %[tmp0], %[ftmp4] \n\t"
3333  "pshufh %[a], %[a], %[ftmp0] \n\t"
3334  "pshufh %[b], %[b], %[ftmp0] \n\t"
3335 
3336  "1: \n\t"
3337  PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
3338 
3339  "addiu %[h], %[h], -0x01 \n\t"
3340  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3341  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3342  "bnez %[h], 1b \n\t"
3343  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3344  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3345  [ftmp4]"=&f"(ftmp[4]),
3346  [tmp0]"=&r"(tmp[0]),
3347  RESTRICT_ASM_LOW32
3348  RESTRICT_ASM_ALL64
3349  [h]"+&r"(h),
3350  [dst]"+&r"(dst), [src]"+&r"(src),
3351  [a]"+&f"(a.f), [b]"+&f"(b.f)
3352  : [sstride]"r"((mips_reg)sstride),
3353  [dstride]"r"((mips_reg)dstride),
3354  [ff_pw_4]"f"(ff_pw_4.f)
3355  : "memory"
3356  );
3357 #else
3358  int a = 8 - mx, b = mx;
3359  int x, y;
3360 
3361  for (y = 0; y < h; y++) {
3362  for (x = 0; x < 4; x++)
3363  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3364  dst += dstride;
3365  src += sstride;
3366  }
3367 #endif
3368 }
3369 
3370 void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3371  ptrdiff_t sstride, int h, int mx, int my)
3372 {
3373 #if 1
3374  union mmi_intfloat64 c, d;
3375  double ftmp[7];
3376  uint32_t tmp[1];
3377  mips_reg src1;
3378  DECLARE_VAR_LOW32;
3379  DECLARE_VAR_ALL64;
3380  c.i = 8 - my;
3381  d.i = my;
3382 
3383  /*
3384  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3385  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3386  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3387  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3388  */
3389  __asm__ volatile (
3390  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3391  "li %[tmp0], 0x03 \n\t"
3392  "mtc1 %[tmp0], %[ftmp4] \n\t"
3393  "pshufh %[c], %[c], %[ftmp0] \n\t"
3394  "pshufh %[d], %[d], %[ftmp0] \n\t"
3395 
3396  "1: \n\t"
3397  PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
3398 
3399  "addiu %[h], %[h], -0x01 \n\t"
3400  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3401  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3402  "bnez %[h], 1b \n\t"
3403  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3404  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3405  [ftmp4]"=&f"(ftmp[4]),
3406  [tmp0]"=&r"(tmp[0]),
3407  RESTRICT_ASM_LOW32
3408  RESTRICT_ASM_ALL64
3409  [src1]"=&r"(src1),
3410  [h]"+&r"(h),
3411  [dst]"+&r"(dst), [src]"+&r"(src),
3412  [c]"+&f"(c.f), [d]"+&f"(d.f)
3413  : [sstride]"r"((mips_reg)sstride),
3414  [dstride]"r"((mips_reg)dstride),
3415  [ff_pw_4]"f"(ff_pw_4.f)
3416  : "memory"
3417  );
3418 #else
3419  int c = 8 - my, d = my;
3420  int x, y;
3421 
3422  for (y = 0; y < h; y++) {
3423  for (x = 0; x < 4; x++)
3424  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3425  dst += dstride;
3426  src += sstride;
3427  }
3428 #endif
3429 }
3430 
3431 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3432  ptrdiff_t sstride, int h, int mx, int my)
3433 {
3434 #if 1
3435  DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3436  uint8_t *tmp = tmp_array;
3437 
3438  ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3439  ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3440 #else
3441  int a = 8 - mx, b = mx;
3442  int c = 8 - my, d = my;
3443  int x, y;
3444  uint8_t tmp_array[36];
3445  uint8_t *tmp = tmp_array;
3446 
3447  for (y = 0; y < h + 1; y++) {
3448  for (x = 0; x < 4; x++)
3449  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3450  tmp += 4;
3451  src += sstride;
3452  }
3453 
3454  tmp = tmp_array;
3455 
3456  for (y = 0; y < h; y++) {
3457  for (x = 0; x < 4; x++)
3458  dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
3459  dst += dstride;
3460  tmp += 4;
3461  }
3462 #endif
3463 }
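
Editorial addendum (not part of the original file): a minimal, self-contained C sketch of the scalar bilinear semantics that the MMI routines above vectorize, useful for sanity-checking output on non-MIPS hosts. The function name bilinear4_h_ref and the test values are illustrative assumptions, not FFmpeg API.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar reference of the 4-wide horizontal bilinear filter: the same
 * computation as the #else branch of ff_put_vp8_bilinear4_h_mmi. */
static void bilinear4_h_ref(uint8_t *dst, ptrdiff_t dstride,
                            const uint8_t *src, ptrdiff_t sstride,
                            int h, int mx)
{
    const int a = 8 - mx, b = mx;   /* VP8 bilinear weights */
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 4; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
}

int main(void)
{
    const uint8_t src[2][5] = { { 100, 120, 140, 160, 180 },
                                {  10,  20,  30,  40,  50 } };
    uint8_t dst[2][4];

    bilinear4_h_ref(dst[0], 4, src[0], 5, 2, 3);
    /* First output pixel: (5 * 100 + 3 * 120 + 4) >> 3 = 108 */
    printf("%d %d %d %d\n", dst[0][0], dst[0][1], dst[0][2], dst[0][3]);
    return 0;
}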