FFmpeg
vp8dsp_mmi.c
1 /*
2  * Loongson SIMD optimized vp8dsp
3  *
4  * Copyright (c) 2016 Loongson Technology Corporation Limited
5  * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "vp8dsp_mips.h"
25 #include "constants.h"
26 #include "libavutil/mips/mmiutils.h"
27 
28 #define DECLARE_DOUBLE_1 double db_1
29 #define DECLARE_DOUBLE_2 double db_2
30 #define DECLARE_UINT32_T uint32_t it_1
31 #define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
32 #define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
33 #define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)
34 
35 #define MMI_PCMPGTUB(dst, src1, src2) \
36  "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
37  "pmaxub %[db_2], "#src1", "#src2" \n\t" \
38  "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
39  "xor "#dst", %[db_2], %[db_1] \n\t"
40 
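/*
 * Sketch (not in the FFmpeg source): scalar model of one byte lane of
 * MMI_PCMPGTUB above. Loongson MMI has no unsigned byte "greater than"
 * compare, so it is synthesized from pcmpeqb, pmaxub and xor.
 */
static uint8_t pcmpgtub_lane(uint8_t a, uint8_t b)
{
    uint8_t eq = (a == b) ? 0xFF : 0x00;   /* pcmpeqb: equality mask */
    uint8_t mx = (a > b) ? a : b;          /* pmaxub:  unsigned max  */
    uint8_t ge = (mx == a) ? 0xFF : 0x00;  /* pcmpeqb: a >= b mask   */
    return ge ^ eq;                        /* (a >= b) && !(a == b)  */
}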
41 #define MMI_BTOH(dst_l, dst_r, src) \
42  "xor %[db_1], %[db_1], %[db_1] \n\t" \
43  "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
44  "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
45  "punpckhbh "#dst_l", "#src", %[db_2] \n\t"
46 
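/*
 * Sketch (not in the FFmpeg source): scalar model of one lane of MMI_BTOH,
 * which widens signed bytes to signed halfwords. pcmpgtb against zero
 * produces 0xFF for negative bytes; punpck{l,h}bh then interleaves that in
 * as the high byte, i.e. a sign extension.
 */
static int16_t btoh_lane(int8_t b)
{
    uint8_t sign = (0 > b) ? 0xFF : 0x00;                  /* pcmpgtb   */
    return (int16_t)(((uint16_t)sign << 8) | (uint8_t)b);  /* punpck*bh */
}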
47 #define MMI_VP8_LOOP_FILTER \
48  /* Calculation of hev */ \
49  "dmtc1 %[thresh], %[ftmp3] \n\t" \
50  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
51  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
52  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
53  "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
54  "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
55  "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
56  MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
57  /* Calculation of mask */ \
58  "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
59  "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
60  "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
61  "li %[tmp0], 0x09 \n\t" \
62  "dmtc1 %[tmp0], %[ftmp3] \n\t" \
63  PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
64  "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
65  "dmtc1 %[e], %[ftmp3] \n\t" \
66  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
67  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
68  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
69  MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
70  "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
71  "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
72  "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
73  "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
74  "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
75  "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
76  "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
77  "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
78  "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
79  "dmtc1 %[i], %[ftmp3] \n\t" \
80  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
81  "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
82  "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
83  MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
84  "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
85  "xor %[mask], %[mask], %[ftmp3] \n\t" \
86  /* VP8_MBFILTER */ \
87  "li %[tmp0], 0x80808080 \n\t" \
88  "dmtc1 %[tmp0], %[ftmp7] \n\t" \
89  "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
90  "xor %[p2], %[p2], %[ftmp7] \n\t" \
91  "xor %[p1], %[p1], %[ftmp7] \n\t" \
92  "xor %[p0], %[p0], %[ftmp7] \n\t" \
93  "xor %[q0], %[q0], %[ftmp7] \n\t" \
94  "xor %[q1], %[q1], %[ftmp7] \n\t" \
95  "xor %[q2], %[q2], %[ftmp7] \n\t" \
96  "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
97  "psubb %[ftmp5], %[q0], %[p0] \n\t" \
98  MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
99  MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
100  /* Right part */ \
101  "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
102  "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
103  "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
104  /* Left part */ \
105  "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
106  "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
107  "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
108  /* Combine left and right part */ \
109  "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
110  "and %[ftmp1], %[ftmp1], %[mask] \n\t" \
111  "and %[ftmp2], %[ftmp1], %[hev] \n\t" \
112  "li %[tmp0], 0x04040404 \n\t" \
113  "dmtc1 %[tmp0], %[ftmp0] \n\t" \
114  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
115  "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
116  "li %[tmp0], 0x0B \n\t" \
117  "dmtc1 %[tmp0], %[ftmp4] \n\t" \
118  PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
119  "li %[tmp0], 0x03030303 \n\t" \
120  "dmtc1 %[tmp0], %[ftmp0] \n\t" \
121  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
122  "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
123  "li %[tmp0], 0x0B \n\t" \
124  "dmtc1 %[tmp0], %[ftmp2] \n\t" \
125  PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
126  "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
127  "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
128  /* filt_val &= ~hev */ \
129  "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
130  "xor %[hev], %[hev], %[ftmp0] \n\t" \
131  "and %[ftmp1], %[ftmp1], %[hev] \n\t" \
132  MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
133  "li %[tmp0], 0x07 \n\t" \
134  "dmtc1 %[tmp0], %[ftmp2] \n\t" \
135  "li %[tmp0], 0x001b001b \n\t" \
136  "dmtc1 %[tmp0], %[ftmp1] \n\t" \
137  "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
138  "li %[tmp0], 0x003f003f \n\t" \
139  "dmtc1 %[tmp0], %[ftmp0] \n\t" \
140  "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
141  /* Right part */ \
142  "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
143  "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
144  "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
145  /* Left part */ \
146  "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
147  "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
148  "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
149  /* Combine left and right part */ \
150  "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
151  "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
152  "xor %[q0], %[q0], %[ftmp7] \n\t" \
153  "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
154  "xor %[p0], %[p0], %[ftmp7] \n\t" \
155  "li %[tmp0], 0x00120012 \n\t" \
156  "dmtc1 %[tmp0], %[ftmp1] \n\t" \
157  "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
158  /* Right part */ \
159  "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
160  "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
161  "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
162  /* Left part */ \
163  "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
164  "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
165  "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
166  /* Combine left and right part */ \
167  "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
168  "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
169  "xor %[q1], %[q1], %[ftmp7] \n\t" \
170  "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
171  "xor %[p1], %[p1], %[ftmp7] \n\t" \
172  "li %[tmp0], 0x03 \n\t" \
173  "dmtc1 %[tmp0], %[ftmp1] \n\t" \
174  /* Right part */ \
175  "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
176  "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
177  "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
178  "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
179  /* Left part */ \
180  "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
181  "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
182  "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
183  "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
184  /* Combine left and right part */ \
185  "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
186  "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
187  "xor %[q2], %[q2], %[ftmp7] \n\t" \
188  "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
189  "xor %[p2], %[p2], %[ftmp7] \n\t"
190 
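/*
 * Note (sketch, not in the FFmpeg source): the halfword constants in
 * MMI_VP8_LOOP_FILTER are the VP8 macroblock-edge coefficients of
 * filter_mbedge() further down, replicated across lanes:
 *
 *   0x001b = 27  ->  a0 = (27 * w + 63) >> 7
 *   0x0012 = 18  ->  a1 = (18 * w + 63) >> 7
 *   0x003f = 63  ->  the rounding term
 *   0x07         ->  the final arithmetic shift
 *
 * a2 = (9 * w + 63) >> 7 is built without a multiply: the psllh-by-3 plus
 * paddh sequence computes (w << 3) + w == 9 * w.
 */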
191 #define PUT_VP8_EPEL4_H6_MMI(src, dst) \
192  MMI_ULWC1(%[ftmp1], src, 0x00) \
193  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
194  "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
195  \
196  MMI_ULWC1(%[ftmp1], src, -0x01) \
197  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
198  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
199  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
200  \
201  MMI_ULWC1(%[ftmp1], src, -0x02) \
202  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
203  "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
204  "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
205  \
206  MMI_ULWC1(%[ftmp1], src, 0x01) \
207  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
208  "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
209  \
210  MMI_ULWC1(%[ftmp1], src, 0x02) \
211  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
212  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
213  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
214  \
215  MMI_ULWC1(%[ftmp1], src, 0x03) \
216  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
217  "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
218  "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
219  \
220  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
221  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
222  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
223  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
224  \
225  MMI_SWC1(%[ftmp1], dst, 0x00)
226 
227 
228 #define PUT_VP8_EPEL4_H4_MMI(src, dst) \
229  MMI_ULWC1(%[ftmp1], src, 0x00) \
230  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
231  "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
232  \
233  MMI_ULWC1(%[ftmp1], src, -0x01) \
234  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
235  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
236  "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
237  \
238  MMI_ULWC1(%[ftmp1], src, 0x01) \
239  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
240  "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
241  \
242  MMI_ULWC1(%[ftmp1], src, 0x02) \
243  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
244  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
245  "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
246  \
247  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
248  \
249  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
250  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
251  \
252  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
253  MMI_SWC1(%[ftmp1], dst, 0x00)
254 
255 
256 #define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
257  MMI_ULWC1(%[ftmp1], src, 0x00) \
258  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
259  "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
260  \
261  PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
262  MMI_ULWC1(%[ftmp1], src1, 0x00) \
263  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
264  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
265  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
266  \
267  PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
268  MMI_ULWC1(%[ftmp1], src1, 0x00) \
269  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
270  "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
271  "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
272  \
273  PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
274  MMI_ULWC1(%[ftmp1], src1, 0x00) \
275  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
276  "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
277  \
278  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
279  MMI_ULWC1(%[ftmp1], src1, 0x00) \
280  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
281  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
282  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
283  \
284  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
285  MMI_ULWC1(%[ftmp1], src1, 0x00) \
286  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
287  "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
288  "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
289  \
290  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
291  \
292  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
293  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
294  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
295  \
296  MMI_SWC1(%[ftmp1], dst, 0x00)
297 
298 
299 #define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
300  MMI_ULWC1(%[ftmp1], src, 0x00) \
301  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
302  "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
303  \
304  PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
305  MMI_ULWC1(%[ftmp1], src1, 0x00) \
306  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
307  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
308  "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
309  \
310  PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
311  MMI_ULWC1(%[ftmp1], src1, 0x00) \
312  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
313  "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
314  \
315  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
316  MMI_ULWC1(%[ftmp1], src1, 0x00) \
317  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
318  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
319  "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
320  \
321  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
322  \
323  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
324  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
325  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
326  \
327  MMI_SWC1(%[ftmp1], dst, 0x00)
328 
329 
330 #define PUT_VP8_EPEL8_H6_MMI(src, dst) \
331  MMI_ULDC1(%[ftmp1], src, 0x00) \
332  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
333  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
334  "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
335  "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
336  \
337  MMI_ULDC1(%[ftmp1], src, -0x01) \
338  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
339  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
340  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
341  "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
342  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
343  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
344  \
345  MMI_ULDC1(%[ftmp1], src, -0x02) \
346  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
347  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
348  "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
349  "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
350  "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
351  "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
352  \
353  MMI_ULDC1(%[ftmp1], src, 0x01) \
354  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
355  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
356  "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
357  "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
358  \
359  MMI_ULDC1(%[ftmp1], src, 0x02) \
360  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
361  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
362  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
363  "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
364  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
365  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
366  \
367  MMI_ULDC1(%[ftmp1], src, 0x03) \
368  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
369  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
370  "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
371  "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
372  "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
373  "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
374  \
375  "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
376  "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
377  \
378  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
379  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
380  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
381  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
382  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
383  \
384  MMI_SDC1(%[ftmp1], dst, 0x00)
385 
386 
387 #define PUT_VP8_EPEL8_H4_MMI(src, dst) \
388  MMI_ULDC1(%[ftmp1], src, 0x00) \
389  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
390  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
391  "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
392  "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
393  \
394  MMI_ULDC1(%[ftmp1], src, -0x01) \
395  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
396  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
397  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
398  "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
399  "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
400  "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
401  \
402  MMI_ULDC1(%[ftmp1], src, 0x01) \
403  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
404  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
405  "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
406  "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
407  \
408  MMI_ULDC1(%[ftmp1], src, 0x02) \
409  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
410  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
411  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
412  "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
413  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
414  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
415  \
416  "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
417  "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
418  \
419  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
420  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
421  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
422  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
423  \
424  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
425  MMI_SDC1(%[ftmp1], dst, 0x00)
426 
427 
428 #define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
429  MMI_ULDC1(%[ftmp1], src, 0x00) \
430  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
431  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
432  "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
433  "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
434  \
435  PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
436  MMI_ULDC1(%[ftmp1], src1, 0x00) \
437  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
438  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
439  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
440  "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
441  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
442  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
443  \
444  PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
445  MMI_ULDC1(%[ftmp1], src1, 0x00) \
446  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
447  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
448  "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
449  "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
450  "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
451  "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
452  \
453  PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
454  MMI_ULDC1(%[ftmp1], src1, 0x00) \
455  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
456  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
457  "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
458  "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
459  \
460  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
461  MMI_ULDC1(%[ftmp1], src1, 0x00) \
462  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
463  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
464  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
465  "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
466  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
467  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
468  \
469  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
470  MMI_ULDC1(%[ftmp1], src1, 0x00) \
471  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
472  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
473  "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
474  "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
475  "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
476  "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
477  \
478  "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
479  "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
480  \
481  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
482  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
483  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
484  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
485  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
486  \
487  MMI_SDC1(%[ftmp1], dst, 0x00)
488 
489 
490 #define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
491  MMI_ULDC1(%[ftmp1], src, 0x00) \
492  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
493  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
494  "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
495  "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
496  \
497  PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
498  MMI_ULDC1(%[ftmp1], src1, 0x00) \
499  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
500  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
501  "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
502  "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
503  "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
504  "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
505  \
506  PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
507  MMI_ULDC1(%[ftmp1], src1, 0x00) \
508  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
509  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
510  "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
511  "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
512  \
513  PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
514  MMI_ULDC1(%[ftmp1], src1, 0x00) \
515  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
516  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
517  "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
518  "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
519  "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
520  "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
521  \
522  "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
523  "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
524  \
525  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
526  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
527  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
528  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
529  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
530  \
531  MMI_SDC1(%[ftmp1], dst, 0x00)
532 
533 
534 #define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
535  MMI_ULDC1(%[ftmp1], src, 0x00) \
536  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
537  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
538  "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
539  "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
540  \
541  MMI_ULDC1(%[ftmp1], src, 0x01) \
542  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
543  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
544  "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
545  "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
546  "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
547  "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
548  \
549  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
550  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
551  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
552  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
553  \
554  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
555  MMI_SDC1(%[ftmp1], dst, 0x00)
556 
557 
558 #define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
559  MMI_ULWC1(%[ftmp1], src, 0x00) \
560  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
561  "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
562  \
563  MMI_ULWC1(%[ftmp1], src, 0x01) \
564  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
565  "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
566  "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
567  \
568  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
569  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
570  \
571  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
572  MMI_SWC1(%[ftmp1], dst, 0x00)
573 
574 
575 #define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
576  MMI_ULDC1(%[ftmp1], src, 0x00) \
577  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
578  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
579  "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
580  "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
581  \
582  PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
583  MMI_ULDC1(%[ftmp1], src1, 0x00) \
584  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
585  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
586  "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
587  "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
588  "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
589  "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
590  \
591  "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
592  "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
593  "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
594  "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
595  \
596  "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
597  MMI_SDC1(%[ftmp1], dst, 0x00)
598 
599 
600 #define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
601  MMI_ULWC1(%[ftmp1], src, 0x00) \
602  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
603  "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
604  \
605  PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
606  MMI_ULWC1(%[ftmp1], src1, 0x00) \
607  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
608  "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
609  "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
610  \
611  "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
612  "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
613  \
614  "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
615  MMI_SWC1(%[ftmp1], dst, 0x00)
616 
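/*
 * Sketch (not in the FFmpeg source): scalar reference for the
 * PUT_VP8_BILINEAR* macros above. VP8 bilinear prediction uses 1/8-pel
 * weights; the operands a/c and b/d are assumed to hold (8 - frac) and
 * frac replicated across lanes, and %[ftmp4] the shift amount 3.
 */
static inline uint8_t vp8_bilin_lane(const uint8_t *src, int frac)
{
    return (uint8_t)(((8 - frac) * src[0] + frac * src[1] + 4) >> 3);
}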
617 
618 DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
619  {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
620  0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},
621 
622  {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
623  0x0024002400240024, 0x0008000800080008, 0x0001000100010001},
624 
625  {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
626  0x0032003200320032, 0x0006000600060006, 0x0000000000000000},
627 
628  {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
629  0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},
630 
631  {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
632  0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},
633 
634  {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
635  0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},
636 
637  {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
638  0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
639 };
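/*
 * Note (sketch, not in the FFmpeg source): each uint64_t above packs one
 * 4-tap/6-tap coefficient into four 16-bit lanes so it can feed pmullh
 * directly; e.g. 0x006c006c006c006c is 108 (0x6c) in every lane, matching
 * subpel_filters[1][2] in the scalar reference table kept below.
 */
static inline uint64_t splat_h4(uint16_t c)
{
    return c * 0x0001000100010001ULL; /* replicate c into four halfwords */
}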
640 
641 #if 0
642 #define FILTER_6TAP(src, F, stride) \
643  cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
644  F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
645  F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
646 
647 #define FILTER_4TAP(src, F, stride) \
648  cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
649  F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
650 
651 static const uint8_t subpel_filters[7][6] = {
652  { 0, 6, 123, 12, 1, 0 },
653  { 2, 11, 108, 36, 8, 1 },
654  { 0, 9, 93, 50, 6, 0 },
655  { 3, 16, 77, 77, 16, 3 },
656  { 0, 6, 50, 93, 9, 0 },
657  { 1, 8, 36, 108, 11, 2 },
658  { 0, 1, 12, 123, 6, 0 },
659 };
660 
661 #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
662 #define MUL_35468(a) (((a) * 35468) >> 16)
663 #endif
664 
665 #define clip_int8(n) (cm[(n) + 0x80] - 0x80)
666 static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
667  ptrdiff_t stride)
668 {
669  int av_unused p1 = p[-2 * stride];
670  int av_unused p0 = p[-1 * stride];
671  int av_unused q0 = p[ 0 * stride];
672  int av_unused q1 = p[ 1 * stride];
673  int a, f1, f2;
674  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
675 
676  a = 3 * (q0 - p0);
677  a += clip_int8(p1 - q1);
678  a = clip_int8(a);
679 
680  // We deviate from the spec here with c(a+3) >> 3
681  // since that's what libvpx does.
682  f1 = FFMIN(a + 4, 127) >> 3;
683  f2 = FFMIN(a + 3, 127) >> 3;
684 
685  // Despite what the spec says, we do need to clamp here to
686  // be bitexact with libvpx.
687  p[-1 * stride] = cm[p0 + f2];
688  p[ 0 * stride] = cm[q0 - f1];
689 }
690 
691 static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
692  ptrdiff_t stride)
693 {
694  int av_unused p1 = p[-2 * stride];
695  int av_unused p0 = p[-1 * stride];
696  int av_unused q0 = p[ 0 * stride];
697  int av_unused q1 = p[ 1 * stride];
698  int a, f1, f2;
699  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
700 
701  a = 3 * (q0 - p0);
702  a = clip_int8(a);
703 
704  // We deviate from the spec here with c(a+3) >> 3
705  // since that's what libvpx does.
706  f1 = FFMIN(a + 4, 127) >> 3;
707  f2 = FFMIN(a + 3, 127) >> 3;
708 
709  // Despite what the spec says, we do need to clamp here to
710  // be bitexact with libvpx.
711  p[-1 * stride] = cm[p0 + f2];
712  p[ 0 * stride] = cm[q0 - f1];
713  a = (f1 + 1) >> 1;
714  p[-2 * stride] = cm[p1 + a];
715  p[ 1 * stride] = cm[q1 - a];
716 }
717 
718 static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
719  int flim)
720 {
721  int av_unused p1 = p[-2 * stride];
722  int av_unused p0 = p[-1 * stride];
723  int av_unused q0 = p[ 0 * stride];
724  int av_unused q1 = p[ 1 * stride];
725 
726  return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
727 }
728 
729 static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
730 {
731  int av_unused p1 = p[-2 * stride];
732  int av_unused p0 = p[-1 * stride];
733  int av_unused q0 = p[ 0 * stride];
734  int av_unused q1 = p[ 1 * stride];
735 
736  return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
737 }
738 
739 static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
740 {
741  int a0, a1, a2, w;
742  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
743 
744  int av_unused p2 = p[-3 * stride];
745  int av_unused p1 = p[-2 * stride];
746  int av_unused p0 = p[-1 * stride];
747  int av_unused q0 = p[ 0 * stride];
748  int av_unused q1 = p[ 1 * stride];
749  int av_unused q2 = p[ 2 * stride];
750 
751  w = clip_int8(p1 - q1);
752  w = clip_int8(w + 3 * (q0 - p0));
753 
754  a0 = (27 * w + 63) >> 7;
755  a1 = (18 * w + 63) >> 7;
756  a2 = (9 * w + 63) >> 7;
757 
758  p[-3 * stride] = cm[p2 + a2];
759  p[-2 * stride] = cm[p1 + a1];
760  p[-1 * stride] = cm[p0 + a0];
761  p[ 0 * stride] = cm[q0 - a0];
762  p[ 1 * stride] = cm[q1 - a1];
763  p[ 2 * stride] = cm[q2 - a2];
764 }
765 
766 static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
767  int E, int I)
768 {
769  int av_unused p3 = p[-4 * stride];
770  int av_unused p2 = p[-3 * stride];
771  int av_unused p1 = p[-2 * stride];
772  int av_unused p0 = p[-1 * stride];
773  int av_unused q0 = p[ 0 * stride];
774  int av_unused q1 = p[ 1 * stride];
775  int av_unused q2 = p[ 2 * stride];
776  int av_unused q3 = p[ 3 * stride];
777 
778  return vp8_simple_limit(p, stride, E) &&
779  FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
780  FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
781  FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
782 }
783 
784 static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
785  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
786 {
787  double ftmp[18];
788  uint32_t tmp[1];
789  DECLARE_DOUBLE_1;
790  DECLARE_DOUBLE_2;
791  DECLARE_UINT32_T;
792  __asm__ volatile(
793  /* Get data from dst */
794  "gsldlc1 %[q0], 0x07(%[dst]) \n\t"
795  "gsldrc1 %[q0], 0x00(%[dst]) \n\t"
796  PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
797  "gsldlc1 %[p0], 0x07(%[tmp0]) \n\t"
798  "gsldrc1 %[p0], 0x00(%[tmp0]) \n\t"
799  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
800  "gsldlc1 %[p1], 0x07(%[tmp0]) \n\t"
801  "gsldrc1 %[p1], 0x00(%[tmp0]) \n\t"
802  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
803  "gsldlc1 %[p2], 0x07(%[tmp0]) \n\t"
804  "gsldrc1 %[p2], 0x00(%[tmp0]) \n\t"
805  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
806  "gsldlc1 %[p3], 0x07(%[tmp0]) \n\t"
807  "gsldrc1 %[p3], 0x00(%[tmp0]) \n\t"
808  PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
809  "gsldlc1 %[q1], 0x07(%[tmp0]) \n\t"
810  "gsldrc1 %[q1], 0x00(%[tmp0]) \n\t"
811  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
812  "gsldlc1 %[q2], 0x07(%[tmp0]) \n\t"
813  "gsldrc1 %[q2], 0x00(%[tmp0]) \n\t"
814  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
815  "gsldlc1 %[q3], 0x07(%[tmp0]) \n\t"
816  "gsldrc1 %[q3], 0x00(%[tmp0]) \n\t"
817  MMI_VP8_LOOP_FILTER
818  /* Move to dst */
819  "gssdlc1 %[q0], 0x07(%[dst]) \n\t"
820  "gssdrc1 %[q0], 0x00(%[dst]) \n\t"
821  PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
822  "gssdlc1 %[p0], 0x07(%[tmp0]) \n\t"
823  "gssdrc1 %[p0], 0x00(%[tmp0]) \n\t"
824  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
825  "gssdlc1 %[p1], 0x07(%[tmp0]) \n\t"
826  "gssdrc1 %[p1], 0x00(%[tmp0]) \n\t"
827  PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
828  "gssdlc1 %[p2], 0x07(%[tmp0]) \n\t"
829  "gssdrc1 %[p2], 0x00(%[tmp0]) \n\t"
830  PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
831  "gssdlc1 %[q1], 0x07(%[tmp0]) \n\t"
832  "gssdrc1 %[q1], 0x00(%[tmp0]) \n\t"
833  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
834  "gssdlc1 %[q2], 0x07(%[tmp0]) \n\t"
835  "gssdrc1 %[q2], 0x00(%[tmp0]) \n\t"
836  : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
837  [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
838  [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
839  [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
840  [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
841  [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
842  [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
843  [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
844  [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
845  [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
846  RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
847  RESTRICT_ASM_UINT32_T
848  : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
849  [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
850  : "memory"
851  );
852 }
853 
854 static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
855  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
856 {
857  int i;
858 
859  for (i = 0; i < 8; i++)
860  if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
861  int hv = hev(dst + i * 1, stride, hev_thresh);
862  if (hv)
863  vp8_filter_common_is4tap(dst + i * 1, stride);
864  else
865  vp8_filter_common_isnot4tap(dst + i * 1, stride);
866  }
867 }
868 
869 static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
870  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
871 {
872  double ftmp[18];
873  uint32_t tmp[1];
874  DECLARE_DOUBLE_1;
875  DECLARE_DOUBLE_2;
876  DECLARE_UINT32_T;
877  __asm__ volatile(
878  /* Get data from dst */
879  "gsldlc1 %[p3], 0x03(%[dst]) \n\t"
880  "gsldrc1 %[p3], -0x04(%[dst]) \n\t"
881  PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
882  "gsldlc1 %[p2], 0x03(%[tmp0]) \n\t"
883  "gsldrc1 %[p2], -0x04(%[tmp0]) \n\t"
884  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
885  "gsldlc1 %[p1], 0x03(%[tmp0]) \n\t"
886  "gsldrc1 %[p1], -0x04(%[tmp0]) \n\t"
887  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
888  "gsldlc1 %[p0], 0x03(%[tmp0]) \n\t"
889  "gsldrc1 %[p0], -0x04(%[tmp0]) \n\t"
890  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
891  "gsldlc1 %[q0], 0x03(%[tmp0]) \n\t"
892  "gsldrc1 %[q0], -0x04(%[tmp0]) \n\t"
893  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
894  "gsldlc1 %[q1], 0x03(%[tmp0]) \n\t"
895  "gsldrc1 %[q1], -0x04(%[tmp0]) \n\t"
896  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
897  "gsldlc1 %[q2], 0x03(%[tmp0]) \n\t"
898  "gsldrc1 %[q2], -0x04(%[tmp0]) \n\t"
899  PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
900  "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t"
901  "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t"
902  /* Matrix transpose */
903  TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
904  %[q0], %[q1], %[q2], %[q3],
905  %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
906  MMI_VP8_LOOP_FILTER
907  /* Matrix transpose */
908  TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
909  %[q0], %[q1], %[q2], %[q3],
910  %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
911  /* Move to dst */
912  "gssdlc1 %[p3], 0x03(%[dst]) \n\t"
913  "gssdrc1 %[p3], -0x04(%[dst]) \n\t"
914  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
915  "gssdlc1 %[p2], 0x03(%[dst]) \n\t"
916  "gssdrc1 %[p2], -0x04(%[dst]) \n\t"
917  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
918  "gssdlc1 %[p1], 0x03(%[dst]) \n\t"
919  "gssdrc1 %[p1], -0x04(%[dst]) \n\t"
920  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
921  "gssdlc1 %[p0], 0x03(%[dst]) \n\t"
922  "gssdrc1 %[p0], -0x04(%[dst]) \n\t"
923  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
924  "gssdlc1 %[q0], 0x03(%[dst]) \n\t"
925  "gssdrc1 %[q0], -0x04(%[dst]) \n\t"
926  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
927  "gssdlc1 %[q1], 0x03(%[dst]) \n\t"
928  "gssdrc1 %[q1], -0x04(%[dst]) \n\t"
929  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
930  "gssdlc1 %[q2], 0x03(%[dst]) \n\t"
931  "gssdrc1 %[q2], -0x04(%[dst]) \n\t"
932  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
933  "gssdlc1 %[q3], 0x03(%[dst]) \n\t"
934  "gssdrc1 %[q3], -0x04(%[dst]) \n\t"
935  : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
936  [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
937  [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
938  [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
939  [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
940  [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
941  [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
942  [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
943  [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
944  [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
945  RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
946  RESTRICT_ASM_UINT32_T
947  : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
948  [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
949  : "memory"
950  );
951 }
952 
953 static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
954  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
955 {
956  int i;
957 
958  for (i = 0; i < 8; i++)
959  if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
960  int hv = hev(dst + i * stride, 1, hev_thresh);
961  if (hv)
962  vp8_filter_common_is4tap(dst + i * stride, 1);
963  else
964  vp8_filter_common_isnot4tap(dst + i * stride, 1);
965  }
966 }
967 
968 void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
969 {
970 #if 1
971  double ftmp[8];
972  DECLARE_VAR_ALL64;
973 
974  __asm__ volatile (
975  MMI_LDC1(%[ftmp0], %[dc], 0x00)
976  MMI_LDC1(%[ftmp1], %[dc], 0x08)
977  MMI_LDC1(%[ftmp2], %[dc], 0x10)
978  MMI_LDC1(%[ftmp3], %[dc], 0x18)
979  "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
980  "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
981  "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
982  "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
983  "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
984  "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
985  "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
986  "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
987  MMI_SDC1(%[ftmp0], %[dc], 0x00)
988  MMI_SDC1(%[ftmp1], %[dc], 0x08)
989  MMI_SDC1(%[ftmp2], %[dc], 0x10)
990  MMI_SDC1(%[ftmp3], %[dc], 0x18)
991  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
992  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
993  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
994  [ftmp6]"=&f"(ftmp[6]),
995  RESTRICT_ASM_ALL64
996  [ftmp7]"=&f"(ftmp[7])
997  : [dc]"r"((uint8_t*)dc)
998  : "memory"
999  );
1000 
1001  block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1002  block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1003  block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1004  block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1005 
1006  block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1007  block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1008  block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1009  block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1010 
1011  block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1012  block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1013  block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1014  block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1015 
1016  block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1017  block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1018  block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1019  block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1020 
1021  __asm__ volatile (
1022  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1023  MMI_SDC1(%[ftmp0], %[dc], 0x00)
1024  MMI_SDC1(%[ftmp0], %[dc], 0x08)
1025  MMI_SDC1(%[ftmp0], %[dc], 0x10)
1026  MMI_SDC1(%[ftmp0], %[dc], 0x18)
1027  : RESTRICT_ASM_ALL64
1028  [ftmp0]"=&f"(ftmp[0])
1029  : [dc]"r"((uint8_t *)dc)
1030  : "memory"
1031  );
1032 #else
1033  int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
1034 
1035  t00 = dc[0] + dc[12];
1036  t10 = dc[1] + dc[13];
1037  t20 = dc[2] + dc[14];
1038  t30 = dc[3] + dc[15];
1039 
1040  t03 = dc[0] - dc[12];
1041  t13 = dc[1] - dc[13];
1042  t23 = dc[2] - dc[14];
1043  t33 = dc[3] - dc[15];
1044 
1045  t01 = dc[4] + dc[ 8];
1046  t11 = dc[5] + dc[ 9];
1047  t21 = dc[6] + dc[10];
1048  t31 = dc[7] + dc[11];
1049 
1050  t02 = dc[4] - dc[ 8];
1051  t12 = dc[5] - dc[ 9];
1052  t22 = dc[6] - dc[10];
1053  t32 = dc[7] - dc[11];
1054 
1055  dc[ 0] = t00 + t01;
1056  dc[ 1] = t10 + t11;
1057  dc[ 2] = t20 + t21;
1058  dc[ 3] = t30 + t31;
1059 
1060  dc[ 4] = t03 + t02;
1061  dc[ 5] = t13 + t12;
1062  dc[ 6] = t23 + t22;
1063  dc[ 7] = t33 + t32;
1064 
1065  dc[ 8] = t00 - t01;
1066  dc[ 9] = t10 - t11;
1067  dc[10] = t20 - t21;
1068  dc[11] = t30 - t31;
1069 
1070  dc[12] = t03 - t02;
1071  dc[13] = t13 - t12;
1072  dc[14] = t23 - t22;
1073  dc[15] = t33 - t32;
1074 
1075  block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1076  block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1077  block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1078  block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1079 
1080  block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1081  block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1082  block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1083  block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1084 
1085  block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1086  block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1087  block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1088  block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1089 
1090  block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1091  block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1092  block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1093  block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1094 
1095  AV_ZERO64(dc + 0);
1096  AV_ZERO64(dc + 4);
1097  AV_ZERO64(dc + 8);
1098  AV_ZERO64(dc + 12);
1099 #endif
1100 }
1101 
1102 void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
1103 {
1104  int val = (dc[0] + 3) >> 3;
1105 
1106  dc[0] = 0;
1107 
1108  block[0][0][0] = val;
1109  block[0][1][0] = val;
1110  block[0][2][0] = val;
1111  block[0][3][0] = val;
1112  block[1][0][0] = val;
1113  block[1][1][0] = val;
1114  block[1][2][0] = val;
1115  block[1][3][0] = val;
1116  block[2][0][0] = val;
1117  block[2][1][0] = val;
1118  block[2][2][0] = val;
1119  block[2][3][0] = val;
1120  block[3][0][0] = val;
1121  block[3][1][0] = val;
1122  block[3][2][0] = val;
1123  block[3][3][0] = val;
1124 }
1125 
1126 void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1127 {
1128 #if 1
1129  DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
1130  DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
1131  double ftmp[12];
1132  uint32_t tmp[1];
1133  DECLARE_VAR_LOW32;
1134  DECLARE_VAR_ALL64;
1135 
1136  __asm__ volatile (
1137  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1138  MMI_LDC1(%[ftmp1], %[block], 0x00)
1139  MMI_LDC1(%[ftmp2], %[block], 0x08)
1140  MMI_LDC1(%[ftmp3], %[block], 0x10)
1141  MMI_LDC1(%[ftmp4], %[block], 0x18)
1142 
1143  "li %[tmp0], 0x02 \n\t"
1144  "mtc1 %[tmp0], %[ftmp11] \n\t"
1145 
1146  // block[0...3] + block[8...11]
1147  "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1148  // block[0...3] - block[8...11]
1149  "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1150  // MUL_35468(block[12...15])
1151  "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1152  "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
1153  // MUL_35468(block[4...7])
1154  "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1155  "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
1156  // MUL_20091(block[4...7])
1157  "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
1158  "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
1159  // MUL_20091(block[12...15])
1160  "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1161  "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
1162 
1163  // tmp[0 4 8 12]
1164  "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
1165  "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
1166  // tmp[1 5 9 13]
1167  "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
1168  "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
1169  // tmp[2 6 10 14]
1170  "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
1171  "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
1172  // tmp[3 7 11 15]
1173  "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
1174  "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
1175 
1176  MMI_SDC1(%[ftmp0], %[block], 0x00)
1177  MMI_SDC1(%[ftmp0], %[block], 0x08)
1178  MMI_SDC1(%[ftmp0], %[block], 0x10)
1179  MMI_SDC1(%[ftmp0], %[block], 0x18)
1180 
1181  TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1182  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1183 
1184  // t[0 4 8 12]
1185  "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1186  // t[1 5 9 13]
1187  "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1188  // t[2 6 10 14]
1189  "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1190  "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1191  "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
1192  "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1193  "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1194  // t[3 7 11 15]
1195  "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1196  "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1197  "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
1198  "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
1199  "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1200 
1201  "li %[tmp0], 0x03 \n\t"
1202  "mtc1 %[tmp0], %[ftmp11] \n\t"
1203  "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
1204  "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
1205  "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
1206  "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
1207  "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
1208  "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
1209  "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
1210  "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
1211  "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
1212  "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
1213  "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
1214  "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
1215 
1216  TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1217  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1218 
1219  MMI_LWC1(%[ftmp5], %[dst0], 0x00)
1220  MMI_LWC1(%[ftmp6], %[dst1], 0x00)
1221  MMI_LWC1(%[ftmp7], %[dst2], 0x00)
1222  MMI_LWC1(%[ftmp8], %[dst3], 0x00)
1223 
1224  "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1225  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1226  "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1227  "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1228 
1229  "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1230  "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1231  "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1232  "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
1233 
1234  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1235  "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1236  "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1237  "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1238 
1239  MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1240  MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1241  MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1242  MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1243  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1244  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1245  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1246  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1247  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1248  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
1249  RESTRICT_ASM_LOW32
1250  RESTRICT_ASM_ALL64
1251  [tmp0]"=&r"(tmp[0])
1252  : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1253  [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
1254  [block]"r"(block), [ff_pw_4]"f"(ff_pw_4),
1255  [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_22a3]"f"(ff_ph_22a3)
1256  : "memory"
1257  );
1258 #else
1259  int i, t0, t1, t2, t3;
1260  int16_t tmp[16];
1261 
1262  for (i = 0; i < 4; i++) {
1263  t0 = block[0 + i] + block[8 + i];
1264  t1 = block[0 + i] - block[8 + i];
1265  t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
1266  t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
1267  block[ 0 + i] = 0;
1268  block[ 4 + i] = 0;
1269  block[ 8 + i] = 0;
1270  block[12 + i] = 0;
1271 
1272  tmp[i * 4 + 0] = t0 + t3;
1273  tmp[i * 4 + 1] = t1 + t2;
1274  tmp[i * 4 + 2] = t1 - t2;
1275  tmp[i * 4 + 3] = t0 - t3;
1276  }
1277 
1278  for (i = 0; i < 4; i++) {
1279  t0 = tmp[0 + i] + tmp[8 + i];
1280  t1 = tmp[0 + i] - tmp[8 + i];
1281  t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
1282  t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
1283 
1284  dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
1285  dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
1286  dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
1287  dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
1288  dst += stride;
1289  }
1290 #endif
1291 }
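/*
 * Note (sketch, not in the FFmpeg source): pmulhh is a signed 16x16 multiply
 * keeping the high half, and 35468 (0x8a8c) does not fit in a signed 16-bit
 * lane. Since 35468 == 4 * 8867 (0x22a3), the asm above shifts the operand
 * left by 2 and multiplies by 8867. 20091 (0x4e7b) does fit; MUL_20091
 * additionally adds the operand back, i.e. a + ((a * 20091) >> 16). Scalar
 * check of the 35468 trick (32-bit math here, so the halfword lane wrap that
 * real VP8 coefficient ranges stay clear of does not occur):
 */
static inline int mul_35468_check(int16_t a)
{
    int ref = ((int)a * 35468) >> 16;        /* MUL_35468 from the C model */
    int mmi = (((int)a * 4) * 0x22a3) >> 16; /* psllh by 2, then pmulhh    */
    return ref == mmi;                       /* always 1 in this model     */
}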
1292 
1293 void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1294 {
1295 #if 1
1296  int dc = (block[0] + 4) >> 3;
1297  double ftmp[6];
1298  DECLARE_VAR_LOW32;
1299 
1300  block[0] = 0;
1301 
1302  __asm__ volatile (
1303  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1304  "mtc1 %[dc], %[ftmp5] \n\t"
1305  MMI_LWC1(%[ftmp1], %[dst0], 0x00)
1306  MMI_LWC1(%[ftmp2], %[dst1], 0x00)
1307  MMI_LWC1(%[ftmp3], %[dst2], 0x00)
1308  MMI_LWC1(%[ftmp4], %[dst3], 0x00)
1309  "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1310  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1311  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1312  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1313  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1314  "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1315  "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1316  "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
1317  "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1318  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1319  "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1320  "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1321  "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1322  MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1323  MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1324  MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1325  MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1326  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1327  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1328  [ftmp4]"=&f"(ftmp[4]),
1329  RESTRICT_ASM_LOW32
1330  [ftmp5]"=&f"(ftmp[5])
1331  : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1332  [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
1333  [dc]"r"(dc)
1334  : "memory"
1335  );
1336 #else
1337  int i, dc = (block[0] + 4) >> 3;
1338 
1339  block[0] = 0;
1340 
1341  for (i = 0; i < 4; i++) {
1342  dst[0] = av_clip_uint8(dst[0] + dc);
1343  dst[1] = av_clip_uint8(dst[1] + dc);
1344  dst[2] = av_clip_uint8(dst[2] + dc);
1345  dst[3] = av_clip_uint8(dst[3] + dc);
1346  dst += stride;
1347  }
1348 #endif
1349 }
1350 
1351 void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
1352  ptrdiff_t stride)
1353 {
1354  ff_vp8_idct_dc_add_mmi(dst + 0, block[0], stride);
1355  ff_vp8_idct_dc_add_mmi(dst + 4, block[1], stride);
1356  ff_vp8_idct_dc_add_mmi(dst + 8, block[2], stride);
1357  ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
1358 }
1359 
1360 void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
1361  ptrdiff_t stride)
1362 {
1363  ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
1364  ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
1365  ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
1366  ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
1367 }
1368 
1369 // loop filter applied to edges between macroblocks
1370 void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1371  int flim_I, int hev_thresh)
1372 {
1373  vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1374  vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
1375 }
1376 
1377 void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1378  int flim_I, int hev_thresh)
1379 {
1380  vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1381  vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
1382  hev_thresh);
1383 }
1384 
1385 void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1386  int flim_E, int flim_I, int hev_thresh)
1387 {
1388  vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1389  vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1390 }
1391 
1392 void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1393  int flim_E, int flim_I, int hev_thresh)
1394 {
1395  vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1396  vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1397 }
1398 
1399 // loop filter applied to inner macroblock edges
1400 void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1401  int flim_E, int flim_I, int hev_thresh)
1402 {
1403  int i;
1404 
1405  for (i = 0; i < 16; i++)
1406  if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
1407  int hv = hev(dst + i * 1, stride, hev_thresh);
1408  if (hv)
1409  vp8_filter_common_is4tap(dst + i * 1, stride);
1410  else
1411  vp8_filter_common_isnot4tap(dst + i * 1, stride);
1412  }
1413 }
1414 
1415 void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1416  int flim_E, int flim_I, int hev_thresh)
1417 {
1418  int i;
1419 
1420  for (i = 0; i < 16; i++)
1421  if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
1422  int hv = hev(dst + i * stride, 1, hev_thresh);
1423  if (hv)
1424  vp8_filter_common_is4tap(dst + i * stride, 1);
1425  else
1426  vp8_filter_common_isnot4tap(dst + i * stride, 1);
1427  }
1428 }
1429 
1430 void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1431  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1432 {
1433  vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1434  vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1435 }
1436 
1437 void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1438  ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1439 {
1440  vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1441  vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1442 }
1443 
1444 void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1445 {
1446  int i;
1447 
1448  for (i = 0; i < 16; i++)
1449  if (vp8_simple_limit(dst + i, stride, flim))
1450  vp8_filter_common_is4tap(dst + i, stride);
1451 }
1452 
1453 void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1454 {
1455  int i;
1456 
1457  for (i = 0; i < 16; i++)
1458  if (vp8_simple_limit(dst + i * stride, 1, flim))
1459  vp8_filter_common_is4tap(dst + i * stride, 1);
1460 }
1461 
1462 void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1463  ptrdiff_t srcstride, int h, int x, int y)
1464 {
1465 #if 1
1466  double ftmp[2];
1467  uint64_t tmp[2];
1468  mips_reg addr[2];
1469  DECLARE_VAR_ALL64;
1470 
1471  __asm__ volatile (
1472  "1: \n\t"
1473  PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1474  MMI_ULDC1(%[ftmp0], %[src], 0x00)
1475  "ldl %[tmp0], 0x0f(%[src]) \n\t"
1476  "ldr %[tmp0], 0x08(%[src]) \n\t"
1477  MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1478  "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
1479  "ldr %[tmp1], 0x08(%[addr0]) \n\t"
1480  PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1481  MMI_SDC1(%[ftmp0], %[dst], 0x00)
1482  "sdl %[tmp0], 0x0f(%[dst]) \n\t"
1483  "sdr %[tmp0], 0x08(%[dst]) \n\t"
1484  "addiu %[h], %[h], -0x02 \n\t"
1485  MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1486  PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1487  "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
1488  "sdr %[tmp1], 0x08(%[addr1]) \n\t"
1489  PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1490  "bnez %[h], 1b \n\t"
1491  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1492  [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
1493  RESTRICT_ASM_ALL64
1494  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1495  [dst]"+&r"(dst), [src]"+&r"(src),
1496  [h]"+&r"(h)
1497  : [dststride]"r"((mips_reg)dststride),
1498  [srcstride]"r"((mips_reg)srcstride)
1499  : "memory"
1500  );
1501 #else
1502  int i;
1503 
1504  for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1505  memcpy(dst, src, 16);
1506 #endif
1507 }
1508 
1509 void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1510  ptrdiff_t srcstride, int h, int x, int y)
1511 {
1512 #if 1
1513  double ftmp[1];
1514  uint64_t tmp[1];
1515  mips_reg addr[2];
1516  DECLARE_VAR_ALL64;
1517 
1518  __asm__ volatile (
1519  "1: \n\t"
1520  PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1521  MMI_ULDC1(%[ftmp0], %[src], 0x00)
1522  "ldl %[tmp0], 0x07(%[addr0]) \n\t"
1523  "ldr %[tmp0], 0x00(%[addr0]) \n\t"
1524  PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1525  MMI_SDC1(%[ftmp0], %[dst], 0x00)
1526  "addiu %[h], %[h], -0x02 \n\t"
1527  "sdl %[tmp0], 0x07(%[addr1]) \n\t"
1528  "sdr %[tmp0], 0x00(%[addr1]) \n\t"
1529  PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1530  PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1531  "bnez %[h], 1b \n\t"
1532  : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1533  RESTRICT_ASM_ALL64
1534  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1535  [dst]"+&r"(dst), [src]"+&r"(src),
1536  [h]"+&r"(h)
1537  : [dststride]"r"((mips_reg)dststride),
1538  [srcstride]"r"((mips_reg)srcstride)
1539  : "memory"
1540  );
1541 #else
1542  int i;
1543 
1544  for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1545  memcpy(dst, src, 8);
1546 #endif
1547 }
1548 
1549 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1550  ptrdiff_t srcstride, int h, int x, int y)
1551 {
1552 #if 1
1553  double ftmp[1];
1554  uint64_t tmp[1];
1555  mips_reg addr[2];
1556  DECLARE_VAR_LOW32;
1557 
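    /* 4-byte rows: 32-bit unaligned copies (MMI_LWC1/MMI_SWC1 plus
       lwl/lwr and swl/swr), hence DECLARE_VAR_LOW32 rather than
       DECLARE_VAR_ALL64. */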
1558  __asm__ volatile (
1559  "1: \n\t"
1560  PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1561  MMI_LWC1(%[ftmp0], %[src], 0x00)
1562  "lwl %[tmp0], 0x03(%[addr0]) \n\t"
1563  "lwr %[tmp0], 0x00(%[addr0]) \n\t"
1564  PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1565  MMI_SWC1(%[ftmp0], %[dst], 0x00)
1566  "addiu %[h], %[h], -0x02 \n\t"
1567  "swl %[tmp0], 0x03(%[addr1]) \n\t"
1568  "swr %[tmp0], 0x00(%[addr1]) \n\t"
1569  PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1570  PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1571  "bnez %[h], 1b \n\t"
1572  : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1573  RESTRICT_ASM_LOW32
1574  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1575  [dst]"+&r"(dst), [src]"+&r"(src),
1576  [h]"+&r"(h)
1577  : [dststride]"r"((mips_reg)dststride),
1578  [srcstride]"r"((mips_reg)srcstride)
1579  : "memory"
1580  );
1581 #else
1582  int i;
1583 
1584  for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1585  memcpy(dst, src, 4);
1586 #endif
1587 }
1588 
1589 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1590  ptrdiff_t srcstride, int h, int mx, int my)
1591 {
1592 #if 1
1593  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1594  double ftmp[9];
1595  uint32_t tmp[1];
1596  mips_reg src1, dst1;
1597  DECLARE_VAR_ALL64;
1598 
1599  /*
1600  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1601  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1602  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1603  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1604  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1605  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1606  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1607  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1608 
1609  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1610  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1611  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1612  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1613  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1614  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1615  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1616  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1617  */
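    /* A sketch of what the asm below does: per the reference code
       above, taps 1 and 4 are subtracted and the rest added; ftmp4 is
       loaded with 7 for the final rounding right shift, and the 16-wide
       row is filtered as two independent 8-pixel halves, the second via
       src1/dst1 = src/dst + 8. */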
1618  __asm__ volatile (
1619  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1620  "li %[tmp0], 0x07 \n\t"
1621  "mtc1 %[tmp0], %[ftmp4] \n\t"
1622 
1623  "1: \n\t"
1624  // 0 - 7
1625  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1626  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1627  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1628  // 8 - 15
1629  PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1630 
1631  "addiu %[h], %[h], -0x01 \n\t"
1632  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1633  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1634  "bnez %[h], 1b \n\t"
1635  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1636  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1637  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1638  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1639  [ftmp8]"=&f"(ftmp[8]),
1640  [tmp0]"=&r"(tmp[0]),
1641  RESTRICT_ASM_ALL64
1642  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1643  [h]"+&r"(h),
1644  [dst]"+&r"(dst), [src]"+&r"(src)
1645  : [ff_pw_64]"f"(ff_pw_64),
1646  [srcstride]"r"((mips_reg)srcstride),
1647  [dststride]"r"((mips_reg)dststride),
1648  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1649  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1650  : "memory"
1651  );
1652 #else
1653  const uint8_t *filter = subpel_filters[mx - 1];
1654  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1655  int x, y;
1656 
1657  for (y = 0; y < h; y++) {
1658  for (x = 0; x < 16; x++)
1659  dst[x] = FILTER_4TAP(src, filter, 1);
1660  dst += dststride;
1661  src += srcstride;
1662  }
1663 #endif
1664 }
1665 
1666 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1667  ptrdiff_t srcstride, int h, int mx, int my)
1668 {
1669 #if 1
1670  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1671  double ftmp[9];
1672  uint32_t tmp[1];
1673  DECLARE_VAR_ALL64;
1674 
1675  /*
1676  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1677  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1678  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1679  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1680  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1681  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1682  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1683  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1684  */
1685  __asm__ volatile (
1686  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1687  "li %[tmp0], 0x07 \n\t"
1688  "mtc1 %[tmp0], %[ftmp4] \n\t"
1689 
1690  "1: \n\t"
1691  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1692 
1693  "addiu %[h], %[h], -0x01 \n\t"
1694  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1695  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1696  "bnez %[h], 1b \n\t"
1697  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1698  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1699  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1700  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1701  [ftmp8]"=&f"(ftmp[8]),
1702  [tmp0]"=&r"(tmp[0]),
1703  RESTRICT_ASM_ALL64
1704  [h]"+&r"(h),
1705  [dst]"+&r"(dst), [src]"+&r"(src)
1706  : [ff_pw_64]"f"(ff_pw_64),
1707  [srcstride]"r"((mips_reg)srcstride),
1708  [dststride]"r"((mips_reg)dststride),
1709  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1710  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1711  : "memory"
1712  );
1713 #else
1714  const uint8_t *filter = subpel_filters[mx - 1];
1715  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1716  int x, y;
1717 
1718  for (y = 0; y < h; y++) {
1719  for (x = 0; x < 8; x++)
1720  dst[x] = FILTER_4TAP(src, filter, 1);
1721  dst += dststride;
1722  src += srcstride;
1723  }
1724 #endif
1725 }
1726 
1727 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1728  ptrdiff_t srcstride, int h, int mx, int my)
1729 {
1730 #if 1
1731  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1732  double ftmp[6];
1733  uint32_t tmp[1];
1734  DECLARE_VAR_LOW32;
1735 
1736  /*
1737  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1738  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1739  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1740  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1741  */
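    /* 4-wide variant: the same 4-tap kernel, but only 32 bits are
       loaded and stored per row (hence DECLARE_VAR_LOW32) and fewer FP
       temporaries are needed than in the 8/16-wide versions. */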
1742  __asm__ volatile (
1743  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1744  "li %[tmp0], 0x07 \n\t"
1745  "mtc1 %[tmp0], %[ftmp4] \n\t"
1746 
1747  "1: \n\t"
1748  PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1749 
1750  "addiu %[h], %[h], -0x01 \n\t"
1751  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1752  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1753  "bnez %[h], 1b \n\t"
1754  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1755  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1756  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1757  [tmp0]"=&r"(tmp[0]),
1758  RESTRICT_ASM_LOW32
1759  [h]"+&r"(h),
1760  [dst]"+&r"(dst), [src]"+&r"(src)
1761  : [ff_pw_64]"f"(ff_pw_64),
1762  [srcstride]"r"((mips_reg)srcstride),
1763  [dststride]"r"((mips_reg)dststride),
1764  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1765  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1766  : "memory"
1767  );
1768 #else
1769  const uint8_t *filter = subpel_filters[mx - 1];
1770  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1771  int x, y;
1772 
1773  for (y = 0; y < h; y++) {
1774  for (x = 0; x < 4; x++)
1775  dst[x] = FILTER_4TAP(src, filter, 1);
1776  dst += dststride;
1777  src += srcstride;
1778  }
1779 #endif
1780 }
1781 
1782 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1783  ptrdiff_t srcstride, int h, int mx, int my)
1784 {
1785 #if 1
1786  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1787  double ftmp[9];
1788  uint32_t tmp[1];
1789  mips_reg src1, dst1;
1790  DECLARE_VAR_ALL64;
1791 
1792  /*
1793  dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1794  dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1795  dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1796  dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1797  dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1798  dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1799  dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1800  dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1801 
1802  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1803  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1804  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1805  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1806  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1807  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1808  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1809  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1810  */
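    /* Six-tap case: per the reference code above, taps 0, 2, 3 and 5
       are added and taps 1 and 4 subtracted; otherwise the structure
       mirrors the 4-tap path, with the row split into two 8-pixel
       halves. */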
1811  __asm__ volatile (
1812  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1813  "li %[tmp0], 0x07 \n\t"
1814  "mtc1 %[tmp0], %[ftmp4] \n\t"
1815 
1816  "1: \n\t"
1817  // 0 - 7
1818  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1819  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1820  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1821  // 8 - 15
1822  PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1823 
1824  "addiu %[h], %[h], -0x01 \n\t"
1825  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1826  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1827  "bnez %[h], 1b \n\t"
1828  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1829  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1830  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1831  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1832  [ftmp8]"=&f"(ftmp[8]),
1833  [tmp0]"=&r"(tmp[0]),
1834  RESTRICT_ASM_ALL64
1835  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1836  [h]"+&r"(h),
1837  [dst]"+&r"(dst), [src]"+&r"(src)
1838  : [ff_pw_64]"f"(ff_pw_64),
1839  [srcstride]"r"((mips_reg)srcstride),
1840  [dststride]"r"((mips_reg)dststride),
1841  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1842  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1843  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1844  : "memory"
1845  );
1846 #else
1847  const uint8_t *filter = subpel_filters[mx - 1];
1848  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1849  int x, y;
1850 
1851  for (y = 0; y < h; y++) {
1852  for (x = 0; x < 16; x++)
1853  dst[x] = FILTER_6TAP(src, filter, 1);
1854  dst += dststride;
1855  src += srcstride;
1856  }
1857 #endif
1858 }
1859 
1860 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1861  ptrdiff_t srcstride, int h, int mx, int my)
1862 {
1863 #if 1
1864  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1865  double ftmp[9];
1866  uint32_t tmp[1];
1867  DECLARE_VAR_ALL64;
1868 
1869  /*
1870  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1871  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1872  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1873  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1874  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1875  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1876  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1877  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1878  */
1879  __asm__ volatile (
1880  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1881  "li %[tmp0], 0x07 \n\t"
1882  "mtc1 %[tmp0], %[ftmp4] \n\t"
1883 
1884  "1: \n\t"
1885  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1886 
1887  "addiu %[h], %[h], -0x01 \n\t"
1888  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1889  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1890  "bnez %[h], 1b \n\t"
1891  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1892  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1893  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1894  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1895  [ftmp8]"=&f"(ftmp[8]),
1896  [tmp0]"=&r"(tmp[0]),
1897  RESTRICT_ASM_ALL64
1898  [h]"+&r"(h),
1899  [dst]"+&r"(dst), [src]"+&r"(src)
1900  : [ff_pw_64]"f"(ff_pw_64),
1901  [srcstride]"r"((mips_reg)srcstride),
1902  [dststride]"r"((mips_reg)dststride),
1903  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1904  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1905  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1906  : "memory"
1907  );
1908 #else
1909  const uint8_t *filter = subpel_filters[mx - 1];
1910  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1911  int x, y;
1912 
1913  for (y = 0; y < h; y++) {
1914  for (x = 0; x < 8; x++)
1915  dst[x] = FILTER_6TAP(src, filter, 1);
1916  dst += dststride;
1917  src += srcstride;
1918  }
1919 #endif
1920 }
1921 
1922 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1923  ptrdiff_t srcstride, int h, int mx, int my)
1924 {
1925 #if 1
1926  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1927  double ftmp[6];
1928  uint32_t tmp[1];
1929  DECLARE_VAR_LOW32;
1930 
1931  /*
1932  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1933  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1934  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1935  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1936  */
1937  __asm__ volatile (
1938  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1939  "li %[tmp0], 0x07 \n\t"
1940  "mtc1 %[tmp0], %[ftmp4] \n\t"
1941 
1942  "1: \n\t"
1943  PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1944 
1945  "addiu %[h], %[h], -0x01 \n\t"
1946  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1947  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1948  "bnez %[h], 1b \n\t"
1949  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1950  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1951  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1952  [tmp0]"=&r"(tmp[0]),
1953  RESTRICT_ASM_LOW32
1954  [h]"+&r"(h),
1955  [dst]"+&r"(dst), [src]"+&r"(src)
1956  : [ff_pw_64]"f"(ff_pw_64),
1957  [srcstride]"r"((mips_reg)srcstride),
1958  [dststride]"r"((mips_reg)dststride),
1959  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1960  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1961  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1962  : "memory"
1963  );
1964 #else
1965  const uint8_t *filter = subpel_filters[mx - 1];
1966  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1967  int x, y;
1968 
1969  for (y = 0; y < h; y++) {
1970  for (x = 0; x < 4; x++)
1971  dst[x] = FILTER_6TAP(src, filter, 1);
1972  dst += dststride;
1973  src += srcstride;
1974  }
1975 #endif
1976 }
1977 
1978 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1979  ptrdiff_t srcstride, int h, int mx, int my)
1980 {
1981 #if 1
1982  const uint64_t *filter = fourtap_subpel_filters[my - 1];
1983  double ftmp[9];
1984  uint32_t tmp[1];
1985  mips_reg src0, src1, dst0;
1986  DECLARE_VAR_ALL64;
1987 
1988  /*
1989  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
1990  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1991  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1992  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1993  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1994  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1995  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1996  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
1997 
1998  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
1999  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2000  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2001  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2002  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2003  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2004  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2005  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2006  */
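    /* Vertical pass: the same 4-tap kernel applied down a column;
       src1 is presumably stepped through the neighbouring rows
       (src - srcstride .. src + 2*srcstride) inside the helper macro
       instead of through neighbouring pixels, and the 16 columns are
       again handled as two 8-wide halves via src0/dst0. */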
2007  __asm__ volatile (
2008  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2009  "li %[tmp0], 0x07 \n\t"
2010  "mtc1 %[tmp0], %[ftmp4] \n\t"
2011 
2012  "1: \n\t"
2013  // 0 - 7
2014  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2015  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2016  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2017  // 8 - 15
2018  PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2019 
2020  "addiu %[h], %[h], -0x01 \n\t"
2021  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2022  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2023  "bnez %[h], 1b \n\t"
2024  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2025  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2026  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2027  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2028  [ftmp8]"=&f"(ftmp[8]),
2029  [tmp0]"=&r"(tmp[0]),
2030  RESTRICT_ASM_ALL64
2031  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2032  [src1]"=&r"(src1),
2033  [h]"+&r"(h),
2034  [dst]"+&r"(dst), [src]"+&r"(src)
2035  : [ff_pw_64]"f"(ff_pw_64),
2036  [srcstride]"r"((mips_reg)srcstride),
2037  [dststride]"r"((mips_reg)dststride),
2038  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2039  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2040  : "memory"
2041  );
2042 #else
2043  const uint8_t *filter = subpel_filters[my - 1];
2044  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2045  int x, y;
2046 
2047  for (y = 0; y < h; y++) {
2048  for (x = 0; x < 16; x++)
2049  dst[x] = FILTER_4TAP(src, filter, srcstride);
2050  dst += dststride;
2051  src += srcstride;
2052  }
2053 #endif
2054 }
2055 
2056 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2057  ptrdiff_t srcstride, int h, int mx, int my)
2058 {
2059 #if 1
2060  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2061  double ftmp[9];
2062  uint32_t tmp[1];
2063  mips_reg src1;
2064  DECLARE_VAR_ALL64;
2065 
2066  /*
2067  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2068  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2069  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2070  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2071  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2072  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2073  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2074  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2075  */
2076  __asm__ volatile (
2077  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2078  "li %[tmp0], 0x07 \n\t"
2079  "mtc1 %[tmp0], %[ftmp4] \n\t"
2080 
2081  "1: \n\t"
2082  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2083 
2084  "addiu %[h], %[h], -0x01 \n\t"
2085  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2086  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2087  "bnez %[h], 1b \n\t"
2088  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2089  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2090  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2091  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2092  [ftmp8]"=&f"(ftmp[8]),
2093  [tmp0]"=&r"(tmp[0]),
2094  RESTRICT_ASM_ALL64
2095  [src1]"=&r"(src1),
2096  [h]"+&r"(h),
2097  [dst]"+&r"(dst), [src]"+&r"(src)
2098  : [ff_pw_64]"f"(ff_pw_64),
2099  [srcstride]"r"((mips_reg)srcstride),
2100  [dststride]"r"((mips_reg)dststride),
2101  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2102  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2103  : "memory"
2104  );
2105 #else
2106  const uint8_t *filter = subpel_filters[my - 1];
2107  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2108  int x, y;
2109 
2110  for (y = 0; y < h; y++) {
2111  for (x = 0; x < 8; x++)
2112  dst[x] = FILTER_4TAP(src, filter, srcstride);
2113  dst += dststride;
2114  src += srcstride;
2115  }
2116 #endif
2117 }
2118 
2119 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2120  ptrdiff_t srcstride, int h, int mx, int my)
2121 {
2122 #if 1
2123  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2124  double ftmp[6];
2125  uint32_t tmp[1];
2126  mips_reg src1;
2127  DECLARE_VAR_LOW32;
2128 
2129  /*
2130  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2131  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2132  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2133  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2134  */
2135  __asm__ volatile (
2136  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2137  "li %[tmp0], 0x07 \n\t"
2138  "mtc1 %[tmp0], %[ftmp4] \n\t"
2139 
2140  "1: \n\t"
2141  PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2142 
2143  "addiu %[h], %[h], -0x01 \n\t"
2144  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2145  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2146  "bnez %[h], 1b \n\t"
2147  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2148  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2149  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2150  [tmp0]"=&r"(tmp[0]),
2151  RESTRICT_ASM_LOW32
2152  [src1]"=&r"(src1),
2153  [h]"+&r"(h),
2154  [dst]"+&r"(dst), [src]"+&r"(src)
2155  : [ff_pw_64]"f"(ff_pw_64),
2156  [srcstride]"r"((mips_reg)srcstride),
2157  [dststride]"r"((mips_reg)dststride),
2158  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2159  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2160  : "memory"
2161  );
2162 #else
2163  const uint8_t *filter = subpel_filters[my - 1];
2164  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2165  int x, y;
2166 
2167  for (y = 0; y < h; y++) {
2168  for (x = 0; x < 4; x++)
2169  dst[x] = FILTER_4TAP(src, filter, srcstride);
2170  dst += dststride;
2171  src += srcstride;
2172  }
2173 #endif
2174 }
2175 
2176 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2177  ptrdiff_t srcstride, int h, int mx, int my)
2178 {
2179 #if 1
2180  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2181  double ftmp[9];
2182  uint32_t tmp[1];
2183  mips_reg src0, src1, dst0;
2184  DECLARE_VAR_ALL64;
2185 
2186  /*
2187  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2188  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2189  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2190  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2191  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2192  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2193  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2194  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2195 
2196  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2197  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2198  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2199  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2200  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2201  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2202  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2203  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2204  */
2205  __asm__ volatile (
2206  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2207  "li %[tmp0], 0x07 \n\t"
2208  "mtc1 %[tmp0], %[ftmp4] \n\t"
2209 
2210  "1: \n\t"
2211  // 0 - 7
2212  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2213  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2214  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2215  // 8 - 15
2216  PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2217 
2218  "addiu %[h], %[h], -0x01 \n\t"
2219  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2220  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2221  "bnez %[h], 1b \n\t"
2222  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2223  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2224  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2225  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2226  [ftmp8]"=&f"(ftmp[8]),
2227  [tmp0]"=&r"(tmp[0]),
2228  RESTRICT_ASM_ALL64
2229  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2230  [src1]"=&r"(src1),
2231  [h]"+&r"(h),
2232  [dst]"+&r"(dst), [src]"+&r"(src)
2233  : [ff_pw_64]"f"(ff_pw_64),
2234  [srcstride]"r"((mips_reg)srcstride),
2235  [dststride]"r"((mips_reg)dststride),
2236  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2237  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2238  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2239  : "memory"
2240  );
2241 #else
2242  const uint8_t *filter = subpel_filters[my - 1];
2243  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2244  int x, y;
2245 
2246  for (y = 0; y < h; y++) {
2247  for (x = 0; x < 16; x++)
2248  dst[x] = FILTER_6TAP(src, filter, srcstride);
2249  dst += dststride;
2250  src += srcstride;
2251  }
2252 #endif
2253 }
2254 
2255 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2256  ptrdiff_t srcstride, int h, int mx, int my)
2257 {
2258 #if 1
2259  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2260  double ftmp[9];
2261  uint32_t tmp[1];
2262  mips_reg src1;
2263  DECLARE_VAR_ALL64;
2264 
2265  /*
2266  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2267  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2268  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2269  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2270  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2271  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2272  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2273  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2274  */
2275  __asm__ volatile (
2276  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2277  "li %[tmp0], 0x07 \n\t"
2278  "mtc1 %[tmp0], %[ftmp4] \n\t"
2279 
2280  "1: \n\t"
2281  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2282 
2283  "addiu %[h], %[h], -0x01 \n\t"
2284  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2285  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2286  "bnez %[h], 1b \n\t"
2287  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2288  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2289  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2290  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2291  [ftmp8]"=&f"(ftmp[8]),
2292  [tmp0]"=&r"(tmp[0]),
2293  RESTRICT_ASM_ALL64
2294  [src1]"=&r"(src1),
2295  [h]"+&r"(h),
2296  [dst]"+&r"(dst), [src]"+&r"(src)
2297  : [ff_pw_64]"f"(ff_pw_64),
2298  [srcstride]"r"((mips_reg)srcstride),
2299  [dststride]"r"((mips_reg)dststride),
2300  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2301  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2302  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2303  : "memory"
2304  );
2305 #else
2306  const uint8_t *filter = subpel_filters[my - 1];
2307  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2308  int x, y;
2309 
2310  for (y = 0; y < h; y++) {
2311  for (x = 0; x < 8; x++)
2312  dst[x] = FILTER_6TAP(src, filter, srcstride);
2313  dst += dststride;
2314  src += srcstride;
2315  }
2316 #endif
2317 }
2318 
2319 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2320  ptrdiff_t srcstride, int h, int mx, int my)
2321 {
2322 #if 1
2323  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2324  double ftmp[6];
2325  uint32_t tmp[1];
2326  mips_reg src1;
2327  DECLARE_VAR_LOW32;
2328 
2329  /*
2330  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2331  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2332  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2333  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2334  */
2335  __asm__ volatile (
2336  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2337  "li %[tmp0], 0x07 \n\t"
2338  "mtc1 %[tmp0], %[ftmp4] \n\t"
2339 
2340  "1: \n\t"
2341  PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2342 
2343  "addiu %[h], %[h], -0x01 \n\t"
2344  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2345  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2346  "bnez %[h], 1b \n\t"
2347  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2348  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2349  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2350  [tmp0]"=&r"(tmp[0]),
2351  RESTRICT_ASM_LOW32
2352  [src1]"=&r"(src1),
2353  [h]"+&r"(h),
2354  [dst]"+&r"(dst), [src]"+&r"(src)
2355  : [ff_pw_64]"f"(ff_pw_64),
2356  [srcstride]"r"((mips_reg)srcstride),
2357  [dststride]"r"((mips_reg)dststride),
2358  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2359  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2360  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2361  : "memory"
2362  );
2363 #else
2364  const uint8_t *filter = subpel_filters[my - 1];
2365  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2366  int x, y;
2367 
2368  for (y = 0; y < h; y++) {
2369  for (x = 0; x < 4; x++)
2370  dst[x] = FILTER_6TAP(src, filter, srcstride);
2371  dst += dststride;
2372  src += srcstride;
2373  }
2374 #endif
2375 }
2376 
2377 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2378  ptrdiff_t srcstride, int h, int mx, int my)
2379 {
2380 #if 1
2381  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2382  uint8_t *tmp = tmp_array;
2383 
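    /* Separable 4-tap H then 4-tap V through tmp_array (stride 16):
       the horizontal pass starts one row above src and filters h + 3
       rows, giving the vertical pass its one-above/two-below context;
       the vertical pass therefore reads from tmp_array + 16, one row
       in. */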
2384  src -= srcstride;
2385  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2386  tmp = tmp_array + 16;
2387  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2388 #else
2389  const uint8_t *filter = subpel_filters[mx - 1];
2390  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2391  int x, y;
2392  uint8_t tmp_array[560];
2393  uint8_t *tmp = tmp_array;
2394 
2395  src -= srcstride;
2396 
2397  for (y = 0; y < h + 3; y++) {
2398  for (x = 0; x < 16; x++)
2399  tmp[x] = FILTER_4TAP(src, filter, 1);
2400  tmp += 16;
2401  src += srcstride;
2402  }
2403 
2404  tmp = tmp_array + 16;
2405  filter = subpel_filters[my - 1];
2406 
2407  for (y = 0; y < h; y++) {
2408  for (x = 0; x < 16; x++)
2409  dst[x] = FILTER_4TAP(tmp, filter, 16);
2410  dst += dststride;
2411  tmp += 16;
2412  }
2413 #endif
2414 }
2415 
2416 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2417  ptrdiff_t srcstride, int h, int mx, int my)
2418 {
2419 #if 1
2420  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2421  uint8_t *tmp = tmp_array;
2422 
2423  src -= srcstride;
2424  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2425  tmp = tmp_array + 8;
2426  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2427 #else
2428  const uint8_t *filter = subpel_filters[mx - 1];
2429  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2430  int x, y;
2431  uint8_t tmp_array[152];
2432  uint8_t *tmp = tmp_array;
2433 
2434  src -= srcstride;
2435 
2436  for (y = 0; y < h + 3; y++) {
2437  for (x = 0; x < 8; x++)
2438  tmp[x] = FILTER_4TAP(src, filter, 1);
2439  tmp += 8;
2440  src += srcstride;
2441  }
2442 
2443  tmp = tmp_array + 8;
2444  filter = subpel_filters[my - 1];
2445 
2446  for (y = 0; y < h; y++) {
2447  for (x = 0; x < 8; x++)
2448  dst[x] = FILTER_4TAP(tmp, filter, 8);
2449  dst += dststride;
2450  tmp += 8;
2451  }
2452 #endif
2453 }
2454 
2455 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2456  ptrdiff_t srcstride, int h, int mx, int my)
2457 {
2458 #if 1
2459  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2460  uint8_t *tmp = tmp_array;
2461 
2462  src -= srcstride;
2463  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2464  tmp = tmp_array + 4;
2465  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2466 #else
2467  const uint8_t *filter = subpel_filters[mx - 1];
2468  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2469  int x, y;
2470  uint8_t tmp_array[44];
2471  uint8_t *tmp = tmp_array;
2472 
2473  src -= srcstride;
2474 
2475  for (y = 0; y < h + 3; y++) {
2476  for (x = 0; x < 4; x++)
2477  tmp[x] = FILTER_4TAP(src, filter, 1);
2478  tmp += 4;
2479  src += srcstride;
2480  }
2481  tmp = tmp_array + 4;
2482  filter = subpel_filters[my - 1];
2483 
2484  for (y = 0; y < h; y++) {
2485  for (x = 0; x < 4; x++)
2486  dst[x] = FILTER_4TAP(tmp, filter, 4);
2487  dst += dststride;
2488  tmp += 4;
2489  }
2490 #endif
2491 }
2492 
2493 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2494  ptrdiff_t srcstride, int h, int mx, int my)
2495 {
2496 #if 1
2497  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2498  uint8_t *tmp = tmp_array;
2499 
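    /* Mixed case: 4-tap H feeding 6-tap V. The six-tap vertical kernel
       needs two rows above and three below, so the horizontal pass
       covers h + 5 rows starting at src - 2*srcstride and the vertical
       pass reads from tmp_array + 32, two 16-byte rows in. */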
2500  src -= 2 * srcstride;
2501  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2502  tmp = tmp_array + 32;
2503  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2504 #else
2505  const uint8_t *filter = subpel_filters[mx - 1];
2506  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2507  int x, y;
2508  uint8_t tmp_array[592];
2509  uint8_t *tmp = tmp_array;
2510 
2511  src -= 2 * srcstride;
2512 
2513  for (y = 0; y < h + 5; y++) {
2514  for (x = 0; x < 16; x++)
2515  tmp[x] = FILTER_4TAP(src, filter, 1);
2516  tmp += 16;
2517  src += srcstride;
2518  }
2519 
2520  tmp = tmp_array + 32;
2521  filter = subpel_filters[my - 1];
2522 
2523  for (y = 0; y < h; y++) {
2524  for (x = 0; x < 16; x++)
2525  dst[x] = FILTER_6TAP(tmp, filter, 16);
2526  dst += dststride;
2527  tmp += 16;
2528  }
2529 #endif
2530 }
2531 
2532 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2533  ptrdiff_t srcstride, int h, int mx, int my)
2534 {
2535 #if 1
2536  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2537  uint8_t *tmp = tmp_array;
2538 
2539  src -= 2 * srcstride;
2540  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2541  tmp = tmp_array + 16;
2542  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2543 #else
2544  const uint8_t *filter = subpel_filters[mx - 1];
2545  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2546  int x, y;
2547  uint8_t tmp_array[168];
2548  uint8_t *tmp = tmp_array;
2549 
2550  src -= 2 * srcstride;
2551 
2552  for (y = 0; y < h + 5; y++) {
2553  for (x = 0; x < 8; x++)
2554  tmp[x] = FILTER_4TAP(src, filter, 1);
2555  tmp += 8;
2556  src += srcstride;
2557  }
2558 
2559  tmp = tmp_array + 16;
2560  filter = subpel_filters[my - 1];
2561 
2562  for (y = 0; y < h; y++) {
2563  for (x = 0; x < 8; x++)
2564  dst[x] = FILTER_6TAP(tmp, filter, 8);
2565  dst += dststride;
2566  tmp += 8;
2567  }
2568 #endif
2569 }
2570 
2571 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2572  ptrdiff_t srcstride, int h, int mx, int my)
2573 {
2574 #if 1
2575  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2576  uint8_t *tmp = tmp_array;
2577 
2578  src -= 2 * srcstride;
2579  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2580  tmp = tmp_array + 8;
2581  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2582 #else
2583  const uint8_t *filter = subpel_filters[mx - 1];
2584  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2585  int x, y;
2586  uint8_t tmp_array[52];
2587  uint8_t *tmp = tmp_array;
2588 
2589  src -= 2 * srcstride;
2590 
2591  for (y = 0; y < h + 5; y++) {
2592  for (x = 0; x < 4; x++)
2593  tmp[x] = FILTER_4TAP(src, filter, 1);
2594  tmp += 4;
2595  src += srcstride;
2596  }
2597 
2598  tmp = tmp_array + 8;
2599  filter = subpel_filters[my - 1];
2600 
2601  for (y = 0; y < h; y++) {
2602  for (x = 0; x < 4; x++)
2603  dst[x] = FILTER_6TAP(tmp, filter, 4);
2604  dst += dststride;
2605  tmp += 4;
2606  }
2607 #endif
2608 }
2609 
2610 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2611  ptrdiff_t srcstride, int h, int mx, int my)
2612 {
2613 #if 1
2614  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2615  uint8_t *tmp = tmp_array;
2616 
2617  src -= srcstride;
2618  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2619  tmp = tmp_array + 16;
2620  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2621 #else
2622  const uint8_t *filter = subpel_filters[mx - 1];
2623  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2624  int x, y;
2625  uint8_t tmp_array[560];
2626  uint8_t *tmp = tmp_array;
2627 
2628  src -= srcstride;
2629 
2630  for (y = 0; y < h + 3; y++) {
2631  for (x = 0; x < 16; x++)
2632  tmp[x] = FILTER_6TAP(src, filter, 1);
2633  tmp += 16;
2634  src += srcstride;
2635  }
2636 
2637  tmp = tmp_array + 16;
2638  filter = subpel_filters[my - 1];
2639 
2640  for (y = 0; y < h; y++) {
2641  for (x = 0; x < 16; x++)
2642  dst[x] = FILTER_4TAP(tmp, filter, 16);
2643  dst += dststride;
2644  tmp += 16;
2645  }
2646 #endif
2647 }
2648 
2649 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2650  ptrdiff_t srcstride, int h, int mx, int my)
2651 {
2652 #if 1
2653  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2654  uint8_t *tmp = tmp_array;
2655 
2656  src -= srcstride;
2657  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2658  tmp = tmp_array + 8;
2659  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2660 #else
2661  const uint8_t *filter = subpel_filters[mx - 1];
2662  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2663  int x, y;
2664  uint8_t tmp_array[152];
2665  uint8_t *tmp = tmp_array;
2666 
2667  src -= srcstride;
2668 
2669  for (y = 0; y < h + 3; y++) {
2670  for (x = 0; x < 8; x++)
2671  tmp[x] = FILTER_6TAP(src, filter, 1);
2672  tmp += 8;
2673  src += srcstride;
2674  }
2675 
2676  tmp = tmp_array + 8;
2677  filter = subpel_filters[my - 1];
2678 
2679  for (y = 0; y < h; y++) {
2680  for (x = 0; x < 8; x++)
2681  dst[x] = FILTER_4TAP(tmp, filter, 8);
2682  dst += dststride;
2683  tmp += 8;
2684  }
2685 #endif
2686 }
2687 
2688 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2689  ptrdiff_t srcstride, int h, int mx, int my)
2690 {
2691 #if 1
2692  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2693  uint8_t *tmp = tmp_array;
2694 
2695  src -= srcstride;
2696  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2697  tmp = tmp_array + 4;
2698  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2699 #else
2700  const uint8_t *filter = subpel_filters[mx - 1];
2701  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2702  int x, y;
2703  uint8_t tmp_array[44];
2704  uint8_t *tmp = tmp_array;
2705 
2706  src -= srcstride;
2707 
2708  for (y = 0; y < h + 3; y++) {
2709  for (x = 0; x < 4; x++)
2710  tmp[x] = FILTER_6TAP(src, filter, 1);
2711  tmp += 4;
2712  src += srcstride;
2713  }
2714 
2715  tmp = tmp_array + 4;
2716  filter = subpel_filters[my - 1];
2717 
2718  for (y = 0; y < h; y++) {
2719  for (x = 0; x < 4; x++)
2720  dst[x] = FILTER_4TAP(tmp, filter, 4);
2721  dst += dststride;
2722  tmp += 4;
2723  }
2724 #endif
2725 }
2726 
2727 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2728  ptrdiff_t srcstride, int h, int mx, int my)
2729 {
2730 #if 1
2731  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2732  uint8_t *tmp = tmp_array;
2733 
2734  src -= 2 * srcstride;
2735  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2736  tmp = tmp_array + 32;
2737  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2738 #else
2739  const uint8_t *filter = subpel_filters[mx - 1];
2740  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2741  int x, y;
2742  uint8_t tmp_array[592];
2743  uint8_t *tmp = tmp_array;
2744 
2745  src -= 2 * srcstride;
2746 
2747  for (y = 0; y < h + 5; y++) {
2748  for (x = 0; x < 16; x++)
2749  tmp[x] = FILTER_6TAP(src, filter, 1);
2750  tmp += 16;
2751  src += srcstride;
2752  }
2753 
2754  tmp = tmp_array + 32;
2755  filter = subpel_filters[my - 1];
2756 
2757  for (y = 0; y < h; y++) {
2758  for (x = 0; x < 16; x++)
2759  dst[x] = FILTER_6TAP(tmp, filter, 16);
2760  dst += dststride;
2761  tmp += 16;
2762  }
2763 #endif
2764 }
2765 
2766 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2767  ptrdiff_t srcstride, int h, int mx, int my)
2768 {
2769 #if 1
2770  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2771  uint8_t *tmp = tmp_array;
2772 
2773  src -= 2 * srcstride;
2774  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2775  tmp = tmp_array + 16;
2776  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2777 #else
2778  const uint8_t *filter = subpel_filters[mx - 1];
2779  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2780  int x, y;
2781  uint8_t tmp_array[168];
2782  uint8_t *tmp = tmp_array;
2783 
2784  src -= 2 * srcstride;
2785 
2786  for (y = 0; y < h + 5; y++) {
2787  for (x = 0; x < 8; x++)
2788  tmp[x] = FILTER_6TAP(src, filter, 1);
2789  tmp += 8;
2790  src += srcstride;
2791  }
2792 
2793  tmp = tmp_array + 16;
2794  filter = subpel_filters[my - 1];
2795 
2796  for (y = 0; y < h; y++) {
2797  for (x = 0; x < 8; x++)
2798  dst[x] = FILTER_6TAP(tmp, filter, 8);
2799  dst += dststride;
2800  tmp += 8;
2801  }
2802 #endif
2803 }
2804 
2805 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2806  ptrdiff_t srcstride, int h, int mx, int my)
2807 {
2808 #if 1
2809  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2810  uint8_t *tmp = tmp_array;
2811 
2812  src -= 2 * srcstride;
2813  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2814  tmp = tmp_array + 8;
2815  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2816 #else
2817  const uint8_t *filter = subpel_filters[mx - 1];
2818  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2819  int x, y;
2820  uint8_t tmp_array[52];
2821  uint8_t *tmp = tmp_array;
2822 
2823  src -= 2 * srcstride;
2824 
2825  for (y = 0; y < h + 5; y++) {
2826  for (x = 0; x < 4; x++)
2827  tmp[x] = FILTER_6TAP(src, filter, 1);
2828  tmp += 4;
2829  src += srcstride;
2830  }
2831 
2832  tmp = tmp_array + 8;
2833  filter = subpel_filters[my - 1];
2834 
2835  for (y = 0; y < h; y++) {
2836  for (x = 0; x < 4; x++)
2837  dst[x] = FILTER_6TAP(tmp, filter, 4);
2838  dst += dststride;
2839  tmp += 4;
2840  }
2841 #endif
2842 }
2843 
2844 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2845  ptrdiff_t sstride, int h, int mx, int my)
2846 {
2847 #if 1
2848  int a = 8 - mx, b = mx;
2849  double ftmp[7];
2850  uint32_t tmp[1];
2851  mips_reg dst0, src0;
2852  DECLARE_VAR_ALL64;
2853 
2854  /*
2855  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2856  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2857  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2858  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2859  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2860  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2861  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2862  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2863 
2864  dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2865  dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2866  dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2867  dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2868  dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2869  dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2870  dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2871  dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2872  */
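    /* pshufh with the zero selector in ftmp0 broadcasts the 16-bit
       weights a and b into all four halfword lanes once, outside the
       loop; ftmp4 holds 3 for the (x + 4) >> 3 rounding, and the row is
       again processed as two 8-pixel halves via src0/dst0. */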
2873  __asm__ volatile (
2874  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2875  "li %[tmp0], 0x03 \n\t"
2876  "mtc1 %[tmp0], %[ftmp4] \n\t"
2877  "pshufh %[a], %[a], %[ftmp0] \n\t"
2878  "pshufh %[b], %[b], %[ftmp0] \n\t"
2879 
2880  "1: \n\t"
2881  // 0 - 7
2882  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2883  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2884  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2885  // 8 - 15
2886  PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2887 
2888  "addiu %[h], %[h], -0x01 \n\t"
2889  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2890  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2891  "bnez %[h], 1b \n\t"
2892  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2893  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2894  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2895  [ftmp6]"=&f"(ftmp[6]),
2896  [tmp0]"=&r"(tmp[0]),
2897  RESTRICT_ASM_ALL64
2898  [dst0]"=&r"(dst0), [src0]"=&r"(src0),
2899  [h]"+&r"(h),
2900  [dst]"+&r"(dst), [src]"+&r"(src),
2901  [a]"+&f"(a), [b]"+&f"(b)
2902  : [sstride]"r"((mips_reg)sstride),
2903  [dstride]"r"((mips_reg)dstride),
2904  [ff_pw_4]"f"(ff_pw_4)
2905  : "memory"
2906  );
2907 #else
2908  int a = 8 - mx, b = mx;
2909  int x, y;
2910 
2911  for (y = 0; y < h; y++) {
2912  for (x = 0; x < 16; x++)
2913  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
2914  dst += dstride;
2915  src += sstride;
2916  }
2917 #endif
2918 }
2919 
2920 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2921  ptrdiff_t sstride, int h, int mx, int my)
2922 {
2923 #if 1
2924  int c = 8 - my, d = my;
2925  double ftmp[7];
2926  uint32_t tmp[1];
2927  mips_reg src0, src1, dst0;
2928  DECLARE_VAR_ALL64;
2929 
2930  /*
2931  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
2932  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
2933  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
2934  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
2935  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
2936  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
2937  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
2938  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
2939  */
2940  __asm__ volatile (
2941  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2942  "li %[tmp0], 0x03 \n\t"
2943  "mtc1 %[tmp0], %[ftmp4] \n\t"
2944  "pshufh %[c], %[c], %[ftmp0] \n\t"
2945  "pshufh %[d], %[d], %[ftmp0] \n\t"
2946 
2947  "1: \n\t"
2948  // 0 - 7
2949  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
2950  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2951  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2952  // 8 - 15
2953  PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
2954 
2955  "addiu %[h], %[h], -0x01 \n\t"
2956  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2957  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2958  "bnez %[h], 1b \n\t"
2959  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2960  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2961  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2962  [ftmp6]"=&f"(ftmp[6]),
2963  [tmp0]"=&r"(tmp[0]),
2964  RESTRICT_ASM_ALL64
2965  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2966  [src1]"=&r"(src1),
2967  [h]"+&r"(h),
2968  [dst]"+&r"(dst), [src]"+&r"(src),
2969  [c]"+&f"(c), [d]"+&f"(d)
2970  : [sstride]"r"((mips_reg)sstride),
2971  [dstride]"r"((mips_reg)dstride),
2972  [ff_pw_4]"f"(ff_pw_4)
2973  : "memory"
2974  );
2975 #else
2976  int c = 8 - my, d = my;
2977  int x, y;
2978 
2979  for (y = 0; y < h; y++) {
2980  for (x = 0; x < 16; x++)
2981  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
2982  dst += dstride;
2983  src += sstride;
2984  }
2985 #endif
2986 }
2987 
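The inline comment above sketches only the first 8 of the 16 outputs per row; the asm covers the full width by invoking PUT_VP8_BILINEAR8_V_MMI twice, the second time with src0/dst0 advanced by 8 bytes. A scalar model of one such 8-pixel half, mirroring the #else fallback (hypothetical helper, not part of this file):

#include <stddef.h>
#include <stdint.h>

/* One 8-wide vertical half; the MMI loop applies this at x = 0 and x = 8. */
static void bilinear8_v_half(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t sstride, int my)
{
    int c = 8 - my, d = my;
    for (int x = 0; x < 8; x++)
        dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
}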
2988 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2989  ptrdiff_t sstride, int h, int mx, int my)
2990 {
2991 #if 1
2992  DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
2993  uint8_t *tmp = tmp_array;
2994 
2995  ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
2996  ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
2997 #else
2998  int a = 8 - mx, b = mx;
2999  int c = 8 - my, d = my;
3000  int x, y;
3001  uint8_t tmp_array[528];
3002  uint8_t *tmp = tmp_array;
3003 
3004  for (y = 0; y < h + 1; y++) {
3005  for (x = 0; x < 16; x++)
3006  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3007  tmp += 16;
3008  src += sstride;
3009  }
3010 
3011  tmp = tmp_array;
3012 
3013  for (y = 0; y < h; y++) {
3014  for (x = 0; x < 16; x++)
3015  dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
3016  dst += dstride;
3017  tmp += 16;
3018  }
3019 #endif
3020 }
3021 
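The vertical pass reads src[x] and src[x + sstride], so the horizontal pass must deposit h + 1 rows into the scratch buffer. With 16-byte rows and h <= 16 that is at most 16 * 17 = 272 bytes, comfortably inside the 528-byte (16 * 33 row) tmp_array, and DECLARE_ALIGNED(8, ...) provides the 8-byte alignment the 64-bit MMI loads expect. A compile-time restatement of that sizing (hypothetical, uses C11 _Static_assert):

#define BILIN16_TMP_ROWS (528 / 16)                     /* 33 rows */
_Static_assert(BILIN16_TMP_ROWS >= 16 + 1,
               "tmp_array must hold h + 1 rows for h up to 16");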
3022 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3023  ptrdiff_t sstride, int h, int mx, int my)
3024 {
3025 #if 1
3026  int a = 8 - mx, b = mx;
3027  double ftmp[7];
3028  uint32_t tmp[1];
3029  DECLARE_VAR_ALL64;
3030 
3031  /*
3032  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3033  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3034  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3035  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3036  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
3037  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
3038  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
3039  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
3040  */
3041  __asm__ volatile (
3042  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3043  "li %[tmp0], 0x03 \n\t"
3044  "mtc1 %[tmp0], %[ftmp4] \n\t"
3045  "pshufh %[a], %[a], %[ftmp0] \n\t"
3046  "pshufh %[b], %[b], %[ftmp0] \n\t"
3047 
3048  "1: \n\t"
3049  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3050 
3051  "addiu %[h], %[h], -0x01 \n\t"
3052  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3053  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3054  "bnez %[h], 1b \n\t"
3055  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3056  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3057  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3058  [ftmp6]"=&f"(ftmp[6]),
3059  [tmp0]"=&r"(tmp[0]),
3060  RESTRICT_ASM_ALL64
3061  [h]"+&r"(h),
3062  [dst]"+&r"(dst), [src]"+&r"(src),
3063  [a]"+&f"(a), [b]"+&f"(b)
3064  : [sstride]"r"((mips_reg)sstride),
3065  [dstride]"r"((mips_reg)dstride),
3066  [ff_pw_4]"f"(ff_pw_4)
3067  : "memory"
3068  );
3069 #else
3070  int a = 8 - mx, b = mx;
3071  int x, y;
3072 
3073  for (y = 0; y < h; y++) {
3074  for (x = 0; x < 8; x++)
3075  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3076  dst += dstride;
3077  src += sstride;
3078  }
3079 #endif
3080 }
3081 
3082 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3083  ptrdiff_t sstride, int h, int mx, int my)
3084 {
3085 #if 1
3086  int c = 8 - my, d = my;
3087  double ftmp[7];
3088  uint32_t tmp[1];
3089  mips_reg src1;
3090  DECLARE_VAR_ALL64;
3091 
3092  /*
3093  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3094  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3095  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3096  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3097  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3098  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3099  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3100  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3101  */
3102  __asm__ volatile (
3103  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3104  "li %[tmp0], 0x03 \n\t"
3105  "mtc1 %[tmp0], %[ftmp4] \n\t"
3106  "pshufh %[c], %[c], %[ftmp0] \n\t"
3107  "pshufh %[d], %[d], %[ftmp0] \n\t"
3108 
3109  "1: \n\t"
3110  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3111 
3112  "addiu %[h], %[h], -0x01 \n\t"
3113  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3114  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3115  "bnez %[h], 1b \n\t"
3116  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3117  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3118  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3119  [ftmp6]"=&f"(ftmp[6]),
3120  [tmp0]"=&r"(tmp[0]),
3121  RESTRICT_ASM_ALL64
3122  [src1]"=&r"(src1),
3123  [h]"+&r"(h),
3124  [dst]"+&r"(dst), [src]"+&r"(src),
3125  [c]"+&f"(c), [d]"+&f"(d)
3126  : [sstride]"r"((mips_reg)sstride),
3127  [dstride]"r"((mips_reg)dstride),
3128  [ff_pw_4]"f"(ff_pw_4)
3129  : "memory"
3130  );
3131 #else
3132  int c = 8 - my, d = my;
3133  int x, y;
3134 
3135  for (y = 0; y < h; y++) {
3136  for (x = 0; x < 8; x++)
3137  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3138  dst += dstride;
3139  src += sstride;
3140  }
3141 #endif
3142 }
3143 
3144 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3145  ptrdiff_t sstride, int h, int mx, int my)
3146 {
3147 #if 1
3148  DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
3149  uint8_t *tmp = tmp_array;
3150 
3151  ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
3152  ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
3153 #else
3154  int a = 8 - mx, b = mx;
3155  int c = 8 - my, d = my;
3156  int x, y;
3157  uint8_t tmp_array[136];
3158  uint8_t *tmp = tmp_array;
3159 
3160  for (y = 0; y < h + 1; y++) {
3161  for (x = 0; x < 8; x++)
3162  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3163  tmp += 8;
3164  src += sstride;
3165  }
3166 
3167  tmp = tmp_array;
3168 
3169  for (y = 0; y < h; y++) {
3170  for (x = 0; x < 8; x++)
3171  dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
3172  dst += dstride;
3173  tmp += 8;
3174  }
3175 #endif
3176 }
3177 
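Because each function keeps its scalar #else branch, the MMI paths are straightforward to validate against a plain-C reference. A hypothetical self-test for the 8-wide hv filter (not part of this file; assumes the prototype from vp8dsp_mips.h and mx, my in 1..7):

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vp8dsp_mips.h"

/* Scalar reference, identical in structure to the #else fallback above. */
static void ref8_hv(uint8_t *dst, const uint8_t *src, ptrdiff_t ss,
                    int h, int mx, int my)
{
    uint8_t tmp[8 * 17];
    for (int y = 0; y < h + 1; y++)
        for (int x = 0; x < 8; x++)
            tmp[8 * y + x] = ((8 - mx) * src[ss * y + x] +
                              mx * src[ss * y + x + 1] + 4) >> 3;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < 8; x++)
            dst[8 * y + x] = ((8 - my) * tmp[8 * y + x] +
                              my * tmp[8 * (y + 1) + x] + 4) >> 3;
}

/* Returns 1 if the MMI path matches the reference on one random block. */
static int test_bilinear8_hv(int mx, int my)
{
    uint8_t src[32 * 17], got[8 * 16], want[8 * 16];
    for (size_t i = 0; i < sizeof(src); i++)
        src[i] = rand() & 0xff;
    ff_put_vp8_bilinear8_hv_mmi(got, 8, src, 32, 16, mx, my);
    ref8_hv(want, src, 32, 16, mx, my);
    return !memcmp(got, want, sizeof(got));
}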
3178 void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3179  ptrdiff_t sstride, int h, int mx, int my)
3180 {
3181 #if 1
3182  int a = 8 - mx, b = mx;
3183  double ftmp[5];
3184  uint32_t tmp[1];
3185  DECLARE_VAR_LOW32;
3186  DECLARE_VAR_ALL64;
3187 
3188  /*
3189  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3190  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3191  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3192  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3193  */
3194  __asm__ volatile (
3195  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3196  "li %[tmp0], 0x03 \n\t"
3197  "mtc1 %[tmp0], %[ftmp4] \n\t"
3198  "pshufh %[a], %[a], %[ftmp0] \n\t"
3199  "pshufh %[b], %[b], %[ftmp0] \n\t"
3200 
3201  "1: \n\t"
3202  PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
3203 
3204  "addiu %[h], %[h], -0x01 \n\t"
3205  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3206  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3207  "bnez %[h], 1b \n\t"
3208  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3209  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3210  [ftmp4]"=&f"(ftmp[4]),
3211  [tmp0]"=&r"(tmp[0]),
3212  RESTRICT_ASM_LOW32
3213  RESTRICT_ASM_ALL64
3214  [h]"+&r"(h),
3215  [dst]"+&r"(dst), [src]"+&r"(src),
3216  [a]"+&f"(a), [b]"+&f"(b)
3217  : [sstride]"r"((mips_reg)sstride),
3218  [dstride]"r"((mips_reg)dstride),
3219  [ff_pw_4]"f"(ff_pw_4)
3220  : "memory"
3221  );
3222 #else
3223  int a = 8 - mx, b = mx;
3224  int x, y;
3225 
3226  for (y = 0; y < h; y++) {
3227  for (x = 0; x < 4; x++)
3228  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3229  dst += dstride;
3230  src += sstride;
3231  }
3232 #endif
3233 }
3234 
3235 void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3236  ptrdiff_t sstride, int h, int mx, int my)
3237 {
3238 #if 1
3239  int c = 8 - my, d = my;
3240  double ftmp[7];
3241  uint32_t tmp[1];
3242  mips_reg src1;
3243  DECLARE_VAR_LOW32;
3244  DECLARE_VAR_ALL64;
3245 
3246  /*
3247  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3248  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3249  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3250  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3251  */
3252  __asm__ volatile (
3253  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3254  "li %[tmp0], 0x03 \n\t"
3255  "mtc1 %[tmp0], %[ftmp4] \n\t"
3256  "pshufh %[c], %[c], %[ftmp0] \n\t"
3257  "pshufh %[d], %[d], %[ftmp0] \n\t"
3258 
3259  "1: \n\t"
3260  PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
3261 
3262  "addiu %[h], %[h], -0x01 \n\t"
3263  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3264  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3265  "bnez %[h], 1b \n\t"
3266  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3267  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3268  [ftmp4]"=&f"(ftmp[4]),
3269  [tmp0]"=&r"(tmp[0]),
3270  RESTRICT_ASM_LOW32
3271  RESTRICT_ASM_ALL64
3272  [src1]"=&r"(src1),
3273  [h]"+&r"(h),
3274  [dst]"+&r"(dst), [src]"+&r"(src),
3275  [c]"+&f"(c), [d]"+&f"(d)
3276  : [sstride]"r"((mips_reg)sstride),
3277  [dstride]"r"((mips_reg)dstride),
3278  [ff_pw_4]"f"(ff_pw_4)
3279  : "memory"
3280  );
3281 #else
3282  int c = 8 - my, d = my;
3283  int x, y;
3284 
3285  for (y = 0; y < h; y++) {
3286  for (x = 0; x < 4; x++)
3287  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3288  dst += dstride;
3289  src += sstride;
3290  }
3291 #endif
3292 }
3293 
3294 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3295  ptrdiff_t sstride, int h, int mx, int my)
3296 {
3297 #if 1
3298  DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3299  uint8_t *tmp = tmp_array;
3300 
3301  ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3302  ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3303 #else
3304  int a = 8 - mx, b = mx;
3305  int c = 8 - my, d = my;
3306  int x, y;
3307  uint8_t tmp_array[36];
3308  uint8_t *tmp = tmp_array;
3309 
3310  for (y = 0; y < h + 1; y++) {
3311  for (x = 0; x < 4; x++)
3312  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3313  tmp += 4;
3314  src += sstride;
3315  }
3316 
3317  tmp = tmp_array;
3318 
3319  for (y = 0; y < h; y++) {
3320  for (x = 0; x < 4; x++)
3321  dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
3322  dst += dstride;
3323  tmp += 4;
3324  }
3325 #endif
3326 }
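In the decoder these entry points are normally reached through the VP8 DSP function-pointer tables filled in by the MIPS init code rather than called directly. For illustration only, a direct call for one 16x16 block with subpel fractions mx = 2, my = 5 (3-bit units of 1/8 pel; 0 means no filtering on that axis, which is why the hv wrapper is used only when both are nonzero) might look like this hypothetical wrapper:

#include <stddef.h>
#include <stdint.h>
#include "vp8dsp_mips.h"

static void predict_16x16_bilin(uint8_t *dst, ptrdiff_t dstride,
                                uint8_t *src, ptrdiff_t sstride)
{
    /* h = 16 rows; mx = 2, my = 5 select the horizontal and vertical
     * bilinear weights (a,b) = (6,2) and (c,d) = (3,5). */
    ff_put_vp8_bilinear16_hv_mmi(dst, dstride, src, sstride, 16, 2, 5);
}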