FFmpeg
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
qpel.c
Go to the documentation of this file.
1 /*
2  * This is optimized for SH, which has post-increment addressing (*p++).
3  * Some CPUs may be faster with indexed access (p[n]) than with post-increment (*p++).
4  *
5  * copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "libavutil/common.h"
25 #include "libavcodec/copy_block.h"
26 #include "libavcodec/rnd_avg.h"
27 
/**
 * PIXOP2(OPNAME, OP) expands to a family of static inline helpers that merge
 * two (_l2) or four (_l4) source blocks and hand the merged value to OP
 * together with the destination, one 32-bit word (4 bytes) at a time.
 *
 * OPNAME  prefix pasted onto every generated function name.
 * OP      store operator, used as OP(destination lvalue, value).
 *
 * Naming of the generated functions:
 *   pixels4 / pixels8 / pixels16   bytes processed per row
 *   no_rnd                         merges with no_rnd_avg32() / no_rnd_PACK()
 *                                  instead of rnd_avg32() / rnd_PACK()
 *   _aligned                       every source is read with LPC()
 *   _aligned2 (l2), _aligned0 (l4) src1 is read with AV_RN32(), the remaining
 *                                  sources with LPC()
 *   _aligned1 (l2)                 forwards to _aligned2 with the two sources
 *                                  (and their strides) swapped, so src2 is
 *                                  the one read with AV_RN32()
 *
 * dst is always accessed through LP().  LP, LPC, AV_RN32, UNPACK, rnd_PACK,
 * no_rnd_PACK, rnd_avg32 and no_rnd_avg32 are not defined in this block.
 * NOTE(review): LP()/LPC() presumably require 4-byte aligned pointers while
 * AV_RN32() tolerates unaligned ones - confirm against their definitions.
 *
 * Every row loop is do { } while (--h), so callers must pass h >= 1.
 */
#define PIXOP2(OPNAME, OP) \
\
/* l2, 4 bytes per row, rounding; src1 and src2 read with LPC(). */ \
static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst), rnd_avg32(LPC(src1), LPC(src2))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l2, 4 bytes per row, rounding; src1 read with AV_RN32(), src2 with LPC(). */ \
static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst), rnd_avg32(AV_RN32(src1), LPC(src2))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l2, 16 bytes per row, no rounding; src1 read with AV_RN32(), src2 with LPC(). */ \
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst     ), no_rnd_avg32(AV_RN32(src1     ), LPC(src2     ))); \
        OP(LP(dst +  4), no_rnd_avg32(AV_RN32(src1 +  4), LPC(src2 +  4))); \
        OP(LP(dst +  8), no_rnd_avg32(AV_RN32(src1 +  8), LPC(src2 +  8))); \
        OP(LP(dst + 12), no_rnd_avg32(AV_RN32(src1 + 12), LPC(src2 + 12))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l2, 16 bytes per row, rounding; src1 read with AV_RN32(), src2 with LPC(). */ \
static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst     ), rnd_avg32(AV_RN32(src1     ), LPC(src2     ))); \
        OP(LP(dst +  4), rnd_avg32(AV_RN32(src1 +  4), LPC(src2 +  4))); \
        OP(LP(dst +  8), rnd_avg32(AV_RN32(src1 +  8), LPC(src2 +  8))); \
        OP(LP(dst + 12), rnd_avg32(AV_RN32(src1 + 12), LPC(src2 + 12))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l2, 8 bytes per row, no rounding; only src2 is treated as aligned. */ \
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst    ), no_rnd_avg32(AV_RN32(src1    ), LPC(src2    ))); \
        OP(LP(dst + 4), no_rnd_avg32(AV_RN32(src1 + 4), LPC(src2 + 4))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l2, 8 bytes per row, rounding; src1 read with AV_RN32(), src2 with LPC(). */ \
static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst    ), rnd_avg32(AV_RN32(src1    ), LPC(src2    ))); \
        OP(LP(dst + 4), rnd_avg32(AV_RN32(src1 + 4), LPC(src2 + 4))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l2, 8 bytes per row, no rounding; src1 and src2 read with LPC(). */ \
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst    ), no_rnd_avg32(LPC(src1    ), LPC(src2    ))); \
        OP(LP(dst + 4), no_rnd_avg32(LPC(src1 + 4), LPC(src2 + 4))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l2, 8 bytes per row, rounding; src1 and src2 read with LPC(). */ \
static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst    ), rnd_avg32(LPC(src1    ), LPC(src2    ))); \
        OP(LP(dst + 4), rnd_avg32(LPC(src1 + 4), LPC(src2 + 4))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l2, 16 bytes per row, no rounding; src1 and src2 read with LPC(). */ \
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst     ), no_rnd_avg32(LPC(src1     ), LPC(src2     ))); \
        OP(LP(dst +  4), no_rnd_avg32(LPC(src1 +  4), LPC(src2 +  4))); \
        OP(LP(dst +  8), no_rnd_avg32(LPC(src1 +  8), LPC(src2 +  8))); \
        OP(LP(dst + 12), no_rnd_avg32(LPC(src1 + 12), LPC(src2 + 12))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l2, 16 bytes per row, rounding; src1 and src2 read with LPC(). */ \
static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    do { \
        OP(LP(dst     ), rnd_avg32(LPC(src1     ), LPC(src2     ))); \
        OP(LP(dst +  4), rnd_avg32(LPC(src1 +  4), LPC(src2 +  4))); \
        OP(LP(dst +  8), rnd_avg32(LPC(src1 +  8), LPC(src2 +  8))); \
        OP(LP(dst + 12), rnd_avg32(LPC(src1 + 12), LPC(src2 + 12))); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* _aligned1 variants: same as _aligned2 with the two sources swapped. */ \
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    OPNAME ## _no_rnd_pixels16_l2_aligned2(dst, src2, src1, dst_stride, src_stride2, src_stride1, h); \
} \
\
static inline void OPNAME ## _pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    OPNAME ## _pixels16_l2_aligned2(dst, src2, src1, dst_stride, src_stride2, src_stride1, h); \
} \
\
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    OPNAME ## _no_rnd_pixels8_l2_aligned2(dst, src2, src1, dst_stride, src_stride2, src_stride1, h); \
} \
\
static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ \
    OPNAME ## _pixels8_l2_aligned2(dst, src2, src1, dst_stride, src_stride2, src_stride1, h); \
} \
\
/* l4, 8 bytes per row, rounding; all four sources read with LPC(). */ \
static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h) \
{ \
    do { \
        uint32_t a0, a1, a2, a3; \
        UNPACK(a0, a1, LPC(src1), LPC(src2)); \
        UNPACK(a2, a3, LPC(src3), LPC(src4)); \
        OP(LP(dst), rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, LPC(src1 + 4), LPC(src2 + 4)); \
        UNPACK(a2, a3, LPC(src3 + 4), LPC(src4 + 4)); \
        OP(LP(dst + 4), rnd_PACK(a0, a1, a2, a3)); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l4, 8 bytes per row, no rounding; all four sources read with LPC(). */ \
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h) \
{ \
    do { \
        uint32_t a0, a1, a2, a3; \
        UNPACK(a0, a1, LPC(src1), LPC(src2)); \
        UNPACK(a2, a3, LPC(src3), LPC(src4)); \
        OP(LP(dst), no_rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, LPC(src1 + 4), LPC(src2 + 4)); \
        UNPACK(a2, a3, LPC(src3 + 4), LPC(src4 + 4)); \
        OP(LP(dst + 4), no_rnd_PACK(a0, a1, a2, a3)); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l4, 8 bytes per row, rounding; only src1 is treated as unaligned. */ \
static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h) \
{ \
    do { \
        uint32_t a0, a1, a2, a3; \
        UNPACK(a0, a1, AV_RN32(src1), LPC(src2)); \
        UNPACK(a2, a3, LPC(src3), LPC(src4)); \
        OP(LP(dst), rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, AV_RN32(src1 + 4), LPC(src2 + 4)); \
        UNPACK(a2, a3, LPC(src3 + 4), LPC(src4 + 4)); \
        OP(LP(dst + 4), rnd_PACK(a0, a1, a2, a3)); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l4, 8 bytes per row, no rounding; src1 read with AV_RN32(). */ \
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h) \
{ \
    do { \
        uint32_t a0, a1, a2, a3; \
        UNPACK(a0, a1, AV_RN32(src1), LPC(src2)); \
        UNPACK(a2, a3, LPC(src3), LPC(src4)); \
        OP(LP(dst), no_rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, AV_RN32(src1 + 4), LPC(src2 + 4)); \
        UNPACK(a2, a3, LPC(src3 + 4), LPC(src4 + 4)); \
        OP(LP(dst + 4), no_rnd_PACK(a0, a1, a2, a3)); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l4, 16 bytes per row, rounding; all four sources read with LPC(). */ \
static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h) \
{ \
    do { \
        uint32_t a0, a1, a2, a3; \
        UNPACK(a0, a1, LPC(src1), LPC(src2)); \
        UNPACK(a2, a3, LPC(src3), LPC(src4)); \
        OP(LP(dst), rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, LPC(src1 + 4), LPC(src2 + 4)); \
        UNPACK(a2, a3, LPC(src3 + 4), LPC(src4 + 4)); \
        /* Word built from offset 4 goes to dst + 4 (was dst + 8, which left */ \
        /* bytes 4..7 unwritten and applied OP to bytes 8..11 twice). */ \
        OP(LP(dst + 4), rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, LPC(src1 + 8), LPC(src2 + 8)); \
        UNPACK(a2, a3, LPC(src3 + 8), LPC(src4 + 8)); \
        OP(LP(dst + 8), rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, LPC(src1 + 12), LPC(src2 + 12)); \
        UNPACK(a2, a3, LPC(src3 + 12), LPC(src4 + 12)); \
        OP(LP(dst + 12), rnd_PACK(a0, a1, a2, a3)); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l4, 16 bytes per row, no rounding; all four sources read with LPC(). */ \
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h) \
{ \
    do { \
        uint32_t a0, a1, a2, a3; \
        UNPACK(a0, a1, LPC(src1), LPC(src2)); \
        UNPACK(a2, a3, LPC(src3), LPC(src4)); \
        OP(LP(dst), no_rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, LPC(src1 + 4), LPC(src2 + 4)); \
        UNPACK(a2, a3, LPC(src3 + 4), LPC(src4 + 4)); \
        OP(LP(dst + 4), no_rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, LPC(src1 + 8), LPC(src2 + 8)); \
        UNPACK(a2, a3, LPC(src3 + 8), LPC(src4 + 8)); \
        OP(LP(dst + 8), no_rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, LPC(src1 + 12), LPC(src2 + 12)); \
        UNPACK(a2, a3, LPC(src3 + 12), LPC(src4 + 12)); \
        OP(LP(dst + 12), no_rnd_PACK(a0, a1, a2, a3)); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l4, 16 bytes per row, rounding; src1 is unaligned and read with AV_RN32(). */ \
static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h) \
{ \
    do { \
        uint32_t a0, a1, a2, a3; \
        UNPACK(a0, a1, AV_RN32(src1), LPC(src2)); \
        UNPACK(a2, a3, LPC(src3), LPC(src4)); \
        OP(LP(dst), rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, AV_RN32(src1 + 4), LPC(src2 + 4)); \
        UNPACK(a2, a3, LPC(src3 + 4), LPC(src4 + 4)); \
        /* Word built from offset 4 goes to dst + 4 (was dst + 8, which left */ \
        /* bytes 4..7 unwritten and applied OP to bytes 8..11 twice). */ \
        OP(LP(dst + 4), rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, AV_RN32(src1 + 8), LPC(src2 + 8)); \
        UNPACK(a2, a3, LPC(src3 + 8), LPC(src4 + 8)); \
        OP(LP(dst + 8), rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, AV_RN32(src1 + 12), LPC(src2 + 12)); \
        UNPACK(a2, a3, LPC(src3 + 12), LPC(src4 + 12)); \
        OP(LP(dst + 12), rnd_PACK(a0, a1, a2, a3)); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* l4, 16 bytes per row, no rounding; src1 read with AV_RN32(). */ \
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h) \
{ \
    do { \
        uint32_t a0, a1, a2, a3; \
        UNPACK(a0, a1, AV_RN32(src1), LPC(src2)); \
        UNPACK(a2, a3, LPC(src3), LPC(src4)); \
        OP(LP(dst), no_rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, AV_RN32(src1 + 4), LPC(src2 + 4)); \
        UNPACK(a2, a3, LPC(src3 + 4), LPC(src4 + 4)); \
        OP(LP(dst + 4), no_rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, AV_RN32(src1 + 8), LPC(src2 + 8)); \
        UNPACK(a2, a3, LPC(src3 + 8), LPC(src4 + 8)); \
        OP(LP(dst + 8), no_rnd_PACK(a0, a1, a2, a3)); \
        UNPACK(a0, a1, AV_RN32(src1 + 12), LPC(src2 + 12)); \
        UNPACK(a2, a3, LPC(src3 + 12), LPC(src4 + 12)); \
        OP(LP(dst + 12), no_rnd_PACK(a0, a1, a2, a3)); \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
}
/*
 * Store operators passed to PIXOP2 as OP; "a" is the destination lvalue and
 * "b" the freshly computed 32-bit word:
 *   op_avg  replaces a with rnd_avg32(a, b);
 *   op_put  overwrites a with b.
 */
319 #define op_avg(a, b) a = rnd_avg32(a,b)
320 #define op_put(a, b) a = b
321 
/* Generate the avg_* and put_* helper families, then drop the temporary operators. */
322 PIXOP2(avg, op_avg)
323 PIXOP2(put, op_put)
324 #undef op_avg
325 #undef op_put
326 
/*
 * Rounded integer averages of two and four values: the sum plus half the
 * divisor, shifted right by 1 or 2.  Every argument is parenthesized so that
 * expressions with lower precedence than '+' (e.g. a | b, c ? x : y) expand
 * correctly.  Each argument is evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
329 
330 
/**
 * Weighted 2x2 blend of an 8-byte-wide block.
 *
 * Each output byte is built from the source byte at the same position, its
 * right neighbour, and the two bytes one stride below them.  The four weights
 * are derived from x16 and y16 and always sum to 16 * 16 = 256; rounder is
 * added before the sum is shifted right by 8.
 *
 * @param dst     destination; 8 bytes are written per row
 * @param src     source; 9 bytes are read from each of two consecutive rows
 * @param stride  byte distance between rows, used for both dst and src
 * @param h       number of rows; must be >= 1 (the row loop is do/while)
 * @param x16     horizontal weight, in sixteenths, of the right-hand column
 * @param y16     vertical weight, in sixteenths, of the lower row
 * @param rounder value added to the weighted sum before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16 - x16) * (16 - y16);
    const int w_top_right = x16 * (16 - y16);
    const int w_bot_left  = (16 - x16) * y16;
    const int w_bot_right = x16 * y16;

    do {
        const uint8_t *top = src;
        const uint8_t *bot = src + stride;
        /* Left column of the current 2x2 window; carried between outputs so
         * every source byte is loaded exactly once per row. */
        int top_left = *top++;
        int bot_left = *bot++;
        int x;

        for (x = 0; x < 8; x++) {
            const int top_right = *top++;
            const int bot_right = *bot++;

            dst[x] = (w_top_left * top_left + w_top_right * top_right +
                      w_bot_left * bot_left + w_bot_right * bot_right + rounder) >> 8;

            /* The right column becomes the left column of the next window. */
            top_left = top_right;
            bot_left = bot_right;
        }

        dst += stride;
        src += stride;
    } while (--h);
}
363 
/**
 * QPEL_MC(r, OPNAME, RND, OP) generates, for one store operator, the 8- and
 * 16-wide low-pass filters and the sixteen OPNAME ## qpel{8,16}_mcXY_sh4
 * functions built from them.
 *
 * r       not referenced by the expansion.
 * OPNAME  prefix of every generated function (the callers in this file pass
 *         it with a trailing underscore).
 * RND     pasted between "put" and the helper name to select the helpers
 *         used for intermediate buffers (put ## RND ## ...).
 * OP      store operator, used as OP(destination lvalue, filter sum); it may
 *         reference the local table pointer "cm" declared in each filter.
 *
 * Low-pass filters: every output is
 *     20*(inner pair) - 6*(next pair) + 3*(next pair) - (outer pair)
 * over the 9 (or 17) input samples of a row/column; taps that would fall
 * outside that window reuse samples from inside it, as spelled out in the
 * expressions below.  Loads and stores are interleaved exactly as written.
 *
 * NOTE(review): the mcXY suffix presumably encodes the quarter-pel offset
 * (X horizontal, Y vertical) - confirm against the code that installs these
 * functions.
 * NOTE(review): the *_aligned helpers are handed the on-stack buffers
 * declared here as plain uint8_t arrays; presumably those helpers need
 * 4-byte alignment - confirm that the arrays are suitably aligned.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal filter: reads 9 bytes per row, emits 8; h rows (h >= 1). */ \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) \
{ \
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; \
    do { \
        uint8_t *in = src; \
        int src0, src1, src2, src3, src4, src5, src6, src7, src8; \
        src0 = *in++; \
        src1 = *in++; \
        src2 = *in++; \
        src3 = *in++; \
        src4 = *in++; \
        OP(dst[0], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
        src5 = *in++; \
        OP(dst[1], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
        src6 = *in++; \
        OP(dst[2], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
        src7 = *in++; \
        OP(dst[3], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
        src8 = *in++; \
        OP(dst[4], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
        OP(dst[5], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
        OP(dst[6], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
        OP(dst[7], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
        dst += dstStride; \
        src += srcStride; \
    } while (--h); \
} \
\
/* Vertical filter: for each of 8 columns, reads 9 rows and emits 8. */ \
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride) \
{ \
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; \
    int cols_left; \
    for (cols_left = 8; cols_left > 0; cols_left--) { \
        uint8_t *in  = src; \
        uint8_t *out = dst; \
        int src0, src1, src2, src3, src4, src5, src6, src7, src8; \
        src0 = *in; in += srcStride; \
        src1 = *in; in += srcStride; \
        src2 = *in; in += srcStride; \
        src3 = *in; in += srcStride; \
        src4 = *in; in += srcStride; \
        OP(*out, (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); out += dstStride; \
        src5 = *in; in += srcStride; \
        OP(*out, (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); out += dstStride; \
        src6 = *in; in += srcStride; \
        OP(*out, (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); out += dstStride; \
        src7 = *in; in += srcStride; \
        OP(*out, (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); out += dstStride; \
        src8 = *in; \
        OP(*out, (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); out += dstStride; \
        OP(*out, (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); out += dstStride; \
        OP(*out, (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); out += dstStride; \
        OP(*out, (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
        dst++; \
        src++; \
    } \
} \
\
/* Horizontal filter: reads 17 bytes per row, emits 16; h rows (h >= 1). */ \
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) \
{ \
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; \
    do { \
        uint8_t *in = src; \
        int src0, src1, src2, src3, src4, src5, src6, src7, src8; \
        int src9, src10, src11, src12, src13, src14, src15, src16; \
        src0 = *in++; \
        src1 = *in++; \
        src2 = *in++; \
        src3 = *in++; \
        src4 = *in++; \
        OP(dst[ 0], (src0  + src1 ) * 20 - (src0  + src2 ) * 6 + (src1  + src3 ) * 3 - (src2  + src4 )); \
        src5 = *in++; \
        OP(dst[ 1], (src1  + src2 ) * 20 - (src0  + src3 ) * 6 + (src0  + src4 ) * 3 - (src1  + src5 )); \
        src6 = *in++; \
        OP(dst[ 2], (src2  + src3 ) * 20 - (src1  + src4 ) * 6 + (src0  + src5 ) * 3 - (src0  + src6 )); \
        src7 = *in++; \
        OP(dst[ 3], (src3  + src4 ) * 20 - (src2  + src5 ) * 6 + (src1  + src6 ) * 3 - (src0  + src7 )); \
        src8 = *in++; \
        OP(dst[ 4], (src4  + src5 ) * 20 - (src3  + src6 ) * 6 + (src2  + src7 ) * 3 - (src1  + src8 )); \
        src9 = *in++; \
        OP(dst[ 5], (src5  + src6 ) * 20 - (src4  + src7 ) * 6 + (src3  + src8 ) * 3 - (src2  + src9 )); \
        src10 = *in++; \
        OP(dst[ 6], (src6  + src7 ) * 20 - (src5  + src8 ) * 6 + (src4  + src9 ) * 3 - (src3  + src10)); \
        src11 = *in++; \
        OP(dst[ 7], (src7  + src8 ) * 20 - (src6  + src9 ) * 6 + (src5  + src10) * 3 - (src4  + src11)); \
        src12 = *in++; \
        OP(dst[ 8], (src8  + src9 ) * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); \
        src13 = *in++; \
        OP(dst[ 9], (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); \
        src14 = *in++; \
        OP(dst[10], (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); \
        src15 = *in++; \
        OP(dst[11], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); \
        src16 = *in++; \
        OP(dst[12], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); \
        OP(dst[13], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
        OP(dst[14], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
        OP(dst[15], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
        dst += dstStride; \
        src += srcStride; \
    } while (--h); \
} \
\
/* Vertical filter: for each of 16 columns, reads 17 rows and emits 16. */ \
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride) \
{ \
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; \
    int cols_left; \
    for (cols_left = 16; cols_left > 0; cols_left--) { \
        uint8_t *in  = src; \
        uint8_t *out = dst; \
        int src0, src1, src2, src3, src4, src5, src6, src7, src8; \
        int src9, src10, src11, src12, src13, src14, src15, src16; \
        src0 = *in; in += srcStride; \
        src1 = *in; in += srcStride; \
        src2 = *in; in += srcStride; \
        src3 = *in; in += srcStride; \
        src4 = *in; in += srcStride; \
        OP(*out, (src0  + src1 ) * 20 - (src0  + src2 ) * 6 + (src1  + src3 ) * 3 - (src2  + src4 )); out += dstStride; \
        src5 = *in; in += srcStride; \
        OP(*out, (src1  + src2 ) * 20 - (src0  + src3 ) * 6 + (src0  + src4 ) * 3 - (src1  + src5 )); out += dstStride; \
        src6 = *in; in += srcStride; \
        OP(*out, (src2  + src3 ) * 20 - (src1  + src4 ) * 6 + (src0  + src5 ) * 3 - (src0  + src6 )); out += dstStride; \
        src7 = *in; in += srcStride; \
        OP(*out, (src3  + src4 ) * 20 - (src2  + src5 ) * 6 + (src1  + src6 ) * 3 - (src0  + src7 )); out += dstStride; \
        src8 = *in; in += srcStride; \
        OP(*out, (src4  + src5 ) * 20 - (src3  + src6 ) * 6 + (src2  + src7 ) * 3 - (src1  + src8 )); out += dstStride; \
        src9 = *in; in += srcStride; \
        OP(*out, (src5  + src6 ) * 20 - (src4  + src7 ) * 6 + (src3  + src8 ) * 3 - (src2  + src9 )); out += dstStride; \
        src10 = *in; in += srcStride; \
        OP(*out, (src6  + src7 ) * 20 - (src5  + src8 ) * 6 + (src4  + src9 ) * 3 - (src3  + src10)); out += dstStride; \
        src11 = *in; in += srcStride; \
        OP(*out, (src7  + src8 ) * 20 - (src6  + src9 ) * 6 + (src5  + src10) * 3 - (src4  + src11)); out += dstStride; \
        src12 = *in; in += srcStride; \
        OP(*out, (src8  + src9 ) * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); out += dstStride; \
        src13 = *in; in += srcStride; \
        OP(*out, (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); out += dstStride; \
        src14 = *in; in += srcStride; \
        OP(*out, (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); out += dstStride; \
        src15 = *in; in += srcStride; \
        OP(*out, (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); out += dstStride; \
        src16 = *in; \
        OP(*out, (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); out += dstStride; \
        OP(*out, (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); out += dstStride; \
        OP(*out, (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); out += dstStride; \
        OP(*out, (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
        dst++; \
        src++; \
    } \
} \
\
/* ---- 8x8 block functions ---- */ \
\
/* Plain 8x8 transfer through the external pixels8_c helper. */ \
static void OPNAME ## qpel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride) \
{ \
    OPNAME ## pixels8_c(dst, src, stride, 8); \
} \
\
/* Merge of src and its horizontally filtered copy. */ \
static void OPNAME ## qpel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t half[64]; \
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
    OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8); \
} \
\
/* Horizontal filter applied straight into dst. */ \
static void OPNAME ## qpel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8); \
} \
\
/* Merge of src shifted one byte right and the horizontally filtered copy. */ \
static void OPNAME ## qpel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t half[64]; \
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
    OPNAME ## pixels8_l2_aligned2(dst, src + 1, half, stride, stride, 8, 8); \
} \
\
/* Merge of the copied block and its vertically filtered copy. */ \
static void OPNAME ## qpel8_mc01_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[16 * 9]; \
    uint8_t half[64]; \
    copy_block9(full, src, 16, stride, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
    OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8); \
} \
\
/* Vertical filter of the copied block applied straight into dst. */ \
static void OPNAME ## qpel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[16 * 9]; \
    copy_block9(full, src, 16, stride, 9); \
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16); \
} \
\
/* Like mc01, but merging with the copied block one row down. */ \
static void OPNAME ## qpel8_mc03_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[16 * 9]; \
    uint8_t half[64]; \
    copy_block9(full, src, 16, stride, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
    OPNAME ## pixels8_l2_aligned(dst, full + 16, half, stride, 16, 8, 8); \
} \
\
/* halfH = merge(h-filter, block); dst = merge(halfH, v-filter of halfH). */ \
static void OPNAME ## qpel8_mc11_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[16 * 9]; \
    uint8_t halfH[72]; \
    uint8_t halfHV[64]; \
    copy_block9(full, src, 16, stride, 9); \
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8); \
} \
\
/* Like mc11, but halfH is merged with the block shifted one byte right. */ \
static void OPNAME ## qpel8_mc31_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[16 * 9]; \
    uint8_t halfH[72]; \
    uint8_t halfHV[64]; \
    copy_block9(full, src, 16, stride, 9); \
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full + 1, 8, 8, 16, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8); \
} \
\
/* Like mc11, but the final merge uses halfH one row down. */ \
static void OPNAME ## qpel8_mc13_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[16 * 9]; \
    uint8_t halfH[72]; \
    uint8_t halfHV[64]; \
    copy_block9(full, src, 16, stride, 9); \
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_aligned(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
} \
\
/* Like mc31, but the final merge uses halfH one row down. */ \
static void OPNAME ## qpel8_mc33_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[16 * 9]; \
    uint8_t halfH[72]; \
    uint8_t halfHV[64]; \
    copy_block9(full, src, 16, stride, 9); \
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full + 1, 8, 8, 16, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_aligned(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
} \
\
/* dst = merge(h-filter of src, v-filter of that h-filter). */ \
static void OPNAME ## qpel8_mc21_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t halfH[72]; \
    uint8_t halfHV[64]; \
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8); \
} \
\
/* Like mc21, but the final merge uses halfH one row down. */ \
static void OPNAME ## qpel8_mc23_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t halfH[72]; \
    uint8_t halfHV[64]; \
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_aligned(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
} \
\
/* halfH = merge(h-filter, block); dst = v-filter of halfH. */ \
static void OPNAME ## qpel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[16 * 9]; \
    uint8_t halfH[72]; \
    copy_block9(full, src, 16, stride, 9); \
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9); \
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
} \
\
/* Like mc12, but halfH is merged with the block shifted one byte right. */ \
static void OPNAME ## qpel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[16 * 9]; \
    uint8_t halfH[72]; \
    copy_block9(full, src, 16, stride, 9); \
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full + 1, 8, 8, 16, 9); \
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
} \
\
/* dst = v-filter of the h-filter of src. */ \
static void OPNAME ## qpel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t halfH[72]; \
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
} \
\
/* ---- 16x16 block functions (same structure as the 8x8 ones) ---- */ \
\
/* Plain 16x16 transfer through the external pixels16_c helper. */ \
static void OPNAME ## qpel16_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride) \
{ \
    OPNAME ## pixels16_c(dst, src, stride, 16); \
} \
\
/* Merge of src and its horizontally filtered copy. */ \
static void OPNAME ## qpel16_mc10_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t half[256]; \
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
    OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16); \
} \
\
/* Horizontal filter applied straight into dst. */ \
static void OPNAME ## qpel16_mc20_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16); \
} \
\
/* Merge of src shifted one byte right and the horizontally filtered copy. */ \
static void OPNAME ## qpel16_mc30_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t half[256]; \
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
    OPNAME ## pixels16_l2_aligned2(dst, src + 1, half, stride, stride, 16, 16); \
} \
\
/* Merge of the copied block and its vertically filtered copy. */ \
static void OPNAME ## qpel16_mc01_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[24 * 17]; \
    uint8_t half[256]; \
    copy_block17(full, src, 24, stride, 17); \
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
    OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16); \
} \
\
/* Vertical filter of the copied block applied straight into dst. */ \
static void OPNAME ## qpel16_mc02_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[24 * 17]; \
    copy_block17(full, src, 24, stride, 17); \
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24); \
} \
\
/* Like mc01, but merging with the copied block one row down. */ \
static void OPNAME ## qpel16_mc03_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[24 * 17]; \
    uint8_t half[256]; \
    copy_block17(full, src, 24, stride, 17); \
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
    OPNAME ## pixels16_l2_aligned(dst, full + 24, half, stride, 24, 16, 16); \
} \
\
/* halfH = merge(h-filter, block); dst = merge(halfH, v-filter of halfH). */ \
static void OPNAME ## qpel16_mc11_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[24 * 17]; \
    uint8_t halfH[272]; \
    uint8_t halfHV[256]; \
    copy_block17(full, src, 24, stride, 17); \
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17); \
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16); \
} \
\
/* Like mc11, but halfH is merged with the block shifted one byte right. */ \
static void OPNAME ## qpel16_mc31_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[24 * 17]; \
    uint8_t halfH[272]; \
    uint8_t halfHV[256]; \
    copy_block17(full, src, 24, stride, 17); \
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full + 1, 16, 16, 24, 17); \
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16); \
} \
\
/* Like mc11, but the final merge uses halfH one row down. */ \
static void OPNAME ## qpel16_mc13_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[24 * 17]; \
    uint8_t halfH[272]; \
    uint8_t halfHV[256]; \
    copy_block17(full, src, 24, stride, 17); \
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17); \
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
    OPNAME ## pixels16_l2_aligned(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
} \
\
/* Like mc31, but the final merge uses halfH one row down. */ \
static void OPNAME ## qpel16_mc33_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[24 * 17]; \
    uint8_t halfH[272]; \
    uint8_t halfHV[256]; \
    copy_block17(full, src, 24, stride, 17); \
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full + 1, 16, 16, 24, 17); \
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
    OPNAME ## pixels16_l2_aligned(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
} \
\
/* dst = merge(h-filter of src, v-filter of that h-filter). */ \
static void OPNAME ## qpel16_mc21_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t halfH[272]; \
    uint8_t halfHV[256]; \
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16); \
} \
\
/* Like mc21, but the final merge uses halfH one row down. */ \
static void OPNAME ## qpel16_mc23_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t halfH[272]; \
    uint8_t halfHV[256]; \
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
    OPNAME ## pixels16_l2_aligned(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
} \
\
/* halfH = merge(h-filter, block); dst = v-filter of halfH. */ \
static void OPNAME ## qpel16_mc12_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[24 * 17]; \
    uint8_t halfH[272]; \
    copy_block17(full, src, 24, stride, 17); \
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17); \
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
} \
\
/* Like mc12, but halfH is merged with the block shifted one byte right. */ \
static void OPNAME ## qpel16_mc32_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t full[24 * 17]; \
    uint8_t halfH[272]; \
    copy_block17(full, src, 24, stride, 17); \
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full + 1, 16, 16, 24, 17); \
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
} \
\
/* dst = v-filter of the h-filter of src. */ \
static void OPNAME ## qpel16_mc22_sh4(uint8_t *dst, uint8_t *src, int stride) \
{ \
    uint8_t halfH[272]; \
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
}
741 
742 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
743 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
744 #define op_put(a, b) a = cm[((b) + 16)>>5]
745 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
746 
747 QPEL_MC(0, put_ , _ , op_put)
748 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
749 QPEL_MC(0, avg_ , _ , op_avg)
750 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
751 #undef op_avg
752 #undef op_avg_no_rnd
753 #undef op_put
754 #undef op_put_no_rnd
755 
756 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
758 
759  do{
760  int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
761  uint8_t *s = src;
762  src_1 = s[-1];
763  src0 = *s++;
764  src1 = *s++;
765  src2 = *s++;
766  dst[0]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
767  src3 = *s++;
768  dst[1]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
769  src4 = *s++;
770  dst[2]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
771  src5 = *s++;
772  dst[3]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
773  src6 = *s++;
774  dst[4]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
775  src7 = *s++;
776  dst[5]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
777  src8 = *s++;
778  dst[6]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
779  src9 = *s++;
780  dst[7]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
781  dst+=dstStride;
782  src+=srcStride;
783  }while(--h);
784 }
785 
786 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
788 
789  do{
790  int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
791  uint8_t *s = src,*d = dst;
792  src_1 = *(s-srcStride);
793  src0 = *s; s+=srcStride;
794  src1 = *s; s+=srcStride;
795  src2 = *s; s+=srcStride;
796  *d= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; d+=dstStride;
797  src3 = *s; s+=srcStride;
798  *d= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4]; d+=dstStride;
799  src4 = *s; s+=srcStride;
800  *d= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4]; d+=dstStride;
801  src5 = *s; s+=srcStride;
802  *d= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4]; d+=dstStride;
803  src6 = *s; s+=srcStride;
804  *d= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4]; d+=dstStride;
805  src7 = *s; s+=srcStride;
806  *d= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4]; d+=dstStride;
807  src8 = *s; s+=srcStride;
808  *d= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4]; d+=dstStride;
809  src9 = *s;
810  *d= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4]; d+=dstStride;
811  src++;
812  dst++;
813  }while(--w);
814 }
815 
/**
 * mspel 8-wide case mc00: no filtering is done here; the request is
 * forwarded to put_pixels8_c() for 8 rows with the same stride for source
 * and destination.
 */
static void put_mspel8_mc00_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
819 
/**
 * mspel 8-wide case mc10: the source is filtered horizontally into a packed
 * 8x8 scratch buffer, which is then combined with the unfiltered source
 * (starting at src) by put_pixels8_l2_aligned2().
 *
 * NOTE(review): put_pixels8_l2_aligned2() is defined outside this view; by
 * analogy with the pixels4_l2 helpers at the top of the file it presumably
 * stores the rounded average of its two inputs - confirm.
 */
static void put_mspel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];    /* 8 rows, scratch stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src, filtered, stride, stride, 8, 8);
}
825 
/**
 * mspel 8-wide case mc20: the horizontally filtered block is the final
 * result, so the filter writes 8 rows directly into dst.
 */
static void put_mspel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
829 
/**
 * mspel 8-wide case mc30: the source is filtered horizontally into a packed
 * 8x8 scratch buffer, which is then combined with the unfiltered source
 * shifted one pixel to the right (src + 1) by put_pixels8_l2_aligned2().
 *
 * NOTE(review): put_pixels8_l2_aligned2() is defined outside this view; by
 * analogy with the pixels4_l2 helpers at the top of the file it presumably
 * stores the rounded average of its two inputs - confirm.
 */
static void put_mspel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];    /* 8 rows, scratch stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src + 1, filtered, stride, stride, 8, 8);
}
835 
/**
 * mspel 8-wide case mc02: the vertically filtered block is the final
 * result, so the filter writes 8 columns directly into dst.
 */
static void put_mspel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
839 
/**
 * mspel 8-wide case mc12: combines a vertically filtered copy of the source
 * with a copy that was filtered horizontally and then vertically.
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical pass that follows reads one row above and further rows
 * below the 8 rows it produces; it is therefore started at the second
 * scratch row.
 *
 * NOTE(review): put_pixels8_l2_aligned() is defined outside this view; by
 * analogy with the pixels4_l2 helpers at the top of the file it presumably
 * stores the rounded average of its two inputs - confirm.
 */
static void put_mspel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hpass[11 * 8];      /* 11 horizontally filtered rows, stride 8 */
    uint8_t vpass[8 * 8];       /* vertical filter of the plain source     */
    uint8_t hvpass[8 * 8];      /* vertical filter of hpass                */

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass + 8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, vpass, hvpass, stride, 8, 8, 8);
}
/**
 * mspel 8-wide case mc32: combines a vertically filtered copy of the source
 * taken one pixel to the right (src + 1) with a copy that was filtered
 * horizontally and then vertically.
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical pass that follows reads one row above and further rows
 * below the 8 rows it produces; it is therefore started at the second
 * scratch row.
 *
 * NOTE(review): put_pixels8_l2_aligned() is defined outside this view; by
 * analogy with the pixels4_l2 helpers at the top of the file it presumably
 * stores the rounded average of its two inputs - confirm.
 */
static void put_mspel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hpass[11 * 8];      /* 11 horizontally filtered rows, stride 8 */
    uint8_t vpass[8 * 8];       /* vertical filter of the shifted source   */
    uint8_t hvpass[8 * 8];      /* vertical filter of hpass                */

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass + 8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, vpass, hvpass, stride, 8, 8, 8);
}
/**
 * mspel 8-wide case mc22: horizontal filtering followed by vertical
 * filtering, with the second pass writing straight into dst.
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical pass reads one row above and further rows below the 8 rows
 * it produces; it is therefore started at the second scratch row.
 */
static void put_mspel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hpass[11 * 8];      /* 11 horizontally filtered rows, stride 8 */

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hpass + 8, stride, 8, 8);
}