h264_qpel.c
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 * Copyright (c) 2011 Daniel Kang
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264qpel.h"
#include "libavcodec/pixels.h"
#include "fpel.h"

#if HAVE_X86ASM
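/* Fullpel averaging helpers implemented in asm: the *_l2 functions write the
 * rounded average of src1 and src2 to dst (the avg_ variants additionally
 * average that with the existing dst contents). The qpel code below uses them
 * to blend a lowpass-filtered half-pel plane with a full-pel or offset plane,
 * which yields the quarter-pel positions. */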
void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
#define ff_put_pixels8_l2_sse2  ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
#define ff_put_pixels16_mmxext  ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext   ff_put_pixels8_mmx
#define ff_put_pixels4_mmxext   ff_put_pixels4_mmx
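/* Prototypes for the asm lowpass kernels. The _h_ and _v_ variants apply the
 * H.264 6-tap (1,-5,20,20,-5,1)/32 luma filter horizontally resp. vertically;
 * for the horizontal case, each output sample amounts to (a scalar sketch of
 * what the asm is expected to compute, not the asm itself):
 *
 *     dst[x] = av_clip_uint8((src[x-2] - 5*src[x-1] + 20*src[x] +
 *                             20*src[x+1] - 5*src[x+2] + src[x+3] + 16) >> 5);
 *
 * The _hv_ kernels stage the 2-D filter through a 16-bit tmp buffer, and the
 * _l2 variants fold in a final averaging step against src2. */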
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);

DEF_QPEL(avg)
DEF_QPEL(put)
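/* First stage of the 2-D (hv) half-pel filter: run the vertical 6-tap pass
 * over the padded source (two rows/columns of context on each side) into the
 * 16-bit tmp buffer, four columns per asm call; the hv2 stage later reads
 * tmp, applies the horizontal pass and renormalizes. */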
static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_mmxext(int16_t *tmp, const uint8_t *src, int tmpStride, int srcStride, int size)
{
    int w = (size + 8) >> 2;
    src -= 2 * srcStride + 2;
    while (w--) {
        ff_put_h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);
        tmp += 4;
        src += 4;
    }
}
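/* QPEL_H264 instantiates the C-level wrappers for one operation (put/avg):
 * 16-wide/16-tall variants are composed from two 8-wide/8-tall kernel calls,
 * and the hv variants chain the hv1 (vertical, into tmp) and hv2
 * (horizontal, out of tmp) passes. */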
#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
        tmp += 4;\
        src += 4;\
    }\
    tmp -= 3*4;\
    ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
    src += 4;\
    dst += 4;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
        ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
        tmp += 8;\
        dst += 8;\
    }while(w--);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride,  8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\

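/* On x86-64 the 16-wide SSSE3 h_lowpass_l2 kernels exist directly in asm;
 * on x86-32 the macro composes them from two 8-wide calls instead. */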
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\

void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);

#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride,  8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
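/* SSE2 variant of the hv1 stage: same staging as the mmxext helper above,
 * but the asm kernel covers eight columns per call. */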
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
                                                                 const uint8_t *src,
                                                                 int tmpStride,
                                                                 int srcStride,
                                                                 int size)
{
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
        tmp += 8;
        src += 8;
    }
}

#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define ff_put_h264_qpel8_h_lowpass_l2_sse2  ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2  ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext

#define ff_put_h264_qpel8_v_lowpass_ssse3  ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3  ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2

#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
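/* The H264_MC_* macros generate the 16 motion-compensation entry points per
 * block size. In the mcXY naming, X is the horizontal and Y the vertical
 * offset in quarter samples: mc00 is the fullpel copy, mc20/mc02 the
 * horizontal/vertical half-pel filters, and the remaining positions are
 * built by averaging two intermediate planes. */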
#define H264_MC_C_H(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\

#define H264_MC_C_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext

#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

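/* Vertical positions: mc02 is the plain vertical half-pel filter; mc01 and
 * mc03 filter into a temporary and average it with the unshifted source
 * resp. the source one row down. */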
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

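/* Mixed positions: mc11/mc31/mc13/mc33 average a vertically filtered plane
 * with a horizontal lowpass, mc22 is the pure 2-D filter, and
 * mc21/mc23/mc12/mc32 reuse the centre half-pel plane (halfHV); mc12/mc32
 * additionally recover the vertical half-pel from the 16-bit intermediates
 * (halfV) via the >>5 renormalization in the pixels*_l2_shift5 helpers. */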
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((uintptr_t)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((uintptr_t)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((uintptr_t)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((uintptr_t)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC(QPEL, SIZE, MMX, ALIGN)\
QPEL(put_, SIZE, MMX, ALIGN) \
QPEL(avg_, SIZE, MMX, ALIGN) \

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

QPEL_H264(put_,        PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
QPEL_H264_V_XMM(put_,        PUT_OP, sse2)
QPEL_H264_V_XMM(avg_, AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_,        PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_, AVG_MMXEXT_OP, sse2)
QPEL_H264_H_XMM(put_,        PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_, AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_,        PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_, AVG_MMXEXT_OP, ssse3)

H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8)
H264_MC(H264_MC_C_H,      8, mmxext, 8)
H264_MC(H264_MC_C_H,     16, mmxext, 8)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)

// 10-bit variants: prototypes only, the function bodies live in the x86 asm
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

#define LUMA_MC_4(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  4, DEPTH, TYPE, OPT)

#define LUMA_MC_816(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

LUMA_MC_4(10, mc00, mmxext)
LUMA_MC_4(10, mc10, mmxext)
LUMA_MC_4(10, mc20, mmxext)
LUMA_MC_4(10, mc30, mmxext)
LUMA_MC_4(10, mc01, mmxext)
LUMA_MC_4(10, mc11, mmxext)
LUMA_MC_4(10, mc21, mmxext)
LUMA_MC_4(10, mc31, mmxext)
LUMA_MC_4(10, mc02, mmxext)
LUMA_MC_4(10, mc12, mmxext)
LUMA_MC_4(10, mc22, mmxext)
LUMA_MC_4(10, mc32, mmxext)
LUMA_MC_4(10, mc03, mmxext)
LUMA_MC_4(10, mc13, mmxext)
LUMA_MC_4(10, mc23, mmxext)
LUMA_MC_4(10, mc33, mmxext)

LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)

#endif /* HAVE_X86ASM */

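/* Table wiring: {put,avg}_h264_qpel_pixels_tab[IDX][i] holds one function per
 * quarter-pel position, with IDX selecting the block size (0: 16x16, 1: 8x8,
 * 2: 4x4) and i = x + 4*y indexing the (x, y) quarter-pel offset; a caller
 * fetches e.g. tab[1][2] for the 8x8 mc20 (horizontal half-pel) function. */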
#define SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX) \
    do { \
        c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    } while (0)
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
    do { \
        SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX); \
        c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU) \
    do { \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU) \
    do { \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
    } while (0)

av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
{
#if HAVE_X86ASM
    int high_bit_depth = bit_depth > 8;
    int cpu_flags      = av_get_cpu_flags();

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        if (!high_bit_depth) {
            SET_QPEL_FUNCS0123(put_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS0123(put_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
            SET_QPEL_FUNCS0123(avg_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS0123(avg_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
        } else if (bit_depth == 10) {
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(0, 1, sse2);
            H264_QPEL_FUNCS(0, 2, sse2);
            H264_QPEL_FUNCS(0, 3, sse2);
            H264_QPEL_FUNCS(1, 1, sse2);
            H264_QPEL_FUNCS(1, 2, sse2);
            H264_QPEL_FUNCS(1, 3, sse2);
            H264_QPEL_FUNCS(2, 1, sse2);
            H264_QPEL_FUNCS(2, 2, sse2);
            H264_QPEL_FUNCS(2, 3, sse2);
            H264_QPEL_FUNCS(3, 1, sse2);
            H264_QPEL_FUNCS(3, 2, sse2);
            H264_QPEL_FUNCS(3, 3, sse2);
        }

        if (bit_depth == 10) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
    }

    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(1, 0, ssse3);
            H264_QPEL_FUNCS(1, 1, ssse3);
            H264_QPEL_FUNCS(1, 2, ssse3);
            H264_QPEL_FUNCS(1, 3, ssse3);
            H264_QPEL_FUNCS(2, 0, ssse3);
            H264_QPEL_FUNCS(2, 1, ssse3);
            H264_QPEL_FUNCS(2, 2, ssse3);
            H264_QPEL_FUNCS(2, 3, ssse3);
            H264_QPEL_FUNCS(3, 0, ssse3);
            H264_QPEL_FUNCS(3, 1, ssse3);
            H264_QPEL_FUNCS(3, 2, ssse3);
            H264_QPEL_FUNCS(3, 3, ssse3);
        }

        if (bit_depth == 10) {
            H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
            H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
            H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
        }
    }

    if (EXTERNAL_AVX(cpu_flags)) {
        /* AVX implies 64 byte cache lines without the need to avoid unaligned
         * memory accesses that cross the boundary between two cache lines.
         * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
         * having to treat SSE2 functions with such properties as AVX. */
        if (bit_depth == 10) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }
    }
#endif
}