FFmpeg
h264_qpel.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
3  * Copyright (c) 2011 Daniel Kang
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "libavutil/attributes.h"
23 #include "libavutil/avassert.h"
24 #include "libavutil/cpu.h"
25 #include "libavutil/mem_internal.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavutil/x86/cpu.h"
28 #include "libavcodec/h264qpel.h"
29 #include "libavcodec/pixels.h"
30 #include "fpel.h"
31 
32 #if HAVE_X86ASM
/* Prototypes of the external two-source ("l2") put/avg pixel routines for
 * 4-, 8- and 16-pixel-wide blocks. They are only declared here; their
 * implementations are not part of this file. */
33 void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
34  int dstStride, int src1Stride, int h);
35 void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
36  int dstStride, int src1Stride, int h);
37 void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
38  int dstStride, int src1Stride, int h);
39 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
40  int dstStride, int src1Stride, int h);
41 void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
42  int dstStride, int src1Stride, int h);
43 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
44  int dstStride, int src1Stride, int h);
/* The macro-generated sse2 code below pastes "_sse2" onto these names; the
 * aliases route those references to the mmxext routines declared above. */
45 #define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
46 #define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
47 #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
48 #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
/* The 16-wide mmxext put resolves to the mmx routine. */
49 #define ff_put_pixels16_mmxext ff_put_pixels16_mmx
/* These two expand to nothing, so the put mc00 wrappers that H264_MC_C
 * generates for sizes 8 and 4 with the mmxext suffix have an empty body.
 * Those wrappers are marked av_unused and the init function in this file
 * never installs them. */
50 #define ff_put_pixels8_mmxext(...)
51 #define ff_put_pixels4_mmxext(...)
52 
/* DEF_QPEL(OPNAME) declares, for one operation name (instantiated below with
 * avg and put), the external low-pass filter helpers and the l2_shift5 pixel
 * helpers that the wrappers in this file call. Only prototypes are produced;
 * the routines themselves are defined outside this file. */
53 #define DEF_QPEL(OPNAME)\
54 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
55 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
56 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
57 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
58 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
59 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
60 void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
61 void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
62 void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\
63 void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
64 void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
65 void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
66 void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
67 void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
68 void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);
69 
70 DEF_QPEL(avg)
71 DEF_QPEL(put)
72 
/* QPEL_H264(OPNAME, OP, MMX) generates always-inline wrappers named with the
 * MMX suffix (OPNAME is passed with its trailing underscore, e.g. put_):
 *  - qpel4_hv_lowpass: moves src back by two rows and two columns, runs the
 *    external hv_lowpass_v helper on three strips (tmp and src both advance
 *    by 4 per strip), rewinds tmp and runs the hv_lowpass_h helper once.
 *  - qpel8or16_hv2_lowpass: calls the external hv2_lowpass_op helper once
 *    per 8-column strip; the do/while runs once for size 8 and twice for
 *    size 16 (w = size>>4).
 *  - qpel16_h_lowpass / qpel16_h_lowpass_l2: a 16x16 block assembled from
 *    four 8x8 calls (left/right halves, then 8 rows further down).
 *  - pixels16_l2_shift5: two 8-wide l2_shift5 calls side by side.
 * The OP parameter is not referenced anywhere in the expansion, and the
 * tmpStride parameters of the hv wrappers are accepted but not used.
 * NOTE(review): qpel16_h_lowpass_l2 advances src by 8*dstStride because the
 * helper signature has no separate src stride; the callers visible in this
 * file pass the frame stride as dstStride. */
73 #define QPEL_H264(OPNAME, OP, MMX)\
74 static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
75  int w=3;\
76  src -= 2*srcStride+2;\
77  while(w--){\
78  ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
79  tmp += 4;\
80  src += 4;\
81  }\
82  tmp -= 3*4;\
83  ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
84 }\
85 \
86 static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
87  int w = size>>4;\
88  do{\
89  ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
90  tmp += 8;\
91  dst += 8;\
92  }while(w--);\
93 }\
94 \
95 static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
96  ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
97  ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
98  src += 8*srcStride;\
99  dst += 8*dstStride;\
100  ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
101  ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
102 }\
103 \
104 static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
105  ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
106  ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
107  src += 8*dstStride;\
108  dst += 8*dstStride;\
109  src2 += 8*src2Stride;\
110  ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
111  ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
112 }\
113 \
114 static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\
115 {\
116  ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
117  ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
118 }\
119 
120 
/* QPEL_H264_H16_XMM supplies the 16-wide h_lowpass_l2 routine for an XMM
 * instruction set.
 *  - On x86-64 the macro expands to nothing and the ssse3 put/avg routines
 *    are declared as external functions instead.
 *  - Otherwise the macro generates an always-inline wrapper that covers the
 *    16x16 block with four 8x8 h_lowpass_l2 calls. */
121 #if ARCH_X86_64
122 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
123 
124 void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
125 void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
126 
127 #else // ARCH_X86_64
128 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
129 static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
130  ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
131  ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
132  src += 8*dstStride;\
133  dst += 8*dstStride;\
134  src2 += 8*src2Stride;\
135  ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
136  ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
137 }
138 #endif // ARCH_X86_64
139 
/* QPEL_H264_H_XMM(OPNAME, OP, MMX) first expands QPEL_H264_H16_XMM (the
 * 16-wide h_lowpass_l2 routine, or nothing on x86-64) and then generates the
 * 16-wide h_lowpass wrapper from four 8x8 h_lowpass calls: left and right
 * halves, then the same pair 8 rows further down. */
140 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
141 QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
142 static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
143  ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
144  ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
145  src += 8*srcStride;\
146  dst += 8*dstStride;\
147  ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
148  ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
149 }\
150 
/* QPEL_H264_V_XMM(OPNAME, OP, MMX) generates the fixed-size v_lowpass
 * wrappers on top of the shared qpel8or16_v_lowpass routine: the 8-wide
 * wrapper makes one call with h = 8, the 16-wide wrapper makes two calls
 * with h = 16, one for each 8-column half. */
151 #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
152 static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
153  ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
154 }\
155 static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
156  ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
157  ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
158 }
159 
/**
 * First stage of the 8- or 16-wide hv low-pass: run the external
 * ff_put_h264_qpel8or16_hv1_lowpass_op_sse2 routine over consecutive
 * 8-column strips, writing 16-bit intermediates to tmp.
 *
 * @param tmp       destination for the intermediates; advanced by 8 per strip
 * @param src       source block; processing starts two rows above and two
 *                  columns to the left of this position
 * @param tmpStride accepted for signature symmetry; not used in the body
 * @param srcStride source stride, forwarded to the strip routine
 * @param size      block size, forwarded to the strip routine; the strip
 *                  count is (size+8)>>3, i.e. 2 for size 8 and 3 for size 16
 */
160 static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
161  const uint8_t *src,
162  int tmpStride,
163  int srcStride,
164  int size)
165 {
166  int w = (size+8)>>3;
167  src -= 2*srcStride+2;
168  while(w--){
169  ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
170  tmp += 8;
171  src += 8;
172  }
173 }
174 
/* QPEL_H264_HV_XMM(OPNAME, OP, MMX) generates the two-stage hv low-pass:
 *  - qpel8or16_hv_lowpass: stage one is always the put-flavoured sse2 helper
 *    put_h264_qpel8or16_hv1_lowpass_sse2 (fills tmp), regardless of OPNAME
 *    and MMX; stage two is the OPNAME/MMX-specific hv2_lowpass routine that
 *    writes dst.
 *  - qpel8_hv_lowpass / qpel16_hv_lowpass: fixed-size entry points passing
 *    size 8 and 16 respectively. */
175 #define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
176 static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
177  put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
178  ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
179 }\
180 static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
181  ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
182 }\
183 static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
184  ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
185 }\
186 
/* Name aliases that let the macro-generated code paste a single CPU suffix
 * even where no routine with that suffix exists. */
/* sse2 h_lowpass_l2 resolves to the mmxext versions. */
187 #define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
188 #define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
189 #define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
190 #define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
191 
/* ssse3 v_lowpass resolves to the sse2 versions. */
192 #define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
193 #define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
194 #define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
195 #define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
196 
/* sse2 hv2_lowpass resolves to the mmxext strip loop generated by QPEL_H264. */
197 #define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
198 #define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
199 
/* Bundles of the per-position generators defined further down:
 *  - H264_MC_C_H:      the copy position (mc00) plus the horizontal-only
 *                      positions (mc10/mc20/mc30).
 *  - H264_MC_C_V_H_HV: all sixteen positions (copy, vertical, horizontal and
 *                      the mixed positions). */
200 #define H264_MC_C_H(OPNAME, SIZE, MMX, ALIGN) \
201 H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
202 H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
203 
204 #define H264_MC_C_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
205 H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
206 H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
207 H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
208 H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
209 
/**
 * mc00 entry for 16x16 put: forwards to ff_put_pixels16_sse2 with a
 * height of 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line size passed through to ff_put_pixels16_sse2
 */
210 static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
211  ptrdiff_t stride)
212 {
213  ff_put_pixels16_sse2(dst, src, stride, 16);
214 }
/**
 * mc00 entry for 16x16 avg: forwards to ff_avg_pixels16_sse2 with a
 * height of 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line size passed through to ff_avg_pixels16_sse2
 */
215 static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
216  ptrdiff_t stride)
217 {
218  ff_avg_pixels16_sse2(dst, src, stride, 16);
219 }
/* There is no sse2-specific 8x8 avg mc00; the name resolves to the mmxext
 * wrapper generated by H264_MC_C. */
220 #define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
221 
/* H264_MC_C generates the mc00 wrapper for one (OPNAME, SIZE, MMX)
 * combination: a call to the matching ff_<op>pixels<SIZE>_<MMX> routine with
 * h = SIZE. The wrapper is av_unused because not every generated mc00 is
 * installed by the init function. ALIGN is not used by this generator. */
222 #define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
223 static void av_unused OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
224 {\
225  ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
226 }\
227 
/* H264_MC_H generates the three horizontal-only positions:
 *  - mc10: h_lowpass_l2 with the second source set to src itself.
 *  - mc20: plain h_lowpass.
 *  - mc30: h_lowpass_l2 with the second source set to src+1.
 * The frame stride is passed for both stride arguments. ALIGN is not used by
 * this generator. */
228 #define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
229 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
230 {\
231  ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
232 }\
233 \
234 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
235 {\
236  ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
237 }\
238 \
239 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
240 {\
241  ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
242 }\
243 
/* H264_MC_V generates the three vertical-only positions:
 *  - mc01: put-flavoured v_lowpass into a SIZE*SIZE aligned temp buffer
 *    (temp stride SIZE), then the OPNAME l2 pixel routine combines src and
 *    temp into dst.
 *  - mc02: v_lowpass straight into dst.
 *  - mc03: same as mc01, but the l2 routine takes src+stride (one row down)
 *    as its first source.
 * ALIGN is the alignment requested for the temp buffer. */
244 #define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
245 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
246 {\
247  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
248  ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
249  ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
250 }\
251 \
252 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
253 {\
254  ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
255 }\
256 \
257 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
258 {\
259  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
260  ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
261  ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
262 }\
263 
/* H264_MC_HV generates the nine positions that need both directions.
 *  - mc11/mc31/mc13/mc33: put-flavoured v_lowpass of src (mc11, mc13) or
 *    src+1 (mc31, mc33) into a SIZE*SIZE temp buffer, then h_lowpass_l2 of
 *    src (mc11, mc31) or src+stride (mc13, mc33) with temp as second source.
 *  - mc22: hv_lowpass straight into dst, using a 16-bit scratch buffer of
 *    SIZE*12 elements for SIZE < 8 and SIZE*24 elements otherwise.
 *  - mc21/mc23: put-flavoured hv_lowpass into halfHV, then h_lowpass_l2 of
 *    src (mc21) or src+stride (mc23) with halfHV as second source.
 *  - mc12/mc32: put-flavoured hv_lowpass into halfHV, then the mmxext
 *    l2_shift5 routine combines the 16-bit intermediates at halfV+2 (mc12)
 *    or halfV+3 (mc32) with halfHV.
 * For mc21/mc23/mc12/mc32 a single byte buffer is split in two: halfHV is the
 * first SIZE*SIZE bytes and halfV is the 16-bit scratch area that follows.
 * The av_assert2 checks that the buffer is 8-byte aligned. */
264 #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
265 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
266 {\
267  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
268  ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
269  ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
270 }\
271 \
272 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
273 {\
274  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
275  ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
276  ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
277 }\
278 \
279 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
280 {\
281  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
282  ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
283  ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
284 }\
285 \
286 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
287 {\
288  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
289  ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
290  ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
291 }\
292 \
293 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
294 {\
295  LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
296  ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
297 }\
298 \
299 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
300 {\
301  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
302  uint8_t * const halfHV= temp;\
303  int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
304  av_assert2(((uintptr_t)temp & 7) == 0);\
305  ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
306  ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
307 }\
308 \
309 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
310 {\
311  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
312  uint8_t * const halfHV= temp;\
313  int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
314  av_assert2(((uintptr_t)temp & 7) == 0);\
315  ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
316  ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
317 }\
318 \
319 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
320 {\
321  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
322  uint8_t * const halfHV= temp;\
323  int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
324  av_assert2(((uintptr_t)temp & 7) == 0);\
325  ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
326  ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
327 }\
328 \
329 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
330 {\
331  LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
332  uint8_t * const halfHV= temp;\
333  int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
334  av_assert2(((uintptr_t)temp & 7) == 0);\
335  ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
336  ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
337 }\
338 
/* Instantiation helpers for the H264_MC_* generators:
 *  - H264_MC(QPEL, SIZE, MMX, ALIGN): expand generator QPEL for both put_
 *    and avg_ at one block size.
 *  - H264_MC_816(QPEL, XMM): expand generator QPEL for put_ and avg_ at
 *    sizes 8 and 16 with a 16-byte temp-buffer alignment. */
339 #define H264_MC(QPEL, SIZE, MMX, ALIGN)\
340 QPEL(put_, SIZE, MMX, ALIGN) \
341 QPEL(avg_, SIZE, MMX, ALIGN) \
342 
343 #define H264_MC_816(QPEL, XMM)\
344 QPEL(put_, 8, XMM, 16)\
345 QPEL(put_, 16,XMM, 16)\
346 QPEL(avg_, 8, XMM, 16)\
347 QPEL(avg_, 16,XMM, 16)\
348 
/* Generate the inline filter wrappers. The second argument (PUT_OP /
 * AVG_MMXEXT_OP) is forwarded as the OP parameter, which none of the QPEL_*
 * macro bodies in this file reference. */
349 QPEL_H264(put_, PUT_OP, mmxext)
350 QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
351 QPEL_H264_V_XMM(put_, PUT_OP, sse2)
352 QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
353 QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
354 QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
355 QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
356 QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
357 QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
358 QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
359 
/* Generate the per-position mcXY entry points:
 *  - mmxext: all sixteen positions for 4x4; copy + horizontal for 8 and 16.
 *  - sse2:   vertical and mixed positions for 8 and 16.
 *  - ssse3:  horizontal and mixed positions for 8 and 16. */
360 H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8)
361 H264_MC(H264_MC_C_H, 8, mmxext, 8)
362 H264_MC(H264_MC_C_H, 16, mmxext, 8)
363 H264_MC_816(H264_MC_V, sse2)
364 H264_MC_816(H264_MC_HV, sse2)
365 H264_MC_816(H264_MC_H, ssse3)
366 H264_MC_816(H264_MC_HV, ssse3)
367 
368 
369 //10bit
/* Prototypes for the external high-bit-depth mcXY routines, all sharing the
 * (dst, src, stride) signature:
 *  - LUMA_MC_OP declares one routine named
 *    ff_<OP>_h264_qpel<NUM>_<TYPE>_<DEPTH>_<OPT>.
 *  - LUMA_MC_4 declares the put and avg variants for 4x4.
 *  - LUMA_MC_816 declares the put and avg variants for 8x8 and 16x16. */
370 #define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
371 void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
372  (uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
373 
374 #define LUMA_MC_4(DEPTH, TYPE, OPT) \
375  LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
376  LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT)
377 
378 #define LUMA_MC_816(DEPTH, TYPE, OPT) \
379  LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
380  LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
381  LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
382  LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
383 
/* 4x4, depth 10: all sixteen positions, mmxext only. */
384 LUMA_MC_4(10, mc00, mmxext)
385 LUMA_MC_4(10, mc10, mmxext)
386 LUMA_MC_4(10, mc20, mmxext)
387 LUMA_MC_4(10, mc30, mmxext)
388 LUMA_MC_4(10, mc01, mmxext)
389 LUMA_MC_4(10, mc11, mmxext)
390 LUMA_MC_4(10, mc21, mmxext)
391 LUMA_MC_4(10, mc31, mmxext)
392 LUMA_MC_4(10, mc02, mmxext)
393 LUMA_MC_4(10, mc12, mmxext)
394 LUMA_MC_4(10, mc22, mmxext)
395 LUMA_MC_4(10, mc32, mmxext)
396 LUMA_MC_4(10, mc03, mmxext)
397 LUMA_MC_4(10, mc13, mmxext)
398 LUMA_MC_4(10, mc23, mmxext)
399 LUMA_MC_4(10, mc33, mmxext)
400 
/* 8x8 and 16x16, depth 10: all sixteen positions for sse2; mc10/mc20/mc30
 * additionally have sse2_cache64 and ssse3_cache64 variants. */
401 LUMA_MC_816(10, mc00, sse2)
402 LUMA_MC_816(10, mc10, sse2)
403 LUMA_MC_816(10, mc10, sse2_cache64)
404 LUMA_MC_816(10, mc10, ssse3_cache64)
405 LUMA_MC_816(10, mc20, sse2)
406 LUMA_MC_816(10, mc20, sse2_cache64)
407 LUMA_MC_816(10, mc20, ssse3_cache64)
408 LUMA_MC_816(10, mc30, sse2)
409 LUMA_MC_816(10, mc30, sse2_cache64)
410 LUMA_MC_816(10, mc30, ssse3_cache64)
411 LUMA_MC_816(10, mc01, sse2)
412 LUMA_MC_816(10, mc11, sse2)
413 LUMA_MC_816(10, mc21, sse2)
414 LUMA_MC_816(10, mc31, sse2)
415 LUMA_MC_816(10, mc02, sse2)
416 LUMA_MC_816(10, mc12, sse2)
417 LUMA_MC_816(10, mc22, sse2)
418 LUMA_MC_816(10, mc32, sse2)
419 LUMA_MC_816(10, mc03, sse2)
420 LUMA_MC_816(10, mc13, sse2)
421 LUMA_MC_816(10, mc23, sse2)
422 LUMA_MC_816(10, mc33, sse2)
423 
424 #endif /* HAVE_X86ASM */
425 
/* Table-filling helpers. They assign into c-><PFX>_pixels_tab[IDX][n], where
 * entry n holds the routine for position mcXY with n = X + 4*Y, and build the
 * routine name as <PREFIX><PFX><SIZE>_mcXY_<CPU>. All of them expect a
 * variable named c to be in scope at the point of use.
 *  - SET_QPEL_FUNCS123:  entries 1..3   (mc10, mc20, mc30).
 *  - SET_QPEL_FUNCS0123: entries 0..3   (adds mc00).
 *  - SET_QPEL_FUNCS_1PP: entries 1..15  (everything except mc00).
 *  - SET_QPEL_FUNCS:     entries 0..15  (all positions). */
426 #define SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX) \
427  do { \
428  c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
429  c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
430  c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
431  } while (0)
432 #define SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX) \
433  do { \
434  c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
435  SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX); \
436  } while (0)
437 #define SET_QPEL_FUNCS_1PP(PFX, IDX, SIZE, CPU, PREFIX) \
438  do { \
439  SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX); \
440  c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
441  c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
442  c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
443  c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
444  c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
445  c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
446  c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
447  c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
448  c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
449  c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
450  c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
451  c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
452  } while (0)
453 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
454  do { \
455  c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
456  SET_QPEL_FUNCS_1PP(PFX, IDX, SIZE, CPU, PREFIX); \
457  } while (0)
458 
/* Per-position installers for the 16x16 (table 0) and 8x8 (table 1) entries.
 * Both set the put and avg routines for position mc<x><y> at index x + y*4
 * and expect a variable named c to be in scope.
 *  - H264_QPEL_FUNCS:    uses the static wrappers generated in this file
 *                        (names without the ff_ prefix).
 *  - H264_QPEL_FUNCS_10: uses the external depth-10 routines
 *                        (ff_..._10_<CPU>). */
459 #define H264_QPEL_FUNCS(x, y, CPU) \
460  do { \
461  c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
462  c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
463  c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
464  c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
465  } while (0)
466 
467 #define H264_QPEL_FUNCS_10(x, y, CPU) \
468  do { \
469  c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
470  c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
471  c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
472  c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
473  } while (0)
474 
/**
 * Install the x86 quarter-pel routines into an H264QpelContext.
 *
 * Tables are indexed [0] = 16x16, [1] = 8x8, [2] = 4x4, and within a table
 * by position mcXY at X + 4*Y. The CPU checks run from the oldest to the
 * newest instruction set, so a later block overrides entries written by an
 * earlier one. Nothing is changed when HAVE_X86ASM is off.
 *
 * @param c         context whose put/avg tables are filled in
 * @param bit_depth sample bit depth; values up to 8 select the wrappers
 *                  generated in this file, 10 selects the external depth-10
 *                  routines, other depths leave the tables untouched
 */
475 av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
476 {
477 #if HAVE_X86ASM
478  int high_bit_depth = bit_depth > 8;
479  int cpu_flags = av_get_cpu_flags();
480 
 /* MMXEXT: horizontal positions for 16 and 8, everything for 4x4. The put
  * tables skip mc00 for 8 and 4 because ff_put_pixels8/4_mmxext expand to
  * nothing in this file. */
481  if (EXTERNAL_MMXEXT(cpu_flags)) {
482  if (!high_bit_depth) {
483  SET_QPEL_FUNCS0123(put_h264_qpel, 0, 16, mmxext, );
484  SET_QPEL_FUNCS123 (put_h264_qpel, 1, 8, mmxext, );
485  SET_QPEL_FUNCS_1PP(put_h264_qpel, 2, 4, mmxext, );
486  SET_QPEL_FUNCS0123(avg_h264_qpel, 0, 16, mmxext, );
487  SET_QPEL_FUNCS0123(avg_h264_qpel, 1, 8, mmxext, );
488  SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
489  } else if (bit_depth == 10) {
490  SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
491  SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
492  }
493  }
494 
 /* SSE2: every position with a vertical component (y != 0) for 16 and 8. */
495  if (EXTERNAL_SSE2(cpu_flags)) {
496  if (!high_bit_depth) {
497  H264_QPEL_FUNCS(0, 1, sse2);
498  H264_QPEL_FUNCS(0, 2, sse2);
499  H264_QPEL_FUNCS(0, 3, sse2);
500  H264_QPEL_FUNCS(1, 1, sse2);
501  H264_QPEL_FUNCS(1, 2, sse2);
502  H264_QPEL_FUNCS(1, 3, sse2);
503  H264_QPEL_FUNCS(2, 1, sse2);
504  H264_QPEL_FUNCS(2, 2, sse2);
505  H264_QPEL_FUNCS(2, 3, sse2);
506  H264_QPEL_FUNCS(3, 1, sse2);
507  H264_QPEL_FUNCS(3, 2, sse2);
508  H264_QPEL_FUNCS(3, 3, sse2);
509  }
510 
 /* Depth 10: all positions from sse2, then the horizontal positions are
  * replaced by the sse2_cache64 variants. */
511  if (bit_depth == 10) {
512  SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
513  SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
514  SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
515  SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
516  H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
517  H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
518  H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
519  }
520  }
521 
 /* Restored line 522: the listing had dropped this condition, leaving the
  * block below without its opening brace. The mc00 copies use the sse2
  * pixel routines only when SSE2 is flagged as fast. */
522  if (EXTERNAL_SSE2_FAST(cpu_flags)) {
523  if (!high_bit_depth) {
524  c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_sse2;
525  c->avg_h264_qpel_pixels_tab[0][0] = avg_h264_qpel16_mc00_sse2;
526  c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_sse2;
527  }
528  }
529 
 /* SSSE3: every position with a horizontal component (x != 0) for 16 and 8. */
530  if (EXTERNAL_SSSE3(cpu_flags)) {
531  if (!high_bit_depth) {
532  H264_QPEL_FUNCS(1, 0, ssse3);
533  H264_QPEL_FUNCS(1, 1, ssse3);
534  H264_QPEL_FUNCS(1, 2, ssse3);
535  H264_QPEL_FUNCS(1, 3, ssse3);
536  H264_QPEL_FUNCS(2, 0, ssse3);
537  H264_QPEL_FUNCS(2, 1, ssse3);
538  H264_QPEL_FUNCS(2, 2, ssse3);
539  H264_QPEL_FUNCS(2, 3, ssse3);
540  H264_QPEL_FUNCS(3, 0, ssse3);
541  H264_QPEL_FUNCS(3, 1, ssse3);
542  H264_QPEL_FUNCS(3, 2, ssse3);
543  H264_QPEL_FUNCS(3, 3, ssse3);
544  }
545 
546  if (bit_depth == 10) {
547  H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
548  H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
549  H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
550  }
551  }
552 
553  if (EXTERNAL_AVX(cpu_flags)) {
554  /* AVX implies 64 byte cache lines without the need to avoid unaligned
555  * memory accesses that cross the boundary between two cache lines.
556  * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
557  * having to treat SSE2 functions with such properties as AVX. */
558  if (bit_depth == 10) {
559  H264_QPEL_FUNCS_10(1, 0, sse2);
560  H264_QPEL_FUNCS_10(2, 0, sse2);
561  H264_QPEL_FUNCS_10(3, 0, sse2);
562  }
563  }
564 #endif
565 }
cpu.h
EXTERNAL_SSE2_FAST
#define EXTERNAL_SSE2_FAST(flags)
Definition: cpu.h:60
mem_internal.h
src1
const pixel * src1
Definition: h264pred_template.c:421
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
w
uint8_t w
Definition: llviddspenc.c:38
pixels.h
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:103
bit_depth
static void bit_depth(AudioStatsContext *s, const uint64_t *const mask, uint8_t *depth)
Definition: af_astats.c:245
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:52
h264qpel.h
ff_h264qpel_init_x86
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
Definition: h264_qpel.c:475
avassert.h
av_cold
#define av_cold
Definition: attributes.h:90
ff_put_pixels16_l2_mmxext
void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dstStride, int src1Stride, int h)
H264_MC
#define H264_MC(OPNAME, SIZE)
Definition: h264qpel_template.c:391
H264_QPEL_FUNCS
#define H264_QPEL_FUNCS(x, y, CPU)
Definition: h264_qpel.c:459
SET_QPEL_FUNCS0123
#define SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX)
Definition: h264_qpel.c:432
SET_QPEL_FUNCS
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)
Definition: h264_qpel.c:453
ff_put_pixels16_sse2
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
c
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can and has in some cases led to effects beyond the output of computations. The signed integer overflow problem in speed critical code: code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c[…] (tooltip text truncated in the source listing)
Definition: undefined.txt:32
ff_avg_pixels8_l2_mmxext
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dstStride, int src1Stride, int h)
ff_avg_pixels16_sse2
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cpu.h
size
int size
Definition: twinvq_data.h:10344
asm.h
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
SET_QPEL_FUNCS_1PP
#define SET_QPEL_FUNCS_1PP(PFX, IDX, SIZE, CPU, PREFIX)
Definition: h264_qpel.c:437
attributes.h
EXTERNAL_SSE2
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:59
src2
const pixel * src2
Definition: h264pred_template.c:422
av_always_inline
#define av_always_inline
Definition: attributes.h:49
SET_QPEL_FUNCS123
#define SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX)
Definition: h264_qpel.c:426
H264QpelContext
Definition: h264qpel.h:27
stride
#define stride
Definition: h264pred_template.c:537
EXTERNAL_AVX
#define EXTERNAL_AVX(flags)
Definition: cpu.h:70
ff_put_pixels8_l2_mmxext
void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dstStride, int src1Stride, int h)
fpel.h
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
h
h
Definition: vp9dsp_template.c:2038
EXTERNAL_SSSE3
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:65
H264_QPEL_FUNCS_10
#define H264_QPEL_FUNCS_10(x, y, CPU)
Definition: h264_qpel.c:467
EXTERNAL_MMXEXT
#define EXTERNAL_MMXEXT(flags)
Definition: cpu.h:57
ff_avg_pixels16_l2_mmxext
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dstStride, int src1Stride, int h)