me_cmp.c
/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/me_cmp.h"
#if HAVE_ALTIVEC
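/*
 * Helper macros for loading 16 pixels that start at an arbitrary (possibly
 * unaligned) address. On big-endian AltiVec this uses the classic trick:
 * vec_lvsl() builds a permute mask from the low address bits, two aligned
 * vec_ld() loads fetch the surrounding 32 bytes, and vec_perm() extracts the
 * wanted window (per2 = per1 + 1 gives the same window shifted by one pixel,
 * which the half-pel functions need). On little-endian targets the VSX
 * unaligned loads do the same job directly.
 */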
#if HAVE_BIGENDIAN
#define GET_PERM(per1, per2, pix) {               \
    per1 = vec_lvsl(0, pix);                      \
    per2 = vec_add(per1, vec_splat_u8(1));        \
}
#define LOAD_PIX(v, iv, pix, per1, per2) {        \
    vector unsigned char pix2l = vec_ld(0, pix);  \
    vector unsigned char pix2r = vec_ld(16, pix); \
    v  = vec_perm(pix2l, pix2r, per1);            \
    iv = vec_perm(pix2l, pix2r, per2);            \
}
#else
#define GET_PERM(per1, per2, pix) {}
#define LOAD_PIX(v, iv, pix, per1, per2) {        \
    v  = vec_vsx_ld(0, pix);                      \
    iv = vec_vsx_ld(1, pix);                      \
}
#endif
static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;

    GET_PERM(perm1, perm2, pix2);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
        vector unsigned char pix1v = vec_ld(0, pix1);
        LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);

        /* Calculate the average vector. */
        vector unsigned char avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
                                          vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
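/*
 * For reference, sad16_x2_altivec() above is (roughly) a vectorized form of
 * the scalar loop below: the SAD of a 16-pixel-wide block against the source
 * block interpolated half a pixel to the right, i.e. the rounded-up average
 * of each pixel and its right neighbour. The same pattern, with the
 * neighbour taken from the next line instead, gives sad16_y2_altivec() that
 * follows.
 *
 *     int s = 0;
 *     for (int y = 0; y < h; y++) {
 *         for (int x = 0; x < 16; x++)
 *             s += abs(pix1[x] - ((pix2[x] + pix2[x + 1] + 1) >> 1));
 *         pix1 += stride;
 *         pix2 += stride;
 *     }
 *     return s;
 */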
static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char pix1v, pix3v, avgv, t5;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    uint8_t *pix3 = pix2 + stride;

    /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read each
     * time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15] */
    vector unsigned char pix2v = VEC_LD(0, pix2);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15] */
        pix1v = vec_ld(0, pix1);
        pix3v = VEC_LD(0, pix3);

        /* Calculate the average vector. */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2v = pix3v;
        pix3 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    uint8_t *pix3 = pix2 + stride;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    const vector unsigned short two =
        (const vector unsigned short) vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char pix1v, pix3v, pix3iv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;
    GET_PERM(perm1, perm2, pix2);

    /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read, as well
     * as some splitting, and vector addition each time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16]
     * Split the pixel vectors into shorts. */
    LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
    vector unsigned short pix2hv =
        (vector unsigned short) VEC_MERGEH(zero, pix2v);
    vector unsigned short pix2lv =
        (vector unsigned short) VEC_MERGEL(zero, pix2v);
    vector unsigned short pix2ihv =
        (vector unsigned short) VEC_MERGEH(zero, pix2iv);
    vector unsigned short pix2ilv =
        (vector unsigned short) VEC_MERGEL(zero, pix2iv);

    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
    vector unsigned short t3, t4;

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
        pix1v = vec_ld(0, pix1);
        LOAD_PIX(pix3v, pix3iv, pix3, perm1, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
         * it should be 1. Instead, we have to split the pixel vectors into
         * vectors of shorts and do the averaging by hand. */
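        /* Worked example of that rounding issue: with pixels 3, 0, 0, 1 the
         * exact half-pel value is (3 + 0 + 0 + 1 + 2) >> 2 = 1, whereas
         * avg(avg(3, 0), avg(0, 1)) = avg(2, 1) = 2, because each vec_avg
         * rounds its halving up. */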
        /* Split the pixel vectors into shorts. */
        pix3hv  = (vector unsigned short) VEC_MERGEH(zero, pix3v);
        pix3lv  = (vector unsigned short) VEC_MERGEL(zero, pix3v);
        pix3ihv = (vector unsigned short) VEC_MERGEH(zero, pix3iv);
        pix3ilv = (vector unsigned short) VEC_MERGEL(zero, pix3iv);

        /* Do the averaging on them. */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result. */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix3 += stride;
        /* Transfer the calculated values for pix3 into pix2. */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
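/*
 * The xy2 case above therefore computes, per pixel, the correctly rounded
 * four-point average used for half-pel motion compensation:
 *
 *     avg = (pix2[x] + pix2[x + 1] + pix2[x + stride] + pix2[x + stride + 1] + 2) >> 2;
 *     s  += abs(pix1[x] - avg);
 *
 * with t1/t2 caching the already-summed line so that each row of pix2 is
 * loaded and widened only once.
 */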
static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = VEC_LD(0, pix1);
        vector unsigned char pix2l = VEC_LD(0, pix2);
        vector unsigned char t1 = vec_and(pix1l, permclear);
        vector unsigned char t2 = vec_and(pix2l, permclear);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added. */
static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char t1 = vec_and(VEC_LD(0, pix1), permclear);
        vector unsigned char t2 = vec_and(VEC_LD(0, pix2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
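/*
 * The squaring above relies on vec_msum() with unsigned char operands: it
 * multiplies each byte of t5 by itself, adds each group of four adjacent
 * products, and accumulates the result into the corresponding 32-bit lane of
 * sum. Since t5 holds |pix1 - pix2|, this accumulates (pix1 - pix2)^2
 * directly, with no separate widening pass.
 */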
/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added. */
static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);

    vec_ste(sumsqr, 0, &s);
    return s;
}
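/*
 * The hadamard8_diff functions below compute a SATD-style metric: the
 * residual src - dst is run through an 8x8 Hadamard transform and the
 * absolute values of the transform coefficients are summed. The
 * ONEITERBUTTERFLY() macro performs the three butterfly stages of the
 * horizontal (per-row) transform using vec_perm()/vec_mladd() with +-1
 * multiplier vectors; the block following the macro then performs the
 * vertical stages across the eight row results before the final
 * absolute-value accumulation.
 */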
static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                     uint8_t *src, ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) sum;
    register const vector unsigned char vzero =
        (const vector unsigned char) vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    {
        register const vector signed short vprod1 =
            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
        register const vector signed short vprod2 =
            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
        register const vector signed short vprod3 =
            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
        register const vector unsigned char perm1 =
            (const vector unsigned char)
            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
        register const vector unsigned char perm2 =
            (const vector unsigned char)
            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
        register const vector unsigned char perm3 =
            (const vector unsigned char)
            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };

#define ONEITERBUTTERFLY(i, res)                                              \
    {                                                                         \
        register vector unsigned char srcO = unaligned_load(stride * i, src); \
        register vector unsigned char dstO = unaligned_load(stride * i, dst); \
                                                                              \
        /* Promote the unsigned chars to signed shorts. */                    \
        /* We're in the 8x8 function, we only care for the first 8. */        \
        register vector signed short srcV =                                   \
            (vector signed short) VEC_MERGEH((vector signed char) vzero,      \
                                             (vector signed char) srcO);      \
        register vector signed short dstV =                                   \
            (vector signed short) VEC_MERGEH((vector signed char) vzero,      \
                                             (vector signed char) dstO);      \
                                                                              \
        /* subtractions inside the first butterfly */                         \
        register vector signed short but0 = vec_sub(srcV, dstV);              \
        register vector signed short op1  = vec_perm(but0, but0, perm1);      \
        register vector signed short but1 = vec_mladd(but0, vprod1, op1);     \
        register vector signed short op2  = vec_perm(but1, but1, perm2);      \
        register vector signed short but2 = vec_mladd(but1, vprod2, op2);     \
        register vector signed short op3  = vec_perm(but2, but2, perm3);      \
        res = vec_mladd(but2, vprod3, op3);                                   \
    }

        ONEITERBUTTERFLY(0, temp0);
        ONEITERBUTTERFLY(1, temp1);
        ONEITERBUTTERFLY(2, temp2);
        ONEITERBUTTERFLY(3, temp3);
        ONEITERBUTTERFLY(4, temp4);
        ONEITERBUTTERFLY(5, temp5);
        ONEITERBUTTERFLY(6, temp6);
        ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;
        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);

        vec_ste(vsum, 0, &sum);
    }
    return sum;
}
/*
 * 16x8 works with 16 elements; it allows to avoid replicating loads, and
 * gives the compiler more room for scheduling. It's only used from
 * inside hadamard8_diff16_altivec.
 *
 * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
 * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in
 * registers by itself. The following code includes hand-made register
 * allocation. It's not clean, but on a 7450 the resulting code is much faster
 * (best case falls from 700+ cycles to 550).
 *
 * xlc doesn't add spill code, but it doesn't know how to schedule for the
 * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
 * 25% fewer instructions...)
 *
 * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
 * but xlc goes to around 660 on the regular C code...
 */
static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
                                      uint8_t *src, ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) sum;
    register vector signed short
        temp0 __asm__ ("v0"),
        temp1 __asm__ ("v1"),
        temp2 __asm__ ("v2"),
        temp3 __asm__ ("v3"),
        temp4 __asm__ ("v4"),
        temp5 __asm__ ("v5"),
        temp6 __asm__ ("v6"),
        temp7 __asm__ ("v7");
    register vector signed short
        temp0S __asm__ ("v8"),
        temp1S __asm__ ("v9"),
        temp2S __asm__ ("v10"),
        temp3S __asm__ ("v11"),
        temp4S __asm__ ("v12"),
        temp5S __asm__ ("v13"),
        temp6S __asm__ ("v14"),
        temp7S __asm__ ("v15");
    register const vector unsigned char vzero __asm__ ("v31") =
        (const vector unsigned char) vec_splat_u8(0);
    {
        register const vector signed short vprod1 __asm__ ("v16") =
            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };

        register const vector signed short vprod2 __asm__ ("v17") =
            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };

        register const vector signed short vprod3 __asm__ ("v18") =
            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };

        register const vector unsigned char perm1 __asm__ ("v19") =
            (const vector unsigned char)
            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };

        register const vector unsigned char perm2 __asm__ ("v20") =
            (const vector unsigned char)
            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };

        register const vector unsigned char perm3 __asm__ ("v21") =
            (const vector unsigned char)
            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };

#define ONEITERBUTTERFLY(i, res1, res2)                                   \
    {                                                                     \
        register vector unsigned char srcO __asm__ ("v22") =              \
            unaligned_load(stride * i, src);                              \
        register vector unsigned char dstO __asm__ ("v23") =              \
            unaligned_load(stride * i, dst);                              \
                                                                          \
        /* Promote the unsigned chars to signed shorts. */                \
        register vector signed short srcV __asm__ ("v24") =               \
            (vector signed short) VEC_MERGEH((vector signed char) vzero,  \
                                             (vector signed char) srcO);  \
        register vector signed short dstV __asm__ ("v25") =               \
            (vector signed short) VEC_MERGEH((vector signed char) vzero,  \
                                             (vector signed char) dstO);  \
        register vector signed short srcW __asm__ ("v26") =               \
            (vector signed short) VEC_MERGEL((vector signed char) vzero,  \
                                             (vector signed char) srcO);  \
        register vector signed short dstW __asm__ ("v27") =               \
            (vector signed short) VEC_MERGEL((vector signed char) vzero,  \
                                             (vector signed char) dstO);  \
                                                                          \
        /* subtractions inside the first butterfly */                     \
        register vector signed short but0 __asm__ ("v28") =               \
            vec_sub(srcV, dstV);                                          \
        register vector signed short but0S __asm__ ("v29") =              \
            vec_sub(srcW, dstW);                                          \
        register vector signed short op1 __asm__ ("v30") =                \
            vec_perm(but0, but0, perm1);                                  \
        register vector signed short but1 __asm__ ("v22") =               \
            vec_mladd(but0, vprod1, op1);                                 \
        register vector signed short op1S __asm__ ("v23") =               \
            vec_perm(but0S, but0S, perm1);                                \
        register vector signed short but1S __asm__ ("v24") =              \
            vec_mladd(but0S, vprod1, op1S);                               \
        register vector signed short op2 __asm__ ("v25") =                \
            vec_perm(but1, but1, perm2);                                  \
        register vector signed short but2 __asm__ ("v26") =               \
            vec_mladd(but1, vprod2, op2);                                 \
        register vector signed short op2S __asm__ ("v27") =               \
            vec_perm(but1S, but1S, perm2);                                \
        register vector signed short but2S __asm__ ("v28") =              \
            vec_mladd(but1S, vprod2, op2S);                               \
        register vector signed short op3 __asm__ ("v29") =                \
            vec_perm(but2, but2, perm3);                                  \
        register vector signed short op3S __asm__ ("v30") =               \
            vec_perm(but2S, but2S, perm3);                                \
        res1 = vec_mladd(but2, vprod3, op3);                              \
        res2 = vec_mladd(but2S, vprod3, op3S);                            \
    }

        ONEITERBUTTERFLY(0, temp0, temp0S);
        ONEITERBUTTERFLY(1, temp1, temp1S);
        ONEITERBUTTERFLY(2, temp2, temp2S);
        ONEITERBUTTERFLY(3, temp3, temp3S);
        ONEITERBUTTERFLY(4, temp4, temp4S);
        ONEITERBUTTERFLY(5, temp5, temp5S);
        ONEITERBUTTERFLY(6, temp6, temp6S);
        ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;

        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        register vector signed short line0S = vec_add(temp0S, temp1S);
        register vector signed short line1S = vec_sub(temp0S, temp1S);
        register vector signed short line2S = vec_add(temp2S, temp3S);
        register vector signed short line3S = vec_sub(temp2S, temp3S);
        register vector signed short line4S = vec_add(temp4S, temp5S);
        register vector signed short line5S = vec_sub(temp4S, temp5S);
        register vector signed short line6S = vec_add(temp6S, temp7S);
        register vector signed short line7S = vec_sub(temp6S, temp7S);

        register vector signed short line0BS = vec_add(line0S, line2S);
        register vector signed short line2BS = vec_sub(line0S, line2S);
        register vector signed short line1BS = vec_add(line1S, line3S);
        register vector signed short line3BS = vec_sub(line1S, line3S);
        register vector signed short line4BS = vec_add(line4S, line6S);
        register vector signed short line6BS = vec_sub(line4S, line6S);
        register vector signed short line5BS = vec_add(line5S, line7S);
        register vector signed short line7BS = vec_sub(line5S, line7S);

        register vector signed short line0CS = vec_add(line0BS, line4BS);
        register vector signed short line4CS = vec_sub(line0BS, line4BS);
        register vector signed short line1CS = vec_add(line1BS, line5BS);
        register vector signed short line5CS = vec_sub(line1BS, line5BS);
        register vector signed short line2CS = vec_add(line2BS, line6BS);
        register vector signed short line6CS = vec_sub(line2BS, line6BS);
        register vector signed short line3CS = vec_add(line3BS, line7BS);
        register vector signed short line7CS = vec_sub(line3BS, line7BS);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);

        vsum = vec_sum4s(vec_abs(line0CS), vsum);
        vsum = vec_sum4s(vec_abs(line1CS), vsum);
        vsum = vec_sum4s(vec_abs(line2CS), vsum);
        vsum = vec_sum4s(vec_abs(line3CS), vsum);
        vsum = vec_sum4s(vec_abs(line4CS), vsum);
        vsum = vec_sum4s(vec_abs(line5CS), vsum);
        vsum = vec_sum4s(vec_abs(line6CS), vsum);
        vsum = vec_sum4s(vec_abs(line7CS), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);

        vec_ste(vsum, 0, &sum);
    }
    return sum;
}
static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
                                    uint8_t *src, ptrdiff_t stride, int h)
{
    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

    if (h == 16) {
        dst += 8 * stride;
        src += 8 * stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    return score;
}
#endif /* HAVE_ALTIVEC */
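/*
 * Runtime init: the AltiVec versions are only plugged into the MECmpContext
 * when the CPU actually reports AltiVec support, so the same binary keeps
 * working on PowerPC machines without AltiVec.
 */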
av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;

    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->sse[0] = sse16_altivec;
    c->sse[1] = sse8_altivec;

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
#endif /* HAVE_ALTIVEC */
}