motion_est_alpha.c
/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil_alpha.h"
#include "asm.h"

void get_pixels_mvi(int16_t *restrict block,
                    const uint8_t *restrict pixels, int line_size)
{
    int h = 8;

    do {
        uint64_t p;

        p = ldq(pixels);
        stq(unpkbw(p),       block);
        stq(unpkbw(p >> 32), block + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
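/* For reference, a plain-C sketch of what the loop above is assumed to do:
   ldq() reads an aligned quadword and the MVI unpkbw() zero-extends its low
   four bytes into four 16-bit lanes, so each 8-pixel row is widened into
   eight int16 coefficients. The helper below is illustrative only and is not
   part of the build. */
#if 0
static void get_pixels_ref(int16_t *block, const uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i * 8 + j] = pixels[j];
        pixels += line_size;
    }
}
#endif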

void diff_pixels_mvi(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                     int stride) {
    int h = 8;
    uint64_t mask = 0x4040;

    mask |= mask << 16;
    mask |= mask << 32;
    do {
        uint64_t x, y, c, d, a;
        uint64_t signs;

        x = ldq(s1);
        y = ldq(s2);
        c = cmpbge(x, y);
        d = x - y;
        a = zap(mask, c);       /* We use 0x4040404040404040 here... */
        d += 4 * a;             /* ...so we can use s4addq here.     */
        signs = zap(-1, c);

        stq(unpkbw(d)       | (unpkbw(signs)       << 8), block);
        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);

        s1 += stride;
        s2 += stride;
        block += 8;
    } while (--h);
}
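/* A note on the trick above, assuming Alpha's cmpbge/zap semantics (bit i of
   cmpbge(x, y) is set when byte i of x is >= byte i of y; zap clears the
   bytes selected by the mask): the quadword subtraction x - y lets a borrow
   leak from one byte lane into the next. zap(mask, c) keeps a 0x40 byte in
   exactly the lanes that generated a borrow, and adding 4 * 0x40 = 0x100 at
   those positions (one s4addq) hands the borrowed 1 back to the lane above,
   leaving every lane holding (x_i - y_i) mod 256. signs = zap(-1, c) yields
   0xff in the negative lanes and 0x00 elsewhere, which becomes the high byte
   of each sign-extended int16 difference when merged with unpkbw(). */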

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
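/* Assuming BYTE_VEC(x) replicates the byte x across all eight lanes, this is
   the usual SWAR identity (a | b) - ((a ^ b) >> 1) == (a + b + 1) >> 1 applied
   per byte; the 0xfe mask keeps the bit shifted out of one lane from leaking
   into its neighbour. In other words: a rounding-up byte-wise average, as
   used for the half-pel cases below. */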

static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
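/* Same SWAR idea for the four-way case: r1 averages the top six bits of each
   lane (each term is at most 0x3f, so four of them cannot carry across a
   byte boundary), r2 averages the low two bits with a +2 rounding bias, and
   their sum is the byte-wise (l1 + l2 + l3 + l4 + 2) >> 2 needed for the xy
   half-pel case. */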

int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 8 pixels at a time */
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}
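/* perr() is MVI's pixel-error operation: the sum of absolute differences of
   the eight byte lanes, so each loop iteration accumulates one 8-pixel row of
   SAD. A scalar sketch of a single row, for reference only (FFABS comes from
   libavutil; the helper name is made up and the block is not compiled): */
#if 0
static int sad8_row_ref(const uint8_t *a, const uint8_t *b)
{
    int i, sum = 0;
    for (i = 0; i < 8; i++)
        sum += FFABS(a[i] - b[i]);
    return sum;
}
#endif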

#if 0 /* now done in assembly */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 16 pixels at a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            t    = ldq_u(pix2 + 8);
            p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif
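/* The ldq_u/extql/extqh pair above is the standard Alpha idiom for unaligned
   loads: ldq_u() fetches the aligned quadword containing the pointer, and
   extql()/extqh() shift the two covering quadwords by the pointer's low three
   address bits so that OR-ing them reassembles the unaligned 8 bytes. The
   same pattern reappears in pix_abs16x16_y2_mvi() below. */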

int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh. At least it is a bit faster
           that way :) */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign),
                        extql(l, disalign1) | extqh(m, disalign1));
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign),
                        extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}
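/* In all three cases the function scores the horizontal half-pel candidate:
   each reference byte is averaged (with avg2, rounding up) against its right
   neighbour before the SAD is taken. On little-endian Alpha the neighbour of
   byte i sits one lane higher in the quadword, hence the >> 8 shifts and the
   extra byte pulled in from the next quadword (or from pix2[16]) at the
   seam. */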

int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;
        t    = ldq_u(pix2 + 8);
        p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l = np2_l;
            p2_r = np2_r;
        } while (--h);
    } else {
        uint64_t p2_l, p2_r;
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l = np2_l;
            p2_r = np2_r;
        } while (--h);
    }
    return result;
}
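/* Vertical half-pel: the reference row is averaged against the row below it,
   so each loaded row is kept in p2_l/p2_r and reused as the "previous" row on
   the next iteration, halving the number of loads from pix2. */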

int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}
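/* Combined half-pel in both directions: every output byte is the rounded
   avg4() of its 2x2 neighbourhood (current/next row, current/right column),
   with p2_x carrying the 17th byte of each row so the right neighbour of the
   last lane is available. As in the y2 case, the previous row's values are
   recycled, and the pix1 row is loaded one iteration ahead of its use. */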