#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
    do {                                                  \
        __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \
        __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \
        __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \
        __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \
        tempA1 = vec_mergeh(src_a, src_e);                \
        tempB1 = vec_mergel(src_a, src_e);                \
        tempC1 = vec_mergeh(src_b, src_f);                \
        tempD1 = vec_mergel(src_b, src_f);                \
        tempE1 = vec_mergeh(src_c, src_g);                \
        tempF1 = vec_mergel(src_c, src_g);                \
        tempG1 = vec_mergeh(src_d, src_h);                \
        tempH1 = vec_mergel(src_d, src_h);                \
        tempA2 = vec_mergeh(tempA1, tempE1);              \
        tempB2 = vec_mergel(tempA1, tempE1);              \
        tempC2 = vec_mergeh(tempB1, tempF1);              \
        tempD2 = vec_mergel(tempB1, tempF1);              \
        tempE2 = vec_mergeh(tempC1, tempG1);              \
        tempF2 = vec_mergel(tempC1, tempG1);              \
        tempG2 = vec_mergeh(tempD1, tempH1);              \
        tempH2 = vec_mergel(tempD1, tempH1);              \
        src_a = vec_mergeh(tempA2, tempE2);               \
        src_b = vec_mergel(tempA2, tempE2);               \
        src_c = vec_mergeh(tempB2, tempF2);               \
        src_d = vec_mergel(tempB2, tempF2);               \
        src_e = vec_mergeh(tempC2, tempG2);               \
        src_f = vec_mergel(tempC2, tempG2);               \
        src_g = vec_mergeh(tempD2, tempH2);               \
        src_h = vec_mergel(tempD2, tempH2);               \
    } while (0)

static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
{
    /* This code makes no assumption on src or stride.
       One could remove the recomputation of the perm
       vector by assuming (stride % 16) == 0, unfortunately
       this is not always true. */
    /* data[] holds the four per-block constants splatted below:
       dcOffset, dcThreshold, 2*QP and 4*QP. */
    DECLARE_ALIGNED(16, short, data)[8];
    int numEq;
    uint8_t *src2 = src;
    vector signed short v_dcOffset;
    vector signed short v2QP;
    vector unsigned short v4QP;
    vector unsigned short v_dcThreshold;
    const int properStride = (stride % 16);
    const int srcAlign = ((unsigned long)src2 % 16);
    const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
    const vector signed int zero = vec_splat_s32(0);
    const vector signed short mask = vec_splat_s16(1);
    vector signed int v_numEq = vec_splat_s32(0);
    vector signed short v_data = vec_ld(0, data);
    vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3,
                        v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
    register int j0 = 0,
                 j1 = stride,
                 j2 = 2 * stride,
                 j3 = 3 * stride,
                 j4 = 4 * stride,
                 j5 = 5 * stride,
                 j6 = 6 * stride,
                 j7 = 7 * stride;
    vector unsigned char v_srcA0, v_srcA1, v_srcA2, v_srcA3,
                         v_srcA4, v_srcA5, v_srcA6, v_srcA7;

    v_dcOffset = vec_splat(v_data, 0);
    v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
    v2QP = vec_splat(v_data, 2);
    v4QP = (vector unsigned short)vec_splat(v_data, 3);
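    /* v_data was loaded from the aligned data[] block; vec_splat(v, n)
     * broadcasts 16-bit lane n to all eight lanes, giving one constant
     * vector per scalar parameter. */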
#define LOAD_LINE(i)                                                    \
    {                                                                   \
    vector unsigned char perm##i = vec_lvsl(j##i, src2);                \
    vector unsigned char v_srcA2##i;                                    \
    vector unsigned char v_srcA1##i = vec_ld(j##i, src2);               \
    if (two_vectors)                                                    \
        v_srcA2##i = vec_ld(j##i + 16, src2);                           \
    const vector unsigned char v_srcA##i =                              \
        vec_perm(v_srcA1##i, v_srcA2##i, perm##i);                      \
    v_srcAss##i =                                                       \
        (vector signed short)vec_mergeh((vector signed char)zero,       \
                                        (vector signed char)v_srcA##i); }

#define LOAD_LINE_ALIGNED(i)                                            \
    v_srcA##i = vec_ld(j##i, src2);                                     \
    v_srcAss##i =                                                       \
        (vector signed short)vec_mergeh((vector signed char)zero,       \
                                        (vector signed char)v_srcA##i)

    /* Special-casing the aligned case is worthwhile, as all calls from
     * the (transposed) horizontal deblocks will be aligned, in addition
     * to the naturally aligned vertical deblocks. */
    if (properStride && srcAlign) {
        LOAD_LINE_ALIGNED(0);
        LOAD_LINE_ALIGNED(1);
        LOAD_LINE_ALIGNED(2);
        LOAD_LINE_ALIGNED(3);
        LOAD_LINE_ALIGNED(4);
        LOAD_LINE_ALIGNED(5);
        LOAD_LINE_ALIGNED(6);
        LOAD_LINE_ALIGNED(7);
    } else {
        LOAD_LINE(0);
        LOAD_LINE(1);
        LOAD_LINE(2);
        LOAD_LINE(3);
        LOAD_LINE(4);
        LOAD_LINE(5);
        LOAD_LINE(6);
        LOAD_LINE(7);
    }
#undef LOAD_LINE
#undef LOAD_LINE_ALIGNED

#define ITER(i, j)                                                      \
    const vector signed short v_diff##i =                               \
        vec_sub(v_srcAss##i, v_srcAss##j);                              \
    const vector signed short v_sum##i =                                \
        vec_add(v_diff##i, v_dcOffset);                                 \
    const vector signed short v_comp##i =                               \
        (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
                                       v_dcThreshold);                  \
    const vector signed short v_part##i = vec_and(mask, v_comp##i);

    ITER(0, 1);
    ITER(1, 2);
    ITER(2, 3);
    ITER(3, 4);
    ITER(4, 5);
    ITER(5, 6);
    ITER(6, 7);
#undef ITER

    v_numEq = vec_sum4s(v_part0, v_numEq);
    v_numEq = vec_sum4s(v_part1, v_numEq);
    v_numEq = vec_sum4s(v_part2, v_numEq);
    v_numEq = vec_sum4s(v_part3, v_numEq);
    v_numEq = vec_sum4s(v_part4, v_numEq);
    v_numEq = vec_sum4s(v_part5, v_numEq);
    v_numEq = vec_sum4s(v_part6, v_numEq);

    v_numEq = vec_sums(v_numEq, zero);
    v_numEq = vec_splat(v_numEq, 3);
    vec_ste(v_numEq, 0, &numEq);
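    /* Reduction idiom: vec_sum4s adds the two 16-bit lanes within each
     * 32-bit word into the matching accumulator word, vec_sums folds the
     * four accumulators into element 3, vec_splat broadcasts that element,
     * and vec_ste stores the single scalar count into numEq. */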
    if (numEq > c->ppMode.flatnessThreshold) {
        const vector unsigned char mmoP1 = (const vector unsigned char)
            {0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
             0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B};
        const vector unsigned char mmoP2 = (const vector unsigned char)
            {0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
             0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f};
        const vector unsigned char mmoP = (const vector unsigned char)
            vec_lvsl(8, (unsigned char*)0);
        vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
        vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
        vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
        vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
        vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
        vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
        vector signed short mmoDiff = vec_sub(mmoL, mmoR);
        vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
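        /* Branch-free range check: for in-range 16-bit lanes, the unsigned
         * comparison (diff + 2*QP) > 4*QP holds exactly when diff < -2*QP or
         * diff > 2*QP, i.e. when |diff| > 2*QP, without computing an
         * absolute value. */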
        if (vec_any_gt(mmoSum, v4QP))
            return 0;
        else
            return 1;
    }
    else return 2;
}

static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
{
    /* This code makes no assumption on src or stride.
       One could remove the recomputation of the perm
       vector by assuming (stride % 16) == 0, unfortunately
       this is not always true. */
    uint8_t *src2 = src;
    const vector signed int zero = vec_splat_s32(0);
    const int properStride = (stride % 16);
    const int srcAlign = ((unsigned long)src2 % 16);
    DECLARE_ALIGNED(16, short, qp)[8] = {c->QP};
    vector signed short vqp = vec_ld(0, qp);
    vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
    vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4,
                         vbA5, vbA6, vbA7, vbA8, vbA9;
    vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4,
                         vbB5, vbB6, vbB7, vbB8, vbB9;
    vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
    vector unsigned char perml0, perml1, perml2, perml3, perml4,
                         perml5, perml6, perml7, perml8, perml9;
    register int j0 = 0,
                 j1 = stride,
                 j2 = 2 * stride,
                 j3 = 3 * stride,
                 j4 = 4 * stride,
                 j5 = 5 * stride,
                 j6 = 6 * stride,
                 j7 = 7 * stride,
                 j8 = 8 * stride,
                 j9 = 9 * stride;

    vqp = vec_splat(vqp, 0);

    src2 += stride*3;
#define LOAD_LINE(i)                                                    \
    perml##i = vec_lvsl(i * stride, src2);                              \
    vbA##i = vec_ld(i * stride, src2);                                  \
    vbB##i = vec_ld(i * stride + 16, src2);                             \
    vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                        \
    vb##i =                                                             \
        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
                                        (vector unsigned char)vbT##i)

#define LOAD_LINE_ALIGNED(i)                                            \
    vbT##i = vec_ld(j##i, src2);                                        \
    vb##i =                                                             \
        (vector signed short)vec_mergeh((vector signed char)zero,       \
                                        (vector signed char)vbT##i)

    /* Special-casing the aligned case is worthwhile, as all calls from
     * the (transposed) horizontal deblocks will be aligned, in addition
     * to the naturally aligned vertical deblocks. */
    if (properStride && srcAlign) {
        LOAD_LINE_ALIGNED(0);
        LOAD_LINE_ALIGNED(1);
        LOAD_LINE_ALIGNED(2);
        LOAD_LINE_ALIGNED(3);
        LOAD_LINE_ALIGNED(4);
        LOAD_LINE_ALIGNED(5);
        LOAD_LINE_ALIGNED(6);
        LOAD_LINE_ALIGNED(7);
        LOAD_LINE_ALIGNED(8);
        LOAD_LINE_ALIGNED(9);
    } else {
        LOAD_LINE(0);
        LOAD_LINE(1);
        LOAD_LINE(2);
        LOAD_LINE(3);
        LOAD_LINE(4);
        LOAD_LINE(5);
        LOAD_LINE(6);
        LOAD_LINE(7);
        LOAD_LINE(8);
        LOAD_LINE(9);
    }
#undef LOAD_LINE
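/* Unaligned-load idiom used throughout this file: vec_ld can only read from
 * 16-byte-aligned addresses, so the unaligned path loads the two aligned
 * vectors straddling the wanted bytes and extracts them with vec_perm and a
 * vec_lvsl-generated permute; the aligned path is a single vec_ld. */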
#undef LOAD_LINE_ALIGNED

    const vector unsigned short v_2 = vec_splat_u16(2);
    const vector unsigned short v_4 = vec_splat_u16(4);
    const vector signed short v_diff01 = vec_sub(vb0, vb1);
    const vector unsigned short v_cmp01 =
        (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
    const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
    const vector signed short v_diff89 = vec_sub(vb8, vb9);
    const vector unsigned short v_cmp89 =
        (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
    const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
    const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
    const vector signed short temp02 = vec_add(vb2, vb3);
    const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
    const vector signed short v_sumsB0 = vec_add(temp02, temp03);

    const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
    const vector signed short v_sumsB1 = vec_add(temp11, vb4);

    const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
    const vector signed short v_sumsB2 = vec_add(temp21, vb5);

    const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
    const vector signed short v_sumsB3 = vec_add(temp31, vb6);

    const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
    const vector signed short v_sumsB4 = vec_add(temp41, vb7);

    const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
    const vector signed short v_sumsB5 = vec_add(temp51, vb8);

    const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
    const vector signed short v_sumsB6 = vec_add(temp61, v_last);

    const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
    const vector signed short v_sumsB7 = vec_add(temp71, v_last);

    const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
    const vector signed short v_sumsB8 = vec_add(temp81, v_last);

    const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
    const vector signed short v_sumsB9 = vec_add(temp91, v_last);
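    /* Sliding-window sums: v_sumsB0 seeds the window (with the +4 rounding
     * constant folded in once), and each following v_sumsB[k] is derived from
     * its predecessor by subtracting the sample that leaves the window and
     * adding the one that enters, so every further windowed sum costs two
     * vector ops instead of a full re-summation. */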
#define COMPUTE_VR(i, j, k)                                             \
    const vector signed short temps1##i =                               \
        vec_add(v_sumsB##i, v_sumsB##k);                                \
    const vector signed short temps2##i =                               \
        vec_mladd(vb##j, (vector signed short)v_2, temps1##i);          \
    const vector signed short vr##j = vec_sra(temps2##i, v_4)

    COMPUTE_VR(0, 1, 2);
    COMPUTE_VR(1, 2, 3);
    COMPUTE_VR(2, 3, 4);
    COMPUTE_VR(3, 4, 5);
    COMPUTE_VR(4, 5, 6);
    COMPUTE_VR(5, 6, 7);
    COMPUTE_VR(6, 7, 8);
    COMPUTE_VR(7, 8, 9);

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
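    /* COMPUTE_VR evaluates vr_j = (sumsB_i + sumsB_k + 2*vb_j) >> 4: two
     * neighbouring windowed sums plus twice the centre row, normalized by
     * the arithmetic shift; vr_j is the low-passed replacement for row j. */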
#define PACK_AND_STORE(i)                                               \
{   const vector unsigned char perms##i =                               \
        vec_lvsr(i * stride, src2);                                     \
    const vector unsigned char vf##i =                                  \
        vec_packsu(vr##i, (vector signed short)zero);                   \
    const vector unsigned char vg##i =                                  \
        vec_perm(vf##i, vbT##i, permHH);                                \
    const vector unsigned char mask##i =                                \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i =                                 \
        vec_perm(vg##i, vg##i, perms##i);                               \
    const vector unsigned char svA##i =                                 \
        vec_sel(vbA##i, vg2##i, mask##i);                               \
    const vector unsigned char svB##i =                                 \
        vec_sel(vg2##i, vbB##i, mask##i);                               \
    vec_st(svA##i, i * stride, src2);                                   \
    vec_st(svB##i, i * stride + 16, src2);}

#define PACK_AND_STORE_ALIGNED(i)                                       \
{   const vector unsigned char vf##i =                                  \
        vec_packsu(vr##i, (vector signed short)zero);                   \
    const vector unsigned char vg##i =                                  \
        vec_perm(vf##i, vbT##i, permHH);                                \
    vec_st(vg##i, i * stride, src2);}

    /* Write the eight filtered rows back, taking the aligned fast path
     * when possible. */
    if (properStride && srcAlign) {
        PACK_AND_STORE_ALIGNED(1);
        PACK_AND_STORE_ALIGNED(2);
        PACK_AND_STORE_ALIGNED(3);
        PACK_AND_STORE_ALIGNED(4);
        PACK_AND_STORE_ALIGNED(5);
        PACK_AND_STORE_ALIGNED(6);
        PACK_AND_STORE_ALIGNED(7);
        PACK_AND_STORE_ALIGNED(8);
    } else {
        PACK_AND_STORE(1);
        PACK_AND_STORE(2);
        PACK_AND_STORE(3);
        PACK_AND_STORE(4);
        PACK_AND_STORE(5);
        PACK_AND_STORE(6);
        PACK_AND_STORE(7);
        PACK_AND_STORE(8);
    }
#undef PACK_AND_STORE
#undef PACK_AND_STORE_ALIGNED
}

static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
{
    /* This code makes no assumption on src or stride.
       One could remove the recomputation of the perm
       vector by assuming (stride % 16) == 0, unfortunately
       this is not always true. */
    uint8_t *src2 = src + stride*3;
    const vector signed int zero = vec_splat_s32(0);
    DECLARE_ALIGNED(16, short, qp)[8] = {8*c->QP};
    vector signed short vqp = vec_splat(
        (vector signed short)vec_ld(0, qp), 0);
#define LOAD_LINE(i)                                                    \
    const vector unsigned char perm##i =                                \
        vec_lvsl(i * stride, src2);                                     \
    const vector unsigned char vbA##i =                                 \
        vec_ld(i * stride, src2);                                       \
    const vector unsigned char vbB##i =                                 \
        vec_ld(i * stride + 16, src2);                                  \
    const vector unsigned char vbT##i =                                 \
        vec_perm(vbA##i, vbB##i, perm##i);                              \
    const vector signed short vb##i =                                   \
        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
                                        (vector unsigned char)vbT##i)

    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
#undef LOAD_LINE

    const vector signed short v_1 = vec_splat_s16(1);
    const vector signed short v_2 = vec_splat_s16(2);
    const vector signed short v_5 = vec_splat_s16(5);
    const vector signed short v_32 = vec_sl(v_1,
                                            (vector unsigned short)v_5);
    const vector signed short l3minusl6 = vec_sub(vb3, vb6);
    const vector signed short l5minusl4 = vec_sub(vb5, vb4);
    const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
    const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
    const vector signed short absmE = vec_abs(mE);

    const vector signed short l1minusl4 = vec_sub(vb1, vb4);
    const vector signed short l3minusl2 = vec_sub(vb3, vb2);
    const vector signed short l5minusl8 = vec_sub(vb5, vb8);
    const vector signed short l7minusl6 = vec_sub(vb7, vb6);
    const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
    const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
    const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
    const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
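    /* mE, lE and rE are the middle, left and right "energy" terms of the
     * def filter, e.g. mE = 5*(l5-l4) + 2*(l3-l6) per 16-bit lane; the
     * correction d computed below is driven by |mE| versus min(|lE|, |rE|). */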
    const vector signed short ddiff = vec_sub(absmE,
                                              vec_min(vec_abs(lE),
                                                      vec_abs(rE)));
    const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
    const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
    const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
    const vector signed short minusd = vec_sub((vector signed short)zero, d);
    const vector signed short finald = vec_sel(minusd,
                                               d,
                                               vec_cmpgt(vec_sub((vector signed short)zero, mE),
                                                         (vector signed short)zero));
    const vector signed short qtimes2 = vec_sub(vb4, vb5);
    /* for a shift right to behave like /2, we need to add one
       to all negative integers */
    const vector signed short rounddown = vec_sel((vector signed short)zero,
                                                  v_1,
                                                  vec_cmplt(qtimes2, (vector signed short)zero));
    const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
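    /* vec_sra rounds toward minus infinity, so adding 1 to the negative
     * lanes first makes (qtimes2 + rounddown) >> 1 behave like C's
     * truncating division by two for both signs. */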
    const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
    const vector signed short dclamp_P = vec_min(dclamp_P1, q);
    const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
    const vector signed short dclamp_N = vec_max(dclamp_N1, q);

    const vector signed short dclampedfinal = vec_sel(dclamp_N,
                                                      dclamp_P,
                                                      vec_cmpgt(q, (vector signed short)zero));
    const vector signed short dornotd = vec_sel((vector signed short)zero,
                                                dclampedfinal,
                                                vec_cmplt(absmE, vqp));
    const vector signed short vb4minusd = vec_sub(vb4, dornotd);
    const vector signed short vb5plusd  = vec_add(vb5, dornotd);

    const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
    const vector unsigned char st5 = vec_packsu(vb5plusd,  (vector signed short)zero);
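    /* Only the two rows flanking the block boundary (vb4 and vb5) are
     * modified: the correction is subtracted from one side and added to the
     * other, then each row is saturation-packed back to 8-bit pixels. */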
    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};

#define STORE(i)                                                        \
{   const vector unsigned char perms##i =                               \
        vec_lvsr(i * stride, src2);                                     \
    const vector unsigned char vg##i =                                  \
        vec_perm(st##i, vbT##i, permHH);                                \
    const vector unsigned char mask##i =                                \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i =                                 \
        vec_perm(vg##i, vg##i, perms##i);                               \
    const vector unsigned char svA##i =                                 \
        vec_sel(vbA##i, vg2##i, mask##i);                               \
    const vector unsigned char svB##i =                                 \
        vec_sel(vg2##i, vbB##i, mask##i);                               \
    vec_st(svA##i, i * stride, src2);                                   \
    vec_st(svB##i, i * stride + 16, src2);}

    STORE(4);
    STORE(5);
#undef STORE
}

static void dering_altivec(uint8_t src[], int stride, PPContext *c)
{
    const vector signed int vsint32_8 = vec_splat_s32(8);
    const vector unsigned int vuint32_4 = vec_splat_u32(4);
    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permA1 = (vector unsigned char)
        {0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
         0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
    const vector unsigned char permA2 = (vector unsigned char)
        {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
         0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
    const vector unsigned char permA1inc = (vector unsigned char)
        {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    const vector unsigned char permA2inc = (vector unsigned char)
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
         0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    const vector unsigned char magic = (vector unsigned char)
        {0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
         0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    const vector unsigned char extractPerm = (vector unsigned char)
        {0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
         0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01};
    const vector unsigned char extractPermInc = (vector unsigned char)
        {0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
         0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01};
    const vector unsigned char identity = vec_lvsl(0, (unsigned char *)0);
    const vector unsigned char tenRight = (vector unsigned char)
        {0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    const vector unsigned char eightLeft = (vector unsigned char)
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08};
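    /* magic holds the 3x3 kernel weights {1,2,1, 2,4,2, 1,2,1}; permA1/permA2
     * gather a 3x3 pixel neighbourhood from three source rows, the *inc
     * vectors advance those gathers one pixel to the right per F2 step below,
     * and tenRight/eightLeft walk the insertion position for the filtered
     * byte across the row. */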
    DECLARE_ALIGNED(16, uint8_t, dt)[16] = { deringThreshold };
    uint8_t *srcCopy = src;
    const vector signed int zero = vec_splat_s32(0);
    vector unsigned char v_dt = vec_splat(vec_ld(0, dt), 0);
#define LOAD_LINE(i)                                                    \
    const vector unsigned char perm##i =                                \
        vec_lvsl(i * stride, srcCopy);                                  \
    vector unsigned char sA##i = vec_ld(i * stride, srcCopy);           \
    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy);      \
    vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)

    LOAD_LINE(0);
    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
    LOAD_LINE(9);
#undef LOAD_LINE

    vector unsigned char v_avg;
    DECLARE_ALIGNED(16, int, S)[8];
    DECLARE_ALIGNED(16, int, tQP2)[4] = {c->QP/2 + 1};
    vector signed int vQP2 = vec_ld(0, tQP2);
    vQP2 = vec_splat(vQP2, 0);
    const vector unsigned char trunc_perm = (vector unsigned char)
        {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
         0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
    const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
    const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
    const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
#define EXTRACT(op) do {                                                  \
    const vector unsigned char s_1 = vec_##op(trunc_src12, trunc_src34);  \
    const vector unsigned char s_2 = vec_##op(trunc_src56, trunc_src78);  \
    const vector unsigned char s_6 = vec_##op(s_1, s_2);                  \
    const vector unsigned char s_8h = vec_mergeh(s_6, s_6);               \
    const vector unsigned char s_8l = vec_mergel(s_6, s_6);               \
    const vector unsigned char s_9 = vec_##op(s_8h, s_8l);                \
    const vector unsigned char s_9h = vec_mergeh(s_9, s_9);               \
    const vector unsigned char s_9l = vec_mergel(s_9, s_9);               \
    const vector unsigned char s_10 = vec_##op(s_9h, s_9l);               \
    const vector unsigned char s_10h = vec_mergeh(s_10, s_10);            \
    const vector unsigned char s_10l = vec_mergel(s_10, s_10);            \
    const vector unsigned char s_11 = vec_##op(s_10h, s_10l);             \
    const vector unsigned char s_11h = vec_mergeh(s_11, s_11);            \
    const vector unsigned char s_11l = vec_mergel(s_11, s_11);            \
    v_##op = vec_##op(s_11h, s_11l);                                      \
} while (0)

    vector unsigned char v_min;
    vector unsigned char v_max;
    EXTRACT(min);
    EXTRACT(max);
#undef EXTRACT

    if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
        return;

    v_avg = vec_avg(v_min, v_max);
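    /* EXTRACT is a log2-style reduction: pairwise vec_min/vec_max combines,
     * then repeated mergeh/mergel + op halve the number of distinct lanes
     * until every byte of v_min/v_max holds the block-wide minimum/maximum.
     * If max - min stays below the dering threshold, the block is left
     * untouched. */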
    const vector unsigned short mask1 = (vector unsigned short)
        {0x0001, 0x0002, 0x0004, 0x0008,
         0x0010, 0x0020, 0x0040, 0x0080};
    const vector unsigned short mask2 = (vector unsigned short)
        {0x0100, 0x0200, 0x0000, 0x0000,
         0x0000, 0x0000, 0x0000, 0x0000};

    const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
    const vector unsigned int vuint32_1 = vec_splat_u32(1);

    vector signed int sumA2;
    vector signed int sumB2;
    vector signed int sum0, sum1, sum2, sum3, sum4;
    vector signed int sum5, sum6, sum7, sum8, sum9;
#define COMPARE(i)                                                      \
    do {                                                                \
        const vector unsigned char cmp =                                \
            (vector unsigned char)vec_cmpgt(src##i, v_avg);             \
        const vector unsigned short cmpHi =                             \
            (vector unsigned short)vec_mergeh(cmp, cmp);                \
        const vector unsigned short cmpLi =                             \
            (vector unsigned short)vec_mergel(cmp, cmp);                \
        const vector signed short cmpHf =                               \
            (vector signed short)vec_and(cmpHi, mask1);                 \
        const vector signed short cmpLf =                               \
            (vector signed short)vec_and(cmpLi, mask2);                 \
        const vector signed int sump = vec_sum4s(cmpHf, zero);          \
        const vector signed int sumq = vec_sum4s(cmpLf, sump);          \
        sum##i = vec_sums(sumq, zero);                                  \
    } while (0)

    COMPARE(0);
    COMPARE(1);
    COMPARE(2);
    COMPARE(3);
    COMPARE(4);
    COMPARE(5);
    COMPARE(6);
    COMPARE(7);
    COMPARE(8);
    COMPARE(9);
#undef COMPARE
signed int sump02 = vec_mergel(sum0, sum2);
691 const vector
signed int sump13 = vec_mergel(sum1, sum3);
692 const vector
signed int sumA = vec_mergel(sump02, sump13);
694 const vector
signed int sump46 = vec_mergel(sum4, sum6);
695 const vector
signed int sump57 = vec_mergel(sum5, sum7);
696 const vector
signed int sumB = vec_mergel(sump46, sump57);
698 const vector
signed int sump8A = vec_mergel(sum8, zero);
699 const vector
signed int sump9B = vec_mergel(sum9, zero);
700 const vector
signed int sumC = vec_mergel(sump8A, sump9B);
    const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
    const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
    const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
    const vector signed int t2A = vec_or(sumA, tA);
    const vector signed int t2B = vec_or(sumB, tB);
    const vector signed int t2C = vec_or(sumC, tC);
    const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
                                          vec_sl(t2A, vuint32_1));
    const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
                                          vec_sl(t2B, vuint32_1));
    const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
                                          vec_sl(t2C, vuint32_1));
    const vector signed int yA = vec_and(t2A, t3A);
    const vector signed int yB = vec_and(t2B, t3B);
    const vector signed int yC = vec_and(t2C, t3C);
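    /* Each sum* word is a 10-bit bitmap of which pixels in one line lie above
     * v_avg. vec_nor + shift fills the guard bits, and t2 & (t2 >> 1) &
     * (t2 << 1) keeps only bits whose horizontal neighbours agree; the
     * permuted ANDs below extend the same test vertically, producing the
     * per-line masks S[] that gate the filtering in F2. */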
    const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
    const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
    const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
    const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
    const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
    const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
    const vector signed int sumAp = vec_and(yA,
                                            vec_and(sumAd4, sumAd8));
    const vector signed int sumBp = vec_and(yB,
                                            vec_and(sumBd4, sumBd8));
    sumA2 = vec_or(sumAp,
                   vec_sra(sumAp,
                           vuint32_16));
    sumB2 = vec_or(sumBp,
                   vec_sra(sumBp,
                           vuint32_16));
    vec_st(sumA2, 0, S);
    vec_st(sumB2, 16, S);
#define F_INIT()                                            \
    vector unsigned char tenRightM = tenRight;              \
    vector unsigned char permA1M = permA1;                  \
    vector unsigned char permA2M = permA2;                  \
    vector unsigned char extractPermM = extractPerm

#define F2(i, j, k, l)                                                  \
    if (S[i] & (1 << (l+1))) {                                          \
        const vector unsigned char a_A = vec_perm(src##i, src##j, permA1M); \
        const vector unsigned char a_B = vec_perm(a_A, src##k, permA2M); \
        const vector signed int a_sump =                                \
            (vector signed int)vec_msum(a_B, magic, (vector unsigned int)zero);\
        vector signed int F = vec_sr(vec_sums(a_sump, vsint32_8), vuint32_4); \
        const vector signed int p =                                     \
            (vector signed int)vec_perm(src##j, (vector unsigned char)zero, \
                                        extractPermM);                  \
        const vector signed int sum = vec_add(p, vQP2);                 \
        const vector signed int diff = vec_sub(p, vQP2);                \
        vector signed int newpm;                                        \
        vector unsigned char newpm2, mask;                              \
        F = vec_splat(F, 3);                                            \
        if (vec_all_lt(sum, F))                                         \
            newpm = sum;                                                \
        else if (vec_all_gt(diff, F))                                   \
            newpm = diff;                                               \
        else                                                            \
            newpm = F;                                                  \
        newpm2 = vec_splat((vector unsigned char)newpm, 15);            \
        mask = vec_add(identity, tenRightM);                            \
        src##j = vec_perm(src##j, newpm2, mask);                        \
    }                                                                   \
    permA1M = vec_add(permA1M, permA1inc);                              \
    permA2M = vec_add(permA2M, permA2inc);                              \
    tenRightM = vec_sro(tenRightM, eightLeft);                          \
    extractPermM = vec_add(extractPermM, extractPermInc)

#define ITER(i, j, k) do {                      \
        F_INIT();                               \
        F2(i, j, k, 0);                         \
        F2(i, j, k, 1);                         \
        F2(i, j, k, 2);                         \
        F2(i, j, k, 3);                         \
        F2(i, j, k, 4);                         \
        F2(i, j, k, 5);                         \
        F2(i, j, k, 6);                         \
        F2(i, j, k, 7);                         \
    } while (0)

    ITER(0, 1, 2);
    ITER(1, 2, 3);
    ITER(2, 3, 4);
    ITER(3, 4, 5);
    ITER(4, 5, 6);
    ITER(5, 6, 7);
    ITER(6, 7, 8);
    ITER(7, 8, 9);

#define STORE_LINE(i) do {                                  \
        const vector unsigned char permST =                 \
            vec_lvsr(i * stride, srcCopy);                  \
        const vector unsigned char maskST =                 \
            vec_perm((vector unsigned char)zero,            \
                     (vector unsigned char)neg1, permST);   \
        src##i = vec_perm(src##i, src##i, permST);          \
        sA##i = vec_sel(sA##i, src##i, maskST);             \
        sB##i = vec_sel(src##i, sB##i, maskST);             \
        vec_st(sA##i, i * stride, srcCopy);                 \
        vec_st(sB##i, i * stride + 16, srcCopy);            \
    } while (0)

    STORE_LINE(1);
    STORE_LINE(2);
    STORE_LINE(3);
    STORE_LINE(4);
    STORE_LINE(5);
    STORE_LINE(6);
    STORE_LINE(7);
    STORE_LINE(8);

#undef STORE_LINE
#undef ITER
#undef F2
#undef F_INIT
}

#define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
#define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
#define do_a_deblock_altivec(a...) do_a_deblock_C(a)

static void tempNoiseReducer_altivec(uint8_t *src, int stride,
                                     uint8_t *tempBlurred, uint32_t *tempBlurredPast,
                                     int *maxNoise)
{
    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};

    const vector signed int zero = vec_splat_s32(0);
    const vector signed short vsint16_1 = vec_splat_s16(1);
    vector signed int v_dp = zero;
    vector signed int v_sysdp = zero;
    int d, sysd, i;
#define LOAD_LINE(src, i)                                               \
    register int j##src##i = i * stride;                                \
    vector unsigned char perm##src##i = vec_lvsl(j##src##i, src);       \
    const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
    const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
    const vector unsigned char v_##src##A##i =                          \
        vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i);         \
    vector signed short v_##src##Ass##i =                               \
        (vector signed short)vec_mergeh((vector signed char)zero,       \
                                        (vector signed char)v_##src##A##i)

    LOAD_LINE(src, 0);
    LOAD_LINE(src, 1);
    LOAD_LINE(src, 2);
    LOAD_LINE(src, 3);
    LOAD_LINE(src, 4);
    LOAD_LINE(src, 5);
    LOAD_LINE(src, 6);
    LOAD_LINE(src, 7);

    LOAD_LINE(tempBlurred, 0);
    LOAD_LINE(tempBlurred, 1);
    LOAD_LINE(tempBlurred, 2);
    LOAD_LINE(tempBlurred, 3);
    LOAD_LINE(tempBlurred, 4);
    LOAD_LINE(tempBlurred, 5);
    LOAD_LINE(tempBlurred, 6);
    LOAD_LINE(tempBlurred, 7);
#undef LOAD_LINE

#define ACCUMULATE_DIFFS(i) do {                               \
        vector signed short v_d = vec_sub(v_tempBlurredAss##i, \
                                          v_srcAss##i);        \
        v_dp = vec_msums(v_d, v_d, v_dp);                      \
        v_sysdp = vec_msums(v_d, vsint16_1, v_sysdp);          \
    } while (0)

    ACCUMULATE_DIFFS(0);
    ACCUMULATE_DIFFS(1);
    ACCUMULATE_DIFFS(2);
    ACCUMULATE_DIFFS(3);
    ACCUMULATE_DIFFS(4);
    ACCUMULATE_DIFFS(5);
    ACCUMULATE_DIFFS(6);
    ACCUMULATE_DIFFS(7);
#undef ACCUMULATE_DIFFS
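    /* vec_msums multiply-sums pairs of 16-bit lanes into 32-bit
     * accumulators: with v_d * v_d it gathers the sum of squared differences
     * between src and tempBlurred, and with vsint16_1 the plain signed sum,
     * matching the d and sysd noise measures of the scalar filter. */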
    tempBlurredPast[127]= maxNoise[0];
    tempBlurredPast[128]= maxNoise[1];
    tempBlurredPast[129]= maxNoise[2];

    v_dp = vec_sums(v_dp, zero);
    v_sysdp = vec_sums(v_sysdp, zero);

    v_dp = vec_splat(v_dp, 3);
    v_sysdp = vec_splat(v_sysdp, 3);

    vec_ste(v_dp, 0, &d);
    vec_ste(v_sysdp, 0, &sysd);
    i = d;
    d = (4*d
         +(*(tempBlurredPast-256))
         +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
         +(*(tempBlurredPast+256))
         +4)>>3;
    *tempBlurredPast = i;

    if (d > maxNoise[1]) {
        if (d < maxNoise[2]) {
            /* moderate difference: average src and tempBlurred */
#define OP(i) v_tempBlurredAss##i = vec_avg(v_tempBlurredAss##i, v_srcAss##i);
            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        } else {
            /* large difference: pass src through unchanged */
#define OP(i) v_tempBlurredAss##i = v_srcAss##i;
            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        }
    } else {
        if (d < maxNoise[0]) {
            const vector signed short vsint16_7 = vec_splat_s16(7);
            const vector signed short vsint16_4 = vec_splat_s16(4);
            const vector unsigned short vuint16_3 = vec_splat_u16(3);
            /* low noise: strong temporal blur, (7*tempBlurred + src + 4) >> 3 */
#define OP(i) do {                                                      \
        const vector signed short v_temp =                              \
            vec_mladd(v_tempBlurredAss##i, vsint16_7, v_srcAss##i);     \
        const vector signed short v_temp2 = vec_add(v_temp, vsint16_4); \
        v_tempBlurredAss##i = vec_sr(v_temp2, vuint16_3);               \
    } while (0)

            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        } else {
            const vector signed short vsint16_3 = vec_splat_s16(3);
            const vector signed short vsint16_2 = vec_splat_s16(2);

            /* medium noise: weaker blur, (3*tempBlurred + src + 2) >> 2 */
#define OP(i) do {                                                      \
        const vector signed short v_temp =                              \
            vec_mladd(v_tempBlurredAss##i, vsint16_3, v_srcAss##i);     \
        const vector signed short v_temp2 = vec_add(v_temp, vsint16_2); \
        v_tempBlurredAss##i =                                           \
            vec_sr(v_temp2, (vector unsigned short)vsint16_2);          \
    } while (0)

            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        }
    }

#define PACK_AND_STORE(src, i) do {                                     \
        const vector unsigned char perms = vec_lvsr(i * stride, src);   \
        const vector unsigned char vf =                                 \
            vec_packsu(v_tempBlurredAss##i, (vector signed short)zero); \
        const vector unsigned char vg = vec_perm(vf, v_##src##A##i, permHH); \
        const vector unsigned char mask =                               \
            vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms); \
        const vector unsigned char vg2 = vec_perm(vg, vg, perms);       \
        const vector unsigned char svA = vec_sel(v_##src##A1##i, vg2, mask); \
        const vector unsigned char svB = vec_sel(vg2, v_##src##A2##i, mask); \
        vec_st(svA, i * stride, src);                                   \
        vec_st(svB, i * stride + 16, src);                              \
    } while (0)

    PACK_AND_STORE(src, 0);
    PACK_AND_STORE(src, 1);
    PACK_AND_STORE(src, 2);
    PACK_AND_STORE(src, 3);
    PACK_AND_STORE(src, 4);
    PACK_AND_STORE(src, 5);
    PACK_AND_STORE(src, 6);
    PACK_AND_STORE(src, 7);
    PACK_AND_STORE(tempBlurred, 0);
    PACK_AND_STORE(tempBlurred, 1);
    PACK_AND_STORE(tempBlurred, 2);
    PACK_AND_STORE(tempBlurred, 3);
    PACK_AND_STORE(tempBlurred, 4);
    PACK_AND_STORE(tempBlurred, 5);
    PACK_AND_STORE(tempBlurred, 6);
    PACK_AND_STORE(tempBlurred, 7);
#undef PACK_AND_STORE
}

static void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst,
                                                      unsigned char* src, int stride)
{
    const vector unsigned char zero = vec_splat_u8(0);
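    /* The transpose helpers are built purely from vec_mergeh/vec_mergel:
     * three interleaving rounds move each byte to its transposed position,
     * with the zero vector woven in here so every output row is padded to a
     * full 16 bytes for the aligned stores into dst. */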
#define LOAD_DOUBLE_LINE(i, j)                                          \
    vector unsigned char perm1##i = vec_lvsl(i * stride, src);          \
    vector unsigned char perm2##i = vec_lvsl(j * stride, src);          \
    vector unsigned char srcA##i = vec_ld(i * stride, src);             \
    vector unsigned char srcB##i = vec_ld(i * stride + 16, src);        \
    vector unsigned char srcC##i = vec_ld(j * stride, src);             \
    vector unsigned char srcD##i = vec_ld(j * stride + 16, src);        \
    vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
    vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)

    LOAD_DOUBLE_LINE(0, 1);
    LOAD_DOUBLE_LINE(2, 3);
    LOAD_DOUBLE_LINE(4, 5);
    LOAD_DOUBLE_LINE(6, 7);
#undef LOAD_DOUBLE_LINE

    vector unsigned char tempA = vec_mergeh(src0, zero);
    vector unsigned char tempB = vec_mergel(src0, zero);
    vector unsigned char tempC = vec_mergeh(src1, zero);
    vector unsigned char tempD = vec_mergel(src1, zero);
    vector unsigned char tempE = vec_mergeh(src2, zero);
    vector unsigned char tempF = vec_mergel(src2, zero);
    vector unsigned char tempG = vec_mergeh(src3, zero);
    vector unsigned char tempH = vec_mergel(src3, zero);
    vector unsigned char tempI = vec_mergeh(src4, zero);
    vector unsigned char tempJ = vec_mergel(src4, zero);
    vector unsigned char tempK = vec_mergeh(src5, zero);
    vector unsigned char tempL = vec_mergel(src5, zero);
    vector unsigned char tempM = vec_mergeh(src6, zero);
    vector unsigned char tempN = vec_mergel(src6, zero);
    vector unsigned char tempO = vec_mergeh(src7, zero);
    vector unsigned char tempP = vec_mergel(src7, zero);

    vector unsigned char temp0 = vec_mergeh(tempA, tempI);
    vector unsigned char temp1 = vec_mergel(tempA, tempI);
    vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
    vector unsigned char temp3 = vec_mergel(tempB, tempJ);
    vector unsigned char temp4 = vec_mergeh(tempC, tempK);
    vector unsigned char temp5 = vec_mergel(tempC, tempK);
    vector unsigned char temp6 = vec_mergeh(tempD, tempL);
    vector unsigned char temp7 = vec_mergel(tempD, tempL);
    vector unsigned char temp8 = vec_mergeh(tempE, tempM);
    vector unsigned char temp9 = vec_mergel(tempE, tempM);
    vector unsigned char temp10 = vec_mergeh(tempF, tempN);
    vector unsigned char temp11 = vec_mergel(tempF, tempN);
    vector unsigned char temp12 = vec_mergeh(tempG, tempO);
    vector unsigned char temp13 = vec_mergel(tempG, tempO);
    vector unsigned char temp14 = vec_mergeh(tempH, tempP);
    vector unsigned char temp15 = vec_mergel(tempH, tempP);
    tempA = vec_mergeh(temp0, temp8);
    tempB = vec_mergel(temp0, temp8);
    tempC = vec_mergeh(temp1, temp9);
    tempD = vec_mergel(temp1, temp9);
    tempE = vec_mergeh(temp2, temp10);
    tempF = vec_mergel(temp2, temp10);
    tempG = vec_mergeh(temp3, temp11);
    tempH = vec_mergel(temp3, temp11);
    tempI = vec_mergeh(temp4, temp12);
    tempJ = vec_mergel(temp4, temp12);
    tempK = vec_mergeh(temp5, temp13);
    tempL = vec_mergel(temp5, temp13);
    tempM = vec_mergeh(temp6, temp14);
    tempN = vec_mergel(temp6, temp14);
    tempO = vec_mergeh(temp7, temp15);
    tempP = vec_mergel(temp7, temp15);
    temp0 = vec_mergeh(tempA, tempI);
    temp1 = vec_mergel(tempA, tempI);
    temp2 = vec_mergeh(tempB, tempJ);
    temp3 = vec_mergel(tempB, tempJ);
    temp4 = vec_mergeh(tempC, tempK);
    temp5 = vec_mergel(tempC, tempK);
    temp6 = vec_mergeh(tempD, tempL);
    temp7 = vec_mergel(tempD, tempL);
    temp8 = vec_mergeh(tempE, tempM);
    temp9 = vec_mergel(tempE, tempM);
    temp10 = vec_mergeh(tempF, tempN);
    temp11 = vec_mergel(tempF, tempN);
    temp12 = vec_mergeh(tempG, tempO);
    temp13 = vec_mergel(tempG, tempO);
    temp14 = vec_mergeh(tempH, tempP);
    temp15 = vec_mergel(tempH, tempP);
    vec_st(temp0, 0, dst);
    vec_st(temp1, 16, dst);
    vec_st(temp2, 32, dst);
    vec_st(temp3, 48, dst);
    vec_st(temp4, 64, dst);
    vec_st(temp5, 80, dst);
    vec_st(temp6, 96, dst);
    vec_st(temp7, 112, dst);
    vec_st(temp8, 128, dst);
    vec_st(temp9, 144, dst);
    vec_st(temp10, 160, dst);
    vec_st(temp11, 176, dst);
    vec_st(temp12, 192, dst);
    vec_st(temp13, 208, dst);
    vec_st(temp14, 224, dst);
    vec_st(temp15, 240, dst);
}

static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst,
                                                        unsigned char* src, int stride)
{
    const vector unsigned char zero = vec_splat_u8(0);
    const vector signed char neg1 = vec_splat_s8(-1);
#define LOAD_DOUBLE_LINE(i, j)                                          \
    vector unsigned char src##i = vec_ld(i * 16, src);                  \
    vector unsigned char src##j = vec_ld(j * 16, src)

    LOAD_DOUBLE_LINE(0, 1);
    LOAD_DOUBLE_LINE(2, 3);
    LOAD_DOUBLE_LINE(4, 5);
    LOAD_DOUBLE_LINE(6, 7);
    LOAD_DOUBLE_LINE(8, 9);
    LOAD_DOUBLE_LINE(10, 11);
    LOAD_DOUBLE_LINE(12, 13);
    LOAD_DOUBLE_LINE(14, 15);
#undef LOAD_DOUBLE_LINE

    vector unsigned char tempA = vec_mergeh(src0, src8);
    vector unsigned char tempB;
    vector unsigned char tempC = vec_mergeh(src1, src9);
    vector unsigned char tempD;
    vector unsigned char tempE = vec_mergeh(src2, src10);
    vector unsigned char tempG = vec_mergeh(src3, src11);
    vector unsigned char tempI = vec_mergeh(src4, src12);
    vector unsigned char tempJ;
    vector unsigned char tempK = vec_mergeh(src5, src13);
    vector unsigned char tempL;
    vector unsigned char tempM = vec_mergeh(src6, src14);
    vector unsigned char tempO = vec_mergeh(src7, src15);

    vector unsigned char temp0 = vec_mergeh(tempA, tempI);
    vector unsigned char temp1 = vec_mergel(tempA, tempI);
    vector unsigned char temp2;
    vector unsigned char temp3;
    vector unsigned char temp4 = vec_mergeh(tempC, tempK);
    vector unsigned char temp5 = vec_mergel(tempC, tempK);
    vector unsigned char temp6;
    vector unsigned char temp7;
    vector unsigned char temp8 = vec_mergeh(tempE, tempM);
    vector unsigned char temp9 = vec_mergel(tempE, tempM);
    vector unsigned char temp12 = vec_mergeh(tempG, tempO);
    vector unsigned char temp13 = vec_mergel(tempG, tempO);

    tempA = vec_mergeh(temp0, temp8);
    tempB = vec_mergel(temp0, temp8);
    tempC = vec_mergeh(temp1, temp9);
    tempD = vec_mergel(temp1, temp9);
    tempI = vec_mergeh(temp4, temp12);
    tempJ = vec_mergel(temp4, temp12);
    tempK = vec_mergeh(temp5, temp13);
    tempL = vec_mergel(temp5, temp13);

    temp0 = vec_mergeh(tempA, tempI);
    temp1 = vec_mergel(tempA, tempI);
    temp2 = vec_mergeh(tempB, tempJ);
    temp3 = vec_mergel(tempB, tempJ);
    temp4 = vec_mergeh(tempC, tempK);
    temp5 = vec_mergel(tempC, tempK);
    temp6 = vec_mergeh(tempD, tempL);
    temp7 = vec_mergel(tempD, tempL);
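    /* STORE_DOUBLE_LINE below performs an unaligned read-modify-write: it
     * loads the two destination vectors covering each row, rotates the new
     * bytes into place with vec_lvsr/vec_perm, and builds a select mask with
     * vec_perm over zero/neg1 so vec_sel replaces only the bytes that belong
     * to the row, leaving the surrounding memory untouched. */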
#define STORE_DOUBLE_LINE(i, j) do {                                     \
        vector unsigned char dstAi = vec_ld(i * stride, dst);            \
        vector unsigned char dstBi = vec_ld(i * stride + 16, dst);       \
        vector unsigned char dstAj = vec_ld(j * stride, dst);            \
        vector unsigned char dstBj = vec_ld(j * stride + 16, dst);       \
        vector unsigned char aligni = vec_lvsr(i * stride, dst);         \
        vector unsigned char alignj = vec_lvsr(j * stride, dst);         \
        vector unsigned char maski =                                     \
            vec_perm(zero, (vector unsigned char)neg1, aligni);          \
        vector unsigned char maskj =                                     \
            vec_perm(zero, (vector unsigned char)neg1, alignj);          \
        vector unsigned char dstRi = vec_perm(temp##i, temp##i, aligni);  \
        vector unsigned char dstRj = vec_perm(temp##j, temp##j, alignj);  \
        vector unsigned char dstAFi = vec_sel(dstAi, dstRi, maski);      \
        vector unsigned char dstBFi = vec_sel(dstRi, dstBi, maski);      \
        vector unsigned char dstAFj = vec_sel(dstAj, dstRj, maskj);      \
        vector unsigned char dstBFj = vec_sel(dstRj, dstBj, maskj);      \
        vec_st(dstAFi, i * stride, dst);                                 \
        vec_st(dstBFi, i * stride + 16, dst);                            \
        vec_st(dstAFj, j * stride, dst);                                 \
        vec_st(dstBFj, j * stride + 16, dst);                            \
    } while (0)

    STORE_DOUBLE_LINE(0, 1);
    STORE_DOUBLE_LINE(2, 3);
    STORE_DOUBLE_LINE(4, 5);
    STORE_DOUBLE_LINE(6, 7);
#undef STORE_DOUBLE_LINE
}