FFmpeg
vp3dsp_idct_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "vp3dsp_mips.h"
23 #include "libavutil/intreadwrite.h"
24 #include "libavcodec/rnd_avg.h"
25 
26 static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
27 {
28  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
29  v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
30  r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
31  v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
32  v4i32 Ed, Gd, Add, Bdd, Fd, Hd;
33  v16u8 sign_l;
34  v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
35  v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
36  v4i32 f0, f1, f2, f3, f4, f5, f6, f7;
37  v4i32 sign_t;
38  v16i8 zero = {0};
39  v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
40  v4i32 cnst64277w = {64277, 64277, 64277, 64277};
41  v4i32 cnst60547w = {60547, 60547, 60547, 60547};
42  v4i32 cnst54491w = {54491, 54491, 54491, 54491};
43  v4i32 cnst46341w = {46341, 46341, 46341, 46341};
44  v4i32 cnst36410w = {36410, 36410, 36410, 36410};
45  v4i32 cnst25080w = {25080, 25080, 25080, 25080};
46  v4i32 cnst12785w = {12785, 12785, 12785, 12785};
47  v4i32 cnst8w = {8, 8, 8, 8};
48  v4i32 cnst2048w = {2048, 2048, 2048, 2048};
49  v4i32 cnst128w = {128, 128, 128, 128};
50 
51  /* Extended input data */
52  LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
53  sign = __msa_clti_s_h(r0, 0);
54  r0_r = (v4i32) __msa_ilvr_h(sign, r0);
55  r0_l = (v4i32) __msa_ilvl_h(sign, r0);
56  sign = __msa_clti_s_h(r1, 0);
57  r1_r = (v4i32) __msa_ilvr_h(sign, r1);
58  r1_l = (v4i32) __msa_ilvl_h(sign, r1);
59  sign = __msa_clti_s_h(r2, 0);
60  r2_r = (v4i32) __msa_ilvr_h(sign, r2);
61  r2_l = (v4i32) __msa_ilvl_h(sign, r2);
62  sign = __msa_clti_s_h(r3, 0);
63  r3_r = (v4i32) __msa_ilvr_h(sign, r3);
64  r3_l = (v4i32) __msa_ilvl_h(sign, r3);
65  sign = __msa_clti_s_h(r4, 0);
66  r4_r = (v4i32) __msa_ilvr_h(sign, r4);
67  r4_l = (v4i32) __msa_ilvl_h(sign, r4);
68  sign = __msa_clti_s_h(r5, 0);
69  r5_r = (v4i32) __msa_ilvr_h(sign, r5);
70  r5_l = (v4i32) __msa_ilvl_h(sign, r5);
71  sign = __msa_clti_s_h(r6, 0);
72  r6_r = (v4i32) __msa_ilvr_h(sign, r6);
73  r6_l = (v4i32) __msa_ilvl_h(sign, r6);
74  sign = __msa_clti_s_h(r7, 0);
75  r7_r = (v4i32) __msa_ilvr_h(sign, r7);
76  r7_l = (v4i32) __msa_ilvl_h(sign, r7);
77 
78  /* Right part */
79  A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16);
80  B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16);
81  C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16);
82  D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
83  Ad = ((A - C) * cnst46341w) >> 16;
84  Bd = ((B - D) * cnst46341w) >> 16;
85  Cd = A + C;
86  Dd = B + D;
87  E = ((r0_r + r4_r) * cnst46341w) >> 16;
88  F = ((r0_r - r4_r) * cnst46341w) >> 16;
89  G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16);
90  H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16);
91  Ed = E - G;
92  Gd = E + G;
93  Add = F + Ad;
94  Bdd = Bd - H;
95  Fd = F - Ad;
96  Hd = Bd + H;
97  r0_r = Gd + Cd;
98  r7_r = Gd - Cd;
99  r1_r = Add + Hd;
100  r2_r = Add - Hd;
101  r3_r = Ed + Dd;
102  r4_r = Ed - Dd;
103  r5_r = Fd + Bdd;
104  r6_r = Fd - Bdd;
105 
106  /* Left part */
107  A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
108  B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
109  C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
110  D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16);
111  Ad = ((A - C) * cnst46341w) >> 16;
112  Bd = ((B - D) * cnst46341w) >> 16;
113  Cd = A + C;
114  Dd = B + D;
115  E = ((r0_l + r4_l) * cnst46341w) >> 16;
116  F = ((r0_l - r4_l) * cnst46341w) >> 16;
117  G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
118  H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
119  Ed = E - G;
120  Gd = E + G;
121  Add = F + Ad;
122  Bdd = Bd - H;
123  Fd = F - Ad;
124  Hd = Bd + H;
125  r0_l = Gd + Cd;
126  r7_l = Gd - Cd;
127  r1_l = Add + Hd;
128  r2_l = Add - Hd;
129  r3_l = Ed + Dd;
130  r4_l = Ed - Dd;
131  r5_l = Fd + Bdd;
132  r6_l = Fd - Bdd;
133 
134  /* Row 0 to 3 */
135  TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r,
136  r0_r, r1_r, r2_r, r3_r);
137  TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l,
138  r0_l, r1_l, r2_l, r3_l);
139  A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16);
140  B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16);
141  C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16);
142  D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
143  Ad = ((A - C) * cnst46341w) >> 16;
144  Bd = ((B - D) * cnst46341w) >> 16;
145  Cd = A + C;
146  Dd = B + D;
147  E = ((r0_r + r0_l) * cnst46341w) >> 16;
148  E += cnst8w;
149  F = ((r0_r - r0_l) * cnst46341w) >> 16;
150  F += cnst8w;
151  if (type == 1) { // HACK
152  E += cnst2048w;
153  F += cnst2048w;
154  }
155  G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16);
156  H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16);
157  Ed = E - G;
158  Gd = E + G;
159  Add = F + Ad;
160  Bdd = Bd - H;
161  Fd = F - Ad;
162  Hd = Bd + H;
163  A = (Gd + Cd) >> 4;
164  B = (Gd - Cd) >> 4;
165  C = (Add + Hd) >> 4;
166  D = (Add - Hd) >> 4;
167  E = (Ed + Dd) >> 4;
168  F = (Ed - Dd) >> 4;
169  G = (Fd + Bdd) >> 4;
170  H = (Fd - Bdd) >> 4;
171  if (type != 1) {
172  LD_SB8(dst, stride, d0, d1, d2, d3, d4, d5, d6, d7);
173  ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
174  f0, f1, f2, f3);
175  ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
176  f4, f5, f6, f7);
177  ILVR_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
178  c0, c1, c2, c3);
179  ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
180  c4, c5, c6, c7);
181  A += c0;
182  B += c7;
183  C += c1;
184  D += c2;
185  E += c3;
186  F += c4;
187  G += c5;
188  H += c6;
189  }
190  CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
191  sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
192  sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
193  sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
194  sign_l = __msa_or_v(sign_l, (v16u8)r1_l);
195  sign_l = __msa_or_v(sign_l, (v16u8)r2_l);
196  sign_l = __msa_or_v(sign_l, (v16u8)r3_l);
197  sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
198  Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
199  if (type == 1) {
200  Bdd = Add + cnst128w;
201  CLIP_SW_0_255(Bdd);
202  Ad = Bdd;
203  Bd = Bdd;
204  Cd = Bdd;
205  Dd = Bdd;
206  Ed = Bdd;
207  Fd = Bdd;
208  Gd = Bdd;
209  Hd = Bdd;
210  } else {
211  Ad = Add + c0;
212  Bd = Add + c1;
213  Cd = Add + c2;
214  Dd = Add + c3;
215  Ed = Add + c4;
216  Fd = Add + c5;
217  Gd = Add + c6;
218  Hd = Add + c7;
219  CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
220  }
221  Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
222  Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
223  Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
224  Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
225  Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
226  Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
227  Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
228  Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
229  sign_t = __msa_ceqi_w(sign_t, 0);
230  A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
231  B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
232  C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
233  D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
234  E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
235  F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
236  G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
237  H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
238  r0_r = Ad + A;
239  r1_r = Bd + C;
240  r2_r = Cd + D;
241  r3_r = Dd + E;
242  r0_l = Ed + F;
243  r1_l = Fd + G;
244  r2_l = Gd + H;
245  r3_l = Hd + B;
246 
247  /* Row 4 to 7 */
248  TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
249  r4_r, r5_r, r6_r, r7_r);
250  TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l,
251  r4_l, r5_l, r6_l, r7_l);
252  A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
253  B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
254  C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
255  D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16);
256  Ad = ((A - C) * cnst46341w) >> 16;
257  Bd = ((B - D) * cnst46341w) >> 16;
258  Cd = A + C;
259  Dd = B + D;
260  E = ((r4_r + r4_l) * cnst46341w) >> 16;
261  E += cnst8w;
262  F = ((r4_r - r4_l) * cnst46341w) >> 16;
263  F += cnst8w;
264  if (type == 1) { // HACK
265  E += cnst2048w;
266  F += cnst2048w;
267  }
268  G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
269  H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
270  Ed = E - G;
271  Gd = E + G;
272  Add = F + Ad;
273  Bdd = Bd - H;
274  Fd = F - Ad;
275  Hd = Bd + H;
276  A = (Gd + Cd) >> 4;
277  B = (Gd - Cd) >> 4;
278  C = (Add + Hd) >> 4;
279  D = (Add - Hd) >> 4;
280  E = (Ed + Dd) >> 4;
281  F = (Ed - Dd) >> 4;
282  G = (Fd + Bdd) >> 4;
283  H = (Fd - Bdd) >> 4;
284  if (type != 1) {
285  ILVL_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
286  c0, c1, c2, c3);
287  ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
288  c4, c5, c6, c7);
289  A += c0;
290  B += c7;
291  C += c1;
292  D += c2;
293  E += c3;
294  F += c4;
295  G += c5;
296  H += c6;
297  }
298  CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
299  sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
300  sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
301  sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
302  sign_l = __msa_or_v(sign_l, (v16u8)r5_l);
303  sign_l = __msa_or_v(sign_l, (v16u8)r6_l);
304  sign_l = __msa_or_v(sign_l, (v16u8)r7_l);
305  sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
306  Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
307  if (type == 1) {
308  Bdd = Add + cnst128w;
309  CLIP_SW_0_255(Bdd);
310  Ad = Bdd;
311  Bd = Bdd;
312  Cd = Bdd;
313  Dd = Bdd;
314  Ed = Bdd;
315  Fd = Bdd;
316  Gd = Bdd;
317  Hd = Bdd;
318  } else {
319  Ad = Add + c0;
320  Bd = Add + c1;
321  Cd = Add + c2;
322  Dd = Add + c3;
323  Ed = Add + c4;
324  Fd = Add + c5;
325  Gd = Add + c6;
326  Hd = Add + c7;
327  CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
328  }
329  Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
330  Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
331  Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
332  Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
333  Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
334  Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
335  Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
336  Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
337  sign_t = __msa_ceqi_w(sign_t, 0);
338  A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
339  B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
340  C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
341  D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
342  E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
343  F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
344  G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
345  H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
346  r4_r = Ad + A;
347  r5_r = Bd + C;
348  r6_r = Cd + D;
349  r7_r = Dd + E;
350  r4_l = Ed + F;
351  r5_l = Fd + G;
352  r6_l = Gd + H;
353  r7_l = Hd + B;
354  VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1);
355  VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3);
356  VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5);
357  VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);
358 
359  /* Final sequence of operations over-write original dst */
360  ST_D1(d0, 0, dst);
361  ST_D1(d1, 0, dst + stride);
362  ST_D1(d2, 0, dst + 2 * stride);
363  ST_D1(d3, 0, dst + 3 * stride);
364  ST_D1(d4, 0, dst + 4 * stride);
365  ST_D1(d5, 0, dst + 5 * stride);
366  ST_D1(d6, 0, dst + 6 * stride);
367  ST_D1(d7, 0, dst + 7 * stride);
368 }
369 
/**
 * Inverse-transform one 8x8 coefficient block and store the pixels to dest
 * (idct_msa mode 1), then zero all 64 coefficients of the block.
 */
void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    const size_t coeff_bytes = 64 * sizeof(block[0]);

    idct_msa(dest, line_size, block, 1);
    memset(block, 0, coeff_bytes);
}
375 
/**
 * Inverse-transform one 8x8 coefficient block and add the result to the
 * pixels already in dest (idct_msa mode 2), then zero all 64 coefficients
 * of the block.
 */
void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    const size_t coeff_bytes = 64 * sizeof(block[0]);

    idct_msa(dest, line_size, block, 2);
    memset(block, 0, coeff_bytes);
}
381 
382 void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
383 {
384  int i = (block[0] + 15) >> 5;
385  v4i32 dc = {i, i, i, i};
386  v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
387  v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
388  v4i32 e0, e1, e2, e3, e4, e5, e6, e7;
389  v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
390  v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
391  v16i8 zero = {0};
392 
393  LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
394  ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
395  c0, c1, c2, c3);
396  ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
397  c4, c5, c6, c7);
398  /* Right part */
399  ILVR_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
400  e0, e1, e2, e3);
401  ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
402  e4, e5, e6, e7);
403  e0 += dc;
404  e1 += dc;
405  e2 += dc;
406  e3 += dc;
407  e4 += dc;
408  e5 += dc;
409  e6 += dc;
410  e7 += dc;
411  CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7);
412 
413  /* Left part */
414  ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
415  r0, r1, r2, r3);
416  ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
417  r4, r5, r6, r7);
418  r0 += dc;
419  r1 += dc;
420  r2 += dc;
421  r3 += dc;
422  r4 += dc;
423  r5 += dc;
424  r6 += dc;
425  r7 += dc;
426  CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7);
427  VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
428  VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
429  VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
430  VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);
431 
432  /* Final sequence of operations over-write original dst */
433  ST_D1(d0, 0, dest);
434  ST_D1(d1, 0, dest + line_size);
435  ST_D1(d2, 0, dest + 2 * line_size);
436  ST_D1(d3, 0, dest + 3 * line_size);
437  ST_D1(d4, 0, dest + 4 * line_size);
438  ST_D1(d5, 0, dest + 5 * line_size);
439  ST_D1(d6, 0, dest + 6 * line_size);
440  ST_D1(d7, 0, dest + 7 * line_size);
441 
442  block[0] = 0;
443 }
444 
445 void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
446  int *bounding_values)
447 {
448  int nstride = -stride;
449  v4i32 e0, e1, f0, f1, g0, g1;
450  v16i8 zero = {0};
451  v16i8 d0, d1, d2, d3;
452  v8i16 c0, c1, c2, c3;
453  v8i16 r0;
454  v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
455  cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
456  v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
457  int16_t temp_16[8];
458  int temp_32[8];
459 
460  LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3);
461  ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
462  c0, c1, c2, c3);
463  r0 = (c0 - c3) + (c2 - c1) * cnst3h;
464  r0 += cnst4h;
465  r0 = r0 >> 3;
466  /* Get filter_value from bounding_values one by one */
467  ST_SH(r0, temp_16);
468  for (int i = 0; i < 8; i++)
469  temp_32[i] = bounding_values[temp_16[i]];
470  LD_SW2(temp_32, 4, e0, e1);
471  ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
472  ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
473  f0 += e0;
474  f1 += e1;
475  g0 -= e0;
476  g1 -= e1;
477  CLIP_SW4_0_255(f0, f1, g0, g1);
478  VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
479 
480  /* Final move to first_pixel */
481  ST_D1(d1, 0, first_pixel + nstride);
482  ST_D1(d2, 0, first_pixel);
483 }
484 
485 void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
486  int *bounding_values)
487 {
488  v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
489  v8i16 c0, c1, c2, c3, c4, c5, c6, c7;
490  v8i16 r0;
491  v4i32 e0, e1, f0, f1, g0, g1;
492  v16i8 zero = {0};
493  v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
494  cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
495  v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0};
496  int16_t temp_16[8];
497  int temp_32[8];
498 
499  LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
500  ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
501  c0, c1, c2, c3);
502  ILVR_B4_SH(zero, d4, zero, d5, zero, d6, zero, d7,
503  c4, c5, c6, c7);
504  TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7,
505  c0, c1, c2, c3, c4, c5, c6, c7);
506  r0 = (c0 - c3) + (c2 - c1) * cnst3h;
507  r0 += cnst4h;
508  r0 = r0 >> 3;
509 
510  /* Get filter_value from bounding_values one by one */
511  ST_SH(r0, temp_16);
512  for (int i = 0; i < 8; i++)
513  temp_32[i] = bounding_values[temp_16[i]];
514  LD_SW2(temp_32, 4, e0, e1);
515  ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
516  ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
517  f0 += e0;
518  f1 += e1;
519  g0 -= e0;
520  g1 -= e1;
521  CLIP_SW4_0_255(f0, f1, g0, g1);
522  VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
523  /* Final move to first_pixel */
524  ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
525  ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
526 }
527 
528 void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
529  const uint8_t *src2, ptrdiff_t stride, int h)
530 {
531  if (h == 8) {
532  v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
533  v16i8 c0, c1, c2, c3;
534  v4i32 a0, a1, a2, a3, b0, b1, b2, b3;
535  v4i32 e0, e1, e2;
536  v4i32 f0, f1, f2;
537  v4u32 t0, t1, t2, t3;
538  v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
539  int32_t value = 0xfefefefe;
540  v4i32 fmask = {value, value, value, value};
541 
542  LD_SB8(src1, stride, d0, d1, d2, d3, d4, d5, d6, d7);
543  VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
544  VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
545  a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
546  a2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
547  a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
548  a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);
549 
550  LD_SB8(src2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
551  VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
552  VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
553  b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
554  b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
555  b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
556  b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);
557 
558  e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0);
559  e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask);
560  t0 = ((v4u32)e0) >> 1;
561  e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0);
562  t0 = t0 + (v4u32)e2;
563 
564  e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1);
565  e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask);
566  t1 = ((v4u32)e1) >> 1;
567  e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1);
568  t1 = t1 + (v4u32)e2;
569 
570  f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2);
571  f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask);
572  t2 = ((v4u32)f0) >> 1;
573  f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2);
574  t2 = t2 + (v4u32)f2;
575 
576  f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3);
577  f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask);
578  t3 = ((v4u32)f1) >> 1;
579  f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
580  t3 = t3 + (v4u32)f2;
581 
582  ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
583  ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
584  } else {
585  int i;
586 
587  for (i = 0; i < h; i++) {
588  uint32_t a, b;
589 
590  a = AV_RN32(&src1[i * stride]);
591  b = AV_RN32(&src2[i * stride]);
592  AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b));
593  a = AV_RN32(&src1[i * stride + 4]);
594  b = AV_RN32(&src2[i * stride + 4]);
595  AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b));
596  }
597  }
598 }
VSHF_B2_SB
#define VSHF_B2_SB(...)
Definition: generic_macros_msa.h:662
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
A
#define A(x)
Definition: vpx_arith.h:28
no_rnd_avg32
static uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
Definition: rnd_avg.h:36
ff_put_no_rnd_pixels_l2_msa
void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t stride, int h)
Definition: vp3dsp_idct_msa.c:528
src1
const pixel * src1
Definition: h264pred_template.c:421
ILVR_H4_SW
#define ILVR_H4_SW(...)
Definition: generic_macros_msa.h:1409
vp3dsp_mips.h
t0
#define t0
Definition: regdef.h:28
b
#define b
Definition: input.c:41
ST_D1
#define ST_D1(in, idx, pdst)
Definition: generic_macros_msa.h:485
F
#define F(x)
AV_WN32A
#define AV_WN32A(p, v)
Definition: intreadwrite.h:536
t1
#define t1
Definition: regdef.h:29
c1
static const uint64_t c1
Definition: murmur3.c:52
D
D(D(float, sse)
Definition: rematrix_init.c:30
ILVR_B4_SW
#define ILVR_B4_SW(...)
Definition: generic_macros_msa.h:1363
ff_vp3_idct_dc_add_msa
void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
Definition: vp3dsp_idct_msa.c:382
ILVL_H4_SW
#define ILVL_H4_SW(...)
Definition: generic_macros_msa.h:1302
b1
static double b1(void *priv, double x, double y)
Definition: vf_xfade.c:2035
generic_macros_msa.h
type
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf type
Definition: writing_filters.txt:86
TRANSPOSE8x8_SH_SH
#define TRANSPOSE8x8_SH_SH(...)
Definition: generic_macros_msa.h:2505
a1
#define a1
Definition: regdef.h:47
C
s EdgeDetect Foobar g libavfilter vf_edgedetect c libavfilter vf_foobar c edit libavfilter and add an entry for foobar following the pattern of the other filters edit libavfilter allfilters and add an entry for foobar following the pattern of the other filters configure make j< whatever > ffmpeg ffmpeg i you should get a foobar png with Lena edge detected That s your new playground is ready Some little details about what s going which in turn will define variables for the build system and the C
Definition: writing_filters.txt:58
mask
static const uint16_t mask[17]
Definition: lzw.c:38
b3
static double b3(void *priv, double x, double y)
Definition: vf_xfade.c:2037
intreadwrite.h
B
#define B
Definition: huffyuv.h:42
ILVR_H2_SW
#define ILVR_H2_SW(...)
Definition: generic_macros_msa.h:1393
TRANSPOSE4x4_SW_SW
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2513
E
#define E
Definition: avdct.c:33
AV_RN32
#define AV_RN32(p)
Definition: intreadwrite.h:362
CLIP_SW8_0_255
#define CLIP_SW8_0_255(in0, in1, in2, in3, in4, in5, in6, in7)
Definition: generic_macros_msa.h:984
CLIP_SW4_0_255
#define CLIP_SW4_0_255(in0, in1, in2, in3)
Definition: generic_macros_msa.h:978
idct_msa
static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
Definition: vp3dsp_idct_msa.c:26
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
LD_SW2
#define LD_SW2(...)
Definition: generic_macros_msa.h:281
b2
static double b2(void *priv, double x, double y)
Definition: vf_xfade.c:2036
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
H
#define H
Definition: pixlet.c:39
a0
#define a0
Definition: regdef.h:46
input
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input
Definition: filter_design.txt:172
ff_vp3_h_loop_filter_msa
void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, int *bounding_values)
Definition: vp3dsp_idct_msa.c:485
LD_SH8
#define LD_SH8(...)
Definition: generic_macros_msa.h:338
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
ST_H4
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:417
t3
#define t3
Definition: regdef.h:31
src2
const pixel * src2
Definition: h264pred_template.c:422
a2
#define a2
Definition: regdef.h:48
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
value
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default value
Definition: writing_filters.txt:86
stride
#define stride
Definition: h264pred_template.c:537
rnd_avg.h
ILVL_H2_SW
#define ILVL_H2_SW(...)
Definition: generic_macros_msa.h:1293
c2
static const uint64_t c2
Definition: murmur3.c:53
t2
#define t2
Definition: regdef.h:30
ff_vp3_v_loop_filter_msa
void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, int *bounding_values)
Definition: vp3dsp_idct_msa.c:445
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
G
#define G
Definition: huffyuv.h:43
ILVR_B4_SH
#define ILVR_B4_SH(...)
Definition: generic_macros_msa.h:1362
ff_vp3_idct_put_msa
void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
Definition: vp3dsp_idct_msa.c:370
zero
#define zero
Definition: regdef.h:64
CLIP_SW_0_255
#define CLIP_SW_0_255(in)
Definition: generic_macros_msa.h:966
ST_SH
#define ST_SH(...)
Definition: generic_macros_msa.h:43
int32_t
int32_t
Definition: audioconvert.c:56
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
b0
static double b0(void *priv, double x, double y)
Definition: vf_xfade.c:2034
h
h
Definition: vp9dsp_template.c:2038
a3
#define a3
Definition: regdef.h:49
ff_vp3_idct_add_msa
void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
Definition: vp3dsp_idct_msa.c:376