FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
postprocess.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 /**
24  * @file
25  * postprocessing.
26  */
27 
28 /*
29  C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49 
50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58 
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66  (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73 
74 //Changelog: use git log
75 
76 #include "config.h"
77 #include "libavutil/avutil.h"
78 #include "libavutil/avassert.h"
79 #include <inttypes.h>
80 #include <stdio.h>
81 #include <stdlib.h>
82 #include <string.h>
83 //#undef HAVE_MMXEXT_INLINE
84 //#define HAVE_AMD3DNOW_INLINE
85 //#undef HAVE_MMX_INLINE
86 //#undef ARCH_X86
87 //#define DEBUG_BRIGHTNESS
88 #include "postprocess.h"
89 #include "postprocess_internal.h"
90 #include "libavutil/avstring.h"
91 
92 unsigned postproc_version(void)
93 {
96 }
97 
98 const char *postproc_configuration(void)
99 {
100  return FFMPEG_CONFIGURATION;
101 }
102 
103 const char *postproc_license(void)
104 {
105 #define LICENSE_PREFIX "libpostproc license: "
106  return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
107 }
108 
109 #if HAVE_ALTIVEC_H
110 #include <altivec.h>
111 #endif
112 
113 #define GET_MODE_BUFFER_SIZE 500
114 #define OPTIONS_ARRAY_SIZE 10
115 #define BLOCK_SIZE 8
116 #define TEMP_STRIDE 8
117 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
118 
119 #if ARCH_X86 && HAVE_INLINE_ASM
120 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
121 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
122 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
123 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
124 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
125 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
126 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
127 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
128 #endif
129 
130 DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
131 
132 
133 static const struct PPFilter filters[]=
134 {
135  {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
136  {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
137 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
138  {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
139  {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
140  {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
141  {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
142  {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
143  {"dr", "dering", 1, 5, 6, DERING},
144  {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
145  {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
146  {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
147  {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
148  {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
149  {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
150  {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
151  {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
152  {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
153  {"be", "bitexact", 1, 0, 0, BITEXACT},
154  {"vi", "visualize", 1, 0, 0, VISUALIZE},
155  {NULL, NULL,0,0,0,0} //End Marker
156 };
157 
158 static const char * const replaceTable[]=
159 {
160  "default", "hb:a,vb:a,dr:a",
161  "de", "hb:a,vb:a,dr:a",
162  "fast", "h1:a,v1:a,dr:a",
163  "fa", "h1:a,v1:a,dr:a",
164  "ac", "ha:a:128:7,va:a,dr:a",
165  NULL //End Marker
166 };
167 
168 
169 #if ARCH_X86 && HAVE_INLINE_ASM
/* Issue a "prefetchnta" instruction for the address p. */
static inline void prefetchnta(const void *p)
{
    __asm__ volatile(
        "prefetchnta (%[addr])\n\t"
        : /* no outputs */
        : [addr] "r" (p)
    );
}
176 
/* Issue a "prefetcht0" instruction for the address p. */
static inline void prefetcht0(const void *p)
{
    __asm__ volatile(
        "prefetcht0 (%[addr])\n\t"
        : /* no outputs */
        : [addr] "r" (p)
    );
}
183 
/* Issue a "prefetcht1" instruction for the address p. */
static inline void prefetcht1(const void *p)
{
    __asm__ volatile(
        "prefetcht1 (%[addr])\n\t"
        : /* no outputs */
        : [addr] "r" (p)
    );
}
190 
/* Issue a "prefetcht2" instruction for the address p. */
static inline void prefetcht2(const void *p)
{
    __asm__ volatile(
        "prefetcht2 (%[addr])\n\t"
        : /* no outputs */
        : [addr] "r" (p)
    );
}
197 #endif
198 
199 /* The horizontal functions exist only in C because the MMX
200  * code is faster with vertical filters and transposing. */
201 
202 /**
203  * Check if the given 8x8 Block is mostly "flat"
204  */
205 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
206 {
207  int numEq= 0;
208  int y;
209  const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
210  const int dcThreshold= dcOffset*2 + 1;
211 
212  for(y=0; y<BLOCK_SIZE; y++){
213  numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
214  numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
215  numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
216  numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
217  numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
218  numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
219  numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
220  src+= stride;
221  }
222  return numEq > c->ppMode.flatnessThreshold;
223 }
224 
225 /**
226  * Check if the middle 8x8 Block in the given 8x16 block is flat
227  */
228 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
229 {
230  int numEq= 0;
231  int y;
232  const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
233  const int dcThreshold= dcOffset*2 + 1;
234 
235  src+= stride*4; // src points to begin of the 8x8 Block
236  for(y=0; y<BLOCK_SIZE-1; y++){
237  numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
238  numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
239  numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
240  numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
241  numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
242  numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
243  numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
244  numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
245  src+= stride;
246  }
247  return numEq > c->ppMode.flatnessThreshold;
248 }
249 
/**
 * Sample-based range check on 8 consecutive rows.
 *
 * One pixel pair is probed per row; the column pairs cycle through
 * (0,5), (2,7), (4,1), (6,3). The block is rejected as soon as a probed
 * pair fails (unsigned)(a - b + 2*QP) <= 4*QP.
 *
 * @param src    first pixel of the first row
 * @param stride distance between the starts of two consecutive rows
 * @param QP     quantizer used to scale the allowed difference
 * @return 1 if every probed pair passes, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
{
    static const uint8_t probe[4][2] = {
        { 0, 5 }, { 2, 7 }, { 4, 1 }, { 6, 3 },
    };
    const uint8_t *row = src;
    int line;

    for (line = 0; line < 8; line++, row += stride) {
        const uint8_t *pair = probe[line & 3];

        if ((unsigned)(row[pair[0]] - row[pair[1]] + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
265 
266 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
267 {
268  int x;
269  src+= stride*4;
270  for(x=0; x<BLOCK_SIZE; x+=4){
271  if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
272  if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
273  if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
274  if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
275  }
276  return 1;
277 }
278 
279 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
280 {
281  if( isHorizDC_C(src, stride, c) ){
282  return isHorizMinMaxOk_C(src, stride, c->QP);
283  }else{
284  return 2;
285  }
286 }
287 
288 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
289 {
290  if( isVertDC_C(src, stride, c) ){
291  return isVertMinMaxOk_C(src, stride, c->QP);
292  }else{
293  return 2;
294  }
295 }
296 
297 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
298 {
299  int y;
300  for(y=0; y<BLOCK_SIZE; y++){
301  const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
302 
303  if(FFABS(middleEnergy) < 8*c->QP){
304  const int q=(dst[3] - dst[4])/2;
305  const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
306  const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
307 
308  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
309  d= FFMAX(d, 0);
310 
311  d= (5*d + 32) >> 6;
312  d*= FFSIGN(-middleEnergy);
313 
314  if(q>0)
315  {
316  d = FFMAX(d, 0);
317  d = FFMIN(d, q);
318  }
319  else
320  {
321  d = FFMIN(d, 0);
322  d = FFMAX(d, q);
323  }
324 
325  dst[3]-= d;
326  dst[4]+= d;
327  }
328  dst+= stride;
329  }
330 }
331 
332 /**
333  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
334  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
335  */
336 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
337 {
338  int y;
339  for(y=0; y<BLOCK_SIZE; y++){
340  const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
341  const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
342 
343  int sums[10];
344  sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
345  sums[1] = sums[0] - first + dst[3];
346  sums[2] = sums[1] - first + dst[4];
347  sums[3] = sums[2] - first + dst[5];
348  sums[4] = sums[3] - first + dst[6];
349  sums[5] = sums[4] - dst[0] + dst[7];
350  sums[6] = sums[5] - dst[1] + last;
351  sums[7] = sums[6] - dst[2] + last;
352  sums[8] = sums[7] - dst[3] + last;
353  sums[9] = sums[8] - dst[4] + last;
354 
355  dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
356  dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
357  dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
358  dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
359  dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
360  dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
361  dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
362  dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
363 
364  dst+= stride;
365  }
366 }
367 
368 /**
369  * Experimental Filter 1 (Horizontal)
370  * will not damage linear gradients
371  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
372  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
373  * MMX2 version does correct clipping C version does not
374  * not identical with the vertical one
375  */
376 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
377 {
378  int y;
379  static uint64_t lut[256];
380  if(!lut[255])
381  {
382  int i;
383  for(i=0; i<256; i++)
384  {
385  int v= i < 128 ? 2*i : 2*(i-256);
386 /*
387 //Simulate 112242211 9-Tap filter
388  uint64_t a= (v/16) & 0xFF;
389  uint64_t b= (v/8) & 0xFF;
390  uint64_t c= (v/4) & 0xFF;
391  uint64_t d= (3*v/8) & 0xFF;
392 */
393 //Simulate piecewise linear interpolation
394  uint64_t a= (v/16) & 0xFF;
395  uint64_t b= (v*3/16) & 0xFF;
396  uint64_t c= (v*5/16) & 0xFF;
397  uint64_t d= (7*v/16) & 0xFF;
398  uint64_t A= (0x100 - a)&0xFF;
399  uint64_t B= (0x100 - b)&0xFF;
400  uint64_t C= (0x100 - c)&0xFF;
401  uint64_t D= (0x100 - c)&0xFF;
402 
403  lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
404  (D<<24) | (C<<16) | (B<<8) | (A);
405  //lut[i] = (v<<32) | (v<<24);
406  }
407  }
408 
409  for(y=0; y<BLOCK_SIZE; y++){
410  int a= src[1] - src[2];
411  int b= src[3] - src[4];
412  int c= src[5] - src[6];
413 
414  int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
415 
416  if(d < QP){
417  int v = d * FFSIGN(-b);
418 
419  src[1] +=v/8;
420  src[2] +=v/4;
421  src[3] +=3*v/8;
422  src[4] -=3*v/8;
423  src[5] -=v/4;
424  src[6] -=v/8;
425  }
426  src+=stride;
427  }
428 }
429 
430 /**
431  * accurate deblock filter
432  */
434  int stride, const PPContext *c, int mode)
435 {
436  int y;
437  const int QP= c->QP;
438  const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
439  const int dcThreshold= dcOffset*2 + 1;
440 //START_TIMER
441  src+= step*4; // src points to begin of the 8x8 Block
442  for(y=0; y<8; y++){
443  int numEq= 0;
444 
445  numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
446  numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
447  numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
448  numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
449  numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
450  numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
451  numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
452  numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
453  numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
454  if(numEq > c->ppMode.flatnessThreshold){
455  int min, max, x;
456 
457  if(src[0] > src[step]){
458  max= src[0];
459  min= src[step];
460  }else{
461  max= src[step];
462  min= src[0];
463  }
464  for(x=2; x<8; x+=2){
465  if(src[x*step] > src[(x+1)*step]){
466  if(src[x *step] > max) max= src[ x *step];
467  if(src[(x+1)*step] < min) min= src[(x+1)*step];
468  }else{
469  if(src[(x+1)*step] > max) max= src[(x+1)*step];
470  if(src[ x *step] < min) min= src[ x *step];
471  }
472  }
473  if(max-min < 2*QP){
474  const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
475  const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
476 
477  int sums[10];
478  sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
479  sums[1] = sums[0] - first + src[3*step];
480  sums[2] = sums[1] - first + src[4*step];
481  sums[3] = sums[2] - first + src[5*step];
482  sums[4] = sums[3] - first + src[6*step];
483  sums[5] = sums[4] - src[0*step] + src[7*step];
484  sums[6] = sums[5] - src[1*step] + last;
485  sums[7] = sums[6] - src[2*step] + last;
486  sums[8] = sums[7] - src[3*step] + last;
487  sums[9] = sums[8] - src[4*step] + last;
488 
489  if (mode & VISUALIZE) {
490  src[0*step] =
491  src[1*step] =
492  src[2*step] =
493  src[3*step] =
494  src[4*step] =
495  src[5*step] =
496  src[6*step] =
497  src[7*step] = 128;
498  }
499  src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
500  src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
501  src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
502  src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
503  src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
504  src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
505  src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
506  src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
507  }
508  }else{
509  const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
510 
511  if(FFABS(middleEnergy) < 8*QP){
512  const int q=(src[3*step] - src[4*step])/2;
513  const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
514  const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
515 
516  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
517  d= FFMAX(d, 0);
518 
519  d= (5*d + 32) >> 6;
520  d*= FFSIGN(-middleEnergy);
521 
522  if(q>0){
523  d = FFMAX(d, 0);
524  d = FFMIN(d, q);
525  }else{
526  d = FFMIN(d, 0);
527  d = FFMAX(d, q);
528  }
529 
530  if ((mode & VISUALIZE) && d) {
531  d= (d < 0) ? 32 : -32;
532  src[3*step]= av_clip_uint8(src[3*step] - d);
533  src[4*step]= av_clip_uint8(src[4*step] + d);
534  d = 0;
535  }
536 
537  src[3*step]-= d;
538  src[4*step]+= d;
539  }
540  }
541 
542  src += stride;
543  }
544 /*if(step==16){
545  STOP_TIMER("step16")
546 }else{
547  STOP_TIMER("stepX")
548 }*/
549 }
550 
551 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
552 //Plain C versions
553 //we always compile C for testing which needs bitexactness
554 #define TEMPLATE_PP_C 1
555 #include "postprocess_template.c"
556 
557 #if HAVE_ALTIVEC
558 # define TEMPLATE_PP_ALTIVEC 1
560 # include "postprocess_template.c"
561 #endif
562 
563 #if ARCH_X86 && HAVE_INLINE_ASM
564 # if CONFIG_RUNTIME_CPUDETECT
565 # define TEMPLATE_PP_MMX 1
566 # include "postprocess_template.c"
567 # define TEMPLATE_PP_MMXEXT 1
568 # include "postprocess_template.c"
569 # define TEMPLATE_PP_3DNOW 1
570 # include "postprocess_template.c"
571 # define TEMPLATE_PP_SSE2 1
572 # include "postprocess_template.c"
573 # else
574 # if HAVE_SSE2_INLINE
575 # define TEMPLATE_PP_SSE2 1
576 # include "postprocess_template.c"
577 # elif HAVE_MMXEXT_INLINE
578 # define TEMPLATE_PP_MMXEXT 1
579 # include "postprocess_template.c"
580 # elif HAVE_AMD3DNOW_INLINE
581 # define TEMPLATE_PP_3DNOW 1
582 # include "postprocess_template.c"
583 # elif HAVE_MMX_INLINE
584 # define TEMPLATE_PP_MMX 1
585 # include "postprocess_template.c"
586 # endif
587 # endif
588 #endif
589 
590 typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
591  const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
592 
593 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
594  const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
595 {
596  pp_fn pp = postProcess_C;
597  PPContext *c= (PPContext *)vc;
598  PPMode *ppMode= (PPMode *)vm;
599  c->ppMode= *ppMode; //FIXME
600 
601  if (!(ppMode->lumMode & BITEXACT)) {
602 #if CONFIG_RUNTIME_CPUDETECT
603 #if ARCH_X86 && HAVE_INLINE_ASM
604  // ordered per speed fastest first
605  if (c->cpuCaps & AV_CPU_FLAG_SSE2) pp = postProcess_SSE2;
606  else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT) pp = postProcess_MMX2;
607  else if (c->cpuCaps & AV_CPU_FLAG_3DNOW) pp = postProcess_3DNow;
608  else if (c->cpuCaps & AV_CPU_FLAG_MMX) pp = postProcess_MMX;
609 #elif HAVE_ALTIVEC
610  if (c->cpuCaps & AV_CPU_FLAG_ALTIVEC) pp = postProcess_altivec;
611 #endif
612 #else /* CONFIG_RUNTIME_CPUDETECT */
613 #if HAVE_SSE2_INLINE
614  pp = postProcess_SSE2;
615 #elif HAVE_MMXEXT_INLINE
616  pp = postProcess_MMX2;
617 #elif HAVE_AMD3DNOW_INLINE
618  pp = postProcess_3DNow;
619 #elif HAVE_MMX_INLINE
620  pp = postProcess_MMX;
621 #elif HAVE_ALTIVEC
622  pp = postProcess_altivec;
623 #endif
624 #endif /* !CONFIG_RUNTIME_CPUDETECT */
625  }
626 
627  pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
628 }
629 
630 /* -pp Command line Help
631 */
632 const char pp_help[] =
633 "Available postprocessing filters:\n"
634 "Filters Options\n"
635 "short long name short long option Description\n"
636 "* * a autoq CPU power dependent enabler\n"
637 " c chrom chrominance filtering enabled\n"
638 " y nochrom chrominance filtering disabled\n"
639 " n noluma luma filtering disabled\n"
640 "hb hdeblock (2 threshold) horizontal deblocking filter\n"
641 " 1. difference factor: default=32, higher -> more deblocking\n"
642 " 2. flatness threshold: default=39, lower -> more deblocking\n"
643 " the h & v deblocking filters share these\n"
644 " so you can't set different thresholds for h / v\n"
645 "vb vdeblock (2 threshold) vertical deblocking filter\n"
646 "ha hadeblock (2 threshold) horizontal deblocking filter\n"
647 "va vadeblock (2 threshold) vertical deblocking filter\n"
648 "h1 x1hdeblock experimental h deblock filter 1\n"
649 "v1 x1vdeblock experimental v deblock filter 1\n"
650 "dr dering deringing filter\n"
651 "al autolevels automatic brightness / contrast\n"
652 " f fullyrange stretch luminance to (0..255)\n"
653 "lb linblenddeint linear blend deinterlacer\n"
654 "li linipoldeint linear interpolating deinterlace\n"
655 "ci cubicipoldeint cubic interpolating deinterlacer\n"
656 "md mediandeint median deinterlacer\n"
657 "fd ffmpegdeint ffmpeg deinterlacer\n"
658 "l5 lowpass5 FIR lowpass deinterlacer\n"
659 "de default hb:a,vb:a,dr:a\n"
660 "fa fast h1:a,v1:a,dr:a\n"
661 "ac ha:a:128:7,va:a,dr:a\n"
662 "tn tmpnoise (3 threshold) temporal noise reducer\n"
663 " 1. <= 2. <= 3. larger -> stronger filtering\n"
664 "fq forceQuant <quantizer> force quantizer\n"
665 "Usage:\n"
666 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
667 "long form example:\n"
668 "vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
669 "short form example:\n"
670 "vb:a/hb:a/lb de,-vb\n"
671 "more examples:\n"
672 "tn:64:128:256\n"
673 "\n"
674 ;
675 
676 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
677 {
679  char *p= temp;
680  static const char filterDelimiters[] = ",/";
681  static const char optionDelimiters[] = ":|";
682  struct PPMode *ppMode;
683  char *filterToken;
684 
685  if (!name) {
686  av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
687  return NULL;
688  }
689 
690  if (!strcmp(name, "help")) {
691  const char *p;
692  for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
693  av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
694  av_log(NULL, AV_LOG_INFO, "%s", temp);
695  }
696  return NULL;
697  }
698 
699  ppMode= av_malloc(sizeof(PPMode));
700 
701  ppMode->lumMode= 0;
702  ppMode->chromMode= 0;
703  ppMode->maxTmpNoise[0]= 700;
704  ppMode->maxTmpNoise[1]= 1500;
705  ppMode->maxTmpNoise[2]= 3000;
706  ppMode->maxAllowedY= 234;
707  ppMode->minAllowedY= 16;
708  ppMode->baseDcDiff= 256/8;
709  ppMode->flatnessThreshold= 56-16-1;
710  ppMode->maxClippedThreshold= 0.01;
711  ppMode->error=0;
712 
713  memset(temp, 0, GET_MODE_BUFFER_SIZE);
714  av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
715 
716  av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
717 
718  for(;;){
719  const char *filterName;
720  int q= 1000000; //PP_QUALITY_MAX;
721  int chrom=-1;
722  int luma=-1;
723  const char *option;
724  const char *options[OPTIONS_ARRAY_SIZE];
725  int i;
726  int filterNameOk=0;
727  int numOfUnknownOptions=0;
728  int enable=1; //does the user want us to enabled or disabled the filter
729  char *tokstate;
730 
731  filterToken= av_strtok(p, filterDelimiters, &tokstate);
732  if(!filterToken) break;
733  p+= strlen(filterToken) + 1; // p points to next filterToken
734  filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
735  if (!filterName) {
736  ppMode->error++;
737  break;
738  }
739  av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
740 
741  if(*filterName == '-'){
742  enable=0;
743  filterName++;
744  }
745 
746  for(;;){ //for all options
747  option= av_strtok(NULL, optionDelimiters, &tokstate);
748  if(!option) break;
749 
750  av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
751  if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
752  else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
753  else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
754  else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
755  else{
756  options[numOfUnknownOptions] = option;
757  numOfUnknownOptions++;
758  }
759  if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
760  }
761  options[numOfUnknownOptions] = NULL;
762 
763  /* replace stuff from the replace Table */
764  for(i=0; replaceTable[2*i]; i++){
765  if(!strcmp(replaceTable[2*i], filterName)){
766  int newlen= strlen(replaceTable[2*i + 1]);
767  int plen;
768  int spaceLeft;
769 
770  p--, *p=',';
771 
772  plen= strlen(p);
773  spaceLeft= p - temp + plen;
774  if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){
775  ppMode->error++;
776  break;
777  }
778  memmove(p + newlen, p, plen+1);
779  memcpy(p, replaceTable[2*i + 1], newlen);
780  filterNameOk=1;
781  }
782  }
783 
784  for(i=0; filters[i].shortName; i++){
785  if( !strcmp(filters[i].longName, filterName)
786  || !strcmp(filters[i].shortName, filterName)){
787  ppMode->lumMode &= ~filters[i].mask;
788  ppMode->chromMode &= ~filters[i].mask;
789 
790  filterNameOk=1;
791  if(!enable) break; // user wants to disable it
792 
793  if(q >= filters[i].minLumQuality && luma)
794  ppMode->lumMode|= filters[i].mask;
795  if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
796  if(q >= filters[i].minChromQuality)
797  ppMode->chromMode|= filters[i].mask;
798 
799  if(filters[i].mask == LEVEL_FIX){
800  int o;
801  ppMode->minAllowedY= 16;
802  ppMode->maxAllowedY= 234;
803  for(o=0; options[o]; o++){
804  if( !strcmp(options[o],"fullyrange")
805  ||!strcmp(options[o],"f")){
806  ppMode->minAllowedY= 0;
807  ppMode->maxAllowedY= 255;
808  numOfUnknownOptions--;
809  }
810  }
811  }
812  else if(filters[i].mask == TEMP_NOISE_FILTER)
813  {
814  int o;
815  int numOfNoises=0;
816 
817  for(o=0; options[o]; o++){
818  char *tail;
819  ppMode->maxTmpNoise[numOfNoises]=
820  strtol(options[o], &tail, 0);
821  if(tail!=options[o]){
822  numOfNoises++;
823  numOfUnknownOptions--;
824  if(numOfNoises >= 3) break;
825  }
826  }
827  }
828  else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
829  || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
830  int o;
831 
832  for(o=0; options[o] && o<2; o++){
833  char *tail;
834  int val= strtol(options[o], &tail, 0);
835  if(tail==options[o]) break;
836 
837  numOfUnknownOptions--;
838  if(o==0) ppMode->baseDcDiff= val;
839  else ppMode->flatnessThreshold= val;
840  }
841  }
842  else if(filters[i].mask == FORCE_QUANT){
843  int o;
844  ppMode->forcedQuant= 15;
845 
846  for(o=0; options[o] && o<1; o++){
847  char *tail;
848  int val= strtol(options[o], &tail, 0);
849  if(tail==options[o]) break;
850 
851  numOfUnknownOptions--;
852  ppMode->forcedQuant= val;
853  }
854  }
855  }
856  }
857  if(!filterNameOk) ppMode->error++;
858  ppMode->error += numOfUnknownOptions;
859  }
860 
861  av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
862  if(ppMode->error){
863  av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
864  av_free(ppMode);
865  return NULL;
866  }
867  return ppMode;
868 }
869 
871  av_free(mode);
872 }
873 
/**
 * Replace the buffer behind *p with a freshly zeroed one of the given size.
 *
 * The old buffer is released with av_free() and its contents are NOT
 * preserved; *p receives whatever av_mallocz() returns.
 */
static void reallocAlign(void **p, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p = fresh;
}
878 
879 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
880  int mbWidth = (width+15)>>4;
881  int mbHeight= (height+15)>>4;
882  int i;
883 
884  c->stride= stride;
885  c->qpStride= qpStride;
886 
887  reallocAlign((void **)&c->tempDst, stride*24+32);
888  reallocAlign((void **)&c->tempSrc, stride*24);
889  reallocAlign((void **)&c->tempBlocks, 2*16*8);
890  reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
891  for(i=0; i<256; i++)
892  c->yHistogram[i]= width*height/64*15/256;
893 
894  for(i=0; i<3; i++){
895  //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
896  reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
897  reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
898  }
899 
900  reallocAlign((void **)&c->deintTemp, 2*width+32);
901  reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
902  reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
903  reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(QP_STORE_T));
904 }
905 
/**
 * Name callback stored in av_codec_context_class.
 *
 * @param ptr unused
 * @return the constant string "postproc"
 */
static const char *context_to_name(void *ptr)
{
    static const char name[] = "postproc";

    (void)ptr;
    return name;
}
909 
910 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
911 
912 pp_context *pp_get_context(int width, int height, int cpuCaps){
913  PPContext *c= av_malloc(sizeof(PPContext));
914  int stride= FFALIGN(width, 16); //assumed / will realloc if needed
915  int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
916 
917  memset(c, 0, sizeof(PPContext));
919  if(cpuCaps&PP_FORMAT){
920  c->hChromaSubSample= cpuCaps&0x3;
921  c->vChromaSubSample= (cpuCaps>>4)&0x3;
922  }else{
923  c->hChromaSubSample= 1;
924  c->vChromaSubSample= 1;
925  }
926  if (cpuCaps & PP_CPU_CAPS_AUTO) {
927  c->cpuCaps = av_get_cpu_flags();
928  } else {
929  c->cpuCaps = 0;
930  if (cpuCaps & PP_CPU_CAPS_MMX) c->cpuCaps |= AV_CPU_FLAG_MMX;
931  if (cpuCaps & PP_CPU_CAPS_MMX2) c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
932  if (cpuCaps & PP_CPU_CAPS_3DNOW) c->cpuCaps |= AV_CPU_FLAG_3DNOW;
933  if (cpuCaps & PP_CPU_CAPS_ALTIVEC) c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
934  }
935 
936  reallocBuffers(c, width, height, stride, qpStride);
937 
938  c->frameNum=-1;
939 
940  return c;
941 }
942 
943 void pp_free_context(void *vc){
944  PPContext *c = (PPContext*)vc;
945  int i;
946 
947  for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
948  av_free(c->tempBlurred[i]);
949  for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
950  av_free(c->tempBlurredPast[i]);
951 
952  av_free(c->tempBlocks);
953  av_free(c->yHistogram);
954  av_free(c->tempDst);
955  av_free(c->tempSrc);
956  av_free(c->deintTemp);
957  av_free(c->stdQPTable);
958  av_free(c->nonBQPTable);
960 
961  memset(c, 0, sizeof(PPContext));
962 
963  av_free(c);
964 }
965 
966 void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
967  uint8_t * dst[3], const int dstStride[3],
968  int width, int height,
969  const QP_STORE_T *QP_store, int QPStride,
970  pp_mode *vm, void *vc, int pict_type)
971 {
972  int mbWidth = (width+15)>>4;
973  int mbHeight= (height+15)>>4;
974  PPMode *mode = vm;
975  PPContext *c = vc;
976  int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
977  int absQPStride = FFABS(QPStride);
978 
979  // c->stride and c->QPStride are always positive
980  if(c->stride < minStride || c->qpStride < absQPStride)
981  reallocBuffers(c, width, height,
982  FFMAX(minStride, c->stride),
983  FFMAX(c->qpStride, absQPStride));
984 
985  if(!QP_store || (mode->lumMode & FORCE_QUANT)){
986  int i;
987  QP_store= c->forcedQPTable;
988  absQPStride = QPStride = 0;
989  if(mode->lumMode & FORCE_QUANT)
990  for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
991  else
992  for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
993  }
994 
995  if(pict_type & PP_PICT_TYPE_QP2){
996  int i;
997  const int count= FFMAX(mbHeight * absQPStride, mbWidth);
998  for(i=0; i<(count>>2); i++){
999  ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1000  }
1001  for(i<<=2; i<count; i++){
1002  c->stdQPTable[i] = QP_store[i]>>1;
1003  }
1004  QP_store= c->stdQPTable;
1005  QPStride= absQPStride;
1006  }
1007 
1008  if(0){
1009  int x,y;
1010  for(y=0; y<mbHeight; y++){
1011  for(x=0; x<mbWidth; x++){
1012  av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1013  }
1014  av_log(c, AV_LOG_INFO, "\n");
1015  }
1016  av_log(c, AV_LOG_INFO, "\n");
1017  }
1018 
1019  if((pict_type&7)!=3){
1020  if (QPStride >= 0){
1021  int i;
1022  const int count= FFMAX(mbHeight * QPStride, mbWidth);
1023  for(i=0; i<(count>>2); i++){
1024  ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1025  }
1026  for(i<<=2; i<count; i++){
1027  c->nonBQPTable[i] = QP_store[i] & 0x3F;
1028  }
1029  } else {
1030  int i,j;
1031  for(i=0; i<mbHeight; i++) {
1032  for(j=0; j<absQPStride; j++) {
1033  c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1034  }
1035  }
1036  }
1037  }
1038 
1039  av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1040  mode->lumMode, mode->chromMode);
1041 
1042  postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1043  width, height, QP_store, QPStride, 0, mode, c);
1044 
1045  if (!(src[1] && src[2] && dst[1] && dst[2]))
1046  return;
1047 
1048  width = (width )>>c->hChromaSubSample;
1049  height = (height)>>c->vChromaSubSample;
1050 
1051  if(mode->chromMode){
1052  postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1053  width, height, QP_store, QPStride, 1, mode, c);
1054  postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1055  width, height, QP_store, QPStride, 2, mode, c);
1056  }
1057  else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1058  linecpy(dst[1], src[1], height, srcStride[1]);
1059  linecpy(dst[2], src[2], height, srcStride[2]);
1060  }else{
1061  int y;
1062  for(y=0; y<height; y++){
1063  memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1064  memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1065  }
1066  }
1067 }