/*
 * VC1_INV_TRANCS_8_TYPE1 / VC1_INV_TRANCS_8_TYPE2
 *
 * Inline-asm string fragments that each produce two output registers of an
 * 8-point inverse-transform pass. They are meant to be pasted inside an
 * __asm__ statement that provides the named operands used below.
 *
 * Macro arguments:
 *   o1, o2  - asm operands receiving the two packed results.
 *   r1..r4  - 32-bit immediates loaded through %[tmp0] and replicated into
 *             both words of %[ftmp13] / %[ftmp14]; used as pmaddhw
 *             multipliers (presumably two 16-bit coefficients per immediate,
 *             confirm against the Loongson MMI pmaddhw definition).
 *   c0      - operand added to the accumulators before the shift
 *             (the rounding term passed in by the caller).
 *   c1      - TYPE2 only: extra operand added to the difference terms
 *             (%[ftmp14] and %[ftmp3]) but not to the sums.
 *
 * Expected state on entry:
 *   %[ftmp5]..%[ftmp12] hold the halfword-interleaved input rows prepared by
 *   the caller; %[ftmp0] holds the shift count used by psraw.
 *
 * Steps performed:
 *   1. %[ftmp1], %[ftmp2] = pmaddhw products of %[ftmp5]..%[ftmp8] with the
 *      r1/r2 multipliers, pairwise added.
 *   2. %[ftmp3], %[ftmp4] = pmaddhw products of %[ftmp9]..%[ftmp12] with the
 *      r3/r4 multipliers, pairwise added.
 *   3. Sums (%[ftmp1]+%[ftmp3], %[ftmp2]+%[ftmp4]) and differences
 *      (%[ftmp1]-%[ftmp3], %[ftmp2]-%[ftmp4]) are formed. TYPE1 adds c0 to
 *      %[ftmp1]/%[ftmp2] beforehand, so it reaches all four results; TYPE2
 *      adds c0 to each result afterwards and additionally adds c1 to the two
 *      differences.
 *   4. All four results are arithmetically shifted right by %[ftmp0], then
 *      interleaved as halfwords: the sums into o1, the differences into o2.
 *
 * Clobbers: %[tmp0], %[ftmp1]..%[ftmp4], %[ftmp13], %[ftmp14].
 * %[ftmp5]..%[ftmp12] and %[ftmp0] are only read, so the macros can be
 * invoked repeatedly on the same inputs with different multipliers.
 */
33 #define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \ 34 "li %[tmp0], "#r1" \n\t" \ 35 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 36 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 37 "li %[tmp0], "#r2" \n\t" \ 38 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 39 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 40 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \ 41 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \ 42 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 43 "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \ 44 "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \ 45 "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 47 "li %[tmp0], "#r3" \n\t" \ 48 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 49 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 50 "li %[tmp0], "#r4" \n\t" \ 51 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 52 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 53 "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \ 54 "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \ 55 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 56 "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \ 57 "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \ 58 "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \ 60 "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \ 61 "paddw %[ftmp2], %[ftmp2], "#c0" \n\t" \ 62 "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \ 63 "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \ 64 "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \ 65 "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \ 66 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \ 67 "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 68 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \ 69 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 70 "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \ 71 "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \ 72 "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \ 73 "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \ 74 "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \ 75 "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t" 77 #define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1) \ 78 "li %[tmp0], "#r1" 
\n\t" \ 79 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 80 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 81 "li %[tmp0], "#r2" \n\t" \ 82 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 83 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 84 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \ 85 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \ 86 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 87 "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \ 88 "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \ 89 "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 91 "li %[tmp0], "#r3" \n\t" \ 92 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 93 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 94 "li %[tmp0], "#r4" \n\t" \ 95 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 96 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 97 "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \ 98 "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \ 99 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 100 "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \ 101 "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \ 102 "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \ 104 "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \ 105 "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \ 106 "paddw %[ftmp14], %[ftmp14], "#c1" \n\t" \ 107 "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \ 108 "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \ 109 "paddw %[ftmp3], %[ftmp3], "#c1" \n\t" \ 110 "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \ 111 "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \ 112 "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \ 113 "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \ 114 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \ 115 "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 116 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \ 117 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 118 "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \ 119 "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \ 120 "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \ 121 "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \ 122 "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \ 123 "punpcklhw 
"#o2", %[ftmp2], %[ftmp4] \n\t" 133 dc = (3 * dc + 1) >> 1;
134 dc = (3 * dc + 16) >> 5;
137 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 138 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 139 "li %[count], 0x02 \n\t" 142 MMI_LDC1(%[ftmp1], %[dest], 0x00)
143 PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t" 144 MMI_LDC1(%[ftmp2], %[addr0], 0x00)
145 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 146 MMI_LDC1(%[ftmp3], %[addr0], 0x00)
147 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 148 MMI_LDC1(%[ftmp4], %[addr0], 0x00)
150 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" 151 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 152 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" 153 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 154 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" 155 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 156 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" 157 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 159 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 160 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 161 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 162 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 163 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t" 164 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t" 165 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t" 166 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t" 168 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 169 "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" 170 "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" 171 "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" 173 MMI_SDC1(%[ftmp1], %[dest], 0x00)
174 PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t" 175 MMI_SDC1(%[ftmp2], %[addr0], 0x00)
176 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 177 MMI_SDC1(%[ftmp3], %[addr0], 0x00)
178 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 179 MMI_SDC1(%[ftmp4], %[addr0], 0x00)
181 "addiu %[count], %[count], -0x01 \n\t" 182 PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t" 183 "bnez %[count], 1b \n\t" 184 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
185 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
186 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
187 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
188 [ftmp8]
"=&f"(ftmp[8]),
189 [addr0]
"=&r"(addr[0]),
191 : [linesize]
"r"((
mips_reg)linesize),
197 #if _MIPS_SIM != _ABIO32 201 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
202 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
203 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
209 "li %[tmp0], 0x03 \n\t" 210 "mtc1 %[tmp0], %[ftmp0] \n\t" 213 MMI_LDC1(%[ftmp1], %[block], 0x00)
214 MMI_LDC1(%[ftmp11], %[block], 0x10)
215 MMI_LDC1(%[ftmp2], %[block], 0x20)
216 MMI_LDC1(%[ftmp12], %[block], 0x30)
217 MMI_LDC1(%[ftmp3], %[block], 0x40)
218 MMI_LDC1(%[ftmp13], %[block], 0x50)
219 MMI_LDC1(%[ftmp4], %[block], 0x60)
220 MMI_LDC1(%[ftmp14], %[block], 0x70)
221 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 222 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 223 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 224 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 226 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 227 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 228 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t" 229 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t" 233 0x000f0010, 0x00040009, %[
ff_pw_4])
237 0xfffc000f, 0xfff7fff0, %[
ff_pw_4])
241 0xfff00009, 0x000f0004, %[
ff_pw_4])
245 0xfff70004, 0xfff0000f, %[
ff_pw_4])
247 TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
248 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
250 TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
251 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
253 MMI_SDC1(%[ftmp15], %[
temp], 0x00)
254 MMI_SDC1(%[ftmp19], %[
temp], 0x08)
255 MMI_SDC1(%[ftmp16], %[
temp], 0x10)
256 MMI_SDC1(%[ftmp20], %[
temp], 0x18)
257 MMI_SDC1(%[ftmp17], %[
temp], 0x20)
258 MMI_SDC1(%[ftmp21], %[
temp], 0x28)
259 MMI_SDC1(%[ftmp18], %[
temp], 0x30)
260 MMI_SDC1(%[ftmp22], %[
temp], 0x38)
263 MMI_LDC1(%[ftmp1], %[block], 0x08)
264 MMI_LDC1(%[ftmp11], %[block], 0x18)
265 MMI_LDC1(%[ftmp2], %[block], 0x28)
266 MMI_LDC1(%[ftmp12], %[block], 0x38)
267 MMI_LDC1(%[ftmp3], %[block], 0x48)
268 MMI_LDC1(%[ftmp13], %[block], 0x58)
269 MMI_LDC1(%[ftmp4], %[block], 0x68)
270 MMI_LDC1(%[ftmp14], %[block], 0x78)
271 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 272 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 273 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 274 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 276 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 277 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 278 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t" 279 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t" 283 0x000f0010, 0x00040009, %[
ff_pw_4])
287 0xfffc000f, 0xfff7fff0, %[
ff_pw_4])
291 0xfff00009, 0x000f0004, %[
ff_pw_4])
295 0xfff70004, 0xfff0000f, %[
ff_pw_4])
297 TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
298 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
300 TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
301 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
303 MMI_SDC1(%[ftmp19], %[
temp], 0x48)
304 MMI_SDC1(%[ftmp20], %[
temp], 0x58)
305 MMI_SDC1(%[ftmp21], %[
temp], 0x68)
306 MMI_SDC1(%[ftmp22], %[
temp], 0x78)
310 "li %[tmp0], 0x07 \n\t" 311 "mtc1 %[tmp0], %[ftmp0] \n\t" 314 MMI_LDC1(%[ftmp1], %[
temp], 0x00)
315 MMI_LDC1(%[ftmp11], %[
temp], 0x10)
316 MMI_LDC1(%[ftmp2], %[
temp], 0x20)
317 MMI_LDC1(%[ftmp12], %[
temp], 0x30)
318 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 319 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 320 "punpcklhw %[ftmp7], %[ftmp15], %[ftmp17] \n\t" 321 "punpckhhw %[ftmp8], %[ftmp15], %[ftmp17] \n\t" 323 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 324 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 325 "punpcklhw %[ftmp11], %[ftmp16], %[ftmp18] \n\t" 326 "punpckhhw %[ftmp12], %[ftmp16], %[ftmp18] \n\t" 344 MMI_SDC1(%[ftmp15], %[block], 0x00)
345 MMI_SDC1(%[ftmp16], %[block], 0x10)
346 MMI_SDC1(%[ftmp17], %[block], 0x20)
347 MMI_SDC1(%[ftmp18], %[block], 0x30)
348 MMI_SDC1(%[ftmp19], %[block], 0x40)
349 MMI_SDC1(%[ftmp20], %[block], 0x50)
350 MMI_SDC1(%[ftmp21], %[block], 0x60)
351 MMI_SDC1(%[ftmp22], %[block], 0x70)
354 MMI_LDC1(%[ftmp1], %[
temp], 0x08)
355 MMI_LDC1(%[ftmp11], %[
temp], 0x18)
356 MMI_LDC1(%[ftmp2], %[
temp], 0x28)
357 MMI_LDC1(%[ftmp12], %[
temp], 0x38)
358 MMI_LDC1(%[ftmp3], %[
temp], 0x48)
359 MMI_LDC1(%[ftmp13], %[
temp], 0x58)
360 MMI_LDC1(%[ftmp4], %[
temp], 0x68)
361 MMI_LDC1(%[ftmp14], %[
temp], 0x78)
362 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 363 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 364 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 365 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 367 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 368 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 369 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t" 370 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t" 388 MMI_SDC1(%[ftmp15], %[block], 0x08)
389 MMI_SDC1(%[ftmp16], %[block], 0x18)
390 MMI_SDC1(%[ftmp17], %[block], 0x28)
391 MMI_SDC1(%[ftmp18], %[block], 0x38)
392 MMI_SDC1(%[ftmp19], %[block], 0x48)
393 MMI_SDC1(%[ftmp20], %[block], 0x58)
394 MMI_SDC1(%[ftmp21], %[block], 0x68)
395 MMI_SDC1(%[ftmp22], %[block], 0x78)
397 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
398 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
399 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
400 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
401 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
402 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
403 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
404 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
405 [ftmp16]
"=&f"(ftmp[16]), [ftmp17]
"=&f"(ftmp[17]),
406 [ftmp18]
"=&f"(ftmp[18]), [ftmp19]
"=&f"(ftmp[19]),
407 [ftmp20]
"=&f"(ftmp[20]), [ftmp21]
"=&f"(ftmp[21]),
408 [ftmp22]
"=&f"(ftmp[22]),
411 [
ff_pw_4]
"f"(ff_pw_4_local), [block]
"r"(block),
424 dc = ( 3 * dc + 1) >> 1;
425 dc = (17 * dc + 64) >> 7;
428 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 429 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 431 MMI_LDC1(%[ftmp1], %[dest0], 0x00)
432 MMI_LDC1(%[ftmp2], %[dest1], 0x00)
433 MMI_LDC1(%[ftmp3], %[dest2], 0x00)
434 MMI_LDC1(%[ftmp4], %[dest3], 0x00)
436 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" 437 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 438 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" 439 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 440 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" 441 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 442 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" 443 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 445 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 446 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 447 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 448 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 449 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t" 450 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t" 451 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t" 452 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t" 454 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 455 "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" 456 "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" 457 "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" 459 MMI_SDC1(%[ftmp1], %[dest0], 0x00)
460 MMI_SDC1(%[ftmp2], %[dest1], 0x00)
461 MMI_SDC1(%[ftmp3], %[dest2], 0x00)
462 MMI_SDC1(%[ftmp4], %[dest3], 0x00)
463 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
464 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
465 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
466 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
467 [ftmp8]
"=&f"(ftmp[8])
468 : [dest0]
"r"(dest+0*linesize), [dest1]
"r"(dest+1*linesize),
469 [dest2]
"r"(dest+2*linesize), [dest3]
"r"(dest+3*linesize),
475 #if _MIPS_SIM != _ABIO32 479 int16_t *dst =
block;
483 DECLARE_ALIGNED(16,
const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
484 DECLARE_ALIGNED(16,
const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
485 int16_t
coeff[64] = {12, 16, 16, 15, 12, 9, 6, 4,
486 12, 15, 6, -4, -12, -16, -16, -9,
487 12, 9, -6, -16, -12, 4, 16, 15,
488 12, 4, -16, -9, 12, 15, -6, -16,
489 12, -4, -16, 9, 12, -15, -6, 16,
490 12, -9, -6, 16, -12, -4, 16, -15,
491 12, -15, 6, 4, -12, 16, -16, 9,
492 12, -16, 16, -15, 12, -9, 6, -4};
496 "li %[tmp0], 0x03 \n\t" 497 "mtc1 %[tmp0], %[ftmp0] \n\t" 500 MMI_LDC1(%[ftmp1], %[src], 0x00)
501 MMI_LDC1(%[ftmp2], %[src], 0x08)
504 MMI_LDC1(%[ftmp3], %[coeff], 0x00)
505 MMI_LDC1(%[ftmp4], %[coeff], 0x08)
506 MMI_LDC1(%[ftmp5], %[coeff], 0x10)
507 MMI_LDC1(%[ftmp6], %[coeff], 0x18)
508 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 509 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 510 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 511 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 512 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 513 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 514 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 515 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 516 "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t" 517 "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t" 520 MMI_LDC1(%[ftmp3], %[coeff], 0x20)
521 MMI_LDC1(%[ftmp4], %[coeff], 0x28)
522 MMI_LDC1(%[ftmp5], %[coeff], 0x30)
523 MMI_LDC1(%[ftmp6], %[coeff], 0x38)
524 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 525 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 526 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 527 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 528 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 529 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 530 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 531 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 532 "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t" 533 "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t" 536 MMI_LDC1(%[ftmp3], %[coeff], 0x40)
537 MMI_LDC1(%[ftmp4], %[coeff], 0x48)
538 MMI_LDC1(%[ftmp5], %[coeff], 0x50)
539 MMI_LDC1(%[ftmp6], %[coeff], 0x58)
540 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 541 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 542 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 543 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 544 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 545 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 546 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 547 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 548 "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t" 549 "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t" 552 MMI_LDC1(%[ftmp3], %[coeff], 0x60)
553 MMI_LDC1(%[ftmp4], %[coeff], 0x68)
554 MMI_LDC1(%[ftmp5], %[coeff], 0x70)
555 MMI_LDC1(%[ftmp6], %[coeff], 0x78)
556 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 557 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 558 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 559 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 560 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 561 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 562 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 563 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 564 "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t" 565 "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t" 568 "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t" 569 "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t" 570 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" 571 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" 572 "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t" 573 "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t" 574 "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 575 "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t" 576 "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t" 577 "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 578 MMI_SDC1(%[ftmp9], %[dst], 0x00)
579 MMI_SDC1(%[ftmp10], %[dst], 0x08)
583 "addiu %[count], %[count], -0x01 \n\t" 584 "bnez %[count], 1b \n\t" 585 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
586 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
587 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
588 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
589 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
590 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
591 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
592 [ftmp14]
"=&f"(ftmp[14]), [tmp0]
"=&r"(tmp[0]),
602 "li %[tmp0], 0x44 \n\t" 603 "mtc1 %[tmp0], %[ftmp15] \n\t" 606 "li %[tmp0], 0x07 \n\t" 607 "mtc1 %[tmp0], %[ftmp0] \n\t" 608 MMI_LDC1(%[ftmp1], %[src], 0x00)
609 MMI_LDC1(%[ftmp2], %[src], 0x10)
610 MMI_LDC1(%[ftmp3], %[src], 0x20)
611 MMI_LDC1(%[ftmp4], %[src], 0x30)
612 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 613 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 614 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 615 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 618 "li %[tmp0], 0x00160011 \n\t" 619 "mtc1 %[tmp0], %[ftmp3] \n\t" 620 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 621 "li %[tmp0], 0x000a0011 \n\t" 622 "mtc1 %[tmp0], %[ftmp4] \n\t" 623 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 624 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 625 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 626 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 627 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 628 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 629 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 630 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 631 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 632 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 633 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 634 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 635 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 636 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t" 639 "li %[tmp0], 0x000a0011 \n\t" 640 "mtc1 %[tmp0], %[ftmp3] \n\t" 641 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 642 "li %[tmp0], 0xffeaffef \n\t" 643 "mtc1 %[tmp0], %[ftmp4] \n\t" 644 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 645 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 646 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 647 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 648 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 649 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 650 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 651 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 652 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 653 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 654 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 655 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 656 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 657 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t" 660 "li %[tmp0], 0xfff60011 \n\t" 661 "mtc1 %[tmp0], %[ftmp3] \n\t" 662 
"pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 663 "li %[tmp0], 0x0016ffef \n\t" 664 "mtc1 %[tmp0], %[ftmp4] \n\t" 665 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 666 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 667 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 668 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 669 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 670 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 671 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 672 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 673 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 674 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 675 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 676 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 677 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 678 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t" 681 "li %[tmp0], 0xffea0011 \n\t" 682 "mtc1 %[tmp0], %[ftmp3] \n\t" 683 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 684 "li %[tmp0], 0xfff60011 \n\t" 685 "mtc1 %[tmp0], %[ftmp4] \n\t" 686 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 687 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 688 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 689 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 690 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 691 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 692 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 693 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 694 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 695 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 696 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 697 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 698 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 699 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t" 701 MMI_LWC1(%[ftmp1], %[dest], 0x00)
702 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 703 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
704 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 705 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
706 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 707 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
708 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 709 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 710 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 711 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 712 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 713 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 714 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t" 715 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t" 716 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" 717 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 718 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 719 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 720 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 721 MMI_SWC1(%[ftmp1], %[dest], 0x00)
722 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 723 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
724 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 725 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
726 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 727 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
730 "li %[tmp0], 0x07 \n\t" 731 "mtc1 %[tmp0], %[ftmp0] \n\t" 732 MMI_LDC1(%[ftmp1], %[src], 0x08)
733 MMI_LDC1(%[ftmp2], %[src], 0x18)
734 MMI_LDC1(%[ftmp3], %[src], 0x28)
735 MMI_LDC1(%[ftmp4], %[src], 0x38)
736 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 737 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 738 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 739 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 742 "li %[tmp0], 0x00160011 \n\t" 743 "mtc1 %[tmp0], %[ftmp3] \n\t" 744 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 745 "li %[tmp0], 0x000a0011 \n\t" 746 "mtc1 %[tmp0], %[ftmp4] \n\t" 747 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 748 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 749 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 750 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 751 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 752 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 753 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 754 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 755 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 756 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 757 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 758 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 759 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 760 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t" 763 "li %[tmp0], 0x000a0011 \n\t" 764 "mtc1 %[tmp0], %[ftmp3] \n\t" 765 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 766 "li %[tmp0], 0xffeaffef \n\t" 767 "mtc1 %[tmp0], %[ftmp4] \n\t" 768 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 769 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 770 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 771 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 772 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 773 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 774 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 775 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 776 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 777 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 778 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 779 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 780 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 781 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t" 784 "li %[tmp0], 0xfff60011 \n\t" 785 "mtc1 %[tmp0], %[ftmp3] \n\t" 786 
"pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 787 "li %[tmp0], 0x0016ffef \n\t" 788 "mtc1 %[tmp0], %[ftmp4] \n\t" 789 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 790 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 791 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 792 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 793 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 794 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 795 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 796 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 797 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 798 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 799 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 800 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 801 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 802 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t" 805 "li %[tmp0], 0xffea0011 \n\t" 806 "mtc1 %[tmp0], %[ftmp3] \n\t" 807 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 808 "li %[tmp0], 0xfff60011 \n\t" 809 "mtc1 %[tmp0], %[ftmp4] \n\t" 810 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 811 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 812 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 813 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 814 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 815 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 816 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 817 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 818 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 819 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 820 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 821 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 822 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 823 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t" 825 MMI_LWC1(%[ftmp1], %[dest], 0x04)
826 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 827 MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
828 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 829 MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
830 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 831 MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
832 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 833 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 834 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 835 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 836 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 837 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 838 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t" 839 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t" 840 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" 841 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 842 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 843 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 844 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 845 MMI_SWC1(%[ftmp1], %[dest], 0x04)
846 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 847 MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
848 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 849 MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
850 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 851 MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
853 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
854 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
855 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
856 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
857 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
858 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
859 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
860 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
863 [
src]
"r"(
src), [dest]
"r"(dest), [linesize]
"r"(linesize)
876 dc = (17 * dc + 4) >> 3;
877 dc = (12 * dc + 64) >> 7;
880 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 881 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 883 MMI_LWC1(%[ftmp1], %[dest0], 0x00)
884 MMI_LWC1(%[ftmp2], %[dest1], 0x00)
885 MMI_LWC1(%[ftmp3], %[dest2], 0x00)
886 MMI_LWC1(%[ftmp4], %[dest3], 0x00)
887 MMI_LWC1(%[ftmp5], %[dest4], 0x00)
888 MMI_LWC1(%[ftmp6], %[dest5], 0x00)
889 MMI_LWC1(%[ftmp7], %[dest6], 0x00)
890 MMI_LWC1(%[ftmp8], %[dest7], 0x00)
892 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 893 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 894 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 895 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 896 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 897 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 898 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 899 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 901 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 902 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 903 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 904 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 905 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t" 906 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t" 907 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t" 908 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t" 910 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 911 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 912 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 913 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 914 "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 915 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 916 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 917 "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 919 MMI_SWC1(%[ftmp1], %[dest0], 0x00)
920 MMI_SWC1(%[ftmp2], %[dest1], 0x00)
921 MMI_SWC1(%[ftmp3], %[dest2], 0x00)
922 MMI_SWC1(%[ftmp4], %[dest3], 0x00)
923 MMI_SWC1(%[ftmp5], %[dest4], 0x00)
924 MMI_SWC1(%[ftmp6], %[dest5], 0x00)
925 MMI_SWC1(%[ftmp7], %[dest6], 0x00)
926 MMI_SWC1(%[ftmp8], %[dest7], 0x00)
927 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
928 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
929 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
930 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
932 [ftmp8]
"=&f"(ftmp[8])
933 : [dest0]
"r"(dest+0*linesize), [dest1]
"r"(dest+1*linesize),
934 [dest2]
"r"(dest+2*linesize), [dest3]
"r"(dest+3*linesize),
935 [dest4]
"r"(dest+4*linesize), [dest5]
"r"(dest+5*linesize),
936 [dest6]
"r"(dest+6*linesize), [dest7]
"r"(dest+7*linesize),
942 #if _MIPS_SIM != _ABIO32 946 int16_t *dst =
block;
949 int16_t
coeff[16] = {17, 22, 17, 10,
953 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
954 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
955 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
960 "li %[tmp0], 0x03 \n\t" 961 "mtc1 %[tmp0], %[ftmp0] \n\t" 963 MMI_LDC1(%[ftmp2], %[coeff], 0x00)
964 MMI_LDC1(%[ftmp3], %[coeff], 0x08)
965 MMI_LDC1(%[ftmp4], %[coeff], 0x10)
966 MMI_LDC1(%[ftmp5], %[coeff], 0x18)
969 MMI_LDC1(%[ftmp1], %[src], 0x00)
970 "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t" 971 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" 972 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t" 973 "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t" 974 "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t" 975 "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t" 976 "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 977 "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 978 "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t" 979 "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t" 980 "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t" 981 "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t" 982 "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 983 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 984 "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 985 "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 986 "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t" 987 MMI_SDC1(%[ftmp8], %[dst], 0x00)
991 "addiu %[count], %[count], -0x01 \n\t" 992 "bnez %[count], 1b \n\t" 993 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
994 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
995 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
996 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
997 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
998 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
999 [tmp0]
"=&r"(
tmp[0]), [count]
"+&r"(count),
1000 [
src]
"+&r"(
src), [dst]
"+&r"(dst)
1001 : [
ff_pw_4]
"f"(ff_pw_4_local), [coeff]
"r"(coeff)
1009 "li %[tmp0], 0x07 \n\t" 1010 "mtc1 %[tmp0], %[ftmp0] \n\t" 1012 MMI_LDC1(%[ftmp1], %[src], 0x00)
1013 MMI_LDC1(%[ftmp2], %[src], 0x20)
1014 MMI_LDC1(%[ftmp3], %[src], 0x40)
1015 MMI_LDC1(%[ftmp4], %[src], 0x60)
1016 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 1017 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 1018 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 1019 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 1021 MMI_LDC1(%[ftmp1], %[src], 0x10)
1022 MMI_LDC1(%[ftmp2], %[src], 0x30)
1023 MMI_LDC1(%[ftmp3], %[src], 0x50)
1024 MMI_LDC1(%[ftmp4], %[src], 0x70)
1025 "punpcklhw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1026 "punpckhhw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1027 "punpcklhw %[ftmp11], %[ftmp3], %[ftmp4] \n\t" 1028 "punpckhhw %[ftmp12], %[ftmp3], %[ftmp4] \n\t" 1046 MMI_LWC1(%[ftmp1], %[dest], 0x00)
1047 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1048 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1049 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1050 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1051 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1052 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1053 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1054 MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
1055 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1056 MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
1057 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1058 MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
1059 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1060 MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
1061 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1062 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1063 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1064 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1065 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1066 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 1067 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 1068 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 1069 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1071 "paddh %[ftmp1], %[ftmp1], %[ftmp15] \n\t" 1072 "paddh %[ftmp2], %[ftmp2], %[ftmp16] \n\t" 1073 "paddh %[ftmp3], %[ftmp3], %[ftmp17] \n\t" 1074 "paddh %[ftmp4], %[ftmp4], %[ftmp18] \n\t" 1075 "paddh %[ftmp5], %[ftmp5], %[ftmp19] \n\t" 1076 "paddh %[ftmp6], %[ftmp6], %[ftmp20] \n\t" 1077 "paddh %[ftmp7], %[ftmp7], %[ftmp21] \n\t" 1078 "paddh %[ftmp8], %[ftmp8], %[ftmp22] \n\t" 1080 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1081 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1082 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1083 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1084 "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 1085 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 1086 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 1087 "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1089 MMI_SWC1(%[ftmp1], %[dest], 0x00)
1090 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1091 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1092 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1093 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1094 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1095 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1096 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1097 MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
1098 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1099 MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
1100 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1101 MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
1102 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1103 MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
1105 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1106 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1107 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1108 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1109 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1110 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1111 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
1112 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
1113 [ftmp16]
"=&f"(ftmp[16]), [ftmp17]
"=&f"(ftmp[17]),
1114 [ftmp18]
"=&f"(ftmp[18]), [ftmp19]
"=&f"(ftmp[19]),
1115 [ftmp20]
"=&f"(ftmp[20]), [ftmp21]
"=&f"(ftmp[21]),
1116 [ftmp22]
"=&f"(ftmp[22]),
1119 [
src]
"r"(
src), [dest]
"r"(dest), [linesize]
"r"(linesize)
1132 dc = (17 * dc + 4) >> 3;
1133 dc = (17 * dc + 64) >> 7;
1136 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1137 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 1139 MMI_LWC1(%[ftmp1], %[dest0], 0x00)
1140 MMI_LWC1(%[ftmp2], %[dest1], 0x00)
1141 MMI_LWC1(%[ftmp3], %[dest2], 0x00)
1142 MMI_LWC1(%[ftmp4], %[dest3], 0x00)
1144 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1145 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1146 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1147 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1149 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 1150 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 1151 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 1152 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 1154 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1155 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1156 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1157 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1159 MMI_SWC1(%[ftmp1], %[dest0], 0x00)
1160 MMI_SWC1(%[ftmp2], %[dest1], 0x00)
1161 MMI_SWC1(%[ftmp3], %[dest2], 0x00)
1162 MMI_SWC1(%[ftmp4], %[dest3], 0x00)
1163 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1164 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1166 [ftmp4]
"=&f"(ftmp[4])
1167 : [dest0]
"r"(dest+0*linesize), [dest1]
"r"(dest+1*linesize),
1168 [dest2]
"r"(dest+2*linesize), [dest3]
"r"(dest+3*linesize),
1177 int16_t *dst =
block;
1180 int16_t
coeff[16] = {17, 22, 17, 10,
1184 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
1185 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
1189 "li %[tmp0], 0x03 \n\t" 1190 "mtc1 %[tmp0], %[ftmp0] \n\t" 1191 MMI_LDC1(%[ftmp2], %[coeff], 0x00)
1192 MMI_LDC1(%[ftmp3], %[coeff], 0x08)
1193 MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1194 MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1197 MMI_LDC1(%[ftmp1], %[src], 0x00)
1198 "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t" 1199 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" 1200 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t" 1201 "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t" 1202 "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t" 1203 "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t" 1204 "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 1205 "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 1206 "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t" 1207 "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t" 1208 "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t" 1209 "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t" 1210 "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1211 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1212 "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 1213 "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 1214 "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t" 1215 MMI_SDC1(%[ftmp8], %[dst], 0x00)
1219 "addiu %[count], %[count], -0x01 \n\t" 1220 "bnez %[count], 1b \n\t" 1221 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1222 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1223 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1224 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1225 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1226 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1227 [tmp0]
"=&r"(
tmp[0]), [count]
"+&r"(count),
1228 [
src]
"+&r"(
src), [dst]
"+&r"(dst)
1229 : [
ff_pw_4]
"f"(ff_pw_4_local), [coeff]
"r"(coeff)
1237 "li %[tmp0], 0x07 \n\t" 1238 "mtc1 %[tmp0], %[ftmp0] \n\t" 1239 "li %[tmp0], 0x44 \n\t" 1240 "mtc1 %[tmp0], %[ftmp15] \n\t" 1242 MMI_LDC1(%[ftmp1], %[src], 0x00)
1243 MMI_LDC1(%[ftmp2], %[src], 0x10)
1244 MMI_LDC1(%[ftmp3], %[src], 0x20)
1245 MMI_LDC1(%[ftmp4], %[src], 0x30)
1246 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 1247 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 1248 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 1249 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 1252 "li %[tmp0], 0x00160011 \n\t" 1253 "mtc1 %[tmp0], %[ftmp3] \n\t" 1254 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1255 "li %[tmp0], 0x000a0011 \n\t" 1256 "mtc1 %[tmp0], %[ftmp4] \n\t" 1257 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1258 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1259 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1260 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1261 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1262 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1263 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1264 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1265 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1266 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1267 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1268 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1269 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1270 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t" 1273 "li %[tmp0], 0x000a0011 \n\t" 1274 "mtc1 %[tmp0], %[ftmp3] \n\t" 1275 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1276 "li %[tmp0], 0xffeaffef \n\t" 1277 "mtc1 %[tmp0], %[ftmp4] \n\t" 1278 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1279 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1280 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1281 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1282 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1283 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1284 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1285 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1286 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1287 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1288 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1289 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1290 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1291 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t" 1294 "li %[tmp0], 0xfff60011 
\n\t" 1295 "mtc1 %[tmp0], %[ftmp3] \n\t" 1296 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1297 "li %[tmp0], 0x0016ffef \n\t" 1298 "mtc1 %[tmp0], %[ftmp4] \n\t" 1299 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1300 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1301 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1302 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1303 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1304 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1305 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1306 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1307 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1308 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1309 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1310 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1311 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1312 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t" 1315 "li %[tmp0], 0xffea0011 \n\t" 1316 "mtc1 %[tmp0], %[ftmp3] \n\t" 1317 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1318 "li %[tmp0], 0xfff60011 \n\t" 1319 "mtc1 %[tmp0], %[ftmp4] \n\t" 1320 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1321 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1322 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1323 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1324 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1325 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1326 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1327 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1328 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1329 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1330 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1331 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1332 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1333 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t" 1335 MMI_LWC1(%[ftmp1], %[dest], 0x00)
1336 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1337 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1338 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1339 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1340 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1341 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1342 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1343 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1344 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1345 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1346 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1347 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 1348 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t" 1349 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t" 1350 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" 1351 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1352 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1353 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1354 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1356 MMI_SWC1(%[ftmp1], %[dest], 0x00)
1357 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1358 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1359 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1360 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1361 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1362 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1364 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1365 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1366 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1367 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1368 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1369 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1370 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
1371 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
1374 [
src]
"r"(
src), [dest]
"r"(dest), [linesize]
"r"(linesize)
1386 for (i = 0; i < 8; i++) {
1391 d1 = (a - d + 3 +
rnd) >> 3;
1392 d2 = (a - d + b - c + 4 -
rnd) >> 3;
1408 int rnd1 = flags & 2 ? 3 : 4;
1409 int rnd2 = 7 - rnd1;
1410 for (i = 0; i < 8; i++) {
1418 left[6] = ((a << 3) - d1 + rnd1) >> 3;
1419 left[7] = ((b << 3) - d2 + rnd2) >> 3;
1420 right[0] = ((c << 3) + d2 + rnd1) >> 3;
1421 right[1] = ((d << 3) + d1 + rnd2) >> 3;
1423 right += right_stride;
1424 left += left_stride;
1439 for (i = 0; i < 8; i++) {
1444 d1 = (a - d + 3 +
rnd) >> 3;
1445 d2 = (a - d + b - c + 4 -
rnd) >> 3;
1447 src[-2 *
stride] = a - d1;
1461 int rnd1 = 4, rnd2 = 3;
1462 for (i = 0; i < 8; i++) {
1470 top[48] = ((a << 3) - d1 + rnd1) >> 3;
1471 top[56] = ((b << 3) - d2 + rnd2) >> 3;
1472 bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1473 bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1493 5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1494 int a0_sign = a0 >> 31;
1496 a0 = (a0 ^ a0_sign) - a0_sign;
1498 int a1 =
FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1499 5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1500 int a2 =
FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1501 5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1502 if (a1 < a0 || a2 < a0) {
1504 int clip_sign = clip >> 31;
1506 clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1509 int d = 5 * (a3 -
a0);
1510 int d_sign = (d >> 31);
1512 d = ((d ^ d_sign) - d_sign) >> 3;
1515 if (d_sign ^ clip_sign)
1519 d = (d ^ d_sign) - d_sign;
1545 for (i = 0; i <
len; i += 4) {
1607 #define OP_PUT(S, D) 1608 #define OP_AVG(S, D) \ 1609 "ldc1 $f16, "#S" \n\t" \ 1610 "pavgb "#D", "#D", $f16 \n\t" 1613 #define NORMALIZE_MMI(SHIFT) \ 1614 "paddh $f6, $f6, $f14 \n\t" \ 1615 "paddh $f8, $f8, $f14 \n\t" \ 1616 "psrah $f6, $f6, "SHIFT" \n\t" \ 1617 "psrah $f8, $f8, "SHIFT" \n\t" 1619 #define TRANSFER_DO_PACK(OP) \ 1620 "packushb $f6, $f6, $f8 \n\t" \ 1622 "sdc1 $f6, 0x00(%[dst]) \n\t" 1624 #define TRANSFER_DONT_PACK(OP) \ 1625 OP(0(%[dst]), $f6) \ 1626 OP(8(%[dst]), $f8) \ 1627 "sdc1 $f6, 0x00(%[dst]) \n\t" \ 1628 "sdc1 $f8, 0x08(%[dst]) \n\t" 1631 #define DO_UNPACK(reg) \ 1632 "punpcklbh "reg", "reg", $f0 \n\t" 1633 #define DONT_UNPACK(reg) 1636 #define LOAD_ROUNDER_MMI(ROUND) \ 1637 "lwc1 $f14, "ROUND" \n\t" \ 1638 "punpcklhw $f14, $f14, $f14 \n\t" \ 1639 "punpcklwd $f14, $f14, $f14 \n\t" 1642 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \ 1643 "paddh "#R1", "#R1", "#R2" \n\t" \ 1644 PTR_ADDU "$9, %[src], %[stride1] \n\t" \ 1645 MMI_ULWC1(R0, $9, 0x00) \ 1646 "pmullh "#R1", "#R1", $f6 \n\t" \ 1647 "punpcklbh "#R0", "#R0", $f0 \n\t" \ 1648 PTR_ADDU "$9, %[src], %[stride] \n\t" \ 1649 MMI_ULWC1(R3, $9, 0x00) \ 1650 "psubh "#R1", "#R1", "#R0" \n\t" \ 1651 "punpcklbh "#R3", "#R3", $f0 \n\t" \ 1652 "paddh "#R1", "#R1", $f14 \n\t" \ 1653 "psubh "#R1", "#R1", "#R3" \n\t" \ 1654 "psrah "#R1", "#R1", %[shift] \n\t" \ 1655 MMI_SDC1(R1, %[dst], OFF) \ 1656 PTR_ADDU "%[src], %[src], %[stride] \n\t" 1667 "xor $f0, $f0, $f0 \n\t" 1670 "ldc1 $f12, %[ff_pw_9] \n\t" 1672 MMI_ULWC1($f4, %[src], 0x00)
1673 PTR_ADDU "%[src], %[src], %[stride] \n\t" 1674 MMI_ULWC1($f6, %[src], 0x00)
1675 "punpcklbh $f4, $f4, $f0 \n\t" 1676 "punpcklbh $f6, $f6, $f0 \n\t" 1685 PTR_SUBU "%[src], %[src], %[stride2] \n\t" 1687 "addiu $8, $8, -0x01 \n\t" 1689 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1690 [
src]
"+r"(
src), [dst]
"+r"(dst)
1694 :
"$8",
"$9",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10",
"$f12",
1695 "$f14",
"$f16",
"memory" 1703 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \ 1704 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \ 1705 const int16_t *src, int rnd) \ 1708 DECLARE_VAR_ALL64; \ 1709 DECLARE_VAR_ADDRT; \ 1712 rnd -= (-1+9+9-1)*1024; \ 1715 LOAD_ROUNDER_MMI("%[rnd]") \ 1716 "ldc1 $f12, %[ff_pw_128] \n\t" \ 1717 "ldc1 $f10, %[ff_pw_9] \n\t" \ 1719 MMI_ULDC1($f2, %[src], 0x00) \ 1720 MMI_ULDC1($f4, %[src], 0x08) \ 1721 MMI_ULDC1($f6, %[src], 0x02) \ 1722 MMI_ULDC1($f8, %[src], 0x0a) \ 1723 MMI_ULDC1($f0, %[src], 0x06) \ 1724 "paddh $f2, $f2, $f0 \n\t" \ 1725 MMI_ULDC1($f0, %[src], 0x0e) \ 1726 "paddh $f4, $f4, $f0 \n\t" \ 1727 MMI_ULDC1($f0, %[src], 0x04) \ 1728 "paddh $f6, $f6, $f0 \n\t" \ 1729 MMI_ULDC1($f0, %[src], 0x0b) \ 1730 "paddh $f8, $f8, $f0 \n\t" \ 1731 "pmullh $f6, $f6, $f10 \n\t" \ 1732 "pmullh $f8, $f8, $f10 \n\t" \ 1733 "psubh $f6, $f6, $f2 \n\t" \ 1734 "psubh $f8, $f8, $f4 \n\t" \ 1735 "li $8, 0x07 \n\t" \ 1736 "mtc1 $8, $f16 \n\t" \ 1737 NORMALIZE_MMI("$f16") \ 1739 "paddh $f6, $f6, $f12 \n\t" \ 1740 "paddh $f8, $f8, $f12 \n\t" \ 1741 TRANSFER_DO_PACK(OP) \ 1742 "addiu %[h], %[h], -0x01 \n\t" \ 1743 PTR_ADDIU "%[src], %[src], 0x18 \n\t" \ 1744 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1745 "bnez %[h], 1b \n\t" \ 1746 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \ 1748 [src]"+r"(src), [dst]"+r"(dst) \ 1749 : [stride]"r"(stride), [rnd]"m"(rnd), \ 1750 [ff_pw_9]"m"(ff_pw_9), [ff_pw_128]"m"(ff_pw_128) \ 1751 : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \ 1763 #define VC1_SHIFT2(OP, OPNAME)\ 1764 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \ 1765 mips_reg stride, int rnd, \ 1768 DECLARE_VAR_LOW32; \ 1769 DECLARE_VAR_ADDRT; \ 1774 "xor $f0, $f0, $f0 \n\t" \ 1775 "li $10, 0x08 \n\t" \ 1776 LOAD_ROUNDER_MMI("%[rnd]") \ 1777 "ldc1 $f12, %[ff_pw_9] \n\t" \ 1779 MMI_ULWC1($f6, %[src], 0x00) \ 1780 MMI_ULWC1($f8, %[src], 0x04) \ 1781 PTR_ADDU "$9, %[src], %[offset] \n\t" \ 1782 MMI_ULWC1($f2, $9, 
0x00) \ 1783 MMI_ULWC1($f4, $9, 0x04) \ 1784 PTR_ADDU "%[src], %[src], %[offset] \n\t" \ 1785 "punpcklbh $f6, $f6, $f0 \n\t" \ 1786 "punpcklbh $f8, $f8, $f0 \n\t" \ 1787 "punpcklbh $f2, $f2, $f0 \n\t" \ 1788 "punpcklbh $f4, $f4, $f0 \n\t" \ 1789 "paddh $f6, $f6, $f2 \n\t" \ 1790 "paddh $f8, $f8, $f4 \n\t" \ 1791 PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \ 1792 MMI_ULWC1($f2, $9, 0x00) \ 1793 MMI_ULWC1($f4, $9, 0x04) \ 1794 "pmullh $f6, $f6, $f12 \n\t" \ 1795 "pmullh $f8, $f8, $f12 \n\t" \ 1796 "punpcklbh $f2, $f2, $f0 \n\t" \ 1797 "punpcklbh $f4, $f4, $f0 \n\t" \ 1798 "psubh $f6, $f6, $f2 \n\t" \ 1799 "psubh $f8, $f8, $f4 \n\t" \ 1800 PTR_ADDU "$9, %[src], %[offset] \n\t" \ 1801 MMI_ULWC1($f2, $9, 0x00) \ 1802 MMI_ULWC1($f4, $9, 0x04) \ 1803 "punpcklbh $f2, $f2, $f0 \n\t" \ 1804 "punpcklbh $f4, $f4, $f0 \n\t" \ 1805 "psubh $f6, $f6, $f2 \n\t" \ 1806 "psubh $f8, $f8, $f4 \n\t" \ 1807 "li $8, 0x04 \n\t" \ 1808 "mtc1 $8, $f16 \n\t" \ 1809 NORMALIZE_MMI("$f16") \ 1810 "packushb $f6, $f6, $f8 \n\t" \ 1812 "sdc1 $f6, 0x00(%[dst]) \n\t" \ 1813 "addiu $10, $10, -0x01 \n\t" \ 1814 PTR_ADDU "%[src], %[src], %[stride1] \n\t" \ 1815 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1816 "bnez $10, 1b \n\t" \ 1817 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \ 1818 [src]"+r"(src), [dst]"+r"(dst) \ 1819 : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \ 1820 [stride]"r"(stride), [rnd]"m"(rnd), \ 1821 [stride1]"r"(stride-offset), \ 1822 [ff_pw_9]"m"(ff_pw_9) \ 1823 : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \ 1824 "$f12", "$f14", "$f16", "memory" \ 1842 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \ 1843 PTR_ADDU "$9, %[src], "#A1" \n\t" \ 1844 LOAD($f2, $9, M*0) \ 1845 LOAD($f4, $9, M*4) \ 1848 "pmullh $f2, $f2, %[ff_pw_3] \n\t" \ 1849 "pmullh $f4, $f4, %[ff_pw_3] \n\t" \ 1850 PTR_ADDU "$9, %[src], "#A2" \n\t" \ 1851 LOAD($f6, $9, M*0) \ 1852 LOAD($f8, $9, M*4) \ 1855 "pmullh $f6, $f6, $f12 \n\t" \ 1856 "pmullh $f8, $f8, $f12 \n\t" \ 1857 "psubh $f6, 
$f6, $f2 \n\t" \ 1858 "psubh $f8, $f8, $f4 \n\t" \ 1859 PTR_ADDU "$9, %[src], "#A4" \n\t" \ 1860 LOAD($f2, $9, M*0) \ 1861 LOAD($f4, $9, M*4) \ 1864 "li $8, 0x02 \n\t" \ 1865 "mtc1 $8, $f16 \n\t" \ 1866 "psllh $f2, $f2, $f16 \n\t" \ 1867 "psllh $f4, $f4, $f16 \n\t" \ 1868 "psubh $f6, $f6, $f2 \n\t" \ 1869 "psubh $f8, $f8, $f4 \n\t" \ 1870 PTR_ADDU "$9, %[src], "#A3" \n\t" \ 1871 LOAD($f2, $9, M*0) \ 1872 LOAD($f4, $9, M*4) \ 1875 "pmullh $f2, $f2, $f10 \n\t" \ 1876 "pmullh $f4, $f4, $f10 \n\t" \ 1877 "paddh $f6, $f6, $f2 \n\t" \ 1878 "paddh $f8, $f8, $f4 \n\t" 1888 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \ 1890 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \ 1891 mips_reg src_stride, \ 1892 int rnd, int64_t shift) \ 1895 DECLARE_VAR_LOW32; \ 1896 DECLARE_VAR_ADDRT; \ 1898 src -= src_stride; \ 1901 "xor $f0, $f0, $f0 \n\t" \ 1902 LOAD_ROUNDER_MMI("%[rnd]") \ 1903 "ldc1 $f10, %[ff_pw_53] \n\t" \ 1904 "ldc1 $f12, %[ff_pw_18] \n\t" \ 1907 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \ 1908 NORMALIZE_MMI("%[shift]") \ 1909 TRANSFER_DONT_PACK(OP_PUT) \ 1911 PTR_ADDU "$9, %[src], "#A1" \n\t" \ 1912 MMI_ULWC1($f2, $9, 0x08) \ 1914 "mov.d $f6, $f2 \n\t" \ 1915 "paddh $f2, $f2, $f2 \n\t" \ 1916 "paddh $f2, $f2, $f6 \n\t" \ 1917 PTR_ADDU "$9, %[src], "#A2" \n\t" \ 1918 MMI_ULWC1($f6, $9, 0x08) \ 1920 "pmullh $f6, $f6, $f12 \n\t" \ 1921 "psubh $f6, $f6, $f2 \n\t" \ 1922 PTR_ADDU "$9, %[src], "#A3" \n\t" \ 1923 MMI_ULWC1($f2, $9, 0x08) \ 1925 "pmullh $f2, $f2, $f10 \n\t" \ 1926 "paddh $f6, $f6, $f2 \n\t" \ 1927 PTR_ADDU "$9, %[src], "#A4" \n\t" \ 1928 MMI_ULWC1($f2, $9, 0x08) \ 1930 "li $8, 0x02 \n\t" \ 1931 "mtc1 $8, $f16 \n\t" \ 1932 "psllh $f2, $f2, $f16 \n\t" \ 1933 "psubh $f6, $f6, $f2 \n\t" \ 1934 "paddh $f6, $f6, $f14 \n\t" \ 1935 "li $8, 0x06 \n\t" \ 1936 "mtc1 $8, $f16 \n\t" \ 1937 "psrah $f6, $f6, $f16 \n\t" \ 1938 "sdc1 $f6, 0x10(%[dst]) \n\t" \ 1939 "addiu %[h], %[h], -0x01 \n\t" \ 1940 PTR_ADDU "%[src], %[src], 
%[stride_x1] \n\t" \ 1941 PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \ 1942 "bnez %[h], 1b \n\t" \ 1943 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \ 1945 [src]"+r"(src), [dst]"+r"(dst) \ 1946 : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \ 1947 [stride_x3]"r"(3*src_stride), \ 1948 [rnd]"m"(rnd), [shift]"f"(shift), \ 1949 [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \ 1950 [ff_pw_3]"f"(ff_pw_3) \ 1951 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \ 1952 "$f14", "$f16", "memory" \ 1963 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \ 1965 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \ 1966 const int16_t *src, int rnd) \ 1969 DECLARE_VAR_ALL64; \ 1970 DECLARE_VAR_ADDRT; \ 1973 rnd -= (-4+58+13-3)*256; \ 1976 "xor $f0, $f0, $f0 \n\t" \ 1977 LOAD_ROUNDER_MMI("%[rnd]") \ 1978 "ldc1 $f10, %[ff_pw_53] \n\t" \ 1979 "ldc1 $f12, %[ff_pw_18] \n\t" \ 1982 MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \ 1983 "li $8, 0x07 \n\t" \ 1984 "mtc1 $8, $f16 \n\t" \ 1985 NORMALIZE_MMI("$f16") \ 1987 "paddh $f6, $f6, %[ff_pw_128] \n\t" \ 1988 "paddh $f8, $f8, %[ff_pw_128] \n\t" \ 1989 TRANSFER_DO_PACK(OP) \ 1990 "addiu %[h], %[h], -0x01 \n\t" \ 1991 PTR_ADDU "%[src], %[src], 0x18 \n\t" \ 1992 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1993 "bnez %[h], 1b \n\t" \ 1994 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \ 1996 [src]"+r"(src), [dst]"+r"(dst) \ 1997 : [stride]"r"(stride), [rnd]"m"(rnd), \ 1998 [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \ 1999 [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128) \ 2000 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \ 2001 "$f14", "$f16", "memory" \ 2013 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \ 2015 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \ 2016 mips_reg stride, int rnd, mips_reg offset) \ 2019 DECLARE_VAR_LOW32; \ 2020 DECLARE_VAR_ADDRT; \ 2025 __asm__ volatile ( \ 2026 "xor $f0, $f0, $f0 \n\t" \ 2027 
LOAD_ROUNDER_MMI("%[rnd]") \ 2028 "ldc1 $f10, %[ff_pw_53] \n\t" \ 2029 "ldc1 $f12, %[ff_pw_18] \n\t" \ 2032 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \ 2033 "li $8, 0x06 \n\t" \ 2034 "mtc1 $8, $f16 \n\t" \ 2035 NORMALIZE_MMI("$f16") \ 2036 TRANSFER_DO_PACK(OP) \ 2037 "addiu %[h], %[h], -0x01 \n\t" \ 2038 PTR_ADDU "%[src], %[src], %[stride] \n\t" \ 2039 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 2040 "bnez %[h], 1b \n\t" \ 2041 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \ 2043 [src]"+r"(src), [dst]"+r"(dst) \ 2044 : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \ 2045 [offset_x3]"r"(3*offset), [stride]"r"(stride), \ 2047 [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \ 2048 [ff_pw_3]"f"(ff_pw_3) \ 2049 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \ 2050 "$f14", "$f16", "memory" \ 2063 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2064 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2089 #define VC1_MSPEL_MC(OP) \ 2090 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ 2091 int hmode, int vmode, int rnd) \ 2093 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\ 2094 { NULL, vc1_put_ver_16b_shift1_mmi, \ 2095 vc1_put_ver_16b_shift2_mmi, \ 2096 vc1_put_ver_16b_shift3_mmi }; \ 2097 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\ 2098 { NULL, OP ## vc1_hor_16b_shift1_mmi, \ 2099 OP ## vc1_hor_16b_shift2_mmi, \ 2100 OP ## vc1_hor_16b_shift3_mmi }; \ 2101 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \ 2102 { NULL, OP ## vc1_shift1_mmi, \ 2103 OP ## vc1_shift2_mmi, \ 2104 OP ## vc1_shift3_mmi }; \ 2108 static const int shift_value[] = { 0, 5, 1, 5 }; \ 2109 int shift = (shift_value[hmode]+shift_value[vmode])>>1; \ 2111 LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \ 2113 r = (1<<(shift-1)) + rnd-1; \ 2114 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \ 2116 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \ 2120 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \ 2126 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \ 2128 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \ 2129 int stride, int hmode, int vmode, int rnd)\ 2131 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ 2132 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ 2133 dst += 8*stride; src += 8*stride; \ 2134 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ 2135 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ 2142 #define DECLARE_FUNCTION(a, b) \ 2143 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \ 2144 const uint8_t *src, \ 2148 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ 2150 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \ 2151 const uint8_t *src, \ 2155 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ 2157 void ff_put_vc1_mspel_mc ## a 
## b ## _16_mmi(uint8_t *dst, \ 2158 const uint8_t *src, \ 2162 put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ 2164 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \ 2165 const uint8_t *src, \ 2169 avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ 2191 #define CHROMA_MC_8_MMI \ 2192 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ 2193 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 2194 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ 2195 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 2196 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ 2197 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 2198 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \ 2199 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 2201 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \ 2202 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \ 2203 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \ 2204 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \ 2205 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \ 2206 "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \ 2207 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \ 2208 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \ 2210 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 2211 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 2212 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ 2213 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \ 2215 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 2216 "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \ 2217 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ 2218 "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \ 2220 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ 2221 "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ 2222 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 2225 #define CHROMA_MC_4_MMI \ 2226 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 2227 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 2228 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 2229 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 2231 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \ 2232 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \ 2233 "pmullh %[ftmp3], 
%[ftmp3], %[C] \n\t" \ 2234 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \ 2236 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 2237 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 2238 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ 2239 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \ 2241 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \ 2242 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 2247 ptrdiff_t stride,
int h,
int x,
int y)
2249 const int A = (8 - x) * (8 - y);
2250 const int B = (x) * (8 - y);
2251 const int C = (8 - x) * (y);
2252 const int D = (x) * (y);
2258 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2261 "li %[tmp0], 0x06 \n\t" 2262 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2263 "mtc1 %[tmp0], %[ftmp9] \n\t" 2264 "pshufh %[A], %[A], %[ftmp0] \n\t" 2265 "pshufh %[B], %[B], %[ftmp0] \n\t" 2266 "pshufh %[C], %[C], %[ftmp0] \n\t" 2267 "pshufh %[D], %[D], %[ftmp0] \n\t" 2270 MMI_ULDC1(%[ftmp1], %[src], 0x00)
2271 MMI_ULDC1(%[ftmp2], %[src], 0x01)
2272 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2273 MMI_ULDC1(%[ftmp3], %[src], 0x00)
2274 MMI_ULDC1(%[ftmp4], %[src], 0x01)
2278 MMI_SDC1(%[ftmp1], %[dst], 0x00)
2279 "addiu %[h], %[h], -0x01 \n\t" 2280 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2281 "bnez %[h], 1b \n\t" 2282 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2283 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2284 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2285 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2286 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
2289 [tmp0]
"=&r"(tmp[0]),
2290 [src]
"+&r"(src), [dst]
"+&r"(dst),
2293 [
A]
"f"(
A), [B]
"f"(B),
2294 [
C]
"f"(
C), [D]
"f"(D),
2302 ptrdiff_t stride,
int h,
int x,
int y)
2304 const int A = (8 - x) * (8 - y);
2305 const int B = (x) * (8 - y);
2306 const int C = (8 - x) * (y);
2307 const int D = (x) * (y);
2313 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2316 "li %[tmp0], 0x06 \n\t" 2317 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2318 "mtc1 %[tmp0], %[ftmp5] \n\t" 2319 "pshufh %[A], %[A], %[ftmp0] \n\t" 2320 "pshufh %[B], %[B], %[ftmp0] \n\t" 2321 "pshufh %[C], %[C], %[ftmp0] \n\t" 2322 "pshufh %[D], %[D], %[ftmp0] \n\t" 2325 MMI_ULWC1(%[ftmp1], %[src], 0x00)
2326 MMI_ULWC1(%[ftmp2], %[src], 0x01)
2327 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2328 MMI_ULWC1(%[ftmp3], %[src], 0x00)
2329 MMI_ULWC1(%[ftmp4], %[src], 0x01)
2333 MMI_SWC1(%[ftmp1], %[dst], 0x00)
2334 "addiu %[h], %[h], -0x01 \n\t" 2335 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2336 "bnez %[h], 1b \n\t" 2337 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2338 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2339 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2340 [tmp0]
"=&r"(tmp[0]),
2343 [src]
"+&r"(src), [dst]
"+&r"(dst),
2346 [
A]
"f"(
A), [B]
"f"(B),
2347 [
C]
"f"(
C), [D]
"f"(D),
2355 ptrdiff_t stride,
int h,
int x,
int y)
2357 const int A = (8 - x) * (8 - y);
2358 const int B = (x) * (8 - y);
2359 const int C = (8 - x) * (y);
2360 const int D = (x) * (y);
2366 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2369 "li %[tmp0], 0x06 \n\t" 2370 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2371 "mtc1 %[tmp0], %[ftmp9] \n\t" 2372 "pshufh %[A], %[A], %[ftmp0] \n\t" 2373 "pshufh %[B], %[B], %[ftmp0] \n\t" 2374 "pshufh %[C], %[C], %[ftmp0] \n\t" 2375 "pshufh %[D], %[D], %[ftmp0] \n\t" 2378 MMI_ULDC1(%[ftmp1], %[src], 0x00)
2379 MMI_ULDC1(%[ftmp2], %[src], 0x01)
2380 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2381 MMI_ULDC1(%[ftmp3], %[src], 0x00)
2382 MMI_ULDC1(%[ftmp4], %[src], 0x01)
2386 MMI_LDC1(%[ftmp2], %[dst], 0x00)
2387 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" 2389 MMI_SDC1(%[ftmp1], %[dst], 0x00)
2390 "addiu %[h], %[h], -0x01 \n\t" 2391 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2392 "bnez %[h], 1b \n\t" 2393 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2394 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2395 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2396 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2397 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
2398 [tmp0]
"=&r"(tmp[0]),
2401 [src]
"+&r"(src), [dst]
"+&r"(dst),
2404 [
A]
"f"(
A), [B]
"f"(B),
2405 [
C]
"f"(
C), [D]
"f"(D),
2413 ptrdiff_t stride,
int h,
int x,
int y)
2415 const int A = (8 - x) * (8 - y);
2416 const int B = ( x) * (8 - y);
2417 const int C = (8 - x) * ( y);
2418 const int D = ( x) * ( y);
2424 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2427 "li %[tmp0], 0x06 \n\t" 2428 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2429 "mtc1 %[tmp0], %[ftmp5] \n\t" 2430 "pshufh %[A], %[A], %[ftmp0] \n\t" 2431 "pshufh %[B], %[B], %[ftmp0] \n\t" 2432 "pshufh %[C], %[C], %[ftmp0] \n\t" 2433 "pshufh %[D], %[D], %[ftmp0] \n\t" 2436 MMI_ULWC1(%[ftmp1], %[src], 0x00)
2437 MMI_ULWC1(%[ftmp2], %[src], 0x01)
2438 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2439 MMI_ULWC1(%[ftmp3], %[src], 0x00)
2440 MMI_ULWC1(%[ftmp4], %[src], 0x01)
2444 MMI_LWC1(%[ftmp2], %[dst], 0x00)
2445 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" 2447 MMI_SWC1(%[ftmp1], %[dst], 0x00)
2448 "addiu %[h], %[h], -0x01 \n\t" 2449 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2450 "bnez %[h], 1b \n\t" 2451 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2452 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2453 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2454 [tmp0]
"=&r"(tmp[0]),
2457 [src]
"+&r"(src), [dst]
"+&r"(dst),
2460 [
A]
"f"(
A), [B]
"f"(B),
2461 [
C]
"f"(
C), [D]
"f"(D),
void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
static int shift(int a, int b)
void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)
void(* vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd)
void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3,fr_t0, fr_t1, fr_t2, fr_t3)
Brief: transpose 4x4 halfword packed data.
#define DECLARE_FUNCTION(a, b)
Macro to ease bicubic filter interpolation functions declarations.
static void vc1_loop_filter(uint8_t *src, int step, int stride, int len, int pq)
VC-1 in-loop deblocking filter.
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
void(* vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd, int64_t shift)
1/4 shift bicubic interpolation
The exact code depends on how similar the blocks are and how related they are to the block
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
#define av_assert2(cond)
assert() equivalent for checks that are located in speed-critical code.
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
#define LOAD_ROUNDER_MMI(ROUND)
Computes the rounder (32-r or 8-r) and unpacks it to $f14.
#define VC1_SHIFT2(OP, OPNAME)
Purely vertical or horizontal 1/2 shift interpolation.
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
static void vc1_put_ver_16b_shift2_mmi(int16_t *dst, const uint8_t *src, mips_reg stride, int rnd, int64_t shift)
Sacrificing $f12 makes it possible to pipeline loads from src.
void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
static const int shift1[6]
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)
static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
VC-1 in-loop deblocking filter for one line.
simple assert() macros that are a bit more flexible than ISO C assert().
void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
#define FFABS(a)
Absolute value. Note: INT_MIN / INT64_MIN result in undefined behavior, as their absolute values are not representable ...
#define VC1_MSPEL_MC(OP)
Interpolate fractional pel values by applying proper vertical then horizontal filter.
void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)
Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
s EdgeDetect Foobar g libavfilter vf_edgedetect c libavfilter vf_foobar c edit libavfilter and add an entry for foobar following the pattern of the other filters edit libavfilter allfilters and add an entry for foobar following the pattern of the other filters configure make j< whatever > ffmpeg ffmpeg i you should get a foobar png with Lena edge detected That s your new playground is ready Some little details about what s going which in turn will define variables for the build system and the C
void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2]...the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so...,+,-,+,-,+,+,-,+,-,+,...hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32-hcoeff[1]-hcoeff[2]-...a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2}an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||.........intra?||||:Block01:yes no||||:Block02:.................||||:Block03::y DC::ref index:||||:Block04::cb DC::motion x:||||.........:cr DC::motion 
y:||||.................|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------------------------------|||Y subbands||Cb subbands||Cr subbands||||------||------||------|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||------||------||------||||------||------||------|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||------||------||------||||------||------||------|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||------||------||------||||------||------||------|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------------------------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction------------|\Dequantization-------------------\||Reference frames|\IDWT|--------------|Motion\|||Frame 0||Frame 1||Compensation.OBMC v-------|--------------|--------------.\------> Frame n output Frame Frame<----------------------------------/|...|-------------------Range Coder:============Binary Range Coder:-------------------The implemented range coder is an adapted version based upon"Range encoding: an algorithm for removing redundancy from a digitised message."by G.N.N.Martin.The symbols encoded by the Snow range coder are bits(0|1).The associated probabilities are not fix but change depending on the symbol mix seen so far.bit seen|new state---------+-----------------------------------------------0|256-state_transition_table[256-old_state];1|state_transition_table[old_state];state_transition_table={0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 
106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:-------------------------FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector Prediction:=========================1.the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff)*mv_scale Intra DC Prediction block[y][x] dc[1]
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2]...the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so...,+,-,+,-,+,+,-,+,-,+,...hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32-hcoeff[1]-hcoeff[2]-...a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2}an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||.........intra?||||:Block01:yes no||||:Block02:.................||||:Block03::y DC::ref index:||||:Block04::cb DC::motion x:||||.........:cr DC::motion 
y:||||.................|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------------------------------|||Y subbands||Cb subbands||Cr subbands||||------||------||------|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||------||------||------||||------||------||------|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||------||------||------||||------||------||------|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||------||------||------||||------||------||------|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------------------------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction------------|\Dequantization-------------------\||Reference frames|\IDWT|--------------|Motion\|||Frame 0||Frame 1||Compensation.OBMC v-------|--------------|--------------.\------> Frame n output Frame Frame<----------------------------------/|...|-------------------Range Coder:============Binary Range Coder:-------------------The implemented range coder is an adapted version based upon"Range encoding: an algorithm for removing redundancy from a digitised message."by G.N.N.Martin.The symbols encoded by the Snow range coder are bits(0|1).The associated probabilities are not fix but change depending on the symbol mix seen so far.bit seen|new state---------+-----------------------------------------------0|256-state_transition_table[256-old_state];1|state_transition_table[old_state];state_transition_table={0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 
106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:-------------------------FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector Prediction:=========================1.the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
#define SHIFT2_LINE(OFF, R0, R1, R2, R3)
void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
#define flags(name, subs,...)
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
GLint GLenum GLboolean GLsizei stride
void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
static double clip(void *opaque, double val)
Clip value val in the minval - maxval range.
__asm__(".macro parse_r var r\n\t""\\var = -1\n\t"_IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31)".iflt \\var\n\t"".error \"Unable to parse register name \\r\"\n\t"".endif\n\t"".endm")
void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
static const double coeff[2][5]
#define VC1_HOR_16B_SHIFT2(OP, OPNAME)
Data is already unpacked, so some operations can directly be made from memory.
void(* vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd, mips_reg offset)
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)
Macro to build the vertical 16-bit version of vc1_put_shift[13].
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)
Macro to build the horizontal 16-bit version of vc1_put_shift[13].
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step