/* X8(x): expands to its argument written eight times, separated by commas.
 * Presumably used to fill 8-element constant initializers with one value;
 * no use of it is visible in this excerpt. */
54 #define X8(x) x,x,x,x,x,x,x,x
/* NOTE(review): the declaration that opens this initializer is not visible in
 * this excerpt. 32 hex constants; presumably the first row-pass coefficient
 * table (iTab1, which the asm body hands to iMTX_MULT for rows 0 and 4) --
 * TODO confirm. iMTX_MULT reads four 16-byte vectors from its table operand,
 * which matches 32 entries if each entry is 16 bits wide. */
66 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
67 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
68 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
69 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
/* NOTE(review): declaration not visible in this excerpt. 32 hex constants;
 * presumably the second row-pass coefficient table (iTab2, which the asm body
 * hands to iMTX_MULT for rows 1 and 7) -- TODO confirm. */
73 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
74 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
75 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
76 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
/* NOTE(review): declaration not visible in this excerpt. 32 hex constants;
 * presumably the third row-pass coefficient table (iTab3, which the asm body
 * hands to iMTX_MULT for rows 2 and 6) -- TODO confirm. */
80 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
81 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
82 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
83 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
/* NOTE(review): declaration not visible in this excerpt. 32 hex constants;
 * presumably the fourth row-pass coefficient table (iTab4, which the asm body
 * hands to iMTX_MULT for rows 3 and 5) -- TODO confirm. */
87 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
88 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
89 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
90 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
/* NOTE(review): declaration not visible in this excerpt. Four rows of four
 * identical values; presumably the start of walkenIdctRounders, which ROUND()
 * feeds to paddd (a 4 x 32-bit add) at byte offsets k*16 -- one row here per
 * 16-byte step if the entries are 32 bits wide. The asm body also uses
 * offsets 4*16 and 5*16; the rows for those are not visible here. */
94 65536, 65536, 65536, 65536,
95 3597, 3597, 3597, 3597,
96 2260, 2260, 2260, 2260,
97 1203, 1203, 1203, 1203,
/* XMM registers that carry the odd-numbered rows (1, 3, 5, 7) from the row
 * pass to the column pass. They are written through PUT_ODD()/CLEAR_ODD() and
 * read by their literal names (xmm6, xmm4, xmm5, xmm7) inside iLLM_PASS and
 * iLLM_PASS_SPARSE, so the mapping must stay in sync with those macros. */
103 #define ROW1 "%%xmm6"
104 #define ROW3 "%%xmm4"
105 #define ROW5 "%%xmm5"
106 #define ROW7 "%%xmm7"
/* CLEAR_ODD(r): emit "pxor r,r", i.e. set register r to all zeroes. */
108 #define CLEAR_ODD(r) "pxor "r","r" \n\t"
/* PUT_ODD(dst): write xmm2 to dst with its four high 16-bit words in reversed
 * order (pshufhw with selector 0x1B); the four low words are copied as-is.
 * xmm2 is the register in which iMTX_MULT leaves its packed result. */
109 #define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
/* Register assignment that keeps the even rows (0, 2, 4, 6) in xmm8..xmm11.
 * NOTE(review): xmm8 and above exist only in 64-bit mode, so this is
 * presumably the x86-64 branch of a conditional whose #if line is not visible
 * here. REG0/REG2/REG4/REG6/SREG2, which the column-pass macros use, are not
 * defined in the visible lines of this branch either.
 *
 *  - CLEAR_EVEN/PUT_EVEN: even rows are registers here, so the odd-row macros
 *    are reused unchanged.
 *  - XMMS: scratch register for the column-pass butterflies.
 *  - MOV_32_ONLY: "#", so every line built from it starts with '#' and is
 *    treated as a comment by the assembler (no instruction is emitted).
 *  - TAN3/TAN1: registers that hold the tan3/tan1 constants. */
113 # define ROW0 "%%xmm8"
115 # define ROW2 "%%xmm9"
117 # define ROW4 "%%xmm10"
119 # define ROW6 "%%xmm11"
121 # define CLEAR_EVEN(r) CLEAR_ODD(r)
122 # define PUT_EVEN(dst) PUT_ODD(dst)
123 # define XMMS "%%xmm12"
124 # define MOV_32_ONLY "#"
126 # define TAN3 "%%xmm13"
127 # define TAN1 "%%xmm14"
/* Register assignment that uses only xmm0..xmm7 and keeps the even rows in
 * memory (ROW2/ROW4/ROW6 are N*16(%0) operands).
 * NOTE(review): presumably the 32-bit branch of a conditional whose
 * #else/#endif lines are not visible here; the ROW0 definition for this
 * branch is not visible either.
 *
 *  - REGn: register an even row is loaded into for the column pass. REG0 and
 *    REG2 share xmm4; REG4 and REG6 share xmm6.
 *  - CLEAR_EVEN: expands to nothing (there is no register to zero).
 *  - PUT_EVEN: reverses the four high words of xmm2 in place, then stores
 *    xmm2 to the memory operand dst.
 *  - MOV_32_ONLY: "movdqa ", so lines built from it are real loads/stores.
 *  - SREG2: register used for row 2 by iLLM_PASS_SPARSE.
 *  - XMMS and TAN1 are both xmm2; the column-pass macros store TAN1 to (%0)
 *    and reload it around the section that uses XMMS as scratch. */
132 # define REG0 "%%xmm4"
133 # define ROW2 "2*16(%0)"
134 # define REG2 "%%xmm4"
135 # define ROW4 "4*16(%0)"
136 # define REG4 "%%xmm6"
137 # define ROW6 "6*16(%0)"
138 # define REG6 "%%xmm6"
139 # define CLEAR_EVEN(r)
140 # define PUT_EVEN(dst) \
141 "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
142 "movdqa %%xmm2, "dst" \n\t"
143 # define XMMS "%%xmm2"
144 # define MOV_32_ONLY "movdqa "
145 # define SREG2 "%%xmm7"
146 # define TAN3 "%%xmm0"
147 # define TAN1 "%%xmm2"
/* ROUND(x): builds the first half of a "paddd <x>" instruction with the
 * (mangled) symbol x as source. It is passed as the `rounder` argument of
 * iMTX_MULT, which appends ", %%xmm0" to complete the instruction. */
151 #define ROUND(x) "paddd "MANGLE(x)
/* JZ(reg, to): tests 32-bit register reg against itself, which sets ZF when
 * reg is zero.
 * NOTE(review): the visible body ends in a line continuation and never uses
 * `to`; the conditional-jump line that presumably follows is missing from
 * this excerpt. */
153 #define JZ(reg, to) \
154 "testl "reg","reg" \n\t" \
/* JNZ(reg, to): tests 32-bit register reg against itself, which clears ZF
 * when reg is non-zero.
 * NOTE(review): the visible body ends in a line continuation and never uses
 * `to`; the conditional-jump line that presumably follows is missing from
 * this excerpt. */
157 #define JNZ(reg, to) \
158 "testl "reg","reg" \n\t" \
/* TEST_ONE_ROW(src, reg, clear): summarize one 16-byte row at memory operand
 * src into the 32-bit register reg.
 *   mm1 = first 8 bytes of src OR second 8 bytes of src
 *   mm1 = saturating unsigned byte add of mm0 and mm1
 *   reg = the eight byte sign bits of mm1 (pmovmskb)
 * mm0 must be preloaded by the caller (the asm body loads m127 into it).
 * Assuming m127 is 127 in every byte -- its definition is not visible here --
 * reg ends up non-zero exactly when the row contains a non-zero byte.
 * Clobbers mm1.
 * NOTE(review): `clear` is not referenced in the visible lines; the line that
 * presumably emits it is missing from this excerpt. */
161 #define TEST_ONE_ROW(src, reg, clear) \
163 "movq "src", %%mm1 \n\t" \
164 "por 8+"src", %%mm1 \n\t" \
165 "paddusb %%mm0, %%mm1 \n\t" \
166 "pmovmskb %%mm1, "reg" \n\t"
/* TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2): the same summary as
 * TEST_ONE_ROW, performed on two 16-byte rows with the instructions
 * interleaved. row1 is reduced through mm1 into reg1, row2 through mm2 into
 * reg2; each result is the byte sign-bit mask after OR-ing the two 8-byte
 * halves of the row and adding mm0 with unsigned saturation.
 * mm0 must be preloaded by the caller. Clobbers mm1 and mm2.
 * NOTE(review): `clear1`/`clear2` are not referenced in the visible lines; the
 * lines that presumably emit them are missing from this excerpt. */
168 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
171 "movq "row1", %%mm1 \n\t" \
172 "por 8+"row1", %%mm1 \n\t" \
173 "movq "row2", %%mm2 \n\t" \
174 "por 8+"row2", %%mm2 \n\t" \
175 "paddusb %%mm0, %%mm1 \n\t" \
176 "paddusb %%mm0, %%mm2 \n\t" \
177 "pmovmskb %%mm1, "reg1" \n\t" \
178 "pmovmskb %%mm2, "reg2" \n\t"
/* iMTX_MULT(src, table, rounder, put): row pass for one 16-byte row.
 *
 *   src     memory operand of the row (loaded with movdqa, so it must be
 *           16-byte aligned)
 *   table   symbol of a 64-byte coefficient table, read as four 16-byte
 *           vectors at table, 16+table, 32+table and 48+table
 *   rounder text placed in front of ", %%xmm0": either ROUND(sym), giving
 *           "paddd sym, %%xmm0", or "#", which turns the line into an
 *           assembler comment so that nothing is added
 *   put     NOTE(review): not referenced in the visible lines; the body ends
 *           in a line continuation, so the line that presumably emits `put`
 *           (PUT_ODD/PUT_EVEN at the call sites) is missing from this excerpt
 *
 * Steps, in order:
 *   1. Load the row and build four dword arrangements of it: low quadword
 *      duplicated (xmm0), dwords 1,0,1,0 (xmm1, pshufd 0x11), dwords 3,2,3,2
 *      (xmm2, pshufd 0xBB) and high quadword duplicated (xmm3).
 *   2. pmaddwd each arrangement with one table vector; xmm0 += xmm1 and
 *      xmm2 += xmm3 give two partial sums of four dwords each.
 *   3. Apply `rounder` to xmm0.
 *   4. Butterfly: xmm2 = xmm0 + xmm2, xmm0 = xmm0 - (old xmm2).
 *   5. Arithmetic shift of both by 11 bits, then packssdw into xmm2: the four
 *      sums land in the low words and the four differences in the high words,
 *      saturated to signed 16 bit.
 * Result: xmm2. Clobbers xmm0, xmm1, xmm2 and xmm3. */
181 #define iMTX_MULT(src, table, rounder, put) \
182 "movdqa "src", %%xmm3 \n\t" \
183 "movdqa %%xmm3, %%xmm0 \n\t" \
184 "pshufd $0x11, %%xmm3, %%xmm1 \n\t" \
185 "punpcklqdq %%xmm0, %%xmm0 \n\t" \
186 "pmaddwd "table", %%xmm0 \n\t" \
187 "pmaddwd 16+"table", %%xmm1 \n\t" \
188 "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" \
189 "punpckhqdq %%xmm3, %%xmm3 \n\t" \
190 "pmaddwd 32+"table", %%xmm2 \n\t" \
191 "pmaddwd 48+"table", %%xmm3 \n\t" \
192 "paddd %%xmm1, %%xmm0 \n\t" \
193 "paddd %%xmm3, %%xmm2 \n\t" \
194 rounder", %%xmm0 \n\t" \
195 "movdqa %%xmm2, %%xmm3 \n\t" \
196 "paddd %%xmm0, %%xmm2 \n\t" \
197 "psubd %%xmm3, %%xmm0 \n\t" \
198 "psrad $11, %%xmm2 \n\t" \
199 "psrad $11, %%xmm0 \n\t" \
200 "packssdw %%xmm0, %%xmm2 \n\t" \
/* NOTE(review): the #define line that owns the next two lines is missing from
 * this excerpt. They load the constants tan3 and tan1 into the registers
 * named by TAN3 and TAN1, which iLLM_PASS and iLLM_PASS_SPARSE expect to be
 * set up on entry. */
205 "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
206 "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
/* iLLM_PASS(dct): column pass over all eight rows; dct is the base register
 * of the output block.
 *
 * Expected on entry: odd rows in xmm6/xmm4/xmm5/xmm7 (ROW1/ROW3/ROW5/ROW7),
 * even rows in ROW0/ROW2/ROW4/ROW6, and the tan3/tan1 constants already in
 * TAN3/TAN1.
 *
 * Steps, in order:
 *   1. Odd part: rows 3 and 5 are combined through pmulhw by tan3, rows 1 and
 *      7 through pmulhw by tan1, followed by add/sub butterflies. Two of the
 *      results (xmm3 and TAN3) are then multiplied by the sqrt2 constant and
 *      doubled (paddsw r,r).
 *   2. Even part: rows 2 and 6 are combined through the tan2 constant, rows 0
 *      and 4 through a plain add/sub butterfly; XMMS is the scratch copy for
 *      each butterfly.
 *   3. Output: butterflies between the two parts, psraw by 6 on every output
 *      register, then eight 16-byte movdqa stores to 0..7*16(dct) (rows
 *      1, 2, 5, 6 first, then 0, 3, 4, 7).
 *
 * Lines that start with MOV_32_ONLY are instructions only where that macro is
 * "movdqa ": there they load the even rows from memory into REGn and
 * store/reload TAN1 through (%0) around the section that uses XMMS (the same
 * register as TAN1 in that configuration). Where MOV_32_ONLY is "#" those
 * lines are assembler comments. Note that the spill slot is the literal (%0),
 * not the dct parameter.
 *
 * All 16-bit additions/subtractions saturate (paddsw/psubsw). The stores use
 * movdqa, so dct must be 16-byte aligned. */
209 #define iLLM_PASS(dct) \
210 "movdqa "TAN3", %%xmm1 \n\t" \
211 "movdqa "TAN1", %%xmm3 \n\t" \
212 "pmulhw %%xmm4, "TAN3" \n\t" \
213 "pmulhw %%xmm5, %%xmm1 \n\t" \
214 "paddsw %%xmm4, "TAN3" \n\t" \
215 "paddsw %%xmm5, %%xmm1 \n\t" \
216 "psubsw %%xmm5, "TAN3" \n\t" \
217 "paddsw %%xmm4, %%xmm1 \n\t" \
218 "pmulhw %%xmm7, %%xmm3 \n\t" \
219 "pmulhw %%xmm6, "TAN1" \n\t" \
220 "paddsw %%xmm6, %%xmm3 \n\t" \
221 "psubsw %%xmm7, "TAN1" \n\t" \
222 "movdqa %%xmm3, %%xmm7 \n\t" \
223 "movdqa "TAN1", %%xmm6 \n\t" \
224 "psubsw %%xmm1, %%xmm3 \n\t" \
225 "psubsw "TAN3", "TAN1" \n\t" \
226 "paddsw %%xmm7, %%xmm1 \n\t" \
227 "paddsw %%xmm6, "TAN3" \n\t" \
228 "movdqa %%xmm3, %%xmm6 \n\t" \
229 "psubsw "TAN3", %%xmm3 \n\t" \
230 "paddsw %%xmm6, "TAN3" \n\t" \
231 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
232 "pmulhw %%xmm4, %%xmm3 \n\t" \
233 "pmulhw %%xmm4, "TAN3" \n\t" \
234 "paddsw "TAN3", "TAN3" \n\t" \
235 "paddsw %%xmm3, %%xmm3 \n\t" \
236 "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
237 MOV_32_ONLY ROW2", "REG2" \n\t" \
238 MOV_32_ONLY ROW6", "REG6" \n\t" \
239 "movdqa %%xmm7, %%xmm5 \n\t" \
240 "pmulhw "REG6", %%xmm7 \n\t" \
241 "pmulhw "REG2", %%xmm5 \n\t" \
242 "paddsw "REG2", %%xmm7 \n\t" \
243 "psubsw "REG6", %%xmm5 \n\t" \
244 MOV_32_ONLY ROW0", "REG0" \n\t" \
245 MOV_32_ONLY ROW4", "REG4" \n\t" \
246 MOV_32_ONLY" "TAN1", (%0) \n\t" \
247 "movdqa "REG0", "XMMS" \n\t" \
248 "psubsw "REG4", "REG0" \n\t" \
249 "paddsw "XMMS", "REG4" \n\t" \
250 "movdqa "REG4", "XMMS" \n\t" \
251 "psubsw %%xmm7, "REG4" \n\t" \
252 "paddsw "XMMS", %%xmm7 \n\t" \
253 "movdqa "REG0", "XMMS" \n\t" \
254 "psubsw %%xmm5, "REG0" \n\t" \
255 "paddsw "XMMS", %%xmm5 \n\t" \
256 "movdqa %%xmm5, "XMMS" \n\t" \
257 "psubsw "TAN3", %%xmm5 \n\t" \
258 "paddsw "XMMS", "TAN3" \n\t" \
259 "movdqa "REG0", "XMMS" \n\t" \
260 "psubsw %%xmm3, "REG0" \n\t" \
261 "paddsw "XMMS", %%xmm3 \n\t" \
262 MOV_32_ONLY" (%0), "TAN1" \n\t" \
263 "psraw $6, %%xmm5 \n\t" \
264 "psraw $6, "REG0" \n\t" \
265 "psraw $6, "TAN3" \n\t" \
266 "psraw $6, %%xmm3 \n\t" \
267 "movdqa "TAN3", 1*16("dct") \n\t" \
268 "movdqa %%xmm3, 2*16("dct") \n\t" \
269 "movdqa "REG0", 5*16("dct") \n\t" \
270 "movdqa %%xmm5, 6*16("dct") \n\t" \
271 "movdqa %%xmm7, %%xmm0 \n\t" \
272 "movdqa "REG4", %%xmm4 \n\t" \
273 "psubsw %%xmm1, %%xmm7 \n\t" \
274 "psubsw "TAN1", "REG4" \n\t" \
275 "paddsw %%xmm0, %%xmm1 \n\t" \
276 "paddsw %%xmm4, "TAN1" \n\t" \
277 "psraw $6, %%xmm1 \n\t" \
278 "psraw $6, %%xmm7 \n\t" \
279 "psraw $6, "TAN1" \n\t" \
280 "psraw $6, "REG4" \n\t" \
281 "movdqa %%xmm1, ("dct") \n\t" \
282 "movdqa "TAN1", 3*16("dct") \n\t" \
283 "movdqa "REG4", 4*16("dct") \n\t" \
284 "movdqa %%xmm7, 7*16("dct") \n\t"
/* iLLM_PASS_SPARSE(dct): reduced column pass that reads only rows 0..3 as
 * input -- xmm6 (ROW1), xmm4 (ROW3), ROW0 and ROW2. Rows 4..7 are never read;
 * xmm5 is overwritten with the tan2 constant and SREG2 receives row 2.
 * NOTE(review): presumably used when rows 4..7 tested as all-zero; the branch
 * instructions that select this path are not visible in this excerpt.
 *
 * Expected on entry: the tan3/tan1 constants already in TAN3/TAN1.
 *
 * Steps, in order:
 *   1. Odd part from rows 1 and 3 only: pmulhw by tan3 / tan1, add/sub
 *      butterflies, then xmm3 and TAN3 are multiplied by the sqrt2 constant
 *      and doubled (paddsw r,r).
 *   2. Even part from rows 0 and 2 only: row 2 scaled by tan2 (xmm5), and
 *      row0 -/+ row2 kept in xmm6 / SREG2.
 *   3. Output: butterflies between the two parts with XMMS as scratch, psraw
 *      by 6 on every output register, then eight 16-byte movdqa stores to
 *      0..7*16(dct) (rows 1, 2, 5, 6 first, then 0, 3, 4, 7).
 *
 * Lines that start with MOV_32_ONLY are instructions only where that macro is
 * "movdqa " (loading ROW2/ROW0 from memory and storing/reloading TAN1 through
 * the literal (%0)); where it is "#" they are assembler comments.
 *
 * All 16-bit additions/subtractions saturate (paddsw/psubsw). The stores use
 * movdqa, so dct must be 16-byte aligned. */
287 #define iLLM_PASS_SPARSE(dct) \
288 "pmulhw %%xmm4, "TAN3" \n\t" \
289 "paddsw %%xmm4, "TAN3" \n\t" \
290 "movdqa %%xmm6, %%xmm3 \n\t" \
291 "pmulhw %%xmm6, "TAN1" \n\t" \
292 "movdqa %%xmm4, %%xmm1 \n\t" \
293 "psubsw %%xmm1, %%xmm3 \n\t" \
294 "paddsw %%xmm6, %%xmm1 \n\t" \
295 "movdqa "TAN1", %%xmm6 \n\t" \
296 "psubsw "TAN3", "TAN1" \n\t" \
297 "paddsw %%xmm6, "TAN3" \n\t" \
298 "movdqa %%xmm3, %%xmm6 \n\t" \
299 "psubsw "TAN3", %%xmm3 \n\t" \
300 "paddsw %%xmm6, "TAN3" \n\t" \
301 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
302 "pmulhw %%xmm4, %%xmm3 \n\t" \
303 "pmulhw %%xmm4, "TAN3" \n\t" \
304 "paddsw "TAN3", "TAN3" \n\t" \
305 "paddsw %%xmm3, %%xmm3 \n\t" \
306 "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
307 MOV_32_ONLY ROW2", "SREG2" \n\t" \
308 "pmulhw "SREG2", %%xmm5 \n\t" \
309 MOV_32_ONLY ROW0", "REG0" \n\t" \
310 "movdqa "REG0", %%xmm6 \n\t" \
311 "psubsw "SREG2", %%xmm6 \n\t" \
312 "paddsw "REG0", "SREG2" \n\t" \
313 MOV_32_ONLY" "TAN1", (%0) \n\t" \
314 "movdqa "REG0", "XMMS" \n\t" \
315 "psubsw %%xmm5, "REG0" \n\t" \
316 "paddsw "XMMS", %%xmm5 \n\t" \
317 "movdqa %%xmm5, "XMMS" \n\t" \
318 "psubsw "TAN3", %%xmm5 \n\t" \
319 "paddsw "XMMS", "TAN3" \n\t" \
320 "movdqa "REG0", "XMMS" \n\t" \
321 "psubsw %%xmm3, "REG0" \n\t" \
322 "paddsw "XMMS", %%xmm3 \n\t" \
323 MOV_32_ONLY" (%0), "TAN1" \n\t" \
324 "psraw $6, %%xmm5 \n\t" \
325 "psraw $6, "REG0" \n\t" \
326 "psraw $6, "TAN3" \n\t" \
327 "psraw $6, %%xmm3 \n\t" \
328 "movdqa "TAN3", 1*16("dct") \n\t" \
329 "movdqa %%xmm3, 2*16("dct") \n\t" \
330 "movdqa "REG0", 5*16("dct") \n\t" \
331 "movdqa %%xmm5, 6*16("dct") \n\t" \
332 "movdqa "SREG2", %%xmm0 \n\t" \
333 "movdqa %%xmm6, %%xmm4 \n\t" \
334 "psubsw %%xmm1, "SREG2" \n\t" \
335 "psubsw "TAN1", %%xmm6 \n\t" \
336 "paddsw %%xmm0, %%xmm1 \n\t" \
337 "paddsw %%xmm4, "TAN1" \n\t" \
338 "psraw $6, %%xmm1 \n\t" \
339 "psraw $6, "SREG2" \n\t" \
340 "psraw $6, "TAN1" \n\t" \
341 "psraw $6, %%xmm6 \n\t" \
342 "movdqa %%xmm1, ("dct") \n\t" \
343 "movdqa "TAN1", 3*16("dct") \n\t" \
344 "movdqa %%xmm6, 4*16("dct") \n\t" \
345 "movdqa "SREG2", 7*16("dct") \n\t"
/* Body of the inline-asm statement that runs the row pass and then a column
 * pass on the block addressed by operand %0 (eight rows of 16 bytes).
 * NOTE(review): the enclosing function header, the asm opener, the local
 * labels, the JZ/JNZ branches between the steps and the operand constraints
 * are not visible in this excerpt, so the control flow between the steps
 * below cannot be read off these lines. */
/* mm0 = m127: the constant the TEST_* macros add to every byte of a row. */
350 "movq "MANGLE(m127)
", %%mm0 \n\t"
/* Rows 0, 1 and 2 are row-transformed without any test in the visible lines,
 * each with its own table and rounder offset. */
351 iMTX_MULT(
"(%0)",
MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
352 iMTX_MULT("1*16(%0)",
MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
353 iMTX_MULT("2*16(%0)",
MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
/* Summarize rows 3 and 4 into eax and ecx; the CLEAR_* arguments name the
 * destinations for rows 3 and 4. */
355 TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
357 iMTX_MULT("3*16(%0)",
MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
/* Summarize rows 5, 6 and 7 into eax (reused), edx and esi. */
359 TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
360 TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
/* Reduced column pass that reads rows 0..3 only. */
367 iLLM_PASS_SPARSE("%0")
/* Row pass for rows 4..7. Row 4 passes a '#' string instead of ROUND(...), so
 * its rounder line is emitted as an assembler comment and nothing is added.
 * NOTE(review): that string literal is split across two lines in this
 * excerpt. Rows 6 and 7 both use rounder offset 5*16. */
370 iMTX_MULT("4*16(%0)",
MANGLE(iTab1), "
#", PUT_EVEN(ROW4))
372 iMTX_MULT(
"5*16(%0)",
MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
375 iMTX_MULT("6*16(%0)",
MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
378 iMTX_MULT("7*16(%0)",
MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
/* Clobber list. NOTE(review): the lines that open each group of XMM register
 * names (and any conditional around the xmm12..xmm14 group) are not visible
 * in this excerpt. The four general-purpose registers are the ones written
 * by the TEST_* macros; "memory" covers the stores made through %0. */
387 "%xmm4" ,
"%xmm5" ,
"%xmm6" ,
"%xmm7" ,)
390 "%xmm12",
"%xmm13",
"%xmm14",)
392 "%eax",
"%ecx",
"%edx",
"%esi",
"memory"