00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/cpu.h"
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/dsputil.h"
00028 #include "libavcodec/h264dsp.h"
00029 #include "libavcodec/mpegvideo.h"
00030 #include "libavcodec/simple_idct.h"
00031 #include "dsputil_mmx.h"
00032 #include "idct_xvid.h"
00033
00034
00035
00036
00037
/*
 * Replicated constants used by the SIMD code in this file.
 * As the initializers show, ff_pw_N holds the 16-bit word N (N written in
 * decimal) in every word lane, and ff_pb_XX holds the byte 0xXX (XX written
 * in hex) in every byte lane. The uint64_t constants are 8 bytes wide; the
 * xmm_reg constants are initialized with two identical 64-bit halves.
 */
/* ff_bone: 0x01 in every byte; ff_wtwo: 0x0002 in every 16-bit word. */
00038 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
00039 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
00040
/* 0x80000000 in each of the four 32-bit lanes. */
00041 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
00042 {0x8000000080000000ULL, 0x8000000080000000ULL};
00043
/* Packed 16-bit word constants. */
00044 DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
00045 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
00046 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
00047 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
00048 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
00049 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
00050 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
00051 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
00052 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
00053 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
00054 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
00055 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
00056 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
00057 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
00058 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
00059 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
00060 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
00061 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
00062 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
00063 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
00064 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
00065
/* Packed byte constants (name suffix is the byte value in hex). */
00066 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
00067 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
00068 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
00069 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
00070 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
00071 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
00072 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
00073 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
00074 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
00075 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
00076 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
00077 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
00078 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
00079
/* Packed double-precision constants (two lanes each). */
00080 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
00081 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
00082
/*
 * Small helper statements that each emit one stand-alone asm block.
 * JUMPALIGN emits ASMALIGN(3) (defined elsewhere; presumably an alignment
 * directive - confirm). MOVQ_ZERO clears the named register by XORing it
 * with itself.
 */
00083 #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
00084 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
00085
/*
 * Load 0xFE into every byte of regd without touching memory:
 * pcmpeqd sets all bits, then paddb doubles each byte (0xFF + 0xFF = 0xFE).
 */
00086 #define MOVQ_BFE(regd) \
00087 __asm__ volatile ( \
00088 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
00089 "paddb %%" #regd ", %%" #regd " \n\t" ::)
00090
/*
 * MOVQ_BONE loads 0x01 into every byte, MOVQ_WTWO loads 0x0002 into every
 * word. Without PIC the values are read from ff_bone / ff_wtwo; with PIC
 * they are synthesized in the register so no data symbol is referenced.
 */
00091 #ifndef PIC
00092 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
00093 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
00094 #else
00095
00096
/* all ones -> 0x0001 per word (psrlw 15) -> 0x01 per byte (packuswb) */
00097 #define MOVQ_BONE(regd) \
00098 __asm__ volatile ( \
00099 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00100 "psrlw $15, %%" #regd " \n\t" \
00101 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
00102
/* all ones -> 0x0001 per word (psrlw 15) -> 0x0002 per word (psllw 1) */
00103 #define MOVQ_WTWO(regd) \
00104 __asm__ volatile ( \
00105 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00106 "psrlw $15, %%" #regd " \n\t" \
00107 "psllw $1, %%" #regd " \n\t"::)
00108
00109 #endif
00110
00111
00112
00113
/*
 * Byte-wise average of rega and regb written to regr, rounding down:
 *   regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
 * regfe is the mask applied before the 64-bit shift so that no bit crosses
 * a byte boundary (callers are expected to pass 0xFE per byte - see
 * MOVQ_BFE). regb is overwritten; rega is preserved.
 */
00114 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
00115 "movq " #rega ", " #regr " \n\t"\
00116 "pand " #regb ", " #regr " \n\t"\
00117 "pxor " #rega ", " #regb " \n\t"\
00118 "pand " #regfe "," #regb " \n\t"\
00119 "psrlq $1, " #regb " \n\t"\
00120 "paddb " #regb ", " #regr " \n\t"
00121
/*
 * Byte-wise average of rega and regb written to regr, rounding up:
 *   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
 * regb is overwritten; rega is preserved.
 */
00122 #define PAVGB_MMX(rega, regb, regr, regfe) \
00123 "movq " #rega ", " #regr " \n\t"\
00124 "por " #regb ", " #regr " \n\t"\
00125 "pxor " #rega ", " #regb " \n\t"\
00126 "pand " #regfe "," #regb " \n\t"\
00127 "psrlq $1, " #regb " \n\t"\
00128 "psubb " #regb ", " #regr " \n\t"
00129
00130
/*
 * Two interleaved round-down averages: regr = avg(rega, regb) and
 * regp = avg(regc, regd). The mask is hard-wired to %mm6, which must
 * already hold the byte mask. regb and regd are overwritten.
 */
00131 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
00132 "movq " #rega ", " #regr " \n\t"\
00133 "movq " #regc ", " #regp " \n\t"\
00134 "pand " #regb ", " #regr " \n\t"\
00135 "pand " #regd ", " #regp " \n\t"\
00136 "pxor " #rega ", " #regb " \n\t"\
00137 "pxor " #regc ", " #regd " \n\t"\
00138 "pand %%mm6, " #regb " \n\t"\
00139 "pand %%mm6, " #regd " \n\t"\
00140 "psrlq $1, " #regb " \n\t"\
00141 "psrlq $1, " #regd " \n\t"\
00142 "paddb " #regb ", " #regr " \n\t"\
00143 "paddb " #regd ", " #regp " \n\t"
00144
/*
 * Two interleaved round-up averages: regr = avg(rega, regb) and
 * regp = avg(regc, regd). The mask is hard-wired to %mm6.
 * regb and regd are overwritten.
 */
00145 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
00146 "movq " #rega ", " #regr " \n\t"\
00147 "movq " #regc ", " #regp " \n\t"\
00148 "por " #regb ", " #regr " \n\t"\
00149 "por " #regd ", " #regp " \n\t"\
00150 "pxor " #rega ", " #regb " \n\t"\
00151 "pxor " #regc ", " #regd " \n\t"\
00152 "pand %%mm6, " #regb " \n\t"\
00153 "pand %%mm6, " #regd " \n\t"\
00154 "psrlq $1, " #regd " \n\t"\
00155 "psrlq $1, " #regb " \n\t"\
00156 "psubb " #regb ", " #regr " \n\t"\
00157 "psubb " #regd ", " #regp " \n\t"
00158
00159
00160
/*
 * Instantiation 1: the rounding template expanded with the round-down
 * averaging macros. Generated names have the form <x>_no_rnd_<y>_mmx.
 * SET_RND maps to MOVQ_WONE, which is not defined in this file
 * (presumably provided by dsputil_mmx.h - confirm).
 * OP_AVG is defined here and deliberately stays defined through the
 * second inclusion; it is only removed at the #undef after that one.
 */
00161 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
00162 #define SET_RND MOVQ_WONE
00163 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
00164 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
00165 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
00166
00167 #include "dsputil_mmx_rnd_template.c"
00168
00169 #undef DEF
00170 #undef SET_RND
00171 #undef PAVGBP
00172 #undef PAVGB
00173
00174
00175
/*
 * Instantiation 2: the same template with the round-up averaging macros.
 * Generated names have the form <x>_<y>_mmx.
 */
00176 #define DEF(x, y) x ## _ ## y ##_mmx
00177 #define SET_RND MOVQ_WTWO
00178 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
00179 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
00180
00181 #include "dsputil_mmx_rnd_template.c"
00182
00183 #undef DEF
00184 #undef SET_RND
00185 #undef PAVGBP
00186 #undef PAVGB
00187 #undef OP_AVG
00188
00189
00190
00191
/*
 * Instantiation 3: the averaging template for 3DNow!, where PAVGB is the
 * instruction mnemonic "pavgusb". Generated names end in _3dnow.
 */
00192 #define DEF(x) x ## _3dnow
00193 #define PAVGB "pavgusb"
00194 #define OP_AVG PAVGB
00195
00196 #include "dsputil_mmx_avg_template.c"
00197
00198 #undef DEF
00199 #undef PAVGB
00200 #undef OP_AVG
00201
00202
00203
00204
/*
 * Instantiation 4: the averaging template for MMX2, where PAVGB is the
 * instruction mnemonic "pavgb". Generated names end in _mmx2.
 */
00205 #define DEF(x) x ## _mmx2
00206
00207
00208 #define PAVGB "pavgb"
00209 #define OP_AVG PAVGB
00210
00211 #include "dsputil_mmx_avg_template.c"
00212
00213 #undef DEF
00214 #undef PAVGB
00215 #undef OP_AVG
00216
/*
 * Name aliases: the no_rnd, mmx2 and 3dnow variants of the plain copy
 * functions all resolve to the MMX put_pixels* implementations below.
 */
00217 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
00218 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
00219 #define put_pixels16_mmx2 put_pixels16_mmx
00220 #define put_pixels8_mmx2 put_pixels8_mmx
00221 #define put_pixels4_mmx2 put_pixels4_mmx
00222 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
00223 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
00224 #define put_pixels16_3dnow put_pixels16_mmx
00225 #define put_pixels8_3dnow put_pixels8_mmx
00226 #define put_pixels4_3dnow put_pixels4_mmx
00227 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
00228 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
00229
00230
00231
00232
/**
 * Convert 64 coefficients to unsigned bytes and store them as 8 rows of
 * 8 pixels. packuswb saturates each signed 16-bit value to 0..255.
 * The work is split into two asm statements, each handling four rows
 * (64 bytes of input; p advances by 32 elements, which assumes DCTELEM is
 * a 16-bit type - confirm).
 *
 * @param block     source coefficients, read sequentially
 * @param pixels    destination of the first row
 * @param line_size byte distance between consecutive destination rows
 */
00233 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00234 {
00235 const DCTELEM *p;
00236 uint8_t *pix;
00237
00238
00239 p = block;
00240 pix = pixels;
00241
/* Rows 0-3. The input is passed as a memory operand (%3), so the byte
 * offsets are written as a prefix to that operand ("8%3", "16%3", ...). */
00242 __asm__ volatile(
00243 "movq %3, %%mm0 \n\t"
00244 "movq 8%3, %%mm1 \n\t"
00245 "movq 16%3, %%mm2 \n\t"
00246 "movq 24%3, %%mm3 \n\t"
00247 "movq 32%3, %%mm4 \n\t"
00248 "movq 40%3, %%mm5 \n\t"
00249 "movq 48%3, %%mm6 \n\t"
00250 "movq 56%3, %%mm7 \n\t"
00251 "packuswb %%mm1, %%mm0 \n\t"
00252 "packuswb %%mm3, %%mm2 \n\t"
00253 "packuswb %%mm5, %%mm4 \n\t"
00254 "packuswb %%mm7, %%mm6 \n\t"
00255 "movq %%mm0, (%0) \n\t"
00256 "movq %%mm2, (%0, %1) \n\t"
00257 "movq %%mm4, (%0, %1, 2) \n\t"
00258 "movq %%mm6, (%0, %2) \n\t"
00259 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
00260 :"memory");
00261 pix += line_size*4;
00262 p += 32;
00263
00264
00265
00266
/* Rows 4-7. Same operation, but here the input pointer is passed in a
 * register and dereferenced with explicit displacements. */
00267 __asm__ volatile(
00268 "movq (%3), %%mm0 \n\t"
00269 "movq 8(%3), %%mm1 \n\t"
00270 "movq 16(%3), %%mm2 \n\t"
00271 "movq 24(%3), %%mm3 \n\t"
00272 "movq 32(%3), %%mm4 \n\t"
00273 "movq 40(%3), %%mm5 \n\t"
00274 "movq 48(%3), %%mm6 \n\t"
00275 "movq 56(%3), %%mm7 \n\t"
00276 "packuswb %%mm1, %%mm0 \n\t"
00277 "packuswb %%mm3, %%mm2 \n\t"
00278 "packuswb %%mm5, %%mm4 \n\t"
00279 "packuswb %%mm7, %%mm6 \n\t"
00280 "movq %%mm0, (%0) \n\t"
00281 "movq %%mm2, (%0, %1) \n\t"
00282 "movq %%mm4, (%0, %1, 2) \n\t"
00283 "movq %%mm6, (%0, %2) \n\t"
00284 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
00285 :"memory");
00286 }
00287
/* Eight bytes of 0x80: the bias added after signed saturation below. */
00288 DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
00289 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
00290
/*
 * Asm fragment that converts four rows of eight 16-bit values to bytes.
 * 'off' is the byte offset of the first row inside the block (%2).
 * Each row is packed with signed saturation (packsswb) and then biased by
 * the contents of %mm0 (paddb). Rows are stored at %0, %0+%3, %0+2*%3 and
 * %0+%1, so the caller must provide: %0 = destination, %1 = 3 * stride,
 * %2 = block pointer, %3 = stride, %mm0 = bias. Clobbers %mm1-%mm4.
 */
00291 #define put_signed_pixels_clamped_mmx_half(off) \
00292 "movq "#off"(%2), %%mm1 \n\t"\
00293 "movq 16+"#off"(%2), %%mm2 \n\t"\
00294 "movq 32+"#off"(%2), %%mm3 \n\t"\
00295 "movq 48+"#off"(%2), %%mm4 \n\t"\
00296 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
00297 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
00298 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
00299 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
00300 "paddb %%mm0, %%mm1 \n\t"\
00301 "paddb %%mm0, %%mm2 \n\t"\
00302 "paddb %%mm0, %%mm3 \n\t"\
00303 "paddb %%mm0, %%mm4 \n\t"\
00304 "movq %%mm1, (%0) \n\t"\
00305 "movq %%mm2, (%0, %3) \n\t"\
00306 "movq %%mm3, (%0, %3, 2) \n\t"\
00307 "movq %%mm4, (%0, %1) \n\t"
00308
/**
 * Store 64 signed coefficients as 8 rows of 8 unsigned pixels.
 * Each value is saturated to the signed byte range and then offset by
 * 0x80 (ff_vector128), which maps -128..127 onto 0..255.
 *
 * @param block     source coefficients (128 bytes are read)
 * @param pixels    destination of the first row
 * @param line_size byte distance between consecutive destination rows
 */
00309 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00310 {
00311 x86_reg line_skip = line_size;
00312 x86_reg line_skip3;
00313
00314 __asm__ volatile (
00315 "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
/* line_skip3 = 3 * line_skip */
00316 "lea (%3, %3, 2), %1 \n\t"
/* rows 0-3 from block bytes 0..63 */
00317 put_signed_pixels_clamped_mmx_half(0)
/* advance the destination by four rows */
00318 "lea (%0, %3, 4), %0 \n\t"
/* rows 4-7 from block bytes 64..127 */
00319 put_signed_pixels_clamped_mmx_half(64)
00320 :"+&r" (pixels), "=&r" (line_skip3)
00321 :"r" (block), "r"(line_skip)
00322 :"memory");
00323 }
00324
/**
 * Add 64 coefficients to an 8x8 block of pixels in place.
 * Pixels are widened to 16 bits, added with signed saturation (paddsw)
 * and packed back to bytes with unsigned saturation (packuswb).
 * The loop runs four times and handles two rows per iteration.
 *
 * %mm7 is cleared by a separate asm statement (MOVQ_ZERO) before the loop
 * and is used as the zero register for unpacking in every iteration, so
 * this code relies on %mm7 not being modified in between.
 *
 * @param block     coefficients to add, read sequentially
 * @param pixels    first row of the pixels to update
 * @param line_size byte distance between consecutive rows
 */
00325 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00326 {
00327 const DCTELEM *p;
00328 uint8_t *pix;
00329 int i;
00330
00331
00332 p = block;
00333 pix = pixels;
00334 MOVQ_ZERO(mm7);
00335 i = 4;
00336 do {
00337 __asm__ volatile(
/* mm0/mm1: coefficients of the first row, mm2/mm3: of the second row */
00338 "movq (%2), %%mm0 \n\t"
00339 "movq 8(%2), %%mm1 \n\t"
00340 "movq 16(%2), %%mm2 \n\t"
00341 "movq 24(%2), %%mm3 \n\t"
/* mm4: pixels of the first row, mm6: pixels of the second row */
00342 "movq %0, %%mm4 \n\t"
00343 "movq %1, %%mm6 \n\t"
00344 "movq %%mm4, %%mm5 \n\t"
00345 "punpcklbw %%mm7, %%mm4 \n\t"
00346 "punpckhbw %%mm7, %%mm5 \n\t"
00347 "paddsw %%mm4, %%mm0 \n\t"
00348 "paddsw %%mm5, %%mm1 \n\t"
00349 "movq %%mm6, %%mm5 \n\t"
00350 "punpcklbw %%mm7, %%mm6 \n\t"
00351 "punpckhbw %%mm7, %%mm5 \n\t"
00352 "paddsw %%mm6, %%mm2 \n\t"
00353 "paddsw %%mm5, %%mm3 \n\t"
00354 "packuswb %%mm1, %%mm0 \n\t"
00355 "packuswb %%mm3, %%mm2 \n\t"
00356 "movq %%mm0, %0 \n\t"
00357 "movq %%mm2, %1 \n\t"
00358 :"+m"(*pix), "+m"(*(pix+line_size))
00359 :"r"(p)
00360 :"memory");
00361 pix += line_size*2;
00362 p += 16;
00363 } while (--i);
00364 }
00365
/**
 * Copy a block that is 4 bytes wide and h rows high.
 * Four rows are copied per loop iteration and the loop ends only when h
 * reaches exactly zero (subl $4 / jnz), so h must be a positive multiple
 * of 4. The same stride is used for source and destination.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows
 */
00366 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00367 {
00368 __asm__ volatile(
/* REG_a = 2 * line_size, the pointer step for each pair of rows */
00369 "lea (%3, %3), %%"REG_a" \n\t"
00370 ASMALIGN(3)
00371 "1: \n\t"
00372 "movd (%1), %%mm0 \n\t"
00373 "movd (%1, %3), %%mm1 \n\t"
00374 "movd %%mm0, (%2) \n\t"
00375 "movd %%mm1, (%2, %3) \n\t"
00376 "add %%"REG_a", %1 \n\t"
00377 "add %%"REG_a", %2 \n\t"
00378 "movd (%1), %%mm0 \n\t"
00379 "movd (%1, %3), %%mm1 \n\t"
00380 "movd %%mm0, (%2) \n\t"
00381 "movd %%mm1, (%2, %3) \n\t"
00382 "add %%"REG_a", %1 \n\t"
00383 "add %%"REG_a", %2 \n\t"
00384 "subl $4, %0 \n\t"
00385 "jnz 1b \n\t"
00386 : "+g"(h), "+r" (pixels), "+r" (block)
00387 : "r"((x86_reg)line_size)
00388 : "%"REG_a, "memory"
00389 );
00390 }
00391
/**
 * Copy a block that is 8 bytes wide and h rows high.
 * Four rows are copied per loop iteration and the loop ends only when h
 * reaches exactly zero (subl $4 / jnz), so h must be a positive multiple
 * of 4. The same stride is used for source and destination.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows
 */
00392 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00393 {
00394 __asm__ volatile(
/* REG_a = 2 * line_size, the pointer step for each pair of rows */
00395 "lea (%3, %3), %%"REG_a" \n\t"
00396 ASMALIGN(3)
00397 "1: \n\t"
00398 "movq (%1), %%mm0 \n\t"
00399 "movq (%1, %3), %%mm1 \n\t"
00400 "movq %%mm0, (%2) \n\t"
00401 "movq %%mm1, (%2, %3) \n\t"
00402 "add %%"REG_a", %1 \n\t"
00403 "add %%"REG_a", %2 \n\t"
00404 "movq (%1), %%mm0 \n\t"
00405 "movq (%1, %3), %%mm1 \n\t"
00406 "movq %%mm0, (%2) \n\t"
00407 "movq %%mm1, (%2, %3) \n\t"
00408 "add %%"REG_a", %1 \n\t"
00409 "add %%"REG_a", %2 \n\t"
00410 "subl $4, %0 \n\t"
00411 "jnz 1b \n\t"
00412 : "+g"(h), "+r" (pixels), "+r" (block)
00413 : "r"((x86_reg)line_size)
00414 : "%"REG_a, "memory"
00415 );
00416 }
00417
/**
 * Copy a block that is 16 bytes wide and h rows high, using two 8-byte
 * MMX moves per row.
 * Four rows are copied per loop iteration and the loop ends only when h
 * reaches exactly zero (subl $4 / jnz), so h must be a positive multiple
 * of 4. The same stride is used for source and destination.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows
 */
00418 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00419 {
00420 __asm__ volatile(
/* REG_a = 2 * line_size, the pointer step for each pair of rows */
00421 "lea (%3, %3), %%"REG_a" \n\t"
00422 ASMALIGN(3)
00423 "1: \n\t"
00424 "movq (%1), %%mm0 \n\t"
00425 "movq 8(%1), %%mm4 \n\t"
00426 "movq (%1, %3), %%mm1 \n\t"
00427 "movq 8(%1, %3), %%mm5 \n\t"
00428 "movq %%mm0, (%2) \n\t"
00429 "movq %%mm4, 8(%2) \n\t"
00430 "movq %%mm1, (%2, %3) \n\t"
00431 "movq %%mm5, 8(%2, %3) \n\t"
00432 "add %%"REG_a", %1 \n\t"
00433 "add %%"REG_a", %2 \n\t"
00434 "movq (%1), %%mm0 \n\t"
00435 "movq 8(%1), %%mm4 \n\t"
00436 "movq (%1, %3), %%mm1 \n\t"
00437 "movq 8(%1, %3), %%mm5 \n\t"
00438 "movq %%mm0, (%2) \n\t"
00439 "movq %%mm4, 8(%2) \n\t"
00440 "movq %%mm1, (%2, %3) \n\t"
00441 "movq %%mm5, 8(%2, %3) \n\t"
00442 "add %%"REG_a", %1 \n\t"
00443 "add %%"REG_a", %2 \n\t"
00444 "subl $4, %0 \n\t"
00445 "jnz 1b \n\t"
00446 : "+g"(h), "+r" (pixels), "+r" (block)
00447 : "r"((x86_reg)line_size)
00448 : "%"REG_a, "memory"
00449 );
00450 }
00451
/**
 * Copy a block that is 16 bytes wide and h rows high using SSE2.
 * The source is read with unaligned loads (movdqu); the destination is
 * written with aligned stores (movdqa), so every destination row must be
 * 16-byte aligned. Four rows are handled per iteration and the loop ends
 * only when h reaches exactly zero, so h must be a positive multiple of 4.
 *
 * @param block     destination (16-byte aligned rows)
 * @param pixels    source
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows
 */
00452 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00453 {
00454 __asm__ volatile(
00455 "1: \n\t"
00456 "movdqu (%1), %%xmm0 \n\t"
00457 "movdqu (%1,%3), %%xmm1 \n\t"
00458 "movdqu (%1,%3,2), %%xmm2 \n\t"
00459 "movdqu (%1,%4), %%xmm3 \n\t"
00460 "movdqa %%xmm0, (%2) \n\t"
00461 "movdqa %%xmm1, (%2,%3) \n\t"
00462 "movdqa %%xmm2, (%2,%3,2) \n\t"
00463 "movdqa %%xmm3, (%2,%4) \n\t"
/* The row counter is decremented here; the lea instructions that follow
 * do not modify the flags, so jnz still tests the result of subl. */
00464 "subl $4, %0 \n\t"
00465 "lea (%1,%3,4), %1 \n\t"
00466 "lea (%2,%3,4), %2 \n\t"
00467 "jnz 1b \n\t"
00468 : "+g"(h), "+r" (pixels), "+r" (block)
00469 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
00470 : "memory"
00471 );
00472 }
00473
/**
 * Average a 16-byte-wide, h-row source block into the destination:
 * each destination byte becomes pavgb(source, destination).
 * The source is read with unaligned loads (movdqu); the destination is
 * accessed as an aligned memory operand (pavgb, movdqa), so every
 * destination row must be 16-byte aligned. Four rows are handled per
 * iteration and the loop ends only when h reaches exactly zero, so h must
 * be a positive multiple of 4.
 *
 * @param block     destination, read and written (16-byte aligned rows)
 * @param pixels    source
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows
 */
00474 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00475 {
00476 __asm__ volatile(
00477 "1: \n\t"
00478 "movdqu (%1), %%xmm0 \n\t"
00479 "movdqu (%1,%3), %%xmm1 \n\t"
00480 "movdqu (%1,%3,2), %%xmm2 \n\t"
00481 "movdqu (%1,%4), %%xmm3 \n\t"
00482 "pavgb (%2), %%xmm0 \n\t"
00483 "pavgb (%2,%3), %%xmm1 \n\t"
00484 "pavgb (%2,%3,2), %%xmm2 \n\t"
00485 "pavgb (%2,%4), %%xmm3 \n\t"
00486 "movdqa %%xmm0, (%2) \n\t"
00487 "movdqa %%xmm1, (%2,%3) \n\t"
00488 "movdqa %%xmm2, (%2,%3,2) \n\t"
00489 "movdqa %%xmm3, (%2,%4) \n\t"
/* lea does not modify the flags, so jnz tests the result of subl. */
00490 "subl $4, %0 \n\t"
00491 "lea (%1,%3,4), %1 \n\t"
00492 "lea (%2,%3,4), %2 \n\t"
00493 "jnz 1b \n\t"
00494 : "+g"(h), "+r" (pixels), "+r" (block)
00495 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
00496 : "memory"
00497 );
00498 }
00499
/*
 * Define a function 'name' that zeroes n * 128 bytes starting at 'blocks'
 * using 8-byte MMX stores.
 * The asm is given a pointer to the END of the region (%0) and a negative
 * byte index in REG_a that starts at -128*n; each iteration clears 32 bytes
 * and the loop continues while the index is still negative (js).
 * The stores go through a register pointer, so the asm declares a "memory"
 * clobber to tell the compiler that memory it cannot see has been changed.
 * clear_blocks_mmx clears 6 * 128 bytes, clear_block_mmx clears 128 bytes.
 */
00500 #define CLEAR_BLOCKS(name,n) \
00501 static void name(DCTELEM *blocks)\
00502 {\
00503 __asm__ volatile(\
00504 "pxor %%mm7, %%mm7 \n\t"\
00505 "mov %1, %%"REG_a" \n\t"\
00506 "1: \n\t"\
00507 "movq %%mm7, (%0, %%"REG_a") \n\t"\
00508 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
00509 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
00510 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
00511 "add $32, %%"REG_a" \n\t"\
00512 " js 1b \n\t"\
00513 : : "r" (((uint8_t *)blocks)+128*n),\
00514 "i" (-128*n)\
00515 : "%"REG_a, "memory"\
00516 );\
00517 }
00518 CLEAR_BLOCKS(clear_blocks_mmx, 6)
00519 CLEAR_BLOCKS(clear_block_mmx, 1)
00520
/**
 * Zero 128 bytes starting at block with eight 16-byte SSE stores.
 * movaps requires its memory operand to be 16-byte aligned, so block must
 * be 16-byte aligned.
 *
 * @param block start of the 128-byte region to clear
 */
00521 static void clear_block_sse(DCTELEM *block)
00522 {
00523 __asm__ volatile(
00524 "xorps %%xmm0, %%xmm0 \n"
00525 "movaps %%xmm0, (%0) \n"
00526 "movaps %%xmm0, 16(%0) \n"
00527 "movaps %%xmm0, 32(%0) \n"
00528 "movaps %%xmm0, 48(%0) \n"
00529 "movaps %%xmm0, 64(%0) \n"
00530 "movaps %%xmm0, 80(%0) \n"
00531 "movaps %%xmm0, 96(%0) \n"
00532 "movaps %%xmm0, 112(%0) \n"
00533 :: "r"(block)
00534 : "memory"
00535 );
00536 }
00537
/**
 * Zero 6 * 128 bytes starting at blocks with 16-byte SSE stores.
 * The asm is given a pointer to the END of the region (%0) and a negative
 * byte index in REG_a that starts at -128*6; each iteration clears 128
 * bytes and the loop continues while the index is still negative (js).
 * movaps requires 16-byte alignment, so blocks must be 16-byte aligned.
 * The stores go through a register pointer, so the asm declares a "memory"
 * clobber, matching clear_block_sse.
 *
 * @param blocks start of the 768-byte region to clear
 */
00538 static void clear_blocks_sse(DCTELEM *blocks)
00539 {
00540 __asm__ volatile(
00541 "xorps %%xmm0, %%xmm0 \n"
00542 "mov %1, %%"REG_a" \n"
00543 "1: \n"
00544 "movaps %%xmm0, (%0, %%"REG_a") \n"
00545 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
00546 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
00547 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
00548 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
00549 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
00550 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
00551 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
00552 "add $128, %%"REG_a" \n"
00553 " js 1b \n"
00554 : : "r" (((uint8_t *)blocks)+128*6),
00555 "i" (-128*6)
00556 : "%"REG_a, "memory"
00557 );
00558 }
00559
/**
 * Byte-wise wrapping addition in place: dst[i] += src[i] for 0 <= i < w.
 * The MMX loop handles 16 bytes per iteration while i < w - 15; the C loop
 * afterwards handles the remaining bytes one at a time.
 * The asm reads src and writes dst through register pointers, so it
 * declares a "memory" clobber to keep the compiler from caching or
 * reordering accesses to those buffers around it.
 *
 * @param dst buffer that is read and updated
 * @param src buffer that is added to dst
 * @param w   number of bytes to process
 */
00560 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
00561 x86_reg i=0;
00562 __asm__ volatile(
00563 "jmp 2f \n\t"
00564 "1: \n\t"
00565 "movq (%1, %0), %%mm0 \n\t"
00566 "movq (%2, %0), %%mm1 \n\t"
00567 "paddb %%mm0, %%mm1 \n\t"
00568 "movq %%mm1, (%2, %0) \n\t"
00569 "movq 8(%1, %0), %%mm0 \n\t"
00570 "movq 8(%2, %0), %%mm1 \n\t"
00571 "paddb %%mm0, %%mm1 \n\t"
00572 "movq %%mm1, 8(%2, %0) \n\t"
00573 "add $16, %0 \n\t"
00574 "2: \n\t"
/* loop while i - (w - 15) is negative */
00575 "cmp %3, %0 \n\t"
00576 " js 1b \n\t"
00577 : "+r" (i)
00578 : "r"(src), "r"(dst), "r"((x86_reg)w-15) : "memory"
00579 );
/* scalar tail for the bytes the 16-byte loop did not cover */
00580 for(; i<w; i++)
00581 dst[i+0] += src[i+0];
00582 }
00583
/**
 * Byte-wise wrapping addition of two sources:
 * dst[i] = src1[i] + src2[i] for 0 <= i < w.
 * The MMX loop handles 16 bytes per iteration while i < w - 15; the C loop
 * afterwards handles the remaining bytes one at a time.
 * The asm reads src1/src2 and writes dst through register pointers, so it
 * declares a "memory" clobber to keep the compiler from caching or
 * reordering accesses to those buffers around it.
 *
 * @param dst  output buffer
 * @param src1 first input buffer
 * @param src2 second input buffer
 * @param w    number of bytes to process
 */
00584 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
00585 x86_reg i=0;
00586 __asm__ volatile(
00587 "jmp 2f \n\t"
00588 "1: \n\t"
00589 "movq (%2, %0), %%mm0 \n\t"
00590 "movq 8(%2, %0), %%mm1 \n\t"
00591 "paddb (%3, %0), %%mm0 \n\t"
00592 "paddb 8(%3, %0), %%mm1 \n\t"
00593 "movq %%mm0, (%1, %0) \n\t"
00594 "movq %%mm1, 8(%1, %0) \n\t"
00595 "add $16, %0 \n\t"
00596 "2: \n\t"
/* loop while i - (w - 15) is negative */
00597 "cmp %4, %0 \n\t"
00598 " js 1b \n\t"
00599 : "+r" (i)
00600 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) : "memory"
00601 );
/* scalar tail for the bytes the 16-byte loop did not cover */
00602 for(; i<w; i++)
00603 dst[i] = src1[i] + src2[i];
00604 }
00605
/**
 * Median-predicted byte reconstruction using conditional moves.
 * For each of the w positions, with l = previous output byte, t = top[x]
 * and tl = previous top byte, the asm computes
 *   dst[x] = (uint8_t)(median(l, t, (uint8_t)(l + t - tl)) + diff[x])
 * and then carries l = dst[x], tl = t into the next position.
 * The running values are read from *left / *left_top (low 8 bits) and
 * written back on return.
 *
 * Register use: the index (%4) runs from -w up to 0 and all three buffers
 * are addressed relative to their ends. The register of operand %3 holds
 * the top pointer for the load, is then reused as scratch for the gradient
 * l + t - tl, and is reloaded from %7 in every iteration.
 * The asm reads top/diff and writes dst through register pointers, so it
 * declares a "memory" clobber.
 *
 * @param dst      output bytes
 * @param top      bytes of the row above
 * @param diff     residual bytes added to the prediction
 * @param w        number of bytes; the loop body runs at least once
 * @param left     in/out: last reconstructed byte
 * @param left_top in/out: last byte read from top
 */
00606 #if HAVE_7REGS && HAVE_TEN_OPERANDS
00607 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
00608 x86_reg w2 = -w;
00609 x86_reg x;
00610 int l = *left & 0xff;
00611 int tl = *left_top & 0xff;
00612 int t;
00613 __asm__ volatile(
00614 "mov %7, %3 \n"
00615 "1: \n"
/* t = top[x]; scratch = (uint8_t)(t - tl + l) */
00616 "movzbl (%3,%4), %2 \n"
00617 "mov %2, %k3 \n"
00618 "sub %b1, %b3 \n"
00619 "add %b0, %b3 \n"
/* tl = t for the next position */
00620 "mov %2, %1 \n"
/* sort so that %2 = min(l, t) and %0 = max(l, t) */
00621 "cmp %0, %2 \n"
00622 "cmovg %0, %2 \n"
00623 "cmovg %1, %0 \n"
/* %0 = min(max(l, t), gradient) */
00624 "cmp %k3, %0 \n"
00625 "cmovg %k3, %0 \n"
/* restore the top pointer that was used as scratch */
00626 "mov %7, %3 \n"
/* %0 = max(%0, min(l, t)), i.e. the median of the three values */
00627 "cmp %2, %0 \n"
00628 "cmovl %2, %0 \n"
00629 "add (%6,%4), %b0 \n"
00630 "mov %b0, (%5,%4) \n"
00631 "inc %4 \n"
00632 "jl 1b \n"
00633 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
00634 :"r"(dst+w), "r"(diff+w), "rm"(top+w) : "memory"
00635 );
00636 *left = l;
00637 *left_top = tl;
00638 }
00639 #endif
00640
/*
 * Asm body shared by the two H.263 loop-filter functions that follow.
 *
 * Operands expected from the enclosing asm statement:
 *   %0..%3  four 8-byte memory operands, the four lines across the edge
 *   %4      twice the filter strength (loaded with movd and replicated to
 *           every byte)
 *   %5      a byte mask applied before the 2-bit word shift (the callers
 *           pass ff_pb_FC)
 *
 * The macro only computes; it does not store. On exit the filtered lines
 * are in registers: %mm5 for %0, %mm3 for %1, %mm4 for %2 and %mm6 for %3.
 * All of %mm0-%mm7 are overwritten.
 */
00641 #define H263_LOOP_FILTER \
00642 "pxor %%mm7, %%mm7 \n\t"\
00643 "movq %0, %%mm0 \n\t"\
00644 "movq %0, %%mm1 \n\t"\
00645 "movq %3, %%mm2 \n\t"\
00646 "movq %3, %%mm3 \n\t"\
00647 "punpcklbw %%mm7, %%mm0 \n\t"\
00648 "punpckhbw %%mm7, %%mm1 \n\t"\
00649 "punpcklbw %%mm7, %%mm2 \n\t"\
00650 "punpckhbw %%mm7, %%mm3 \n\t"\
00651 "psubw %%mm2, %%mm0 \n\t"\
00652 "psubw %%mm3, %%mm1 \n\t"\
00653 "movq %1, %%mm2 \n\t"\
00654 "movq %1, %%mm3 \n\t"\
00655 "movq %2, %%mm4 \n\t"\
00656 "movq %2, %%mm5 \n\t"\
00657 "punpcklbw %%mm7, %%mm2 \n\t"\
00658 "punpckhbw %%mm7, %%mm3 \n\t"\
00659 "punpcklbw %%mm7, %%mm4 \n\t"\
00660 "punpckhbw %%mm7, %%mm5 \n\t"\
00661 "psubw %%mm2, %%mm4 \n\t"\
00662 "psubw %%mm3, %%mm5 \n\t"\
00663 "psllw $2, %%mm4 \n\t"\
00664 "psllw $2, %%mm5 \n\t"\
00665 "paddw %%mm0, %%mm4 \n\t"\
00666 "paddw %%mm1, %%mm5 \n\t"\
00667 "pxor %%mm6, %%mm6 \n\t"\
00668 "pcmpgtw %%mm4, %%mm6 \n\t"\
00669 "pcmpgtw %%mm5, %%mm7 \n\t"\
00670 "pxor %%mm6, %%mm4 \n\t"\
00671 "pxor %%mm7, %%mm5 \n\t"\
00672 "psubw %%mm6, %%mm4 \n\t"\
00673 "psubw %%mm7, %%mm5 \n\t"\
00674 "psrlw $3, %%mm4 \n\t"\
00675 "psrlw $3, %%mm5 \n\t"\
00676 "packuswb %%mm5, %%mm4 \n\t"\
00677 "packsswb %%mm7, %%mm6 \n\t"\
00678 "pxor %%mm7, %%mm7 \n\t"\
00679 "movd %4, %%mm2 \n\t"\
00680 "punpcklbw %%mm2, %%mm2 \n\t"\
00681 "punpcklbw %%mm2, %%mm2 \n\t"\
00682 "punpcklbw %%mm2, %%mm2 \n\t"\
00683 "psubusb %%mm4, %%mm2 \n\t"\
00684 "movq %%mm2, %%mm3 \n\t"\
00685 "psubusb %%mm4, %%mm3 \n\t"\
00686 "psubb %%mm3, %%mm2 \n\t"\
00687 "movq %1, %%mm3 \n\t"\
00688 "movq %2, %%mm4 \n\t"\
00689 "pxor %%mm6, %%mm3 \n\t"\
00690 "pxor %%mm6, %%mm4 \n\t"\
00691 "paddusb %%mm2, %%mm3 \n\t"\
00692 "psubusb %%mm2, %%mm4 \n\t"\
00693 "pxor %%mm6, %%mm3 \n\t"\
00694 "pxor %%mm6, %%mm4 \n\t"\
00695 "paddusb %%mm2, %%mm2 \n\t"\
00696 "packsswb %%mm1, %%mm0 \n\t"\
00697 "pcmpgtb %%mm0, %%mm7 \n\t"\
00698 "pxor %%mm7, %%mm0 \n\t"\
00699 "psubb %%mm7, %%mm0 \n\t"\
00700 "movq %%mm0, %%mm1 \n\t"\
00701 "psubusb %%mm2, %%mm0 \n\t"\
00702 "psubb %%mm0, %%mm1 \n\t"\
00703 "pand %5, %%mm1 \n\t"\
00704 "psrlw $2, %%mm1 \n\t"\
00705 "pxor %%mm7, %%mm1 \n\t"\
00706 "psubb %%mm7, %%mm1 \n\t"\
00707 "movq %0, %%mm5 \n\t"\
00708 "movq %3, %%mm6 \n\t"\
00709 "psubb %%mm1, %%mm5 \n\t"\
00710 "paddb %%mm1, %%mm6 \n\t"
00711
/**
 * Apply the H.263 loop filter to 8 pixels across a horizontal edge.
 * The four rows src - 2*stride, src - stride, src and src + stride are
 * filtered (8 bytes each) and written back in place. The strength is
 * looked up in ff_h263_loop_filter_strength[qscale].
 * The body is compiled only when the H.263 decoder or encoder is enabled;
 * otherwise the function does nothing.
 *
 * @param src    first pixel of the row just below the edge
 * @param stride byte distance between rows
 * @param qscale index into the strength table
 */
00712 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00713 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
00714 const int strength= ff_h263_loop_filter_strength[qscale];
00715
00716 __asm__ volatile(
00717
00718 H263_LOOP_FILTER
00719
/* store the filtered rows left in registers by H263_LOOP_FILTER */
00720 "movq %%mm3, %1 \n\t"
00721 "movq %%mm4, %2 \n\t"
00722 "movq %%mm5, %0 \n\t"
00723 "movq %%mm6, %3 \n\t"
00724 : "+m" (*(uint64_t*)(src - 2*stride)),
00725 "+m" (*(uint64_t*)(src - 1*stride)),
00726 "+m" (*(uint64_t*)(src + 0*stride)),
00727 "+m" (*(uint64_t*)(src + 1*stride))
00728 : "g" (2*strength), "m"(ff_pb_FC)
00729 );
00730 }
00731 }
00732
/**
 * Apply the H.263 loop filter to 8 rows across a vertical edge.
 * The 4 columns starting at src - 2 are transposed for 8 rows into a
 * temporary buffer (two transpose4x4 calls; that helper is defined
 * elsewhere and is assumed to take dst, src, dst stride, src stride -
 * confirm), filtered as four 8-byte lines, then transposed back and
 * written to the frame as 4 bytes per row.
 *
 * The filtered lines are handed from the first asm statement to the
 * second in %mm3-%mm6, so this code relies on the MMX registers not being
 * modified between the two statements.
 * The second statement stores into the frame through register pointers and
 * therefore declares a "memory" clobber.
 *
 * The body is compiled only when the H.263 decoder or encoder is enabled;
 * otherwise the function does nothing.
 *
 * @param src    pixel just right of the edge in the first row
 * @param stride byte distance between rows
 * @param qscale index into ff_h263_loop_filter_strength
 */
00733 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00734 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
00735 const int strength= ff_h263_loop_filter_strength[qscale];
00736 DECLARE_ALIGNED(8, uint64_t, temp)[4];
00737 uint8_t *btemp= (uint8_t*)temp;
00738
00739 src -= 2;
00740
00741 transpose4x4(btemp , src , 8, stride);
00742 transpose4x4(btemp+4, src + 4*stride, 8, stride);
00743 __asm__ volatile(
00744 H263_LOOP_FILTER
00745
00746 : "+m" (temp[0]),
00747 "+m" (temp[1]),
00748 "+m" (temp[2]),
00749 "+m" (temp[3])
00750 : "g" (2*strength), "m"(ff_pb_FC)
00751 );
00752
/* Transpose the four filtered lines (mm5, mm3, mm4, mm6) back into
 * 8 rows of 4 bytes and store them: %0 addresses rows 0-3, %1 rows 4-7. */
00753 __asm__ volatile(
00754 "movq %%mm5, %%mm1 \n\t"
00755 "movq %%mm4, %%mm0 \n\t"
00756 "punpcklbw %%mm3, %%mm5 \n\t"
00757 "punpcklbw %%mm6, %%mm4 \n\t"
00758 "punpckhbw %%mm3, %%mm1 \n\t"
00759 "punpckhbw %%mm6, %%mm0 \n\t"
00760 "movq %%mm5, %%mm3 \n\t"
00761 "movq %%mm1, %%mm6 \n\t"
00762 "punpcklwd %%mm4, %%mm5 \n\t"
00763 "punpcklwd %%mm0, %%mm1 \n\t"
00764 "punpckhwd %%mm4, %%mm3 \n\t"
00765 "punpckhwd %%mm0, %%mm6 \n\t"
00766 "movd %%mm5, (%0) \n\t"
00767 "punpckhdq %%mm5, %%mm5 \n\t"
00768 "movd %%mm5, (%0,%2) \n\t"
00769 "movd %%mm3, (%0,%2,2) \n\t"
00770 "punpckhdq %%mm3, %%mm3 \n\t"
00771 "movd %%mm3, (%0,%3) \n\t"
00772 "movd %%mm1, (%1) \n\t"
00773 "punpckhdq %%mm1, %%mm1 \n\t"
00774 "movd %%mm1, (%1,%2) \n\t"
00775 "movd %%mm6, (%1,%2,2) \n\t"
00776 "punpckhdq %%mm6, %%mm6 \n\t"
00777 "movd %%mm6, (%1,%3) \n\t"
00778 :: "r" (src),
00779 "r" (src + 4*stride),
00780 "r" ((x86_reg) stride ),
00781 "r" ((x86_reg)(3*stride)) : "memory"
00782 );
00783 }
00784 }
00785
00786
00787
/**
 * Extend a picture into its border by replicating the edge pixels.
 *
 * Step 1 (left/right): for every row, the first pixel is replicated into
 * the bytes just left of the row and the last pixel into the bytes just
 * right of it. When w == 8 this fills 8 bytes per side; in every other
 * case the code fills 16 bytes per side.
 * Step 2 (top/bottom): the first row, including its left and right
 * borders (width + 2*w bytes starting at buf - w), is copied to the rows
 * above the picture, and the last row likewise to the rows below. Rows are
 * written four at a time for i = 0, 4, ... while i < w.
 *
 * Every asm loop stores into the frame through register pointers, and the
 * later loops read what the earlier ones wrote, so each statement declares
 * a "memory" clobber.
 *
 * @param buf    top-left pixel of the picture
 * @param wrap   byte distance between rows
 * @param width  picture width in bytes
 * @param height picture height in rows
 * @param w      border size
 */
00788 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
00789 {
00790 uint8_t *ptr, *last_line;
00791 int i;
00792
00793 last_line = buf + (height - 1) * wrap;
00794
00795 ptr = buf;
00796 if(w==8)
00797 {
/* left/right borders, 8 bytes per side */
00798 __asm__ volatile(
00799 "1: \n\t"
00800 "movd (%0), %%mm0 \n\t"
00801 "punpcklbw %%mm0, %%mm0 \n\t"
00802 "punpcklwd %%mm0, %%mm0 \n\t"
00803 "punpckldq %%mm0, %%mm0 \n\t"
00804 "movq %%mm0, -8(%0) \n\t"
00805 "movq -8(%0, %2), %%mm1 \n\t"
00806 "punpckhbw %%mm1, %%mm1 \n\t"
00807 "punpckhwd %%mm1, %%mm1 \n\t"
00808 "punpckhdq %%mm1, %%mm1 \n\t"
00809 "movq %%mm1, (%0, %2) \n\t"
00810 "add %1, %0 \n\t"
00811 "cmp %3, %0 \n\t"
00812 " jb 1b \n\t"
00813 : "+r" (ptr)
00814 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) : "memory"
00815 );
00816 }
00817 else
00818 {
/* left/right borders, 16 bytes per side */
00819 __asm__ volatile(
00820 "1: \n\t"
00821 "movd (%0), %%mm0 \n\t"
00822 "punpcklbw %%mm0, %%mm0 \n\t"
00823 "punpcklwd %%mm0, %%mm0 \n\t"
00824 "punpckldq %%mm0, %%mm0 \n\t"
00825 "movq %%mm0, -8(%0) \n\t"
00826 "movq %%mm0, -16(%0) \n\t"
00827 "movq -8(%0, %2), %%mm1 \n\t"
00828 "punpckhbw %%mm1, %%mm1 \n\t"
00829 "punpckhwd %%mm1, %%mm1 \n\t"
00830 "punpckhdq %%mm1, %%mm1 \n\t"
00831 "movq %%mm1, (%0, %2) \n\t"
00832 "movq %%mm1, 8(%0, %2) \n\t"
00833 "add %1, %0 \n\t"
00834 "cmp %3, %0 \n\t"
00835 " jb 1b \n\t"
00836 : "+r" (ptr)
00837 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) : "memory"
00838 );
00839 }
00840
00841 for(i=0;i<w;i+=4) {
00842
/* top border: %1 is the distance from ptr down to the first row, so
 * (%1, %0) reads row 0 at the same column; the four stores write rows
 * -(i+1) .. -(i+4) using the negative strides in %2 and %3. */
00843 ptr= buf - (i + 1) * wrap - w;
00844 __asm__ volatile(
00845 "1: \n\t"
00846 "movq (%1, %0), %%mm0 \n\t"
00847 "movq %%mm0, (%0) \n\t"
00848 "movq %%mm0, (%0, %2) \n\t"
00849 "movq %%mm0, (%0, %2, 2) \n\t"
00850 "movq %%mm0, (%0, %3) \n\t"
00851 "add $8, %0 \n\t"
00852 "cmp %4, %0 \n\t"
00853 " jb 1b \n\t"
00854 : "+r" (ptr)
00855 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w) : "memory"
00856 );
/* bottom border: same scheme, reading the last row and writing the
 * rows (i+1) .. (i+4) below it. */
00857 ptr= last_line + (i + 1) * wrap - w;
00858 __asm__ volatile(
00859 "1: \n\t"
00860 "movq (%1, %0), %%mm0 \n\t"
00861 "movq %%mm0, (%0) \n\t"
00862 "movq %%mm0, (%0, %2) \n\t"
00863 "movq %%mm0, (%0, %2, 2) \n\t"
00864 "movq %%mm0, (%0, %3) \n\t"
00865 "add $8, %0 \n\t"
00866 "cmp %4, %0 \n\t"
00867 " jb 1b \n\t"
00868 : "+r" (ptr)
00869 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w) : "memory"
00870 );
00871 }
00872 }
00873
/*
 * Define add_png_paeth_prediction_<cpu>(dst, src, top, w, bpp).
 *
 * The generated function walks the row in steps of bpp bytes starting at
 * offset 0 and continuing while the offset is <= w - 3. In each step it
 * loads 4 bytes of 'top' and 'src', widens them to 16-bit lanes and, with
 *   a = previous output (dst at offset - bpp),
 *   b = top at the current offset,
 *   c = top at offset - bpp,
 * forms c - b, c - a and their sum, takes the absolute value of all three
 * (the 'abs3' fragment) and uses compare masks on those magnitudes to pick
 * one of a, b or c per lane. The selected value is added to src, masked
 * with ff_pw_255 to keep the low 8 bits, and 4 bytes are stored to dst.
 * Because 4 bytes are stored every step while the offset advances by bpp,
 * consecutive stores overlap when bpp < 4.
 *
 * The first two loads read dst and top at offset -bpp, so both buffers
 * must be readable bpp bytes before their start.
 *
 * 'abs3' must leave |mm3|, |mm4|, |mm5| in place and %mm7 equal to zero.
 */
00874 #define PAETH(cpu, abs3)\
00875 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
00876 {\
00877 x86_reg i = -bpp;\
00878 x86_reg end = w-3;\
00879 __asm__ volatile(\
00880 "pxor %%mm7, %%mm7 \n"\
00881 "movd (%1,%0), %%mm0 \n"\
00882 "movd (%2,%0), %%mm1 \n"\
00883 "punpcklbw %%mm7, %%mm0 \n"\
00884 "punpcklbw %%mm7, %%mm1 \n"\
00885 "add %4, %0 \n"\
00886 "1: \n"\
00887 "movq %%mm1, %%mm2 \n"\
00888 "movd (%2,%0), %%mm1 \n"\
00889 "movq %%mm2, %%mm3 \n"\
00890 "punpcklbw %%mm7, %%mm1 \n"\
00891 "movq %%mm2, %%mm4 \n"\
00892 "psubw %%mm1, %%mm3 \n"\
00893 "psubw %%mm0, %%mm4 \n"\
00894 "movq %%mm3, %%mm5 \n"\
00895 "paddw %%mm4, %%mm5 \n"\
00896 abs3\
00897 "movq %%mm4, %%mm6 \n"\
00898 "pminsw %%mm5, %%mm6 \n"\
00899 "pcmpgtw %%mm6, %%mm3 \n"\
00900 "pcmpgtw %%mm5, %%mm4 \n"\
00901 "movq %%mm4, %%mm6 \n"\
00902 "pand %%mm3, %%mm4 \n"\
00903 "pandn %%mm3, %%mm6 \n"\
00904 "pandn %%mm0, %%mm3 \n"\
00905 "movd (%3,%0), %%mm0 \n"\
00906 "pand %%mm1, %%mm6 \n"\
00907 "pand %%mm4, %%mm2 \n"\
00908 "punpcklbw %%mm7, %%mm0 \n"\
00909 "movq %6, %%mm5 \n"\
00910 "paddw %%mm6, %%mm0 \n"\
00911 "paddw %%mm2, %%mm3 \n"\
00912 "paddw %%mm3, %%mm0 \n"\
00913 "pand %%mm5, %%mm0 \n"\
00914 "movq %%mm0, %%mm3 \n"\
00915 "packuswb %%mm3, %%mm3 \n"\
00916 "movd %%mm3, (%1,%0) \n"\
00917 "add %4, %0 \n"\
00918 "cmp %5, %0 \n"\
00919 "jle 1b \n"\
00920 :"+r"(i)\
00921 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
00922 "m"(ff_pw_255)\
00923 :"memory"\
00924 );\
00925 }
00926
/*
 * Absolute value of mm3, mm4 and mm5 (16-bit lanes) as max(x, -x).
 * Requires %mm7 == 0 on entry (used to negate mm5), uses mm6 and mm7 as
 * scratch, and clears %mm7 again before returning.
 */
00927 #define ABS3_MMX2\
00928 "psubw %%mm5, %%mm7 \n"\
00929 "pmaxsw %%mm7, %%mm5 \n"\
00930 "pxor %%mm6, %%mm6 \n"\
00931 "pxor %%mm7, %%mm7 \n"\
00932 "psubw %%mm3, %%mm6 \n"\
00933 "psubw %%mm4, %%mm7 \n"\
00934 "pmaxsw %%mm6, %%mm3 \n"\
00935 "pmaxsw %%mm7, %%mm4 \n"\
00936 "pxor %%mm7, %%mm7 \n"
00937
/* Absolute value of mm3, mm4 and mm5 using pabsw; %mm7 is left untouched. */
00938 #define ABS3_SSSE3\
00939 "pabsw %%mm3, %%mm3 \n"\
00940 "pabsw %%mm4, %%mm4 \n"\
00941 "pabsw %%mm5, %%mm5 \n"
00942
/* The SSSE3 variant is only built when HAVE_SSSE3 is set. */
00943 PAETH(mmx2, ABS3_MMX2)
00944 #if HAVE_SSSE3
00945 PAETH(ssse3, ABS3_SSSE3)
00946 #endif
00947
/*
 * Asm fragment producing one output line of an 8-tap low-pass filter on
 * 16-bit lanes:
 *   out = OP( pack( (20*(m3 + m4) - 6*(in2 + m5) + 3*(in1 + m6)
 *                    - (in0 + in7) + rnd) >> 5 ) )
 * m3..m6 are registers holding four input lines; in0, in1, in2 and in7 are
 * operands for the remaining four. The result is packed to bytes with
 * unsigned saturation and handed to OP together with 'out'.
 *
 * Notes visible in the body:
 *  - the pw_20 and pw_3 parameters are not used; the constants are taken
 *    from MANGLE(ff_pw_20) and MANGLE(ff_pw_3) directly;
 *  - %mm4, %mm5 and %mm6 are hard-coded scratch registers, and %mm7 is
 *    passed to OP as its temporary;
 *  - m3 is overwritten (first with m3 + m4, then with in7).
 */
00948 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
00949 "paddw " #m4 ", " #m3 " \n\t" \
00950 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
00951 "pmullw " #m3 ", %%mm4 \n\t" \
00952 "movq "#in7", " #m3 " \n\t" \
00953 "movq "#in0", %%mm5 \n\t" \
00954 "paddw " #m3 ", %%mm5 \n\t" \
00955 "psubw %%mm5, %%mm4 \n\t" \
00956 "movq "#in1", %%mm5 \n\t" \
00957 "movq "#in2", %%mm6 \n\t" \
00958 "paddw " #m6 ", %%mm5 \n\t" \
00959 "paddw " #m5 ", %%mm6 \n\t" \
00960 "paddw %%mm6, %%mm6 \n\t" \
00961 "psubw %%mm6, %%mm5 \n\t" \
00962 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
00963 "paddw " #rnd ", %%mm4 \n\t" \
00964 "paddw %%mm4, %%mm5 \n\t" \
00965 "psraw $5, %%mm5 \n\t"\
00966 "packuswb %%mm5, %%mm5 \n\t"\
00967 OP(%%mm5, out, %%mm7, d)
00968
/*
 * QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)
 *
 * Expands to four static horizontal low-pass routines:
 *   OPNAME##mpeg4_qpel16_h_lowpass_mmx2   16 output pixels per row, MMX asm
 *   OPNAME##mpeg4_qpel16_h_lowpass_3dnow  16 output pixels per row, C filter
 *   OPNAME##mpeg4_qpel8_h_lowpass_mmx2     8 output pixels per row, MMX asm
 *   OPNAME##mpeg4_qpel8_h_lowpass_3dnow    8 output pixels per row, C filter
 *
 * All four share the signature (dst, src, dstStride, srcStride, h) and
 * process h rows, advancing src by srcStride and dst by dstStride per row.
 *
 * The filter is spelled out in C in the two _3dnow variants:
 *   t[i] = 20*(s[i]+s[i+1]) - 6*(s[i-1]+s[i+2])
 *        +  3*(s[i-2]+s[i+3]) -   (s[i-3]+s[i+4])
 * where indices that would fall outside the row (src[0..16] for the 16-wide
 * routines, src[0..8] for the 8-wide ones) are replaced by in-range indices,
 * exactly as written in the tables below.  The result is then
 * (t[i] + ROUNDER) >> 5, packed to bytes with unsigned saturation
 * (packuswb) and written through the OP_* store macro.
 *
 * The _mmx2 variants use the same constants (ff_pw_20, ff_pw_3, ROUNDER,
 * shift by 5; the weight 6 is formed by doubling before the multiply by 3)
 * and appear to implement the same filter -- NOTE(review): not traced
 * register by register here.
 *
 * Macro parameters:
 *   OPNAME    prefix pasted onto every generated function name
 *   ROUNDER   memory operand of packed words added before the >>5
 *   RND       not referenced anywhere in this macro's body
 *   OP_MMX2   store macro used by the _mmx2 variants (dst write, 8 bytes)
 *   OP_3DNOW  store macro used by the _3dnow variants (dst write, 8 bytes)
 *
 * asm operands of qpel16_h_lowpass_mmx2:
 *   %0 src ("a")  %1 dst ("c")  %2 h ("D")  %3 srcStride ("d")
 *   %4 dstStride ("S")  %5 temp (memory scratch)  %6 ROUNDER
 * asm operands of qpel8_h_lowpass_mmx2:
 *   %0 src ("a")  %1 dst ("c")  %2 h ("d")  %3 srcStride ("S")
 *   %4 dstStride ("D")  %5 ROUNDER
 * NOTE(review): temp is written by the asm ("movq %%mm0, %5") although it is
 * listed as an input operand; the "memory" clobber is what covers that.
 */
00969 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
00970 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
00971 uint64_t temp;\
00972 /* temp (%5) parks four filtered words of a row until they are packed with the next four */\
00973 __asm__ volatile(\
00974 "pxor %%mm7, %%mm7 \n\t"\
00975 "1: \n\t"\
00976 "movq (%0), %%mm0 \n\t" \
00977 "movq %%mm0, %%mm1 \n\t" \
00978 "movq %%mm0, %%mm2 \n\t" \
00979 "punpcklbw %%mm7, %%mm0 \n\t" \
00980 "punpckhbw %%mm7, %%mm1 \n\t" \
00981 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
00982 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
00983 "movq %%mm2, %%mm3 \n\t" \
00984 "movq %%mm2, %%mm4 \n\t" \
00985 "psllq $8, %%mm2 \n\t" \
00986 "psllq $16, %%mm3 \n\t" \
00987 "psllq $24, %%mm4 \n\t" \
00988 "punpckhbw %%mm7, %%mm2 \n\t" \
00989 "punpckhbw %%mm7, %%mm3 \n\t" \
00990 "punpckhbw %%mm7, %%mm4 \n\t" \
00991 "paddw %%mm3, %%mm5 \n\t" \
00992 "paddw %%mm2, %%mm6 \n\t" \
00993 "paddw %%mm5, %%mm5 \n\t" \
00994 "psubw %%mm5, %%mm6 \n\t" \
00995 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
00996 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
00997 "paddw %%mm4, %%mm0 \n\t" \
00998 "paddw %%mm1, %%mm5 \n\t" \
00999 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
01000 "psubw %%mm5, %%mm0 \n\t" \
01001 "paddw %6, %%mm6 \n\t"\
01002 "paddw %%mm6, %%mm0 \n\t" \
01003 "psraw $5, %%mm0 \n\t"\
01004 "movq %%mm0, %5 \n\t"\
01005 /* first four results are now parked in temp; the next load starts at source byte 5 */\
01006 \
01007 "movq 5(%0), %%mm0 \n\t" \
01008 "movq %%mm0, %%mm5 \n\t" \
01009 "movq %%mm0, %%mm6 \n\t" \
01010 "psrlq $8, %%mm0 \n\t" \
01011 "psrlq $16, %%mm5 \n\t" \
01012 "punpcklbw %%mm7, %%mm0 \n\t" \
01013 "punpcklbw %%mm7, %%mm5 \n\t" \
01014 "paddw %%mm0, %%mm2 \n\t" \
01015 "paddw %%mm5, %%mm3 \n\t" \
01016 "paddw %%mm2, %%mm2 \n\t" \
01017 "psubw %%mm2, %%mm3 \n\t" \
01018 "movq %%mm6, %%mm2 \n\t" \
01019 "psrlq $24, %%mm6 \n\t" \
01020 "punpcklbw %%mm7, %%mm2 \n\t" \
01021 "punpcklbw %%mm7, %%mm6 \n\t" \
01022 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
01023 "paddw %%mm2, %%mm1 \n\t" \
01024 "paddw %%mm6, %%mm4 \n\t" \
01025 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
01026 "psubw %%mm4, %%mm3 \n\t" \
01027 "paddw %6, %%mm1 \n\t"\
01028 "paddw %%mm1, %%mm3 \n\t" \
01029 "psraw $5, %%mm3 \n\t"\
01030 "movq %5, %%mm1 \n\t"\
01031 "packuswb %%mm3, %%mm1 \n\t"\
01032 OP_MMX2(%%mm1, (%1),%%mm4, q)\
01033 /* dst bytes 0-7 have been written through OP_MMX2; the next load starts at source byte 9 */\
01034 \
01035 "movq 9(%0), %%mm1 \n\t" \
01036 "movq %%mm1, %%mm4 \n\t" \
01037 "movq %%mm1, %%mm3 \n\t" \
01038 "psrlq $8, %%mm1 \n\t" \
01039 "psrlq $16, %%mm4 \n\t" \
01040 "punpcklbw %%mm7, %%mm1 \n\t" \
01041 "punpcklbw %%mm7, %%mm4 \n\t" \
01042 "paddw %%mm1, %%mm5 \n\t" \
01043 "paddw %%mm4, %%mm0 \n\t" \
01044 "paddw %%mm5, %%mm5 \n\t" \
01045 "psubw %%mm5, %%mm0 \n\t" \
01046 "movq %%mm3, %%mm5 \n\t" \
01047 "psrlq $24, %%mm3 \n\t" \
01048 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
01049 "punpcklbw %%mm7, %%mm3 \n\t" \
01050 "paddw %%mm3, %%mm2 \n\t" \
01051 "psubw %%mm2, %%mm0 \n\t" \
01052 "movq %%mm5, %%mm2 \n\t" \
01053 "punpcklbw %%mm7, %%mm2 \n\t" \
01054 "punpckhbw %%mm7, %%mm5 \n\t" \
01055 "paddw %%mm2, %%mm6 \n\t" \
01056 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
01057 "paddw %6, %%mm0 \n\t"\
01058 "paddw %%mm6, %%mm0 \n\t" \
01059 "psraw $5, %%mm0 \n\t"\
01060 /* mm0 keeps these four result words; the last four are built in mm4 and packed into mm0 below */\
01061 \
01062 "paddw %%mm5, %%mm3 \n\t" \
01063 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
01064 "paddw %%mm4, %%mm6 \n\t" \
01065 "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
01066 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
01067 "paddw %%mm1, %%mm4 \n\t" \
01068 "paddw %%mm2, %%mm5 \n\t" \
01069 "paddw %%mm6, %%mm6 \n\t" \
01070 "psubw %%mm6, %%mm4 \n\t" \
01071 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
01072 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
01073 "psubw %%mm5, %%mm3 \n\t" \
01074 "paddw %6, %%mm4 \n\t"\
01075 "paddw %%mm3, %%mm4 \n\t" \
01076 "psraw $5, %%mm4 \n\t"\
01077 "packuswb %%mm4, %%mm0 \n\t"\
01078 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
01079 /* dst bytes 8-15 written; step src and dst by their strides and repeat for the remaining rows */\
01080 "add %3, %0 \n\t"\
01081 "add %4, %1 \n\t"\
01082 "decl %2 \n\t"\
01083 " jnz 1b \n\t"\
01084 : "+a"(src), "+c"(dst), "+D"(h)\
01085 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(temp), "m"(ROUNDER)\
01086 : "memory"\
01087 );\
01088 }\
01089 /* 16-wide variant for 3DNow! builds: filter taps computed in plain C into temp[], asm only rounds, shifts, packs and stores */\
01090 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01091 int i;\
01092 int16_t temp[16];\
01093 \
01094 for(i=0; i<h; i++)\
01095 {\
01096 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01097 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01098 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01099 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01100 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01101 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
01102 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
01103 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
01104 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
01105 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
01106 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
01107 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
01108 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
01109 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
01110 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
01111 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
01112 __asm__ volatile(\
01113 "movq (%0), %%mm0 \n\t"\
01114 "movq 8(%0), %%mm1 \n\t"\
01115 "paddw %2, %%mm0 \n\t"\
01116 "paddw %2, %%mm1 \n\t"\
01117 "psraw $5, %%mm0 \n\t"\
01118 "psraw $5, %%mm1 \n\t"\
01119 "packuswb %%mm1, %%mm0 \n\t"\
01120 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01121 "movq 16(%0), %%mm0 \n\t"\
01122 "movq 24(%0), %%mm1 \n\t"\
01123 "paddw %2, %%mm0 \n\t"\
01124 "paddw %2, %%mm1 \n\t"\
01125 "psraw $5, %%mm0 \n\t"\
01126 "psraw $5, %%mm1 \n\t"\
01127 "packuswb %%mm1, %%mm0 \n\t"\
01128 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
01129 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01130 : "memory"\
01131 );\
01132 dst+=dstStride;\
01133 src+=srcStride;\
01134 }\
01135 }\
01136 /* 8-wide MMX asm variant: one movq at (%0) plus one movd at 5(%0) per row, a single 8-byte store per row */\
01137 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01138 __asm__ volatile(\
01139 "pxor %%mm7, %%mm7 \n\t"\
01140 "1: \n\t"\
01141 "movq (%0), %%mm0 \n\t" \
01142 "movq %%mm0, %%mm1 \n\t" \
01143 "movq %%mm0, %%mm2 \n\t" \
01144 "punpcklbw %%mm7, %%mm0 \n\t" \
01145 "punpckhbw %%mm7, %%mm1 \n\t" \
01146 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
01147 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
01148 "movq %%mm2, %%mm3 \n\t" \
01149 "movq %%mm2, %%mm4 \n\t" \
01150 "psllq $8, %%mm2 \n\t" \
01151 "psllq $16, %%mm3 \n\t" \
01152 "psllq $24, %%mm4 \n\t" \
01153 "punpckhbw %%mm7, %%mm2 \n\t" \
01154 "punpckhbw %%mm7, %%mm3 \n\t" \
01155 "punpckhbw %%mm7, %%mm4 \n\t" \
01156 "paddw %%mm3, %%mm5 \n\t" \
01157 "paddw %%mm2, %%mm6 \n\t" \
01158 "paddw %%mm5, %%mm5 \n\t" \
01159 "psubw %%mm5, %%mm6 \n\t" \
01160 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
01161 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
01162 "paddw %%mm4, %%mm0 \n\t" \
01163 "paddw %%mm1, %%mm5 \n\t" \
01164 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
01165 "psubw %%mm5, %%mm0 \n\t" \
01166 "paddw %5, %%mm6 \n\t"\
01167 "paddw %%mm6, %%mm0 \n\t" \
01168 "psraw $5, %%mm0 \n\t"\
01169 /* mm0 keeps the first four result words; source bytes 5..8 are loaded next for the other four */\
01170 \
01171 "movd 5(%0), %%mm5 \n\t" \
01172 "punpcklbw %%mm7, %%mm5 \n\t" \
01173 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
01174 "paddw %%mm5, %%mm1 \n\t" \
01175 "paddw %%mm6, %%mm2 \n\t" \
01176 "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
01177 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
01178 "paddw %%mm6, %%mm3 \n\t" \
01179 "paddw %%mm5, %%mm4 \n\t" \
01180 "paddw %%mm2, %%mm2 \n\t" \
01181 "psubw %%mm2, %%mm3 \n\t" \
01182 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
01183 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
01184 "psubw %%mm4, %%mm3 \n\t" \
01185 "paddw %5, %%mm1 \n\t"\
01186 "paddw %%mm1, %%mm3 \n\t" \
01187 "psraw $5, %%mm3 \n\t"\
01188 "packuswb %%mm3, %%mm0 \n\t"\
01189 OP_MMX2(%%mm0, (%1), %%mm4, q)\
01190 /* step src and dst by their strides and repeat for the remaining rows */\
01191 "add %3, %0 \n\t"\
01192 "add %4, %1 \n\t"\
01193 "decl %2 \n\t"\
01194 " jnz 1b \n\t"\
01195 : "+a"(src), "+c"(dst), "+d"(h)\
01196 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ROUNDER)\
01197 : "memory"\
01198 );\
01199 }\
01200 /* 8-wide variant for 3DNow! builds: filter taps computed in plain C into temp[], asm only rounds, shifts, packs and stores */\
01201 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01202 int i;\
01203 int16_t temp[8];\
01204 \
01205 for(i=0; i<h; i++)\
01206 {\
01207 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01208 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01209 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01210 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01211 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01212 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
01213 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
01214 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
01215 __asm__ volatile(\
01216 "movq (%0), %%mm0 \n\t"\
01217 "movq 8(%0), %%mm1 \n\t"\
01218 "paddw %2, %%mm0 \n\t"\
01219 "paddw %2, %%mm1 \n\t"\
01220 "psraw $5, %%mm0 \n\t"\
01221 "psraw $5, %%mm1 \n\t"\
01222 "packuswb %%mm1, %%mm0 \n\t"\
01223 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01224 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01225 :"memory"\
01226 );\
01227 dst+=dstStride;\
01228 src+=srcStride;\
01229 }\
01230 }
01231
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)
 *
 * Expands to the vertical low-pass routines and the full set of
 * quarter-pel entry points for one (operation, instruction-set) pair.
 * MMX is pasted as a name suffix (the file instantiates it with "mmx2" and
 * "3dnow"); OP is the store macro handed to QPEL_V_LOW; ROUNDER is the
 * packed-word rounding constant (asm operand %5).
 *
 * Generated functions:
 *   OPNAME##mpeg4_qpel16_v_lowpass_##MMX(dst, src, dstStride, srcStride)
 *   OPNAME##mpeg4_qpel8_v_lowpass_##MMX (dst, src, dstStride, srcStride)
 *   OPNAME##qpel8_mcXY_##MMX  and  OPNAME##qpel16_mcXY_##MMX
 *       for X,Y in 0..3, each with signature (dst, src, stride).
 *
 * v_lowpass works in two passes:
 *   1. Read N+1 source rows (17 or 9), widen each byte to a 16-bit word and
 *      store the row as 4-pixel groups, so that temp ends up as N/4 strips
 *      of N+1 consecutive qwords (one strip per 4-pixel-wide column group).
 *   2. Loop once per strip; each iteration issues N QPEL_V_LOW expansions
 *      (QPEL_V_LOW is defined before this macro), writing rows at (%1) and
 *      (%1,%3) and stepping dst by 2*dstStride (%4) in between.  After a
 *      strip, temp advances by (N+1)*8 bytes and dst by %6, which undoes the
 *      row steps and moves 4 bytes to the right.
 *
 * mcXY wrappers (N is 8 or 16; intermediate steps always use the
 * "put ## RND" flavour, only the final step uses OPNAME):
 *   mc00  pixelsN copy/op of src
 *   mc10  h_lowpass -> scratch, then pixelsN_l2(dst, src,   scratch)
 *   mc20  h_lowpass straight into dst
 *   mc30  h_lowpass -> scratch, then pixelsN_l2(dst, src+1, scratch)
 *   mc01  v_lowpass -> scratch, then pixelsN_l2(dst, src,          scratch)
 *   mc02  v_lowpass straight into dst
 *   mc03  v_lowpass -> scratch, then pixelsN_l2(dst, src+stride,   scratch)
 *   mc11/mc31/mc13/mc33
 *         halfH  = h_lowpass of N+1 rows, then l2-combined with src (mc11,
 *                  mc13) or src+1 (mc31, mc33);
 *         halfHV = v_lowpass(halfH);
 *         dst    = l2(halfH, halfHV) for mc11/mc31, l2(halfH+N, halfHV),
 *                  i.e. halfH one row down, for mc13/mc33
 *   mc21/mc23
 *         as above without the src combination; mc23 uses halfH+N
 *   mc12/mc32
 *         halfH = h_lowpass of N+1 rows l2-combined with src / src+1,
 *         dst   = v_lowpass(halfH)
 *   mc22  dst = v_lowpass(h_lowpass of N+1 rows)
 * NOTE(review): pixelsN_l2 is defined elsewhere; it presumably averages its
 * two source blocks -- confirm against its definition.
 *
 * Scratch layout in the two-buffer cases: halfHV occupies the first N*N
 * bytes of "half" and halfH starts right after it (byte offset 64 for N=8,
 * 256 for N=16) with room for N+1 rows of N bytes.
 */
01232 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
01233 \
01234 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01235 uint64_t temp[17*4];\
01236 uint64_t *temp_ptr= temp;\
01237 int count= 17;\
01238 /* pass 1: widen 17 rows of 16 bytes to words; group k of each row goes to strip k (17 qwords apart) */\
01239 \
01240 __asm__ volatile(\
01241 "pxor %%mm7, %%mm7 \n\t"\
01242 "1: \n\t"\
01243 "movq (%0), %%mm0 \n\t"\
01244 "movq (%0), %%mm1 \n\t"\
01245 "movq 8(%0), %%mm2 \n\t"\
01246 "movq 8(%0), %%mm3 \n\t"\
01247 "punpcklbw %%mm7, %%mm0 \n\t"\
01248 "punpckhbw %%mm7, %%mm1 \n\t"\
01249 "punpcklbw %%mm7, %%mm2 \n\t"\
01250 "punpckhbw %%mm7, %%mm3 \n\t"\
01251 "movq %%mm0, (%1) \n\t"\
01252 "movq %%mm1, 17*8(%1) \n\t"\
01253 "movq %%mm2, 2*17*8(%1) \n\t"\
01254 "movq %%mm3, 3*17*8(%1) \n\t"\
01255 "add $8, %1 \n\t"\
01256 "add %3, %0 \n\t"\
01257 "decl %2 \n\t"\
01258 " jnz 1b \n\t"\
01259 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
01260 : "r" ((x86_reg)srcStride)\
01261 : "memory"\
01262 );\
01263 \
01264 temp_ptr= temp;\
01265 count=4;\
01266 /* pass 2: four iterations, one per 4-pixel-wide strip; 16 QPEL_V_LOW expansions per iteration */\
01267 \
01268 __asm__ volatile(\
01269 \
01270 "1: \n\t"\
01271 "movq (%0), %%mm0 \n\t"\
01272 "movq 8(%0), %%mm1 \n\t"\
01273 "movq 16(%0), %%mm2 \n\t"\
01274 "movq 24(%0), %%mm3 \n\t"\
01275 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
01276 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
01277 "add %4, %1 \n\t"\
01278 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
01279 \
01280 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
01281 "add %4, %1 \n\t"\
01282 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
01283 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
01284 "add %4, %1 \n\t"\
01285 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
01286 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
01287 "add %4, %1 \n\t"\
01288 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
01289 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
01290 "add %4, %1 \n\t"\
01291 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
01292 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
01293 "add %4, %1 \n\t"\
01294 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
01295 /* from here the last temp offset stops advancing (128, 120, 112): presumably bottom-edge mirroring, TODO confirm */\
01296 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
01297 "add %4, %1 \n\t" \
01298 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
01299 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
01300 /* next strip: temp += 17 qwords (136 bytes); %6 = 4-14*dstStride undoes the seven row steps and moves 4 bytes right */\
01301 "add $136, %0 \n\t"\
01302 "add %6, %1 \n\t"\
01303 "decl %2 \n\t"\
01304 " jnz 1b \n\t"\
01305 \
01306 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
01307 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
01308 :"memory"\
01309 );\
01310 }\
01311 /* 8-wide vertical low-pass: same two-pass scheme with 9 source rows and two strips */\
01312 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01313 uint64_t temp[9*2];\
01314 uint64_t *temp_ptr= temp;\
01315 int count= 9;\
01316 /* pass 1: widen 9 rows of 8 bytes to words; low half to strip 0, high half to strip 1 (9 qwords apart) */\
01317 \
01318 __asm__ volatile(\
01319 "pxor %%mm7, %%mm7 \n\t"\
01320 "1: \n\t"\
01321 "movq (%0), %%mm0 \n\t"\
01322 "movq (%0), %%mm1 \n\t"\
01323 "punpcklbw %%mm7, %%mm0 \n\t"\
01324 "punpckhbw %%mm7, %%mm1 \n\t"\
01325 "movq %%mm0, (%1) \n\t"\
01326 "movq %%mm1, 9*8(%1) \n\t"\
01327 "add $8, %1 \n\t"\
01328 "add %3, %0 \n\t"\
01329 "decl %2 \n\t"\
01330 " jnz 1b \n\t"\
01331 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
01332 : "r" ((x86_reg)srcStride)\
01333 : "memory"\
01334 );\
01335 \
01336 temp_ptr= temp;\
01337 count=2;\
01338 /* pass 2: two iterations, one per 4-pixel-wide strip; 8 QPEL_V_LOW expansions per iteration */\
01339 \
01340 __asm__ volatile(\
01341 \
01342 "1: \n\t"\
01343 "movq (%0), %%mm0 \n\t"\
01344 "movq 8(%0), %%mm1 \n\t"\
01345 "movq 16(%0), %%mm2 \n\t"\
01346 "movq 24(%0), %%mm3 \n\t"\
01347 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
01348 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
01349 "add %4, %1 \n\t"\
01350 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
01351 \
01352 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
01353 "add %4, %1 \n\t"\
01354 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
01355 /* from here the last temp offset stops advancing (64, 56, 48): presumably bottom-edge mirroring, TODO confirm */\
01356 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
01357 "add %4, %1 \n\t"\
01358 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
01359 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
01360 /* next strip: temp += 9 qwords (72 bytes); %6 = 4-6*dstStride undoes the three row steps and moves 4 bytes right */\
01361 "add $72, %0 \n\t"\
01362 "add %6, %1 \n\t"\
01363 "decl %2 \n\t"\
01364 " jnz 1b \n\t"\
01365 \
01366 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
01367 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
01368 : "memory"\
01369 );\
01370 }\
01371 /* ---- 8x8 entry points; mc00 forwards to the plain pixels8 routine ---- */\
01372 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
01373 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
01374 }\
01375 /* mc10: horizontal low-pass into an 8x8 scratch block, then pixels8_l2 of src and that block */\
01376 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01377 uint64_t temp[8];\
01378 uint8_t * const half= (uint8_t*)temp;\
01379 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
01380 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
01381 }\
01382 /* mc20: horizontal low-pass written straight to dst */\
01383 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01384 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
01385 }\
01386 /* mc30: like mc10, but the scratch block is combined with src+1 */\
01387 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01388 uint64_t temp[8];\
01389 uint8_t * const half= (uint8_t*)temp;\
01390 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
01391 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
01392 }\
01393 /* mc01: vertical low-pass into an 8x8 scratch block, then pixels8_l2 of src and that block */\
01394 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01395 uint64_t temp[8];\
01396 uint8_t * const half= (uint8_t*)temp;\
01397 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
01398 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
01399 }\
01400 /* mc02: vertical low-pass written straight to dst */\
01401 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01402 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
01403 }\
01404 /* mc03: like mc01, but combined with src+stride; the two-dimensional positions follow */\
01405 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01406 uint64_t temp[8];\
01407 uint8_t * const half= (uint8_t*)temp;\
01408 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
01409 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
01410 }\
01411 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01412 uint64_t half[8 + 9];\
01413 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01414 uint8_t * const halfHV= ((uint8_t*)half);\
01415 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01416 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
01417 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01418 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
01419 }\
01420 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01421 uint64_t half[8 + 9];\
01422 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01423 uint8_t * const halfHV= ((uint8_t*)half);\
01424 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01425 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
01426 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01427 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
01428 }\
01429 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01430 uint64_t half[8 + 9];\
01431 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01432 uint8_t * const halfHV= ((uint8_t*)half);\
01433 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01434 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
01435 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01436 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
01437 }\
01438 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01439 uint64_t half[8 + 9];\
01440 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01441 uint8_t * const halfHV= ((uint8_t*)half);\
01442 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01443 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
01444 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01445 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
01446 }\
01447 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01448 uint64_t half[8 + 9];\
01449 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01450 uint8_t * const halfHV= ((uint8_t*)half);\
01451 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01452 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01453 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
01454 }\
01455 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01456 uint64_t half[8 + 9];\
01457 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01458 uint8_t * const halfHV= ((uint8_t*)half);\
01459 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01460 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01461 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
01462 }\
01463 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01464 uint64_t half[8 + 9];\
01465 uint8_t * const halfH= ((uint8_t*)half);\
01466 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01467 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
01468 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
01469 }\
01470 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01471 uint64_t half[8 + 9];\
01472 uint8_t * const halfH= ((uint8_t*)half);\
01473 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01474 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
01475 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
01476 }\
01477 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01478 uint64_t half[9];\
01479 uint8_t * const halfH= ((uint8_t*)half);\
01480 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01481 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
01482 }\
01483 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
01484 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
01485 }\
01486 /* mc10 (16x16): horizontal low-pass into a 16x16 scratch block, then pixels16_l2 of src and that block */\
01487 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01488 uint64_t temp[32];\
01489 uint8_t * const half= (uint8_t*)temp;\
01490 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
01491 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
01492 }\
01493 /* mc20 (16x16): horizontal low-pass written straight to dst */\
01494 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01495 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
01496 }\
01497 /* mc30 (16x16): like mc10, but the scratch block is combined with src+1 */\
01498 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01499 uint64_t temp[32];\
01500 uint8_t * const half= (uint8_t*)temp;\
01501 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
01502 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
01503 }\
01504 /* mc01 (16x16): vertical low-pass into a 16x16 scratch block, then pixels16_l2 of src and that block */\
01505 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01506 uint64_t temp[32];\
01507 uint8_t * const half= (uint8_t*)temp;\
01508 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
01509 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
01510 }\
01511 /* mc02 (16x16): vertical low-pass written straight to dst */\
01512 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01513 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
01514 }\
01515 /* mc03 (16x16): like mc01, but combined with src+stride; the two-dimensional positions follow */\
01516 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01517 uint64_t temp[32];\
01518 uint8_t * const half= (uint8_t*)temp;\
01519 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
01520 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
01521 }\
01522 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01523 uint64_t half[16*2 + 17*2];\
01524 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01525 uint8_t * const halfHV= ((uint8_t*)half);\
01526 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01527 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
01528 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01529 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
01530 }\
01531 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01532 uint64_t half[16*2 + 17*2];\
01533 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01534 uint8_t * const halfHV= ((uint8_t*)half);\
01535 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01536 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
01537 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01538 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
01539 }\
01540 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01541 uint64_t half[16*2 + 17*2];\
01542 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01543 uint8_t * const halfHV= ((uint8_t*)half);\
01544 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01545 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
01546 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01547 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
01548 }\
01549 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01550 uint64_t half[16*2 + 17*2];\
01551 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01552 uint8_t * const halfHV= ((uint8_t*)half);\
01553 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01554 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
01555 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01556 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
01557 }\
01558 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01559 uint64_t half[16*2 + 17*2];\
01560 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01561 uint8_t * const halfHV= ((uint8_t*)half);\
01562 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01563 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01564 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
01565 }\
01566 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01567 uint64_t half[16*2 + 17*2];\
01568 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01569 uint8_t * const halfHV= ((uint8_t*)half);\
01570 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01571 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01572 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
01573 }\
01574 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01575 uint64_t half[17*2];\
01576 uint8_t * const halfH= ((uint8_t*)half);\
01577 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01578 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
01579 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
01580 }\
01581 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01582 uint64_t half[17*2];\
01583 uint8_t * const halfH= ((uint8_t*)half);\
01584 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01585 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
01586 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
01587 }\
01588 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01589 uint64_t half[17*2];\
01590 uint8_t * const halfH= ((uint8_t*)half);\
01591 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01592 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
01593 }
01594
/*
 * Store macros plugged into the OP / OP_MMX2 / OP_3DNOW parameters of the
 * qpel generator macros.  Each expands to an asm template fragment (string
 * literals); "size" is the mov suffix (the callers in this file pass q or d).
 *
 * PUT_OP: plain store of register a to memory operand b; temp is unused.
 */
01595 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/*
 * AVG_3DNOW_OP: load the current destination b into temp, average it into a
 * with the 3DNow! byte-average instruction pavgusb, then store a back to b.
 */
01596 #define AVG_3DNOW_OP(a,b,temp, size) \
01597 "mov" #size " " #b ", " #temp " \n\t"\
01598 "pavgusb " #temp ", " #a " \n\t"\
01599 "mov" #size " " #a ", " #b " \n\t"
/*
 * AVG_MMX2_OP: same load / average / store sequence as AVG_3DNOW_OP, using
 * the pavgb instruction instead of pavgusb.
 */
01600 #define AVG_MMX2_OP(a,b,temp, size) \
01601 "mov" #size " " #b ", " #temp " \n\t"\
01602 "pavgb " #temp ", " #a " \n\t"\
01603 "mov" #size " " #a ", " #b " \n\t"
01604
/*
 * Instantiate the quarter-pel routines.
 *
 * QPEL_BASE is expanded once per operation (put_, avg_, put_no_rnd_); each
 * expansion produces both the _mmx2 and the _3dnow horizontal filters.
 * QPEL_OP is expanded once per (operation, instruction set) pair.
 *
 * Rounding constant: ff_pw_16 (words of 16) for put_ and avg_, ff_pw_15
 * (words of 15) for put_no_rnd_.  The third argument selects which put
 * flavour the generated mcXY wrappers use for intermediate steps
 * ("put" ## RND: put_ or put_no_rnd_).
 */
01605 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
01606 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
01607 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
01608 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
01609 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
01610 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
01611 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
01612 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
01613 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
01614
01615
01616
01617
/*
 * QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)
 *
 * Defines OPNAME##2tap_qpel##SIZE##_mc##XY##_##MMX(dst, src, stride) as a
 * thin wrapper that forwards to OPNAME##pixels##SIZE##HPEL with SIZE rows.
 * HPEL is the full suffix of the target routine (for example _x2_mmx2).
 */
01618 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
01619 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01620 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
01621 }
/*
 * QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)
 *
 * Defines OPNAME##2tap_qpel##SIZE##_mc##XY##_##MMX(dst, src, stride) as a
 * wrapper around OPNAME##2tap_qpel##SIZE##_l3_##MMX, called with src+S0 and
 * the two extra arguments S1 and S2.  S0/S1/S2 are expanded inside the
 * generated function, so they may refer to its "stride" parameter.
 * NOTE(review): the _l3_ helper is defined elsewhere; S1 and S2 are
 * presumably offsets of two further source pointers relative to src+S0.
 */
01622 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
01623 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01624 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
01625 }
01626
/*
 * QPEL_2TAP(OPNAME, SIZE, MMX)
 *
 * Generates the sixteen OPNAME##2tap_qpel##SIZE##_mcXY_##MMX entry points:
 *   mc20, mc02   wrappers around the _x2_ / _y2_ routines of this MMX flavour
 *   mc22         wrapper around the _xy2_mmx routine (always the plain mmx
 *                one, independent of the MMX argument)
 *   mc00         function-pointer constant aliasing the regular qpel mc00
 *   mc21, mc12   function-pointer constants aliasing mc20 and mc02
 *   mc32         _y2_ routine applied at src+1
 *   mc23         _x2_ routine applied at src+stride
 *   mc10, mc30, mc01, mc03, mc11, mc31, mc13, mc33
 *                QPEL_2TAP_L3 wrappers with the (S0, S1, S2) triples listed
 *                below
 *
 * NOTE(review): the final QPEL_2TAP_L3 line ends in a backslash, so the line
 * after it is part of this macro; it must remain empty.
 */
01627 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
01628 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
01629 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
01630 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
01631 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
01632 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
01633 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
01634 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
01635 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
01636 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
01637 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01638 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
01639 }\
01640 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01641 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
01642 }\
01643 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
01644 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
01645 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
01646 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
01647 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
01648 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
01649 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
01650 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
01651
/*
 * Instantiate the 2-tap quarter-pel entry points for put_/avg_, block sizes
 * 16 and 8, in both the mmx2 and the 3dnow flavour (eight expansions).
 */
01652 QPEL_2TAP(put_, 16, mmx2)
01653 QPEL_2TAP(avg_, 16, mmx2)
01654 QPEL_2TAP(put_, 8, mmx2)
01655 QPEL_2TAP(avg_, 8, mmx2)
01656 QPEL_2TAP(put_, 16, 3dnow)
01657 QPEL_2TAP(avg_, 16, 3dnow)
01658 QPEL_2TAP(put_, 8, 3dnow)
01659 QPEL_2TAP(avg_, 8, 3dnow)
01660
01661
/* Compiled out by "#if 0": a no-op function that is never built. */
01662 #if 0
01663 static void just_return(void) { return; }
01664 #endif
01665
/**
 * Global motion compensation of an 8-pixel-wide, h-row block using MMX.
 *
 * Every output pixel is a bilinear blend of the four source pixels
 * src[0], src[1], src[stride] and src[stride+1] around its position, with
 * weights (s-dx)*(s-dy), dx*(s-dy), (s-dx)*dy and dx*dy, where s = 1<<shift
 * and dx/dy are the per-pixel sub-pel fractions.  The rounding value r is
 * added and the sum is shifted right by 2*shift before being packed to bytes
 * with unsigned saturation.
 *
 * The MMX path is only used when the integer part of the motion vector is
 * identical for the whole block and the four deltas have their low 4 bits
 * clear (they are used pre-shifted by 4).  Everything else is handed to
 * ff_gmc_c().
 *
 * When the block touches the picture border the source is first copied into
 * a local buffer by ff_emulated_edge_mc().  That buffer has a fixed size of
 * (GMC_MAX_H + 1) * GMC_MAX_STRIDE bytes; if the requested h or stride does
 * not fit (including a negative stride), the call is handed to ff_gmc_c()
 * as well instead of sizing a stack array from caller-supplied values.
 *
 * @param dst            destination block, 8 pixels wide
 * @param src            source picture origin
 * @param stride         line size in bytes, shared by dst and src
 * @param h              number of rows to produce
 * @param ox, oy         fixed-point source position of the first pixel
 * @param dxx, dxy       change of the source x position per output column / row
 * @param dyx, dyy       change of the source y position per output column / row
 * @param shift          sub-pel precision; the weights sum to (1<<shift)^2
 * @param r              rounding constant added before the final shift
 * @param width, height  source picture dimensions used for the border test
 */
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    /* Limits of the fixed-size edge emulation buffer. */
    enum { GMC_MAX_H = 8, GMC_MAX_STRIDE = 4096 };
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(GMC_MAX_H + 1) * GMC_MAX_STRIDE];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    /* Does the (w+1) x (h+1) source area reach outside the picture? */
    const int need_emu = (unsigned)ix >= width-w ||
                         (unsigned)iy >= height-h;

    if(
        /* integer part of the offset differs between the block corners */
        ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
         (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
        /* deltas need more precision than the >>4 copies keep */
        || (dxx|dxy|dyx|dyy)&15
        /* edge emulation would not fit into edge_buf */
        || (need_emu && (h < 0 || h > GMC_MAX_H ||
                         stride < 0 || stride > GMC_MAX_STRIDE)))
    {
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( need_emu )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    /* mm6 = four words of (1<<shift), mm7 = 0; both stay live for the loops below */
    __asm__ volatile(
        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    /* Two passes of four columns each; within a pass walk down the h rows. */
    for(x=0; x<w; x+=4){
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            /* Step the four x/y accumulators by one row and keep their top
             * 4 bits as the sub-pel weights dx (mm4) and dy (mm5). */
            __asm__ volatile(
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            /* Bilinear blend of the 2x2 neighbourhood for four pixels. */
            __asm__ volatile(
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t"
                "pmullw %%mm5, %%mm3 \n\t"
                "pmullw %%mm5, %%mm2 \n\t"
                "pmullw %%mm4, %%mm1 \n\t"

                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t"
                "pmullw %%mm4, %%mm2 \n\t"

                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t"
                "pmullw %%mm4, %%mm0 \n\t"
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        /* back to the first row, four columns to the right */
        src += 4-h*stride;
    }
}
01783
/*
 * PREFETCH(name, op) defines a static helper `name(mem, stride, h)` that
 * issues the instruction `op` on the first byte of h consecutive rows,
 * starting at mem and stepping stride bytes per row.
 *
 * The row is touched before h is tested, so callers must pass h >= 1.
 */
#define PREFETCH(name, op)                                  \
static void name(void *mem, int stride, int h)              \
{                                                           \
    const uint8_t *row = mem;                               \
    for (;;) {                                              \
        __asm__ volatile(#op" %0" :: "m"(*row));            \
        row += stride;                                      \
        if (!--h)                                           \
            break;                                          \
    }                                                       \
}
PREFETCH(prefetch_mmx2, prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
01795
01796 #include "h264_qpel_mmx.c"
01797
01798 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
01799 int stride, int h, int x, int y);
01800 void ff_put_vc1_chroma_mc8_mmx_nornd (uint8_t *dst, uint8_t *src,
01801 int stride, int h, int x, int y);
01802 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
01803 int stride, int h, int x, int y);
01804 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
01805 int stride, int h, int x, int y);
01806 void ff_avg_vc1_chroma_mc8_mmx2_nornd (uint8_t *dst, uint8_t *src,
01807 int stride, int h, int x, int y);
01808 void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
01809 int stride, int h, int x, int y);
01810 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
01811 int stride, int h, int x, int y);
01812 void ff_avg_vc1_chroma_mc8_3dnow_nornd(uint8_t *dst, uint8_t *src,
01813 int stride, int h, int x, int y);
01814 void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
01815 int stride, int h, int x, int y);
01816
01817 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
01818 int stride, int h, int x, int y);
01819 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
01820 int stride, int h, int x, int y);
01821 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
01822 int stride, int h, int x, int y);
01823 void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
01824 int stride, int h, int x, int y);
01825 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
01826 int stride, int h, int x, int y);
01827 void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
01828 int stride, int h, int x, int y);
01829
01830 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
01831 int stride, int h, int x, int y);
01832 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
01833 int stride, int h, int x, int y);
01834
01835 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
01836 int stride, int h, int x, int y);
01837 void ff_put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
01838 int stride, int h, int x, int y);
01839 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
01840 int stride, int h, int x, int y);
01841
01842 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
01843 int stride, int h, int x, int y);
01844 void ff_avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
01845 int stride, int h, int x, int y);
01846 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
01847 int stride, int h, int x, int y);
01848
01849
01850
/*
 * CAVS "mc00" entry points for 8x8 and 16x16 blocks.  Each one simply
 * forwards to the put_/avg_pixels{8,16}_mmx helper, passing the block size
 * as the row count.
 */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_mmx(dst, src, stride, rows);
}

void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    avg_pixels8_mmx(dst, src, stride, rows);
}

void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    put_pixels16_mmx(dst, src, stride, rows);
}

void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    avg_pixels16_mmx(dst, src, stride, rows);
}
01863
01864
/*
 * VC-1 "mc00" entry points for 8x8 blocks.  Both forward to an 8-pixel-wide
 * put/avg helper with 8 rows; the rnd argument is accepted for signature
 * compatibility and is not used.
 */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd)
{
    const int rows = 8;

    put_pixels8_mmx(dst, src, stride, rows);
}

void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd)
{
    const int rows = 8;

    avg_pixels8_mmx2(dst, src, stride, rows);
}
01871
01872
01873
#if CONFIG_GPL
/*
 * Wrappers around the libmpeg2 IDCT routines (only built when CONFIG_GPL is
 * set).  Each wrapper runs the in-place IDCT on block and then hands the
 * result to the clamped put ("_put") or clamped add ("_add") helper, which
 * receives dest and line_size.
 */

/* ff_mmx_idct followed by ff_put_pixels_clamped_mmx. */
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

/* ff_mmx_idct followed by ff_add_pixels_clamped_mmx. */
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

/* ff_mmxext_idct followed by ff_put_pixels_clamped_mmx. */
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

/* ff_mmxext_idct followed by ff_add_pixels_clamped_mmx. */
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
01896 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
01897 {
01898 ff_idct_xvid_mmx (block);
01899 ff_put_pixels_clamped_mmx(block, dest, line_size);
01900 }
01901 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
01902 {
01903 ff_idct_xvid_mmx (block);
01904 ff_add_pixels_clamped_mmx(block, dest, line_size);
01905 }
01906 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
01907 {
01908 ff_idct_xvid_mmx2 (block);
01909 ff_put_pixels_clamped_mmx(block, dest, line_size);
01910 }
01911 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
01912 {
01913 ff_idct_xvid_mmx2 (block);
01914 ff_add_pixels_clamped_mmx(block, dest, line_size);
01915 }
01916
/**
 * In-place inverse channel coupling on mag[] / ang[], two floats per
 * iteration, using 3DNow! instructions.
 *
 * Per element, with t = (mag >= 0) ? -ang : ang (ang's sign bit flipped):
 *   ang >= 0:  ang' = mag + t,  mag' = mag
 *   ang <  0:  ang' = mag,      mag' = mag - t
 *
 * The loop steps by 2 and touches mag[pos], mag[pos+1] (same for ang), so
 * blocksize is expected to be even.
 */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int pos = 0;

    /* mm7 = 0: reference value for the >= 0 comparisons below. */
    __asm__ volatile("pxor %%mm7, %%mm7":);

    while (pos < blocksize) {
        __asm__ volatile(
            /* mm0/mm2 = mag pair, mm1/mm3 = ang pair */
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            /* mm2 = mask(mag >= 0), mm3 = mask(ang >= 0) */
            "pfcmpge %%mm7, %%mm2 \n\t"
            "pfcmpge %%mm7, %%mm3 \n\t"
            /* keep only the sign-bit position of the mag mask and flip ang with it */
            "pslld $31, %%mm2 \n\t"
            "pxor %%mm2, %%mm1 \n\t"
            /* split t: mm3 = t where ang >= 0, mm4 = t where ang < 0 */
            "movq %%mm3, %%mm4 \n\t"
            "pand %%mm1, %%mm3 \n\t"
            "pandn %%mm1, %%mm4 \n\t"
            /* new ang = mag + mm3, new mag = mag - mm4 */
            "pfadd %%mm0, %%mm3 \n\t"
            "pfsub %%mm4, %%mm0 \n\t"
            "movq %%mm3, %1 \n\t"
            "movq %%mm0, %0 \n\t"
            :"+m"(mag[pos]), "+m"(ang[pos])
            ::"memory"
        );
        pos += 2;
    }

    /* Leave MMX/3DNow! state so that x87 code can run again. */
    __asm__ volatile("femms");
}
01944 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
01945 {
01946 int i;
01947
01948 __asm__ volatile(
01949 "movaps %0, %%xmm5 \n\t"
01950 ::"m"(ff_pdw_80000000[0])
01951 );
01952 for(i=0; i<blocksize; i+=4) {
01953 __asm__ volatile(
01954 "movaps %0, %%xmm0 \n\t"
01955 "movaps %1, %%xmm1 \n\t"
01956 "xorps %%xmm2, %%xmm2 \n\t"
01957 "xorps %%xmm3, %%xmm3 \n\t"
01958 "cmpleps %%xmm0, %%xmm2 \n\t"
01959 "cmpleps %%xmm1, %%xmm3 \n\t"
01960 "andps %%xmm5, %%xmm2 \n\t"
01961 "xorps %%xmm2, %%xmm1 \n\t"
01962 "movaps %%xmm3, %%xmm4 \n\t"
01963 "andps %%xmm1, %%xmm3 \n\t"
01964 "andnps %%xmm1, %%xmm4 \n\t"
01965 "addps %%xmm0, %%xmm3 \n\t"
01966 "subps %%xmm4, %%xmm0 \n\t"
01967 "movaps %%xmm3, %1 \n\t"
01968 "movaps %%xmm0, %0 \n\t"
01969 :"+m"(mag[i]), "+m"(ang[i])
01970 ::"memory"
01971 );
01972 }
01973 }
01974
/*
 * IF1 keeps its argument, IF0 discards it.  They are passed as the
 * mono/stereo parameters of the MIX macros below to switch individual asm
 * lines on or off at preprocessing time.
 */
01975 #define IF1(x) x
01976 #define IF0(x)
01977
/*
 * MIX5(mono, stereo): downmix of five input channels, four floats per
 * iteration.  Expands to one asm statement that uses the caller's locals
 * i, samples, len and matrix by name.
 *
 *  - operand %0 is i, a byte offset that is advanced by 16 until the
 *    "add" result is no longer negative;
 *  - operand %1 is samples[0]+len, so (%0,%1) addresses channel 0 and each
 *    further channel lies 0x400 bytes beyond the previous one;
 *  - operand %2 is matrix; the floats at byte offsets 0, 8 and 24 are
 *    broadcast into xmm5, xmm6 and xmm7.
 *
 * Channels 0 and 2 are scaled by xmm5, channel 1 by xmm6, channels 3 and 4
 * by xmm7.  With stereo enabled the result is
 *     ch0 = ch0 + ch1 + ch3,   ch1 = ch2 + ch1 + ch4
 * and with mono enabled
 *     ch0 = ch0 + ch1 + ch2 + ch3 + ch4.
 * All loads/stores are movaps, i.e. the sample rows must be 16-byte aligned.
 */
01978 #define MIX5(mono,stereo)\
01979 __asm__ volatile(\
01980 "movss 0(%2), %%xmm5 \n"\
01981 "movss 8(%2), %%xmm6 \n"\
01982 "movss 24(%2), %%xmm7 \n"\
01983 "shufps $0, %%xmm5, %%xmm5 \n"\
01984 "shufps $0, %%xmm6, %%xmm6 \n"\
01985 "shufps $0, %%xmm7, %%xmm7 \n"\
01986 "1: \n"\
01987 "movaps (%0,%1), %%xmm0 \n"\
01988 "movaps 0x400(%0,%1), %%xmm1 \n"\
01989 "movaps 0x800(%0,%1), %%xmm2 \n"\
01990 "movaps 0xc00(%0,%1), %%xmm3 \n"\
01991 "movaps 0x1000(%0,%1), %%xmm4 \n"\
01992 "mulps %%xmm5, %%xmm0 \n"\
01993 "mulps %%xmm6, %%xmm1 \n"\
01994 "mulps %%xmm5, %%xmm2 \n"\
01995 "mulps %%xmm7, %%xmm3 \n"\
01996 "mulps %%xmm7, %%xmm4 \n"\
01997 stereo("addps %%xmm1, %%xmm0 \n")\
01998 "addps %%xmm1, %%xmm2 \n"\
01999 "addps %%xmm3, %%xmm0 \n"\
02000 "addps %%xmm4, %%xmm2 \n"\
02001 mono("addps %%xmm2, %%xmm0 \n")\
02002 "movaps %%xmm0, (%0,%1) \n"\
02003 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
02004 "add $16, %0 \n"\
02005 "jl 1b \n"\
02006 :"+&r"(i)\
02007 :"r"(samples[0]+len), "r"(matrix)\
02008 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
02009 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
02010 "memory"\
02011 );
02012
/*
 * MIX_MISC(stereo): generic downmix to one (stereo = IF0) or two
 * (stereo = IF1) output channels.  Expands to one asm statement that uses
 * the caller's locals i, j, k, samples, len, matrix_simd and in_ch by name.
 *
 * Outer loop (label 1): i (%0) is a byte offset advanced by 16 until the
 * "add" result is no longer negative.  Channel 0's four samples are loaded
 * and multiplied by xmm4 (and a copy by xmm5 when stereo).
 * NOTE(review): xmm4/xmm5 are not loaded here; they are expected to still
 * hold values set up by an earlier asm statement in the caller.
 *
 * Inner loop (label 2): j (%1) starts 1024 bytes after channel 0 and moves
 * on by 1024 bytes per pass; k (%2) starts at operand %5,
 * -32*(in_ch-1), and is advanced by 32 until the "add" result is no
 * longer negative.  Each pass multiplies the samples
 * at j by the 16-byte coefficient vector at (%4,%2) (and at 16(%4,%2) when
 * stereo) and accumulates into xmm0 (xmm1).  The body runs at least once.
 *
 * Results are stored back to channel 0 (and to channel 1, 1024 bytes
 * further on, when stereo).
 */
02013 #define MIX_MISC(stereo)\
02014 __asm__ volatile(\
02015 "1: \n"\
02016 "movaps (%3,%0), %%xmm0 \n"\
02017 stereo("movaps %%xmm0, %%xmm1 \n")\
02018 "mulps %%xmm4, %%xmm0 \n"\
02019 stereo("mulps %%xmm5, %%xmm1 \n")\
02020 "lea 1024(%3,%0), %1 \n"\
02021 "mov %5, %2 \n"\
02022 "2: \n"\
02023 "movaps (%1), %%xmm2 \n"\
02024 stereo("movaps %%xmm2, %%xmm3 \n")\
02025 "mulps (%4,%2), %%xmm2 \n"\
02026 stereo("mulps 16(%4,%2), %%xmm3 \n")\
02027 "addps %%xmm2, %%xmm0 \n"\
02028 stereo("addps %%xmm3, %%xmm1 \n")\
02029 "add $1024, %1 \n"\
02030 "add $32, %2 \n"\
02031 "jl 2b \n"\
02032 "movaps %%xmm0, (%3,%0) \n"\
02033 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
02034 "add $16, %0 \n"\
02035 "jl 1b \n"\
02036 :"+&r"(i), "=&r"(j), "=&r"(k)\
02037 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
02038 :"memory"\
02039 );
02040
02041 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
02042 {
02043 int (*matrix_cmp)[2] = (int(*)[2])matrix;
02044 intptr_t i,j,k;
02045
02046 i = -len*sizeof(float);
02047 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
02048 MIX5(IF0,IF1);
02049 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
02050 MIX5(IF1,IF0);
02051 } else {
02052 DECLARE_ALIGNED(16, float, matrix_simd)[in_ch][2][4];
02053 j = 2*in_ch*sizeof(float);
02054 __asm__ volatile(
02055 "1: \n"
02056 "sub $8, %0 \n"
02057 "movss (%2,%0), %%xmm4 \n"
02058 "movss 4(%2,%0), %%xmm5 \n"
02059 "shufps $0, %%xmm4, %%xmm4 \n"
02060 "shufps $0, %%xmm5, %%xmm5 \n"
02061 "movaps %%xmm4, (%1,%0,4) \n"
02062 "movaps %%xmm5, 16(%1,%0,4) \n"
02063 "jg 1b \n"
02064 :"+&r"(j)
02065 :"r"(matrix_simd), "r"(matrix)
02066 :"memory"
02067 );
02068 if(out_ch == 2) {
02069 MIX_MISC(IF1);
02070 } else {
02071 MIX_MISC(IF0);
02072 }
02073 }
02074 }
02075
02076 static void vector_fmul_3dnow(float *dst, const float *src, int len){
02077 x86_reg i = (len-4)*4;
02078 __asm__ volatile(
02079 "1: \n\t"
02080 "movq (%1,%0), %%mm0 \n\t"
02081 "movq 8(%1,%0), %%mm1 \n\t"
02082 "pfmul (%2,%0), %%mm0 \n\t"
02083 "pfmul 8(%2,%0), %%mm1 \n\t"
02084 "movq %%mm0, (%1,%0) \n\t"
02085 "movq %%mm1, 8(%1,%0) \n\t"
02086 "sub $16, %0 \n\t"
02087 "jge 1b \n\t"
02088 "femms \n\t"
02089 :"+r"(i)
02090 :"r"(dst), "r"(src)
02091 :"memory"
02092 );
02093 }
02094 static void vector_fmul_sse(float *dst, const float *src, int len){
02095 x86_reg i = (len-8)*4;
02096 __asm__ volatile(
02097 "1: \n\t"
02098 "movaps (%1,%0), %%xmm0 \n\t"
02099 "movaps 16(%1,%0), %%xmm1 \n\t"
02100 "mulps (%2,%0), %%xmm0 \n\t"
02101 "mulps 16(%2,%0), %%xmm1 \n\t"
02102 "movaps %%xmm0, (%1,%0) \n\t"
02103 "movaps %%xmm1, 16(%1,%0) \n\t"
02104 "sub $32, %0 \n\t"
02105 "jge 1b \n\t"
02106 :"+r"(i)
02107 :"r"(dst), "r"(src)
02108 :"memory"
02109 );
02110 }
02111
02112 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
02113 x86_reg i = len*4-16;
02114 __asm__ volatile(
02115 "1: \n\t"
02116 "pswapd 8(%1), %%mm0 \n\t"
02117 "pswapd (%1), %%mm1 \n\t"
02118 "pfmul (%3,%0), %%mm0 \n\t"
02119 "pfmul 8(%3,%0), %%mm1 \n\t"
02120 "movq %%mm0, (%2,%0) \n\t"
02121 "movq %%mm1, 8(%2,%0) \n\t"
02122 "add $16, %1 \n\t"
02123 "sub $16, %0 \n\t"
02124 "jge 1b \n\t"
02125 :"+r"(i), "+r"(src1)
02126 :"r"(dst), "r"(src0)
02127 );
02128 __asm__ volatile("femms");
02129 }
02130 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
02131 x86_reg i = len*4-32;
02132 __asm__ volatile(
02133 "1: \n\t"
02134 "movaps 16(%1), %%xmm0 \n\t"
02135 "movaps (%1), %%xmm1 \n\t"
02136 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
02137 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
02138 "mulps (%3,%0), %%xmm0 \n\t"
02139 "mulps 16(%3,%0), %%xmm1 \n\t"
02140 "movaps %%xmm0, (%2,%0) \n\t"
02141 "movaps %%xmm1, 16(%2,%0) \n\t"
02142 "add $32, %1 \n\t"
02143 "sub $32, %0 \n\t"
02144 "jge 1b \n\t"
02145 :"+r"(i), "+r"(src1)
02146 :"r"(dst), "r"(src0)
02147 );
02148 }
02149
02150 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
02151 const float *src2, int len){
02152 x86_reg i = (len-4)*4;
02153 __asm__ volatile(
02154 "1: \n\t"
02155 "movq (%2,%0), %%mm0 \n\t"
02156 "movq 8(%2,%0), %%mm1 \n\t"
02157 "pfmul (%3,%0), %%mm0 \n\t"
02158 "pfmul 8(%3,%0), %%mm1 \n\t"
02159 "pfadd (%4,%0), %%mm0 \n\t"
02160 "pfadd 8(%4,%0), %%mm1 \n\t"
02161 "movq %%mm0, (%1,%0) \n\t"
02162 "movq %%mm1, 8(%1,%0) \n\t"
02163 "sub $16, %0 \n\t"
02164 "jge 1b \n\t"
02165 :"+r"(i)
02166 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
02167 :"memory"
02168 );
02169 __asm__ volatile("femms");
02170 }
02171 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
02172 const float *src2, int len){
02173 x86_reg i = (len-8)*4;
02174 __asm__ volatile(
02175 "1: \n\t"
02176 "movaps (%2,%0), %%xmm0 \n\t"
02177 "movaps 16(%2,%0), %%xmm1 \n\t"
02178 "mulps (%3,%0), %%xmm0 \n\t"
02179 "mulps 16(%3,%0), %%xmm1 \n\t"
02180 "addps (%4,%0), %%xmm0 \n\t"
02181 "addps 16(%4,%0), %%xmm1 \n\t"
02182 "movaps %%xmm0, (%1,%0) \n\t"
02183 "movaps %%xmm1, 16(%1,%0) \n\t"
02184 "sub $32, %0 \n\t"
02185 "jge 1b \n\t"
02186 :"+r"(i)
02187 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
02188 :"memory"
02189 );
02190 }
02191
/**
 * Overlap/window operation producing 2*len floats in dst, using extended
 * 3DNow!.
 *
 * With D = dst+len, S0 = src0+len, W = win+len, i running over [-len, 0)
 * and j = -1-i running over (len, 0] downwards, the asm path computes
 *     D[i] = S0[i]*W[j] - src1[j]*W[i]
 *     D[j] = S0[i]*W[i] + src1[j]*W[j]
 * two i/j pairs per iteration.
 *
 * The asm path is only used when HAVE_6REGS is set and add_bias == 0;
 * otherwise the call is forwarded unchanged to ff_vector_fmul_window_c().
 * The loop body runs before the counter is tested, so len is expected to
 * be a positive multiple of 2 on the asm path.
 */
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
                                      const float *win, float add_bias, int len)
{
#if HAVE_6REGS
    if(add_bias == 0){
        x86_reg i = -len*4;     /* byte offset counting up to 0        */
        x86_reg j = len*4-8;    /* byte offset counting down from end  */
        __asm__ volatile(
            "1: \n"
            "pswapd (%5,%1), %%mm1 \n"
            "movq (%5,%0), %%mm0 \n"
            "pswapd (%4,%1), %%mm5 \n"
            "movq (%3,%0), %%mm4 \n"
            "movq %%mm0, %%mm2 \n"
            "movq %%mm1, %%mm3 \n"
            "pfmul %%mm4, %%mm2 \n"
            "pfmul %%mm5, %%mm3 \n"
            "pfmul %%mm4, %%mm1 \n"
            "pfmul %%mm5, %%mm0 \n"
            "pfadd %%mm3, %%mm2 \n"
            "pfsub %%mm0, %%mm1 \n"
            "pswapd %%mm2, %%mm2 \n"
            "movq %%mm1, (%2,%0) \n"
            "movq %%mm2, (%2,%1) \n"
            "sub $8, %1 \n"
            "add $8, %0 \n"
            "jl 1b \n"
            "femms \n"
            :"+r"(i), "+r"(j)
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
            /* dst is written and the sources are read through plain
             * register operands, so memory must be declared clobbered. */
            :"memory"
        );
    }else
#endif
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
}
02226
/**
 * Overlap/window operation producing 2*len floats in dst, using SSE.
 *
 * With D = dst+len, S0 = src0+len, W = win+len, i running over [-len, 0)
 * and j = -1-i running over (len, 0] downwards, the asm path computes
 *     D[i] = S0[i]*W[j] - src1[j]*W[i]
 *     D[j] = S0[i]*W[i] + src1[j]*W[j]
 * four i/j pairs per iteration (shufps $0x1b reverses a register).
 *
 * The asm path is only used when HAVE_6REGS is set and add_bias == 0;
 * otherwise the call is forwarded unchanged to ff_vector_fmul_window_c().
 * The loop body runs before the counter is tested, so len is expected to
 * be a positive multiple of 4 on the asm path, and every access is an
 * aligned movaps, so all arrays must be 16-byte aligned.
 */
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
                                   const float *win, float add_bias, int len)
{
#if HAVE_6REGS
    if(add_bias == 0){
        x86_reg i = -len*4;     /* byte offset counting up to 0        */
        x86_reg j = len*4-16;   /* byte offset counting down from end  */
        __asm__ volatile(
            "1: \n"
            "movaps (%5,%1), %%xmm1 \n"
            "movaps (%5,%0), %%xmm0 \n"
            "movaps (%4,%1), %%xmm5 \n"
            "movaps (%3,%0), %%xmm4 \n"
            "shufps $0x1b, %%xmm1, %%xmm1 \n"
            "shufps $0x1b, %%xmm5, %%xmm5 \n"
            "movaps %%xmm0, %%xmm2 \n"
            "movaps %%xmm1, %%xmm3 \n"
            "mulps %%xmm4, %%xmm2 \n"
            "mulps %%xmm5, %%xmm3 \n"
            "mulps %%xmm4, %%xmm1 \n"
            "mulps %%xmm5, %%xmm0 \n"
            "addps %%xmm3, %%xmm2 \n"
            "subps %%xmm0, %%xmm1 \n"
            "shufps $0x1b, %%xmm2, %%xmm2 \n"
            "movaps %%xmm1, (%2,%0) \n"
            "movaps %%xmm2, (%2,%1) \n"
            "sub $16, %1 \n"
            "add $16, %0 \n"
            "jl 1b \n"
            :"+r"(i), "+r"(j)
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
            /* dst is written and the sources are read through plain
             * register operands, so memory must be declared clobbered. */
            :"memory"
        );
    }else
#endif
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
}
02262
02263 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
02264 {
02265 x86_reg i = -4*len;
02266 __asm__ volatile(
02267 "movss %3, %%xmm4 \n"
02268 "shufps $0, %%xmm4, %%xmm4 \n"
02269 "1: \n"
02270 "cvtpi2ps (%2,%0), %%xmm0 \n"
02271 "cvtpi2ps 8(%2,%0), %%xmm1 \n"
02272 "cvtpi2ps 16(%2,%0), %%xmm2 \n"
02273 "cvtpi2ps 24(%2,%0), %%xmm3 \n"
02274 "movlhps %%xmm1, %%xmm0 \n"
02275 "movlhps %%xmm3, %%xmm2 \n"
02276 "mulps %%xmm4, %%xmm0 \n"
02277 "mulps %%xmm4, %%xmm2 \n"
02278 "movaps %%xmm0, (%1,%0) \n"
02279 "movaps %%xmm2, 16(%1,%0) \n"
02280 "add $32, %0 \n"
02281 "jl 1b \n"
02282 :"+r"(i)
02283 :"r"(dst+len), "r"(src+len), "m"(mul)
02284 );
02285 }
02286
02287 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
02288 {
02289 x86_reg i = -4*len;
02290 __asm__ volatile(
02291 "movss %3, %%xmm4 \n"
02292 "shufps $0, %%xmm4, %%xmm4 \n"
02293 "1: \n"
02294 "cvtdq2ps (%2,%0), %%xmm0 \n"
02295 "cvtdq2ps 16(%2,%0), %%xmm1 \n"
02296 "mulps %%xmm4, %%xmm0 \n"
02297 "mulps %%xmm4, %%xmm1 \n"
02298 "movaps %%xmm0, (%1,%0) \n"
02299 "movaps %%xmm1, 16(%1,%0) \n"
02300 "add $32, %0 \n"
02301 "jl 1b \n"
02302 :"+r"(i)
02303 :"r"(dst+len), "r"(src+len), "m"(mul)
02304 );
02305 }
02306
02307 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
02308 int len)
02309 {
02310 x86_reg i = (len-16)*4;
02311 __asm__ volatile(
02312 "movss %3, %%xmm4 \n"
02313 "movss %4, %%xmm5 \n"
02314 "shufps $0, %%xmm4, %%xmm4 \n"
02315 "shufps $0, %%xmm5, %%xmm5 \n"
02316 "1: \n\t"
02317 "movaps (%2,%0), %%xmm0 \n\t"
02318 "movaps 16(%2,%0), %%xmm1 \n\t"
02319 "movaps 32(%2,%0), %%xmm2 \n\t"
02320 "movaps 48(%2,%0), %%xmm3 \n\t"
02321 "maxps %%xmm4, %%xmm0 \n\t"
02322 "maxps %%xmm4, %%xmm1 \n\t"
02323 "maxps %%xmm4, %%xmm2 \n\t"
02324 "maxps %%xmm4, %%xmm3 \n\t"
02325 "minps %%xmm5, %%xmm0 \n\t"
02326 "minps %%xmm5, %%xmm1 \n\t"
02327 "minps %%xmm5, %%xmm2 \n\t"
02328 "minps %%xmm5, %%xmm3 \n\t"
02329 "movaps %%xmm0, (%1,%0) \n\t"
02330 "movaps %%xmm1, 16(%1,%0) \n\t"
02331 "movaps %%xmm2, 32(%1,%0) \n\t"
02332 "movaps %%xmm3, 48(%1,%0) \n\t"
02333 "sub $64, %0 \n\t"
02334 "jge 1b \n\t"
02335 :"+&r"(i)
02336 :"r"(dst), "r"(src), "m"(min), "m"(max)
02337 :"memory"
02338 );
02339 }
02340
02341 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
02342 x86_reg reglen = len;
02343
02344 __asm__ volatile(
02345 "add %0 , %0 \n\t"
02346 "lea (%2,%0,2) , %2 \n\t"
02347 "add %0 , %1 \n\t"
02348 "neg %0 \n\t"
02349 "1: \n\t"
02350 "pf2id (%2,%0,2) , %%mm0 \n\t"
02351 "pf2id 8(%2,%0,2) , %%mm1 \n\t"
02352 "pf2id 16(%2,%0,2) , %%mm2 \n\t"
02353 "pf2id 24(%2,%0,2) , %%mm3 \n\t"
02354 "packssdw %%mm1 , %%mm0 \n\t"
02355 "packssdw %%mm3 , %%mm2 \n\t"
02356 "movq %%mm0 , (%1,%0) \n\t"
02357 "movq %%mm2 , 8(%1,%0) \n\t"
02358 "add $16 , %0 \n\t"
02359 " js 1b \n\t"
02360 "femms \n\t"
02361 :"+r"(reglen), "+r"(dst), "+r"(src)
02362 );
02363 }
02364 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
02365 x86_reg reglen = len;
02366 __asm__ volatile(
02367 "add %0 , %0 \n\t"
02368 "lea (%2,%0,2) , %2 \n\t"
02369 "add %0 , %1 \n\t"
02370 "neg %0 \n\t"
02371 "1: \n\t"
02372 "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
02373 "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
02374 "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
02375 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
02376 "packssdw %%mm1 , %%mm0 \n\t"
02377 "packssdw %%mm3 , %%mm2 \n\t"
02378 "movq %%mm0 , (%1,%0) \n\t"
02379 "movq %%mm2 , 8(%1,%0) \n\t"
02380 "add $16 , %0 \n\t"
02381 " js 1b \n\t"
02382 "emms \n\t"
02383 :"+r"(reglen), "+r"(dst), "+r"(src)
02384 );
02385 }
02386
02387 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
02388 x86_reg reglen = len;
02389 __asm__ volatile(
02390 "add %0 , %0 \n\t"
02391 "lea (%2,%0,2) , %2 \n\t"
02392 "add %0 , %1 \n\t"
02393 "neg %0 \n\t"
02394 "1: \n\t"
02395 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
02396 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
02397 "packssdw %%xmm1 , %%xmm0 \n\t"
02398 "movdqa %%xmm0 , (%1,%0) \n\t"
02399 "add $16 , %0 \n\t"
02400 " js 1b \n\t"
02401 :"+r"(reglen), "+r"(dst), "+r"(src)
02402 );
02403 }
02404
02405 void ff_vp3_idct_mmx(int16_t *input_data);
02406 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
02407 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
02408
02409 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
02410
02411 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
02412 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
02413
02414 void ff_vp3_idct_sse2(int16_t *input_data);
02415 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
02416 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
02417
02418 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
02419 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
02420 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
02421 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
02422 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
02423 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
02424 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
02425 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
02426 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
02427 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
02428 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
02429
02430 #if !HAVE_YASM
02431 #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
02432 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
02433 #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
02434 #endif
02435 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
02436
/*
 * FLOAT_TO_INT16_INTERLEAVE(cpu, body) defines two functions:
 *
 *  float_to_int16_interleave_misc_<cpu>(dst, src, len, channels)
 *      Generic path: converts each channel with float_to_int16_<cpu>() into
 *      a 16-byte-aligned temporary of len samples and then scatters it into
 *      dst with a stride of `channels`.
 *
 *  float_to_int16_interleave_<cpu>(dst, src, len, channels)
 *      Dispatcher:
 *        channels == 1  -> float_to_int16_<cpu>()
 *        channels == 2  -> inline asm: the prologue turns len into a byte
 *                          count (len*4), advances dst, src[0] and src[1]
 *                          by it and negates it; `body` is the conversion
 *                          loop and must count operand %0 up to zero
 *        channels == 6  -> ff_float_to_int16_interleave6_<cpu>()
 *        otherwise      -> the generic path above
 *
 * The two-channel asm stores through dst, which is only passed as a register
 * operand, so it declares a "memory" clobber.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
            :\
            :"memory"\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
02470
/*
 * 3DNow! instance.  Two-channel loop body: per iteration convert four
 * floats from each channel (%2, %3) with pf2id, narrow with packssdw, then
 * interleave the two channels word by word (punpcklwd/punpckhwd) and store
 * 16 bytes at (%1,%0).  Ends with femms.
 */
02471 FLOAT_TO_INT16_INTERLEAVE(3dnow,
02472 "1: \n"
02473 "pf2id (%2,%0), %%mm0 \n"
02474 "pf2id 8(%2,%0), %%mm1 \n"
02475 "pf2id (%3,%0), %%mm2 \n"
02476 "pf2id 8(%3,%0), %%mm3 \n"
02477 "packssdw %%mm1, %%mm0 \n"
02478 "packssdw %%mm3, %%mm2 \n"
02479 "movq %%mm0, %%mm1 \n"
02480 "punpcklwd %%mm2, %%mm0 \n"
02481 "punpckhwd %%mm2, %%mm1 \n"
02482 "movq %%mm0, (%1,%0)\n"
02483 "movq %%mm1, 8(%1,%0)\n"
02484 "add $16, %0 \n"
02485 "js 1b \n"
02486 "femms \n"
02487 )
02488
/*
 * SSE instance.  Same structure as the 3DNow! body, but the conversion is
 * done with cvtps2pi into MMX registers and the loop ends with emms.
 */
02489 FLOAT_TO_INT16_INTERLEAVE(sse,
02490 "1: \n"
02491 "cvtps2pi (%2,%0), %%mm0 \n"
02492 "cvtps2pi 8(%2,%0), %%mm1 \n"
02493 "cvtps2pi (%3,%0), %%mm2 \n"
02494 "cvtps2pi 8(%3,%0), %%mm3 \n"
02495 "packssdw %%mm1, %%mm0 \n"
02496 "packssdw %%mm3, %%mm2 \n"
02497 "movq %%mm0, %%mm1 \n"
02498 "punpcklwd %%mm2, %%mm0 \n"
02499 "punpckhwd %%mm2, %%mm1 \n"
02500 "movq %%mm0, (%1,%0)\n"
02501 "movq %%mm1, 8(%1,%0)\n"
02502 "add $16, %0 \n"
02503 "js 1b \n"
02504 "emms \n"
02505 )
02506
/*
 * SSE2 instance.  Per iteration: convert four floats from each channel with
 * cvtps2dq, pack both into one register (channel 0 in the low half, channel
 * 1 in the high half), move the high half down with movhlps and interleave
 * with punpcklwd, then store 16 bytes with movdqa (aligned store).
 */
02507 FLOAT_TO_INT16_INTERLEAVE(sse2,
02508 "1: \n"
02509 "cvtps2dq (%2,%0), %%xmm0 \n"
02510 "cvtps2dq (%3,%0), %%xmm1 \n"
02511 "packssdw %%xmm1, %%xmm0 \n"
02512 "movhlps %%xmm0, %%xmm1 \n"
02513 "punpcklwd %%xmm1, %%xmm0 \n"
02514 "movdqa %%xmm0, (%1,%0) \n"
02515 "add $16, %0 \n"
02516 "js 1b \n"
02517 )
02518
/**
 * Interleaving float -> int16 conversion for the "3dn2" CPU variant.
 *
 * Only the six-channel case has a dedicated routine
 * (ff_float_to_int16_interleave6_3dn2); every other channel count is
 * delegated to float_to_int16_interleave_3dnow().
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels)
{
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }

    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
02525
02526 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
02527
02528 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
02529 {
02530 int mm_flags = av_get_cpu_flags();
02531
02532 if (avctx->dsp_mask) {
02533 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
02534 mm_flags |= (avctx->dsp_mask & 0xffff);
02535 else
02536 mm_flags &= ~(avctx->dsp_mask & 0xffff);
02537 }
02538
02539 #if 0
02540 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
02541 if (mm_flags & AV_CPU_FLAG_MMX)
02542 av_log(avctx, AV_LOG_INFO, " mmx");
02543 if (mm_flags & AV_CPU_FLAG_MMX2)
02544 av_log(avctx, AV_LOG_INFO, " mmx2");
02545 if (mm_flags & AV_CPU_FLAG_3DNOW)
02546 av_log(avctx, AV_LOG_INFO, " 3dnow");
02547 if (mm_flags & AV_CPU_FLAG_SSE)
02548 av_log(avctx, AV_LOG_INFO, " sse");
02549 if (mm_flags & AV_CPU_FLAG_SSE2)
02550 av_log(avctx, AV_LOG_INFO, " sse2");
02551 av_log(avctx, AV_LOG_INFO, "\n");
02552 #endif
02553
02554 if (mm_flags & AV_CPU_FLAG_MMX) {
02555 const int idct_algo= avctx->idct_algo;
02556
02557 if(avctx->lowres==0){
02558 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
02559 c->idct_put= ff_simple_idct_put_mmx;
02560 c->idct_add= ff_simple_idct_add_mmx;
02561 c->idct = ff_simple_idct_mmx;
02562 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
02563 #if CONFIG_GPL
02564 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
02565 if(mm_flags & AV_CPU_FLAG_MMX2){
02566 c->idct_put= ff_libmpeg2mmx2_idct_put;
02567 c->idct_add= ff_libmpeg2mmx2_idct_add;
02568 c->idct = ff_mmxext_idct;
02569 }else{
02570 c->idct_put= ff_libmpeg2mmx_idct_put;
02571 c->idct_add= ff_libmpeg2mmx_idct_add;
02572 c->idct = ff_mmx_idct;
02573 }
02574 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
02575 #endif
02576 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
02577 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
02578 if(mm_flags & AV_CPU_FLAG_SSE2){
02579 c->idct_put= ff_vp3_idct_put_sse2;
02580 c->idct_add= ff_vp3_idct_add_sse2;
02581 c->idct = ff_vp3_idct_sse2;
02582 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
02583 }else{
02584 c->idct_put= ff_vp3_idct_put_mmx;
02585 c->idct_add= ff_vp3_idct_add_mmx;
02586 c->idct = ff_vp3_idct_mmx;
02587 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
02588 }
02589 }else if(idct_algo==FF_IDCT_CAVS){
02590 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
02591 }else if(idct_algo==FF_IDCT_XVIDMMX){
02592 if(mm_flags & AV_CPU_FLAG_SSE2){
02593 c->idct_put= ff_idct_xvid_sse2_put;
02594 c->idct_add= ff_idct_xvid_sse2_add;
02595 c->idct = ff_idct_xvid_sse2;
02596 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
02597 }else if(mm_flags & AV_CPU_FLAG_MMX2){
02598 c->idct_put= ff_idct_xvid_mmx2_put;
02599 c->idct_add= ff_idct_xvid_mmx2_add;
02600 c->idct = ff_idct_xvid_mmx2;
02601 }else{
02602 c->idct_put= ff_idct_xvid_mmx_put;
02603 c->idct_add= ff_idct_xvid_mmx_add;
02604 c->idct = ff_idct_xvid_mmx;
02605 }
02606 }
02607 }
02608
02609 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
02610 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
02611 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
02612 c->clear_block = clear_block_mmx;
02613 c->clear_blocks = clear_blocks_mmx;
02614 if ((mm_flags & AV_CPU_FLAG_SSE) &&
02615 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
02616
02617 c->clear_block = clear_block_sse;
02618 c->clear_blocks = clear_blocks_sse;
02619 }
02620
02621 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
02622 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
02623 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
02624 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
02625 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
02626
02627 SET_HPEL_FUNCS(put, 0, 16, mmx);
02628 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
02629 SET_HPEL_FUNCS(avg, 0, 16, mmx);
02630 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
02631 SET_HPEL_FUNCS(put, 1, 8, mmx);
02632 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
02633 SET_HPEL_FUNCS(avg, 1, 8, mmx);
02634 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
02635
02636 c->gmc= gmc_mmx;
02637
02638 c->add_bytes= add_bytes_mmx;
02639 c->add_bytes_l2= add_bytes_l2_mmx;
02640
02641 c->draw_edges = draw_edges_mmx;
02642
02643 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
02644 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
02645 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
02646 }
02647
02648 #if HAVE_YASM
02649 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
02650 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
02651 c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
02652
02653 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
02654 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
02655 #endif
02656
02657 if (mm_flags & AV_CPU_FLAG_MMX2) {
02658 c->prefetch = prefetch_mmx2;
02659
02660 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
02661 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
02662
02663 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
02664 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
02665 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
02666
02667 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
02668 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
02669
02670 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
02671 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
02672 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
02673
02674 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02675 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
02676 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
02677 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
02678 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
02679 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
02680 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
02681
02682 if (CONFIG_VP3_DECODER && HAVE_YASM) {
02683 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
02684 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
02685 }
02686 }
02687 if (CONFIG_VP3_DECODER && HAVE_YASM) {
02688 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
02689 }
02690
02691 if (CONFIG_VP3_DECODER
02692 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
02693 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
02694 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
02695 }
02696
/*
 * SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU)
 *
 * Fills all 16 entries of c->PFX_pixels_tab[IDX][] with the functions named
 * PFX<SIZE>_mcXY_<CPU>, built by token pasting. The table slot for mcXY is
 * X + 4*Y (slot 1 is mc10, slot 4 is mc01, slot 15 is mc33).
 *
 * PFX  - table/function name prefix, e.g. put_qpel or avg_h264_qpel
 * IDX  - first index into the table
 * SIZE - number pasted into the function name directly after PFX
 * CPU  - suffix pasted at the end of the function name, e.g. mmx2 or 3dnow
 *
 * A variable named c must be in scope at the point of use. The expansion is
 * 16 separate assignment statements; the last one has no trailing semicolon,
 * so the caller supplies it. The statements are not wrapped in
 * do { } while (0), so the macro must only be used inside a braced block,
 * never as the sole body of an unbraced if/else.
 */
02697 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
02698 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
02699 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
02700 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
02701 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
02702 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
02703 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
02704 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
02705 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
02706 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
02707 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
02708 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
02709 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
02710 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
02711 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
02712 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
02713 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
02714
02715 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
02716 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
02717 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
02718 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
02719 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
02720 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
02721
02722 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
02723 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
02724 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
02725 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
02726 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
02727 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
02728
02729 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
02730 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
02731 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
02732 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
02733
02734 #if HAVE_YASM
02735 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
02736 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
02737
02738 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd;
02739
02740 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
02741 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
02742 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
02743 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
02744
02745 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
02746 #endif
02747 #if HAVE_7REGS && HAVE_TEN_OPERANDS
02748 if( mm_flags&AV_CPU_FLAG_3DNOW )
02749 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
02750 #endif
02751
02752 if (CONFIG_VC1_DECODER)
02753 ff_vc1dsp_init_mmx(c, avctx);
02754
02755 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
02756 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
02757 c->prefetch = prefetch_3dnow;
02758
02759 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
02760 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
02761
02762 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
02763 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
02764 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
02765
02766 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
02767 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
02768
02769 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
02770 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
02771 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
02772
02773 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02774 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
02775 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
02776 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
02777 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
02778 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
02779 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
02780 }
02781
02782 if (CONFIG_VP3_DECODER
02783 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
02784 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
02785 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
02786 }
02787
02788 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
02789 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
02790 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
02791 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
02792 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
02793 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
02794
02795 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
02796 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
02797 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
02798 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
02799 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
02800 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
02801
02802 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
02803 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
02804 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
02805 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
02806
02807 #if HAVE_YASM
02808 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
02809 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
02810
02811 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd;
02812
02813 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
02814 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
02815 #endif
02816 }
02817
02818
/*
 * H264_QPEL_FUNCS(x, y, CPU)
 *
 * Overrides a single slot, x + y*4, in rows 0 and 1 of both
 * c->put_h264_qpel_pixels_tab and c->avg_h264_qpel_pixels_tab.
 * Row 0 receives the ..._qpel16_mcXY_<CPU> function and row 1 the
 * ..._qpel8_mcXY_<CPU> function; row 2 is not touched by this macro.
 *
 * x and y are pasted into the function name as well as used in the index
 * expression, so they must be plain single-digit literals. They are not
 * parenthesised in the index, so passing an expression would also be wrong.
 *
 * A variable named c must be in scope. The expansion is four statements and
 * already ends with a semicolon, so the ';' written at each call site becomes
 * an empty statement. The statements are not wrapped in do { } while (0), so
 * the macro must only be used inside a braced block.
 */
02819 #define H264_QPEL_FUNCS(x, y, CPU)\
02820 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
02821 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
02822 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
02823 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
02824 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
02825
02826 c->put_pixels_tab[0][0] = put_pixels16_sse2;
02827 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
02828 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
02829 H264_QPEL_FUNCS(0, 0, sse2);
02830 }
02831 if(mm_flags & AV_CPU_FLAG_SSE2){
02832 H264_QPEL_FUNCS(0, 1, sse2);
02833 H264_QPEL_FUNCS(0, 2, sse2);
02834 H264_QPEL_FUNCS(0, 3, sse2);
02835 H264_QPEL_FUNCS(1, 1, sse2);
02836 H264_QPEL_FUNCS(1, 2, sse2);
02837 H264_QPEL_FUNCS(1, 3, sse2);
02838 H264_QPEL_FUNCS(2, 1, sse2);
02839 H264_QPEL_FUNCS(2, 2, sse2);
02840 H264_QPEL_FUNCS(2, 3, sse2);
02841 H264_QPEL_FUNCS(3, 1, sse2);
02842 H264_QPEL_FUNCS(3, 2, sse2);
02843 H264_QPEL_FUNCS(3, 3, sse2);
02844 }
02845 #if HAVE_SSSE3
02846 if(mm_flags & AV_CPU_FLAG_SSSE3){
02847 H264_QPEL_FUNCS(1, 0, ssse3);
02848 H264_QPEL_FUNCS(1, 1, ssse3);
02849 H264_QPEL_FUNCS(1, 2, ssse3);
02850 H264_QPEL_FUNCS(1, 3, ssse3);
02851 H264_QPEL_FUNCS(2, 0, ssse3);
02852 H264_QPEL_FUNCS(2, 1, ssse3);
02853 H264_QPEL_FUNCS(2, 2, ssse3);
02854 H264_QPEL_FUNCS(2, 3, ssse3);
02855 H264_QPEL_FUNCS(3, 0, ssse3);
02856 H264_QPEL_FUNCS(3, 1, ssse3);
02857 H264_QPEL_FUNCS(3, 2, ssse3);
02858 H264_QPEL_FUNCS(3, 3, ssse3);
02859 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
02860 #if HAVE_YASM
02861 c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd;
02862 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd;
02863 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
02864 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
02865 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
02866 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
02867 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
02868 if (mm_flags & AV_CPU_FLAG_SSE4)
02869 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
02870 #endif
02871 }
02872 #endif
02873
02874 if(mm_flags & AV_CPU_FLAG_3DNOW){
02875 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
02876 c->vector_fmul = vector_fmul_3dnow;
02877 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02878 c->float_to_int16 = float_to_int16_3dnow;
02879 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
02880 }
02881 }
02882 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
02883 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
02884 c->vector_fmul_window = vector_fmul_window_3dnow2;
02885 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02886 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
02887 }
02888 }
02889 if(mm_flags & AV_CPU_FLAG_MMX2){
02890 #if HAVE_YASM
02891 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
02892 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
02893 #endif
02894 }
02895 if(mm_flags & AV_CPU_FLAG_SSE){
02896 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
02897 c->ac3_downmix = ac3_downmix_sse;
02898 c->vector_fmul = vector_fmul_sse;
02899 c->vector_fmul_reverse = vector_fmul_reverse_sse;
02900 c->vector_fmul_add = vector_fmul_add_sse;
02901 c->vector_fmul_window = vector_fmul_window_sse;
02902 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
02903 c->vector_clipf = vector_clipf_sse;
02904 c->float_to_int16 = float_to_int16_sse;
02905 c->float_to_int16_interleave = float_to_int16_interleave_sse;
02906 #if HAVE_YASM
02907 c->scalarproduct_float = ff_scalarproduct_float_sse;
02908 #endif
02909 }
02910 if(mm_flags & AV_CPU_FLAG_3DNOW)
02911 c->vector_fmul_add = vector_fmul_add_3dnow;
02912 if(mm_flags & AV_CPU_FLAG_SSE2){
02913 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
02914 c->float_to_int16 = float_to_int16_sse2;
02915 c->float_to_int16_interleave = float_to_int16_interleave_sse2;
02916 #if HAVE_YASM
02917 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
02918 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
02919 #endif
02920 }
02921 if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM)
02922 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
02923 }
02924
02925 if (CONFIG_ENCODERS)
02926 dsputilenc_init_mmx(c, avctx);
02927
02928 #if 0
02929
02930 get_pixels = just_return;
02931 put_pixels_clamped = just_return;
02932 add_pixels_clamped = just_return;
02933
02934 pix_abs16x16 = just_return;
02935 pix_abs16x16_x2 = just_return;
02936 pix_abs16x16_y2 = just_return;
02937 pix_abs16x16_xy2 = just_return;
02938
02939 put_pixels_tab[0] = just_return;
02940 put_pixels_tab[1] = just_return;
02941 put_pixels_tab[2] = just_return;
02942 put_pixels_tab[3] = just_return;
02943
02944 put_no_rnd_pixels_tab[0] = just_return;
02945 put_no_rnd_pixels_tab[1] = just_return;
02946 put_no_rnd_pixels_tab[2] = just_return;
02947 put_no_rnd_pixels_tab[3] = just_return;
02948
02949 avg_pixels_tab[0] = just_return;
02950 avg_pixels_tab[1] = just_return;
02951 avg_pixels_tab[2] = just_return;
02952 avg_pixels_tab[3] = just_return;
02953
02954 avg_no_rnd_pixels_tab[0] = just_return;
02955 avg_no_rnd_pixels_tab[1] = just_return;
02956 avg_no_rnd_pixels_tab[2] = just_return;
02957 avg_no_rnd_pixels_tab[3] = just_return;
02958
02959
02960
02961 #endif
02962 }