00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
/*
 * ASSERT_ALIGNED(ptr): debug-only check that ptr lies on a 16-byte boundary,
 * i.e. that its four low address bits are all zero.  The functions in this
 * file invoke it on dst right before vec_ld(0, dst) / vec_st(fsum, 0, dst).
 *
 * Only active when DEBUG_ALIGNMENT is defined; otherwise it expands to an
 * empty statement.  Both expansions keep their trailing ';' so existing call
 * sites expand exactly as before.
 */
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
00027
00028
00029
/*
 * Emits one output row of the four-weight chroma filter.
 *
 * The expansion relies on names supplied by the enclosing function:
 *   vsrc0ssH / vsrc1ssH  current source row, already held as 16-bit lanes
 *   vsrc2uc  / vsrc3uc   next source row, still as bytes
 *   vA, vB, vC, vD       per-tap weights
 *   v6us                 shift count applied to the weighted sum
 *   fperm                permute mask merging the result into the dst vector
 *   dst, src, stride     row cursors, both advanced by stride at the end
 *
 * BIAS1 is the addend that seeds the multiply-add chain; BIAS2 is a
 * one-argument macro applied to the finished sum before the shift.
 *
 * The expansion ends with its own ';' and is invoked without one.
 */
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2)                               \
    /* interleave the next row with zero bytes -> 16-bit lanes */           \
    vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);              \
    vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);              \
                                                                            \
    /* BIAS1 + vA*row0[0] + vB*row0[1] + vC*row1[0] + vD*row1[1] */         \
    psum = vec_mladd(vA, vsrc0ssH, BIAS1);                                  \
    psum = vec_mladd(vB, vsrc1ssH, psum);                                   \
    psum = vec_mladd(vC, vsrc2ssH, psum);                                   \
    psum = vec_mladd(vD, vsrc3ssH, psum);                                   \
    psum = BIAS2(psum);                                                     \
    psum = vec_sr(psum, v6us);                                              \
                                                                            \
    /* pack to bytes and splice the 8 results into the loaded dst vector */ \
    vdst  = vec_ld(0, dst);                                                 \
    ppsum = (vec_u8)vec_pack(psum, psum);                                   \
    vfdst = vec_perm(vdst, ppsum, fperm);                                   \
                                                                            \
    OP_U8_ALTIVEC(fsum, vfdst, vdst);                                       \
                                                                            \
    vec_st(fsum, 0, dst);                                                   \
                                                                            \
    /* the row just consumed as "next" becomes "current" */                 \
    vsrc0ssH = vsrc2ssH;                                                    \
    vsrc1ssH = vsrc3ssH;                                                    \
                                                                            \
    dst += stride;                                                          \
    src += stride;
00054
/*
 * Two-weight variant of the chroma row kernel: computes
 * (v32ss + vA * vsrc0 + vE * vsrc1) >> v6us for one row.
 *
 * The enclosing function supplies vsrc0uc / vsrc1uc (the two byte vectors to
 * blend), the weights vA and vE, the constants v32ss and v6us, the merge mask
 * fperm and the dst / src / stride cursors.  Unlike the four-weight kernel it
 * does not carry any row over to the next invocation; the caller does that.
 *
 * The expansion ends with its own ';' and is invoked without one.
 */
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE                                      \
                                                                            \
    /* interleave both inputs with zero bytes -> 16-bit lanes */            \
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);              \
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);              \
                                                                            \
    psum = vec_mladd(vA, vsrc0ssH, v32ss);                                  \
    psum = vec_mladd(vE, vsrc1ssH, psum);                                   \
    psum = vec_sr(psum, v6us);                                              \
                                                                            \
    /* pack to bytes and splice the 8 results into the loaded dst vector */ \
    vdst  = vec_ld(0, dst);                                                 \
    ppsum = (vec_u8)vec_pack(psum, psum);                                   \
    vfdst = vec_perm(vdst, ppsum, fperm);                                   \
                                                                            \
    OP_U8_ALTIVEC(fsum, vfdst, vdst);                                       \
                                                                            \
    vec_st(fsum, 0, dst);                                                   \
                                                                            \
    dst += stride;                                                          \
    src += stride;
00074
/* BIAS2 arguments for CHROMA_MC8_ALTIVEC_CORE. */
#define noop(v)  (v)                  /* leave the sum untouched          */
#define add28(v) vec_add(v28ss, (v))  /* add the caller's v28ss constant  */
00077
00078 static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
00079 int stride, int h, int x, int y) {
00080 DECLARE_ALIGNED(16, signed int, ABCD)[4] =
00081 {((8 - x) * (8 - y)),
00082 (( x) * (8 - y)),
00083 ((8 - x) * ( y)),
00084 (( x) * ( y))};
00085 register int i;
00086 vec_u8 fperm;
00087 const vec_s32 vABCD = vec_ld(0, ABCD);
00088 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
00089 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
00090 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
00091 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
00092 LOAD_ZERO;
00093 const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
00094 const vec_u16 v6us = vec_splat_u16(6);
00095 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
00096 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
00097
00098 vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
00099 vec_u8 vsrc0uc, vsrc1uc;
00100 vec_s16 vsrc0ssH, vsrc1ssH;
00101 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
00102 vec_s16 vsrc2ssH, vsrc3ssH, psum;
00103 vec_u8 vdst, ppsum, vfdst, fsum;
00104
00105 if (((unsigned long)dst) % 16 == 0) {
00106 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
00107 0x14, 0x15, 0x16, 0x17,
00108 0x08, 0x09, 0x0A, 0x0B,
00109 0x0C, 0x0D, 0x0E, 0x0F};
00110 } else {
00111 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
00112 0x04, 0x05, 0x06, 0x07,
00113 0x18, 0x19, 0x1A, 0x1B,
00114 0x1C, 0x1D, 0x1E, 0x1F};
00115 }
00116
00117 vsrcAuc = vec_ld(0, src);
00118
00119 if (loadSecond)
00120 vsrcBuc = vec_ld(16, src);
00121 vsrcperm0 = vec_lvsl(0, src);
00122 vsrcperm1 = vec_lvsl(1, src);
00123
00124 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
00125 if (reallyBadAlign)
00126 vsrc1uc = vsrcBuc;
00127 else
00128 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
00129
00130 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
00131 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
00132
00133 if (ABCD[3]) {
00134 if (!loadSecond) {
00135 for (i = 0 ; i < h ; i++) {
00136 vsrcCuc = vec_ld(stride + 0, src);
00137 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00138 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
00139
00140 CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
00141 }
00142 } else {
00143 vec_u8 vsrcDuc;
00144 for (i = 0 ; i < h ; i++) {
00145 vsrcCuc = vec_ld(stride + 0, src);
00146 vsrcDuc = vec_ld(stride + 16, src);
00147 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00148 if (reallyBadAlign)
00149 vsrc3uc = vsrcDuc;
00150 else
00151 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
00152
00153 CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
00154 }
00155 }
00156 } else {
00157 const vec_s16 vE = vec_add(vB, vC);
00158 if (ABCD[2]) {
00159 if (!loadSecond) {
00160 for (i = 0 ; i < h ; i++) {
00161 vsrcCuc = vec_ld(stride + 0, src);
00162 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00163 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
00164
00165 vsrc0uc = vsrc1uc;
00166 }
00167 } else {
00168 vec_u8 vsrcDuc;
00169 for (i = 0 ; i < h ; i++) {
00170 vsrcCuc = vec_ld(stride + 0, src);
00171 vsrcDuc = vec_ld(stride + 15, src);
00172 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00173 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
00174
00175 vsrc0uc = vsrc1uc;
00176 }
00177 }
00178 } else {
00179 if (!loadSecond) {
00180 for (i = 0 ; i < h ; i++) {
00181 vsrcCuc = vec_ld(0, src);
00182 vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00183 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
00184
00185 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
00186 }
00187 } else {
00188 vec_u8 vsrcDuc;
00189 for (i = 0 ; i < h ; i++) {
00190 vsrcCuc = vec_ld(0, src);
00191 vsrcDuc = vec_ld(15, src);
00192 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00193 if (reallyBadAlign)
00194 vsrc1uc = vsrcDuc;
00195 else
00196 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
00197
00198 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
00199 }
00200 }
00201 }
00202 }
00203 }
00204
00205
00206 static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
00207 DECLARE_ALIGNED(16, signed int, ABCD)[4] =
00208 {((8 - x) * (8 - y)),
00209 (( x) * (8 - y)),
00210 ((8 - x) * ( y)),
00211 (( x) * ( y))};
00212 register int i;
00213 vec_u8 fperm;
00214 const vec_s32 vABCD = vec_ld(0, ABCD);
00215 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
00216 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
00217 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
00218 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
00219 LOAD_ZERO;
00220 const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
00221 const vec_u16 v6us = vec_splat_u16(6);
00222 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
00223 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
00224
00225 vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
00226 vec_u8 vsrc0uc, vsrc1uc;
00227 vec_s16 vsrc0ssH, vsrc1ssH;
00228 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
00229 vec_s16 vsrc2ssH, vsrc3ssH, psum;
00230 vec_u8 vdst, ppsum, vfdst, fsum;
00231
00232 if (((unsigned long)dst) % 16 == 0) {
00233 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
00234 0x14, 0x15, 0x16, 0x17,
00235 0x08, 0x09, 0x0A, 0x0B,
00236 0x0C, 0x0D, 0x0E, 0x0F};
00237 } else {
00238 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
00239 0x04, 0x05, 0x06, 0x07,
00240 0x18, 0x19, 0x1A, 0x1B,
00241 0x1C, 0x1D, 0x1E, 0x1F};
00242 }
00243
00244 vsrcAuc = vec_ld(0, src);
00245
00246 if (loadSecond)
00247 vsrcBuc = vec_ld(16, src);
00248 vsrcperm0 = vec_lvsl(0, src);
00249 vsrcperm1 = vec_lvsl(1, src);
00250
00251 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
00252 if (reallyBadAlign)
00253 vsrc1uc = vsrcBuc;
00254 else
00255 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
00256
00257 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
00258 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
00259
00260 if (!loadSecond) {
00261 for (i = 0 ; i < h ; i++) {
00262
00263
00264 vsrcCuc = vec_ld(stride + 0, src);
00265
00266 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00267 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
00268
00269 CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
00270 }
00271 } else {
00272 vec_u8 vsrcDuc;
00273 for (i = 0 ; i < h ; i++) {
00274 vsrcCuc = vec_ld(stride + 0, src);
00275 vsrcDuc = vec_ld(stride + 16, src);
00276
00277 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00278 if (reallyBadAlign)
00279 vsrc3uc = vsrcDuc;
00280 else
00281 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
00282
00283 CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
00284 }
00285 }
00286 }
00287
/*
 * Drop the chroma helper macros once both chroma functions are defined.
 * NOTE(review): CHROMA_MC8_ALTIVEC_CORE_SIMPLE stays defined - confirm
 * whether that is intentional.
 */
#undef CHROMA_MC8_ALTIVEC_CORE
#undef add28
#undef noop
00291
00292
00293 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
00294 register int i;
00295
00296 LOAD_ZERO;
00297 const vec_u8 permM2 = vec_lvsl(-2, src);
00298 const vec_u8 permM1 = vec_lvsl(-1, src);
00299 const vec_u8 permP0 = vec_lvsl(+0, src);
00300 const vec_u8 permP1 = vec_lvsl(+1, src);
00301 const vec_u8 permP2 = vec_lvsl(+2, src);
00302 const vec_u8 permP3 = vec_lvsl(+3, src);
00303 const vec_s16 v5ss = vec_splat_s16(5);
00304 const vec_u16 v5us = vec_splat_u16(5);
00305 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
00306 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
00307
00308 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
00309
00310 register int align = ((((unsigned long)src) - 2) % 16);
00311
00312 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
00313 srcP2A, srcP2B, srcP3A, srcP3B,
00314 srcM1A, srcM1B, srcM2A, srcM2B,
00315 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
00316 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
00317 psumA, psumB, sumA, sumB;
00318
00319 vec_u8 sum, vdst, fsum;
00320
00321 for (i = 0 ; i < 16 ; i ++) {
00322 vec_u8 srcR1 = vec_ld(-2, src);
00323 vec_u8 srcR2 = vec_ld(14, src);
00324
00325 switch (align) {
00326 default: {
00327 srcM2 = vec_perm(srcR1, srcR2, permM2);
00328 srcM1 = vec_perm(srcR1, srcR2, permM1);
00329 srcP0 = vec_perm(srcR1, srcR2, permP0);
00330 srcP1 = vec_perm(srcR1, srcR2, permP1);
00331 srcP2 = vec_perm(srcR1, srcR2, permP2);
00332 srcP3 = vec_perm(srcR1, srcR2, permP3);
00333 } break;
00334 case 11: {
00335 srcM2 = vec_perm(srcR1, srcR2, permM2);
00336 srcM1 = vec_perm(srcR1, srcR2, permM1);
00337 srcP0 = vec_perm(srcR1, srcR2, permP0);
00338 srcP1 = vec_perm(srcR1, srcR2, permP1);
00339 srcP2 = vec_perm(srcR1, srcR2, permP2);
00340 srcP3 = srcR2;
00341 } break;
00342 case 12: {
00343 vec_u8 srcR3 = vec_ld(30, src);
00344 srcM2 = vec_perm(srcR1, srcR2, permM2);
00345 srcM1 = vec_perm(srcR1, srcR2, permM1);
00346 srcP0 = vec_perm(srcR1, srcR2, permP0);
00347 srcP1 = vec_perm(srcR1, srcR2, permP1);
00348 srcP2 = srcR2;
00349 srcP3 = vec_perm(srcR2, srcR3, permP3);
00350 } break;
00351 case 13: {
00352 vec_u8 srcR3 = vec_ld(30, src);
00353 srcM2 = vec_perm(srcR1, srcR2, permM2);
00354 srcM1 = vec_perm(srcR1, srcR2, permM1);
00355 srcP0 = vec_perm(srcR1, srcR2, permP0);
00356 srcP1 = srcR2;
00357 srcP2 = vec_perm(srcR2, srcR3, permP2);
00358 srcP3 = vec_perm(srcR2, srcR3, permP3);
00359 } break;
00360 case 14: {
00361 vec_u8 srcR3 = vec_ld(30, src);
00362 srcM2 = vec_perm(srcR1, srcR2, permM2);
00363 srcM1 = vec_perm(srcR1, srcR2, permM1);
00364 srcP0 = srcR2;
00365 srcP1 = vec_perm(srcR2, srcR3, permP1);
00366 srcP2 = vec_perm(srcR2, srcR3, permP2);
00367 srcP3 = vec_perm(srcR2, srcR3, permP3);
00368 } break;
00369 case 15: {
00370 vec_u8 srcR3 = vec_ld(30, src);
00371 srcM2 = vec_perm(srcR1, srcR2, permM2);
00372 srcM1 = srcR2;
00373 srcP0 = vec_perm(srcR2, srcR3, permP0);
00374 srcP1 = vec_perm(srcR2, srcR3, permP1);
00375 srcP2 = vec_perm(srcR2, srcR3, permP2);
00376 srcP3 = vec_perm(srcR2, srcR3, permP3);
00377 } break;
00378 }
00379
00380 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
00381 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
00382 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
00383 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
00384
00385 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
00386 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
00387 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
00388 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
00389
00390 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
00391 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
00392 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
00393 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
00394
00395 sum1A = vec_adds(srcP0A, srcP1A);
00396 sum1B = vec_adds(srcP0B, srcP1B);
00397 sum2A = vec_adds(srcM1A, srcP2A);
00398 sum2B = vec_adds(srcM1B, srcP2B);
00399 sum3A = vec_adds(srcM2A, srcP3A);
00400 sum3B = vec_adds(srcM2B, srcP3B);
00401
00402 pp1A = vec_mladd(sum1A, v20ss, v16ss);
00403 pp1B = vec_mladd(sum1B, v20ss, v16ss);
00404
00405 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
00406 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
00407
00408 pp3A = vec_add(sum3A, pp1A);
00409 pp3B = vec_add(sum3B, pp1B);
00410
00411 psumA = vec_sub(pp3A, pp2A);
00412 psumB = vec_sub(pp3B, pp2B);
00413
00414 sumA = vec_sra(psumA, v5us);
00415 sumB = vec_sra(psumB, v5us);
00416
00417 sum = vec_packsu(sumA, sumB);
00418
00419 ASSERT_ALIGNED(dst);
00420 vdst = vec_ld(0, dst);
00421
00422 OP_U8_ALTIVEC(fsum, sum, vdst);
00423
00424 vec_st(fsum, 0, dst);
00425
00426 src += srcStride;
00427 dst += dstStride;
00428 }
00429 }
00430
00431
00432 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
00433 register int i;
00434
00435 LOAD_ZERO;
00436 const vec_u8 perm = vec_lvsl(0, src);
00437 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
00438 const vec_u16 v5us = vec_splat_u16(5);
00439 const vec_s16 v5ss = vec_splat_s16(5);
00440 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
00441
00442 uint8_t *srcbis = src - (srcStride * 2);
00443
00444 const vec_u8 srcM2a = vec_ld(0, srcbis);
00445 const vec_u8 srcM2b = vec_ld(16, srcbis);
00446 const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
00447
00448 const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
00449 const vec_u8 srcM1b = vec_ld(16, srcbis);
00450 const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
00451
00452 const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
00453 const vec_u8 srcP0b = vec_ld(16, srcbis);
00454 const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
00455
00456 const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
00457 const vec_u8 srcP1b = vec_ld(16, srcbis);
00458 const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
00459
00460 const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
00461 const vec_u8 srcP2b = vec_ld(16, srcbis);
00462 const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
00463
00464
00465 vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
00466 vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
00467 vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
00468 vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
00469 vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
00470 vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
00471 vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
00472 vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
00473 vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
00474 vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
00475
00476 vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
00477 psumA, psumB, sumA, sumB,
00478 srcP3ssA, srcP3ssB,
00479 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
00480
00481 vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
00482
00483 for (i = 0 ; i < 16 ; i++) {
00484 srcP3a = vec_ld(0, srcbis += srcStride);
00485 srcP3b = vec_ld(16, srcbis);
00486 srcP3 = vec_perm(srcP3a, srcP3b, perm);
00487 srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
00488 srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
00489
00490
00491 sum1A = vec_adds(srcP0ssA, srcP1ssA);
00492 sum1B = vec_adds(srcP0ssB, srcP1ssB);
00493 sum2A = vec_adds(srcM1ssA, srcP2ssA);
00494 sum2B = vec_adds(srcM1ssB, srcP2ssB);
00495 sum3A = vec_adds(srcM2ssA, srcP3ssA);
00496 sum3B = vec_adds(srcM2ssB, srcP3ssB);
00497
00498 srcM2ssA = srcM1ssA;
00499 srcM2ssB = srcM1ssB;
00500 srcM1ssA = srcP0ssA;
00501 srcM1ssB = srcP0ssB;
00502 srcP0ssA = srcP1ssA;
00503 srcP0ssB = srcP1ssB;
00504 srcP1ssA = srcP2ssA;
00505 srcP1ssB = srcP2ssB;
00506 srcP2ssA = srcP3ssA;
00507 srcP2ssB = srcP3ssB;
00508
00509 pp1A = vec_mladd(sum1A, v20ss, v16ss);
00510 pp1B = vec_mladd(sum1B, v20ss, v16ss);
00511
00512 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
00513 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
00514
00515 pp3A = vec_add(sum3A, pp1A);
00516 pp3B = vec_add(sum3B, pp1B);
00517
00518 psumA = vec_sub(pp3A, pp2A);
00519 psumB = vec_sub(pp3B, pp2B);
00520
00521 sumA = vec_sra(psumA, v5us);
00522 sumB = vec_sra(psumB, v5us);
00523
00524 sum = vec_packsu(sumA, sumB);
00525
00526 ASSERT_ALIGNED(dst);
00527 vdst = vec_ld(0, dst);
00528
00529 OP_U8_ALTIVEC(fsum, sum, vdst);
00530
00531 vec_st(fsum, 0, dst);
00532
00533 dst += dstStride;
00534 }
00535 }
00536
00537
00538 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
00539 register int i;
00540 LOAD_ZERO;
00541 const vec_u8 permM2 = vec_lvsl(-2, src);
00542 const vec_u8 permM1 = vec_lvsl(-1, src);
00543 const vec_u8 permP0 = vec_lvsl(+0, src);
00544 const vec_u8 permP1 = vec_lvsl(+1, src);
00545 const vec_u8 permP2 = vec_lvsl(+2, src);
00546 const vec_u8 permP3 = vec_lvsl(+3, src);
00547 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
00548 const vec_u32 v10ui = vec_splat_u32(10);
00549 const vec_s16 v5ss = vec_splat_s16(5);
00550 const vec_s16 v1ss = vec_splat_s16(1);
00551 const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
00552 const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
00553
00554 register int align = ((((unsigned long)src) - 2) % 16);
00555
00556 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
00557 srcP2A, srcP2B, srcP3A, srcP3B,
00558 srcM1A, srcM1B, srcM2A, srcM2B,
00559 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
00560 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
00561
00562 const vec_u8 mperm = (const vec_u8)
00563 {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
00564 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
00565 int16_t *tmpbis = tmp;
00566
00567 vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
00568 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
00569 tmpP2ssA, tmpP2ssB;
00570
00571 vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
00572 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
00573 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
00574 ssumAe, ssumAo, ssumBe, ssumBo;
00575 vec_u8 fsum, sumv, sum, vdst;
00576 vec_s16 ssume, ssumo;
00577
00578 src -= (2 * srcStride);
00579 for (i = 0 ; i < 21 ; i ++) {
00580 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
00581 vec_u8 srcR1 = vec_ld(-2, src);
00582 vec_u8 srcR2 = vec_ld(14, src);
00583
00584 switch (align) {
00585 default: {
00586 srcM2 = vec_perm(srcR1, srcR2, permM2);
00587 srcM1 = vec_perm(srcR1, srcR2, permM1);
00588 srcP0 = vec_perm(srcR1, srcR2, permP0);
00589 srcP1 = vec_perm(srcR1, srcR2, permP1);
00590 srcP2 = vec_perm(srcR1, srcR2, permP2);
00591 srcP3 = vec_perm(srcR1, srcR2, permP3);
00592 } break;
00593 case 11: {
00594 srcM2 = vec_perm(srcR1, srcR2, permM2);
00595 srcM1 = vec_perm(srcR1, srcR2, permM1);
00596 srcP0 = vec_perm(srcR1, srcR2, permP0);
00597 srcP1 = vec_perm(srcR1, srcR2, permP1);
00598 srcP2 = vec_perm(srcR1, srcR2, permP2);
00599 srcP3 = srcR2;
00600 } break;
00601 case 12: {
00602 vec_u8 srcR3 = vec_ld(30, src);
00603 srcM2 = vec_perm(srcR1, srcR2, permM2);
00604 srcM1 = vec_perm(srcR1, srcR2, permM1);
00605 srcP0 = vec_perm(srcR1, srcR2, permP0);
00606 srcP1 = vec_perm(srcR1, srcR2, permP1);
00607 srcP2 = srcR2;
00608 srcP3 = vec_perm(srcR2, srcR3, permP3);
00609 } break;
00610 case 13: {
00611 vec_u8 srcR3 = vec_ld(30, src);
00612 srcM2 = vec_perm(srcR1, srcR2, permM2);
00613 srcM1 = vec_perm(srcR1, srcR2, permM1);
00614 srcP0 = vec_perm(srcR1, srcR2, permP0);
00615 srcP1 = srcR2;
00616 srcP2 = vec_perm(srcR2, srcR3, permP2);
00617 srcP3 = vec_perm(srcR2, srcR3, permP3);
00618 } break;
00619 case 14: {
00620 vec_u8 srcR3 = vec_ld(30, src);
00621 srcM2 = vec_perm(srcR1, srcR2, permM2);
00622 srcM1 = vec_perm(srcR1, srcR2, permM1);
00623 srcP0 = srcR2;
00624 srcP1 = vec_perm(srcR2, srcR3, permP1);
00625 srcP2 = vec_perm(srcR2, srcR3, permP2);
00626 srcP3 = vec_perm(srcR2, srcR3, permP3);
00627 } break;
00628 case 15: {
00629 vec_u8 srcR3 = vec_ld(30, src);
00630 srcM2 = vec_perm(srcR1, srcR2, permM2);
00631 srcM1 = srcR2;
00632 srcP0 = vec_perm(srcR2, srcR3, permP0);
00633 srcP1 = vec_perm(srcR2, srcR3, permP1);
00634 srcP2 = vec_perm(srcR2, srcR3, permP2);
00635 srcP3 = vec_perm(srcR2, srcR3, permP3);
00636 } break;
00637 }
00638
00639 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
00640 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
00641 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
00642 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
00643
00644 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
00645 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
00646 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
00647 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
00648
00649 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
00650 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
00651 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
00652 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
00653
00654 sum1A = vec_adds(srcP0A, srcP1A);
00655 sum1B = vec_adds(srcP0B, srcP1B);
00656 sum2A = vec_adds(srcM1A, srcP2A);
00657 sum2B = vec_adds(srcM1B, srcP2B);
00658 sum3A = vec_adds(srcM2A, srcP3A);
00659 sum3B = vec_adds(srcM2B, srcP3B);
00660
00661 pp1A = vec_mladd(sum1A, v20ss, sum3A);
00662 pp1B = vec_mladd(sum1B, v20ss, sum3B);
00663
00664 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
00665 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
00666
00667 psumA = vec_sub(pp1A, pp2A);
00668 psumB = vec_sub(pp1B, pp2B);
00669
00670 vec_st(psumA, 0, tmp);
00671 vec_st(psumB, 16, tmp);
00672
00673 src += srcStride;
00674 tmp += tmpStride;
00675 }
00676
00677 tmpM2ssA = vec_ld(0, tmpbis);
00678 tmpM2ssB = vec_ld(16, tmpbis);
00679 tmpbis += tmpStride;
00680 tmpM1ssA = vec_ld(0, tmpbis);
00681 tmpM1ssB = vec_ld(16, tmpbis);
00682 tmpbis += tmpStride;
00683 tmpP0ssA = vec_ld(0, tmpbis);
00684 tmpP0ssB = vec_ld(16, tmpbis);
00685 tmpbis += tmpStride;
00686 tmpP1ssA = vec_ld(0, tmpbis);
00687 tmpP1ssB = vec_ld(16, tmpbis);
00688 tmpbis += tmpStride;
00689 tmpP2ssA = vec_ld(0, tmpbis);
00690 tmpP2ssB = vec_ld(16, tmpbis);
00691 tmpbis += tmpStride;
00692
00693 for (i = 0 ; i < 16 ; i++) {
00694 const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
00695 const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
00696
00697 const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
00698 const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
00699 const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
00700 const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
00701 const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
00702 const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
00703
00704 tmpbis += tmpStride;
00705
00706 tmpM2ssA = tmpM1ssA;
00707 tmpM2ssB = tmpM1ssB;
00708 tmpM1ssA = tmpP0ssA;
00709 tmpM1ssB = tmpP0ssB;
00710 tmpP0ssA = tmpP1ssA;
00711 tmpP0ssB = tmpP1ssB;
00712 tmpP1ssA = tmpP2ssA;
00713 tmpP1ssB = tmpP2ssB;
00714 tmpP2ssA = tmpP3ssA;
00715 tmpP2ssB = tmpP3ssB;
00716
00717 pp1Ae = vec_mule(sum1A, v20ss);
00718 pp1Ao = vec_mulo(sum1A, v20ss);
00719 pp1Be = vec_mule(sum1B, v20ss);
00720 pp1Bo = vec_mulo(sum1B, v20ss);
00721
00722 pp2Ae = vec_mule(sum2A, v5ss);
00723 pp2Ao = vec_mulo(sum2A, v5ss);
00724 pp2Be = vec_mule(sum2B, v5ss);
00725 pp2Bo = vec_mulo(sum2B, v5ss);
00726
00727 pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
00728 pp3Ao = vec_mulo(sum3A, v1ss);
00729 pp3Be = vec_sra((vec_s32)sum3B, v16ui);
00730 pp3Bo = vec_mulo(sum3B, v1ss);
00731
00732 pp1cAe = vec_add(pp1Ae, v512si);
00733 pp1cAo = vec_add(pp1Ao, v512si);
00734 pp1cBe = vec_add(pp1Be, v512si);
00735 pp1cBo = vec_add(pp1Bo, v512si);
00736
00737 pp32Ae = vec_sub(pp3Ae, pp2Ae);
00738 pp32Ao = vec_sub(pp3Ao, pp2Ao);
00739 pp32Be = vec_sub(pp3Be, pp2Be);
00740 pp32Bo = vec_sub(pp3Bo, pp2Bo);
00741
00742 sumAe = vec_add(pp1cAe, pp32Ae);
00743 sumAo = vec_add(pp1cAo, pp32Ao);
00744 sumBe = vec_add(pp1cBe, pp32Be);
00745 sumBo = vec_add(pp1cBo, pp32Bo);
00746
00747 ssumAe = vec_sra(sumAe, v10ui);
00748 ssumAo = vec_sra(sumAo, v10ui);
00749 ssumBe = vec_sra(sumBe, v10ui);
00750 ssumBo = vec_sra(sumBo, v10ui);
00751
00752 ssume = vec_packs(ssumAe, ssumBe);
00753 ssumo = vec_packs(ssumAo, ssumBo);
00754
00755 sumv = vec_packsu(ssume, ssumo);
00756 sum = vec_perm(sumv, sumv, mperm);
00757
00758 ASSERT_ALIGNED(dst);
00759 vdst = vec_ld(0, dst);
00760
00761 OP_U8_ALTIVEC(fsum, sum, vdst);
00762
00763 vec_st(fsum, 0, dst);
00764
00765 dst += dstStride;
00766 }
00767 }