Altivec-optimized blitters!
Vast majority of this work is compliments of Bob Ippolito. http://www.devolution.com/pipermail/sdl/2005-February/067466.html and many other posts. --HG-- extra : convert_revision : svn%3Ac70aab31-4412-0410-b14c-859654838e24/trunk%401048
This commit is contained in:
parent
20dbe0d931
commit
48c21b291e
4 changed files with 1501 additions and 26 deletions
14
configure.in
14
configure.in
|
@ -1839,17 +1839,18 @@ CheckAltivec()
|
|||
{
|
||||
AC_MSG_CHECKING(for GCC Altivec instruction support)
|
||||
have_gcc_altivec=no
|
||||
save_CFLAGS="${CFLAGS}"
|
||||
CFLAGS="${CFLAGS} -DGCC_ALTIVEC -DUSE_ALTIVEC_BLITTERS -faltivec"
|
||||
AC_TRY_COMPILE([
|
||||
vector unsigned int vzero() {
|
||||
return vec_splat_u32(0);
|
||||
}
|
||||
],[
|
||||
asm volatile ("mtspr 256, %0\n\t"
|
||||
"vand %%v0, %%v0, %%v0"
|
||||
:
|
||||
: "r" (-1));
|
||||
],[
|
||||
have_gcc_altivec=yes
|
||||
])
|
||||
if test x$have_gcc_altivec = xyes; then
|
||||
CFLAGS="$CFLAGS -DGCC_ALTIVEC"
|
||||
if test x$have_gcc_altivec = xno; then
|
||||
CFLAGS="${save_CFLAGS}"
|
||||
fi
|
||||
AC_MSG_RESULT($have_gcc_altivec)
|
||||
}
|
||||
|
@ -2564,6 +2565,7 @@ case "$target" in
|
|||
CheckMacGL
|
||||
CheckPTHREAD
|
||||
CheckSIGACTION
|
||||
CheckAltivec
|
||||
# If either the audio or CD driver is used, add the AudioUnit framework
|
||||
if test x$enable_audio = xyes -o x$enable_cdrom = xyes; then
|
||||
SYSTEM_LIBS="$SYSTEM_LIBS -framework AudioToolbox -framework AudioUnit"
|
||||
|
|
|
@ -374,6 +374,20 @@ do { \
|
|||
dB = (((sB-dB)*(A))>>8)+dB; \
|
||||
} while(0)
|
||||
|
||||
/*
 * Blend the RGB values of two pixels based on a source alpha value.
 *
 * For each channel this computes t = 1 + s*sA + d*(255 - sA) and then
 * (t + (t >> 8)) >> 8, which approximates division by 255 without a divide.
 * With sA == 255 the destination becomes the source value; with sA == 0 the
 * destination is left unchanged.
 *
 * sR/sG/sB/sA are read-only inputs in the range 0..255; dR/dG/dB must be
 * modifiable lvalues and receive the blended result.  Every argument is
 * parenthesized so expressions such as "a + 1" may be passed safely, but
 * arguments are still evaluated more than once: avoid side effects.
 */
#define ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB)	\
do {						\
	unsigned tR, tG, tB, tA;		\
	tA = 255 - (sA);			\
	tR = 1 + ((sR) * (sA)) + ((dR) * tA);	\
	(dR) = (tR + (tR >> 8)) >> 8;		\
	tG = 1 + ((sG) * (sA)) + ((dG) * tA);	\
	(dG) = (tG + (tG >> 8)) >> 8;		\
	tB = 1 + ((sB) * (sA)) + ((dB) * tA);	\
	(dB) = (tB + (tB >> 8)) >> 8;		\
} while(0)
|
||||
|
||||
|
||||
/* This is a very useful loop for optimizing blitters */
|
||||
#if defined(_MSC_VER) && (_MSC_VER == 1300)
|
||||
/* There's a bug in the Visual C++ 7 optimizer when compiling this code */
|
||||
|
|
|
@ -35,9 +35,9 @@ static char rcsid =
|
|||
#define MMX_ASMBLIT
|
||||
#endif
|
||||
|
||||
#ifdef MMX_ASMBLIT
|
||||
/* Function to check the CPU flags */
|
||||
#include "SDL_cpuinfo.h"
|
||||
#ifdef MMX_ASMBLIT
|
||||
#include "mmx.h"
|
||||
#endif
|
||||
|
||||
|
@ -421,6 +421,762 @@ static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
#include <assert.h>
|
||||
/* Low four address bits of pointer x: non-zero when x is not 16-byte aligned.
   The argument is parenthesized so pointer expressions (p + 1) are cast as a whole. */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
|
||||
#define VECPRINT(msg, v) do { \
|
||||
vector unsigned int tmpvec = (vector unsigned int)(v); \
|
||||
unsigned int *vp = (unsigned int *)&tmpvec; \
|
||||
printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
|
||||
} while (0)
|
||||
|
||||
/* the permutation vector that takes the high bytes out of all the appropriate shorts
|
||||
(vector unsigned char)(
|
||||
0x00, 0x10, 0x02, 0x12,
|
||||
0x04, 0x14, 0x06, 0x16,
|
||||
0x08, 0x18, 0x0A, 0x1A,
|
||||
0x0C, 0x1C, 0x0E, 0x1E );
|
||||
*/
|
||||
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
|
||||
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
|
||||
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
|
||||
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
|
||||
? vec_lvsl(0, src) \
|
||||
: vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
|
||||
|
||||
|
||||
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
|
||||
/* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
|
||||
vector unsigned short vtemp1 = vec_mule(vs, valpha); \
|
||||
/* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
|
||||
vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
|
||||
/* valpha2 is 255-alpha */ \
|
||||
vector unsigned char valpha2 = vec_nor(valpha, valpha); \
|
||||
/* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
|
||||
vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
|
||||
/* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
|
||||
vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
|
||||
/* add source and dest */ \
|
||||
vtemp1 = vec_add(vtemp1, vtemp3); \
|
||||
vtemp2 = vec_add(vtemp2, vtemp4); \
|
||||
/* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
|
||||
vtemp1 = vec_add(vtemp1, v1_16); \
|
||||
vtemp3 = vec_sr(vtemp1, v8_16); \
|
||||
vtemp1 = vec_add(vtemp1, vtemp3); \
|
||||
/* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
|
||||
vtemp2 = vec_add(vtemp2, v1_16); \
|
||||
vtemp4 = vec_sr(vtemp2, v8_16); \
|
||||
vtemp2 = vec_add(vtemp2, vtemp4); \
|
||||
/* (>>8) and get ARGBARGBARGBARGB */ \
|
||||
vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
|
||||
} while (0)
|
||||
|
||||
/* Calculate the permute vector used for 32->32 swizzling */
|
||||
static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
|
||||
const SDL_PixelFormat *dstfmt)
|
||||
{
|
||||
/*
|
||||
* We have to assume that the bits that aren't used by other
|
||||
* colors is alpha, and it's one complete byte, since some formats
|
||||
* leave alpha with a zero mask, but we should still swizzle the bits.
|
||||
*/
|
||||
/* ARGB */
|
||||
const static struct SDL_PixelFormat default_pixel_format = {
|
||||
NULL, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
16, 8, 0, 24,
|
||||
0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
|
||||
0, 0};
|
||||
if (!srcfmt) {
|
||||
srcfmt = &default_pixel_format;
|
||||
}
|
||||
if (!dstfmt) {
|
||||
dstfmt = &default_pixel_format;
|
||||
}
|
||||
vector unsigned char plus = (vector unsigned char)
|
||||
( 0x00, 0x00, 0x00, 0x00,
|
||||
0x04, 0x04, 0x04, 0x04,
|
||||
0x08, 0x08, 0x08, 0x08,
|
||||
0x0C, 0x0C, 0x0C, 0x0C );
|
||||
vector unsigned char vswiz;
|
||||
vector unsigned int srcvec;
|
||||
#define RESHIFT(X) (3 - ((X) >> 3))
|
||||
Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
|
||||
Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
|
||||
Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
|
||||
Uint32 amask;
|
||||
/* Use zero for alpha if either surface doesn't have alpha */
|
||||
if (dstfmt->Amask) {
|
||||
amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
|
||||
} else {
|
||||
amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
|
||||
}
|
||||
#undef RESHIFT
|
||||
((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask);
|
||||
vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
|
||||
return(vswiz);
|
||||
}
|
||||
|
||||
static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
|
||||
{
|
||||
int height = info->d_height;
|
||||
Uint8 *src = (Uint8 *)info->s_pixels;
|
||||
int srcskip = info->s_skip;
|
||||
Uint8 *dst = (Uint8 *)info->d_pixels;
|
||||
int dstskip = info->d_skip;
|
||||
SDL_PixelFormat *srcfmt = info->src;
|
||||
|
||||
vector unsigned char v0 = vec_splat_u8(0);
|
||||
vector unsigned short v8_16 = vec_splat_u16(8);
|
||||
vector unsigned short v1_16 = vec_splat_u16(1);
|
||||
vector unsigned short v2_16 = vec_splat_u16(2);
|
||||
vector unsigned short v3_16 = vec_splat_u16(3);
|
||||
vector unsigned int v8_32 = vec_splat_u32(8);
|
||||
vector unsigned int v16_32 = vec_add(v8_32, v8_32);
|
||||
vector unsigned short v3f = (vector unsigned short)(
|
||||
0x003f, 0x003f, 0x003f, 0x003f,
|
||||
0x003f, 0x003f, 0x003f, 0x003f);
|
||||
vector unsigned short vfc = (vector unsigned short)(
|
||||
0x00fc, 0x00fc, 0x00fc, 0x00fc,
|
||||
0x00fc, 0x00fc, 0x00fc, 0x00fc);
|
||||
|
||||
/*
|
||||
0x10 - 0x1f is the alpha
|
||||
0x00 - 0x0e evens are the red
|
||||
0x01 - 0x0f odds are zero
|
||||
*/
|
||||
vector unsigned char vredalpha1 = (vector unsigned char)(
|
||||
0x10, 0x00, 0x01, 0x01,
|
||||
0x10, 0x02, 0x01, 0x01,
|
||||
0x10, 0x04, 0x01, 0x01,
|
||||
0x10, 0x06, 0x01, 0x01
|
||||
);
|
||||
vector unsigned char vredalpha2 = (vector unsigned char)(
|
||||
vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
|
||||
);
|
||||
/*
|
||||
0x00 - 0x0f is ARxx ARxx ARxx ARxx
|
||||
0x11 - 0x1f odds are blue
|
||||
*/
|
||||
vector unsigned char vblue1 = (vector unsigned char)(
|
||||
0x00, 0x01, 0x02, 0x11,
|
||||
0x04, 0x05, 0x06, 0x13,
|
||||
0x08, 0x09, 0x0a, 0x15,
|
||||
0x0c, 0x0d, 0x0e, 0x17
|
||||
);
|
||||
vector unsigned char vblue2 = (vector unsigned char)(
|
||||
vec_add((vector unsigned int)vblue1, v8_32)
|
||||
);
|
||||
/*
|
||||
0x00 - 0x0f is ARxB ARxB ARxB ARxB
|
||||
0x10 - 0x1e evens are green
|
||||
*/
|
||||
vector unsigned char vgreen1 = (vector unsigned char)(
|
||||
0x00, 0x01, 0x10, 0x03,
|
||||
0x04, 0x05, 0x12, 0x07,
|
||||
0x08, 0x09, 0x14, 0x0b,
|
||||
0x0c, 0x0d, 0x16, 0x0f
|
||||
);
|
||||
vector unsigned char vgreen2 = (vector unsigned char)(
|
||||
vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
|
||||
);
|
||||
vector unsigned char vgmerge = (vector unsigned char)(
|
||||
0x00, 0x02, 0x00, 0x06,
|
||||
0x00, 0x0a, 0x00, 0x0e,
|
||||
0x00, 0x12, 0x00, 0x16,
|
||||
0x00, 0x1a, 0x00, 0x1e);
|
||||
vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
|
||||
vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
|
||||
vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
|
||||
|
||||
vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
|
||||
vf800 = vec_sl(vf800, vec_splat_u16(8));
|
||||
|
||||
while(height--) {
|
||||
int extrawidth;
|
||||
vector unsigned char valigner;
|
||||
vector unsigned char vsrc;
|
||||
vector unsigned char voverflow;
|
||||
int width = info->d_width;
|
||||
|
||||
#define ONE_PIXEL_BLEND(condition, widthvar) \
|
||||
while (condition) { \
|
||||
Uint32 pixel; \
|
||||
unsigned sR, sG, sB, dR, dG, dB, sA; \
|
||||
DISEMBLE_RGBA(src, 4, srcfmt, pixel, sR, sG, sB, sA); \
|
||||
if(sA) { \
|
||||
unsigned short dstpixel = *((unsigned short *)dst); \
|
||||
dR = (dstpixel >> 8) & 0xf8; \
|
||||
dG = (dstpixel >> 3) & 0xfc; \
|
||||
dB = (dstpixel << 3) & 0xf8; \
|
||||
ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
|
||||
*((unsigned short *)dst) = ( \
|
||||
((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
|
||||
); \
|
||||
} \
|
||||
src += 4; \
|
||||
dst += 2; \
|
||||
widthvar--; \
|
||||
}
|
||||
ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
|
||||
extrawidth = (width % 8);
|
||||
valigner = VEC_ALIGNER(src);
|
||||
vsrc = (vector unsigned char)vec_ld(0, src);
|
||||
width -= extrawidth;
|
||||
while (width) {
|
||||
vector unsigned char valpha;
|
||||
vector unsigned char vsrc1, vsrc2;
|
||||
vector unsigned char vdst1, vdst2;
|
||||
vector unsigned short vR, vG, vB;
|
||||
vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
|
||||
|
||||
/* Load 8 pixels from src as ARGB */
|
||||
voverflow = (vector unsigned char)vec_ld(15, src);
|
||||
vsrc = vec_perm(vsrc, voverflow, valigner);
|
||||
vsrc1 = vec_perm(vsrc, vsrc, vpermute);
|
||||
src += 16;
|
||||
vsrc = (vector unsigned char)vec_ld(15, src);
|
||||
voverflow = vec_perm(voverflow, vsrc, valigner);
|
||||
vsrc2 = vec_perm(voverflow, voverflow, vpermute);
|
||||
src += 16;
|
||||
|
||||
/* Load 8 pixels from dst as XRGB */
|
||||
voverflow = vec_ld(0, dst);
|
||||
vR = vec_and((vector unsigned short)voverflow, vf800);
|
||||
vB = vec_sl((vector unsigned short)voverflow, v3_16);
|
||||
vG = vec_sl(vB, v2_16);
|
||||
vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
|
||||
vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
|
||||
vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
|
||||
vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
|
||||
vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
|
||||
vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
|
||||
|
||||
/* Alpha blend 8 pixels as ARGB */
|
||||
valpha = vec_perm(vsrc1, v0, valphaPermute);
|
||||
VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
|
||||
valpha = vec_perm(vsrc2, v0, valphaPermute);
|
||||
VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
|
||||
|
||||
/* Convert 8 pixels to 565 */
|
||||
vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
|
||||
vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
|
||||
vgpixel = vec_and(vgpixel, vfc);
|
||||
vgpixel = vec_sl(vgpixel, v3_16);
|
||||
vrpixel = vec_sl(vpixel, v1_16);
|
||||
vrpixel = vec_and(vrpixel, vf800);
|
||||
vbpixel = vec_and(vpixel, v3f);
|
||||
vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
|
||||
vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
|
||||
|
||||
/* Store 8 pixels */
|
||||
vec_st(vdst1, 0, dst);
|
||||
|
||||
width -= 8;
|
||||
dst += 16;
|
||||
}
|
||||
ONE_PIXEL_BLEND((extrawidth), extrawidth);
|
||||
#undef ONE_PIXEL_BLEND
|
||||
src += srcskip;
|
||||
dst += dstskip;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * AltiVec blitter: 32-bit source with per-surface alpha and a colorkey,
 * blended onto a 32-bit destination.
 *
 * info supplies the pixel pointers, the per-row skips, the blit dimensions
 * and the source/destination pixel formats.  Each row is processed in three
 * phases:
 *   1. scalar pixels until dstp is 16-byte aligned,
 *   2. four pixels per iteration with AltiVec,
 *   3. scalar pixels for the (width % 4) tail.
 * Source pixels matching the colorkey leave the destination untouched.
 */
static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
{
    unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;    /* skip expressed in Uint32 units */
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;    /* skip expressed in Uint32 units */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned sA = srcfmt->alpha;
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    Uint32 ckey = info->src->colorkey;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;
    vector unsigned int vckey;
    vector unsigned int vrgbmask;

    mergePermute = VEC_MERGE_PERMUTE();
    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    vsrcPermute = calc_swizzle32(srcfmt, NULL);     /* source layout -> ARGB */
    vdstPermute = calc_swizzle32(NULL, dstfmt);     /* ARGB -> dest layout   */
    vsdstPermute = calc_swizzle32(dstfmt, NULL);    /* dest layout -> ARGB   */

    /* splat the surface alpha into every byte of valpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    vbits = (vector unsigned char)vec_splat_s8(-1);

    /* compare only the colour bits of the key, and replicate key and mask
       across all four pixel lanes */
    ckey &= rgbmask;
    ((unsigned int *)&vckey)[0] = ckey;
    vckey = vec_splat(vckey, 0);
    ((unsigned int *)&vrgbmask)[0] = rgbmask;
    vrgbmask = vec_splat(vrgbmask, 0);

    while(height--) {
        int width = info->d_width;
        /* Scalar fallback for one pixel at a time.  The pointers are
           advanced with plain ++ (they are Uint32 *, so one step is one
           pixel); the former "((Uint8 *)p) += 4" cast-as-lvalue form is not
           valid C. */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, pixel); \
            if(sA && pixel != ckey) { \
                RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \
                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            } \
            dstp++; \
            srcp++; \
            widthvar--; \
        }
        /* phase 1: get dstp onto a 16-byte boundary */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            /* phase 2: four pixels per iteration */
            while (width) {
                vector unsigned char vsel;
                vector unsigned char voverflow;
                vector unsigned char vd;
                vector unsigned char vd_orig;

                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);

                /* vsel is set for items that match the key */
                vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);

                /* swizzle the source pixels; lanes not mapped from vs are
                   taken from valpha */
                vs = vec_perm(vs, valpha, vsrcPermute);

                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);

                /* mask out color key: keyed lanes keep the original dest */
                vd = vec_sel(vd, vd_orig, vsel);

                /* permute to dest format */
                vd = vec_perm(vd, vbits, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);

                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;
            }
            /* phase 3: leftover pixels */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND

        srcp += srcskip;
        dstp += dstskip;
    }
}
|
||||
|
||||
|
||||
static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
|
||||
{
|
||||
int width = info->d_width;
|
||||
int height = info->d_height;
|
||||
Uint32 *srcp = (Uint32 *)info->s_pixels;
|
||||
int srcskip = info->s_skip >> 2;
|
||||
Uint32 *dstp = (Uint32 *)info->d_pixels;
|
||||
int dstskip = info->d_skip >> 2;
|
||||
SDL_PixelFormat *srcfmt = info->src;
|
||||
SDL_PixelFormat *dstfmt = info->dst;
|
||||
vector unsigned char mergePermute;
|
||||
vector unsigned char valphaPermute;
|
||||
vector unsigned char vsrcPermute;
|
||||
vector unsigned char vdstPermute;
|
||||
vector unsigned char vsdstPermute;
|
||||
vector unsigned char valphamask;
|
||||
vector unsigned char vpixelmask;
|
||||
vector unsigned char v0;
|
||||
vector unsigned short v1;
|
||||
vector unsigned short v8;
|
||||
|
||||
v0 = vec_splat_u8(0);
|
||||
v1 = vec_splat_u16(1);
|
||||
v8 = vec_splat_u16(8);
|
||||
mergePermute = VEC_MERGE_PERMUTE();
|
||||
valphamask = VEC_ALPHA_MASK();
|
||||
valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
|
||||
vpixelmask = vec_nor(valphamask, v0);
|
||||
vsrcPermute = calc_swizzle32(srcfmt, NULL);
|
||||
vdstPermute = calc_swizzle32(NULL, dstfmt);
|
||||
vsdstPermute = calc_swizzle32(dstfmt, NULL);
|
||||
|
||||
while ( height-- ) {
|
||||
width = info->d_width;
|
||||
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
|
||||
Uint32 pixel; \
|
||||
unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
|
||||
DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, pixel, sR, sG, sB, sA); \
|
||||
if(sA) { \
|
||||
DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, pixel, dR, dG, dB, dA); \
|
||||
ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
|
||||
ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
|
||||
} \
|
||||
++srcp; \
|
||||
++dstp; \
|
||||
widthvar--; \
|
||||
}
|
||||
ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
|
||||
if (width > 0) {
|
||||
// vsrcPermute
|
||||
// vdstPermute
|
||||
int extrawidth = (width % 4);
|
||||
vector unsigned char valigner = VEC_ALIGNER(srcp);
|
||||
vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
|
||||
width -= extrawidth;
|
||||
while (width) {
|
||||
vector unsigned char voverflow;
|
||||
vector unsigned char vd;
|
||||
vector unsigned char valpha;
|
||||
vector unsigned char vdstalpha;
|
||||
/* s = *srcp */
|
||||
voverflow = (vector unsigned char)vec_ld(15, srcp);
|
||||
vs = vec_perm(vs, voverflow, valigner);
|
||||
vs = vec_perm(vs, v0, vsrcPermute);
|
||||
|
||||
valpha = vec_perm(vs, v0, valphaPermute);
|
||||
|
||||
/* d = *dstp */
|
||||
vd = (vector unsigned char)vec_ld(0, dstp);
|
||||
vd = vec_perm(vd, v0, vsdstPermute);
|
||||
vdstalpha = vec_and(vd, valphamask);
|
||||
|
||||
VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
|
||||
|
||||
/* set the alpha to the dest alpha */
|
||||
vd = vec_and(vd, vpixelmask);
|
||||
vd = vec_or(vd, vdstalpha);
|
||||
vd = vec_perm(vd, v0, vdstPermute);
|
||||
|
||||
/* *dstp = res */
|
||||
vec_st((vector unsigned int)vd, 0, dstp);
|
||||
|
||||
srcp += 4;
|
||||
dstp += 4;
|
||||
width -= 4;
|
||||
vs = voverflow;
|
||||
|
||||
}
|
||||
ONE_PIXEL_BLEND((extrawidth), extrawidth);
|
||||
}
|
||||
srcp += srcskip;
|
||||
dstp += dstskip;
|
||||
#undef ONE_PIXEL_BLEND
|
||||
}
|
||||
}
|
||||
|
||||
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
|
||||
static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
|
||||
{
|
||||
int width = info->d_width;
|
||||
int height = info->d_height;
|
||||
Uint32 *srcp = (Uint32 *)info->s_pixels;
|
||||
int srcskip = info->s_skip >> 2;
|
||||
Uint32 *dstp = (Uint32 *)info->d_pixels;
|
||||
int dstskip = info->d_skip >> 2;
|
||||
vector unsigned char mergePermute;
|
||||
vector unsigned char valphaPermute;
|
||||
vector unsigned char valphamask;
|
||||
vector unsigned char vpixelmask;
|
||||
vector unsigned char v0;
|
||||
vector unsigned short v1;
|
||||
vector unsigned short v8;
|
||||
v0 = vec_splat_u8(0);
|
||||
v1 = vec_splat_u16(1);
|
||||
v8 = vec_splat_u16(8);
|
||||
mergePermute = VEC_MERGE_PERMUTE();
|
||||
valphamask = VEC_ALPHA_MASK();
|
||||
valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
|
||||
|
||||
|
||||
vpixelmask = vec_nor(valphamask, v0);
|
||||
while(height--) {
|
||||
width = info->d_width;
|
||||
#define ONE_PIXEL_BLEND(condition, widthvar) \
|
||||
while ((condition)) { \
|
||||
Uint32 dalpha; \
|
||||
Uint32 d; \
|
||||
Uint32 s1; \
|
||||
Uint32 d1; \
|
||||
Uint32 s = *srcp; \
|
||||
Uint32 alpha = s >> 24; \
|
||||
if(alpha) { \
|
||||
if(alpha == SDL_ALPHA_OPAQUE) { \
|
||||
*dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
|
||||
} else { \
|
||||
d = *dstp; \
|
||||
dalpha = d & 0xff000000; \
|
||||
s1 = s & 0xff00ff; \
|
||||
d1 = d & 0xff00ff; \
|
||||
d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
|
||||
s &= 0xff00; \
|
||||
d &= 0xff00; \
|
||||
d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
|
||||
*dstp = d1 | d | dalpha; \
|
||||
} \
|
||||
} \
|
||||
++srcp; \
|
||||
++dstp; \
|
||||
widthvar--; \
|
||||
}
|
||||
ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
|
||||
if (width > 0) {
|
||||
int extrawidth = (width % 4);
|
||||
vector unsigned char valigner = VEC_ALIGNER(srcp);
|
||||
vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
|
||||
width -= extrawidth;
|
||||
while (width) {
|
||||
vector unsigned char voverflow;
|
||||
vector unsigned char vd;
|
||||
vector unsigned char valpha;
|
||||
vector unsigned char vdstalpha;
|
||||
/* s = *srcp */
|
||||
voverflow = (vector unsigned char)vec_ld(15, srcp);
|
||||
vs = vec_perm(vs, voverflow, valigner);
|
||||
|
||||
valpha = vec_perm(vs, v0, valphaPermute);
|
||||
|
||||
/* d = *dstp */
|
||||
vd = (vector unsigned char)vec_ld(0, dstp);
|
||||
vdstalpha = vec_and(vd, valphamask);
|
||||
|
||||
VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
|
||||
|
||||
/* set the alpha to the dest alpha */
|
||||
vd = vec_and(vd, vpixelmask);
|
||||
vd = vec_or(vd, vdstalpha);
|
||||
|
||||
/* *dstp = res */
|
||||
vec_st((vector unsigned int)vd, 0, dstp);
|
||||
|
||||
srcp += 4;
|
||||
dstp += 4;
|
||||
width -= 4;
|
||||
vs = voverflow;
|
||||
}
|
||||
ONE_PIXEL_BLEND((extrawidth), extrawidth);
|
||||
}
|
||||
srcp += srcskip;
|
||||
dstp += dstskip;
|
||||
}
|
||||
#undef ONE_PIXEL_BLEND
|
||||
}
|
||||
|
||||
/*
 * AltiVec blitter: 32-bit source with per-surface alpha (no colorkey),
 * blended onto a 32-bit destination of arbitrary channel layout.
 *
 * info supplies the pixel pointers, the per-row skips, the blit dimensions
 * and the source/destination pixel formats.  Each row is processed in three
 * phases:
 *   1. scalar pixels until dstp is 16-byte aligned,
 *   2. four pixels per iteration with AltiVec,
 *   3. scalar pixels for the (width % 4) tail.
 */
static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
{
    /* XXX : 6 */
    unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;    /* skip expressed in Uint32 units */
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;    /* skip expressed in Uint32 units */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned sA = srcfmt->alpha;
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned short v1;
    vector unsigned short v8;

    mergePermute = VEC_MERGE_PERMUTE();
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    vsrcPermute = calc_swizzle32(srcfmt, NULL);     /* source layout -> ARGB */
    vdstPermute = calc_swizzle32(NULL, dstfmt);     /* ARGB -> dest layout   */
    vsdstPermute = calc_swizzle32(dstfmt, NULL);    /* dest layout -> ARGB   */

    /* splat the surface alpha into every byte of valpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    vbits = (vector unsigned char)vec_splat_s8(-1);

    while(height--) {
        int width = info->d_width;
        /* scalar fallback: blend one pixel and advance both pointers */
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, pixel, sR, sG, sB); \
            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \
            ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        /* phase 1: get dstp onto a 16-byte boundary */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            /*
             * Use VEC_ALIGNER like the sibling blitters.  vec_ld() ignores
             * the low four address bits, so for a 16-byte aligned srcp
             * vec_ld(15, srcp) returns the same quadword as vec_ld(0, srcp)
             * and the permute has to pick the *second* operand.  A bare
             * vec_lvsl(0, srcp) picks the first one, which from the second
             * iteration onwards is the previous quadword (vs = voverflow
             * below), so already-blended source pixels would be used again.
             */
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            /* phase 2: four pixels per iteration */
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;

                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                /* swizzle the source pixels; lanes not mapped from vs are
                   taken from valpha */
                vs = vec_perm(vs, valpha, vsrcPermute);

                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, vd, vsdstPermute);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);
                /* permute to dest format; lanes not mapped from vd are
                   taken from vbits (all ones) */
                vd = vec_perm(vd, vbits, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);

                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;
            }
            /* phase 3: leftover pixels */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND

        srcp += srcskip;
        dstp += dstskip;
    }

}
|
||||
|
||||
|
||||
/* fast RGB888->(A)RGB888 blending */
|
||||
static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
|
||||
{
|
||||
unsigned alpha = info->src->alpha;
|
||||
int height = info->d_height;
|
||||
Uint32 *srcp = (Uint32 *)info->s_pixels;
|
||||
int srcskip = info->s_skip >> 2;
|
||||
Uint32 *dstp = (Uint32 *)info->d_pixels;
|
||||
int dstskip = info->d_skip >> 2;
|
||||
vector unsigned char mergePermute;
|
||||
vector unsigned char valpha;
|
||||
vector unsigned char valphamask;
|
||||
vector unsigned short v1;
|
||||
vector unsigned short v8;
|
||||
|
||||
mergePermute = VEC_MERGE_PERMUTE();
|
||||
v1 = vec_splat_u16(1);
|
||||
v8 = vec_splat_u16(8);
|
||||
|
||||
/* set the alpha to 255 on the destination surf */
|
||||
valphamask = VEC_ALPHA_MASK();
|
||||
|
||||
/* set a vector full of alpha and 255-alpha */
|
||||
((unsigned char *)&valpha)[0] = alpha;
|
||||
valpha = vec_splat(valpha, 0);
|
||||
|
||||
while(height--) {
|
||||
int width = info->d_width;
|
||||
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
|
||||
Uint32 s = *srcp; \
|
||||
Uint32 d = *dstp; \
|
||||
Uint32 s1 = s & 0xff00ff; \
|
||||
Uint32 d1 = d & 0xff00ff; \
|
||||
d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
|
||||
& 0xff00ff; \
|
||||
s &= 0xff00; \
|
||||
d &= 0xff00; \
|
||||
d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
|
||||
*dstp = d1 | d | 0xff000000; \
|
||||
++srcp; \
|
||||
++dstp; \
|
||||
widthvar--; \
|
||||
}
|
||||
ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
|
||||
if (width > 0) {
|
||||
int extrawidth = (width % 4);
|
||||
vector unsigned char valigner = VEC_ALIGNER(srcp);
|
||||
vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
|
||||
width -= extrawidth;
|
||||
while (width) {
|
||||
vector unsigned char voverflow;
|
||||
vector unsigned char vd;
|
||||
|
||||
/* s = *srcp */
|
||||
voverflow = (vector unsigned char)vec_ld(15, srcp);
|
||||
vs = vec_perm(vs, voverflow, valigner);
|
||||
|
||||
/* d = *dstp */
|
||||
vd = (vector unsigned char)vec_ld(0, dstp);
|
||||
|
||||
VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
|
||||
|
||||
/* set the alpha channel to full on */
|
||||
vd = vec_or(vd, valphamask);
|
||||
|
||||
/* *dstp = res */
|
||||
vec_st((vector unsigned int)vd, 0, dstp);
|
||||
|
||||
srcp += 4;
|
||||
dstp += 4;
|
||||
width -= 4;
|
||||
vs = voverflow;
|
||||
}
|
||||
ONE_PIXEL_BLEND((extrawidth), extrawidth);
|
||||
}
|
||||
#undef ONE_PIXEL_BLEND
|
||||
|
||||
srcp += srcskip;
|
||||
dstp += dstskip;
|
||||
}
|
||||
}
|
||||
#endif /* USE_ALTIVEC_BLITTERS */
|
||||
|
||||
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
|
||||
static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
|
||||
{
|
||||
|
@ -1372,7 +2128,12 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
|
|||
if(df->BytesPerPixel == 1)
|
||||
return BlitNto1SurfaceAlphaKey;
|
||||
else
|
||||
return BlitNtoNSurfaceAlphaKey;
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 && SDL_HasAltiVec())
|
||||
return Blit32to32SurfaceAlphaKeyAltivec;
|
||||
else
|
||||
#endif
|
||||
return BlitNtoNSurfaceAlphaKey;
|
||||
} else {
|
||||
/* Per-surface alpha blits */
|
||||
switch(df->BytesPerPixel) {
|
||||
|
@ -1413,10 +2174,20 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
|
|||
if(SDL_HasMMX())
|
||||
return BlitRGBtoRGBSurfaceAlphaMMX;
|
||||
else
|
||||
#endif
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
if(SDL_HasAltiVec())
|
||||
return BlitRGBtoRGBSurfaceAlphaAltivec;
|
||||
else
|
||||
#endif
|
||||
return BlitRGBtoRGBSurfaceAlpha;
|
||||
}
|
||||
else
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
if((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
|
||||
return Blit32to32SurfaceAlphaAltivec;
|
||||
else
|
||||
#endif
|
||||
return BlitNtoNSurfaceAlpha;
|
||||
|
||||
case 3:
|
||||
|
@ -1431,6 +2202,13 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
|
|||
return BlitNto1PixelAlpha;
|
||||
|
||||
case 2:
|
||||
#ifdef USE_ALTIVEC_BLITTERS
		/* 32-bit source onto an RGB565 destination.  Like every other
		   AltiVec dispatch in this function, only hand out the vector
		   blitter when the CPU reports AltiVec at run time; otherwise
		   fall through to the generic choices below. */
		if(sf->BytesPerPixel == 4 &&
		   df->Gmask == 0x7e0 &&
		   df->Bmask == 0x1f &&
		   SDL_HasAltiVec())
		    return Blit32to565PixelAlphaAltivec;
		else
#endif
|
||||
if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
|
||||
&& sf->Gmask == 0xff00
|
||||
&& ((sf->Rmask == 0xff && df->Rmask == 0x1f)
|
||||
|
@ -1456,9 +2234,19 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
|
|||
if(SDL_HasMMX())
|
||||
return BlitRGBtoRGBPixelAlphaMMX;
|
||||
else
|
||||
#endif
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
if(SDL_HasAltiVec())
|
||||
return BlitRGBtoRGBPixelAlphaAltivec;
|
||||
else
|
||||
#endif
|
||||
return BlitRGBtoRGBPixelAlpha;
|
||||
}
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
|
||||
return Blit32to32PixelAlphaAltivec;
|
||||
else
|
||||
#endif
|
||||
return BlitNtoNPixelAlpha;
|
||||
|
||||
case 3:
|
||||
|
|
|
@ -35,6 +35,656 @@ static char rcsid =
|
|||
|
||||
/* Functions to blit from N-bit surfaces to other surfaces */
|
||||
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
#include <assert.h>
|
||||
#ifdef MACOSX
|
||||
#include <sys/sysctl.h>
|
||||
#include <stdlib.h>
|
||||
/*
 * Ask the kernel for the "hw.l3cachesize" sysctl value.
 * Returns the reported value, or 0 when the sysctl call fails.
 */
static size_t GetL3CacheSize( void )
{
    u_int64_t cache_bytes = 0;
    size_t value_len = sizeof( cache_bytes );

    if( sysctlbyname( "hw.l3cachesize", &cache_bytes, &value_len, NULL, 0 ) != 0 ) {
        return 0;
    }
    return cache_bytes;
}
|
||||
#else
|
||||
/*
 * Fallback when the cache size cannot be queried on this platform:
 * report a fixed 2 MiB (XXX: just a guess for a G4).
 */
static size_t GetL3CacheSize( void )
{
    enum { ASSUMED_L3_BYTES = 2 * 1024 * 1024 };

    return ASSUMED_L3_BYTES;
}
|
||||
#endif /* MACOSX */
|
||||
|
||||
/*
 * Low four address bits of x: nonzero when x is not on a 16-byte boundary.
 * The argument is parenthesized so that pointer expressions such as
 * UNALIGNED_PTR(p + 1) are evaluated before the cast.
 */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)

/*
 * Permute-vector literal that applies the byte selection (a,b,c,d) to each
 * of the four 32-bit lanes of a vector.
 */
#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \
                               ( 0x00+(a), 0x00+(b), 0x00+(c), 0x00+(d), \
                                 0x04+(a), 0x04+(b), 0x04+(c), 0x04+(d), \
                                 0x08+(a), 0x08+(b), 0x08+(c), 0x08+(d), \
                                 0x0C+(a), 0x0C+(b), 0x0C+(c), 0x0C+(d) )

/*
 * Pack four 8-bit channel values into a 32-bit pixel of format dstfmt.
 * Each channel is widened to Uint32 before shifting so that a shift of up
 * to 24 bits is never performed on a promoted signed int.
 */
#define MAKE8888(dstfmt, r, g, b, a)  \
    ( ((((Uint32)(r))<<(dstfmt)->Rshift)&(dstfmt)->Rmask) | \
      ((((Uint32)(g))<<(dstfmt)->Gshift)&(dstfmt)->Gmask) | \
      ((((Uint32)(b))<<(dstfmt)->Bshift)&(dstfmt)->Bmask) | \
      ((((Uint32)(a))<<(dstfmt)->Ashift)&(dstfmt)->Amask) )

/*
 * Data Stream Touch...Altivec cache prefetching.
 *
 *  Don't use this on a G5...however, the speed boost is very significant
 *   on a G4.
 */
#define DST_CHAN_SRC 1
#define DST_CHAN_DEST 2

/* macro to set DST control word value... */
#define DST_CTRL(size, count, stride) \
    (((size) << 24) | ((count) << 16) | (stride))

/*
 * Permute vector used to realign data loaded from src.
 * For an unaligned src this is vec_lvsl(0, src).  For a 16-byte aligned src
 * it is vec_lvsl(8, src) + 8, i.e. byte indices 16..31, which makes
 * vec_perm take all sixteen bytes from its second operand.
 */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, (src)) \
    : vec_add(vec_lvsl(8, (src)), vec_splat_u8(8)))
|
||||
|
||||
/* Calculate the permute vector used for 32->32 swizzling */
|
||||
static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
|
||||
const SDL_PixelFormat *dstfmt)
|
||||
{
|
||||
/*
|
||||
* We have to assume that the bits that aren't used by other
|
||||
* colors is alpha, and it's one complete byte, since some formats
|
||||
* leave alpha with a zero mask, but we should still swizzle the bits.
|
||||
*/
|
||||
/* ARGB */
|
||||
const static struct SDL_PixelFormat default_pixel_format = {
|
||||
NULL, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
16, 8, 0, 24,
|
||||
0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
|
||||
0, 0};
|
||||
if (!srcfmt) {
|
||||
srcfmt = &default_pixel_format;
|
||||
}
|
||||
if (!dstfmt) {
|
||||
dstfmt = &default_pixel_format;
|
||||
}
|
||||
vector unsigned char plus = (vector unsigned char)( 0x00, 0x00, 0x00, 0x00,
|
||||
0x04, 0x04, 0x04, 0x04,
|
||||
0x08, 0x08, 0x08, 0x08,
|
||||
0x0C, 0x0C, 0x0C, 0x0C );
|
||||
vector unsigned char vswiz;
|
||||
vector unsigned int srcvec;
|
||||
#define RESHIFT(X) (3 - ((X) >> 3))
|
||||
Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
|
||||
Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
|
||||
Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
|
||||
Uint32 amask;
|
||||
/* Use zero for alpha if either surface doesn't have alpha */
|
||||
if (dstfmt->Amask) {
|
||||
amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
|
||||
} else {
|
||||
amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
|
||||
}
|
||||
#undef RESHIFT
|
||||
((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask);
|
||||
vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
|
||||
return(vswiz);
|
||||
}
|
||||
|
||||
static void Blit_RGB888_RGB565(SDL_BlitInfo *info);
|
||||
/*
 * Convert 32-bit RGB pixels to 16-bit RGB565 using AltiVec.
 *
 * Every row is processed in three stages:
 *   1. single pixels until dst is 16-byte aligned,
 *   2. eight pixels per iteration with vector code,
 *   3. single pixels for the remaining (width % 8) pixels.
 *
 * Changes from the previous version, so that the vector stage produces the
 * same values as the scalar stages:
 *   - blue is masked with 0x001f instead of 0x003f; the wider mask let the
 *     lowest green bit of the 1555 intermediate leak into the 565 green LSB;
 *   - the red mask is built from vec_splat_u8(-8) so that it really is
 *     0xf800 (it used to be 0xf900; the extra bit duplicated a green bit
 *     that vgpixel already sets, so results are unaffected by this part).
 */
static void Blit_RGB888_RGB565Altivec(SDL_BlitInfo *info) {
    int height = info->d_height;
    Uint8 *src = (Uint8 *) info->s_pixels;
    int srcskip = info->s_skip;
    Uint8 *dst = (Uint8 *) info->d_pixels;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;
    vector unsigned char valpha = vec_splat_u8(0);
    /* reorder every source pixel into the default (ARGB) byte order */
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
    /* gathers byte 2 (green) of each of the eight pixels into the low byte
       of a 16-bit lane; the high byte is cleared by the vfc mask below */
    vector unsigned char vgmerge = (vector unsigned char)(
        0x00, 0x02, 0x00, 0x06,
        0x00, 0x0a, 0x00, 0x0e,
        0x00, 0x12, 0x00, 0x16,
        0x00, 0x1a, 0x00, 0x1e);
    vector unsigned short v1 = vec_splat_u16(1);
    vector unsigned short v3 = vec_splat_u16(3);
    vector unsigned short v1f = (vector unsigned short)(
        0x001f, 0x001f, 0x001f, 0x001f,
        0x001f, 0x001f, 0x001f, 0x001f);
    vector unsigned short vfc = (vector unsigned short)(
        0x00fc, 0x00fc, 0x00fc, 0x00fc,
        0x00fc, 0x00fc, 0x00fc, 0x00fc);
    /* 0xf8 in every byte, shifted up to leave 0xf800 in every 16-bit lane */
    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-8);
    vf800 = vec_sl(vf800, vec_splat_u16(8));

    while (height--) {
        vector unsigned char valigner;
        vector unsigned char voverflow;
        vector unsigned char vsrc;

        int width = info->d_width;
        int extrawidth;

        /* scalar conversion of one pixel at a time while `condition` holds */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 pixel; \
            unsigned sR, sG, sB, sA; \
            DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, pixel, \
                          sR, sG, sB, sA); \
            *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \
                                ((sG << 3) & 0x000007E0) | \
                                ((sB >> 3) & 0x0000001F)); \
            dst += 2; \
            src += 4; \
            widthvar--; \
        }

        /* do scalar until dst is aligned... */
        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);

        /* After all that work, here's the vector part! */
        extrawidth = (width % 8);  /* trailing pixels handled by scalar code */
        width -= extrawidth;
        vsrc = vec_ld(0, src);
        valigner = VEC_ALIGNER(src);

        while (width) {
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
            vector unsigned int vsrc1, vsrc2;
            vector unsigned char vdst;

            /* first four source pixels, realigned and swizzled */
            voverflow = vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc1 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute);
            src += 16;
            /* next four source pixels */
            vsrc = voverflow;
            voverflow = vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc2 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute);
            /* 1555 */
            vpixel = (vector unsigned short)vec_packpx(vsrc1, vsrc2);
            /* green: top six bits of the 8-bit channel, moved to bits 5-10 */
            vgpixel = (vector unsigned short)vec_perm(vsrc1, vsrc2, vgmerge);
            vgpixel = vec_and(vgpixel, vfc);
            vgpixel = vec_sl(vgpixel, v3);
            /* red: bits 10-14 of the 1555 value, moved to bits 11-15 */
            vrpixel = vec_sl(vpixel, v1);
            vrpixel = vec_and(vrpixel, vf800);
            /* blue: bits 0-4 of the 1555 value, unchanged */
            vbpixel = vec_and(vpixel, v1f);
            vdst = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
            /* 565 */
            vdst = vec_or(vdst, (vector unsigned char)vbpixel);
            vec_st(vdst, 0, dst);

            width -= 8;
            src += 16;
            dst += 16;
            vsrc = voverflow;
        }

        assert(width == 0);

        /* scalar code for the pixels left over after the vector loop */
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND

        src += srcskip;  /* move to next row, accounting for pitch. */
        dst += dstskip;
    }
}
|
||||
|
||||
/*
 * Convert 16-bit RGB565 pixels to a 32-bit format using AltiVec.
 *
 * Every row is processed in three stages:
 *   1. single pixels until dst is 16-byte aligned,
 *   2. eight pixels per iteration with vector code,
 *   3. single pixels for the remaining (width % 8) pixels.
 *
 * The alpha byte written is srcfmt->alpha when the destination has an alpha
 * mask and that value is nonzero, otherwise 0.
 *
 * Changes from the previous version, so that the vector stage produces the
 * same channel values as the scalar stages (R & 0xf8, G & 0xfc, B & 0xf8):
 *   - the red mask is built from vec_splat_u8(-8) and is now really 0xf800
 *     (it used to be 0xf900, letting a green bit into the red LSB);
 *   - green is masked with 0xfc00 after the shift (previously the two low
 *     bits of the green byte carried blue bits).
 */
static void Blit_RGB565_32Altivec(SDL_BlitInfo *info) {
    int height = info->d_height;
    Uint8 *src = (Uint8 *) info->s_pixels;
    int srcskip = info->s_skip;
    Uint8 *dst = (Uint8 *) info->d_pixels;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned alpha;
    vector unsigned char valpha;
    vector unsigned char vpermute;
    vector unsigned short vf800;
    vector unsigned short vfc00;
    vector unsigned int v8 = vec_splat_u32(8);
    vector unsigned int v16 = vec_add(v8, v8);
    vector unsigned short v2 = vec_splat_u16(2);
    vector unsigned short v3 = vec_splat_u16(3);
    /*
     * Step 1: build A R 0 0 for pixels 0-3.
     *   0x10        first byte of the alpha vector
     *   0x00..0x06  even bytes of vR hold red
     *   0x01        odd bytes of vR are zero
     */
    vector unsigned char vredalpha1 = (vector unsigned char)(
        0x10, 0x00, 0x01, 0x01,
        0x10, 0x02, 0x01, 0x01,
        0x10, 0x04, 0x01, 0x01,
        0x10, 0x06, 0x01, 0x01
    );
    /* same for pixels 4-7: add 8 to the red index in byte 1 of every word */
    vector unsigned char vredalpha2 = (vector unsigned char)(
        vec_add((vector unsigned int)vredalpha1, vec_sl(v8, v16))
    );
    /*
     * Step 2: keep A R x from the first operand and take blue from the
     * odd bytes (0x11, 0x13, ...) of vB.
     */
    vector unsigned char vblue1 = (vector unsigned char)(
        0x00, 0x01, 0x02, 0x11,
        0x04, 0x05, 0x06, 0x13,
        0x08, 0x09, 0x0a, 0x15,
        0x0c, 0x0d, 0x0e, 0x17
    );
    /* pixels 4-7: add 8 to the blue index in byte 3 of every word */
    vector unsigned char vblue2 = (vector unsigned char)(
        vec_add((vector unsigned int)vblue1, v8)
    );
    /*
     * Step 3: keep A R x B from the first operand and take green from the
     * even bytes (0x10, 0x12, ...) of vG.
     */
    vector unsigned char vgreen1 = (vector unsigned char)(
        0x00, 0x01, 0x10, 0x03,
        0x04, 0x05, 0x12, 0x07,
        0x08, 0x09, 0x14, 0x0b,
        0x0c, 0x0d, 0x16, 0x0f
    );
    /* pixels 4-7: add 8 to the green index in byte 2 of every word */
    vector unsigned char vgreen2 = (vector unsigned char)(
        vec_add((vector unsigned int)vgreen1, vec_sl(v8, v8))
    );

    assert(srcfmt->BytesPerPixel == 2);
    assert(dstfmt->BytesPerPixel == 4);

    /* 0xf8 in every byte, shifted up: 0xf800 in every 16-bit lane */
    vf800 = (vector unsigned short)vec_splat_u8(-8);
    vf800 = vec_sl(vf800, vec_splat_u16(8));
    /* 0xfc in every byte, shifted up: 0xfc00 in every 16-bit lane */
    vfc00 = (vector unsigned short)vec_splat_u8(-4);
    vfc00 = vec_sl(vfc00, vec_splat_u16(8));

    if (dstfmt->Amask && srcfmt->alpha) {
        ((unsigned char *)&valpha)[0] = alpha = srcfmt->alpha;
        valpha = vec_splat(valpha, 0);
    } else {
        alpha = 0;
        valpha = vec_splat_u8(0);
    }

    /* final swizzle from the default (ARGB) byte order to dstfmt */
    vpermute = calc_swizzle32(NULL, dstfmt);
    while (height--) {
        vector unsigned char valigner;
        vector unsigned char voverflow;
        vector unsigned char vsrc;

        int width = info->d_width;
        int extrawidth;

        /* scalar conversion of one pixel at a time while `condition` holds */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            unsigned sR, sG, sB; \
            unsigned short pixel = *((unsigned short *)src); \
            sR = (pixel >> 8) & 0xf8; \
            sG = (pixel >> 3) & 0xfc; \
            sB = (pixel << 3) & 0xf8; \
            ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
            src += 2; \
            dst += 4; \
            widthvar--; \
        }

        /* do scalar until dst is aligned... */
        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);

        /* After all that work, here's the vector part! */
        extrawidth = (width % 8);  /* trailing pixels handled by scalar code */
        width -= extrawidth;
        vsrc = vec_ld(0, src);
        valigner = VEC_ALIGNER(src);

        while (width) {
            vector unsigned short vR, vG, vB;
            vector unsigned char vdst1, vdst2;

            voverflow = vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);

            /* red in the high byte, blue in the low byte, green in the high byte */
            vR = vec_and((vector unsigned short)vsrc, vf800);
            vB = vec_sl((vector unsigned short)vsrc, v3);
            vG = vec_and(vec_sl(vB, v2), vfc00);

            /* pixels 0-3 */
            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha1);
            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
            vdst1 = vec_perm(vdst1, valpha, vpermute);
            vec_st(vdst1, 0, dst);

            /* pixels 4-7 */
            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha2);
            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
            vdst2 = vec_perm(vdst2, valpha, vpermute);
            vec_st(vdst2, 16, dst);

            width -= 8;
            dst += 32;
            src += 16;
            vsrc = voverflow;
        }

        assert(width == 0);

        /* scalar code for the pixels left over after the vector loop */
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND

        src += srcskip;  /* move to next row, accounting for pitch. */
        dst += dstskip;
    }
}
|
||||
|
||||
static void BlitNtoNKey(SDL_BlitInfo *info);
|
||||
static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo *info);
|
||||
/*
 * Colorkey blit between two 32-bit surfaces using AltiVec.
 *
 * Source pixels whose R, G and B bits equal the source colorkey are skipped;
 * all others are swizzled to the destination format and stored.  Alpha is
 * copied when both formats have an alpha mask; otherwise srcfmt->alpha is
 * written when the destination has an alpha mask, else 0.
 *
 * Rows narrower than 16 pixels are handed to the scalar colorkey blitters.
 *
 * Changes from the previous version:
 *   - the vector compare now uses the RGB-masked source; the masked value
 *     was computed but then discarded, so pixels with any bit set outside
 *     the RGB masks never matched the key;
 *   - the scalar no-alpha path masks the pixel before comparing, matching
 *     the vector path and the copy-alpha path;
 *   - pointer advances no longer rely on the cast-as-lvalue extension;
 *   - no `return <void expression>;` in this void function;
 *   - the row advance no longer depends on the `width > 0` guard.
 */
static void Blit32to32KeyAltivec(SDL_BlitInfo *info)
{
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *) info->s_pixels;
    int srcskip = info->s_skip;
    Uint32 *dstp = (Uint32 *) info->d_pixels;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;
    int srcbpp = srcfmt->BytesPerPixel;
    SDL_PixelFormat *dstfmt = info->dst;
    int dstbpp = dstfmt->BytesPerPixel;
    int copy_alpha = (srcfmt->Amask && dstfmt->Amask);
    unsigned alpha = dstfmt->Amask ? srcfmt->alpha : 0;
    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    Uint32 ckey = info->src->colorkey;
    vector unsigned int valpha;
    vector unsigned char vpermute;
    vector unsigned char vzero;
    vector unsigned int vckey;
    vector unsigned int vrgbmask;

    if (info->d_width < 16) {
        if (copy_alpha) {
            BlitNtoNKeyCopyAlpha(info);
        } else {
            BlitNtoNKey(info);
        }
        return;
    }

    vpermute = calc_swizzle32(srcfmt, dstfmt);
    vzero = vec_splat_u8(0);
    if (alpha) {
        /* replicate the alpha byte into every byte of valpha */
        ((unsigned char *)&valpha)[0] = (unsigned char)alpha;
        valpha = (vector unsigned int)vec_splat((vector unsigned char)valpha, 0);
    } else {
        valpha = (vector unsigned int)vzero;
    }
    /* the key and the RGB mask, replicated into all four lanes */
    ckey &= rgbmask;
    ((unsigned int *)&vckey)[0] = ckey;
    vckey = vec_splat(vckey, 0);
    ((unsigned int *)&vrgbmask)[0] = rgbmask;
    vrgbmask = vec_splat(vrgbmask, 0);

    while (height--) {
        int width = info->d_width;

        /* scalar colorkey copy of one pixel at a time while `condition` holds */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        if (copy_alpha) { \
            while (condition) { \
                Uint32 pixel; \
                unsigned sR, sG, sB, sA; \
                DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, pixel, \
                              sR, sG, sB, sA); \
                if ( (pixel & rgbmask) != ckey ) { \
                    ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
                                  sR, sG, sB, sA); \
                } \
                dstp = (Uint32 *)(((Uint8 *)dstp) + dstbpp); \
                srcp = (Uint32 *)(((Uint8 *)srcp) + srcbpp); \
                widthvar--; \
            } \
        } else { \
            while (condition) { \
                Uint32 pixel; \
                unsigned sR, sG, sB; \
                RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, pixel); \
                if ( (pixel & rgbmask) != ckey ) { \
                    RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \
                    ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
                                  sR, sG, sB, alpha); \
                } \
                dstp = (Uint32 *)(((Uint8 *)dstp) + dstbpp); \
                srcp = (Uint32 *)(((Uint8 *)srcp) + srcbpp); \
                widthvar--; \
            } \
        }

        /* scalar until dstp is 16-byte aligned */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        assert(width > 0);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned int vs = vec_ld(0, srcp);
            width -= extrawidth;
            assert(width >= 4);
            while (width) {
                vector unsigned char vsel;
                vector unsigned int vd;
                vector unsigned int voverflow = vec_ld(15, srcp);
                /* load the source vec */
                vs = vec_perm(vs, voverflow, valigner);
                /* vsel is set for items whose RGB bits match the key */
                vsel = (vector unsigned char)vec_and(vs, vrgbmask);
                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
                /* permute the src vec to the dest format */
                vs = vec_perm(vs, valpha, vpermute);
                /* load the destination vec */
                vd = vec_ld(0, dstp);
                /* keep the destination where the key matched, source elsewhere */
                vd = (vector unsigned int)vec_sel((vector unsigned char)vs, (vector unsigned char)vd, vsel);

                vec_st(vd, 0, dstp);
                srcp += 4;
                width -= 4;
                dstp += 4;
                vs = voverflow;
            }
            /* scalar code for the (width % 4) pixels left in the row */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND
        /* move to next row, accounting for pitch (skips are in bytes) */
        srcp += srcskip >> 2;
        dstp += dstskip >> 2;
    }
}
|
||||
|
||||
/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
|
||||
/* Use this on a G5 */
|
||||
/*
 * 32-bit to 32-bit format conversion without data-stream prefetch hints.
 * Per row: scalar pixels until dst is 16-byte aligned, four pixels per
 * vector iteration, then scalar code for the (width % 4) leftover pixels.
 */
static void ConvertAltivec32to32_noprefetch(SDL_BlitInfo *info)
{
    Uint32 *src = (Uint32 *) info->s_pixels;
    Uint32 *dst = (Uint32 *) info->d_pixels;
    int srcskip = info->s_skip;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    int srcbpp = srcfmt->BytesPerPixel;
    int dstbpp = dstfmt->BytesPerPixel;
    /* second vec_perm operand: supplies the bytes the source does not provide */
    vector unsigned int vfill = vec_splat_u32(0);
    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
    int rows;

    /* destination wants alpha the source lacks: fill with the surface alpha */
    if (dstfmt->Amask && !srcfmt->Amask && srcfmt->alpha) {
        vector unsigned char valpha;
        ((unsigned char *)&valpha)[0] = srcfmt->alpha;
        vfill = (vector unsigned int)vec_splat(valpha, 0);
    }

    assert(srcbpp == 4);
    assert(dstbpp == 4);

    for (rows = info->d_height; rows--; ) {
        vector unsigned char valigner;
        vector unsigned int vbits;
        vector unsigned int voverflow;
        Uint32 bits;
        Uint8 r, g, b, a;
        int width = info->d_width;
        int extrawidth;

        /* one pixel at a time until dst sits on a 16-byte boundary */
        for (; (UNALIGNED_PTR(dst)) && (width); width--) {
            bits = *(src++);
            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
        }

        /* four pixels per iteration */
        extrawidth = (width % 4);
        width -= extrawidth;
        valigner = VEC_ALIGNER(src);
        vbits = vec_ld(0, src);

        for (; width; width -= 4, src += 4, dst += 4) {
            voverflow = vec_ld(15, src);
            vbits = vec_perm(vbits, voverflow, valigner);  /* realigned source */
            vbits = vec_perm(vbits, vfill, vpermute);      /* swizzled */
            vec_st(vbits, 0, dst);
            vbits = voverflow;
        }

        assert(width == 0);

        /* the at most 3 pixels at the end of the row that do not fill a vector */
        for (; extrawidth; extrawidth--) {
            bits = *(src++);
            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
        }

        /* move to next row, accounting for pitch (skips are in bytes) */
        src += srcskip >> 2;
        dst += dstskip >> 2;
    }
}
|
||||
|
||||
/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
|
||||
/* Use this on a G4 */
|
||||
/*
 * 32-bit to 32-bit format conversion that issues data-stream touch hints
 * (vec_dstt / vec_dstst) ahead of the source and destination pointers.
 * Per row: scalar pixels until dst is 16-byte aligned, four pixels per
 * vector iteration, then scalar code for the (width % 4) leftover pixels.
 * Both data streams are stopped before returning.
 */
static void ConvertAltivec32to32_prefetch(SDL_BlitInfo *info)
{
    /* how far ahead of src/dst (in Uint32 elements) the hints point */
    const int scalar_dst_lead = sizeof (Uint32) * 4;
    const int vector_dst_lead = sizeof (Uint32) * 16;

    Uint32 *src = (Uint32 *) info->s_pixels;
    Uint32 *dst = (Uint32 *) info->d_pixels;
    int srcskip = info->s_skip;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    int srcbpp = srcfmt->BytesPerPixel;
    int dstbpp = dstfmt->BytesPerPixel;
    /* second vec_perm operand: supplies the bytes the source does not provide */
    vector unsigned int vfill = vec_splat_u32(0);
    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
    int rows;

    /* destination wants alpha the source lacks: fill with the surface alpha */
    if (dstfmt->Amask && !srcfmt->Amask && srcfmt->alpha) {
        vector unsigned char valpha;
        ((unsigned char *)&valpha)[0] = srcfmt->alpha;
        vfill = (vector unsigned int)vec_splat(valpha, 0);
    }

    assert(srcbpp == 4);
    assert(dstbpp == 4);

    for (rows = info->d_height; rows--; ) {
        vector unsigned char valigner;
        vector unsigned int vbits;
        vector unsigned int voverflow;
        Uint32 bits;
        Uint8 r, g, b, a;
        int width = info->d_width;
        int extrawidth;

        /* one pixel at a time until dst sits on a 16-byte boundary */
        for (; (UNALIGNED_PTR(dst)) && (width); width--) {
            vec_dstt(src+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC);
            vec_dstst(dst+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST);
            bits = *(src++);
            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
        }

        /* four pixels per iteration */
        extrawidth = (width % 4);
        width -= extrawidth;
        valigner = VEC_ALIGNER(src);
        vbits = vec_ld(0, src);

        for (; width; width -= 4, src += 4, dst += 4) {
            vec_dstt(src+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC);
            vec_dstst(dst+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST);
            voverflow = vec_ld(15, src);
            vbits = vec_perm(vbits, voverflow, valigner);  /* realigned source */
            vbits = vec_perm(vbits, vfill, vpermute);      /* swizzled */
            vec_st(vbits, 0, dst);
            vbits = voverflow;
        }

        assert(width == 0);

        /* the at most 3 leftover pixels; no prefetch hints for these */
        for (; extrawidth; extrawidth--) {
            bits = *(src++);
            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
        }

        /* move to next row, accounting for pitch (skips are in bytes) */
        src += srcskip >> 2;
        dst += dstskip >> 2;
    }

    /* stop both data streams */
    vec_dss(DST_CHAN_SRC);
    vec_dss(DST_CHAN_DEST);
}
|
||||
|
||||
/*
 * Return the bitmask of blitter features, computing it on first use.
 *   1 = has-MMX, 2 = has-AltiVec, 4 = dont-use-prefetch
 * The environment variable SDL_ALTIVEC_BLIT_FEATURES, when set, is parsed
 * as an unsigned decimal number and replaces the detected value.
 */
static Uint32 GetBlitFeatures( void )
{
    /* 0xffffffff marks "not computed yet" */
    static Uint32 features = 0xffffffff;
    char *override;

    if (features != 0xffffffff) {
        return features;
    }

    /* Provide an override for testing .. */
    override = getenv("SDL_ALTIVEC_BLIT_FEATURES");
    if (override) {
        features = 0;
        sscanf(override, "%u", &features);
    } else {
        Uint32 detected = 0;

        if (SDL_HasMMX()) {
            detected |= 1;
        }
        if (SDL_HasAltiVec()) {
            detected |= 2;
        }
        /* a reported L3 size of zero selects the no-prefetch variant */
        if (GetL3CacheSize() == 0) {
            detected |= 4;
        }
        features = detected;
    }
    return features;
}
|
||||
#else
|
||||
/* Feature 1 is has-MMX */
|
||||
#define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
|
||||
#endif
|
||||
|
||||
#ifdef USE_ASMBLIT
|
||||
|
||||
/* Heheheh, we coerce Hermes into using SDL blit information */
|
||||
|
@ -406,11 +1056,7 @@ static void Blit_RGB888_RGB565(SDL_BlitInfo *info)
|
|||
|
||||
|
||||
/* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */
|
||||
#if ( SDL_BYTEORDER == SDL_LIL_ENDIAN )
|
||||
#define RGB565_32(dst, src, map) (map[src[0]*2] + map[src[1]*2+1])
|
||||
#else /* ( SDL_BYTEORDER == SDL_BIG_ENDIAN ) */
|
||||
#define RGB565_32(dst, src, map) (map[src[1]*2] + map[src[0]*2+1])
|
||||
#endif
|
||||
#define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1])
|
||||
static void Blit_RGB565_32(SDL_BlitInfo *info, const Uint32 *map)
|
||||
{
|
||||
#ifndef USE_DUFFS_LOOP
|
||||
|
@ -1422,10 +2068,10 @@ struct blit_table {
|
|||
Uint32 srcR, srcG, srcB;
|
||||
int dstbpp;
|
||||
Uint32 dstR, dstG, dstB;
|
||||
SDL_bool cpu_mmx;
|
||||
Uint32 blit_features;
|
||||
void *aux_data;
|
||||
SDL_loblit blitfunc;
|
||||
enum { NO_ALPHA, SET_ALPHA, COPY_ALPHA } alpha;
|
||||
enum { NO_ALPHA=1, SET_ALPHA=2, COPY_ALPHA=4 } alpha;
|
||||
};
|
||||
static const struct blit_table normal_blit_1[] = {
|
||||
/* Default for 8-bit RGB source, an invalid combination */
|
||||
|
@ -1439,6 +2085,11 @@ static const struct blit_table normal_blit_2[] = {
|
|||
0, ConvertX86p16_16RGB555, ConvertX86, NO_ALPHA },
|
||||
{ 0x0000F800,0x000007E0,0x0000001F, 2, 0x0000001F,0x000003E0,0x00007C00,
|
||||
0, ConvertX86p16_16BGR555, ConvertX86, NO_ALPHA },
|
||||
#endif
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
/* has-altivec */
|
||||
{ 0x0000F800,0x000007E0,0x0000001F, 4, 0x00000000,0x00000000,0x00000000,
|
||||
2, NULL, Blit_RGB565_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
|
||||
#endif
|
||||
{ 0x0000F800,0x000007E0,0x0000001F, 4, 0x00FF0000,0x0000FF00,0x000000FF,
|
||||
0, NULL, Blit_RGB565_ARGB8888, SET_ALPHA },
|
||||
|
@ -1485,6 +2136,17 @@ static const struct blit_table normal_blit_4[] = {
|
|||
{ 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x0000FF00,0x00FF0000,0xFF000000,
|
||||
0, ConvertX86p32_32BGRA888, ConvertX86, NO_ALPHA },
|
||||
#else
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
/* has-altivec | dont-use-prefetch */
|
||||
{ 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000,
|
||||
6, NULL, ConvertAltivec32to32_noprefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
|
||||
/* has-altivec */
|
||||
{ 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000,
|
||||
2, NULL, ConvertAltivec32to32_prefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
|
||||
/* has-altivec */
|
||||
{ 0x00000000,0x00000000,0x00000000, 2, 0x0000F800,0x000007E0,0x0000001F,
|
||||
2, NULL, Blit_RGB888_RGB565Altivec, NO_ALPHA },
|
||||
#endif
|
||||
{ 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x0000F800,0x000007E0,0x0000001F,
|
||||
0, NULL, Blit_RGB888_RGB565, NO_ALPHA },
|
||||
{ 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F,
|
||||
|
@ -1497,6 +2159,9 @@ static const struct blit_table *normal_blit[] = {
|
|||
normal_blit_1, normal_blit_2, normal_blit_3, normal_blit_4
|
||||
};
|
||||
|
||||
/* True unless the table mask y is nonzero and differs from the surface mask x */
#define MASKOK(x, y) (!(((x) != (y)) && ((y) != 0x00000000)))
|
||||
|
||||
SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index)
|
||||
{
|
||||
struct private_swaccel *sdata;
|
||||
|
@ -1532,6 +2197,12 @@ SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index)
|
|||
else if(dstfmt->BytesPerPixel == 1)
|
||||
return BlitNto1Key;
|
||||
else {
|
||||
#ifdef USE_ALTIVEC_BLITTERS
|
||||
if((srcfmt->BytesPerPixel == 4) && (dstfmt->BytesPerPixel == 4) && SDL_HasAltiVec()) {
|
||||
return Blit32to32KeyAltivec;
|
||||
} else
|
||||
#endif
|
||||
|
||||
if(srcfmt->Amask && dstfmt->Amask)
|
||||
return BlitNtoNKeyCopyAlpha;
|
||||
else
|
||||
|
@ -1561,20 +2232,20 @@ SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index)
|
|||
}
|
||||
} else {
|
||||
/* Now the meat, choose the blitter we want */
|
||||
int a_need = 0;
|
||||
int a_need = 0;
|
||||
if(dstfmt->Amask)
|
||||
a_need = srcfmt->Amask ? COPY_ALPHA : SET_ALPHA;
|
||||
table = normal_blit[srcfmt->BytesPerPixel-1];
|
||||
for ( which=0; table[which].srcR; ++which ) {
|
||||
if ( srcfmt->Rmask == table[which].srcR &&
|
||||
srcfmt->Gmask == table[which].srcG &&
|
||||
srcfmt->Bmask == table[which].srcB &&
|
||||
dstfmt->BytesPerPixel == table[which].dstbpp &&
|
||||
dstfmt->Rmask == table[which].dstR &&
|
||||
dstfmt->Gmask == table[which].dstG &&
|
||||
dstfmt->Bmask == table[which].dstB &&
|
||||
(a_need & table[which].alpha) == a_need &&
|
||||
(table[which].cpu_mmx == SDL_HasMMX()))
|
||||
for ( which=0; table[which].dstbpp; ++which ) {
|
||||
if ( MASKOK(srcfmt->Rmask, table[which].srcR) &&
|
||||
MASKOK(srcfmt->Gmask, table[which].srcG) &&
|
||||
MASKOK(srcfmt->Bmask, table[which].srcB) &&
|
||||
MASKOK(dstfmt->Rmask, table[which].dstR) &&
|
||||
MASKOK(dstfmt->Gmask, table[which].dstG) &&
|
||||
MASKOK(dstfmt->Bmask, table[which].dstB) &&
|
||||
dstfmt->BytesPerPixel == table[which].dstbpp &&
|
||||
(a_need & table[which].alpha) == a_need &&
|
||||
((table[which].blit_features & GetBlitFeatures()) == table[which].blit_features) )
|
||||
break;
|
||||
}
|
||||
sdata->aux_data = table[which].aux_data;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue