diff options
author | Ian Romanick <idr@us.ibm.com> | 2004-11-02 18:25:45 +0000 |
---|---|---|
committer | Ian Romanick <idr@us.ibm.com> | 2004-11-02 18:25:45 +0000 |
commit | bdd53efe8302e85fd1be4ceda0aa576e0119b14e (patch) | |
tree | 6d53a3f0f23fd2438f83a67b93fea28ac6845427 /src | |
parent | 2302cc1a25f7ab55b7e7d6647175308cd64ab7f1 (diff) |
Added MMX optimized version of the RGB565 ReadRGBASpan routine.
Diffstat (limited to 'src')
-rw-r--r-- | src/mesa/drivers/dri/common/spantmp2.h | 26 | ||||
-rw-r--r-- | src/mesa/x86/read_rgba_span_x86.S | 223 | ||||
-rw-r--r-- | src/mesa/x86/read_rgba_span_x86.h | 3 |
3 files changed, 245 insertions, 7 deletions
diff --git a/src/mesa/drivers/dri/common/spantmp2.h b/src/mesa/drivers/dri/common/spantmp2.h index 5a161b11dd..ce48257836 100644 --- a/src/mesa/drivers/dri/common/spantmp2.h +++ b/src/mesa/drivers/dri/common/spantmp2.h @@ -377,8 +377,10 @@ static void TAG(ReadRGBASpan)( const GLcontext *ctx, #if defined(USE_MMX_ASM) && \ - (SPANTMP_PIXEL_FMT == GL_BGRA) && \ - (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV) + (((SPANTMP_PIXEL_FMT == GL_BGRA) && \ + (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)) || \ + ((SPANTMP_PIXEL_FMT == GL_RGB) && \ + (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5))) static void TAG2(ReadRGBASpan,_MMX)( const GLcontext *ctx, GLuint n, GLint x, GLint y, GLubyte rgba[][4]) @@ -406,7 +408,12 @@ static void TAG2(ReadRGBASpan,_MMX)( const GLcontext *ctx, { const char * src = GET_SRC_PTR( x1, y ); +#if (SPANTMP_PIXEL_FMT == GL_RGB) && \ + (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5) + _generic_read_RGBA_span_RGB565_MMX( src, rgba[i], n1 ); +#else _generic_read_RGBA_span_BGRA8888_REV_MMX( src, rgba[i], n1 ); +#endif } } HW_ENDCLIPLOOP(); @@ -539,30 +546,35 @@ static void TAG(InitPointers)(struct swrast_device_driver *swdd) swdd->WriteMonoRGBAPixels = TAG(WriteMonoRGBAPixels); swdd->ReadRGBAPixels = TAG(ReadRGBAPixels); -#if (SPANTMP_PIXEL_FMT == GL_BGRA) && \ +#if defined(USE_SSE_ASM) && \ + (SPANTMP_PIXEL_FMT == GL_BGRA) && \ (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV) -#if defined(USE_SSE_ASM) if ( cpu_has_xmm2 ) { if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "SSE2" ); swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _SSE2); } else #endif -#if defined(USE_SSE_ASM) +#if defined(USE_SSE_ASM) && \ + (SPANTMP_PIXEL_FMT == GL_BGRA) && \ + (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV) if ( cpu_has_xmm ) { if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "SSE" ); swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _SSE); } else #endif -#if defined(USE_MMX_ASM) +#if defined(USE_MMX_ASM) && \ + (((SPANTMP_PIXEL_FMT == 
GL_BGRA) && \ + (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)) || \ + ((SPANTMP_PIXEL_FMT == GL_RGB) && \ + (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5))) if ( cpu_has_mmx ) { if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "MMX" ); swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _MMX); } else #endif -#endif { if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "C" ); swdd->ReadRGBASpan = TAG(ReadRGBASpan); diff --git a/src/mesa/x86/read_rgba_span_x86.S b/src/mesa/x86/read_rgba_span_x86.S index e637f22da3..06bdc6d264 100644 --- a/src/mesa/x86/read_rgba_span_x86.S +++ b/src/mesa/x86/read_rgba_span_x86.S @@ -451,3 +451,226 @@ _generic_read_RGBA_span_BGRA8888_REV_SSE2: popl %esi ret .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2 + + + + .section .rodata + + .align 16 +mask_565: + .word 0xf800 + .word 0x07e0 + .word 0x001f + .word 0x0000 + +/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C + * implementation in Mesa. Setting SCALE_ADJUST to 0 is slightly faster but + * at a small cost to accuracy. + */ + +#define SCALE_ADJUST 5 +#if SCALE_ADJUST == 5 +prescale: + .word 0x0001 + .word 0x0010 + .word 0x0200 + .word 0x0000 + +scale: + .word 0x20e8 /* (0x00ff0000 / 0x000007c0) + 1 */ + .word 0x40c5 /* (0x00ff0000 / 0x000003f0) + 1 */ + .word 0x839d /* (0x00ff0000 / 0x000001f0) + 1 */ + .word 0x0000 +#elif SCALE_ADJUST == 0 +prescale: + .word 0x0001 + .word 0x0020 + .word 0x0800 + .word 0x0000 + +scale: + .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */ + .word 0x0104 /* (0x00ff0000 / 0x0000fc00) + 1 */ + .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */ + .word 0x0000 +#else +#error SCALE_ADJUST must either be 5 or 0. +#endif + + +alpha: .long 0x00000000 + .long 0x00ff0000 + +/** + * MMX optimized version of the RGB565 to RGBA copy routine. 
+ */ + + .text + .globl _generic_read_RGBA_span_RGB565_MMX + .type _generic_read_RGBA_span_RGB565_MMX, @function + +_generic_read_RGBA_span_RGB565_MMX: + +#ifdef USE_INNER_EMMS + emms +#endif + + movl 4(%esp), %eax /* source pointer */ + movl 8(%esp), %edx /* destination pointer */ + movl 12(%esp), %ecx /* number of pixels to copy */ + + movq mask_565, %mm5 + movq prescale, %mm6 + movq scale, %mm7 + + shrl $2, %ecx + jmp .L02 + +.L03: + /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and + * second pixels into the four words of %mm0 and %mm2. + */ + + movq (%eax), %mm4 + addl $8, %eax + + pshufw $0x00, %mm4, %mm0 + pshufw $0x55, %mm4, %mm2 + + + /* Mask the pixels so that each word of each register contains only + * one color component. + */ + + pand %mm5, %mm0 + pand %mm5, %mm2 + + + /* Adjust the component values so that they are as small as possible, + * but large enough so that we can multiply them by an unsigned 16-bit + * number and get a value as large as 0x00ff0000. + */ + + pmullw %mm6, %mm0 + pmullw %mm6, %mm2 +#if SCALE_ADJUST > 0 + psrlw $SCALE_ADJUST, %mm0 + psrlw $SCALE_ADJUST, %mm2 +#endif + + /* Scale the input component values to be on the range + * [0, 0x00ff0000]. This is the real magic of the whole routine. + */ + + pmulhuw %mm7, %mm0 + pmulhuw %mm7, %mm2 + + + /* Always set the alpha value to 0xff. + */ + + por alpha, %mm0 + por alpha, %mm2 + + + /* Pack the 16-bit values to 8-bit values and store the converted + * pixel data. 
+ */ + + packuswb %mm2, %mm0 + movq %mm0, (%edx) + addl $8, %edx + + + + pshufw $0xaa, %mm4, %mm0 + pshufw $0xff, %mm4, %mm2 + + pand %mm5, %mm0 + pand %mm5, %mm2 + pmullw %mm6, %mm0 + pmullw %mm6, %mm2 +#if SCALE_ADJUST > 0 + psrlw $SCALE_ADJUST, %mm0 + psrlw $SCALE_ADJUST, %mm2 +#endif + pmulhuw %mm7, %mm0 + pmulhuw %mm7, %mm2 + + por alpha, %mm0 + por alpha, %mm2 + + packuswb %mm2, %mm0 + + movq %mm0, (%edx) + addl $8, %edx + + subl $1, %ecx +.L02: + jne .L03 + + + /* At this point there can be at most 3 pixels left to process. If + * there are either 2 or 3 left, process 2. + */ + + movl 12(%esp), %ecx + testl $0x02, %ecx + je .L04 + + movd (%eax), %mm4 + addl $4, %eax + + pshufw $0x00, %mm4, %mm0 + pshufw $0x55, %mm4, %mm2 + + pand %mm5, %mm0 + pand %mm5, %mm2 + pmullw %mm6, %mm0 + pmullw %mm6, %mm2 +#if SCALE_ADJUST > 0 + psrlw $SCALE_ADJUST, %mm0 + psrlw $SCALE_ADJUST, %mm2 +#endif + pmulhuw %mm7, %mm0 + pmulhuw %mm7, %mm2 + + por alpha, %mm0 + por alpha, %mm2 + + packuswb %mm2, %mm0 + + movq %mm0, (%edx) + addl $8, %edx + +.L04: + /* At this point there can be at most 1 pixel left to process. + * Process it if needed. 
+ */ + + testl $0x01, %ecx + je .L01 + + movzxw (%eax), %ecx + movd %ecx, %mm4 + + pshufw $0x00, %mm4, %mm0 + + pand %mm5, %mm0 + pmullw %mm6, %mm0 +#if SCALE_ADJUST > 0 + psrlw $SCALE_ADJUST, %mm0 +#endif + pmulhuw %mm7, %mm0 + + por alpha, %mm0 + + packuswb %mm0, %mm0 + + movd %mm0, (%edx) + +.L01: +#ifdef USE_INNER_EMMS + emms +#endif + ret diff --git a/src/mesa/x86/read_rgba_span_x86.h b/src/mesa/x86/read_rgba_span_x86.h index 99dd0e365d..564b1bb0f9 100644 --- a/src/mesa/x86/read_rgba_span_x86.h +++ b/src/mesa/x86/read_rgba_span_x86.h @@ -48,6 +48,9 @@ extern void _generic_read_RGBA_span_BGRA8888_REV_SSE( const unsigned char *, #if defined(USE_MMX_ASM) extern void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *, unsigned char *, unsigned ); + +extern void _generic_read_RGBA_span_RGB565_MMX( const unsigned char *, + unsigned char *, unsigned ); #endif #endif /* READ_RGBA_SPAN_X86_H */ |