From c0dd02219d47f45ce469abbef8044431f6d85d0a Mon Sep 17 00:00:00 2001 From: michal Date: Mon, 24 Sep 2007 12:32:26 +0100 Subject: Enable SSE2 for FS. --- src/mesa/pipe/softpipe/sp_quad_fs.c | 59 ++++---- src/mesa/pipe/tgsi/exec/tgsi_sse2.c | 248 ++++++++++++++++++++++++++++++++- src/mesa/pipe/tgsi/exec/tgsi_sse2.h | 5 + src/mesa/state_tracker/st_atom_fs.c | 9 ++ src/mesa/state_tracker/st_cb_program.c | 21 ++- src/mesa/state_tracker/st_program.h | 6 +- 6 files changed, 310 insertions(+), 38 deletions(-) (limited to 'src') diff --git a/src/mesa/pipe/softpipe/sp_quad_fs.c b/src/mesa/pipe/softpipe/sp_quad_fs.c index 13d7eac4f2..bff5525b0f 100755 --- a/src/mesa/pipe/softpipe/sp_quad_fs.c +++ b/src/mesa/pipe/softpipe/sp_quad_fs.c @@ -56,30 +56,22 @@ quad_shade_stage(struct quad_stage *qs) } +#if !defined(XSTDCALL) +#if defined(WIN32) +#define XSTDCALL __stdcall +#else +#define XSTDCALL +#endif +#endif - -/** - * Compute quad's attribute values by linear interpolation. - * - * Push into the fp: - * - * INPUT[attr] = MAD COEF_A0[attr], COEF_DADX[attr], INPUT_WPOS.xxxx - * INPUT[attr] = MAD INPUT[attr], COEF_DADY[attr], INPUT_WPOS.yyyy - */ -static INLINE void -linterp(const struct tgsi_interp_coef *coef, - struct tgsi_exec_vector *pos, uint ch) -{ - uint j; - for (j = 0; j < QUAD_SIZE; j++) { - const float x = pos->xyzw[0].f[j]; - const float y = pos->xyzw[1].f[j]; - pos->xyzw[ch].f[j] = (coef->a0[ch] + - coef->dadx[ch] * x + - coef->dady[ch] * y); - } -} - +#if defined(USE_X86_ASM) || defined(SLANG_X86) +typedef void (XSTDCALL *sse2_function)( + const struct tgsi_exec_vector *input, + struct tgsi_exec_vector *output, + float (*constant)[4], + struct tgsi_exec_vector *temporary, + const struct tgsi_interp_coef *coef ); +#endif /* This should be done by the fragment shader execution unit (code * generated from the decl instructions). Do it here for now. @@ -127,12 +119,23 @@ shade_quad( machine.Inputs[0].xyzw[1].f[2] = fy + 1.0f; machine.Inputs[0].xyzw[1].f[3] = fy + 1.0f; - /* interp Z */ - linterp(&quad->coef[0], &machine.Inputs[0], 2); /* Z */ - linterp(&quad->coef[0], &machine.Inputs[0], 3); /* 1/W */ - /* run shader */ - tgsi_exec_machine_run( &machine ); + if( softpipe->fs->executable != NULL ) { +#if defined(USE_X86_ASM) || defined(SLANG_X86) + sse2_function func = (sse2_function) softpipe->fs->executable; + func( + machine.Inputs, + machine.Outputs, + machine.Consts, + machine.Temps, + machine.InterpCoefs ); +#else + assert( 0 ); +#endif + } + else { + tgsi_exec_machine_run( &machine ); + } /* store result color (always in output[1]) */ memcpy( diff --git a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c index d89bb19970..cf5e386ddf 100755 --- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c +++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c @@ -114,6 +114,23 @@ get_temp( (vec * 4 + chan) * 16 ); } +static struct x86_reg +get_coef_base( void ) +{ + return get_output_base(); +} + +static struct x86_reg +get_coef( + unsigned vec, + unsigned chan, + unsigned member ) +{ + return x86_make_disp( + get_coef_base(), + ((vec * 3 + member) * 4 + chan) * 4 ); +} + static struct x86_reg get_addr( unsigned vec, @@ -143,7 +160,7 @@ emit_const( } static void -emit_input( +emit_inputf( struct x86_function *func, unsigned xmm, unsigned vec, @@ -155,6 +172,19 @@ emit_input( get_input( vec, chan ) ); } +static void +emit_inputs( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + sse_movups( + func, + get_input( vec, chan ), + make_xmm( xmm ) ); +} + static void emit_output( struct x86_function *func, @@ -182,7 +212,7 @@ emit_tempf( } static void -emit_temps ( +emit_temps( struct x86_function *func, unsigned xmm, unsigned vec, @@ -194,6 +224,70 @@ emit_temps ( make_xmm( xmm ) ); } +static void +emit_coef( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan, + unsigned member ) +{ + sse_movss( + func, + make_xmm( xmm ), + get_coef( vec, chan, member ) ); + sse_shufps( + func, + make_xmm( xmm ), + make_xmm( xmm ), + SHUF( 0, 0, 0, 0 ) ); +} + +static void +emit_coef_a0( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + emit_coef( + func, + xmm, + vec, + chan, + 0 ); +} + +static void +emit_coef_dadx( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + emit_coef( + func, + xmm, + vec, + chan, + 1 ); +} + +static void +emit_coef_dady( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + emit_coef( + func, + xmm, + vec, + chan, + 2 ); +} + static void emit_addrf( struct x86_function *func, @@ -676,7 +770,7 @@ emit_fetch( break; case TGSI_FILE_INPUT: - emit_input( + emit_inputf( func, xmm, reg->SrcRegister.Index, @@ -1658,6 +1752,76 @@ emit_instruction( } } +static void +emit_declaration( + struct x86_function *func, + struct tgsi_full_declaration *decl ) +{ + if( decl->Declaration.File == TGSI_FILE_INPUT ) { + unsigned first, last, mask; + unsigned i, j; + + assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE ); + + first = decl->u.DeclarationRange.First; + last = decl->u.DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + /* Do not touch WPOS.xy */ + if( first == 0 ) { + mask &= ~TGSI_WRITEMASK_XY; + if( mask == TGSI_WRITEMASK_NONE ) { + first++; + } + } + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + switch( decl->Interpolation.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + emit_coef_a0( func, 0, i, j ); + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_LINEAR: + emit_inputf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); + emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ + emit_coef_a0( func, 4, i, j ); + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 4 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + emit_inputf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); + emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ + emit_inputf( func, 4, 0, TGSI_SWIZZLE_W ); + emit_coef_a0( func, 5, i, j ); + emit_rcp( func, 4, 4 ); /* 1.0 / w */ + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 5 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */ + emit_inputs( func, 0, i, j ); + break; + + default: + assert( 0 ); + } + } + } + } + } +} + unsigned tgsi_emit_sse2( struct tgsi_token *tokens, @@ -1715,4 +1879,82 @@ tgsi_emit_sse2( return 1; } +/** + * Fragment shaders are responsible for interpolating shader inputs. Because on + * x86 we have only 4 GP registers, and here we have 5 shader arguments (input, + * output, const, temp and coef), the code is split into two phases -- + * DECLARATION and INSTRUCTION phase. + * GP register holding the output argument is aliased with the coeff argument, + * as outputs are not needed in the DECLARATION phase. + */ +unsigned +tgsi_emit_sse2_fs( + struct tgsi_token *tokens, + struct x86_function *func ) +{ + struct tgsi_parse_context parse; + boolean instruction_phase = FALSE; + + func->csr = func->store; + + /* DECLARATION phase, do not load output argument. */ + x86_mov( + func, + get_input_base(), + get_argument( 0 ) ); + x86_mov( + func, + get_const_base(), + get_argument( 2 ) ); + x86_mov( + func, + get_temp_base(), + get_argument( 3 ) ); + x86_mov( + func, + get_coef_base(), + get_argument( 4 ) ); + + tgsi_parse_init( &parse, tokens ); + + while( !tgsi_parse_end_of_tokens( &parse ) ) { + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + emit_declaration( + func, + &parse.FullToken.FullDeclaration ); + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if( !instruction_phase ) { + /* INSTRUCTION phase, overwrite coeff with output. */ + instruction_phase = TRUE; + x86_mov( + func, + get_output_base(), + get_argument( 1 ) ); + } + emit_instruction( + func, + &parse.FullToken.FullInstruction ); + break; + + default: + assert( 0 ); + } + } + + tgsi_parse_free( &parse ); + +#ifdef WIN32 + x86_retw( func, 16 ); +#else + x86_ret( func ); +#endif + + return 1; +} + #endif diff --git a/src/mesa/pipe/tgsi/exec/tgsi_sse2.h b/src/mesa/pipe/tgsi/exec/tgsi_sse2.h index 4a39658484..9bee371766 100755 --- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.h +++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.h @@ -13,6 +13,11 @@ tgsi_emit_sse2( struct tgsi_token *tokens, struct x86_function *function ); +unsigned +tgsi_emit_sse2_fs( + struct tgsi_token *tokens, + struct x86_function *function ); + #if defined __cplusplus } // extern "C" #endif // defined __cplusplus diff --git a/src/mesa/state_tracker/st_atom_fs.c b/src/mesa/state_tracker/st_atom_fs.c index f8a1dc83cf..ef5b941c17 100644 --- a/src/mesa/state_tracker/st_atom_fs.c +++ b/src/mesa/state_tracker/st_atom_fs.c @@ -36,6 +36,7 @@ #include "pipe/p_defines.h" #include "pipe/p_winsys.h" #include "pipe/tgsi/mesa/mesa_to_tgsi.h" +#include "pipe/tgsi/exec/tgsi_core.h" #include "pipe/tgsi/exec/tgsi_dump.h" #include "st_context.h" @@ -163,6 +164,14 @@ st_translate_fragment_shader(struct st_context *st, if (TGSI_DEBUG) tgsi_dump( stfp->tokens, 0/*TGSI_DUMP_VERBOSE*/ ); +#if defined(USE_X86_ASM) || defined(SLANG_X86) + if (stfp->sse2_program.csr == stfp->sse2_program.store) + tgsi_emit_sse2_fs( stfp->tokens, &stfp->sse2_program ); + + if (!cso->state.executable) + ((struct cso_fragment_shader*)cso)->state.executable = (void *) x86_get_func( &stfp->sse2_program ); +#endif + stfp->dirty = 0; return cso; diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index b9c19bdd3e..9f46f9e93f 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -99,13 +99,16 @@ static struct gl_program *st_new_program( GLcontext *ctx, } case GL_FRAGMENT_PROGRAM_ARB: - case GL_FRAGMENT_PROGRAM_NV: - { + case GL_FRAGMENT_PROGRAM_NV: { struct st_fragment_program *prog = CALLOC_STRUCT(st_fragment_program); prog->id = program_id++; prog->dirty = 1; +#if defined(USE_X86_ASM) || defined(SLANG_X86) + x86_init_func( &prog->sse2_program ); +#endif + return _mesa_init_fragment_program( ctx, &prog->Base, target, @@ -121,8 +124,7 @@ static void st_delete_program( GLcontext *ctx, struct gl_program *prog ) { switch( prog->Target ) { - case GL_VERTEX_PROGRAM_ARB: - { + case GL_VERTEX_PROGRAM_ARB: { #if defined(USE_X86_ASM) || defined(SLANG_X86) struct st_vertex_program *p = (struct st_vertex_program *) prog; @@ -130,7 +132,14 @@ static void st_delete_program( GLcontext *ctx, #endif break; } + case GL_FRAGMENT_PROGRAM_ARB: { +#if defined(USE_X86_ASM) || defined(SLANG_X86) + struct st_fragment_program *p = (struct st_fragment_program *) prog; + x86_release_func( &p->sse2_program ); +#endif + break; + } } _mesa_delete_program( ctx, prog ); } @@ -156,7 +165,7 @@ static void st_program_string_notify( GLcontext *ctx, if (prog == &ctx->FragmentProgram._Current->Base) st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM; - p->id = program_id++; + p->id = program_id++; p->param_state = p->Base.Base.Parameters->StateFlags; } else if (target == GL_VERTEX_PROGRAM_ARB) { @@ -165,7 +174,7 @@ static void st_program_string_notify( GLcontext *ctx, if (prog == &ctx->VertexProgram._Current->Base) st->dirty.st |= ST_NEW_VERTEX_PROGRAM; - p->id = program_id++; + p->id = program_id++; p->param_state = p->Base.Base.Parameters->StateFlags; /* Also tell tnl about it: diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index c21e27628e..419afa4e78 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -49,10 +49,14 @@ struct st_fragment_program GLboolean error; /* If program is malformed for any reason. */ GLuint id; /**< String id, for tracking ProgramStringNotify changes. */ - + /** The program in TGSI format */ struct tgsi_token tokens[ST_FP_MAX_TOKENS]; GLboolean dirty; +#if defined(USE_X86_ASM) || defined(SLANG_X86) + struct x86_function sse2_program; +#endif + /** Pointer to the corresponding cached shader */ const struct cso_fragment_shader *fs; -- cgit v1.2.3