diff options
author | Brian <brian.paul@tungstengraphics.com> | 2007-10-02 11:46:11 -0600 |
---|---|---|
committer | Brian <brian.paul@tungstengraphics.com> | 2007-10-02 11:46:11 -0600 |
commit | 0d13ade0cdd38759936a74824efbd6ac8b563aed (patch) | |
tree | 1fe31314186843a968470d42f0151ccea410ad7d /src | |
parent | 57d3770f35730bef17e5d93bd424a59eb6daec4c (diff) |
Move tgsi machine state init/allocations so they're done less frequently.
This, plus expanding all instructions ahead of time, seems to have improved
the performance of program execution by 8x or so.
Diffstat (limited to 'src')
-rw-r--r-- | src/mesa/pipe/draw/draw_private.h | 4 |
-rw-r--r-- | src/mesa/pipe/draw/draw_vertex_shader.c | 52 |
-rwxr-xr-x | src/mesa/pipe/softpipe/sp_quad_fs.c | 81 |
-rw-r--r-- | src/mesa/pipe/tgsi/exec/tgsi_exec.c | 176 |
-rw-r--r-- | src/mesa/pipe/tgsi/exec/tgsi_exec.h | 8 |
5 files changed, 179 insertions, 142 deletions
diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h index 12a970a671..a54fef41e7 100644 --- a/src/mesa/pipe/draw/draw_private.h +++ b/src/mesa/pipe/draw/draw_private.h @@ -47,6 +47,8 @@ #include "draw_vertex.h" #include "x86/rtasm/x86sse.h" +#include "pipe/tgsi/exec/tgsi_core.h" + /** * Basic vertex info. @@ -187,6 +189,8 @@ struct draw_context unsigned prim; /**< current prim type: PIPE_PRIM_x */ unsigned reduced_prim; + /** TGSI program interpreter runtime state */ + struct tgsi_exec_machine machine; /* Post-tnl vertex cache: */ diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c index 3518bd52a3..e3bcd35334 100644 --- a/src/mesa/pipe/draw/draw_vertex_shader.c +++ b/src/mesa/pipe/draw/draw_vertex_shader.c @@ -86,7 +86,7 @@ run_vertex_program(struct draw_context *draw, unsigned elts[4], unsigned count, struct vertex_header *vOut[]) { - struct tgsi_exec_machine machine; + struct tgsi_exec_machine *machine = &draw->machine; unsigned int j; ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX); @@ -98,35 +98,39 @@ run_vertex_program(struct draw_context *draw, assert(draw->vertex_shader->state->output_semantic_name[0] == TGSI_SEMANTIC_POSITION); -#ifdef DEBUG - memset( &machine, 0, sizeof( machine ) ); +#ifdef DEBUG_foo + memset( machine, 0, sizeof( *machine ) ); #endif +#if 0 /* init machine state */ - tgsi_exec_machine_init(&machine, + tgsi_exec_machine_init(machine, draw->vertex_shader->state->tokens, PIPE_MAX_SAMPLERS, NULL /*samplers*/ ); +#endif /* Consts does not require 16 byte alignment. 
*/ - machine.Consts = (float (*)[4]) draw->mapped_constants; + machine->Consts = (float (*)[4]) draw->mapped_constants; - machine.Inputs = ALIGN16_ASSIGN(inputs); - machine.Outputs = ALIGN16_ASSIGN(outputs); + machine->Inputs = ALIGN16_ASSIGN(inputs); + machine->Outputs = ALIGN16_ASSIGN(outputs); - draw_vertex_fetch( draw, &machine, elts, count ); + draw_vertex_fetch( draw, machine, elts, count ); /* run shader */ if( draw->vertex_shader->state->executable != NULL ) { + /* SSE */ codegen_function func = (codegen_function) draw->vertex_shader->state->executable; func( - machine.Inputs, - machine.Outputs, - machine.Consts, - machine.Temps ); + machine->Inputs, + machine->Outputs, + machine->Consts, + machine->Temps ); } else { - tgsi_exec_machine_run( &machine ); + /* interpreter */ + tgsi_exec_machine_run( machine ); } @@ -136,10 +140,10 @@ run_vertex_program(struct draw_context *draw, float x, y, z, w; /* Handle attr[0] (position) specially: */ - x = vOut[j]->clip[0] = machine.Outputs[0].xyzw[0].f[j]; - y = vOut[j]->clip[1] = machine.Outputs[0].xyzw[1].f[j]; - z = vOut[j]->clip[2] = machine.Outputs[0].xyzw[2].f[j]; - w = vOut[j]->clip[3] = machine.Outputs[0].xyzw[3].f[j]; + x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j]; + y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j]; + z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j]; + w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j]; vOut[j]->clipmask = compute_clipmask(x, y, z, w) | draw->user_clipmask; vOut[j]->edgeflag = 1; @@ -162,10 +166,10 @@ run_vertex_program(struct draw_context *draw, * Subtract two because of the VERTEX_HEADER, CLIP_POS attribs. 
*/ for (slot = 1; slot < draw->vertex_info.num_attribs - 2; slot++) { - vOut[j]->data[slot][0] = machine.Outputs[slot].xyzw[0].f[j]; - vOut[j]->data[slot][1] = machine.Outputs[slot].xyzw[1].f[j]; - vOut[j]->data[slot][2] = machine.Outputs[slot].xyzw[2].f[j]; - vOut[j]->data[slot][3] = machine.Outputs[slot].xyzw[3].f[j]; + vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; + vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; + vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; + vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; /* printf("output %d: %f %f %f %f\n", slot, vOut[j]->data[slot][0], @@ -235,6 +239,12 @@ void draw_bind_vertex_shader(struct draw_context *draw, { draw_flush(draw); draw->vertex_shader = (struct draw_vertex_shader*)(vcso); + + /* init machine state */ + tgsi_exec_machine_init(&draw->machine, + draw->vertex_shader->state->tokens, + PIPE_MAX_SAMPLERS, + NULL /*samplers*/ ); } void draw_delete_vertex_shader(struct draw_context *draw, diff --git a/src/mesa/pipe/softpipe/sp_quad_fs.c b/src/mesa/pipe/softpipe/sp_quad_fs.c index 673d339f41..57c01dcfcc 100755 --- a/src/mesa/pipe/softpipe/sp_quad_fs.c +++ b/src/mesa/pipe/softpipe/sp_quad_fs.c @@ -45,6 +45,8 @@ struct quad_shade_stage { struct quad_stage stage; struct tgsi_sampler samplers[PIPE_MAX_SAMPLERS]; + struct tgsi_exec_machine machine; + struct tgsi_exec_vector *inputs, *outputs; }; @@ -83,58 +85,41 @@ shade_quad( struct softpipe_context *softpipe = qs->softpipe; const float fx = (float) quad->x0; const float fy = (float) quad->y0; - struct tgsi_exec_machine machine; - - ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX); - ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX); - -#ifdef DEBUG - memset( &machine, 0, sizeof( machine ) ); -#endif - - /* init machine state */ - tgsi_exec_machine_init( - &machine, - softpipe->fs->tokens, - PIPE_MAX_SAMPLERS, - qss->samplers ); + struct tgsi_exec_machine *machine = &qss->machine; /* 
Consts does not require 16 byte alignment. */ - machine.Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT]; - - machine.Inputs = ALIGN16_ASSIGN(inputs); - machine.Outputs = ALIGN16_ASSIGN(outputs); + machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT]; - machine.InterpCoefs = quad->coef; + machine->InterpCoefs = quad->coef; - machine.Inputs[0].xyzw[0].f[0] = fx; - machine.Inputs[0].xyzw[0].f[1] = fx + 1.0f; - machine.Inputs[0].xyzw[0].f[2] = fx; - machine.Inputs[0].xyzw[0].f[3] = fx + 1.0f; + machine->Inputs[0].xyzw[0].f[0] = fx; + machine->Inputs[0].xyzw[0].f[1] = fx + 1.0f; + machine->Inputs[0].xyzw[0].f[2] = fx; + machine->Inputs[0].xyzw[0].f[3] = fx + 1.0f; - machine.Inputs[0].xyzw[1].f[0] = fy; - machine.Inputs[0].xyzw[1].f[1] = fy; - machine.Inputs[0].xyzw[1].f[2] = fy + 1.0f; - machine.Inputs[0].xyzw[1].f[3] = fy + 1.0f; + machine->Inputs[0].xyzw[1].f[0] = fy; + machine->Inputs[0].xyzw[1].f[1] = fy; + machine->Inputs[0].xyzw[1].f[2] = fy + 1.0f; + machine->Inputs[0].xyzw[1].f[3] = fy + 1.0f; /* run shader */ if( softpipe->fs->executable != NULL ) { codegen_function func = (codegen_function) softpipe->fs->executable; func( - machine.Inputs, - machine.Outputs, - machine.Consts, - machine.Temps, - machine.InterpCoefs ); + machine->Inputs, + machine->Outputs, + machine->Consts, + machine->Temps, + machine->InterpCoefs ); } else { - tgsi_exec_machine_run( &machine ); + tgsi_exec_machine_run( machine ); } /* store result color (always in output[1]) */ memcpy( quad->outputs.color, - &machine.Outputs[1].xyzw[0].f[0], + &machine->Outputs[1].xyzw[0].f[0], sizeof( quad->outputs.color ) ); #if 0 @@ -142,14 +127,14 @@ shade_quad( /* XXX temporary */ memcpy( quad->outputs.depth, - &machine.Outputs[0].xyzw[2], + machine->Outputs[0].xyzw[2], sizeof( quad->outputs.depth ) ); } #else { uint i; for (i = 0; i < 4; i++) { - quad->outputs.depth[i] = machine.Inputs[0].xyzw[2].f[i]; + quad->outputs.depth[i] = machine->Inputs[0].xyzw[2].f[i]; #if 0 
printf("output z %f\n", quad->outputs.depth[i]); #endif @@ -188,6 +173,12 @@ static void shade_begin(struct quad_stage *qs) } } + /* XXX only do this if the fragment shader changes... */ + tgsi_exec_machine_init(&qss->machine, + softpipe->fs->tokens, + PIPE_MAX_SAMPLERS, + qss->samplers ); + if (qs->next) qs->next->begin(qs->next); } @@ -195,11 +186,17 @@ static void shade_begin(struct quad_stage *qs) struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ) { - struct quad_shade_stage *stage = CALLOC_STRUCT(quad_shade_stage); + struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage); + + /* allocate storage for program inputs/outputs, aligned to 16 bytes */ + qss->inputs = malloc(PIPE_ATTRIB_MAX * sizeof(*qss->inputs) + 16); + qss->outputs = malloc(PIPE_ATTRIB_MAX * sizeof(*qss->outputs) + 16); + qss->machine.Inputs = align16(qss->inputs); + qss->machine.Outputs = align16(qss->outputs); - stage->stage.softpipe = softpipe; - stage->stage.begin = shade_begin; - stage->stage.run = shade_quad; + qss->stage.softpipe = softpipe; + qss->stage.begin = shade_begin; + qss->stage.run = shade_quad; - return &stage->stage; + return &qss->stage; } diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.c b/src/mesa/pipe/tgsi/exec/tgsi_exec.c index 77a24ec1d8..1c515a26e3 100644 --- a/src/mesa/pipe/tgsi/exec/tgsi_exec.c +++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.c @@ -65,6 +65,80 @@ #define CHAN_Z 2 #define CHAN_W 3 + +static void +expand_program(struct tgsi_exec_machine *mach ) +{ + struct tgsi_full_instruction *instructions; + struct tgsi_full_declaration *declarations; + struct tgsi_parse_context parse; + uint k; + uint maxInstructions = 10, numInstructions = 0; + uint maxDeclarations = 10, numDeclarations = 0; + + k = tgsi_parse_init( &parse, mach->Tokens ); + if (k != TGSI_PARSE_OK) { + printf("Problem parsing!\n"); + return; + } + + declarations = (struct tgsi_full_declaration *) + malloc(maxDeclarations * sizeof(struct tgsi_full_declaration)); + + instructions 
= (struct tgsi_full_instruction *) + malloc(maxInstructions * sizeof(struct tgsi_full_instruction)); + + while( !tgsi_parse_end_of_tokens( &parse ) ) { + tgsi_parse_token( &parse ); + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + /* + exec_declaration( mach, &parse.FullToken.FullDeclaration ); + */ + if (numDeclarations == maxDeclarations) { + maxDeclarations += 10; + declarations = realloc(declarations, + maxDeclarations + * sizeof(struct tgsi_full_instruction)); + } + memcpy(declarations + numDeclarations, + &parse.FullToken.FullInstruction, + sizeof(declarations[0])); + numDeclarations++; + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (numInstructions == maxInstructions) { + maxInstructions += 10; + instructions = realloc(instructions, + maxInstructions + * sizeof(struct tgsi_full_instruction)); + } + memcpy(instructions + numInstructions, + &parse.FullToken.FullInstruction, + sizeof(instructions[0])); + numInstructions++; + break; + default: + assert( 0 ); + } + } + tgsi_parse_free (&parse); + + assert(!mach->Instructions); + mach->Instructions = instructions; + mach->NumInstructions = numInstructions; + mach->Declarations = declarations; + mach->NumDeclarations = numDeclarations; +} + + +/** + * Initialize machine state by expanding tokens to full instructions, + * allocating temporary storage, setting up constants, etc. + * After this, we can call tgsi_exec_machine_run() many times. 
+ */ void tgsi_exec_machine_init( struct tgsi_exec_machine *mach, @@ -103,16 +177,32 @@ tgsi_exec_machine_init( mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f; } + if (mach->Declarations) { + free(mach->Declarations); + mach->Declarations = NULL; + mach->NumDeclarations = 0; + } + if (mach->Instructions) { + free(mach->Instructions); + mach->Instructions = NULL; + mach->NumInstructions = 0; + } + mach->CondMask = 0xf; mach->LoopMask = 0xf; mach->ExecMask = 0xf; + +#if 01 + tgsi_exec_prepare( mach ); + expand_program(mach); +#endif } void tgsi_exec_prepare( - struct tgsi_exec_machine *mach, - struct tgsi_exec_labels *labels ) + struct tgsi_exec_machine *mach ) { + struct tgsi_exec_labels *labels = &mach->Labels; struct tgsi_parse_context parse; GLuint k; GLuint instno = 0; @@ -164,10 +254,10 @@ void tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) { - struct tgsi_exec_labels labels; - - tgsi_exec_prepare( mach, &labels ); - tgsi_exec_machine_run2( mach, &labels ); +#if 0 + tgsi_exec_prepare( mach ); +#endif + tgsi_exec_machine_run2( mach ); } static void @@ -2170,77 +2260,9 @@ exec_instruction( } -static void -expand_program(struct tgsi_exec_machine *mach ) -{ - struct tgsi_full_instruction *instructions; - struct tgsi_full_declaration *declarations; - struct tgsi_parse_context parse; - uint k; - uint maxInstructions = 10, numInstructions = 0; - uint maxDeclarations = 10, numDeclarations = 0; - - k = tgsi_parse_init( &parse, mach->Tokens ); - if (k != TGSI_PARSE_OK) { - printf("Problem parsing!\n"); - return; - } - - declarations = (struct tgsi_full_declaration *) - malloc(maxDeclarations * sizeof(struct tgsi_full_declaration)); - - instructions = (struct tgsi_full_instruction *) - malloc(maxInstructions * sizeof(struct tgsi_full_instruction)); - - while( !tgsi_parse_end_of_tokens( &parse ) ) { - tgsi_parse_token( &parse ); - switch( parse.FullToken.Token.Type ) { - case TGSI_TOKEN_TYPE_DECLARATION: - /* - exec_declaration( mach, 
&parse.FullToken.FullDeclaration ); - */ - if (numDeclarations == maxDeclarations) { - maxDeclarations += 10; - declarations = realloc(declarations, - maxDeclarations - * sizeof(struct tgsi_full_instruction)); - } - memcpy(declarations + numDeclarations, - &parse.FullToken.FullInstruction, - sizeof(declarations[0])); - numDeclarations++; - break; - case TGSI_TOKEN_TYPE_IMMEDIATE: - break; - case TGSI_TOKEN_TYPE_INSTRUCTION: - if (numInstructions == maxInstructions) { - maxInstructions += 10; - instructions = realloc(instructions, - maxInstructions - * sizeof(struct tgsi_full_instruction)); - } - memcpy(instructions + numInstructions, - &parse.FullToken.FullInstruction, - sizeof(instructions[0])); - numInstructions++; - break; - default: - assert( 0 ); - } - } - tgsi_parse_free (&parse); - - mach->Instructions = instructions; - mach->NumInstructions = numInstructions; - mach->Declarations = declarations; - mach->NumDeclarations = numDeclarations; -} - - void tgsi_exec_machine_run2( - struct tgsi_exec_machine *mach, - struct tgsi_exec_labels *labels ) + struct tgsi_exec_machine *mach ) { #if 0 && MESA GET_CURRENT_CONTEXT(ctx); @@ -2255,9 +2277,11 @@ tgsi_exec_machine_run2( GLuint k; #endif +#if 0 if (!mach->Instructions) { expand_program(mach); } +#endif mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; @@ -2305,8 +2329,10 @@ tgsi_exec_machine_run2( exec_instruction( mach, mach->Instructions + pc, &pc ); } +#if 0 free(mach->Declarations); free(mach->Instructions); +#endif } #endif diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.h b/src/mesa/pipe/tgsi/exec/tgsi_exec.h index 8997ea9c09..2b493ff682 100644 --- a/src/mesa/pipe/tgsi/exec/tgsi_exec.h +++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.h @@ -154,6 +154,8 @@ struct tgsi_exec_machine struct tgsi_full_declaration *Declarations; uint NumDeclarations; + + struct tgsi_exec_labels Labels; }; @@ -166,8 +168,7 @@ tgsi_exec_machine_init( void tgsi_exec_prepare( - 
struct tgsi_exec_machine *mach, - struct tgsi_exec_labels *labels ); + struct tgsi_exec_machine *mach ); void tgsi_exec_machine_run( @@ -175,8 +176,7 @@ tgsi_exec_machine_run( void tgsi_exec_machine_run2( - struct tgsi_exec_machine *mach, - struct tgsi_exec_labels *labels ); + struct tgsi_exec_machine *mach ); #if defined __cplusplus } // extern "C" |