From b27eb7cb4f5b49b9e7c24deb6c1fb52908f63703 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Sun, 11 Jan 2009 15:11:00 -0700 Subject: cell: re-order the z/stencil fetch/extract/convert instructions for better perf The new instruction order is 10 cycles faster. --- src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 106 +++++++++++------------ 1 file changed, 51 insertions(+), 55 deletions(-) (limited to 'src/gallium/drivers/cell') diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 4d28d4801f..b3cce68157 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1813,93 +1813,88 @@ gen_depth_stencil(struct cell_context *cell, const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; boolean write_depth_stencil; - /* We may or may not need to allocate a register for Z or stencil values */ - int fbS_reg = -1, fbZ_reg = -1; - - /* framebuffer's combined z/stencil values for quad */ + /* framebuffer's combined z/stencil values register */ int fbZS_reg = spe_allocate_available_register(f); + /* Framebufer Z values register */ + int fbZ_reg = spe_allocate_available_register(f); - spe_comment(f, 0, "Fetch Z/stencil quad from tile"); + /* Framebuffer stencil values register (may not be used) */ + int fbS_reg = spe_allocate_available_register(f); + + /* 24-bit mask register (may not be used) */ + int zmask_reg = spe_allocate_available_register(f); - /* fetch quad of depth/stencil values from tile at (x,y) */ - /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ - /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */ - spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); - - /* From the Z/stencil buffer format, pull out the bits we need for - * Z and/or stencil. We'll also convert the incoming fragment Z - * value in fragZ_reg from a floating point value in [0.0..1.0] to - * an unsigned integer value with the appropriate resolution. - * Note that even if depth or stencil is *not* enabled, if it's - * present in the buffer, we pull it out and put it back later; - * otherwise, we can inadvertently destroy the contents of - * buffers we're not supposed to touch (e.g., if the user is - * clearing the depth buffer but not the stencil buffer, a - * quad of constant depth is drawn over the surface; the stencil - * buffer must be maintained). + /** + * The following code: + * 1. fetch quad of packed Z/S values from the framebuffer tile. + * 2. extract the separate the Z and S values from packed values + * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints + * + * The instructions for doing this are interleaved for better performance. */ + spe_comment(f, 0, "Fetch Z/stencil quad from tile"); + switch(zs_format) { case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ case PIPE_FORMAT_X8Z24_UNORM: - /* Pull out both Z and stencil */ - setup_optional_register(f, &fbZ_reg); - setup_optional_register(f, &fbS_reg); + /* prepare mask to extract Z vals from ZS vals */ + spe_load_uint(f, zmask_reg, 0x00ffffff); - /* four 24-bit Z values in the low-order bits */ - spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ + /* convert fragment Z from [0,1] to 32-bit ints */ spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + /* right shift 32-bit fragment Z to 24 bits */ spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - /* four 8-bit stencil values in the high-order bits */ + /* extract 24-bit Z values from ZS values by masking */ + spe_and(f, fbZ_reg, fbZS_reg, zmask_reg); + + /* extract 8-bit stencil values by shifting */ spe_rotmi(f, fbS_reg, fbZS_reg, -24); break; case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ case PIPE_FORMAT_Z24X8_UNORM: - setup_optional_register(f, &fbZ_reg); - setup_optional_register(f, &fbS_reg); + /* convert fragment Z from [0,1] to 32-bit ints */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - /* shift by 8 to get the upper 24-bit values */ - spe_rotmi(f, fbS_reg, fbZS_reg, -8); + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + /* right shift 32-bit fragment Z to 24 bits */ spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - /* 8-bit stencil in the low-order bits - mask them out */ + /* extract 24-bit Z values from ZS values by shifting */ + spe_rotmi(f, fbZ_reg, fbZS_reg, -8); + + /* extract 8-bit stencil values by masking */ spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); break; case PIPE_FORMAT_Z32_UNORM: - setup_optional_register(f, &fbZ_reg); - /* Copy over 4 32-bit values */ - spe_move(f, fbZ_reg, fbZS_reg); + /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg); - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 32-bit unsigned integer - */ + /* convert fragment Z from [0,1] to 32-bit ints */ spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + /* No stencil, so can't do anything there */ break; case PIPE_FORMAT_Z16_UNORM: - /* XXX Not sure this is correct, but it was here before, so we're - * going with it for now - */ - setup_optional_register(f, &fbZ_reg); + /* XXX This code for 16bpp Z is broken! */ + + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + /* Copy over 4 32-bit values */ spe_move(f, fbZ_reg, fbZS_reg); - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 16-bit unsigned integer - */ + /* convert Z from [0,1] to 16-bit ints */ spe_cfltu(f, fragZ_reg, fragZ_reg, 32); spe_rotmi(f, fragZ_reg, fragZ_reg, -16); /* No stencil */ @@ -1979,9 +1974,10 @@ gen_depth_stencil(struct cell_context *cell, } /* Don't need these any more */ - release_optional_register(f, fbZ_reg); - release_optional_register(f, fbS_reg); spe_release_register(f, fbZS_reg); + spe_release_register(f, fbZ_reg); + spe_release_register(f, fbS_reg); + spe_release_register(f, zmask_reg); } -- cgit v1.2.3