summaryrefslogtreecommitdiff
path: root/src/mesa/drivers/dri
diff options
context:
space:
mode:
authorRune Peterson <rune@megahurts.dk>2007-02-12 00:24:36 +0100
committerJerome Glisse <glisse@freedesktop.org>2007-02-12 00:24:36 +0100
commitee5417bca883d82d618e1c0b65011940253555dd (patch)
tree44964a8506f4dabbf5c077a0ff53c11d2f53d410 /src/mesa/drivers/dri
parent3bacb68c70e4a0813bf944061a97e727cbd89c43 (diff)
r300: Add proper support for sin/cos instruction in fragment program
Getting proper SIN and COS wasn't as easy as it appeared. I had to make make some changes to the fragment program code. general FP changes: - support HHH swizzle for vector instructions. - don't copy a source to a temp when it is not XYZW swizzled, but combine the two and have the swizzle resolve any issues. (saves temps/instructions with more elaborate shader code) - fix overflow in cnstv[].
Diffstat (limited to 'src/mesa/drivers/dri')
-rw-r--r--src/mesa/drivers/dri/r300/r300_context.h5
-rw-r--r--src/mesa/drivers/dri/r300/r300_fragprog.c271
-rw-r--r--src/mesa/drivers/dri/r300/r300_fragprog.h5
-rw-r--r--src/mesa/drivers/dri/r300/r300_render.c2
-rw-r--r--src/mesa/drivers/dri/r300/r300_state.c2
-rw-r--r--src/mesa/drivers/dri/radeon/radeon_screen.c11
6 files changed, 175 insertions, 121 deletions
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index 02f8e9107d..b140235159 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -729,6 +729,11 @@ struct r300_fragment_program {
GLboolean params_uptodate;
int max_temp_idx;
+
+ /* the index of the sin constant is stored here */
+ GLint const_sin;
+
+ GLuint optimization;
};
#define R300_MAX_AOS_ARRAYS 16
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 6e85f0b5dd..b00cf9ed33 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -33,7 +33,7 @@
/*TODO'S
*
- * - COS/SIN/SCS instructions
+ * - SCS instructions
* - Depth write, WPOS/FOGC inputs
* - FogOption
* - Verify results of opcodes for accuracy, I've only checked them
@@ -187,6 +187,10 @@ static const struct {
#define SLOT_VECTOR (1<<0)
#define SLOT_SCALAR (1<<3)
#define SLOT_BOTH (SLOT_VECTOR | SLOT_SCALAR)
+
+/* mapping from SWIZZLE_* to r300 native values for scalar insns */
+#define SWIZZLE_HALF 6
+
#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
SWIZZLE_##y, \
SWIZZLE_##z, \
@@ -208,7 +212,7 @@ static const struct r300_pfs_swizzle {
{ MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_BOTH },
{ MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
{ MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
- { PFS_INVAL, R300_FPI0_ARGC_HALF, 0, 0},
+ { MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
{ PFS_INVAL, 0, 0, 0},
};
@@ -232,8 +236,6 @@ static const struct {
{ PFS_INVAL, PFS_INVAL, PFS_INVAL}
};
-/* mapping from SWIZZLE_* to r300 native values for scalar insns */
-#define SWIZZLE_HALF 6
static const struct {
int base; /* hw value of swizzle */
int stride; /* difference between SRC0/1/2 */
@@ -590,6 +592,7 @@ static GLuint do_swizzle(struct r300_fragment_program *rp,
/* If swizzling from something without an XYZW native swizzle,
* emit result to a temp, and do new swizzle from the temp.
*/
+#if 0
if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
REG_GET_SSWZ(src) != SWIZZLE_W) {
GLuint temp = get_temp_reg(rp);
@@ -603,10 +606,30 @@ static GLuint do_swizzle(struct r300_fragment_program *rp,
0);
src = temp;
}
+#endif
- /* set scalar swizzling */
- REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
+ if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
+ REG_GET_SSWZ(src) != SWIZZLE_W) {
+ GLuint vsrcswz = (v_swiz[REG_GET_VSWZ(src)].hash & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK)) | REG_GET_SSWZ(src) << 9;
+ GLint i;
+ GLuint newswz = 0;
+ GLuint offset;
+ for(i=0; i < 4; ++i){
+ offset = GET_SWZ(arbswz, i);
+
+ newswz |= (offset <= 3)?GET_SWZ(vsrcswz, offset) << i*3:offset << i*3;
+ }
+
+ arbswz = newswz & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK);
+ REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
+ }
+ else
+ {
+ /* set scalar swizzling */
+ REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
+
+ }
do {
vswz = REG_GET_VSWZ(src);
do {
@@ -1234,62 +1257,87 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
break;
case OPCODE_COS:
/*
- * cos using taylor serie:
- * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6!
+ * cos using a parabola (see SIN):
+ * cos(x):
+ * x += PI/2
+ * x = (x < PI)?x : x-2*PI
+ * result = sin(x)
*/
temp = get_temp_reg(rp);
- cnstv[0] = 0.5;
- cnstv[1] = 0.041666667;
- cnstv[2] = 0.001388889;
- cnstv[4] = 0.0;
- cnst = emit_const4fv(rp, cnstv);
+ if(rp->const_sin == -1){
+ cnstv[0] = 1.273239545;
+ cnstv[1] =-0.405284735;
+ cnstv[2] = 3.141592654;
+ cnstv[3] = 0.225;
+ rp->const_sin = emit_const4fv(rp, cnstv);
+ }
+ cnst = rp->const_sin;
src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_XYZ,
- src[0],
- src[0],
- pfs_zero,
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_Y | WRITEMASK_Z,
- temp, temp,
- pfs_zero,
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_Z,
- temp,
- swizzle(temp, X, X, X, W),
- pfs_zero,
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_XYZ,
- temp, cnst,
- pfs_zero,
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_X,
- pfs_one,
- pfs_one,
- negate(temp),
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_X,
- temp,
- pfs_one,
- swizzle(temp, Y, Y, Y, W),
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_X,
- temp,
- pfs_one,
- negate(swizzle(temp, Z, Z, Z, W)),
- flags);
- emit_arith(rp, PFS_OP_MAD, dest, mask,
+ emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
+ pfs_half,
+ undef,
+ undef,
+ 0);
+
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+ swizzle(cnst, Z, Z, Z, Z), //PI
+ pfs_half,
+ swizzle(keep(src[0]), X, X, X, X),
+ 0);
+
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
+ negate(swizzle(temp, W, W, W, W)), //-2
+ swizzle(cnst, Z, Z, Z, Z), //PI
swizzle(temp, X, X, X, X),
- pfs_one,
+ 0);
+
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
+ swizzle(cnst, Z, Z, Z, Z), //PI
+ negate(pfs_half),
+ swizzle(src[0], X, X, X, X),
+ 0);
+
+ emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z,
+ swizzle(temp, W, W, W, W),
+ swizzle(temp, X, X, X, X),
+ swizzle(temp, Y, Y, Y, Y),
+ 0);
+
+ /* SIN */
+
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X | WRITEMASK_Y,
+ swizzle(temp, Z, Z, Z, Z),
+ cnst,
pfs_zero,
- flags);
+ 0);
+
+ if(rp->optimization == DRI_CONF_FP_OPTIMIZATION_SPEED){
+ emit_arith(rp, PFS_OP_MAD, dest, mask,
+ swizzle(temp, Y, Y, Y, Y),
+ absolute(swizzle(temp, Z, Z, Z, Z)),
+ swizzle(temp, X, X, X, X),
+ flags);
+ }else{
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+ swizzle(temp, Y, Y, Y, Y),
+ absolute(swizzle(temp, Z, Z, Z, Z)),
+ swizzle(temp, X, X, X, X),
+ 0);
+
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
+ swizzle(temp, X, X, X, X),
+ absolute(swizzle(temp, X, X, X, X)),
+ negate(swizzle(temp, X, X, X, X)),
+ 0);
+
+
+ emit_arith(rp, PFS_OP_MAD, dest, mask,
+ swizzle(temp, Y, Y, Y, Y),
+ swizzle(cnst, W, W, W, W),
+ swizzle(temp, X, X, X, X),
+ flags);
+ }
free_temp(rp, temp);
break;
case OPCODE_DP3:
@@ -1398,7 +1446,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
* change the compare to (t.x + 0.5) > 0.5 we may
* save one instruction by doing CMP -t.x
*/
- cnstv[0] = cnstv[1] = cnstv[2] = cnstv[4] = 0.50001;
+ cnstv[0] = cnstv[1] = cnstv[2] = cnstv[3] = 0.50001;
src[0] = t_src(rp, fpi->SrcReg[0]);
temp = get_temp_reg(rp);
cnst = emit_const4fv(rp, cnstv);
@@ -1548,68 +1596,55 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
break;
case OPCODE_SIN:
/*
- * sin using taylor serie:
- * sin(x) = x - x^3/3! + x^5/5! - x^7/7!
+ * using a parabola:
+ * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
+ * extra precision is obtained by weighting against
+ * itself squared.
*/
+
temp = get_temp_reg(rp);
- cnstv[0] = 0.333333333;
- cnstv[1] = 0.008333333;
- cnstv[2] = 0.000198413;
- cnstv[4] = 0.0;
- cnst = emit_const4fv(rp, cnstv);
+ if(rp->const_sin == -1){
+ cnstv[0] = 1.273239545;
+ cnstv[1] =-0.405284735;
+ cnstv[2] = 3.141592654;
+ cnstv[3] = 0.225;
+ rp->const_sin = emit_const4fv(rp, cnstv);
+ }
+ cnst = rp->const_sin;
src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_XYZ,
- src[0],
- src[0],
- pfs_zero,
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_Y | WRITEMASK_Z,
- temp, temp,
- pfs_zero,
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_Z,
- temp,
- swizzle(temp, X, X, X, W),
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X | WRITEMASK_Y,
+ swizzle(keep(src[0]), X, X, X, X),
+ cnst,
pfs_zero,
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_XYZ,
- src[0],
- temp,
- pfs_zero,
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_XYZ,
- temp, cnst,
- pfs_zero,
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_X,
- src[0],
- pfs_one,
- negate(temp),
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_X,
- temp,
- pfs_one,
- swizzle(temp, Y, Y, Y, W),
- flags);
- emit_arith(rp, PFS_OP_MAD, temp,
- WRITEMASK_X,
- temp,
- pfs_one,
- negate(swizzle(temp, Z, Z, Z, W)),
- flags);
- emit_arith(rp, PFS_OP_MAD, dest, mask,
- swizzle(temp, X, X, X, X),
- pfs_one,
- pfs_zero,
- flags);
+ 0);
+
+ if(rp->optimization == DRI_CONF_FP_OPTIMIZATION_SPEED){
+ emit_arith(rp, PFS_OP_MAD, dest, mask,
+ swizzle(temp, Y, Y, Y, Y),
+ absolute(swizzle(src[0], X, X, X, X)),
+ swizzle(temp, X, X, X, X),
+ flags);
+ }else{
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+ swizzle(temp, Y, Y, Y, Y),
+ absolute(swizzle(src[0], X, X, X, X)),
+ swizzle(temp, X, X, X, X),
+ 0);
+
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
+ swizzle(temp, X, X, X, X),
+ absolute(swizzle(temp, X, X, X, X)),
+ negate(swizzle(temp, X, X, X, X)),
+ 0);
+
+
+ emit_arith(rp, PFS_OP_MAD, dest, mask,
+ swizzle(temp, Y, Y, Y, Y),
+ swizzle(cnst, W, W, W, W),
+ swizzle(temp, X, X, X, X),
+ flags);
+ }
free_temp(rp, temp);
break;
case OPCODE_SLT:
@@ -1681,7 +1716,7 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
/* - Init structures
* - Determine what hwregs each input corresponds to
*/
-static void init_program(struct r300_fragment_program *rp)
+static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
{
struct r300_pfs_compile_state *cs = NULL;
struct gl_fragment_program *mp = &rp->mesa_program;
@@ -1691,6 +1726,7 @@ static void init_program(struct r300_fragment_program *rp)
int i,j;
/* New compile, reset tracking data */
+ rp->optimization = driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
rp->translated = GL_FALSE;
rp->error = GL_FALSE;
rp->cs = cs = &(R300_CONTEXT(rp->ctx)->state.pfs_compile);
@@ -1703,6 +1739,7 @@ static void init_program(struct r300_fragment_program *rp)
rp->max_temp_idx = 0;
rp->node[0].alu_end = -1;
rp->node[0].tex_end = -1;
+ rp->const_sin = -1;
_mesa_memset(cs, 0, sizeof(*rp->cs));
for (i=0;i<PFS_MAX_ALU_INST;i++) {
@@ -1816,13 +1853,13 @@ static void update_params(struct r300_fragment_program *rp)
rp->params_uptodate = GL_TRUE;
}
-void r300_translate_fragment_shader(struct r300_fragment_program *rp)
+void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_program *rp)
{
struct r300_pfs_compile_state *cs = NULL;
if (!rp->translated) {
- init_program(rp);
+ init_program(r300, rp);
cs = rp->cs;
if (parse_program(rp) == GL_FALSE) {
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h
index b0cebe60bb..73986abc3c 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.h
@@ -112,8 +112,11 @@ typedef struct r300_fragment_program_swizzle {
((0 | SRC_CONST) << R300_FPI3_SRC1A_SHIFT) | \
((0 | SRC_CONST) << R300_FPI3_SRC2A_SHIFT))
+#define DRI_CONF_FP_OPTIMIZATION_SPEED 0
+#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
+
struct r300_fragment_program;
-extern void r300_translate_fragment_shader(struct r300_fragment_program *rp);
+extern void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_program *rp);
#endif
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index e29df87696..211c451f66 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -392,7 +392,7 @@ int r300Fallback(GLcontext *ctx)
if (rp) {
if (!rp->translated)
- r300_translate_fragment_shader(rp);
+ r300_translate_fragment_shader(r300, rp);
FALLBACK_IF(!rp->translated);
}
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index a12f3bb531..906dfceb48 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -1820,7 +1820,7 @@ void r300SetupPixelShader(r300ContextPtr rmesa)
if (!rp) /* should only happenen once, just after context is created */
return;
- r300_translate_fragment_shader(rp);
+ r300_translate_fragment_shader(rmesa, rp);
if (!rp->translated) {
fprintf(stderr, "%s: No valid fragment shader, exiting\n", __func__);
return;
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index cee1f7e2f9..fc5aa11462 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -55,6 +55,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "r200_span.h"
#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
#include "r300_context.h"
+#include "r300_fragprog.h"
#include "radeon_span.h"
#endif
@@ -168,6 +169,13 @@ DRI_CONF_OPT_BEGIN(disable_stencil_two_side,bool,def) \
DRI_CONF_DESC(en,"Disable GL_EXT_stencil_two_side") \
DRI_CONF_OPT_END
+#define DRI_CONF_FP_OPTIMIZATION(def) \
+DRI_CONF_OPT_BEGIN_V(fp_optimization,enum,def,"0:1") \
+ DRI_CONF_DESC_BEGIN(en,"Fragment Program optimization") \
+ DRI_CONF_ENUM(0,"Optimize for Speed") \
+ DRI_CONF_ENUM(1,"Optimize for Quality") \
+ DRI_CONF_DESC_END \
+DRI_CONF_OPT_END
const char __driConfigOptions[] =
DRI_CONF_BEGIN
@@ -190,12 +198,13 @@ DRI_CONF_BEGIN
DRI_CONF_COLOR_REDUCTION(DRI_CONF_COLOR_REDUCTION_DITHER)
DRI_CONF_ROUND_MODE(DRI_CONF_ROUND_TRUNC)
DRI_CONF_DITHER_MODE(DRI_CONF_DITHER_XERRORDIFF)
+ DRI_CONF_FP_OPTIMIZATION(DRI_CONF_FP_OPTIMIZATION_SPEED)
DRI_CONF_SECTION_END
DRI_CONF_SECTION_DEBUG
DRI_CONF_NO_RAST(false)
DRI_CONF_SECTION_END
DRI_CONF_END;
-static const GLuint __driNConfigOptions = 17;
+static const GLuint __driNConfigOptions = 18;
#ifndef RADEON_DEBUG
int RADEON_DEBUG = 0;