1 files changed, 464 insertions, 125 deletions
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.c b/src/mesa/drivers/dri/radeon/radeon_span.c
index 12051ff1c8..4e100d854e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.c
+++ b/src/mesa/drivers/dri/radeon/radeon_span.c
@@ -43,46 +43,222 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/glheader.h"
 #include "swrast/swrast.h"
 
-#include "radeon_context.h"
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
+#include "radeon_common.h"
+#include "radeon_lock.h"
 #include "radeon_span.h"
-#include "radeon_tex.h"
-
-#include "drirenderbuffer.h"
 
 #define DBG 0
 
+static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb);
+
+
+/* The r200 depth buffer is always tiled.  The helpers below implement the
+   tiled address formula from the hardware docs (hand-transcribed, unverified).
+*/
+#if defined(RADEON_COMMON_FOR_R200)
+static GLubyte *r200_depth_2byte(const struct radeon_renderbuffer * rrb,
+				 GLint x, GLint y)
+{	/* Returns the address of the 2-byte depth value at (x, y) inside rrb's buffer object. */
+    GLubyte *ptr = rrb->bo->ptr;	/* base pointer stored in the buffer object */
+    GLint offset;			/* byte offset of (x, y) from ptr */
+    if (rrb->has_surface) {
+	offset = x * rrb->cpp + y * rrb->pitch;	/* linear: cpp bytes per pixel, pitch bytes per row */
+    } else {
+	GLuint b;			/* block index built from y >> 4 and x >> 6 */
+	offset = 0;
+	b = (((y  >> 4) * (rrb->pitch >> 8) + (x >> 6)));	/* NOTE(review): the 4-byte variant pairs x >> 5 with pitch >> 7; confirm pitch >> 8 is right for x >> 6 */
+	offset += (b >> 1) << 12;	/* bits 12+: b / 2 */
+	offset += (((rrb->pitch >> 8) & 0x1) ? (b & 0x1) : ((b & 0x1) ^ ((y >> 4) & 0x1))) << 11;	/* bit 11: low bit of b, XORed with y bit 4 when (pitch >> 8) is even */
+	offset += ((y >> 2) & 0x3) << 9;	/* bits 9-10: y bits 2-3 */
+	offset += ((x >> 3) & 0x1) << 8;	/* bit 8: x bit 3 */
+	offset += ((x >> 4) & 0x3) << 6;	/* bits 6-7: x bits 4-5 */
+	offset += ((x >> 2) & 0x1) << 5;	/* bit 5: x bit 2 */
+	offset += ((y >> 1) & 0x1) << 4;	/* bit 4: y bit 1 */
+	offset += ((x >> 1) & 0x1) << 3;	/* bit 3: x bit 1 */
+	offset += (y & 0x1) << 2;	/* bit 2: y bit 0 */
+	offset += (x & 0x1) << 1;	/* bit 1: x bit 0 (bit 0 stays clear: 2-byte elements) */
+    }
+    return &ptr[offset];
+}
+
+static GLubyte *r200_depth_4byte(const struct radeon_renderbuffer * rrb,
+				 GLint x, GLint y)
+{	/* Returns the address of the 4-byte depth/stencil value at (x, y) inside rrb's buffer object. */
+    GLubyte *ptr = rrb->bo->ptr;	/* base pointer stored in the buffer object */
+    GLint offset;			/* byte offset of (x, y) from ptr */
+    if (rrb->has_surface) {
+	offset = x * rrb->cpp + y * rrb->pitch;	/* linear: cpp bytes per pixel, pitch bytes per row */
+    } else {
+	GLuint b;			/* block index built from (y & 0x7ff) >> 4 and x >> 5 */
+	offset = 0;
+	b = (((y & 0x7ff) >> 4) * (rrb->pitch >> 7) + (x >> 5));	/* row of blocks * blocks per row + column of block */
+	offset += (b >> 1) << 12;	/* bits 12+: b / 2 */
+	offset += (((rrb->pitch >> 7) & 0x1) ? (b & 0x1) : ((b & 0x1) ^ ((y >> 4) & 0x1))) << 11;	/* bit 11: low bit of b, XORed with y bit 4 when (pitch >> 7) is even */
+	offset += ((y >> 2) & 0x3) << 9;	/* bits 9-10: y bits 2-3 */
+	offset += ((x >> 2) & 0x1) << 8;	/* bit 8: x bit 2 */
+	offset += ((x >> 3) & 0x3) << 6;	/* bits 6-7: x bits 3-4 */
+	offset += ((y >> 1) & 0x1) << 5;	/* bit 5: y bit 1 */
+	offset += ((x >> 1) & 0x1) << 4;	/* bit 4: x bit 1 */
+	offset += (y & 0x1) << 3;	/* bit 3: y bit 0 */
+	offset += (x & 0x1) << 2;	/* bit 2: x bit 0 (bits 0-1 stay clear: 4-byte elements) */
+    }
+    return &ptr[offset];
+}
+#endif
+
+/* Radeon tiling on r300-r500 has four macro/micro combinations:
+     macro-linear / micro-linear
+     macro-linear / micro-tiled
+     macro-tiled  / micro-linear
+     macro-tiled  / micro-tiled
+   Surface element sizes: 1 byte;
+   2 bytes (two microtile types exist - only 8x2 microtiling is provided here);
+   4 bytes;
+   8/16 bytes (unused).
+*/
+static GLubyte *radeon_ptr_4byte(const struct radeon_renderbuffer * rrb,
+			     GLint x, GLint y)
+{	/* Returns the address of the 4-byte element at (x, y) inside rrb's buffer object. */
+    GLubyte *ptr = rrb->bo->ptr;	/* base pointer stored in the buffer object */
+    uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;	/* any tiling flag */
+    GLint offset;	/* byte offset of (x, y) from ptr */
+
+    if (rrb->has_surface || !(rrb->bo->flags & mask)) {	/* has_surface or no tiling flag: address linearly */
+        offset = x * rrb->cpp + y * rrb->pitch;
+    } else {
+        offset = 0;
+        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+	    if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {	/* macro-tiled / micro-tiled */
+		offset = ((y >> 4) * (rrb->pitch >> 7) + (x >> 5)) << 11;	/* bits 11+: index of the 32x16-pixel (2048-byte) tile */
+		offset += (((y >> 3) ^ (x >> 5)) & 0x1) << 10;	/* bit 10: y bit 3 ^ x bit 5 */
+		offset += (((y >> 4) ^ (x >> 4)) & 0x1) << 9;	/* bit 9: y bit 4 ^ x bit 4 */
+		offset += (((y >> 2) ^ (x >> 4)) & 0x1) << 8;	/* bit 8: y bit 2 ^ x bit 4 */
+		offset += (((y >> 3) ^ (x >> 3)) & 0x1) << 7;	/* bit 7: y bit 3 ^ x bit 3 */
+		offset += ((y >> 1) & 0x1) << 6;	/* bit 6: y bit 1 */
+		offset += ((x >> 2) & 0x1) << 5;	/* bit 5: x bit 2 */
+		offset += (y & 1) << 4;	/* bit 4: y bit 0 */
+		offset += (x & 3) << 2;	/* bits 2-3: x bits 0-1 */
+            } else {	/* macro-tiled / micro-linear */
+		offset = ((y >> 3) * (rrb->pitch >> 8) + (x >> 6)) << 11;	/* bits 11+: index of the 64x8-pixel (2048-byte) tile */
+		offset += (((y >> 2) ^ (x >> 6)) & 0x1) << 10;	/* bit 10: y bit 2 ^ x bit 6 */
+		offset += (((y >> 3) ^ (x >> 5)) & 0x1) << 9;	/* bit 9: y bit 3 ^ x bit 5 */
+		offset += (((y >> 1) ^ (x >> 5)) & 0x1) << 8;	/* bit 8: y bit 1 ^ x bit 5 */
+		offset += (((y >> 2) ^ (x >> 4)) & 0x1) << 7;	/* bit 7: y bit 2 ^ x bit 4 */
+		offset += (y & 1) << 6;	/* bit 6: y bit 0 */
+		offset += (x & 15) << 2;	/* bits 2-5: x bits 0-3 */
+            }
+        } else {	/* macro-linear / micro-tiled */
+	    offset = ((y >> 1) * (rrb->pitch >> 4) + (x >> 2)) << 5;	/* bits 5+: index of the 4x2-pixel (32-byte) microtile */
+	    offset += (y & 1) << 4;	/* bit 4: y bit 0 */
+	    offset += (x & 3) << 2;	/* bits 2-3: x bits 0-1 */
+        }
+    }
+    return &ptr[offset];
+}
+
+/* Address of 2-byte pixel (x, y) in rrb's BO mapping: linear, or 8x2 micro / 2KB macro tiled per bo->flags. */
+static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
+				     GLint x, GLint y)
+{
+    GLubyte *ptr = rrb->bo->ptr;
+    uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
+    GLint offset;
+    if (rrb->has_surface || !(rrb->bo->flags & mask)) {
+        offset = x * rrb->cpp + y * rrb->pitch;	/* linear: pitch is the row stride in bytes */
+    } else {
+        /* Tiled: every x/y bit must reach its own offset bit, or two pixels alias. */
+        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+            if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {
+		offset = ((y >> 4) * (rrb->pitch >> 7) + (x >> 6)) << 11;	/* bits 11+: 64x16-pixel (2048-byte) tile index */
+		offset += (((y >> 3) ^ (x >> 6)) & 0x1) << 10;
+		offset += (((y >> 4) ^ (x >> 5)) & 0x1) << 9;
+		offset += (((y >> 2) ^ (x >> 5)) & 0x1) << 8;
+		offset += (((y >> 3) ^ (x >> 4)) & 0x1) << 7;
+		offset += ((y >> 1) & 0x1) << 6;
+		offset += ((x >> 3) & 0x1) << 5;
+		offset += (y & 1) << 4;
+		offset += (x & 7) << 1;	/* x bits 0-2 at 2-byte stride; was (x & 3) << 2, which dropped x bit 2 */
+            } else {
+		offset = ((y >> 3) * (rrb->pitch >> 8) + (x >> 7)) << 11;	/* bits 11+: 128x8-pixel (2048-byte) tile index */
+		offset += (((y >> 2) ^ (x >> 7)) & 0x1) << 10;
+		offset += (((y >> 3) ^ (x >> 6)) & 0x1) << 9;
+		offset += (((y >> 1) ^ (x >> 6)) & 0x1) << 8;
+		offset += (((y >> 2) ^ (x >> 5)) & 0x1) << 7;
+		offset += (y & 1) << 6;
+		offset += ((x >> 4) & 0x1) << 5;
+		offset += (x & 15) << 1;	/* x bits 0-3 at 2-byte stride; was << 2, which collided with bit 5 above */
+            }
+        } else {
+	    offset = ((y >> 1) * (rrb->pitch >> 4) + (x >> 3)) << 5;	/* bits 5+: 8x2-pixel (32-byte) microtile index */
+	    offset += (y & 0x1) << 4;
+	    offset += (x & 0x7) << 1;
+        }
+    }
+    return &ptr[offset];
+}
+
+#ifndef COMPILE_R300
+static uint32_t
+z24s8_to_s8z24(uint32_t val)
+{  /* Z24S8 -> S8Z24: rotate right by 8 so the low byte ends up in the top byte. */
+   return (val >> 8) | ((val & 0xffu) << 24);
+}
+
+static uint32_t
+s8z24_to_z24s8(uint32_t val)
+{  /* S8Z24 -> Z24S8: rotate left by 8 so the top byte ends up in the low byte. */
+   return ((val & 0x00ffffffu) << 8) | (val >> 24);
+}
+#endif
+
 /*
  * Note that all information needed to access pixels in a renderbuffer
  * should be obtained through the gl_renderbuffer parameter, not per-context
  * information.
  */
 #define LOCAL_VARS						\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
-   const GLuint bottom = dPriv->h - 1;				\
-   GLubyte *buf = (GLubyte *) drb->flippedData			\
-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
-   GLuint p;							\
-   (void) p;
+   struct radeon_context *radeon = RADEON_CONTEXT(ctx);			\
+   struct radeon_renderbuffer *rrb = (void *) rb; /* rb viewed as the driver's renderbuffer type */	\
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1; /* Y_FLIP factor: -1 (flip) when Name == 0 */	\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : rrb->base.Height - 1; /* Y_FLIP offset for the flipped case */\
+   unsigned int num_cliprects;	/* number of entries in cliprects */	\
+   struct drm_clip_rect *cliprects;	/* rectangles walked by HW_CLIPLOOP */	\
+   int x_off, y_off;	/* added to span coordinates when addressing; subtracted from cliprects */	\
+   GLuint p;						\
+   (void)p;						\
+   radeon_get_cliprects(radeon, &cliprects, &num_cliprects, &x_off, &y_off); /* presumably fills the four outputs above - confirm */
 
 #define LOCAL_DEPTH_VARS				\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
-   const GLuint bottom = dPriv->h - 1;			\
-   GLuint xo = dPriv->x;				\
-   GLuint yo = dPriv->y;				\
-   GLubyte *buf = (GLubyte *) drb->Base.Data;
+   struct radeon_context *radeon = RADEON_CONTEXT(ctx);			\
+   struct radeon_renderbuffer *rrb = (void *) rb; /* rb viewed as the driver's renderbuffer type */	\
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1; /* Y_FLIP factor: -1 (flip) when Name == 0 */	\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : rrb->base.Height - 1; /* Y_FLIP offset for the flipped case */\
+   unsigned int num_cliprects;	/* number of entries in cliprects */	\
+   struct drm_clip_rect *cliprects;	/* rectangles walked by HW_CLIPLOOP */	\
+   int x_off, y_off;	/* added to span coordinates when addressing; subtracted from cliprects */	\
+  radeon_get_cliprects(radeon, &cliprects, &num_cliprects, &x_off, &y_off); /* presumably fills the four outputs above - confirm */
 
 #define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
 
-#define Y_FLIP(Y) (bottom - (Y))
+#define Y_FLIP(_y) ((_y) * yScale + yBias)
 
 #define HW_LOCK()
 
 #define HW_UNLOCK()
 
+/* XXX FBO: this is identical to the macro in spantmp2.h except that the
+ * cliprect info comes from the context (see radeon_get_cliprects in the
+ * LOCAL_*_VARS macros), not the driDrawable.  Move this into spantmp2.h someday.
+ * The do/while blocks are left open on purpose; presumably HW_ENDCLIPLOOP() closes them. */
+#define HW_CLIPLOOP()							\
+   do {									\
+      int _nc = num_cliprects;						\
+      while ( _nc-- ) {							\
+	 int minx = cliprects[_nc].x1 - x_off;				\
+	 int miny = cliprects[_nc].y1 - y_off;				\
+	 int maxx = cliprects[_nc].x2 - x_off;				\
+	 int maxy = cliprects[_nc].y2 - y_off;
+
 /* ================================================================
  * Color buffer
  */
@@ -94,7 +270,41 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define TAG(x)    radeon##x##_RGB565
 #define TAG2(x,y) radeon##x##_RGB565##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#include "spantmp2.h"
+
+/* 16 bit, ARGB1555 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_1_5_5_5_REV
+
+#define TAG(x)    radeon##x##_ARGB1555
+#define TAG2(x,y) radeon##x##_ARGB1555##y
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#include "spantmp2.h"
+
+/* 16 bit, ARGB4444 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_4_4_4_4_REV
+
+#define TAG(x)    radeon##x##_ARGB4444
+#define TAG2(x,y) radeon##x##_ARGB4444##y
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#include "spantmp2.h"
+
+/* 32 bit, xRGB8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+
+#define TAG(x)    radeon##x##_xRGB8888
+#define TAG2(x,y) radeon##x##_xRGB8888##y
+#define GET_VALUE(_x, _y) ((*(GLuint*)(radeon_ptr_4byte(rrb, (_x) + x_off, (_y) + y_off)) | 0xff000000)) /* top byte forced to 0xff on read */
+#define PUT_VALUE(_x, _y, d) do { /* real do/while(0): expands to exactly one statement */ \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, (_x) + x_off, (_y) + y_off );		\
+   *_ptr = (d);								\
+} while (0)
 #include "spantmp2.h"
 
 /* 32 bit, ARGB8888 color spanline and pixel functions
@@ -104,7 +314,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define TAG(x)    radeon##x##_ARGB8888
 #define TAG2(x,y) radeon##x##_ARGB8888##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
+#define GET_VALUE(_x, _y) (*(GLuint*)(radeon_ptr_4byte(rrb, (_x) + x_off, (_y) + y_off))) /* raw 32-bit pixel */
+#define PUT_VALUE(_x, _y, d) do { /* real do/while(0): expands to exactly one statement */ \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, (_x) + x_off, (_y) + y_off );		\
+   *_ptr = (d);								\
+} while (0)
 #include "spantmp2.h"
 
 /* ================================================================
@@ -121,106 +335,127 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * too...
  */
 
-static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 4 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0..1] = 0           */
-
-#ifdef COMPILE_R300
-		ba = (y / 8) * (pitch / 8) + (x / 8);
-#else
-		ba = (y / 16) * (pitch / 16) + (x / 16);
-#endif
-
-		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
-		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
-		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
-static INLINE GLuint
-radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 2 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0]    = 0           */
-
-		ba = (y / 16) * (pitch / 32) + (x / 32);
-
-		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
-		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
-		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
 /* 16-bit depth buffer functions
  */
 #define VALUE_TYPE GLushort
 
+#if defined(RADEON_COMMON_FOR_R200)
+#define WRITE_DEPTH( _x, _y, d )					\
+   *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off) = d
+#else
 #define WRITE_DEPTH( _x, _y, d )					\
-   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
+   *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off) = d
+#endif
 
+#if defined(RADEON_COMMON_FOR_R200)
 #define READ_DEPTH( d, _x, _y )						\
-   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
+   d = *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off)
+#else
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off)
+#endif
 
 #define TAG(x) radeon##x##_z16
 #include "depthtmp.h"
 
-/* 24 bit depth, 8 bit stencil depthbuffer functions
+/* 24 bit depth
  *
  * Careful: It looks like the R300 uses ZZZS byte order while the R200
  * uses SZZZ for 24 bit depth, 8 bit stencil mode.
  */
 #define VALUE_TYPE GLuint
 
-#ifdef COMPILE_R300
+#if defined(COMPILE_R300)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
    tmp &= 0x000000ff;							\
    tmp |= ((d << 8) & 0xffffff00);					\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
+} while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
+   tmp &= 0xff000000;							\
+   tmp |= ((d) & 0x00ffffff);						\
+   *_ptr = tmp;					\
 } while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
+   GLuint tmp = *_ptr;							\
    tmp &= 0xff000000;							\
    tmp |= ((d) & 0x00ffffff);						\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
 } while (0)
 #endif
 
-#ifdef COMPILE_R300
+#if defined(COMPILE_R300)
 #define READ_DEPTH( d, _x, _y )						\
-  do { \
-    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
-					 _y + yo )) & 0xffffff00) >> 8; \
+  do {									\
+    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) & 0xffffff00) >> 8; \
+  }while(0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define READ_DEPTH( d, _x, _y )						\
+  do {									\
+    d = *(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off)) & 0x00ffffff; \
   }while(0)
 #else
+#define READ_DEPTH( d, _x, _y )	do { /* generic path: depth is the low 24 bits; no ';' baked into the macro */ \
+    d = *(GLuint*)(radeon_ptr_4byte(rrb, (_x) + x_off, (_y) + y_off)) & 0x00ffffff; } while (0)
+#endif
+
+#define TAG(x) radeon##x##_z24
+#include "depthtmp.h"
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions
+ * EXT_depth_stencil
+ *
+ * Careful: It looks like the R300 uses ZZZS byte order while the R200
+ * uses SZZZ for 24 bit depth, 8 bit stencil mode.
+ */
+#define VALUE_TYPE GLuint
+
+#if defined(COMPILE_R300)
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   *_ptr = d;								\
+} while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = z24s8_to_s8z24(d);					\
+   *_ptr = tmp;								\
+} while (0)
+#else
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
+   GLuint tmp = z24s8_to_s8z24(d);					\
+   *_ptr = tmp;					\
+} while (0)
+#endif
+
+#if defined(COMPILE_R300)
+#define READ_DEPTH( d, _x, _y )						\
+  do { \
+    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)));	\
+  }while(0)
+#elif defined(RADEON_COMMON_FOR_R200)
 #define READ_DEPTH( d, _x, _y )						\
-   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
-					 _y + yo )) & 0x00ffffff;
+  do { \
+    d = s8z24_to_z24s8(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off)));	\
+  }while(0)
+#else
+#define READ_DEPTH( d, _x, _y )	do {					\
+    d = s8z24_to_z24s8(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off ))); \
+  } while (0)
 #endif
 
 #define TAG(x) radeon##x##_z24_s8
@@ -235,35 +470,51 @@ do {									\
 #ifdef COMPILE_R300
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
+   GLuint tmp = *_ptr;				\
    tmp &= 0xffffff00;							\
    tmp |= (d) & 0xff;							\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
+} while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte(rrb, _x + x_off, _y + y_off);		\
+   GLuint tmp = *_ptr;				\
+   tmp &= 0x00ffffff;							\
+   tmp |= (((d) & 0xff) << 24);						\
+   *_ptr = tmp;					\
 } while (0)
 #else
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
+   GLuint tmp = *_ptr;				\
    tmp &= 0x00ffffff;							\
    tmp |= (((d) & 0xff) << 24);						\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
 } while (0)
 #endif
 
 #ifdef COMPILE_R300
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
    d = tmp & 0x000000ff;						\
 } while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
+   d = (tmp & 0xff000000) >> 24;					\
+} while (0)
 #else
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
    d = (tmp & 0xff000000) >> 24;					\
 } while (0)
 #endif
@@ -271,29 +522,110 @@ do {									\
 #define TAG(x) radeon##x##_z24_s8
 #include "stenciltmp.h"
 
-/* Move locking out to get reasonable span performance (10x better
- * than doing this in HW_LOCK above).  WaitForIdle() is the main
- * culprit.
- */
+
+static void map_unmap_rb(struct gl_renderbuffer *rb, int flag)	/* flag != 0: map rb's BO; flag == 0: unmap it */
+{
+	struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
+	int r;	/* return code of radeon_bo_map */
+
+	if (rrb == NULL || !rrb->bo)	/* nothing to do without a radeon renderbuffer and its BO */
+		return;
+
+	if (flag) {
+		if (rrb->bo->bom->funcs->bo_wait)	/* wait only when the BO manager provides bo_wait */
+			radeon_bo_wait(rrb->bo);
+		r = radeon_bo_map(rrb->bo, 1);	/* NOTE(review): second argument presumably requests write access - confirm */
+		if (r) {
+			fprintf(stderr, "(%s) error(%d) mapping buffer.\n",
+				__FUNCTION__, r);	/* NOTE(review): failure is only logged; execution continues below */
+		}
+
+		radeonSetSpanFunctions(rrb);	/* install the accessors matching rrb's actual format */
+	} else {
+		radeon_bo_unmap(rrb->bo);
+		rb->GetRow = NULL;	/* clear the row accessors once the BO is unmapped */
+		rb->PutRow = NULL;
+	}
+}
+
+/* Map (map != 0) or unmap every buffer the span code below may be asked to touch. */
+static void
+radeon_map_unmap_buffers(GLcontext *ctx, GLboolean map)
+{
+	GLuint buf, att_idx;
+
+	/* 1. Every colour renderbuffer currently selected for drawing. */
+	for (buf = 0; buf < ctx->DrawBuffer->_NumColorDrawBuffers; buf++)
+		map_unmap_rb(ctx->DrawBuffer->_ColorDrawBuffers[buf], map);
+
+	/* 2. Texture images attached to the draw framebuffer (render to texture). */
+	for (att_idx = 0; att_idx < BUFFER_COUNT; att_idx++) {
+		struct gl_renderbuffer_attachment *att =
+			&ctx->DrawBuffer->Attachment[att_idx];
+		radeon_texture_image *image;
+
+		if (!att->Texture)
+			continue;
+		/* Only the attached face/level is mapped: a mipmapped texture
+		 * need not be complete to be rendered to. */
+		image = get_radeon_texture_image(
+			att->Texture->Image[att->CubeMapFace][att->TextureLevel]);
+		ASSERT(att->Renderbuffer);
+		if (map)
+			radeon_teximage_map(image, GL_TRUE);
+		else
+			radeon_teximage_unmap(image);
+	}
+
+	/* 3. The colour buffer selected for reading. */
+	map_unmap_rb(ctx->ReadBuffer->_ColorReadBuffer, map);
+
+	/* 4. Depth and stencil: the bound objects are wrappers, so go through ->Wrapped. */
+	if (ctx->DrawBuffer->_DepthBuffer)
+		map_unmap_rb(ctx->DrawBuffer->_DepthBuffer->Wrapped, map);
+	if (ctx->DrawBuffer->_StencilBuffer)
+		map_unmap_rb(ctx->DrawBuffer->_StencilBuffer->Wrapped, map);
+}
 
 static void radeonSpanRenderStart(GLcontext * ctx)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-#ifdef COMPILE_R300
-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
-	R300_FIREVERTICES(r300);
-#else
-	RADEON_FIREVERTICES(rmesa);
-#endif
-	LOCK_HARDWARE(rmesa);
-	radeonWaitForIdleLocked(rmesa);
+	int i;	/* texture unit index */
+
+	radeon_firevertices(rmesa);	/* presumably flushes queued vertices before the CPU touches the buffers - confirm */
+
+	/* The locking and wait for idle should really only be needed in classic mode.
+	 * In a future memory manager based implementation, this should become
+	 * unnecessary due to the fact that mapping our buffers, textures, etc.
+	 * should implicitly wait for any previous rendering commands that must
+	 * be waited on. */
+	if (!rmesa->radeonScreen->driScreen->dri2.enabled) {	/* only when DRI2 is not enabled */
+		LOCK_HARDWARE(rmesa);	/* released again in radeonSpanRenderFinish */
+		radeonWaitForIdleLocked(rmesa);
+	}
+
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {	/* map the current texture of every enabled unit */
+		if (ctx->Texture.Unit[i]._ReallyEnabled)
+			ctx->Driver.MapTexture(ctx, ctx->Texture.Unit[i]._Current);
+	}
+
+	radeon_map_unmap_buffers(ctx, 1);	/* map draw/read/depth/stencil buffers and attached texture images */
 }
 
 static void radeonSpanRenderFinish(GLcontext * ctx)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	int i;	/* texture unit index */
 	_swrast_flush(ctx);
-	UNLOCK_HARDWARE(rmesa);
+	if (!rmesa->radeonScreen->driScreen->dri2.enabled) {	/* mirrors the lock taken in radeonSpanRenderStart */
+		UNLOCK_HARDWARE(rmesa);
+	}
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {	/* unmap the current texture of every enabled unit */
+		if (ctx->Texture.Unit[i]._ReallyEnabled)
+			ctx->Driver.UnmapTexture(ctx, ctx->Texture.Unit[i]._Current);
+	}
+
+	radeon_map_unmap_buffers(ctx, 0);	/* unmap everything radeonSpanRenderStart mapped through this helper */
 }
 
 void radeonInitSpanFuncs(GLcontext * ctx)
@@ -307,20 +639,27 @@ void radeonInitSpanFuncs(GLcontext * ctx)
 /**
  * Plug in the Get/Put routines for the given driRenderbuffer.
  */
-void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
+static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb)
 {
-	if (drb->Base.InternalFormat == GL_RGBA) {
-		if (vis->redBits == 5 && vis->greenBits == 6
-		    && vis->blueBits == 5) {
-			radeonInitPointers_RGB565(&drb->Base);
-		} else {
-			radeonInitPointers_ARGB8888(&drb->Base);
-		}
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
-		radeonInitDepthPointers_z16(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
-		radeonInitDepthPointers_z24_s8(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
-		radeonInitStencilPointers_z24_s8(&drb->Base);
+	/* Pick the generated accessor set that matches the renderbuffer's
+	 * actual storage format (rrb->base._ActualFormat).  An unrecognised
+	 * format installs nothing and is only reported on stderr.
+	 */
+	switch (rrb->base._ActualFormat) {
+	/* colour formats */
+	case GL_RGB5:			radeonInitPointers_RGB565(&rrb->base); break;
+	case GL_RGB8:			radeonInitPointers_xRGB8888(&rrb->base); break;
+	case GL_RGBA8:			radeonInitPointers_ARGB8888(&rrb->base); break;
+	case GL_RGBA4:			radeonInitPointers_ARGB4444(&rrb->base); break;
+	case GL_RGB5_A1:		radeonInitPointers_ARGB1555(&rrb->base); break;
+	/* depth and depth/stencil formats */
+	case GL_DEPTH_COMPONENT16:	radeonInitDepthPointers_z16(&rrb->base); break;
+	case GL_DEPTH_COMPONENT24:	radeonInitDepthPointers_z24(&rrb->base); break;
+	case GL_DEPTH24_STENCIL8_EXT:	radeonInitDepthPointers_z24_s8(&rrb->base); break;
+	/* stencil accessors generated for the z24_s8 layout */
+	case GL_STENCIL_INDEX8_EXT:	radeonInitStencilPointers_z24_s8(&rrb->base); break;
+	default:
+		fprintf(stderr, "radeonSetSpanFunctions: bad actual format: 0x%04X\n", rrb->base._ActualFormat);
+		break;
 	}
 }