89 files changed, 1793 insertions, 3391 deletions
diff --git a/src/glx/x11/Makefile b/src/glx/x11/Makefile
index 2d44e8df7a..f12d1f70dc 100644
--- a/src/glx/x11/Makefile
+++ b/src/glx/x11/Makefile
@@ -10,12 +10,14 @@ SOURCES = \
 	  compsize.c \
 	  eval.c \
 	  glxcmds.c \
+	  glxcurrent.c \
 	  glxext.c \
 	  glxextensions.c \
 	  indirect.c \
 	  indirect_init.c \
 	  indirect_size.c \
 	  indirect_window_pos.c \
+	  indirect_texture_compression.c \
 	  indirect_transpose_matrix.c \
 	  indirect_vertex_array.c \
 	  indirect_vertex_program.c \
@@ -29,7 +31,6 @@ SOURCES = \
 	  xfont.c \
 	  glx_pbuffer.c \
 	  glx_query.c \
-	  glx_texture_compression.c \
 	  dri_glx.c \
 	  XF86dri.c \
 	  glxhash.c \
diff --git a/src/glx/x11/dri2_glx.c b/src/glx/x11/dri2_glx.c
index 1a6abaedab..f24492672b 100644
--- a/src/glx/x11/dri2_glx.c
+++ b/src/glx/x11/dri2_glx.c
@@ -33,20 +33,16 @@
 #ifdef GLX_DIRECT_RENDERING
 
 #include <unistd.h>
-#include <X11/Xlibint.h>
-#include <X11/extensions/Xext.h>
-#include <X11/extensions/extutil.h>
+#include <X11/Xlib.h>
 #include <X11/extensions/Xfixes.h>
 #include <X11/extensions/Xdamage.h>
 #include "glheader.h"
 #include "glxclient.h"
+#include "glcontextmodes.h"
 #include "xf86dri.h"
 #include "sarea.h"
-#include <stdio.h>
 #include <dlfcn.h>
 #include <sys/types.h>
-#include <stdarg.h>
-#include "glcontextmodes.h"
 #include <sys/mman.h>
 #include "xf86drm.h"
 #include "dri2.h"
diff --git a/src/glx/x11/dri_glx.c b/src/glx/x11/dri_glx.c
index 70873c2cc4..a58060da3a 100644
--- a/src/glx/x11/dri_glx.c
+++ b/src/glx/x11/dri_glx.c
@@ -35,20 +35,16 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifdef GLX_DIRECT_RENDERING
 
 #include <unistd.h>
-#include <X11/Xlibint.h>
-#include <X11/extensions/Xext.h>
-#include <X11/extensions/extutil.h>
+#include <X11/Xlib.h>
 #include <X11/extensions/Xfixes.h>
 #include <X11/extensions/Xdamage.h>
 #include "glheader.h"
 #include "glxclient.h"
+#include "glcontextmodes.h"
 #include "xf86dri.h"
 #include "sarea.h"
-#include <stdio.h>
 #include <dlfcn.h>
 #include <sys/types.h>
-#include <stdarg.h>
-#include "glcontextmodes.h"
 #include <sys/mman.h>
 #include "xf86drm.h"
 
diff --git a/src/glx/x11/glxclient.h b/src/glx/x11/glxclient.h
index 5fd64209df..d19cc04d07 100644
--- a/src/glx/x11/glxclient.h
+++ b/src/glx/x11/glxclient.h
@@ -160,8 +160,6 @@ extern const char *glXGetScreenDriver (Display *dpy, int scrNum);
 
 extern const char *glXGetDriverConfig (const char *driverName);
 
-extern Bool __glXWindowExists(Display *dpy, GLXDrawable draw);
-
 #endif
 
 /************************************************************************/
@@ -572,8 +570,6 @@ struct __GLXdisplayPrivateRec {
 };
 
 
-void __glXFreeContext(__GLXcontext*);
-
 extern GLubyte *__glXFlushRenderBuffer(__GLXcontext*, GLubyte*);
 
 extern void __glXSendLargeChunk(__GLXcontext *gc, GLint requestNumber, 
@@ -616,6 +612,10 @@ extern __GLXcontext *__glXcurrentContext;
 
 #endif /* defined( USE_XTHREADS ) || defined( PTHREADS ) */
 
+extern void __glXSetCurrentContextNull(void);
+
+extern void __glXFreeContext(__GLXcontext*);
+
 
 /*
 ** Global lock for all threads in this address space using the GLX
diff --git a/src/glx/x11/glxcmds.c b/src/glx/x11/glxcmds.c
index ddb006193c..2110b2cd86 100644
--- a/src/glx/x11/glxcmds.c
+++ b/src/glx/x11/glxcmds.c
@@ -39,22 +39,17 @@
  * Client-side GLX interface.
  */
 
-#include <inttypes.h>
 #include "glxclient.h"
-#include <X11/extensions/extutil.h>
-#include <X11/extensions/Xext.h>
-#include <assert.h>
-#include <string.h>
 #include "glapi.h"
-#ifdef GLX_DIRECT_RENDERING
-#include "indirect_init.h"
-#include <X11/extensions/xf86vmode.h>
-#include "xf86dri.h"
-#endif
 #include "glxextensions.h"
 #include "glcontextmodes.h"
 #include "glheader.h"
+
+#ifdef GLX_DIRECT_RENDERING
 #include <sys/time.h>
+#include <X11/extensions/xf86vmode.h>
+#include "xf86dri.h"
+#endif
 
 static const char __glXGLXClientVendorName[] = "SGI";
 static const char __glXGLXClientVersion[] = "1.4";
diff --git a/src/glx/x11/glxcurrent.c b/src/glx/x11/glxcurrent.c
new file mode 100644
index 0000000000..ad648fd438
--- /dev/null
+++ b/src/glx/x11/glxcurrent.c
@@ -0,0 +1,510 @@
+/*
+** License Applicability. Except to the extent portions of this file are
+** made subject to an alternative license as permitted in the SGI Free
+** Software License B, Version 1.1 (the "License"), the contents of this
+** file are subject only to the provisions of the License. You may not use
+** this file except in compliance with the License. You may obtain a copy
+** of the License at Silicon Graphics, Inc., attn: Legal Services, 1600
+** Amphitheatre Parkway, Mountain View, CA 94043-1351, or at:
+** 
+** http://oss.sgi.com/projects/FreeB
+** 
+** Note that, as provided in the License, the Software is distributed on an
+** "AS IS" basis, with ALL EXPRESS AND IMPLIED WARRANTIES AND CONDITIONS
+** DISCLAIMED, INCLUDING, WITHOUT LIMITATION, ANY IMPLIED WARRANTIES AND
+** CONDITIONS OF MERCHANTABILITY, SATISFACTORY QUALITY, FITNESS FOR A
+** PARTICULAR PURPOSE, AND NON-INFRINGEMENT.
+** 
+** Original Code. The Original Code is: OpenGL Sample Implementation,
+** Version 1.2.1, released January 26, 2000, developed by Silicon Graphics,
+** Inc. The Original Code is Copyright (c) 1991-2000 Silicon Graphics, Inc.
+** Copyright in any portions created by third parties is as indicated
+** elsewhere herein. All Rights Reserved.
+** 
+** Additional Notice Provisions: The application programming interfaces
+** established by SGI in conjunction with the Original Code are The
+** OpenGL(R) Graphics System: A Specification (Version 1.2.1), released
+** April 1, 1999; The OpenGL(R) Graphics System Utility Library (Version
+** 1.3), released November 4, 1998; and OpenGL(R) Graphics with the X
+** Window System(R) (Version 1.3), released October 19, 1998. This software
+** was created using the OpenGL(R) version 1.2.1 Sample Implementation
+** published by SGI, but has not been independently verified as being
+** compliant with the OpenGL(R) version 1.2.1 Specification.
+**
+*/
+
+/**
+ * \file glxcurrent.c
+ * Client-side GLX interface for current context management.
+ */
+
+#include "glxclient.h"
+#include "glapi.h"
+#include "glheader.h"
+#include "indirect_init.h"
+
+#ifdef GLX_DIRECT_RENDERING
+#include "xf86dri.h"
+#endif
+
+/*
+** We setup some dummy structures here so that the API can be used
+** even if no context is current.
+*/
+
+static GLubyte dummyBuffer[__GLX_BUFFER_LIMIT_SIZE];
+
+/*
+** Dummy context used by small commands when there is no current context.
+** All the
+** gl and glx entry points are designed to operate as nop's when using
+** the dummy context structure.
+*/
+static __GLXcontext dummyContext = {
+    &dummyBuffer[0],
+    &dummyBuffer[0],
+    &dummyBuffer[0],
+    &dummyBuffer[__GLX_BUFFER_LIMIT_SIZE],
+    sizeof(dummyBuffer),
+};
+
+
+/*
+** All indirect rendering contexts will share the same indirect dispatch table.
+*/
+static __GLapi *IndirectAPI = NULL;
+
+
+/*
+ * Current context management and locking
+ */
+
+#if defined( USE_XTHREADS )
+
+/* thread safe */
+static GLboolean TSDinitialized = GL_FALSE;
+static xthread_key_t ContextTSD;
+
+_X_HIDDEN __GLXcontext *__glXGetCurrentContext(void)
+{
+   if (!TSDinitialized) {
+      xthread_key_create(&ContextTSD, NULL);
+      TSDinitialized = GL_TRUE;
+      return &dummyContext;
+   }
+   else {
+      void *p;
+      xthread_get_specific(ContextTSD, &p);
+      if (!p)
+         return &dummyContext;
+      else
+         return (__GLXcontext *) p;
+   }
+}
+
+_X_HIDDEN void __glXSetCurrentContext(__GLXcontext *c)
+{
+   if (!TSDinitialized) {
+      xthread_key_create(&ContextTSD, NULL);
+      TSDinitialized = GL_TRUE;
+   }
+   xthread_set_specific(ContextTSD, c);
+}
+
+
+/* Used by the __glXLock() and __glXUnlock() macros */
+_X_HIDDEN xmutex_rec __glXmutex;
+
+#elif defined( PTHREADS )
+
+_X_HIDDEN pthread_mutex_t __glXmutex = PTHREAD_MUTEX_INITIALIZER;
+
+# if defined( GLX_USE_TLS )
+
+/**
+ * Per-thread GLX context pointer.
+ * 
+ * \c __glXSetCurrentContext is written in such a way that this pointer can
+ * \b never be \c NULL.  This is important!  Because of this
+ * \c __glXGetCurrentContext can be implemented as a trivial macro.
+ */
+__thread void * __glX_tls_Context __attribute__((tls_model("initial-exec")))
+    = &dummyContext;
+
+_X_HIDDEN void __glXSetCurrentContext( __GLXcontext * c )
+{
+    __glX_tls_Context = (c != NULL) ? c : &dummyContext;
+}
+
+# else
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+
+/**
+ * Per-thread data key.
+ * 
+ * Once \c init_thread_data has been called, the per-thread data key will
+ * take a value of \c NULL.  As each new thread is created the default
+ * value, in that thread, will be \c NULL.
+ */
+static pthread_key_t ContextTSD;
+
+/**
+ * Initialize the per-thread data key.
+ * 
+ * This function is called \b exactly once per-process (not per-thread!) to
+ * initialize the per-thread data key.  This is ideally done using the
+ * \c pthread_once mechanism.
+ */
+static void init_thread_data( void )
+{
+    if ( pthread_key_create( & ContextTSD, NULL ) != 0 ) {
+	perror( "pthread_key_create" );
+	exit( -1 );
+    }
+}
+
+_X_HIDDEN void __glXSetCurrentContext( __GLXcontext * c )
+{
+    pthread_once( & once_control, init_thread_data );
+    pthread_setspecific( ContextTSD, c );
+}
+
+_X_HIDDEN __GLXcontext * __glXGetCurrentContext( void )
+{
+    void * v;
+
+    pthread_once( & once_control, init_thread_data );
+
+    v = pthread_getspecific( ContextTSD );
+    return (v == NULL) ? & dummyContext : (__GLXcontext *) v;
+}
+
+# endif /* defined( GLX_USE_TLS ) */
+
+#elif defined( THREADS )
+
+#error Unknown threading method specified.
+
+#else
+
+/* not thread safe */
+_X_HIDDEN __GLXcontext *__glXcurrentContext = &dummyContext;
+
+#endif
+
+
+_X_HIDDEN void __glXSetCurrentContextNull(void)
+{
+    __glXSetCurrentContext(&dummyContext);
+#ifdef GLX_DIRECT_RENDERING
+    _glapi_set_dispatch(NULL);  /* no-op functions */
+#endif
+}
+
+
+/************************************************************************/
+
+PUBLIC GLXContext glXGetCurrentContext(void)
+{
+    GLXContext cx = __glXGetCurrentContext();
+    
+    if (cx == &dummyContext) {
+	return NULL;
+    } else {
+	return cx;
+    }
+}
+
+PUBLIC GLXDrawable glXGetCurrentDrawable(void)
+{
+    GLXContext gc = __glXGetCurrentContext();
+    return gc->currentDrawable;
+}
+
+
+/************************************************************************/
+
+/**
+ * Sends a GLX protocol message to the specified display to make the context
+ * and the drawables current.
+ *
+ * \param dpy     Display to send the message to.
+ * \param opcode  Major opcode value for the display.
+ * \param gc_id   Context ID of the context to be made current.
+ * \param gc_tag  Context tag sent as \c oldContextTag in the request.
+ * \param draw,read  Drawable IDs for the "draw" and "read" drawables.
+ * \param reply   Space to store the X-server's reply.
+ *
+ * \note
+ * This function calls \c LockDisplay / \c UnlockDisplay on \c dpy itself.
+ */
+static Bool SendMakeCurrentRequest(Display *dpy, CARD8 opcode,
+				   GLXContextID gc_id, GLXContextTag gc_tag,
+				   GLXDrawable draw, GLXDrawable read,
+				   xGLXMakeCurrentReply *reply)
+{
+    Bool ret;
+
+
+    LockDisplay(dpy);
+
+    if (draw == read) {
+	xGLXMakeCurrentReq *req;
+
+	GetReq(GLXMakeCurrent,req);
+	req->reqType = opcode;
+	req->glxCode = X_GLXMakeCurrent;
+	req->drawable = draw;
+	req->context = gc_id;
+	req->oldContextTag = gc_tag;
+    }
+    else {
+	__GLXdisplayPrivate *priv = __glXInitialize(dpy);
+
+	/* If the server can support the GLX 1.3 version, we should
+	 * prefer that.  Not only that, some servers support GLX 1.3 but
+	 * not the SGI extension.
+	 */
+
+	if ((priv->majorVersion > 1) || (priv->minorVersion >= 3)) {
+	    xGLXMakeContextCurrentReq *req;
+
+	    GetReq(GLXMakeContextCurrent,req);
+	    req->reqType = opcode;
+	    req->glxCode = X_GLXMakeContextCurrent;
+	    req->drawable = draw;
+	    req->readdrawable = read;
+	    req->context = gc_id;
+	    req->oldContextTag = gc_tag;
+	}
+	else {
+	    xGLXVendorPrivateWithReplyReq *vpreq;
+	    xGLXMakeCurrentReadSGIReq *req;
+
+	    GetReqExtra(GLXVendorPrivateWithReply,
+			sz_xGLXMakeCurrentReadSGIReq-sz_xGLXVendorPrivateWithReplyReq,vpreq);
+	    req = (xGLXMakeCurrentReadSGIReq *)vpreq;
+	    req->reqType = opcode;
+	    req->glxCode = X_GLXVendorPrivateWithReply;
+	    req->vendorCode = X_GLXvop_MakeCurrentReadSGI;
+	    req->drawable = draw;
+	    req->readable = read;
+	    req->context = gc_id;
+	    req->oldContextTag = gc_tag;
+	}
+    }
+
+    ret = _XReply(dpy, (xReply*) reply, 0, False);
+
+    UnlockDisplay(dpy);
+    SyncHandle();
+
+    return ret;
+}
+
+
+#ifdef GLX_DIRECT_RENDERING
+/* Look up the DRI drawable for glxDrawable on gc's screen, creating and
+ * caching it in the screen's drawHash on a miss.  pre13 (glXMakeCurrent)
+ * passes glxDrawable as the X drawable too; otherwise None is passed.
+ * Returns NULL when lookup state is missing or creation/caching fails. */
+static __GLXDRIdrawable *
+FetchDRIDrawable(Display *dpy,
+		 GLXDrawable glxDrawable, GLXContext gc, Bool pre13)
+{
+    __GLXdisplayPrivate * const priv = __glXInitialize(dpy);
+    __GLXDRIdrawable *pdraw;
+    __GLXscreenConfigs *psc;
+    XID drawable;
+
+    if (priv == NULL)
+	return NULL;
+
+    psc = &priv->screenConfigs[gc->screen];
+    if (psc->drawHash == NULL)
+	return NULL;
+
+    if (__glxHashLookup(psc->drawHash, glxDrawable, (void *) &pdraw) == 0)
+	return pdraw;
+
+    /* glXMakeCurrent (pre GLX 1.3) allows creating the GLX drawable on the
+     * fly; otherwise None is passed as the X drawable. */
+    drawable = pre13 ? glxDrawable : None;
+    pdraw = psc->driScreen->createDrawable(psc, drawable,
+					   glxDrawable, gc->mode);
+    if (pdraw == NULL)
+	return NULL;	/* creation failed: nothing to cache or destroy */
+    if (__glxHashInsert(psc->drawHash, glxDrawable, pdraw)) {
+	(*pdraw->destroyDrawable)(pdraw);
+	return NULL;
+    }
+    return pdraw;
+}
+#endif /* GLX_DIRECT_RENDERING */
+
+
+/**
+ * Make a particular context current.
+ * 
+ * \note This is in this file so that it can access dummyContext.
+ */
+static Bool MakeContextCurrent(Display *dpy, GLXDrawable draw,
+			       GLXDrawable read, GLXContext gc,
+			       Bool pre13)
+{
+    xGLXMakeCurrentReply reply;
+    const GLXContext oldGC = __glXGetCurrentContext();
+    const CARD8 opcode = __glXSetupForCommand(dpy);
+    const CARD8 oldOpcode = ((gc == oldGC) || (oldGC == &dummyContext))
+      ? opcode : __glXSetupForCommand(oldGC->currentDpy);
+    Bool bindReturnValue;
+
+
+    if (!opcode || !oldOpcode) {
+	return GL_FALSE;
+    }
+
+    /* Make sure that the new context has a nonzero ID.  In the request,
+     * a zero context ID is used only to mean that we bind to no current
+     * context.
+     */
+    if ((gc != NULL) && (gc->xid == None)) {
+	return GL_FALSE;
+    }
+
+    _glapi_check_multithread();
+
+#ifdef GLX_DIRECT_RENDERING
+    /* Bind the direct rendering context to the drawable */
+    if (gc && gc->driContext) {
+	__GLXDRIdrawable *pdraw = FetchDRIDrawable(dpy, draw, gc, pre13);
+	__GLXDRIdrawable *pread = FetchDRIDrawable(dpy, read, gc, pre13);
+
+	bindReturnValue =
+	    (gc->driContext->bindContext) (gc->driContext, pdraw, pread);
+    } else
+#endif
+    {
+	/* Send a glXMakeCurrent request to bind the new context. */
+	bindReturnValue = 
+	  SendMakeCurrentRequest(dpy, opcode, gc ? gc->xid : None,
+				 ((dpy != oldGC->currentDpy) || oldGC->isDirect)
+				 ? None : oldGC->currentContextTag,
+				 draw, read, &reply);
+    }
+
+
+    if (!bindReturnValue) {
+	return False;
+    }
+
+    if ((dpy != oldGC->currentDpy || (gc && gc->driContext)) &&
+	!oldGC->isDirect && oldGC != &dummyContext) {
+	xGLXMakeCurrentReply dummy_reply;
+
+	/* We are either switching from one dpy to another and have to
+	 * send a request to the previous dpy to unbind the previous
+	 * context, or we are switching away from an indirect context to
+	 * a direct context and have to send a request to the dpy to
+	 * unbind the previous context.
+	 */
+	(void) SendMakeCurrentRequest(oldGC->currentDpy, oldOpcode, None,
+				      oldGC->currentContextTag, None, None,
+				      & dummy_reply);
+    }
+#ifdef GLX_DIRECT_RENDERING
+    else if (oldGC->driContext) {
+	oldGC->driContext->unbindContext(oldGC->driContext);
+    }
+#endif
+
+
+    /* Update our notion of what is current */
+    __glXLock();
+    if (gc == oldGC) {
+	/* Even though the contexts are the same the drawable might have
+	 * changed.  Note that gc cannot be the dummy, and that oldGC
+	 * cannot be NULL, therefore if they are the same, gc is not
+	 * NULL and not the dummy.
+	 */
+	gc->currentDrawable = draw;
+	gc->currentReadable = read;
+    } else {
+	if (oldGC != &dummyContext) {
+	    /* Old current context is no longer current to anybody */
+	    oldGC->currentDpy = 0;
+	    oldGC->currentDrawable = None;
+	    oldGC->currentReadable = None;
+	    oldGC->currentContextTag = 0;
+
+	    if (oldGC->xid == None) {
+		/* We are switching away from a context that was
+		 * previously destroyed, so we need to free the memory
+		 * for the old handle.
+		 */
+#ifdef GLX_DIRECT_RENDERING
+		/* Destroy the old direct rendering context */
+		if (oldGC->driContext) {
+		    oldGC->driContext->destroyContext(oldGC->driContext,
+						      oldGC->psc,
+						      oldGC->createDpy);
+		    oldGC->driContext = NULL;
+		}
+#endif
+		__glXFreeContext(oldGC);
+	    }
+	}
+	if (gc) {
+	    __glXSetCurrentContext(gc);
+
+	    gc->currentDpy = dpy;
+	    gc->currentDrawable = draw;
+	    gc->currentReadable = read;
+
+            if (!gc->driContext) {
+               if (!IndirectAPI)
+                  IndirectAPI = __glXNewIndirectAPI();
+               _glapi_set_dispatch(IndirectAPI);
+
+#ifdef GLX_USE_APPLEGL
+               do {
+                   extern void XAppleDRIUseIndirectDispatch(void);
+                   XAppleDRIUseIndirectDispatch();
+               } while (0);
+#endif
+
+		__GLXattribute *state = 
+		  (__GLXattribute *)(gc->client_state_private);
+
+		gc->currentContextTag = reply.contextTag;
+		if (state->array_state == NULL) {
+		    (void) glGetString(GL_EXTENSIONS);
+		    (void) glGetString(GL_VERSION);
+		    __glXInitVertexArrayState(gc);
+		}
+	    }
+	    else {
+		gc->currentContextTag = -1;
+	    }
+	} else {
+	    __glXSetCurrentContextNull();
+	}
+    }
+    __glXUnlock();
+    return GL_TRUE;
+}
+
+
+PUBLIC Bool glXMakeCurrent(Display *dpy, GLXDrawable draw, GLXContext gc)
+{
+    return MakeContextCurrent(dpy, draw, draw, gc, True);
+}
+
+PUBLIC GLX_ALIAS(Bool, glXMakeCurrentReadSGI,
+	  (Display *dpy, GLXDrawable d, GLXDrawable r, GLXContext ctx),
+	  (dpy, d, r, ctx, False), MakeContextCurrent)
+
+PUBLIC GLX_ALIAS(Bool, glXMakeContextCurrent,
+	  (Display *dpy, GLXDrawable d, GLXDrawable r, GLXContext ctx),
+	  (dpy, d, r, ctx, False), MakeContextCurrent)
diff --git a/src/glx/x11/glxext.c b/src/glx/x11/glxext.c
index cd5c3196e3..75b7374a3f 100644
--- a/src/glx/x11/glxext.c
+++ b/src/glx/x11/glxext.c
@@ -45,26 +45,13 @@
  */     
 
 #include "glxclient.h"
-#include <stdio.h>
 #include <X11/extensions/Xext.h>
 #include <X11/extensions/extutil.h>
-#include <X11/extensions/Xfixes.h>
-#include <X11/extensions/Xdamage.h>
-#include <assert.h>
-#include "indirect_init.h"
 #include "glapi.h"
 #include "glxextensions.h"
 #include "glcontextmodes.h"
 #include "glheader.h"
 
-#ifdef GLX_DIRECT_RENDERING
-#include <inttypes.h>
-#include <sys/mman.h>
-#include "xf86dri.h"
-#include "xf86drm.h"
-#include "sarea.h"
-#endif
-
 #ifdef USE_XCB
 #include <X11/Xlib-xcb.h>
 #include <xcb/xcb.h>
@@ -77,24 +64,6 @@ void __glXDumpDrawBuffer(__GLXcontext *ctx);
 #endif
 
 #ifdef USE_SPARC_ASM
-/*
- * This is where our dispatch table's bounds are.
- * And the static mesa_init is taken directly from
- * Mesa's 'sparc.c' initializer.
- *
- * We need something like this here, because this version
- * of openGL/glx never initializes a Mesa context, and so
- * the address of the dispatch table pointer never gets stuffed
- * into the dispatch jump table otherwise.
- *
- * It matters only on SPARC, and only if you are using assembler
- * code instead of C-code indirect dispatch.
- *
- * -- FEM, 04.xii.03
- */
-extern unsigned int _mesa_sparc_glapi_begin;
-extern unsigned int _mesa_sparc_glapi_end;
-extern void __glapi_sparc_icache_flush(unsigned int *);
 static void _glx_mesa_init_sparc_glapi_relocs(void);
 static int _mesa_sparc_needs_init = 1;
 #define INIT_MESA_SPARC { \
@@ -108,153 +77,6 @@ static int _mesa_sparc_needs_init = 1;
 #endif
 
 /*
-** We setup some dummy structures here so that the API can be used
-** even if no context is current.
-*/
-
-static GLubyte dummyBuffer[__GLX_BUFFER_LIMIT_SIZE];
-
-/*
-** Dummy context used by small commands when there is no current context.
-** All the
-** gl and glx entry points are designed to operate as nop's when using
-** the dummy context structure.
-*/
-static __GLXcontext dummyContext = {
-    &dummyBuffer[0],
-    &dummyBuffer[0],
-    &dummyBuffer[0],
-    &dummyBuffer[__GLX_BUFFER_LIMIT_SIZE],
-    sizeof(dummyBuffer),
-};
-
-
-/*
-** All indirect rendering contexts will share the same indirect dispatch table.
-*/
-static __GLapi *IndirectAPI = NULL;
-
-
-/*
- * Current context management and locking
- */
-
-#if defined( USE_XTHREADS )
-
-/* thread safe */
-static GLboolean TSDinitialized = GL_FALSE;
-static xthread_key_t ContextTSD;
-
-_X_HIDDEN __GLXcontext *__glXGetCurrentContext(void)
-{
-   if (!TSDinitialized) {
-      xthread_key_create(&ContextTSD, NULL);
-      TSDinitialized = GL_TRUE;
-      return &dummyContext;
-   }
-   else {
-      void *p;
-      xthread_get_specific(ContextTSD, &p);
-      if (!p)
-         return &dummyContext;
-      else
-         return (__GLXcontext *) p;
-   }
-}
-
-_X_HIDDEN void __glXSetCurrentContext(__GLXcontext *c)
-{
-   if (!TSDinitialized) {
-      xthread_key_create(&ContextTSD, NULL);
-      TSDinitialized = GL_TRUE;
-   }
-   xthread_set_specific(ContextTSD, c);
-}
-
-
-/* Used by the __glXLock() and __glXUnlock() macros */
-_X_HIDDEN xmutex_rec __glXmutex;
-
-#elif defined( PTHREADS )
-
-_X_HIDDEN pthread_mutex_t __glXmutex = PTHREAD_MUTEX_INITIALIZER;
-
-# if defined( GLX_USE_TLS )
-
-/**
- * Per-thread GLX context pointer.
- * 
- * \c __glXSetCurrentContext is written is such a way that this pointer can
- * \b never be \c NULL.  This is important!  Because of this
- * \c __glXGetCurrentContext can be implemented as trivial macro.
- */
-__thread void * __glX_tls_Context __attribute__((tls_model("initial-exec")))
-    = &dummyContext;
-
-_X_HIDDEN void __glXSetCurrentContext( __GLXcontext * c )
-{
-    __glX_tls_Context = (c != NULL) ? c : &dummyContext;
-}
-
-# else
-
-static pthread_once_t once_control = PTHREAD_ONCE_INIT;
-
-/**
- * Per-thread data key.
- * 
- * Once \c init_thread_data has been called, the per-thread data key will
- * take a value of \c NULL.  As each new thread is created the default
- * value, in that thread, will be \c NULL.
- */
-static pthread_key_t ContextTSD;
-
-/**
- * Initialize the per-thread data key.
- * 
- * This function is called \b exactly once per-process (not per-thread!) to
- * initialize the per-thread data key.  This is ideally done using the
- * \c pthread_once mechanism.
- */
-static void init_thread_data( void )
-{
-    if ( pthread_key_create( & ContextTSD, NULL ) != 0 ) {
-	perror( "pthread_key_create" );
-	exit( -1 );
-    }
-}
-
-_X_HIDDEN void __glXSetCurrentContext( __GLXcontext * c )
-{
-    pthread_once( & once_control, init_thread_data );
-    pthread_setspecific( ContextTSD, c );
-}
-
-_X_HIDDEN __GLXcontext * __glXGetCurrentContext( void )
-{
-    void * v;
-
-    pthread_once( & once_control, init_thread_data );
-
-    v = pthread_getspecific( ContextTSD );
-    return (v == NULL) ? & dummyContext : (__GLXcontext *) v;
-}
-
-# endif /* defined( GLX_USE_TLS ) */
-
-#elif defined( THREADS )
-
-#error Unknown threading method specified.
-
-#else
-
-/* not thread safe */
-_X_HIDDEN __GLXcontext *__glXcurrentContext = &dummyContext;
-
-#endif
-
-
-/*
 ** You can set this cell to 1 to force the gl drawing stuff to be
 ** one command per packet
 */
@@ -287,10 +109,7 @@ static int __glXCloseDisplay(Display *dpy, XExtCodes *codes)
 
   gc = __glXGetCurrentContext();
   if (dpy == gc->currentDpy) {
-    __glXSetCurrentContext(&dummyContext);
-#ifdef GLX_DIRECT_RENDERING
-    _glapi_set_dispatch(NULL);  /* no-op functions */
-#endif
+    __glXSetCurrentContextNull();
     __glXFreeContext(gc);
   }
 
@@ -1086,318 +905,6 @@ _X_HIDDEN void __glXSendLargeCommand(__GLXcontext *ctx,
 
 /************************************************************************/
 
-PUBLIC GLXContext glXGetCurrentContext(void)
-{
-    GLXContext cx = __glXGetCurrentContext();
-    
-    if (cx == &dummyContext) {
-	return NULL;
-    } else {
-	return cx;
-    }
-}
-
-PUBLIC GLXDrawable glXGetCurrentDrawable(void)
-{
-    GLXContext gc = __glXGetCurrentContext();
-    return gc->currentDrawable;
-}
-
-
-/************************************************************************/
-
-static Bool SendMakeCurrentRequest( Display *dpy, CARD8 opcode,
-    GLXContextID gc, GLXContextTag old_gc, GLXDrawable draw, GLXDrawable read,
-    xGLXMakeCurrentReply * reply );
-
-/**
- * Sends a GLX protocol message to the specified display to make the context
- * and the drawables current.
- *
- * \param dpy     Display to send the message to.
- * \param opcode  Major opcode value for the display.
- * \param gc_id   Context tag for the context to be made current.
- * \param draw    Drawable ID for the "draw" drawable.
- * \param read    Drawable ID for the "read" drawable.
- * \param reply   Space to store the X-server's reply.
- *
- * \warning
- * This function assumes that \c dpy is locked with \c LockDisplay on entry.
- */
-static Bool SendMakeCurrentRequest(Display *dpy, CARD8 opcode,
-				   GLXContextID gc_id, GLXContextTag gc_tag,
-				   GLXDrawable draw, GLXDrawable read,
-				   xGLXMakeCurrentReply *reply)
-{
-    Bool ret;
-
-
-    LockDisplay(dpy);
-
-    if (draw == read) {
-	xGLXMakeCurrentReq *req;
-
-	GetReq(GLXMakeCurrent,req);
-	req->reqType = opcode;
-	req->glxCode = X_GLXMakeCurrent;
-	req->drawable = draw;
-	req->context = gc_id;
-	req->oldContextTag = gc_tag;
-    }
-    else {
-	__GLXdisplayPrivate *priv = __glXInitialize(dpy);
-
-	/* If the server can support the GLX 1.3 version, we should
-	 * perfer that.  Not only that, some servers support GLX 1.3 but
-	 * not the SGI extension.
-	 */
-
-	if ((priv->majorVersion > 1) || (priv->minorVersion >= 3)) {
-	    xGLXMakeContextCurrentReq *req;
-
-	    GetReq(GLXMakeContextCurrent,req);
-	    req->reqType = opcode;
-	    req->glxCode = X_GLXMakeContextCurrent;
-	    req->drawable = draw;
-	    req->readdrawable = read;
-	    req->context = gc_id;
-	    req->oldContextTag = gc_tag;
-	}
-	else {
-	    xGLXVendorPrivateWithReplyReq *vpreq;
-	    xGLXMakeCurrentReadSGIReq *req;
-
-	    GetReqExtra(GLXVendorPrivateWithReply,
-			sz_xGLXMakeCurrentReadSGIReq-sz_xGLXVendorPrivateWithReplyReq,vpreq);
-	    req = (xGLXMakeCurrentReadSGIReq *)vpreq;
-	    req->reqType = opcode;
-	    req->glxCode = X_GLXVendorPrivateWithReply;
-	    req->vendorCode = X_GLXvop_MakeCurrentReadSGI;
-	    req->drawable = draw;
-	    req->readable = read;
-	    req->context = gc_id;
-	    req->oldContextTag = gc_tag;
-	}
-    }
-
-    ret = _XReply(dpy, (xReply*) reply, 0, False);
-
-    UnlockDisplay(dpy);
-    SyncHandle();
-
-    return ret;
-}
-
-
-#ifdef GLX_DIRECT_RENDERING
-static __GLXDRIdrawable *
-FetchDRIDrawable(Display *dpy,
-		 GLXDrawable glxDrawable, GLXContext gc, Bool pre13)
-{
-    __GLXdisplayPrivate * const priv = __glXInitialize(dpy);
-    __GLXDRIdrawable *pdraw;
-    __GLXscreenConfigs *psc;
-    XID drawable;
-
-    if (priv == NULL)
-	return NULL;
-    
-    psc = &priv->screenConfigs[gc->screen];
-    if (psc->drawHash == NULL)
-	return NULL;
-
-    if (__glxHashLookup(psc->drawHash, glxDrawable, (void *) &pdraw) == 0)
-	return pdraw;
-
-    /* If this is glXMakeCurrent (pre GLX 1.3) we allow creating the
-     * GLX drawable on the fly.  Otherwise we pass None as the X
-     * drawable */
-    if (pre13)
-	drawable = glxDrawable;
-    else
-	drawable = None;
-
-    pdraw = psc->driScreen->createDrawable(psc, drawable,
-					   glxDrawable, gc->mode);
-    if (__glxHashInsert(psc->drawHash, glxDrawable, pdraw)) {
-	(*pdraw->destroyDrawable)(pdraw);
-	return NULL;
-    }
-
-    return pdraw;
-}
-#endif /* GLX_DIRECT_RENDERING */
-
-
-/**
- * Make a particular context current.
- * 
- * \note This is in this file so that it can access dummyContext.
- */
-static Bool MakeContextCurrent(Display *dpy, GLXDrawable draw,
-			       GLXDrawable read, GLXContext gc,
-			       Bool pre13)
-{
-    xGLXMakeCurrentReply reply;
-    const GLXContext oldGC = __glXGetCurrentContext();
-    const CARD8 opcode = __glXSetupForCommand(dpy);
-    const CARD8 oldOpcode = ((gc == oldGC) || (oldGC == &dummyContext))
-      ? opcode : __glXSetupForCommand(oldGC->currentDpy);
-    Bool bindReturnValue;
-
-
-    if (!opcode || !oldOpcode) {
-	return GL_FALSE;
-    }
-
-    /* Make sure that the new context has a nonzero ID.  In the request,
-     * a zero context ID is used only to mean that we bind to no current
-     * context.
-     */
-    if ((gc != NULL) && (gc->xid == None)) {
-	return GL_FALSE;
-    }
-
-    _glapi_check_multithread();
-
-#ifdef GLX_DIRECT_RENDERING
-    /* Bind the direct rendering context to the drawable */
-    if (gc && gc->driContext) {
-	__GLXDRIdrawable *pdraw = FetchDRIDrawable(dpy, draw, gc, pre13);
-	__GLXDRIdrawable *pread = FetchDRIDrawable(dpy, read, gc, pre13);
-
-	bindReturnValue =
-	    (gc->driContext->bindContext) (gc->driContext, pdraw, pread);
-    } else
-#endif
-    {
-	/* Send a glXMakeCurrent request to bind the new context. */
-	bindReturnValue = 
-	  SendMakeCurrentRequest(dpy, opcode, gc ? gc->xid : None,
-				 ((dpy != oldGC->currentDpy) || oldGC->isDirect)
-				 ? None : oldGC->currentContextTag,
-				 draw, read, &reply);
-    }
-
-
-    if (!bindReturnValue) {
-	return False;
-    }
-
-    if ((dpy != oldGC->currentDpy || (gc && gc->driContext)) &&
-	!oldGC->isDirect && oldGC != &dummyContext) {
-	xGLXMakeCurrentReply dummy_reply;
-
-	/* We are either switching from one dpy to another and have to
-	 * send a request to the previous dpy to unbind the previous
-	 * context, or we are switching away from a indirect context to
-	 * a direct context and have to send a request to the dpy to
-	 * unbind the previous context.
-	 */
-	(void) SendMakeCurrentRequest(oldGC->currentDpy, oldOpcode, None,
-				      oldGC->currentContextTag, None, None,
-				      & dummy_reply);
-    }
-#ifdef GLX_DIRECT_RENDERING
-    else if (oldGC->driContext) {
-	oldGC->driContext->unbindContext(oldGC->driContext);
-    }
-#endif
-
-
-    /* Update our notion of what is current */
-    __glXLock();
-    if (gc == oldGC) {
-	/* Even though the contexts are the same the drawable might have
-	 * changed.  Note that gc cannot be the dummy, and that oldGC
-	 * cannot be NULL, therefore if they are the same, gc is not
-	 * NULL and not the dummy.
-	 */
-	gc->currentDrawable = draw;
-	gc->currentReadable = read;
-    } else {
-	if (oldGC != &dummyContext) {
-	    /* Old current context is no longer current to anybody */
-	    oldGC->currentDpy = 0;
-	    oldGC->currentDrawable = None;
-	    oldGC->currentReadable = None;
-	    oldGC->currentContextTag = 0;
-
-	    if (oldGC->xid == None) {
-		/* We are switching away from a context that was
-		 * previously destroyed, so we need to free the memory
-		 * for the old handle.
-		 */
-#ifdef GLX_DIRECT_RENDERING
-		/* Destroy the old direct rendering context */
-		if (oldGC->driContext) {
-		    oldGC->driContext->destroyContext(oldGC->driContext,
-						      oldGC->psc,
-						      oldGC->createDpy);
-		    oldGC->driContext = NULL;
-		}
-#endif
-		__glXFreeContext(oldGC);
-	    }
-	}
-	if (gc) {
-	    __glXSetCurrentContext(gc);
-
-	    gc->currentDpy = dpy;
-	    gc->currentDrawable = draw;
-	    gc->currentReadable = read;
-
-            if (!gc->driContext) {
-               if (!IndirectAPI)
-                  IndirectAPI = __glXNewIndirectAPI();
-               _glapi_set_dispatch(IndirectAPI);
-
-#ifdef GLX_USE_APPLEGL
-               do {
-                   extern void XAppleDRIUseIndirectDispatch(void);
-                   XAppleDRIUseIndirectDispatch();
-               } while (0);
-#endif
-
-		__GLXattribute *state = 
-		  (__GLXattribute *)(gc->client_state_private);
-
-		gc->currentContextTag = reply.contextTag;
-		if (state->array_state == NULL) {
-		    (void) glGetString(GL_EXTENSIONS);
-		    (void) glGetString(GL_VERSION);
-		    __glXInitVertexArrayState(gc);
-		}
-	    }
-	    else {
-		gc->currentContextTag = -1;
-	    }
-	} else {
-	    __glXSetCurrentContext(&dummyContext);
-#ifdef GLX_DIRECT_RENDERING
-            _glapi_set_dispatch(NULL);  /* no-op functions */
-#endif
-	}
-    }
-    __glXUnlock();
-    return GL_TRUE;
-}
-
-
-PUBLIC Bool glXMakeCurrent(Display *dpy, GLXDrawable draw, GLXContext gc)
-{
-    return MakeContextCurrent(dpy, draw, draw, gc, True);
-}
-
-PUBLIC GLX_ALIAS(Bool, glXMakeCurrentReadSGI,
-	  (Display *dpy, GLXDrawable d, GLXDrawable r, GLXContext ctx),
-	  (dpy, d, r, ctx, False), MakeContextCurrent)
-
-PUBLIC GLX_ALIAS(Bool, glXMakeContextCurrent,
-	  (Display *dpy, GLXDrawable d, GLXDrawable r, GLXContext ctx),
-	  (dpy, d, r, ctx, False), MakeContextCurrent)
-
-
 #ifdef DEBUG
 _X_HIDDEN void __glXDumpDrawBuffer(__GLXcontext *ctx)
 {
@@ -1424,9 +931,23 @@ _X_HIDDEN void __glXDumpDrawBuffer(__GLXcontext *ctx)
 
 #ifdef  USE_SPARC_ASM
 /*
- * Used only when we are sparc, using sparc assembler.
+ * This is where our dispatch table's bounds are.
+ * And the static mesa_init is taken directly from
+ * Mesa's 'sparc.c' initializer.
  *
+ * We need something like this here, because this version
+ * of openGL/glx never initializes a Mesa context, and so
+ * the address of the dispatch table pointer never gets stuffed
+ * into the dispatch jump table otherwise.
+ *
+ * It matters only on SPARC, and only if you are using assembler
+ * code instead of C-code indirect dispatch.
+ *
+ * -- FEM, 04.xii.03
  */
+extern unsigned int _mesa_sparc_glapi_begin;
+extern unsigned int _mesa_sparc_glapi_end;
+extern void __glapi_sparc_icache_flush(unsigned int *);
 
 static void
 _glx_mesa_init_sparc_glapi_relocs(void)
diff --git a/src/glx/x11/glx_texture_compression.c b/src/glx/x11/indirect_texture_compression.c
index 5676858017..5676858017 100644
--- a/src/glx/x11/glx_texture_compression.c
+++ b/src/glx/x11/indirect_texture_compression.c
diff --git a/src/glx/x11/indirect_vertex_array.c b/src/glx/x11/indirect_vertex_array.c
index 4f8284576e..09d7244ba9 100644
--- a/src/glx/x11/indirect_vertex_array.c
+++ b/src/glx/x11/indirect_vertex_array.c
@@ -32,7 +32,7 @@
 #include <GL/glxproto.h>
 #include "glxextensions.h"
 #include "indirect_vertex_array.h"
-#include "indirect_va_private.h"
+#include "indirect_vertex_array_priv.h"
 
 #define __GLX_PAD(n) (((n)+3) & ~3)
 
diff --git a/src/glx/x11/indirect_va_private.h b/src/glx/x11/indirect_vertex_array_priv.h
index ab97dc645f..ab97dc645f 100644
--- a/src/glx/x11/indirect_va_private.h
+++ b/src/glx/x11/indirect_vertex_array_priv.h
diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index 695a416094..633bfb19a3 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -103,9 +103,11 @@ OSMESA16_OBJECTS = \
 	$(OSMESA_DRIVER_OBJECTS)
 
 
-stand-alone: depend subdirs $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME)
+stand-alone: depend subdirs libmesa.a \
+	$(TOP)/$(LIB_DIR)/$(GL_LIB_NAME)
 
-osmesa-only: depend subdirs $(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME)
+osmesa-only: depend subdirs \
+	$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME)
 
 # Make the GL library
 $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME): $(STAND_ALONE_OBJECTS)
diff --git a/src/mesa/drivers/common/sources b/src/mesa/drivers/common/sources
deleted file mode 100644
index 90e29d78d3..0000000000
--- a/src/mesa/drivers/common/sources
+++ /dev/null
@@ -1,2 +0,0 @@
-MESA_DRIVER_COMMON_SOURCES = \
-driverfuncs.c
diff --git a/src/mesa/drivers/dri/common/dri_bufmgr_fake.c b/src/mesa/drivers/dri/common/dri_bufmgr_fake.c
index 61dd49d9dd..9bf3f3437c 100644
--- a/src/mesa/drivers/dri/common/dri_bufmgr_fake.c
+++ b/src/mesa/drivers/dri/common/dri_bufmgr_fake.c
@@ -238,7 +238,7 @@ alloc_block(dri_bo *bo)
    dri_bo_fake *bo_fake = (dri_bo_fake *)bo;
    dri_bufmgr_fake *bufmgr_fake= (dri_bufmgr_fake *)bo->bufmgr;
    struct block *block = (struct block *)calloc(sizeof *block, 1);
-   unsigned int align_log2 = _mesa_ffs(bo_fake->alignment);
+   unsigned int align_log2 = _mesa_ffs(bo_fake->alignment) - 1;
    GLuint sz;
 
    if (!block)
@@ -272,16 +272,16 @@ alloc_block(dri_bo *bo)
 static void free_block(dri_bufmgr_fake *bufmgr_fake, struct block *block)
 {
    dri_bo_fake *bo_fake;
-   DBG("free block %p\n", block);
 
    if (!block)
       return;
 
+   DBG("free block %p %08x %d %d\n", block, block->mem->ofs, block->on_hardware, block->fenced); /* after the NULL check: dereferences block */
    bo_fake = (dri_bo_fake *)block->bo;
-   if (bo_fake->card_dirty == GL_TRUE) {
-      memcpy(bo_fake->backing_store, block->virtual, block->bo->size);
-      bo_fake->card_dirty = GL_FALSE;
-      bo_fake->dirty = GL_TRUE;
+   if (!(bo_fake->flags & BM_NO_BACKING_STORE) && (bo_fake->card_dirty == 1)) {
+      memcpy(bo_fake->backing_store, block->virtual, block->bo->size);
+      bo_fake->card_dirty = 0;  /* card copy has just been saved to the backing store */
+      bo_fake->dirty = 1;
    }
 
    if (block->on_hardware) {
@@ -427,6 +427,8 @@ static int clear_fenced(dri_bufmgr_fake *bufmgr_fake,
 	 /* Blocks are ordered by fence, so if one fails, all from
 	  * here will fail also:
 	  */
+	DBG("fence not passed: offset %x sz %x %d %d \n",
+	    block->mem->ofs, block->mem->size, block->fence, bufmgr_fake->last_fence);
 	 break;
       }
    }
@@ -440,8 +442,8 @@ static void fence_blocks(dri_bufmgr_fake *bufmgr_fake, unsigned fence)
    struct block *block, *tmp;
 
    foreach_s (block, tmp, &bufmgr_fake->on_hardware) {
-      DBG("Fence block %p (sz 0x%x buf %p) with fence %d\n", block,
-	  block->mem->size, block->bo, fence);
+      DBG("Fence block %p (sz 0x%x ofs %x buf %p) with fence %d\n", block,
+	  block->mem->size, block->mem->ofs, block->bo, fence);
       block->fence = fence;
 
       block->on_hardware = 0;
@@ -815,8 +817,8 @@ dri_fake_kick_all(dri_bufmgr_fake *bufmgr_fake)
       free_block(bufmgr_fake, block);
       bo_fake->block = NULL;
       bo_fake->validated = GL_FALSE;
-      bo_fake->dirty = GL_TRUE;
-      block->bo->offset = -1;
+      if (!(bo_fake->flags & BM_NO_BACKING_STORE))
+         bo_fake->dirty = 1;
    }
 }
 
@@ -875,16 +877,18 @@ dri_fake_bo_validate(dri_bo *bo, uint64_t flags)
        */
       dri_bufmgr_fake_wait_idle(bufmgr_fake);
 
-      /* we may never have mapped this BO so it might not have any backing store */
-      /* if this happens it should be rare, but 0 the card memory in any case */
+      /* We may never have mapped this BO, so it might not have any backing
+       * store.  If this happens it should be rare, but zero the card memory
+       * in any case. */
       if (bo_fake->backing_store)
-          memcpy(bo_fake->block->virtual, bo_fake->backing_store, bo->size);
+         memcpy(bo_fake->block->virtual, bo_fake->backing_store, bo->size);
       else
-	  memset(bo_fake->block->virtual, 0, bo->size);
+         memset(bo_fake->block->virtual, 0, bo->size);
 
       bo_fake->dirty = 0;
    }
 
+   bo_fake->block->fenced = 0;
    bo_fake->block->on_hardware = 1;
    move_to_tail(&bufmgr_fake->on_hardware, bo_fake->block);
 
@@ -970,16 +974,12 @@ dri_fake_emit_reloc(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
    struct fake_buffer_reloc *r;
    dri_bo_fake *reloc_fake = (dri_bo_fake *)reloc_buf;
    dri_bo_fake *target_fake = (dri_bo_fake *)target_buf;
-   int ret, i;
+   int i;
 
    assert(reloc_buf);
    assert(target_buf);
 
-   if (!target_fake->is_static && !target_fake->size_accounted) {
-       ret = dri_fake_check_aperture_space(target_buf);
-       if (ret)
-	   return ret;
-   }
+   assert(target_fake->is_static || target_fake->size_accounted);
 
    if (reloc_fake->relocs == NULL) {
       reloc_fake->relocs = malloc(sizeof(struct fake_buffer_reloc) *
@@ -1060,12 +1060,12 @@ dri_fake_reloc_and_validate_buffer(dri_bo *bo)
 
       /* Validate the target buffer if that hasn't been done. */
       if (!target_fake->validated) {
-	 ret = dri_fake_reloc_and_validate_buffer(r->target_buf);
-	 if (ret != 0) {
-	    if (bo->virtual != NULL)
-	       dri_bo_unmap(bo);
-	    return ret;
-	 }
+         ret = dri_fake_reloc_and_validate_buffer(r->target_buf);
+         if (ret != 0) {
+            if (bo->virtual != NULL)
+                dri_bo_unmap(bo);
+            return ret;
+         }
       }
 
       /* Calculate the value of the relocation entry. */
@@ -1087,9 +1087,9 @@ dri_fake_reloc_and_validate_buffer(dri_bo *bo)
    if (bo_fake->validate_flags & DRM_BO_FLAG_WRITE) {
       if (!(bo_fake->flags & (BM_NO_BACKING_STORE|BM_PINNED))) {
          if (bo_fake->backing_store == 0)
-	    alloc_backing_store(bo);
+            alloc_backing_store(bo);
 
-	 bo_fake->card_dirty = GL_TRUE;
+         bo_fake->card_dirty = 1;
       }
       bufmgr_fake->performed_rendering = GL_TRUE;
    }
@@ -1116,12 +1116,14 @@ dri_fake_process_relocs(dri_bo *batch_buf, GLuint *count_p)
    ret = dri_fake_reloc_and_validate_buffer(batch_buf);
    if (bufmgr_fake->fail == 1) {
       if (retry_count == 0) {
-	  retry_count++;
-	  dri_fake_kick_all(bufmgr_fake);
-	  bufmgr_fake->fail = 0;
-	  goto restart;
-      }
+         retry_count++;
+         dri_fake_kick_all(bufmgr_fake);
+         bufmgr_fake->fail = 0;
+         goto restart;
+      } else /* dump out the memory here */
+         mmDumpMemInfo(bufmgr_fake->heap);
    }
+
    assert(ret == 0);
 
    *count_p = 0; /* junk */
@@ -1186,13 +1188,13 @@ dri_fake_check_aperture_space(dri_bo *bo)
       return 0;
 
    if (bufmgr_fake->current_total_size + sz > bufmgr_fake->size) {
-      DBG("check_space: bo %d %d overflowed bufmgr\n", bo_fake->id, sz);
+     DBG("check_space: %s bo %d %d overflowed bufmgr size %d\n", bo_fake->name, bo_fake->id, sz, bufmgr_fake->size);
       return -1;
    }
 
    bufmgr_fake->current_total_size += sz;
    bo_fake->size_accounted = 1;
-   DBG("check_space: bo %d %d %d\n", bo_fake->id, bo->size, bufmgr_fake->current_total_size);
+   DBG("drm_check_space: buf %d, %s %d %d\n", bo_fake->id, bo_fake->name, bo->size, bufmgr_fake->current_total_size);
    return 0;
 }
 
diff --git a/src/mesa/drivers/dri/common/dri_util.h b/src/mesa/drivers/dri/common/dri_util.h
index d4401b407e..06e1d20a3c 100644
--- a/src/mesa/drivers/dri/common/dri_util.h
+++ b/src/mesa/drivers/dri/common/dri_util.h
@@ -50,9 +50,10 @@
 #define _DRI_UTIL_H_
 
 #include <GL/gl.h>
-#include "drm.h"
-#include "drm_sarea.h"
-#include "xf86drm.h"
+#include <drm.h>
+#include <drm_sarea.h>
+#include <xf86drm.h>
+#include <xf86mm.h>
 #include "GL/internal/glcore.h"
 #include "GL/internal/dri_interface.h"
 #include "GL/internal/dri_sarea.h"
diff --git a/src/mesa/drivers/dri/common/xmlconfig.c b/src/mesa/drivers/dri/common/xmlconfig.c
index c313d71e80..8602d47cf9 100644
--- a/src/mesa/drivers/dri/common/xmlconfig.c
+++ b/src/mesa/drivers/dri/common/xmlconfig.c
@@ -279,7 +279,7 @@ static GLfloat strToF (const XML_Char *string, const XML_Char **tail) {
 /** \brief Parse a value of a given type. */
 static GLboolean parseValue (driOptionValue *v, driOptionType type,
 			     const XML_Char *string) {
-    const XML_Char *tail;
+    const XML_Char *tail = NULL;
   /* skip leading white-space */
     string += strspn (string, " \f\n\r\t\v");
     switch (type) {
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index 5cf74d5906..135bfaa265 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -322,7 +322,8 @@ i915_emit_state(struct intel_context *intel)
    ret = 0;
    if (dirty & I915_UPLOAD_BUFFERS) {
      ret |= dri_bufmgr_check_aperture_space(state->draw_region->buffer);
-     ret |= dri_bufmgr_check_aperture_space(state->depth_region->buffer);
+     if (state->depth_region)
+        ret |= dri_bufmgr_check_aperture_space(state->depth_region->buffer);
    }
 
    if (dirty & I915_UPLOAD_TEX_ALL) {
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index 9def04d248..9d8984f05c 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -37,7 +37,7 @@
 #include "macros.h"
 #include "enums.h"
 
-static void upload_cc_vp( struct brw_context *brw )
+static int upload_cc_vp( struct brw_context *brw )
 {
    struct brw_cc_viewport ccv;
 
@@ -48,6 +48,7 @@ static void upload_cc_vp( struct brw_context *brw )
 
    dri_bo_unreference(brw->cc.vp_bo);
    brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 );
+   return dri_bufmgr_check_aperture_space(brw->cc.vp_bo);
 }
 
 const struct brw_tracked_state brw_cc_vp = {
@@ -56,7 +57,7 @@ const struct brw_tracked_state brw_cc_vp = {
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
-   .update = upload_cc_vp
+   .prepare = upload_cc_vp
 };
 
 struct brw_cc_unit_key {
@@ -264,7 +265,7 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
    return bo;
 }
 
-static void upload_cc_unit( struct brw_context *brw )
+static int prepare_cc_unit( struct brw_context *brw )
 {
    struct brw_cc_unit_key key;
 
@@ -278,6 +279,7 @@ static void upload_cc_unit( struct brw_context *brw )
 
    if (brw->cc.state_bo == NULL)
       brw->cc.state_bo = cc_unit_create_from_key(brw, &key);
+   return dri_bufmgr_check_aperture_space(brw->cc.state_bo);
 }
 
 const struct brw_tracked_state brw_cc_unit = {
@@ -286,7 +288,7 @@ const struct brw_tracked_state brw_cc_unit = {
       .brw = 0,
       .cache = CACHE_NEW_CC_VP
    },
-   .update = upload_cc_unit,
+   .prepare = prepare_cc_unit,
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index ce34da165c..540108e5f4 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -131,7 +131,7 @@ static void compile_clip_prog( struct brw_context *brw,
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static void upload_clip_prog( struct brw_context *brw )
+static int upload_clip_prog( struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct brw_clip_prog_key key;
@@ -242,6 +242,8 @@ static void upload_clip_prog( struct brw_context *brw )
 					&brw->clip.prog_data);
    if (brw->clip.prog_bo == NULL)
       compile_clip_prog( brw, &key );
+
+   return dri_bufmgr_check_aperture_space(brw->clip.prog_bo);
 }
 
 
@@ -254,5 +256,5 @@ const struct brw_tracked_state brw_clip_prog = {
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
-   .update = upload_clip_prog
+   .prepare = upload_clip_prog
 };
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index ab962562e5..7d51cddfc3 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -148,10 +148,12 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    brw_clip_init_clipmask(c);
 
    /* -ve rhw workaround */
-   brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-   brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
-		   brw_imm_ud(1<<20));
-   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
+   if (!BRW_IS_IGD(p->brw)) {
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
+              brw_imm_ud(1<<20));
+      brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
+   }
 
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index cbf9cdcfce..7cb21f894e 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -128,9 +128,10 @@ clip_unit_create_from_key(struct brw_context *brw,
    return bo;
 }
 
-static void upload_clip_unit( struct brw_context *brw )
+static int upload_clip_unit( struct brw_context *brw )
 {
    struct brw_clip_unit_key key;
+   int ret = 0;
 
    clip_unit_populate_key(brw, &key);
 
@@ -142,6 +143,9 @@ static void upload_clip_unit( struct brw_context *brw )
    if (brw->clip.state_bo == NULL) {
       brw->clip.state_bo = clip_unit_create_from_key(brw, &key);
    }
+
+   ret = dri_bufmgr_check_aperture_space(brw->clip.state_bo);
+   return ret;
 }
 
 const struct brw_tracked_state brw_clip_unit = {
@@ -151,5 +155,5 @@ const struct brw_tracked_state brw_clip_unit = {
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_CLIP_PROG
    },
-   .update = upload_clip_unit,
+   .prepare = upload_clip_unit,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index 9d7b3def66..f1fc6e1e9d 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -536,14 +536,16 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
 
    /* if -ve rhw workaround bit is set, 
       do cliptest */
-   brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-   brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
-		   brw_imm_ud(1<<20));
-   neg_rhw = brw_IF(p, BRW_EXECUTE_1); 
-   {
-	   brw_clip_test(c);
+   if (!BRW_IS_IGD(p->brw)) {
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
+              brw_imm_ud(1<<20));
+      neg_rhw = brw_IF(p, BRW_EXECUTE_1); 
+      {
+         brw_clip_test(c);
+      }
+      brw_ENDIF(p, neg_rhw);
    }
-   brw_ENDIF(p, neg_rhw);
    /* Can't push into do_clip_tri because with polygon (or quad)
     * flatshading, need to apply the flatshade here because we don't
     * respect the PV when converting to trifan for emit:
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 56021fa209..bef425f2da 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -332,7 +332,8 @@ struct brw_state_pointers {
  */
 struct brw_tracked_state {
    struct brw_state_flags dirty;
-   void (*update)( struct brw_context *brw );
+   int (*prepare)( struct brw_context *brw );
+   void (*emit)( struct brw_context *brw );
 };
 
 /* Flags for brw->state.cache.
@@ -640,7 +641,7 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 /*======================================================================
  * brw_state.c
  */
-void brw_validate_state( struct brw_context *brw );
+int brw_validate_state( struct brw_context *brw );
 void brw_init_state( struct brw_context *brw );
 void brw_destroy_state( struct brw_context *brw );
 
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index f41f659b33..5ff4e2964e 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -46,7 +46,7 @@
 
 /* Partition the CURBE between the various users of constant values:
  */
-static void calculate_curbe_offsets( struct brw_context *brw )
+static int calculate_curbe_offsets( struct brw_context *brw )
 {
    /* CACHE_NEW_WM_PROG */
    GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
@@ -117,6 +117,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 
       brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS;
    }
+   return 0;
 }
 
 
@@ -126,7 +127,7 @@ const struct brw_tracked_state brw_curbe_offsets = {
       .brw  = BRW_NEW_VERTEX_PROGRAM,
       .cache = CACHE_NEW_WM_PROG
    },
-   .update = calculate_curbe_offsets
+   .prepare = calculate_curbe_offsets
 };
 
 
@@ -182,9 +183,8 @@ static GLfloat fixed_plane[6][4] = {
  * cache mechanism, but maybe would benefit from a comparison against
  * the current uploaded set of constants.
  */
-static void upload_constant_buffer(struct brw_context *brw)
+static int prepare_constant_buffer(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
    GLcontext *ctx = &brw->intel.ctx;
    struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
    struct brw_fragment_program *fp = (struct brw_fragment_program *)brw->fragment_program;
@@ -201,10 +201,6 @@ static void upload_constant_buffer(struct brw_context *brw)
    brw->curbe.tracked_state.dirty.mesa |= fp->param_state;
 
    if (sz == 0) {
-      BEGIN_BATCH(2, IGNORE_CLIPRECTS);
-      OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
 
       if (brw->curbe.last_buf) {
 	 free(brw->curbe.last_buf);
@@ -212,7 +208,7 @@ static void upload_constant_buffer(struct brw_context *brw)
 	 brw->curbe.last_bufsz  = 0;
       }
        
-      return;
+      return 0;
    }
 
    buf = (GLfloat *)malloc(bufsz);
@@ -326,6 +322,7 @@ static void upload_constant_buffer(struct brw_context *brw)
       dri_bo_subdata(brw->curbe.curbe_bo, brw->curbe.curbe_offset, bufsz, buf);
    }
 
+
    /* Because this provokes an action (ie copy the constants into the
     * URB), it shouldn't be shortcircuited if identical to the
     * previous time - because eg. the urb destination may have
@@ -339,10 +336,26 @@ static void upload_constant_buffer(struct brw_context *brw)
     * flushes as necessary when doublebuffering of CURBEs isn't
     * possible.
     */
+
+   /* check aperture space for this bo */
+   return dri_bufmgr_check_aperture_space(brw->curbe.curbe_bo);
+}
+
+
+static void emit_constant_buffer(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   GLuint sz = brw->curbe.total_size;
+
    BEGIN_BATCH(2, IGNORE_CLIPRECTS);
-   OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2));
-   OUT_RELOC(brw->curbe.curbe_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-	     (sz - 1) + brw->curbe.curbe_offset);
+   if (sz == 0) {
+      OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
+      OUT_BATCH(0);
+   } else {
+      OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2));
+      OUT_RELOC(brw->curbe.curbe_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
+		(sz - 1) + brw->curbe.curbe_offset);
+   }
    ADVANCE_BATCH();
 }
 
@@ -363,6 +376,7 @@ const struct brw_tracked_state brw_constant_buffer = {
 	       BRW_NEW_BATCH),
       .cache = (CACHE_NEW_WM_PROG) 
    },
-   .update = upload_constant_buffer,
+   .prepare = prepare_constant_buffer,
+   .emit = emit_constant_buffer,
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 0990dcfac4..6124ab6b0f 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -83,8 +83,9 @@ static const GLenum reduced_prim[GL_POLYGON+1] = {
  * programs be immune to the active primitive (ie. cope with all
  * possibilities).  That may not be realistic however.
  */
-static GLuint brw_set_prim(struct brw_context *brw, GLenum prim)
+static GLuint brw_set_prim(struct brw_context *brw, GLenum prim, GLboolean *need_flush)
 {
+   int ret;
    if (INTEL_DEBUG & DEBUG_PRIMS)
       _mesa_printf("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim));
    
@@ -105,7 +106,9 @@ static GLuint brw_set_prim(struct brw_context *brw, GLenum prim)
 	 brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE;
       }
 
-      brw_validate_state(brw);
+      ret = brw_validate_state(brw);
+      if (ret)
+         *need_flush = GL_TRUE;
    }
 
    return hw_prim[prim];
@@ -128,6 +131,7 @@ static void brw_emit_prim( struct brw_context *brw,
 
 {
    struct brw_3d_primitive prim_packet;
+   GLboolean need_flush = GL_FALSE;
 
    if (INTEL_DEBUG & DEBUG_PRIMS)
       _mesa_printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), 
@@ -136,7 +140,7 @@ static void brw_emit_prim( struct brw_context *brw,
    prim_packet.header.opcode = CMD_3D_PRIM;
    prim_packet.header.length = sizeof(prim_packet)/4 - 2;
    prim_packet.header.pad = 0;
-   prim_packet.header.topology = brw_set_prim(brw, prim->mode);
+   prim_packet.header.topology = brw_set_prim(brw, prim->mode, &need_flush);
    prim_packet.header.indexed = prim->indexed;
 
    prim_packet.verts_per_instance = trim(prim->mode, prim->count);
@@ -149,6 +153,8 @@ static void brw_emit_prim( struct brw_context *brw,
       intel_batchbuffer_data( brw->intel.batch, &prim_packet,
 			      sizeof(prim_packet), LOOP_CLIPRECTS);
    }
+
+   assert(need_flush == GL_FALSE);
 }
 
 static void brw_merge_inputs( struct brw_context *brw,
@@ -251,8 +257,10 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    struct intel_context *intel = intel_context(ctx);
    struct brw_context *brw = brw_context(ctx);
    GLboolean retval = GL_FALSE;
-   GLuint i;
-
+   GLuint i, ib_offset;
+   int ret;  /* signed: brw_prepare_vertices() can return -1, tested below with ret < 0 */
+   dri_bo *ib_bo;
+   GLboolean force_flush = GL_FALSE;
    if (ctx->NewState)
       _mesa_update_state( ctx );
 
@@ -284,20 +292,49 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
        * an upper bound of how much we might emit in a single
        * brw_try_draw_prims().
        */
+   flush:
+      if (force_flush)
+         brw->no_batch_wrap = GL_FALSE;
+
       if (intel->batch->ptr - intel->batch->map > intel->batch->size * 3 / 4
 	/* brw_emit_prim may change the cliprect_mode to LOOP_CLIPRECTS */
-	|| intel->batch->cliprect_mode != LOOP_CLIPRECTS) 
+	  || intel->batch->cliprect_mode != LOOP_CLIPRECTS || (force_flush == GL_TRUE))
 	      intel_batchbuffer_flush(intel->batch);
 
+      force_flush = GL_FALSE;
       brw->no_batch_wrap = GL_TRUE;
 
       /* Set the first primitive early, ahead of validate_state:
        */
-      brw_set_prim(brw, prim[0].mode);
+      brw_set_prim(brw, prim[0].mode, &force_flush);
 
       /* XXX:  Need to separate validate and upload of state.  
        */
-      brw_validate_state( brw );
+      ret = brw_validate_state( brw );
+      if (ret) {
+         force_flush = GL_TRUE;
+         goto flush;
+      }
+
+      /* need to account for index buffer and vertex buffer */
+      if (ib) {
+         ret = brw_prepare_indices( brw, ib , &ib_bo, &ib_offset);
+         if (ret) {
+            force_flush = GL_TRUE;
+            goto flush;
+         }
+      }
+
+      ret = brw_prepare_vertices( brw, min_index, max_index);
+      if (ret < 0)
+         goto out;
+
+      if (ret > 0) {
+         force_flush = GL_TRUE;
+         goto flush;
+      }
+
+
 
       /* Various fallback checks:
        */
@@ -310,11 +347,9 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
       /* Upload index, vertex data: 
        */
       if (ib)
-	 brw_upload_indices( brw, ib );
+	brw_emit_indices( brw, ib, ib_bo, ib_offset);
 
-      if (!brw_upload_vertices( brw, min_index, max_index)) {
-	 goto out;
-      }
+      brw_emit_vertices( brw, min_index, max_index);
 
       for (i = 0; i < nr_prims; i++) {
 	 brw_emit_prim(brw, &prim[i]);
diff --git a/src/mesa/drivers/dri/i965/brw_draw.h b/src/mesa/drivers/dri/i965/brw_draw.h
index 0f7b738310..b3547400d4 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.h
+++ b/src/mesa/drivers/dri/i965/brw_draw.h
@@ -31,6 +31,7 @@
 #include "mtypes.h"		/* for GLcontext... */
 #include "vbo/vbo.h"
 
+#include "dri_bufmgr.h"
 struct brw_context;
 
 
@@ -53,10 +54,21 @@ void brw_init_current_values(GLcontext *ctx,
 
 /* brw_draw_upload.c
  */
-void brw_upload_indices( struct brw_context *brw,
-			 const struct _mesa_index_buffer *index_buffer);
+int brw_prepare_indices( struct brw_context *brw,
+			 const struct _mesa_index_buffer *index_buffer,
+			 dri_bo **bo_return,
+			 GLuint *offset_return);
 
-GLboolean brw_upload_vertices( struct brw_context *brw,
+void brw_emit_indices( struct brw_context *brw,
+		       const struct _mesa_index_buffer *index_buffer,
+		       dri_bo *bo,
+		       GLuint offset);
+
+int brw_prepare_vertices( struct brw_context *brw,
+			       GLuint min_index,
+			       GLuint max_index );
+
+void brw_emit_vertices( struct brw_context *brw,
 			       GLuint min_index,
 			       GLuint max_index );
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 839735daec..aa985d68b6 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -255,8 +255,10 @@ static void wrap_buffers( struct brw_context *brw,
    /* Set the internal VBO\ to no-backing-store.  We only use them as a
     * temporary within a brw_try_draw_prims while the lock is held.
     */
-   if (!brw->intel.ttm)
-      dri_bo_fake_disable_backing_store(brw->vb.upload.bo, NULL, NULL);
+   /* Do not call dri_bo_fake_disable_backing_store() on
+    * brw->vb.upload.bo for the non-TTM (fake) case: if the fake bufmgr
+    * has to reorganize memory, it needs somewhere to push this
+    * buffer's contents. */
 }
 
 static void get_space( struct brw_context *brw,
@@ -303,7 +305,7 @@ copy_array_to_vbo_array( struct brw_context *brw,
    dri_bo_unmap(element->bo);
 }
 
-GLboolean brw_upload_vertices( struct brw_context *brw,
+int brw_prepare_vertices( struct brw_context *brw,
 			       GLuint min_index,
 			       GLuint max_index )
 {
@@ -313,6 +315,7 @@ GLboolean brw_upload_vertices( struct brw_context *brw,
    GLuint i;
    const unsigned char *ptr = NULL;
    GLuint interleave = 0;
+   int ret = 0;
 
    struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
    GLuint nr_enabled = 0;
@@ -341,7 +344,7 @@ GLboolean brw_upload_vertices( struct brw_context *brw,
     * isn't an issue at this point.
     */
    if (nr_enabled >= BRW_VEP_MAX)
-	 return GL_FALSE;
+       return -1;
 
    for (i = 0; i < nr_enabled; i++) {
       struct brw_vertex_element *input = enabled[i];
@@ -359,6 +362,8 @@ GLboolean brw_upload_vertices( struct brw_context *brw,
 	 dri_bo_reference(input->bo);
 	 input->offset = (unsigned long)input->glarray->Ptr;
 	 input->stride = input->glarray->StrideB;
+
+	 ret |= dri_bufmgr_check_aperture_space(input->bo);
       } else {
 	 /* Queue the buffer object up to be uploaded in the next pass,
 	  * when we've decided if we're doing interleaved or not.
@@ -367,7 +372,7 @@ GLboolean brw_upload_vertices( struct brw_context *brw,
 	    /* Position array not properly enabled:
 	     */
 	    if (input->glarray->StrideB == 0)
-	       return GL_FALSE;
+	      return -1;
 
 	    interleave = input->glarray->StrideB;
 	    ptr = input->glarray->Ptr;
@@ -415,6 +420,38 @@ GLboolean brw_upload_vertices( struct brw_context *brw,
       }
    }
 
+   if (brw->vb.upload.bo) {
+     ret |= dri_bufmgr_check_aperture_space(brw->vb.upload.bo);
+   }
+
+   if (ret)
+     return 1;
+
+
+   return 0;
+}
+
+void brw_emit_vertices( struct brw_context *brw,
+                        GLuint min_index,
+                        GLuint max_index )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   struct intel_context *intel = intel_context(ctx);
+   GLuint tmp = brw->vs.prog_data->inputs_read;
+   struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
+   GLuint i;
+   GLuint nr_enabled = 0;
+
+  /* Accumulate the list of enabled arrays. */
+   while (tmp) {
+      i = _mesa_ffsll(tmp)-1;
+      struct brw_vertex_element *input = &brw->vb.inputs[i];
+
+      tmp &= ~(1<<i);
+      enabled[nr_enabled++] = input;
+   }
+
+
    /* Now emit VB and VEP state packets.
     *
     * This still defines a hardware VB for each input, even if they
@@ -476,12 +513,12 @@ GLboolean brw_upload_vertices( struct brw_context *brw,
 		((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
    }
    ADVANCE_BATCH();
-
-   return GL_TRUE;
 }
 
-void brw_upload_indices( struct brw_context *brw,
-			 const struct _mesa_index_buffer *index_buffer )
+int brw_prepare_indices( struct brw_context *brw,
+			 const struct _mesa_index_buffer *index_buffer,
+			 dri_bo **bo_return,
+			 GLuint *offset_return)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
@@ -489,6 +526,7 @@ void brw_upload_indices( struct brw_context *brw,
    dri_bo *bo;
    struct gl_buffer_object *bufferobj = index_buffer->obj;
    GLuint offset = (GLuint)index_buffer->ptr;
+   int ret;
 
    /* Turn into a proper VBO:
     */
@@ -524,6 +562,19 @@ void brw_upload_indices( struct brw_context *brw,
        }
    }
 
+   *bo_return = bo;
+   *offset_return = offset;
+   ret = dri_bufmgr_check_aperture_space(bo);
+   return ret;
+}
+
+void brw_emit_indices(struct brw_context *brw,
+                      const struct _mesa_index_buffer *index_buffer,
+                      dri_bo *bo,
+                      GLuint offset)
+{
+   struct intel_context *intel = &brw->intel;
+   GLuint ib_size = get_size(index_buffer->type) * index_buffer->count;
    /* Emit the indexbuffer packet:
     */
    {
@@ -548,3 +599,4 @@ void brw_upload_indices( struct brw_context *brw,
       dri_bo_unreference(bo);
    }
 }
+
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 25f1f896f7..c138d15fe8 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -335,14 +335,14 @@ static __inline struct brw_reg brw_imm_ud( GLuint ud )
 static __inline struct brw_reg brw_imm_uw( GLushort uw )
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW);
-   imm.dw1.ud = uw;
+   imm.dw1.ud = uw | ((GLuint)uw << 16); /* copy uw into both 16-bit halves; shift as unsigned to avoid signed overflow */
    return imm;
 }
 
 static __inline struct brw_reg brw_imm_w( GLshort w )
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W);
-   imm.dw1.d = w;
+   imm.dw1.d = (GLushort)w | ((GLuint)(GLushort)w << 16); /* both halves = w's 16 bits; no sign-extension of negative w */
    return imm;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c
index ce0df0357b..2cf29cc341 100644
--- a/src/mesa/drivers/dri/i965/brw_fallback.c
+++ b/src/mesa/drivers/dri/i965/brw_fallback.c
@@ -93,9 +93,10 @@ static GLboolean do_check_fallback(struct brw_context *brw)
    return GL_FALSE;
 }
 
-static void check_fallback(struct brw_context *brw)
+static int check_fallback(struct brw_context *brw)
 {
    brw->intel.Fallback = do_check_fallback(brw);
+   return 0;
 }
 
 const struct brw_tracked_state brw_check_fallback = {
@@ -104,7 +105,7 @@ const struct brw_tracked_state brw_check_fallback = {
       .brw  = BRW_NEW_METAOPS,
       .cache = 0
    },
-   .update = check_fallback
+   .prepare = check_fallback
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 922a3ba3a5..9419315c7a 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -162,10 +162,10 @@ static void populate_key( struct brw_context *brw,
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static void upload_gs_prog( struct brw_context *brw )
+static int prepare_gs_prog( struct brw_context *brw )
 {
    struct brw_gs_prog_key key;
-
+   int ret = 0;
    /* Populate the key:
     */
    populate_key(brw, &key);
@@ -183,7 +183,11 @@ static void upload_gs_prog( struct brw_context *brw )
 					 &brw->gs.prog_data);
       if (brw->gs.prog_bo == NULL)
 	 compile_gs_prog( brw, &key );
+
+      ret |= dri_bufmgr_check_aperture_space(brw->gs.prog_bo);
    }
+
+   return ret;
 }
 
 
@@ -193,5 +197,5 @@ const struct brw_tracked_state brw_gs_prog = {
       .brw   = BRW_NEW_PRIMITIVE,
       .cache = CACHE_NEW_VS_PROG
    },
-   .update = upload_gs_prog
+   .prepare = prepare_gs_prog
 };
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index bf38fd7385..f1f9e018f1 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -116,7 +116,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    return bo;
 }
 
-static void upload_gs_unit( struct brw_context *brw )
+static int prepare_gs_unit( struct brw_context *brw )
 {
    struct brw_gs_unit_key key;
 
@@ -130,6 +130,7 @@ static void upload_gs_unit( struct brw_context *brw )
    if (brw->gs.state_bo == NULL) {
       brw->gs.state_bo = gs_unit_create_from_key(brw, &key);
    }
+   return dri_bufmgr_check_aperture_space(brw->gs.state_bo);
 }
 
 const struct brw_tracked_state brw_gs_unit = {
@@ -139,5 +140,5 @@ const struct brw_tracked_state brw_gs_unit = {
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_GS_PROG
    },
-   .update = upload_gs_unit,
+   .prepare = prepare_gs_unit,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index ba90496d19..26ec797b5f 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -68,7 +68,7 @@ const struct brw_tracked_state brw_blend_constant_color = {
       .brw = 0,
       .cache = 0
    },
-   .update = upload_blend_constant_color
+   .emit = upload_blend_constant_color
 };
 
 /**
@@ -98,7 +98,7 @@ const struct brw_tracked_state brw_binding_table_pointers = {
       .brw = BRW_NEW_BATCH,
       .cache = CACHE_NEW_SURF_BIND,
    },
-   .update = upload_binding_table_pointers,
+   .emit = upload_binding_table_pointers,
 };
 
 
@@ -145,7 +145,7 @@ const struct brw_tracked_state brw_pipelined_state_pointers = {
 		CACHE_NEW_WM_UNIT | 
 		CACHE_NEW_CC_UNIT)
    },
-   .update = upload_pipelined_state_pointers
+   .emit = upload_pipelined_state_pointers
 };
 #endif
 
@@ -169,7 +169,7 @@ const struct brw_tracked_state brw_psp_urb_cbs = {
 		CACHE_NEW_WM_UNIT | 
 		CACHE_NEW_CC_UNIT)
    },
-   .update = upload_psp_urb_cbs,
+   .emit = upload_psp_urb_cbs,
 };
 
 /**
@@ -178,7 +178,17 @@ const struct brw_tracked_state brw_psp_urb_cbs = {
  * We have to do this per state validation as we need to emit the relocation
  * in the batch buffer.
  */
-static void upload_depthbuffer(struct brw_context *brw)
+
+static int prepare_depthbuffer(struct brw_context *brw)
+{
+   struct intel_region *region = brw->state.depth_region;
+
+   if (!region || !region->buffer)
+      return 0;
+   return dri_bufmgr_check_aperture_space(region->buffer);
+}
+
+static void emit_depthbuffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct intel_region *region = brw->state.depth_region;
@@ -242,7 +252,8 @@ const struct brw_tracked_state brw_depthbuffer = {
       .brw = BRW_NEW_DEPTH_BUFFER | BRW_NEW_BATCH,
       .cache = 0,
    },
-   .update = upload_depthbuffer,
+   .prepare = prepare_depthbuffer,
+   .emit = emit_depthbuffer,
 };
 
 
@@ -272,7 +283,7 @@ const struct brw_tracked_state brw_polygon_stipple = {
       .brw = 0,
       .cache = 0
    },
-   .update = upload_polygon_stipple
+   .emit = upload_polygon_stipple
 };
 
 
@@ -303,7 +314,7 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
       .brw = 0,
       .cache = 0
    },
-   .update = upload_polygon_stipple_offset
+   .emit = upload_polygon_stipple_offset
 };
 
 /**********************************************************************
@@ -330,7 +341,7 @@ const struct brw_tracked_state brw_aa_line_parameters = {
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
-   .update = upload_aa_line_parameters
+   .emit = upload_aa_line_parameters
 };
 
 /***********************************************************************
@@ -365,7 +376,7 @@ const struct brw_tracked_state brw_line_stipple = {
       .brw = 0,
       .cache = 0
    },
-   .update = upload_line_stipple
+   .emit = upload_line_stipple
 };
 
 
@@ -399,7 +410,7 @@ const struct brw_tracked_state brw_pipe_control = {
       .brw = BRW_NEW_BATCH,
       .cache = 0
    },
-   .update = upload_pipe_control
+   .emit = upload_pipe_control
 };
 
 
@@ -465,7 +476,7 @@ const struct brw_tracked_state brw_invarient_state = {
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
-   .update = upload_invarient_state
+   .emit = upload_invarient_state
 };
 
 /**
@@ -499,5 +510,5 @@ const struct brw_tracked_state brw_state_base_address = {
       .brw = BRW_NEW_CONTEXT,
       .cache = 0,
    },
-   .update = upload_state_base_address
+   .emit = upload_state_base_address
 };
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 18285bef66..0b61748321 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -125,7 +125,7 @@ static void compile_sf_prog( struct brw_context *brw,
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static void upload_sf_prog( struct brw_context *brw )
+static int upload_sf_prog( struct brw_context *brw )
 {
    struct brw_sf_prog_key key;
 
@@ -174,6 +174,7 @@ static void upload_sf_prog( struct brw_context *brw )
 				      &brw->sf.prog_data);
    if (brw->sf.prog_bo == NULL)
       compile_sf_prog( brw, &key );
+   return dri_bufmgr_check_aperture_space(brw->sf.prog_bo);
 }
 
 
@@ -183,6 +184,6 @@ const struct brw_tracked_state brw_sf_prog = {
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
-   .update = upload_sf_prog
+   .prepare = upload_sf_prog
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 398048429b..24388b79a5 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -37,7 +37,7 @@
 #include "macros.h"
 #include "intel_fbo.h"
 
-static void upload_sf_vp(struct brw_context *brw)
+static int upload_sf_vp(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
@@ -98,6 +98,8 @@ static void upload_sf_vp(struct brw_context *brw)
 
    dri_bo_unreference(brw->sf.vp_bo);
    brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
+
+   return dri_bufmgr_check_aperture_space(brw->sf.vp_bo);
 }
 
 const struct brw_tracked_state brw_sf_vp = {
@@ -107,7 +109,7 @@ const struct brw_tracked_state brw_sf_vp = {
       .brw   = BRW_NEW_METAOPS,
       .cache = 0
    },
-   .update = upload_sf_vp
+   .prepare = upload_sf_vp
 };
 
 struct brw_sf_unit_key {
@@ -267,10 +269,11 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    return bo;
 }
 
-static void upload_sf_unit( struct brw_context *brw )
+static int upload_sf_unit( struct brw_context *brw )
 {
    struct brw_sf_unit_key key;
    dri_bo *reloc_bufs[2];
+   int ret = 0;
 
    sf_unit_populate_key(brw, &key);
 
@@ -285,6 +288,15 @@ static void upload_sf_unit( struct brw_context *brw )
    if (brw->sf.state_bo == NULL) {
       brw->sf.state_bo = sf_unit_create_from_key(brw, &key, reloc_bufs);
    }
+
+   if (reloc_bufs[0])
+     ret |= dri_bufmgr_check_aperture_space(reloc_bufs[0]);
+
+   if (reloc_bufs[1])
+     ret |= dri_bufmgr_check_aperture_space(reloc_bufs[1]);
+
+   ret |= dri_bufmgr_check_aperture_space(brw->sf.state_bo);
+   return ret;
 }
 
 const struct brw_tracked_state brw_sf_unit = {
@@ -298,5 +310,5 @@ const struct brw_tracked_state brw_sf_unit = {
       .cache = (CACHE_NEW_SF_VP |
 		CACHE_NEW_SF_PROG)
    },
-   .update = upload_sf_unit,
+   .prepare = upload_sf_unit,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 106a54a100..3b2ccd48c3 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -173,10 +173,10 @@ static void xor_states( struct brw_state_flags *result,
 /***********************************************************************
  * Emit all state:
  */
-void brw_validate_state( struct brw_context *brw )
+int brw_validate_state( struct brw_context *brw )
 {
    struct brw_state_flags *state = &brw->state.dirty;
-   GLuint i;
+   GLuint i, ret, count;
 
    state->mesa |= brw->intel.NewGLState;
    brw->intel.NewGLState = 0;
@@ -202,13 +202,34 @@ void brw_validate_state( struct brw_context *brw )
    if (state->mesa == 0 &&
        state->cache == 0 &&
        state->brw == 0)
-      return;
+      return 0;
 
    if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
       brw_clear_batch_cache_flush(brw);
 
    brw->intel.Fallback = 0;
 
+   count = 0;
+
+   /* do prepare stage for all atoms */
+   for (i = 0; i < Elements(atoms); i++) {
+      const struct brw_tracked_state *atom = brw->state.atoms[i];
+
+      if (brw->intel.Fallback)
+         break;
+
+      if (check_state(state, &atom->dirty)) {
+         if (atom->prepare) {
+            ret = atom->prepare(brw);
+            if (ret)
+               return ret;
+        }
+      }
+   }
+
+   if (brw->intel.Fallback)
+      return 0;
+
    if (INTEL_DEBUG) {
       /* Debug version which enforces various sanity checks on the
        * state flags which are generated and checked to help ensure
@@ -225,15 +246,13 @@ void brw_validate_state( struct brw_context *brw )
 	 assert(atom->dirty.mesa ||
 		atom->dirty.brw ||
 		atom->dirty.cache);
-	 assert(atom->update);
 
 	 if (brw->intel.Fallback)
 	    break;
 
 	 if (check_state(state, &atom->dirty)) {
-	    atom->update( brw );
-	    
-/* 	    emit_foo(brw); */
+	    if (atom->emit)
+	       atom->emit( brw );
 	 }
 
 	 accumulate_state(&examined, &atom->dirty);
@@ -254,11 +273,14 @@ void brw_validate_state( struct brw_context *brw )
 	 if (brw->intel.Fallback)
 	    break;
 
-	 if (check_state(state, &atom->dirty))
-	    atom->update( brw );
+	 if (check_state(state, &atom->dirty)) {
+	    if (atom->emit)
+	       atom->emit( brw );
+	 }
       }
    }
 
    if (!brw->intel.Fallback)
       memset(state, 0, sizeof(*state));
+   return 0;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index cf805cef28..c423dbe7d7 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -52,7 +52,7 @@ static const struct {
    GLuint min_entry_size;
    GLuint max_entry_size;
 } limits[CS+1] = {
-   { 8, 32, 1, 5 },			/* vs */
+   { 16, 32, 1, 5 },			/* vs */
    { 4, 8,  1, 5 },			/* gs */
    { 6, 8,  1, 5 },			/* clp */
    { 1, 8,  1, 12 },		        /* sf */
@@ -74,7 +74,7 @@ static GLboolean check_urb_layout( struct brw_context *brw )
 /* Most minimal update, forces re-emit of URB fence packet after GS
  * unit turned on/off.
  */
-static void recalculate_urb_fence( struct brw_context *brw )
+static int recalculate_urb_fence( struct brw_context *brw )
 {
    GLuint csize = brw->curbe.total_size;
    GLuint vsize = brw->vs.prog_data->urb_entry_size;
@@ -142,6 +142,7 @@ static void recalculate_urb_fence( struct brw_context *brw )
       
       brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
    }
+   return 0;
 }
 
 
@@ -152,7 +153,7 @@ const struct brw_tracked_state brw_recalculate_urb_fence = {
       .cache = (CACHE_NEW_VS_PROG |
 		CACHE_NEW_SF_PROG)
    },
-   .update = recalculate_urb_fence
+   .prepare = recalculate_urb_fence
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 656fa2e783..f89b0e14a1 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -83,7 +83,7 @@ static void do_vs_prog( struct brw_context *brw,
 }
 
 
-static void brw_upload_vs_prog( struct brw_context *brw )
+static int brw_upload_vs_prog( struct brw_context *brw )
 {
    struct brw_vs_prog_key key;
    struct brw_vertex_program *vp = 
@@ -115,6 +115,7 @@ static void brw_upload_vs_prog( struct brw_context *brw )
 				      &brw->vs.prog_data);
    if (brw->vs.prog_bo == NULL)
       do_vs_prog(brw, vp, &key);
+   return dri_bufmgr_check_aperture_space(brw->vs.prog_bo);
 }
 
 
@@ -126,5 +127,5 @@ const struct brw_tracked_state brw_vs_prog = {
       .brw   = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_METAOPS,
       .cache = 0
    },
-   .update = brw_upload_vs_prog
+   .prepare = brw_upload_vs_prog
 };
diff --git a/src/mesa/drivers/dri/i965/brw_vs_constval.c b/src/mesa/drivers/dri/i965/brw_vs_constval.c
index caef042f1c..a0106b8975 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_constval.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_constval.c
@@ -166,7 +166,7 @@ static GLuint get_input_size(struct brw_context *brw,
 /* Calculate sizes of vertex program outputs.  Size is the largest
  * component index which might vary from [0,0,0,1]
  */
-static void calc_wm_input_sizes( struct brw_context *brw )
+static int calc_wm_input_sizes( struct brw_context *brw )
 {
    /* BRW_NEW_VERTEX_PROGRAM */
    struct brw_vertex_program *vp = 
@@ -210,6 +210,7 @@ static void calc_wm_input_sizes( struct brw_context *brw )
       memcpy(brw->wm.input_size_masks, t.size_masks, sizeof(t.size_masks));
       brw->state.dirty.brw |= BRW_NEW_WM_INPUT_DIMENSIONS;
    }
+   return 0;
 }
 
 const struct brw_tracked_state brw_wm_input_sizes = {
@@ -218,6 +219,6 @@ const struct brw_tracked_state brw_wm_input_sizes = {
       .brw   = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_INPUT_DIMENSIONS,
       .cache = 0
    },
-   .update = calc_wm_input_sizes
+   .prepare = calc_wm_input_sizes
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 447e1182b3..3cac97c71f 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -867,7 +867,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (!c->key.know_w_is_one) {
+      if (!BRW_IS_IGD(p->brw) && !c->key.know_w_is_one) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index 573be01a2b..2a64f3df33 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -124,7 +124,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    return bo;
 }
 
-static void upload_vs_unit( struct brw_context *brw )
+static int prepare_vs_unit( struct brw_context *brw )
 {
    struct brw_vs_unit_key key;
 
@@ -138,6 +138,7 @@ static void upload_vs_unit( struct brw_context *brw )
    if (brw->vs.state_bo == NULL) {
       brw->vs.state_bo = vs_unit_create_from_key(brw, &key);
    }
+   return dri_bufmgr_check_aperture_space(brw->vs.state_bo);
 }
 
 const struct brw_tracked_state brw_vs_unit = {
@@ -147,5 +148,5 @@ const struct brw_tracked_state brw_vs_unit = {
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_VS_PROG
    },
-   .update = upload_vs_unit,
+   .prepare = prepare_vs_unit,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_vs_tnl.c b/src/mesa/drivers/dri/i965/brw_vs_tnl.c
index 160fc34cb5..e409620bbf 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_tnl.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_tnl.c
@@ -1581,7 +1581,7 @@ static GLuint hash_key( struct state_key *key )
    return hash;
 }
 
-static void update_tnl_program( struct brw_context *brw )
+static int prepare_tnl_program( struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct state_key key;
@@ -1590,7 +1590,7 @@ static void update_tnl_program( struct brw_context *brw )
 
    /* _NEW_PROGRAM */
    if (brw->attribs.VertexProgram->_Current) 
-      return;
+      return 0;
       
    /* Grab all the relevent state and put it in a single structure:
     */
@@ -1623,6 +1623,7 @@ static void update_tnl_program( struct brw_context *brw )
 
    if (old != brw->tnl_program)
       brw->state.dirty.brw |= BRW_NEW_TNL_PROGRAM;
+   return 0;
 }
 
 /* Note: See brw_draw.c - the vertex program must not rely on
@@ -1642,13 +1643,13 @@ const struct brw_tracked_state brw_tnl_vertprog = {
 	      BRW_NEW_INPUT_VARYING),
       .cache = 0
    },
-   .update = update_tnl_program
+   .prepare = prepare_tnl_program
 };
 
 
 
 
-static void update_active_vertprog( struct brw_context *brw )
+static int prepare_active_vertprog( struct brw_context *brw )
 {
    const struct gl_vertex_program *prev = brw->vertex_program;
 
@@ -1663,6 +1664,8 @@ static void update_active_vertprog( struct brw_context *brw )
 
    if (brw->vertex_program != prev) 
       brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
+
+   return 0;
 }
 
 
@@ -1673,7 +1676,7 @@ const struct brw_tracked_state brw_active_vertprog = {
       .brw = BRW_NEW_TNL_PROGRAM,
       .cache = 0
    },
-   .update = update_active_vertprog
+   .prepare = prepare_active_vertprog
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index abdc92bf01..acbaf178d4 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -325,7 +325,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 }
 
 
-static void brw_upload_wm_prog( struct brw_context *brw )
+static int brw_prepare_wm_prog( struct brw_context *brw )
 {
    struct brw_wm_prog_key key;
    struct brw_fragment_program *fp = (struct brw_fragment_program *)
@@ -342,6 +342,8 @@ static void brw_upload_wm_prog( struct brw_context *brw )
 				      &brw->wm.prog_data);
    if (brw->wm.prog_bo == NULL)
       do_wm_prog(brw, fp, &key);
+
+   return dri_bufmgr_check_aperture_space(brw->wm.prog_bo);
 }
 
 
@@ -362,6 +364,6 @@ const struct brw_tracked_state brw_wm_prog = {
 		BRW_NEW_REDUCED_PRIMITIVE),
       .cache = 0
    },
-   .update = brw_upload_wm_prog
+   .prepare = brw_prepare_wm_prog
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index a02f70a50c..4cda55914c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -724,9 +724,6 @@ static void emit_tex( struct brw_wm_compile *c,
 	      responseLength,
 	      msgLength,
 	      0);	
-
-   if (shadow)
-       brw_MOV(p, dst[3], brw_imm_f(1.0));
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index 6ca7709916..d40332e9ae 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -255,10 +255,11 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
  * complicates various things.  However, this is still too confusing -
  * FIXME: simplify all the different new texture state flags.
  */
-static void upload_wm_samplers( struct brw_context *brw )
+static int upload_wm_samplers( struct brw_context *brw )
 {
    struct wm_sampler_key key;
    int i;
+   int ret = 0;
 
    brw_wm_sampler_populate_key(brw, &key);
 
@@ -270,7 +271,7 @@ static void upload_wm_samplers( struct brw_context *brw )
    dri_bo_unreference(brw->wm.sampler_bo);
    brw->wm.sampler_bo = NULL;
    if (brw->wm.sampler_count == 0)
-      return;
+      return 0;
 
    brw->wm.sampler_bo = brw_search_cache(&brw->cache, BRW_SAMPLER,
 					 &key, sizeof(key),
@@ -303,6 +304,7 @@ static void upload_wm_samplers( struct brw_context *brw )
 	 if (!brw->attribs.Texture->Unit[i]._ReallyEnabled)
 	    continue;
 
+	 ret |= dri_bufmgr_check_aperture_space(brw->wm.sdc_bo[i]);
 	 dri_emit_reloc(brw->wm.sampler_bo,
 			DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
 			0,
@@ -311,6 +313,10 @@ static void upload_wm_samplers( struct brw_context *brw )
 			brw->wm.sdc_bo[i]);
       }
    }
+
+   ret |= dri_bufmgr_check_aperture_space(brw->wm.sampler_bo);
+   return ret;
+
 }
 
 const struct brw_tracked_state brw_wm_samplers = {
@@ -319,7 +325,7 @@ const struct brw_tracked_state brw_wm_samplers = {
       .brw = 0,
       .cache = 0
    },
-   .update = upload_wm_samplers,
+   .prepare = upload_wm_samplers,
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 4b9d7aac1b..f4da0f279e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -227,12 +227,12 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 }
 
 
-static void upload_wm_unit( struct brw_context *brw )
+static int upload_wm_unit( struct brw_context *brw )
 {
    struct intel_context *intel = &brw->intel;
    struct brw_wm_unit_key key;
    dri_bo *reloc_bufs[3];
-
+   int ret = 0, i;
    wm_unit_populate_key(brw, &key);
 
    /* Allocate the necessary scratch space if we haven't already.  Don't
@@ -267,6 +267,12 @@ static void upload_wm_unit( struct brw_context *brw )
    if (brw->wm.state_bo == NULL) {
       brw->wm.state_bo = wm_unit_create_from_key(brw, &key, reloc_bufs);
    }
+
+   for (i = 0; i < 3; i++)
+     if (reloc_bufs[i])
+       ret |= dri_bufmgr_check_aperture_space(reloc_bufs[i]);
+   ret |= dri_bufmgr_check_aperture_space(brw->wm.state_bo);
+   return ret;
 }
 
 const struct brw_tracked_state brw_wm_unit = {
@@ -284,6 +290,6 @@ const struct brw_tracked_state brw_wm_unit = {
 		CACHE_NEW_WM_PROG | 
 		CACHE_NEW_SAMPLER)
    },
-   .update = upload_wm_unit,
+   .prepare = upload_wm_unit,
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index c5c944f781..0d91391964 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -69,7 +69,7 @@ static GLuint translate_tex_target( GLenum target )
 }
 
 
-static GLuint translate_tex_format( GLuint mesa_format )
+static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
 {
    switch( mesa_format ) {
    case MESA_FORMAT_L8:
@@ -114,7 +114,12 @@ static GLuint translate_tex_format( GLuint mesa_format )
       return BRW_SURFACEFORMAT_FXT1;
 
    case MESA_FORMAT_Z16:
-      return BRW_SURFACEFORMAT_I16_UNORM;
+      if (depth_mode == GL_INTENSITY) 
+	  return BRW_SURFACEFORMAT_I16_UNORM;
+      else if (depth_mode == GL_ALPHA)
+	  return BRW_SURFACEFORMAT_A16_UNORM;
+      else
+	  return BRW_SURFACEFORMAT_L16_UNORM;
 
    case MESA_FORMAT_RGB_DXT1:
        return BRW_SURFACEFORMAT_DXT1_RGB;
@@ -143,7 +148,7 @@ static GLuint translate_tex_format( GLuint mesa_format )
 }
 
 struct brw_wm_surface_key {
-   GLenum target;
+   GLenum target, depthmode;
    dri_bo *bo;
    GLint format;
    GLint first_level, last_level;
@@ -163,7 +168,7 @@ brw_create_texture_surface( struct brw_context *brw,
 
    surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
    surf.ss0.surface_type = translate_tex_target(key->target);
-   surf.ss0.surface_format = translate_tex_format(key->format);
+   surf.ss0.surface_format = translate_tex_format(key->format, key->depthmode);
 
    /* This is ok for all textures with channel width 8bit or less:
     */
@@ -207,7 +212,7 @@ brw_create_texture_surface( struct brw_context *brw,
    return bo;
 }
 
-static void
+static int
 brw_update_texture_surface( GLcontext *ctx, GLuint unit )
 {
    struct brw_context *brw = brw_context(ctx);
@@ -215,9 +220,11 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
    struct brw_wm_surface_key key;
+   int ret = 0;
 
    memset(&key, 0, sizeof(key));
    key.target = tObj->Target;
+   key.depthmode = tObj->DepthMode;
    key.format = firstImage->TexFormat->MesaFormat;
    key.bo = intelObj->mt->region->buffer;
    key.first_level = intelObj->firstLevel;
@@ -229,13 +236,19 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    key.depth = firstImage->Depth;
    key.tiled = intelObj->mt->region->tiled;
 
+   ret |= dri_bufmgr_check_aperture_space(key.bo);
+
    dri_bo_unreference(brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS]);
    brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
 						&key, sizeof(key),
 						&key.bo, 1,
 						NULL);
-   if (brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] == NULL)
+   if (brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] == NULL) {
       brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] = brw_create_texture_surface(brw, &key);
+   }
+
+   ret |= dri_bufmgr_check_aperture_space(brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS]);
+   return ret;
 }
 
 /**
@@ -243,12 +256,12 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
  * While it is only used for the front/back buffer currently, it should be
  * usable for further buffers when doing ARB_draw_buffer support.
  */
-static void
+static int
 brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 			  unsigned int unit, GLboolean cached)
 {
    dri_bo *region_bo = NULL;
-
+   int ret = 0;
    struct {
       unsigned int surface_type;
       unsigned int surface_format;
@@ -271,6 +284,8 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
       key.width = region->pitch; /* XXX: not really! */
       key.height = region->height;
       key.cpp = region->cpp;
+
+      ret |= dri_bufmgr_check_aperture_space(region->buffer);
    } else {
       key.surface_type = BRW_SURFACE_NULL;
       key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
@@ -331,6 +346,10 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 			region_bo);
       }
    }
+
+   ret |= dri_bufmgr_check_aperture_space(brw->wm.surf_bo[unit]);
+
+   return ret;
 }
 
 
@@ -384,17 +403,24 @@ brw_wm_get_binding_table(struct brw_context *brw)
    return bind_bo;
 }
 
-static void upload_wm_surfaces(struct brw_context *brw )
+static int prepare_wm_surfaces(struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
-   GLuint i;
+   GLuint i, ret;
+
    if (brw->state.nr_draw_regions  > 1) {
-       for (i = 0; i < brw->state.nr_draw_regions; i++) 
-	   brw_update_region_surface(brw, brw->state.draw_regions[i], i, 
-		GL_FALSE);
-   }else
-       brw_update_region_surface(brw, brw->state.draw_regions[0], 0, GL_TRUE);
+      for (i = 0; i < brw->state.nr_draw_regions; i++) {
+         ret = brw_update_region_surface(brw, brw->state.draw_regions[i], i,
+                                         GL_FALSE);
+         if (ret)
+            return ret;
+      }
+   }else {
+      ret = brw_update_region_surface(brw, brw->state.draw_regions[0], 0, GL_TRUE);
+      if (ret)
+         return ret;
+   }
 
    brw->wm.nr_surfaces = MAX_DRAW_BUFFERS;
 
@@ -402,33 +428,40 @@ static void upload_wm_surfaces(struct brw_context *brw )
       struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[i];
 
       /* _NEW_TEXTURE, BRW_NEW_TEXDATA */
-      if(texUnit->_ReallyEnabled &&
-	 texUnit->_Current == intel->frame_buffer_texobj)
-      {
-	 dri_bo_unreference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
-	 brw->wm.surf_bo[i+MAX_DRAW_BUFFERS] = brw->wm.surf_bo[0];
-	 dri_bo_reference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
-	 brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1;
-      } else if (texUnit->_ReallyEnabled) {
-	 brw_update_texture_surface(ctx, i);
-	 brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1;
+      if(texUnit->_ReallyEnabled) {
+         if (texUnit->_Current == intel->frame_buffer_texobj) {
+            dri_bo_unreference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
+            brw->wm.surf_bo[i+MAX_DRAW_BUFFERS] = brw->wm.surf_bo[0];
+            dri_bo_reference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
+            brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1;
+         } else {
+            ret = brw_update_texture_surface(ctx, i);
+            brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1;
+
+            if (ret)
+               return ret;
+         }
       } else {
-	 dri_bo_unreference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
-	 brw->wm.surf_bo[i+MAX_DRAW_BUFFERS] = NULL;
+         dri_bo_unreference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
+         brw->wm.surf_bo[i+MAX_DRAW_BUFFERS] = NULL;
       }
+
    }
 
    dri_bo_unreference(brw->wm.bind_bo);
    brw->wm.bind_bo = brw_wm_get_binding_table(brw);
+
+   return dri_bufmgr_check_aperture_space(brw->wm.bind_bo);
 }
 
+
 const struct brw_tracked_state brw_wm_surfaces = {
    .dirty = {
       .mesa = _NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS,
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
-   .update = upload_wm_surfaces,
+   .prepare = prepare_wm_surfaces,
 };
 
 
diff --git a/src/mesa/drivers/dri/intel/intel_blit.c b/src/mesa/drivers/dri/intel/intel_blit.c
index 0f990c00b4..4890826a19 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.c
+++ b/src/mesa/drivers/dri/intel/intel_blit.c
@@ -275,8 +275,16 @@ intelEmitCopyBlit(struct intel_context *intel,
    GLuint CMD, BR13;
    int dst_y2 = dst_y + h;
    int dst_x2 = dst_x + w;
+   int ret;
    BATCH_LOCALS;
 
+ again:
+   ret = dri_bufmgr_check_aperture_space(dst_buffer);
+   ret |= dri_bufmgr_check_aperture_space(src_buffer);
+   if (ret) {
+     intel_batchbuffer_flush(intel->batch);
+     goto again;
+   }
 
    DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
        __FUNCTION__,
diff --git a/src/mesa/drivers/dri/intel/intel_bufmgr_ttm.c b/src/mesa/drivers/dri/intel/intel_bufmgr_ttm.c
index 6828425e77..545913fa31 100644
--- a/src/mesa/drivers/dri/intel/intel_bufmgr_ttm.c
+++ b/src/mesa/drivers/dri/intel/intel_bufmgr_ttm.c
@@ -889,7 +889,7 @@ dri_ttm_bo_process_reloc(dri_bo *bo)
 	   struct intel_validate_entry *entry =
 	      &bufmgr_ttm->validate_array[target_buf_ttm->validate_index];
 
-	   entry->bo_arg.d.req.bo_req.flags &= ~DRM_BO_HINT_PRESUMED_OFFSET;
+	   entry->bo_arg.d.req.bo_req.hint &= ~DRM_BO_HINT_PRESUMED_OFFSET;
 	}
     }
 }
@@ -993,7 +993,7 @@ dri_ttm_bo_post_submit(dri_bo *bo)
 	/* Continue walking the tree depth-first. */
 	dri_ttm_bo_post_submit(r->target_buf);
 
-	r->last_target_offset = bo->offset;
+	r->last_target_offset = r->target_buf->offset;
     }
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index 55503f45ae..9205627813 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -272,6 +272,11 @@ intel_miptree_match_image(struct intel_mipmap_tree *mt,
        image->IsCompressed != mt->compressed)
       return GL_FALSE;
 
+   if (!image->IsCompressed &&
+       !mt->compressed &&
+       image->TexFormat->TexelBytes != mt->cpp)
+      return GL_FALSE;
+
    /* Test image dimensions against the base level image adjusted for
     * minification.  This will also catch images not present in the
     * tree, changed targets, etc.
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 5aeb2a18f4..52e062eece 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -68,7 +68,7 @@ PUBLIC const char __driConfigOptions[] =
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_QUALITY
       DRI_CONF_FORCE_S3TC_ENABLE(false)
-      DRI_CONF_ALLOW_LARGE_TEXTURES(1)
+      DRI_CONF_ALLOW_LARGE_TEXTURES(2)
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_DEBUG
      DRI_CONF_NO_RAST(false)
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index a56a395646..bcb65835c6 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -348,8 +348,10 @@ intelTexImage(GLcontext * ctx,
 	 postConvWidth = 32 / texelBytes;
 	 texImage->RowStride = postConvWidth;
       }
-      
-      assert(texImage->RowStride == postConvWidth);
+
+      if (!intelImage->mt) {      
+	  assert(texImage->RowStride == postConvWidth);
+      }
    }
 
    /* Release the reference to a potentially orphaned buffer.   
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index 20c1107947..c567349335 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -69,6 +69,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_ATI_fragment_shader
 #define need_GL_EXT_blend_minmax
 #define need_GL_EXT_fog_coord
+#define need_GL_EXT_multi_draw_arrays
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_blend_equation_separate
 #define need_GL_EXT_blend_func_separate
@@ -132,6 +133,7 @@ const struct dri_extension card_extensions[] =
     { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
     { "GL_EXT_blend_subtract",             NULL },
     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+    { "GL_EXT_multi_draw_arrays",          GL_EXT_multi_draw_arrays_functions },
     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
     { "GL_EXT_stencil_wrap",               NULL },
     { "GL_EXT_texture_edge_clamp",         NULL },
diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
index 05ff5952f3..4edf304438 100644
--- a/src/mesa/drivers/dri/r200/r200_texstate.c
+++ b/src/mesa/drivers/dri/r200/r200_texstate.c
@@ -1815,6 +1815,12 @@ void r200UpdateTextureState( GLcontext *ctx )
    GLboolean ok;
    GLuint dbg;
 
+   /* NOTE: must not manipulate rmesa->state.texture.unit[].unitneeded or
+      rmesa->state.envneeded before a R200_STATECHANGE (or R200_NEWPRIM) since
+      we use these to determine if we want to emit the corresponding state
+      atoms. */
+   R200_NEWPRIM( rmesa );
+
    if (ctx->ATIFragmentShader._Enabled) {
       GLuint i;
       for (i = 0; i < R200_MAX_TEXTURE_UNITS; i++) {
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
index b1e027dc1c..4be0344859 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -215,6 +215,7 @@ static unsigned long t_src_class(enum register_file file)
    case PROGRAM_LOCAL_PARAM:
    case PROGRAM_ENV_PARAM:
    case PROGRAM_NAMED_PARAM:
+   case PROGRAM_CONSTANT:
    case PROGRAM_STATE_VAR:
       return VSF_IN_CLASS_PARAM;
    /*
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index d2ed3105d1..c56a762289 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -84,6 +84,7 @@ int hw_tcl_on = 1;
 #define need_GL_ARB_vertex_program
 #define need_GL_EXT_blend_minmax
 //#define need_GL_EXT_fog_coord
+#define need_GL_EXT_multi_draw_arrays
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_blend_equation_separate
 #define need_GL_EXT_blend_func_separate
@@ -112,6 +113,7 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions},
   {"GL_EXT_blend_subtract",		NULL},
 //  {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
+  {"GL_EXT_multi_draw_arrays",		GL_EXT_multi_draw_arrays_functions},
   {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
   {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
   {"GL_EXT_stencil_two_side",		GL_EXT_stencil_two_side_functions},
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index 45dafd6bcc..bb5f5c35f0 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -774,6 +774,54 @@ struct r300_fragment_program {
 	GLuint optimization;
 };
 
+struct r500_fragment_program {	/* fragment program state used by the r500* code paths */
+	struct gl_fragment_program mesa_program;
+	/* NOTE(review): ctx->FragmentProgram._Current is cast to this struct; presumably mesa_program must stay first. */
+	GLcontext *ctx;
+	GLboolean translated;	/* tested after r500TranslateFragmentShader(); callers bail out while false */
+	GLboolean error;	/* set to GL_TRUE by the ERROR() macro in r500_fragprog.c */
+	struct r300_pfs_compile_state *cs;	/* compile state; cs->nrslots bounds the instructions that get emitted */
+	/* One entry per instruction: six GLuint words, copied in order into hw.r500fp.cmd. */
+	struct {
+		GLuint inst0;
+		GLuint inst1;
+		GLuint inst2;
+		GLuint inst3;
+		GLuint inst4;
+		GLuint inst5;
+	} inst[512];
+	/* TODO: This is magic! (presumably the hard-coded 512 above; give it a named constant) */
+
+	struct {
+		int tex_offset;
+		int tex_end;
+		int alu_offset;
+		int alu_end;
+		int flags;
+	} node[4];
+	int cur_node;
+	int first_node_has_tex;
+
+	int alu_offset;
+	int alu_end;
+	int tex_offset;
+	int tex_end;
+
+	/* Hardware constants.
+	 * Contains a pointer to the value. The destination of the pointer
+	 * is supposed to be updated when GL state changes.
+	 * Typically, this is either a pointer into
+	 * gl_program_parameter_list::ParameterValues, or a pointer to a
+	 * global constant (e.g. for sin/cos-approximation)
+	 */
+	const GLfloat *constant[PFS_NUM_CONST_REGS];
+	int const_nr;	/* number of entries of constant[] currently in use */
+
+	int max_temp_idx;
+
+	GLuint optimization;
+};
+
 #define R300_MAX_AOS_ARRAYS		16
 
 #define REG_COORDS	0
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index c664fb6562..5ba2971fb9 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -2217,6 +2217,7 @@ static void update_params(struct r300_fragment_program *fp)
 void r300TranslateFragmentShader(r300ContextPtr r300,
 				 struct r300_fragment_program *fp)
 {
+
 	struct r300_pfs_compile_state *cs = NULL;
 
 	if (!fp->translated) {
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index eee1e803a0..fc07105c56 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -334,13 +334,26 @@ static GLboolean r300RunRender(GLcontext * ctx,
 static int r300Fallback(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
+	/* Do we need to use new-style shaders?
+	 * Also is there a better way to do this? */
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		struct r500_fragment_program *fp = (struct r500_fragment_program *)
 	    (char *)ctx->FragmentProgram._Current;
-
-	if (fp) {
-		if (!fp->translated)
-			r300TranslateFragmentShader(r300, fp);
-		FALLBACK_IF(!fp->translated);
+		if (fp) {
+			if (!fp->translated) {
+				r500TranslateFragmentShader(r300, fp);
+				FALLBACK_IF(!fp->translated);
+			}
+		}
+	} else {
+		struct r300_fragment_program *fp = (struct r300_fragment_program *)
+	    (char *)ctx->FragmentProgram._Current;
+		if (fp) {
+			if (!fp->translated) {
+				r300TranslateFragmentShader(r300, fp);
+				FALLBACK_IF(!fp->translated);
+			}
+		}
 	}
 
 	FALLBACK_IF(ctx->RenderMode != GL_RENDER);
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index 8f12266a5f..dac37ba273 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -2315,76 +2315,32 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 static void r500SetupPixelShader(r300ContextPtr rmesa)
 {
 	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
+	struct r500_fragment_program *fp = (struct r500_fragment_program *)
 	    (char *)ctx->FragmentProgram._Current;
 	int i, k;
 
-	if (!fp)		/* should only happenen once, just after context is created */
+	if (!fp)	/* should only happen once, just after context is created */
+		return;
+
+	r500TranslateFragmentShader(rmesa, fp);
+	if (!fp->translated) {
+		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
+			__FUNCTION__);
 		return;
+	}
 
-	/* emit the standard zero shader */
 	R300_STATECHANGE(rmesa, r500fp);
-	i = 1;
-	rmesa->hw.r500fp.cmd[i++] = 0x7808;
-	rmesa->hw.r500fp.cmd[i++] = R500_TEX_ID(0) | R500_TEX_INST_LD | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
-	rmesa->hw.r500fp.cmd[i++] = R500_TEX_SRC_ADDR(0) |  R500_TEX_SRC_S_SWIZ_R |
-		R500_TEX_SRC_T_SWIZ_G |
-		R500_TEX_DST_ADDR(0) |
-		R500_TEX_DST_R_SWIZ_R |
-		R500_TEX_DST_G_SWIZ_G |
-		R500_TEX_DST_B_SWIZ_B |
-		R500_TEX_DST_A_SWIZ_A;
-	rmesa->hw.r500fp.cmd[i++] = R500_DX_ADDR(0) |
-		R500_DX_S_SWIZ_R |
-		R500_DX_T_SWIZ_R |
-		R500_DX_R_SWIZ_R |
-		R500_DX_Q_SWIZ_R |
-		R500_DY_ADDR(0) |
-		R500_DY_S_SWIZ_R |
-		R500_DY_T_SWIZ_R |
-		R500_DY_R_SWIZ_R |
-		R500_DY_Q_SWIZ_R;
-	rmesa->hw.r500fp.cmd[i++] = 0x0;
-	rmesa->hw.r500fp.cmd[i++] = 0x0;
-
-	rmesa->hw.r500fp.cmd[i++] = R500_INST_TYPE_OUT |
-		R500_INST_TEX_SEM_WAIT |
-		R500_INST_LAST |
-		R500_INST_RGB_OMASK_R |
-		R500_INST_RGB_OMASK_G |
-		R500_INST_RGB_OMASK_B |
-		R500_INST_ALPHA_OMASK;
-
-	rmesa->hw.r500fp.cmd[i++] = R500_RGB_ADDR0(0) |
-		R500_RGB_ADDR1(0) |
-		R500_RGB_ADDR1_CONST |
-		R500_RGB_ADDR2(0) |
-		R500_RGB_ADDR2_CONST |
-		R500_RGB_SRCP_OP_1_MINUS_2RGB0;
-	rmesa->hw.r500fp.cmd[i++] = R500_ALPHA_ADDR0(0) |
-		R500_ALPHA_ADDR1(0) |
-		R500_ALPHA_ADDR1_CONST |
-		R500_ALPHA_ADDR2(0) |
-		R500_ALPHA_ADDR2_CONST |
-		R500_ALPHA_SRCP_OP_1_MINUS_2A0;
-	rmesa->hw.r500fp.cmd[i++] = R500_ALU_RGB_SEL_A_SRC0 |
-		R500_ALU_RGB_R_SWIZ_A_R |
-		R500_ALU_RGB_G_SWIZ_A_G |
-		R500_ALU_RGB_B_SWIZ_A_B |
-		R500_ALU_RGB_SEL_B_SRC0 |
-		R500_ALU_RGB_R_SWIZ_B_1 |
-		R500_ALU_RGB_B_SWIZ_B_1 |
-		R500_ALU_RGB_G_SWIZ_B_1;
-	rmesa->hw.r500fp.cmd[i++] = R500_ALPHA_OP_MAD |
-		R500_ALPHA_SWIZ_A_A |
-		R500_ALPHA_SWIZ_B_1;
-	rmesa->hw.r500fp.cmd[i++] = R500_ALU_RGBA_OP_MAD |
-		R500_ALU_RGBA_R_SWIZ_0 |
-		R500_ALU_RGBA_G_SWIZ_0 |
-		R500_ALU_RGBA_B_SWIZ_0 |
-		R500_ALU_RGBA_A_SWIZ_0;
-
-	bump_r500fp_count(rmesa->hw.r500fp.cmd, 12);
+	/* Emit our shader... */
+	for (i = 0; i < fp->cs->nrslots; i++) {
+		rmesa->hw.r500fp.cmd[i*6+1] = fp->inst[i].inst0;
+		rmesa->hw.r500fp.cmd[i*6+2] = fp->inst[i].inst1;
+		rmesa->hw.r500fp.cmd[i*6+3] = fp->inst[i].inst2;
+		rmesa->hw.r500fp.cmd[i*6+4] = fp->inst[i].inst3;
+		rmesa->hw.r500fp.cmd[i*6+5] = fp->inst[i].inst4;
+		rmesa->hw.r500fp.cmd[i*6+6] = fp->inst[i].inst5;
+	}
+
+	bump_r500fp_count(rmesa->hw.r500fp.cmd, fp->cs->nrslots * 6);
 
 	R300_STATECHANGE(rmesa, r500fp_const);
 	for (i = 0; i < fp->const_nr; i++) {
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c
index 3638a94380..f94b244232 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.c
@@ -32,6 +32,8 @@
  *
  * \author Jerome Glisse <j.glisse@gmail.com>
  *
+ * \author Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
  * \todo Depth write, WPOS/FOGC inputs
  *
  * \todo FogOption
@@ -48,12 +50,12 @@
 #include "shader/prog_print.h"
 
 #include "r300_context.h"
-#include "r300_fragprog.h"
+#include "r500_fragprog.h"
 #include "r300_reg.h"
 #include "r300_state.h"
 
 /*
- * Usefull macros and values
+ * Useful macros and values
  */
 #define ERROR(fmt, args...) do {			\
 		fprintf(stderr, "%s::%s(): " fmt "\n",	\
@@ -61,430 +63,64 @@
 		fp->error = GL_TRUE;			\
 	} while(0)
 
-#define PFS_INVAL 0xFFFFFFFF
 #define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs
 
-#define SWIZZLE_XYZ		0
-#define SWIZZLE_XXX		1
-#define SWIZZLE_YYY		2
-#define SWIZZLE_ZZZ		3
-#define SWIZZLE_WWW		4
-#define SWIZZLE_YZX		5
-#define SWIZZLE_ZXY		6
-#define SWIZZLE_WZY		7
-#define SWIZZLE_111		8
-#define SWIZZLE_000		9
-#define SWIZZLE_HHH		10
-
-#define swizzle(r, x, y, z, w) do_swizzle(fp, r,		\
-					  ((SWIZZLE_##x<<0)|	\
-					   (SWIZZLE_##y<<3)|	\
-					   (SWIZZLE_##z<<6)|	\
-					   (SWIZZLE_##w<<9)),	\
-					  0)
-
-#define REG_TYPE_INPUT		0
-#define REG_TYPE_OUTPUT		1
-#define REG_TYPE_TEMP		2
-#define REG_TYPE_CONST		3
-
-#define REG_TYPE_SHIFT		0
-#define REG_INDEX_SHIFT		2
-#define REG_VSWZ_SHIFT		8
-#define REG_SSWZ_SHIFT		13
-#define REG_NEGV_SHIFT		18
-#define REG_NEGS_SHIFT		19
-#define REG_ABS_SHIFT		20
-#define REG_NO_USE_SHIFT	21	// Hack for refcounting
-#define REG_VALID_SHIFT		22	// Does the register contain a defined value?
-#define REG_BUILTIN_SHIFT   23	// Is it a builtin (like all zero/all one)?
-
-#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
-#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
-#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
-#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
-#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
-#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
-#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
-#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
-#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
-#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)
-
-#define REG(type, index, vswz, sswz, nouse, valid, builtin)	\
-	(((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
-	 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
-	 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
-	 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
-	 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
-	 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
-	 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
-#define REG_GET_TYPE(reg)						\
-	((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
-#define REG_GET_INDEX(reg)						\
-	((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
-#define REG_GET_VSWZ(reg)						\
-	((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
-#define REG_GET_SSWZ(reg)						\
-	((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
-#define REG_GET_NO_USE(reg)						\
-	((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
-#define REG_GET_VALID(reg)						\
-	((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
-#define REG_GET_BUILTIN(reg)						\
-	((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
-#define REG_SET_TYPE(reg, type)						\
-	reg = ((reg & ~REG_TYPE_MASK) |					\
-	       ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
-#define REG_SET_INDEX(reg, index)					\
-	reg = ((reg & ~REG_INDEX_MASK) |				\
-	       ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
-#define REG_SET_VSWZ(reg, vswz)						\
-	reg = ((reg & ~REG_VSWZ_MASK) |					\
-	       ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
-#define REG_SET_SSWZ(reg, sswz)						\
-	reg = ((reg & ~REG_SSWZ_MASK) |					\
-	       ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
-#define REG_SET_NO_USE(reg, nouse)					\
-	reg = ((reg & ~REG_NO_USE_MASK) |				\
-	       ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
-#define REG_SET_VALID(reg, valid)					\
-	reg = ((reg & ~REG_VALID_MASK) |				\
-	       ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
-#define REG_SET_BUILTIN(reg, builtin)					\
-	reg = ((reg & ~REG_BUILTIN_MASK) |				\
-	       ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
-#define REG_ABS(reg)							\
-	reg = (reg | REG_ABS_MASK)
-#define REG_NEGV(reg)							\
-	reg = (reg | REG_NEGV_MASK)
-#define REG_NEGS(reg)							\
-	reg = (reg | REG_NEGS_MASK)
-
-/*
- * Datas structures for fragment program generation
- */
-
-/* description of r300 native hw instructions */
-static const struct {
-	const char *name;
-	int argc;
-	int v_op;
-	int s_op;
-} r300_fpop[] = {
-	/* *INDENT-OFF* */
-	{"MAD", 3, R300_FPI0_OUTC_MAD, R300_FPI2_OUTA_MAD},
-	{"DP3", 2, R300_FPI0_OUTC_DP3, R300_FPI2_OUTA_DP4},
-	{"DP4", 2, R300_FPI0_OUTC_DP4, R300_FPI2_OUTA_DP4},
-	{"MIN", 2, R300_FPI0_OUTC_MIN, R300_FPI2_OUTA_MIN},
-	{"MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX},
-	{"CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP},
-	{"FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC},
-	{"EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2},
-	{"LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2},
-	{"RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP},
-	{"RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ},
-	{"REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL},
-	{"CMPH", 3, R300_FPI0_OUTC_CMPH, PFS_INVAL},
-	/* *INDENT-ON* */
-};
-
-/* vector swizzles r300 can support natively, with a couple of
- * cases we handle specially
- *
- * REG_VSWZ/REG_SSWZ is an index into this table
- */
-
-/* mapping from SWIZZLE_* to r300 native values for scalar insns */
-#define SWIZZLE_HALF 6
-
-#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
-					  SWIZZLE_##y, \
-					  SWIZZLE_##z, \
-					  SWIZZLE_ZERO))
-/* native swizzles */
-static const struct r300_pfs_swizzle {
-	GLuint hash;		/* swizzle value this matches */
-	GLuint base;		/* base value for hw swizzle */
-	GLuint stride;		/* difference in base between arg0/1/2 */
-	GLuint flags;
-} v_swiz[] = {
-	/* *INDENT-OFF* */
-	{MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
-	{MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
-	{MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
-	{MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
-	{MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
-	{PFS_INVAL, 0, 0, 0},
-	/* *INDENT-ON* */
-};
-
-/* used during matching of non-native swizzles */
-#define SWZ_X_MASK (7 << 0)
-#define SWZ_Y_MASK (7 << 3)
-#define SWZ_Z_MASK (7 << 6)
-#define SWZ_W_MASK (7 << 9)
-static const struct {
-	GLuint hash;		/* used to mask matching swizzle components */
-	int mask;		/* actual outmask */
-	int count;		/* count of components matched */
-} s_mask[] = {
-	/* *INDENT-OFF* */
-	{SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
-	{SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
-	{SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
-	{SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
-	{SWZ_X_MASK, 1, 1},
-	{SWZ_Y_MASK, 2, 1},
-	{SWZ_Z_MASK, 4, 1},
-	{PFS_INVAL, PFS_INVAL, PFS_INVAL}
-	/* *INDENT-ON* */
-};
-
-static const struct {
-	int base;		/* hw value of swizzle */
-	int stride;		/* difference between SRC0/1/2 */
-	GLuint flags;
-} s_swiz[] = {
-	/* *INDENT-OFF* */
-	{R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
-	{R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
-	{R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
-	{R300_FPI2_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
-	{R300_FPI2_ARGA_ZERO, 0, 0},
-	{R300_FPI2_ARGA_ONE, 0, 0},
-	{R300_FPI2_ARGA_HALF, 0, 0}
-	/* *INDENT-ON* */
-};
-
-/* boiler-plate reg, for convenience */
-static const GLuint undef = REG(REG_TYPE_TEMP,
-				0,
-				SWIZZLE_XYZ,
-				SWIZZLE_W,
-				GL_FALSE,
-				GL_FALSE,
-				GL_FALSE);
-
-/* constant one source */
-static const GLuint pfs_one = REG(REG_TYPE_CONST,
-				  0,
-				  SWIZZLE_111,
-				  SWIZZLE_ONE,
-				  GL_FALSE,
-				  GL_TRUE,
-				  GL_TRUE);
-
-/* constant half source */
-static const GLuint pfs_half = REG(REG_TYPE_CONST,
-				   0,
-				   SWIZZLE_HHH,
-				   SWIZZLE_HALF,
-				   GL_FALSE,
-				   GL_TRUE,
-				   GL_TRUE);
-
-/* constant zero source */
-static const GLuint pfs_zero = REG(REG_TYPE_CONST,
-				   0,
-				   SWIZZLE_000,
-				   SWIZZLE_ZERO,
-				   GL_FALSE,
-				   GL_TRUE,
-				   GL_TRUE);
-
-/*
- * Common functions prototypes
- */
-static void dump_program(struct r300_fragment_program *fp);
-static void emit_arith(struct r300_fragment_program *fp, int op,
-		       GLuint dest, int mask,
-		       GLuint src0, GLuint src1, GLuint src2, int flags);
-
-/**
- * Get an R300 temporary that can be written to in the given slot.
- */
-static int get_hw_temp(struct r300_fragment_program *fp, int slot)
-{
-	COMPILE_STATE;
-	int r;
-
-	for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
-		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
-			break;
-	}
-
-	if (r >= PFS_NUM_TEMP_REGS) {
-		ERROR("Out of hardware temps\n");
-		return 0;
-	}
-	// Reserved is used to avoid the following scenario:
-	//  R300 temporary X is first assigned to Mesa temporary Y during vector ops
-	//  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
-	//  Then scalar ops on Mesa temporary Z are emitted and move back in time
-	//  to overwrite the value of temporary Y.
-	// End scenario.
-	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
-	cs->hwtemps[r].free = -1;
-
-	// Reset to some value that won't mess things up when the user
-	// tries to read from a temporary that hasn't been assigned a value yet.
-	// In the normal case, vector_valid and scalar_valid should be set to
-	// a sane value by the first emit that writes to this temporary.
-	cs->hwtemps[r].vector_valid = 0;
-	cs->hwtemps[r].scalar_valid = 0;
-
-	if (r > fp->max_temp_idx)
-		fp->max_temp_idx = r;
-
-	return r;
-}
-
-/**
- * Get an R300 temporary that will act as a TEX destination register.
- */
-static int get_hw_temp_tex(struct r300_fragment_program *fp)
-{
-	COMPILE_STATE;
-	int r;
-
-	for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
-		if (cs->used_in_node & (1 << r))
-			continue;
-
-		// Note: Be very careful here
-		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
-			break;
-	}
-
-	if (r >= PFS_NUM_TEMP_REGS)
-		return get_hw_temp(fp, 0);	/* Will cause an indirection */
-
-	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
-	cs->hwtemps[r].free = -1;
-
-	// Reset to some value that won't mess things up when the user
-	// tries to read from a temporary that hasn't been assigned a value yet.
-	// In the normal case, vector_valid and scalar_valid should be set to
-	// a sane value by the first emit that writes to this temporary.
-	cs->hwtemps[r].vector_valid = cs->nrslots;
-	cs->hwtemps[r].scalar_valid = cs->nrslots;
-
-	if (r > fp->max_temp_idx)
-		fp->max_temp_idx = r;
-
-	return r;
-}
-
-/**
- * Mark the given hardware register as free.
- */
-static void free_hw_temp(struct r300_fragment_program *fp, int idx)
-{
-	COMPILE_STATE;
-
-	// Be very careful here. Consider sequences like
-	//  MAD r0, r1,r2,r3
-	//  TEX r4, ...
-	// The TEX instruction may be moved in front of the MAD instruction
-	// due to the way nodes work. We don't want to alias r1 and r4 in
-	// this case.
-	// I'm certain the register allocation could be further sanitized,
-	// but it's tricky because of stuff that can happen inside emit_tex
-	// and emit_arith.
-	cs->hwtemps[idx].free = cs->nrslots + 1;
-}
-
-/**
- * Create a new Mesa temporary register.
- */
-static GLuint get_temp_reg(struct r300_fragment_program *fp)
-{
-	COMPILE_STATE;
-	GLuint r = undef;
-	GLuint index;
-
-	index = ffs(~cs->temp_in_use);
-	if (!index) {
-		ERROR("Out of program temps\n");
-		return r;
-	}
-
-	cs->temp_in_use |= (1 << --index);
-	cs->temps[index].refcount = 0xFFFFFFFF;
-	cs->temps[index].reg = -1;
-
-	REG_SET_TYPE(r, REG_TYPE_TEMP);
-	REG_SET_INDEX(r, index);
-	REG_SET_VALID(r, GL_TRUE);
-	return r;
+/* "Register" flags: OR'ed into the constant index returned by emit_const4fv() */
+#define REG_CONSTANT (1 << 8)
+
+/* Swizzle tools: selector codes, packed 3 bits per channel in the RGB constants */
+#define R500_SWIZZLE_ZERO 4
+#define R500_SWIZZLE_HALF 5
+#define R500_SWIZZLE_ONE 6
+#define R500_SWIZ_RGB_ZERO ((R500_SWIZZLE_ZERO << 0) | (R500_SWIZZLE_ZERO << 3) | (R500_SWIZZLE_ZERO << 6))
+#define R500_SWIZ_RGB_ONE ((R500_SWIZZLE_ONE << 0) | (R500_SWIZZLE_ONE << 3) | (R500_SWIZZLE_ONE << 6))
+/* Swizzles for inst2 (x is parenthesized so compound arguments shift as a whole) */
+#define MAKE_SWIZ_TEX_STRQ(x) ((x) << 8)
+#define MAKE_SWIZ_TEX_RGBA(x) ((x) << 24)
+/* Swizzles for inst3 */
+#define MAKE_SWIZ_RGB_A(x) ((x) << 2)
+#define MAKE_SWIZ_RGB_B(x) ((x) << 15)
+/* Swizzles for inst4 */
+#define MAKE_SWIZ_ALPHA_A(x) ((x) << 14)
+#define MAKE_SWIZ_ALPHA_B(x) ((x) << 21)
+/* Swizzle for inst5 */
+#define MAKE_SWIZ_RGBA_C(x) ((x) << 14)
+#define MAKE_SWIZ_ALPHA_C(x) ((x) << 27)
+
+static inline GLuint make_rgb_swizzle(struct prog_src_register src) {
+	GLuint packed = 0x0;
+	int c;
+	/* Repack the first three 3-bit selectors of src.Swizzle; make_alpha_swizzle() covers the fourth. */
+	for (c = 0; c < 3; c++) {
+		GLuint sel = (src.Swizzle >> c*3) & 0x7;
+		/* Fix SWIZZLE_ONE: selector 5 is emitted as R500_SWIZZLE_ONE (6). */
+		if (sel == 5) sel = R500_SWIZZLE_ONE;
+		packed |= sel << c*3;
+	}
+	return packed;
 }
 
-/**
- * Create a new Mesa temporary register that will act as the destination
- * register for a texture read.
- */
-static GLuint get_temp_reg_tex(struct r300_fragment_program *fp)
-{
-	COMPILE_STATE;
-	GLuint r = undef;
-	GLuint index;
-
-	index = ffs(~cs->temp_in_use);
-	if (!index) {
-		ERROR("Out of program temps\n");
-		return r;
-	}
-
-	cs->temp_in_use |= (1 << --index);
-	cs->temps[index].refcount = 0xFFFFFFFF;
-	cs->temps[index].reg = get_hw_temp_tex(fp);
-
-	REG_SET_TYPE(r, REG_TYPE_TEMP);
-	REG_SET_INDEX(r, index);
-	REG_SET_VALID(r, GL_TRUE);
-	return r;
+static inline GLuint make_alpha_swizzle(struct prog_src_register src) {	/* returns the alpha-source selector */
+	GLuint swiz = (src.Swizzle >> 9) & 0x7;	/* W is the 4th 3-bit field (bits 9-11), not bit 12 */
+	if (swiz == 5) swiz++;	/* fix SWIZZLE_ONE: 5 is emitted as R500_SWIZZLE_ONE (6) */
+	return swiz;
 }
 
-/**
- * Free a Mesa temporary and the associated R300 temporary.
- */
-static void free_temp(struct r300_fragment_program *fp, GLuint r)
-{
-	COMPILE_STATE;
-	GLuint index = REG_GET_INDEX(r);
-
-	if (!(cs->temp_in_use & (1 << index)))
-		return;
-
-	if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
-		free_hw_temp(fp, cs->temps[index].reg);
-		cs->temps[index].reg = -1;
-		cs->temp_in_use &= ~(1 << index);
-	} else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
-		free_hw_temp(fp, cs->inputs[index].reg);
-		cs->inputs[index].reg = -1;
+static inline GLuint make_strq_swizzle(struct prog_src_register src) {
+	GLuint packed = 0x0;	/* result: four selectors, 2 bits each, first one in the low bits */
+	int c;	/* only the low 2 bits of every 3-bit selector are kept, so values >= 4 wrap */
+	for (c = 0; c < 4; c++) {
+		GLuint sel = (src.Swizzle >> (c * 3)) & 0x3;
+		packed |= sel << (c * 2);
 	}
+	return packed;
 }
 
-/**
- * Emit a hardware constant/parameter.
- *
- * \p cp Stable pointer to an array of 4 floats.
- *  The pointer must be stable in the sense that it remains to be valid
- *  and hold the contents of the constant/parameter throughout the lifetime
- *  of the fragment program (actually, up until the next time the fragment
- *  program is translated).
- */
-static GLuint emit_const4fv(struct r300_fragment_program *fp,
+/* Borrowed verbatim from r300_fragprog since it hasn't changed. */
+static GLuint emit_const4fv(struct r500_fragment_program *fp,
 			    const GLfloat * cp)
 {
-	GLuint reg = undef;
+	GLuint reg = 0x0;
 	int index;
 
 	for (index = 0; index < fp->const_nr; ++index) {
@@ -493,6 +129,7 @@ static GLuint emit_const4fv(struct r300_fragment_program *fp,
 	}
 
 	if (index >= fp->const_nr) {
+		/* TODO: This should be r5xx nums, not r300 */
 		if (index >= PFS_NUM_CONST_REGS) {
 			ERROR("Out of hw constants!\n");
 			return reg;
@@ -502,1586 +139,429 @@ static GLuint emit_const4fv(struct r300_fragment_program *fp,
 		fp->constant[index] = cp;
 	}
 
-	REG_SET_TYPE(reg, REG_TYPE_CONST);
-	REG_SET_INDEX(reg, index);
-	REG_SET_VALID(reg, GL_TRUE);
+	reg = index | REG_CONSTANT;
 	return reg;
 }
 
-static inline GLuint negate(GLuint r)
-{
-	REG_NEGS(r);
-	REG_NEGV(r);
-	return r;
-}
-
-/* Hack, to prevent clobbering sources used multiple times when
- * emulating non-native instructions
- */
-static inline GLuint keep(GLuint r)
-{
-	REG_SET_NO_USE(r, GL_TRUE);
-	return r;
-}
-
-static inline GLuint absolute(GLuint r)
-{
-	REG_ABS(r);
-	return r;
-}
-
-static int swz_native(struct r300_fragment_program *fp,
-		      GLuint src, GLuint * r, GLuint arbneg)
-{
-	/* Native swizzle, handle negation */
-	src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
-
-	if ((arbneg & 0x7) == 0x0) {
-		src = src & ~REG_NEGV_MASK;
-		*r = src;
-	} else if ((arbneg & 0x7) == 0x7) {
-		src |= REG_NEGV_MASK;
-		*r = src;
-	} else {
-		if (!REG_GET_VALID(*r))
-			*r = get_temp_reg(fp);
-		src |= REG_NEGV_MASK;
-		emit_arith(fp,
-			   PFS_OP_MAD,
-			   *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
-		src = src & ~REG_NEGV_MASK;
-		emit_arith(fp,
-			   PFS_OP_MAD,
-			   *r,
-			   (arbneg ^ 0x7) | WRITEMASK_W,
-			   src, pfs_one, pfs_zero, 0);
-	}
-
-	return 3;
-}
-
-static int swz_emit_partial(struct r300_fragment_program *fp,
-			    GLuint src,
-			    GLuint * r, int mask, int mc, GLuint arbneg)
-{
-	GLuint tmp;
-	GLuint wmask = 0;
-
-	if (!REG_GET_VALID(*r))
-		*r = get_temp_reg(fp);
-
-	/* A partial match, VSWZ/mask define what parts of the
-	 * desired swizzle we match
-	 */
-	if (mc + s_mask[mask].count == 3) {
-		wmask = WRITEMASK_W;
-		src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
-	}
-
-	tmp = arbneg & s_mask[mask].mask;
-	if (tmp) {
-		tmp = tmp ^ s_mask[mask].mask;
-		if (tmp) {
-			emit_arith(fp,
-				   PFS_OP_MAD,
-				   *r,
-				   arbneg & s_mask[mask].mask,
-				   keep(src) | REG_NEGV_MASK,
-				   pfs_one, pfs_zero, 0);
-			if (!wmask) {
-				REG_SET_NO_USE(src, GL_TRUE);
-			} else {
-				REG_SET_NO_USE(src, GL_FALSE);
-			}
-			emit_arith(fp,
-				   PFS_OP_MAD,
-				   *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
-		} else {
-			if (!wmask) {
-				REG_SET_NO_USE(src, GL_TRUE);
-			} else {
-				REG_SET_NO_USE(src, GL_FALSE);
-			}
-			emit_arith(fp,
-				   PFS_OP_MAD,
-				   *r,
-				   (arbneg & s_mask[mask].mask) | wmask,
-				   src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
-		}
-	} else {
-		if (!wmask) {
-			REG_SET_NO_USE(src, GL_TRUE);
-		} else {
-			REG_SET_NO_USE(src, GL_FALSE);
-		}
-		emit_arith(fp, PFS_OP_MAD,
-			   *r,
-			   s_mask[mask].mask | wmask,
-			   src, pfs_one, pfs_zero, 0);
-	}
-
-	return s_mask[mask].count;
-}
-
-static GLuint do_swizzle(struct r300_fragment_program *fp,
-			 GLuint src, GLuint arbswz, GLuint arbneg)
-{
-	GLuint r = undef;
-	GLuint vswz;
-	int c_mask = 0;
-	int v_match = 0;
-
-	/* If swizzling from something without an XYZW native swizzle,
-	 * emit result to a temp, and do new swizzle from the temp.
-	 */
-#if 0
-	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
-		GLuint temp = get_temp_reg(fp);
-		emit_arith(fp,
-			   PFS_OP_MAD,
-			   temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
-		src = temp;
-	}
-#endif
-
-	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
-		GLuint vsrcswz =
-		    (v_swiz[REG_GET_VSWZ(src)].
-		     hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
-		    REG_GET_SSWZ(src) << 9;
-		GLint i;
-
-		GLuint newswz = 0;
-		GLuint offset;
-		for (i = 0; i < 4; ++i) {
-			offset = GET_SWZ(arbswz, i);
-
-			newswz |=
-			    (offset <= 3) ? GET_SWZ(vsrcswz,
-						    offset) << i *
-			    3 : offset << i * 3;
-		}
-
-		arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
-		REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
-	} else {
-		/* set scalar swizzling */
-		REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
-
-	}
-	do {
-		vswz = REG_GET_VSWZ(src);
-		do {
-			int chash;
-
-			REG_SET_VSWZ(src, vswz);
-			chash = v_swiz[REG_GET_VSWZ(src)].hash &
-			    s_mask[c_mask].hash;
-
-			if (chash == (arbswz & s_mask[c_mask].hash)) {
-				if (s_mask[c_mask].count == 3) {
-					v_match += swz_native(fp,
-							      src, &r, arbneg);
-				} else {
-					v_match += swz_emit_partial(fp,
-								    src,
-								    &r,
-								    c_mask,
-								    v_match,
-								    arbneg);
-				}
-
-				if (v_match == 3)
-					return r;
-
-				/* Fill with something invalid.. all 0's was
-				 * wrong before, matched SWIZZLE_X.  So all
-				 * 1's will be okay for now
-				 */
-				arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
-			}
-		} while (v_swiz[++vswz].hash != PFS_INVAL);
-		REG_SET_VSWZ(src, SWIZZLE_XYZ);
-	} while (s_mask[++c_mask].hash != PFS_INVAL);
-
-	ERROR("should NEVER get here\n");
-	return r;
-}
-
-static GLuint t_src(struct r300_fragment_program *fp,
-		    struct prog_src_register fpsrc)
-{
-	GLuint r = undef;
-
-	switch (fpsrc.File) {
-	case PROGRAM_TEMPORARY:
-		REG_SET_INDEX(r, fpsrc.Index);
-		REG_SET_VALID(r, GL_TRUE);
-		REG_SET_TYPE(r, REG_TYPE_TEMP);
-		break;
-	case PROGRAM_INPUT:
-		REG_SET_INDEX(r, fpsrc.Index);
-		REG_SET_VALID(r, GL_TRUE);
-		REG_SET_TYPE(r, REG_TYPE_INPUT);
-		break;
-	case PROGRAM_LOCAL_PARAM:
-		r = emit_const4fv(fp,
-				  fp->mesa_program.Base.LocalParams[fpsrc.
-								    Index]);
-		break;
-	case PROGRAM_ENV_PARAM:
-		r = emit_const4fv(fp,
-				  fp->ctx->FragmentProgram.Parameters[fpsrc.
-								      Index]);
-		break;
-	case PROGRAM_STATE_VAR:
-	case PROGRAM_NAMED_PARAM:
-		r = emit_const4fv(fp,
-				  fp->mesa_program.Base.Parameters->
-				  ParameterValues[fpsrc.Index]);
-		break;
-	default:
-		ERROR("unknown SrcReg->File %x\n", fpsrc.File);
-		return r;
-	}
-
-	/* no point swizzling ONE/ZERO/HALF constants... */
-	if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
-		r = do_swizzle(fp, r, fpsrc.Swizzle, fpsrc.NegateBase);
-	return r;
-}
-
-static GLuint t_scalar_src(struct r300_fragment_program *fp,
-			   struct prog_src_register fpsrc)
-{
-	struct prog_src_register src = fpsrc;
-	int sc = GET_SWZ(fpsrc.Swizzle, 0);	/* X */
-
-	src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
-
-	return t_src(fp, src);
-}
-
-static GLuint t_dst(struct r300_fragment_program *fp,
-		    struct prog_dst_register dest)
-{
-	GLuint r = undef;
-
-	switch (dest.File) {
-	case PROGRAM_TEMPORARY:
-		REG_SET_INDEX(r, dest.Index);
-		REG_SET_VALID(r, GL_TRUE);
-		REG_SET_TYPE(r, REG_TYPE_TEMP);
-		return r;
-	case PROGRAM_OUTPUT:
-		REG_SET_TYPE(r, REG_TYPE_OUTPUT);
-		switch (dest.Index) {
-		case FRAG_RESULT_COLR:
-		case FRAG_RESULT_DEPR:
-			REG_SET_INDEX(r, dest.Index);
-			REG_SET_VALID(r, GL_TRUE);
-			return r;
+static GLuint make_src(struct r500_fragment_program *fp, struct prog_src_register src) {
+	GLuint reg;
+	switch (src.File) {
+		case PROGRAM_TEMPORARY:
+			reg = (src.Index << 0x1) | 0x1;
+			break;
+		case PROGRAM_INPUT:
+			/* Ugly hack needed to work around Mesa;
+			 * fragments don't get loaded right otherwise! */
+			reg = 0x0;
+			break;
+		case PROGRAM_CONSTANT:
+			reg = emit_const4fv(fp, fp->mesa_program.Base.Parameters->
+				  ParameterValues[src.Index]);
+			break;
 		default:
-			ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
-			return r;
-		}
-	default:
-		ERROR("Bad DstReg->File 0x%x\n", dest.File);
-		return r;
-	}
-}
-
-static int t_hw_src(struct r300_fragment_program *fp, GLuint src, GLboolean tex)
-{
-	COMPILE_STATE;
-	int idx;
-	int index = REG_GET_INDEX(src);
-
-	switch (REG_GET_TYPE(src)) {
-	case REG_TYPE_TEMP:
-		/* NOTE: if reg==-1 here, a source is being read that
-		 *       hasn't been written to. Undefined results.
-		 */
-		if (cs->temps[index].reg == -1)
-			cs->temps[index].reg = get_hw_temp(fp, cs->nrslots);
-
-		idx = cs->temps[index].reg;
-
-		if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
-			free_temp(fp, src);
-		break;
-	case REG_TYPE_INPUT:
-		idx = cs->inputs[index].reg;
-
-		if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
-			free_hw_temp(fp, cs->inputs[index].reg);
-		break;
-	case REG_TYPE_CONST:
-		return (index | SRC_CONST);
-	default:
-		ERROR("Invalid type for source reg\n");
-		return (0 | SRC_CONST);
+			ERROR("Can't handle src.File %x\n", src.File);
+			reg = 0x0;
+			break;
 	}
-
-	if (!tex)
-		cs->used_in_node |= (1 << idx);
-
-	return idx;
+	return reg;
 }
 
-static int t_hw_dst(struct r300_fragment_program *fp,
-		    GLuint dest, GLboolean tex, int slot)
-{
-	COMPILE_STATE;
-	int idx;
-	GLuint index = REG_GET_INDEX(dest);
-	assert(REG_GET_VALID(dest));
-
-	switch (REG_GET_TYPE(dest)) {
-	case REG_TYPE_TEMP:
-		if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
-			if (!tex) {
-				cs->temps[index].reg = get_hw_temp(fp, slot);
-			} else {
-				cs->temps[index].reg = get_hw_temp_tex(fp);
-			}
-		}
-		idx = cs->temps[index].reg;
-
-		if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
-			free_temp(fp, dest);
-
-		cs->dest_in_node |= (1 << idx);
-		cs->used_in_node |= (1 << idx);
-		break;
-	case REG_TYPE_OUTPUT:
-		switch (index) {
-		case FRAG_RESULT_COLR:
-			fp->node[fp->cur_node].flags |=
-			    R300_PFS_NODE_OUTPUT_COLOR;
+static GLuint make_dest(struct r500_fragment_program *fp, struct prog_dst_register dest) {
+	GLuint reg;
+	switch (dest.File) {
+		case PROGRAM_TEMPORARY:
+			reg = (dest.Index << 0x1) | 0x1;
 			break;
-		case FRAG_RESULT_DEPR:
-			fp->node[fp->cur_node].flags |=
-			    R300_PFS_NODE_OUTPUT_DEPTH;
+		case PROGRAM_OUTPUT:
+			/* Eventually we may need to handle multiple
+			 * rendering targets... */
+			reg = dest.Index;
 			break;
-		}
-		return index;
-		break;
-	default:
-		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
-		return 0;
-	}
-
-	return idx;
-}
-
-static void emit_nop(struct r300_fragment_program *fp)
-{
-	COMPILE_STATE;
-
-	if (cs->nrslots >= PFS_MAX_ALU_INST) {
-		ERROR("Out of ALU instruction slots\n");
-		return;
-	}
-
-	fp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
-	fp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
-	fp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
-	fp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
-	cs->nrslots++;
-}
-
-static void emit_tex(struct r300_fragment_program *fp,
-		     struct prog_instruction *fpi, int opcode)
-{
-	COMPILE_STATE;
-	GLuint coord = t_src(fp, fpi->SrcReg[0]);
-	GLuint dest = undef, rdest = undef;
-	GLuint din, uin;
-	int unit = fpi->TexSrcUnit;
-	int hwsrc, hwdest;
-	GLuint tempreg = 0;
-
-	uin = cs->used_in_node;
-	din = cs->dest_in_node;
-
-	/* Resolve source/dest to hardware registers */
-	if (opcode != R300_FPITX_OP_KIL) {
-		if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
-			/**
-			 * Hardware uses [0..1]x[0..1] range for rectangle textures
-			 * instead of [0..Width]x[0..Height].
-			 * Add a scaling instruction.
-			 *
-			 * \todo Refactor this once we have proper rewriting/optimization
-			 * support for programs.
-			 */
-			gl_state_index tokens[STATE_LENGTH] = {
-				STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
-				0
-			};
-			int factor_index;
-			GLuint factorreg;
-
-			tokens[2] = unit;
-			factor_index =
-			    _mesa_add_state_reference(fp->mesa_program.Base.
-						      Parameters, tokens);
-			factorreg =
-			    emit_const4fv(fp,
-					  fp->mesa_program.Base.Parameters->
-					  ParameterValues[factor_index]);
-			tempreg = keep(get_temp_reg(fp));
-
-			emit_arith(fp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
-				   coord, factorreg, pfs_zero, 0);
-
-			/* Ensure correct node indirection */
-			uin = cs->used_in_node;
-			din = cs->dest_in_node;
-
-			hwsrc = t_hw_src(fp, tempreg, GL_TRUE);
-		} else {
-			hwsrc = t_hw_src(fp, coord, GL_TRUE);
-		}
-
-		dest = t_dst(fp, fpi->DstReg);
-
-		/* r300 doesn't seem to be able to do TEX->output reg */
-		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
-			rdest = dest;
-			dest = get_temp_reg_tex(fp);
-		} else if (fpi->DstReg.WriteMask != WRITEMASK_XYZW) {
-			/* in case write mask isn't XYZW */
-			rdest = dest;
-			dest = get_temp_reg_tex(fp);
-		}
-		hwdest =
-		    t_hw_dst(fp, dest, GL_TRUE,
-			     fp->node[fp->cur_node].alu_offset);
-
-		/* Use a temp that hasn't been used in this node, rather
-		 * than causing an indirection
-		 */
-		if (uin & (1 << hwdest)) {
-			free_hw_temp(fp, hwdest);
-			hwdest = get_hw_temp_tex(fp);
-			cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
-		}
-	} else {
-		hwdest = 0;
-		unit = 0;
-		hwsrc = t_hw_src(fp, coord, GL_TRUE);
-	}
-
-	/* Indirection if source has been written in this node, or if the
-	 * dest has been read/written in this node
-	 */
-	if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
-	     (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
-
-		/* Finish off current node */
-		if (fp->node[fp->cur_node].alu_offset == cs->nrslots)
-			emit_nop(fp);
-
-		fp->node[fp->cur_node].alu_end =
-		    cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
-		assert(fp->node[fp->cur_node].alu_end >= 0);
-
-		if (++fp->cur_node >= PFS_MAX_TEX_INDIRECT) {
-			ERROR("too many levels of texture indirection\n");
-			return;
-		}
-
-		/* Start new node */
-		fp->node[fp->cur_node].tex_offset = fp->tex.length;
-		fp->node[fp->cur_node].alu_offset = cs->nrslots;
-		fp->node[fp->cur_node].tex_end = -1;
-		fp->node[fp->cur_node].alu_end = -1;
-		fp->node[fp->cur_node].flags = 0;
-		cs->used_in_node = 0;
-		cs->dest_in_node = 0;
-	}
-
-	if (fp->cur_node == 0)
-		fp->first_node_has_tex = 1;
-
-	fp->tex.inst[fp->tex.length++] = 0 | (hwsrc << R300_FPITX_SRC_SHIFT)
-	    | (hwdest << R300_FPITX_DST_SHIFT)
-	    | (unit << R300_FPITX_IMAGE_SHIFT)
-	    /* not entirely sure about this */
-	    | (opcode << R300_FPITX_OPCODE_SHIFT);
-
-	cs->dest_in_node |= (1 << hwdest);
-	if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
-		cs->used_in_node |= (1 << hwsrc);
-
-	fp->node[fp->cur_node].tex_end++;
-
-	/* Copy from temp to output if needed */
-	if (REG_GET_VALID(rdest)) {
-		emit_arith(fp, PFS_OP_MAD, rdest, fpi->DstReg.WriteMask, dest,
-			   pfs_one, pfs_zero, 0);
-		free_temp(fp, dest);
-	}
-
-	/* Free temp register */
-	if (tempreg != 0)
-		free_temp(fp, tempreg);
-}
-
-/**
- * Returns the first slot where we could possibly allow writing to dest,
- * according to register allocation.
- */
-static int get_earliest_allowed_write(struct r300_fragment_program *fp,
-				      GLuint dest, int mask)
-{
-	COMPILE_STATE;
-	int idx;
-	int pos;
-	GLuint index = REG_GET_INDEX(dest);
-	assert(REG_GET_VALID(dest));
-
-	switch (REG_GET_TYPE(dest)) {
-	case REG_TYPE_TEMP:
-		if (cs->temps[index].reg == -1)
-			return 0;
-
-		idx = cs->temps[index].reg;
-		break;
-	case REG_TYPE_OUTPUT:
-		return 0;
-	default:
-		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
-		return 0;
-	}
-
-	pos = cs->hwtemps[idx].reserved;
-	if (mask & WRITEMASK_XYZ) {
-		if (pos < cs->hwtemps[idx].vector_lastread)
-			pos = cs->hwtemps[idx].vector_lastread;
-	}
-	if (mask & WRITEMASK_W) {
-		if (pos < cs->hwtemps[idx].scalar_lastread)
-			pos = cs->hwtemps[idx].scalar_lastread;
-	}
-
-	return pos;
-}
-
-/**
- * Allocates a slot for an ALU instruction that can consist of
- * a vertex part or a scalar part or both.
- *
- * Sources from src (src[0] to src[argc-1]) are added to the slot in the
- * appropriate position (vector and/or scalar), and their positions are
- * recorded in the srcpos array.
- *
- * This function emits instruction code for the source fetch and the
- * argument selection. It does not emit instruction code for the
- * opcode or the destination selection.
- *
- * @return the index of the slot
- */
-static int find_and_prepare_slot(struct r300_fragment_program *fp,
-				 GLboolean emit_vop,
-				 GLboolean emit_sop,
-				 int argc, GLuint * src, GLuint dest, int mask)
-{
-	COMPILE_STATE;
-	int hwsrc[3];
-	int srcpos[3];
-	unsigned int used;
-	int tempused;
-	int tempvsrc[3];
-	int tempssrc[3];
-	int pos;
-	int regnr;
-	int i, j;
-
-	// Determine instruction slots, whether sources are required on
-	// vector or scalar side, and the smallest slot number where
-	// all source registers are available
-	used = 0;
-	if (emit_vop)
-		used |= SLOT_OP_VECTOR;
-	if (emit_sop)
-		used |= SLOT_OP_SCALAR;
-
-	pos = get_earliest_allowed_write(fp, dest, mask);
-
-	if (fp->node[fp->cur_node].alu_offset > pos)
-		pos = fp->node[fp->cur_node].alu_offset;
-	for (i = 0; i < argc; ++i) {
-		if (!REG_GET_BUILTIN(src[i])) {
-			if (emit_vop)
-				used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
-			if (emit_sop)
-				used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
-		}
-
-		hwsrc[i] = t_hw_src(fp, src[i], GL_FALSE);	/* Note: sideeffects wrt refcounting! */
-		regnr = hwsrc[i] & 31;
-
-		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
-			if (used & (SLOT_SRC_VECTOR << i)) {
-				if (cs->hwtemps[regnr].vector_valid > pos)
-					pos = cs->hwtemps[regnr].vector_valid;
-			}
-			if (used & (SLOT_SRC_SCALAR << i)) {
-				if (cs->hwtemps[regnr].scalar_valid > pos)
-					pos = cs->hwtemps[regnr].scalar_valid;
-			}
-		}
-	}
-
-	// Find a slot that fits
-	for (;; ++pos) {
-		if (cs->slot[pos].used & used & SLOT_OP_BOTH)
-			continue;
-
-		if (pos >= cs->nrslots) {
-			if (cs->nrslots >= PFS_MAX_ALU_INST) {
-				ERROR("Out of ALU instruction slots\n");
-				return -1;
-			}
-
-			fp->alu.inst[pos].inst0 = NOP_INST0;
-			fp->alu.inst[pos].inst1 = NOP_INST1;
-			fp->alu.inst[pos].inst2 = NOP_INST2;
-			fp->alu.inst[pos].inst3 = NOP_INST3;
-
-			cs->nrslots++;
-		}
-		// Note: When we need both parts (vector and scalar) of a source,
-		// we always try to put them into the same position. This makes the
-		// code easier to read, and it is optimal (i.e. one doesn't gain
-		// anything by splitting the parts).
-		// It also avoids headaches with swizzles that access both parts (i.e WXY)
-		tempused = cs->slot[pos].used;
-		for (i = 0; i < 3; ++i) {
-			tempvsrc[i] = cs->slot[pos].vsrc[i];
-			tempssrc[i] = cs->slot[pos].ssrc[i];
-		}
-
-		for (i = 0; i < argc; ++i) {
-			int flags = (used >> i) & SLOT_SRC_BOTH;
-
-			if (!flags) {
-				srcpos[i] = 0;
-				continue;
-			}
-
-			for (j = 0; j < 3; ++j) {
-				if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
-					if (tempvsrc[j] != hwsrc[i])
-						continue;
-				}
-
-				if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
-					if (tempssrc[j] != hwsrc[i])
-						continue;
-				}
-
-				break;
-			}
-
-			if (j == 3)
-				break;
-
-			srcpos[i] = j;
-			tempused |= flags << j;
-			if (flags & SLOT_SRC_VECTOR)
-				tempvsrc[j] = hwsrc[i];
-			if (flags & SLOT_SRC_SCALAR)
-				tempssrc[j] = hwsrc[i];
-		}
-
-		if (i == argc)
+		default:
+			ERROR("Can't handle dest.File %x\n", dest.File);
+			reg = 0x0;
 			break;
 	}
-
-	// Found a slot, reserve it
-	cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
-	for (i = 0; i < 3; ++i) {
-		cs->slot[pos].vsrc[i] = tempvsrc[i];
-		cs->slot[pos].ssrc[i] = tempssrc[i];
-	}
-
-	for (i = 0; i < argc; ++i) {
-		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
-			int regnr = hwsrc[i] & 31;
-
-			if (used & (SLOT_SRC_VECTOR << i)) {
-				if (cs->hwtemps[regnr].vector_lastread < pos)
-					cs->hwtemps[regnr].vector_lastread =
-					    pos;
-			}
-			if (used & (SLOT_SRC_SCALAR << i)) {
-				if (cs->hwtemps[regnr].scalar_lastread < pos)
-					cs->hwtemps[regnr].scalar_lastread =
-					    pos;
-			}
-		}
-	}
-
-	// Emit the source fetch code
-	fp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
-	fp->alu.inst[pos].inst1 |=
-	    ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
-	     (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
-	     (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
-
-	fp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
-	fp->alu.inst[pos].inst3 |=
-	    ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
-	     (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
-	     (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
-
-	// Emit the argument selection code
-	if (emit_vop) {
-		int swz[3];
-
-		for (i = 0; i < 3; ++i) {
-			if (i < argc) {
-				swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
-					  (srcpos[i] *
-					   v_swiz[REG_GET_VSWZ(src[i])].
-					   stride)) | ((src[i] & REG_NEGV_MASK)
-						       ? ARG_NEG : 0) | ((src[i]
-									  &
-									  REG_ABS_MASK)
-									 ?
-									 ARG_ABS
-									 : 0);
-			} else {
-				swz[i] = R300_FPI0_ARGC_ZERO;
-			}
-		}
-
-		fp->alu.inst[pos].inst0 &=
-		    ~(R300_FPI0_ARG0C_MASK | R300_FPI0_ARG1C_MASK |
-		      R300_FPI0_ARG2C_MASK);
-		fp->alu.inst[pos].inst0 |=
-		    (swz[0] << R300_FPI0_ARG0C_SHIFT) | (swz[1] <<
-							 R300_FPI0_ARG1C_SHIFT)
-		    | (swz[2] << R300_FPI0_ARG2C_SHIFT);
-	}
-
-	if (emit_sop) {
-		int swz[3];
-
-		for (i = 0; i < 3; ++i) {
-			if (i < argc) {
-				swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
-					  (srcpos[i] *
-					   s_swiz[REG_GET_SSWZ(src[i])].
-					   stride)) | ((src[i] & REG_NEGV_MASK)
-						       ? ARG_NEG : 0) | ((src[i]
-									  &
-									  REG_ABS_MASK)
-									 ?
-									 ARG_ABS
-									 : 0);
-			} else {
-				swz[i] = R300_FPI2_ARGA_ZERO;
-			}
-		}
-
-		fp->alu.inst[pos].inst2 &=
-		    ~(R300_FPI2_ARG0A_MASK | R300_FPI2_ARG1A_MASK |
-		      R300_FPI2_ARG2A_MASK);
-		fp->alu.inst[pos].inst2 |=
-		    (swz[0] << R300_FPI2_ARG0A_SHIFT) | (swz[1] <<
-							 R300_FPI2_ARG1A_SHIFT)
-		    | (swz[2] << R300_FPI2_ARG2A_SHIFT);
-	}
-
-	return pos;
-}
-
-/**
- * Append an ALU instruction to the instruction list.
- */
-static void emit_arith(struct r300_fragment_program *fp,
-		       int op,
-		       GLuint dest,
-		       int mask,
-		       GLuint src0, GLuint src1, GLuint src2, int flags)
-{
-	COMPILE_STATE;
-	GLuint src[3] = { src0, src1, src2 };
-	int hwdest;
-	GLboolean emit_vop, emit_sop;
-	int vop, sop, argc;
-	int pos;
-
-	vop = r300_fpop[op].v_op;
-	sop = r300_fpop[op].s_op;
-	argc = r300_fpop[op].argc;
-
-	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
-	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
-		if (mask & WRITEMASK_Z) {
-			mask = WRITEMASK_W;
-		} else {
-			return;
-		}
-	}
-
-	emit_vop = GL_FALSE;
-	emit_sop = GL_FALSE;
-	if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
-		emit_vop = GL_TRUE;
-	if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
-		emit_sop = GL_TRUE;
-
-	pos =
-	    find_and_prepare_slot(fp, emit_vop, emit_sop, argc, src, dest,
-				  mask);
-	if (pos < 0)
-		return;
-
-	hwdest = t_hw_dst(fp, dest, GL_FALSE, pos);	/* Note: Side effects wrt register allocation */
-
-	if (flags & PFS_FLAG_SAT) {
-		vop |= R300_FPI0_OUTC_SAT;
-		sop |= R300_FPI2_OUTA_SAT;
-	}
-
-	/* Throw the pieces together and get FPI0/1 */
-	if (emit_vop) {
-		fp->alu.inst[pos].inst0 |= vop;
-
-		fp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
-
-		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
-			if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-				fp->alu.inst[pos].inst1 |=
-				    (mask & WRITEMASK_XYZ) <<
-				    R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
-			} else
-				assert(0);
-		} else {
-			fp->alu.inst[pos].inst1 |=
-			    (mask & WRITEMASK_XYZ) <<
-			    R300_FPI1_DSTC_REG_MASK_SHIFT;
-
-			cs->hwtemps[hwdest].vector_valid = pos + 1;
-		}
-	}
-
-	/* And now FPI2/3 */
-	if (emit_sop) {
-		fp->alu.inst[pos].inst2 |= sop;
-
-		if (mask & WRITEMASK_W) {
-			if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
-				if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-					fp->alu.inst[pos].inst3 |=
-					    (hwdest << R300_FPI3_DSTA_SHIFT) |
-					    R300_FPI3_DSTA_OUTPUT;
-				} else if (REG_GET_INDEX(dest) ==
-					   FRAG_RESULT_DEPR) {
-					fp->alu.inst[pos].inst3 |=
-					    R300_FPI3_DSTA_DEPTH;
-				} else
-					assert(0);
-			} else {
-				fp->alu.inst[pos].inst3 |=
-				    (hwdest << R300_FPI3_DSTA_SHIFT) |
-				    R300_FPI3_DSTA_REG;
-
-				cs->hwtemps[hwdest].scalar_valid = pos + 1;
-			}
-		}
-	}
-
-	return;
+	return reg;
 }
 
-#if 0
-static GLuint get_attrib(struct r300_fragment_program *fp, GLuint attr)
+static void dumb_shader(struct r500_fragment_program *fp)
 {
-	struct gl_fragment_program *mp = &fp->mesa_program;
-	GLuint r = undef;
-
-	if (!(mp->Base.InputsRead & (1 << attr))) {
-		ERROR("Attribute %d was not provided!\n", attr);
-		return undef;
-	}
-
-	REG_SET_TYPE(r, REG_TYPE_INPUT);
-	REG_SET_INDEX(r, attr);
-	REG_SET_VALID(r, GL_TRUE);
-	return r;
+	/* R500_INST_TYPE_TEX? */
+	fp->inst[0].inst0 = 0x7808;
+	fp->inst[0].inst1 = R500_TEX_ID(0) | R500_TEX_INST_LD | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
+	fp->inst[0].inst2 = R500_TEX_SRC_ADDR(0) |  R500_TEX_SRC_S_SWIZ_R |
+		R500_TEX_SRC_T_SWIZ_G |
+		R500_TEX_DST_ADDR(0) |
+		R500_TEX_DST_R_SWIZ_R |
+		R500_TEX_DST_G_SWIZ_G |
+		R500_TEX_DST_B_SWIZ_B |
+		R500_TEX_DST_A_SWIZ_A;
+	fp->inst[0].inst3 = R500_DX_ADDR(0) |
+		R500_DX_S_SWIZ_R |
+		R500_DX_T_SWIZ_R |
+		R500_DX_R_SWIZ_R |
+		R500_DX_Q_SWIZ_R |
+		R500_DY_ADDR(0) |
+		R500_DY_S_SWIZ_R |
+		R500_DY_T_SWIZ_R |
+		R500_DY_R_SWIZ_R |
+		R500_DY_Q_SWIZ_R;
+	fp->inst[0].inst4 = 0x0;
+	fp->inst[0].inst5 = 0x0;
+
+	fp->inst[1].inst0 = R500_INST_TYPE_OUT |
+		R500_INST_TEX_SEM_WAIT |
+		R500_INST_LAST |
+		R500_INST_RGB_OMASK_R |
+		R500_INST_RGB_OMASK_G |
+		R500_INST_RGB_OMASK_B |
+		R500_INST_ALPHA_OMASK;
+	fp->inst[1].inst1 = R500_RGB_ADDR0(0) |
+		R500_RGB_ADDR1(0) |
+		R500_RGB_ADDR1_CONST |
+		R500_RGB_ADDR2(0) |
+		R500_RGB_ADDR2_CONST |
+		R500_RGB_SRCP_OP_1_MINUS_2RGB0;
+	fp->inst[1].inst2 = R500_ALPHA_ADDR0(0) |
+		R500_ALPHA_ADDR1(0) |
+		R500_ALPHA_ADDR1_CONST |
+		R500_ALPHA_ADDR2(0) |
+		R500_ALPHA_ADDR2_CONST |
+		R500_ALPHA_SRCP_OP_1_MINUS_2A0;
+	fp->inst[1].inst3 = R500_ALU_RGB_SEL_A_SRC0 |
+		R500_ALU_RGB_R_SWIZ_A_R |
+		R500_ALU_RGB_G_SWIZ_A_G |
+		R500_ALU_RGB_B_SWIZ_A_B |
+		R500_ALU_RGB_SEL_B_SRC0 |
+		R500_ALU_RGB_R_SWIZ_B_1 |
+		R500_ALU_RGB_B_SWIZ_B_1 |
+		R500_ALU_RGB_G_SWIZ_B_1;
+	fp->inst[1].inst4 = R500_ALPHA_OP_MAD |
+		R500_ALPHA_SWIZ_A_A |
+		R500_ALPHA_SWIZ_B_1;
+	fp->inst[1].inst5 = R500_ALU_RGBA_OP_MAD |
+		R500_ALU_RGBA_R_SWIZ_0 |
+		R500_ALU_RGBA_G_SWIZ_0 |
+		R500_ALU_RGBA_B_SWIZ_0 |
+		R500_ALU_RGBA_A_SWIZ_0;
+
+	fp->cs->nrslots = 2;
+	fp->translated = GL_TRUE;
 }
-#endif
-
-static GLfloat SinCosConsts[2][4] = {
-	{
-	 1.273239545,		// 4/PI
-	 -0.405284735,		// -4/(PI*PI)
-	 3.141592654,		// PI
-	 0.2225			// weight
-	 },
-	{
-	 0.75,
-	 0.0,
-	 0.159154943,		// 1/(2*PI)
-	 6.283185307		// 2*PI
-	 }
-};
-
-/**
- * Emit a LIT instruction.
- * \p flags may be PFS_FLAG_SAT
- *
- * Definition of LIT (from ARB_fragment_program):
- * tmp = VectorLoad(op0);
- * if (tmp.x < 0) tmp.x = 0;
- * if (tmp.y < 0) tmp.y = 0;
- * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
- * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
- * result.x = 1.0;
- * result.y = tmp.x;
- * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
- * result.w = 1.0;
- *
- * The longest path of computation is the one leading to result.z,
- * consisting of 5 operations. This implementation of LIT takes
- * 5 slots. So unless there's some special undocumented opcode,
- * this implementation is potentially optimal. Unfortunately,
- * emit_arith is a bit too conservative because it doesn't understand
- * partial writes to the vector component.
- */
-static const GLfloat LitConst[4] =
-    { 127.999999, 127.999999, 127.999999, -127.999999 };
-
-static void emit_lit(struct r300_fragment_program *fp,
-		     GLuint dest, int mask, GLuint src, int flags)
-{
-	COMPILE_STATE;
-	GLuint cnst;
-	int needTemporary;
-	GLuint temp;
-
-	cnst = emit_const4fv(fp, LitConst);
-
-	needTemporary = 0;
-	if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
-		needTemporary = 1;
-	} else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
-		// LIT is typically followed by DP3/DP4, so there's no point
-		// in creating special code for this case
-		needTemporary = 1;
-	}
-
-	if (needTemporary) {
-		temp = keep(get_temp_reg(fp));
-	} else {
-		temp = keep(dest);
-	}
-
-	// Note: The order of emit_arith inside the slots is relevant,
-	// because emit_arith only looks at scalar vs. vector when resolving
-	// dependencies, and it does not consider individual vector components,
-	// so swizzling between the two parts can create fake dependencies.
 
-	// First slot
-	emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_XY,
-		   keep(src), pfs_zero, undef, 0);
-	emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
-
-	// Second slot
-	emit_arith(fp, PFS_OP_MIN, temp, WRITEMASK_Z,
-		   swizzle(temp, W, W, W, W), cnst, undef, 0);
-	emit_arith(fp, PFS_OP_LG2, temp, WRITEMASK_W,
-		   swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
-
-	// Third slot
-	// If desired, we saturate the y result here.
-	// This does not affect the use as a condition variable in the CMP later
-	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W,
-		   temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
-	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_Y,
-		   swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
-
-	// Fourth slot
-	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_X,
-		   pfs_one, pfs_one, pfs_zero, 0);
-	emit_arith(fp, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
-
-	// Fifth slot
-	emit_arith(fp, PFS_OP_CMP, temp, WRITEMASK_Z,
-		   pfs_zero, swizzle(temp, W, W, W, W),
-		   negate(swizzle(temp, Y, Y, Y, Y)), flags);
-	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
-		   pfs_zero, 0);
-
-	if (needTemporary) {
-		emit_arith(fp, PFS_OP_MAD, dest, mask,
-			   temp, pfs_one, pfs_zero, flags);
-		free_temp(fp, temp);
-	} else {
-		// Decrease refcount of the destination
-		t_hw_dst(fp, dest, GL_FALSE, cs->nrslots);
-	}
+static void emit_alu(struct r500_fragment_program *fp) {
 }
 
-static GLboolean parse_program(struct r300_fragment_program *fp)
+static GLboolean parse_program(struct r500_fragment_program *fp)
 {
 	struct gl_fragment_program *mp = &fp->mesa_program;
 	const struct prog_instruction *inst = mp->Base.Instructions;
 	struct prog_instruction *fpi;
 	GLuint src[3], dest, temp[2];
-	int flags, mask = 0;
-	int const_sin[2];
+	int flags, mask, counter = 0;
 
 	if (!inst || inst[0].Opcode == OPCODE_END) {
-		ERROR("empty program?\n");
+		ERROR("The program is empty!\n");
 		return GL_FALSE;
 	}
 
 	for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
-		if (fpi->SaturateMode == SATURATE_ZERO_ONE)
-			flags = PFS_FLAG_SAT;
-		else
-			flags = 0;
 
 		if (fpi->Opcode != OPCODE_KIL) {
-			dest = t_dst(fp, fpi->DstReg);
-			mask = fpi->DstReg.WriteMask;
+			dest = make_dest(fp, fpi->DstReg);
+			mask = fpi->DstReg.WriteMask << 11;
 		}
 
 		switch (fpi->Opcode) {
-		case OPCODE_ABS:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   absolute(src[0]), pfs_one, pfs_zero, flags);
-			break;
-		case OPCODE_ADD:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], pfs_one, src[1], flags);
-			break;
-		case OPCODE_CMP:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			src[2] = t_src(fp, fpi->SrcReg[2]);
-			/* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
-			 *    r300 - if src2.c < 0.0 ? src1.c : src0.c
-			 */
-			emit_arith(fp, PFS_OP_CMP, dest, mask,
-				   src[2], src[1], src[0], flags);
-			break;
-		case OPCODE_COS:
-			/*
-			 * cos using a parabola (see SIN):
-			 * cos(x):
-			 *   x = (x/(2*PI))+0.75
-			 *   x = frac(x)
-			 *   x = (x*2*PI)-PI
-			 *   result = sin(x)
-			 */
-			temp[0] = get_temp_reg(fp);
-			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
-			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-
-			/* add 0.5*PI and do range reduction */
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
-				   swizzle(src[0], X, X, X, X),
-				   swizzle(const_sin[1], Z, Z, Z, Z),
-				   swizzle(const_sin[1], X, X, X, X), 0);
-
-			emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
-				   swizzle(temp[0], X, X, X, X),
-				   undef, undef, 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
-				   negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//-PI
-				   0);
-
-			/* SIN */
-
-			emit_arith(fp, PFS_OP_MAD, temp[0],
-				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
-								      Z, Z, Z,
-								      Z),
-				   const_sin[0], pfs_zero, 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
-				   swizzle(temp[0], X, X, X, X), 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
-				   swizzle(temp[0], X, X, X, X),
-				   absolute(swizzle(temp[0], X, X, X, X)),
-				   negate(swizzle(temp[0], X, X, X, X)), 0);
-
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   swizzle(const_sin[0], W, W, W, W),
-				   swizzle(temp[0], X, X, X, X), flags);
-
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_DP3:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_DP3, dest, mask,
-				   src[0], src[1], undef, flags);
-			break;
-		case OPCODE_DP4:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_DP4, dest, mask,
-				   src[0], src[1], undef, flags);
-			break;
-		case OPCODE_DPH:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			/* src0.xyz1 -> temp
-			 * DP4 dest, temp, src1
-			 */
-#if 0
-			temp[0] = get_temp_reg(fp);
-			src[0].s_swz = SWIZZLE_ONE;
-			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
-				   src[0], pfs_one, pfs_zero, 0);
-			emit_arith(fp, PFS_OP_DP4, dest, mask,
-				   temp[0], src[1], undef, flags);
-			free_temp(fp, temp[0]);
-#else
-			emit_arith(fp, PFS_OP_DP4, dest, mask,
-				   swizzle(src[0], X, Y, Z, ONE), src[1],
-				   undef, flags);
-#endif
-			break;
-		case OPCODE_DST:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			/* dest.y = src0.y * src1.y */
-			if (mask & WRITEMASK_Y)
-				emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Y,
-					   keep(src[0]), keep(src[1]),
-					   pfs_zero, flags);
-			/* dest.z = src0.z */
-			if (mask & WRITEMASK_Z)
-				emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Z,
-					   src[0], pfs_one, pfs_zero, flags);
-			/* result.x = 1.0
-			 * result.w = src1.w */
-			if (mask & WRITEMASK_XW) {
-				REG_SET_VSWZ(src[1], SWIZZLE_111);	/*Cheat */
-				emit_arith(fp, PFS_OP_MAD, dest,
-					   mask & WRITEMASK_XW,
-					   src[1], pfs_one, pfs_zero, flags);
-			}
-			break;
-		case OPCODE_EX2:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_EX2, dest, mask,
-				   src[0], undef, undef, flags);
-			break;
-		case OPCODE_FLR:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			temp[0] = get_temp_reg(fp);
-			/* FRC temp, src0
-			 * MAD dest, src0, 1.0, -temp
-			 */
-			emit_arith(fp, PFS_OP_FRC, temp[0], mask,
-				   keep(src[0]), undef, undef, 0);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], pfs_one, negate(temp[0]), flags);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_FRC:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_FRC, dest, mask,
-				   src[0], undef, undef, flags);
-			break;
-		case OPCODE_KIL:
-			emit_tex(fp, fpi, R300_FPITX_OP_KIL);
-			break;
-		case OPCODE_LG2:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_LG2, dest, mask,
-				   src[0], undef, undef, flags);
-			break;
-		case OPCODE_LIT:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			emit_lit(fp, dest, mask, src[0], flags);
-			break;
-		case OPCODE_LRP:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			src[2] = t_src(fp, fpi->SrcReg[2]);
-			/* result = tmp0tmp1 + (1 - tmp0)tmp2
-			 *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
-			 *     MAD temp, -tmp0, tmp2, tmp2
-			 *     MAD result, tmp0, tmp1, temp
-			 */
-			temp[0] = get_temp_reg(fp);
-			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
-				   negate(keep(src[0])), keep(src[2]), src[2],
-				   0);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], src[1], temp[0], flags);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_MAD:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			src[2] = t_src(fp, fpi->SrcReg[2]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], src[1], src[2], flags);
-			break;
-		case OPCODE_MAX:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MAX, dest, mask,
-				   src[0], src[1], undef, flags);
-			break;
-		case OPCODE_MIN:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MIN, dest, mask,
-				   src[0], src[1], undef, flags);
-			break;
-		case OPCODE_MOV:
-		case OPCODE_SWZ:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], pfs_one, pfs_zero, flags);
-			break;
-		case OPCODE_MUL:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], src[1], pfs_zero, flags);
-			break;
-		case OPCODE_POW:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			src[1] = t_scalar_src(fp, fpi->SrcReg[1]);
-			temp[0] = get_temp_reg(fp);
-			emit_arith(fp, PFS_OP_LG2, temp[0], WRITEMASK_W,
-				   src[0], undef, undef, 0);
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
-				   temp[0], src[1], pfs_zero, 0);
-			emit_arith(fp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
-				   temp[0], undef, undef, 0);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_RCP:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_RCP, dest, mask,
-				   src[0], undef, undef, flags);
-			break;
-		case OPCODE_RSQ:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_RSQ, dest, mask,
-				   absolute(src[0]), pfs_zero, pfs_zero, flags);
-			break;
-		case OPCODE_SCS:
-			/*
-			 * scs using a parabola :
-			 * scs(x):
-			 *   result.x = sin(-abs(x)+0.5*PI)  (cos)
-			 *   result.y = sin(x)               (sin)
-			 *
-			 */
-			temp[0] = get_temp_reg(fp);
-			temp[1] = get_temp_reg(fp);
-			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
-			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-
-			/* x = -abs(x)+0.5*PI */
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z),	//PI
-				   pfs_half,
-				   negate(abs
-					  (swizzle(keep(src[0]), X, X, X, X))),
-				   0);
-
-			/* C*x (sin) */
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
-				   swizzle(const_sin[0], Y, Y, Y, Y),
-				   swizzle(keep(src[0]), X, X, X, X),
-				   pfs_zero, 0);
-
-			/* B*x, C*x (cos) */
-			emit_arith(fp, PFS_OP_MAD, temp[0],
-				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
-								      Z, Z, Z,
-								      Z),
-				   const_sin[0], pfs_zero, 0);
-
-			/* B*x (sin) */
-			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
-				   swizzle(const_sin[0], X, X, X, X),
-				   keep(src[0]), pfs_zero, 0);
-
-			/* y = B*x + C*x*abs(x) (sin) */
-			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
-				   absolute(src[0]),
-				   swizzle(temp[0], W, W, W, W),
-				   swizzle(temp[1], W, W, W, W), 0);
-
-			/* y = B*x + C*x*abs(x) (cos) */
-			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
-				   swizzle(temp[0], X, X, X, X), 0);
-
-			/* y*abs(y) - y (cos), y*abs(y) - y (sin) */
-			emit_arith(fp, PFS_OP_MAD, temp[0],
-				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
-								      W, Z, Y,
-								      X),
-				   absolute(swizzle(temp[1], W, Z, Y, X)),
-				   negate(swizzle(temp[1], W, Z, Y, X)), 0);
-
-			/* dest.xy = mad(temp.xy, P, temp2.wz) */
-			emit_arith(fp, PFS_OP_MAD, dest,
-				   mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
-				   swizzle(const_sin[0], W, W, W, W),
-				   swizzle(temp[1], W, Z, Y, X), flags);
-
-			free_temp(fp, temp[0]);
-			free_temp(fp, temp[1]);
-			break;
-		case OPCODE_SGE:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			temp[0] = get_temp_reg(fp);
-			/* temp = src0 - src1
-			 * dest.c = (temp.c < 0.0) ? 0 : 1
-			 */
-			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
-				   src[0], pfs_one, negate(src[1]), 0);
-			emit_arith(fp, PFS_OP_CMP, dest, mask,
-				   pfs_one, pfs_zero, temp[0], 0);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_SIN:
-			/*
-			 *  using a parabola:
-			 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
-			 * extra precision is obtained by weighting against
-			 * itself squared.
-			 */
-
-			temp[0] = get_temp_reg(fp);
-			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
-			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-
-			/* do range reduction */
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
-				   swizzle(keep(src[0]), X, X, X, X),
-				   swizzle(const_sin[1], Z, Z, Z, Z),
-				   pfs_half, 0);
-
-			emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
-				   swizzle(temp[0], X, X, X, X),
-				   undef, undef, 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
-				   negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//PI
-				   0);
-
-			/* SIN */
-
-			emit_arith(fp, PFS_OP_MAD, temp[0],
-				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
-								      Z, Z, Z,
-								      Z),
-				   const_sin[0], pfs_zero, 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
-				   swizzle(temp[0], X, X, X, X), 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
-				   swizzle(temp[0], X, X, X, X),
-				   absolute(swizzle(temp[0], X, X, X, X)),
-				   negate(swizzle(temp[0], X, X, X, X)), 0);
-
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   swizzle(const_sin[0], W, W, W, W),
-				   swizzle(temp[0], X, X, X, X), flags);
-
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_SLT:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			temp[0] = get_temp_reg(fp);
-			/* temp = src0 - src1
-			 * dest.c = (temp.c < 0.0) ? 1 : 0
-			 */
-			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
-				   src[0], pfs_one, negate(src[1]), 0);
-			emit_arith(fp, PFS_OP_CMP, dest, mask,
-				   pfs_zero, pfs_one, temp[0], 0);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_SUB:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], pfs_one, negate(src[1]), flags);
-			break;
-		case OPCODE_TEX:
-			emit_tex(fp, fpi, R300_FPITX_OP_TEX);
-			break;
-		case OPCODE_TXB:
-			emit_tex(fp, fpi, R300_FPITX_OP_TXB);
-			break;
-		case OPCODE_TXP:
-			emit_tex(fp, fpi, R300_FPITX_OP_TXP);
-			break;
-		case OPCODE_XPD:{
-				src[0] = t_src(fp, fpi->SrcReg[0]);
-				src[1] = t_src(fp, fpi->SrcReg[1]);
-				temp[0] = get_temp_reg(fp);
-				/* temp = src0.zxy * src1.yzx */
-				emit_arith(fp, PFS_OP_MAD, temp[0],
-					   WRITEMASK_XYZ, swizzle(keep(src[0]),
-								  Z, X, Y, W),
-					   swizzle(keep(src[1]), Y, Z, X, W),
-					   pfs_zero, 0);
-				/* dest.xyz = src0.yzx * src1.zxy - temp
-				 * dest.w       = undefined
-				 * */
-				emit_arith(fp, PFS_OP_MAD, dest,
-					   mask & WRITEMASK_XYZ, swizzle(src[0],
-									 Y, Z,
-									 X, W),
-					   swizzle(src[1], Z, X, Y, W),
-					   negate(temp[0]), flags);
-				/* cleanup */
-				free_temp(fp, temp[0]);
+			case OPCODE_ABS:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				/* Variation on MOV */
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU
+					| mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
+				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+					| R500_ALU_RGB_MOD_A_ABS | R500_ALU_RGB_SEL_B_SRC0
+					| MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
+				fp->inst[counter].inst4 = R500_ALPHA_OP_MAX
+					| R500_ALPHA_ADDRD(dest)
+					| R500_ALPHA_SEL_A_SRC0
+					| MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) | R500_ALPHA_MOD_A_ABS
+					| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
+					| R500_ALU_RGBA_ADDRD(dest);
+				break;
+			case OPCODE_ADD:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				src[1] = make_src(fp, fpi->SrcReg[1]);
+				/* Variation on MAD: 1*src0+src1 */
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU
+					| mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+					| R500_RGB_ADDR1(src[1]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+					| R500_ALPHA_ADDR1(src[1]);
+				fp->inst[counter].inst3 = /* 1 */
+					MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+					| R500_ALU_RGB_SEL_B_SRC0 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
+				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
+					| R500_ALPHA_ADDRD(dest)
+					| MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+					| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+					| R500_ALU_RGBA_ADDRD(dest)
+					| R500_ALU_RGBA_SEL_C_SRC1
+					| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
+					| R500_ALU_RGBA_ALPHA_SEL_C_SRC1
+					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]));
+				break;
+			case OPCODE_DP3:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				src[1] = make_src(fp, fpi->SrcReg[1]);
+				src[2] = make_src(fp, fpi->SrcReg[2]);
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU
+					| mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+					| R500_RGB_ADDR1(src[1]) | R500_RGB_ADDR2(src[2]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+					| R500_ALPHA_ADDR1(src[1]) | R500_ALPHA_ADDR2(src[2]);
+				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst4 = R500_ALPHA_OP_DP
+					| R500_ALPHA_ADDRD(dest)
+					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_DP3
+					| R500_ALU_RGBA_ADDRD(dest)
+					| R500_ALU_RGBA_SEL_C_SRC2
+					| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[2]))
+					| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[2]));
+				break;
+			case OPCODE_DP4:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				src[1] = make_src(fp, fpi->SrcReg[1]);
+				src[2] = make_src(fp, fpi->SrcReg[2]);
+				/* Based on DP3 */
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU
+					| mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+					| R500_RGB_ADDR1(src[1]) | R500_RGB_ADDR2(src[2]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+					| R500_ALPHA_ADDR1(src[1]) | R500_ALPHA_ADDR2(src[2]);
+				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst4 = R500_ALPHA_OP_DP
+					| R500_ALPHA_ADDRD(dest)
+					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4
+					| R500_ALU_RGBA_ADDRD(dest)
+					| R500_ALU_RGBA_SEL_C_SRC2
+					| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[2]))
+					| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[2]));
+				break;
+			case OPCODE_MAD:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				src[1] = make_src(fp, fpi->SrcReg[1]);
+				src[2] = make_src(fp, fpi->SrcReg[2]);
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU
+					| mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+					| R500_RGB_ADDR1(src[1]) | R500_RGB_ADDR2(src[2]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+					| R500_ALPHA_ADDR1(src[1]) | R500_ALPHA_ADDR2(src[2]);
+				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
+					| R500_ALPHA_ADDRD(dest)
+					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+					| R500_ALU_RGBA_ADDRD(dest)
+					| R500_ALU_RGBA_SEL_C_SRC2
+					| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[2]))
+					| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[2]));
+				break;
+			case OPCODE_MAX:
+				/* MAX takes two distinct operands: A reads SrcReg[0], B reads SrcReg[1]. */
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				src[1] = make_src(fp, fpi->SrcReg[1]);
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
+				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+					| R500_ALU_RGB_SEL_B_SRC1
+					| MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst4 = R500_ALPHA_OP_MAX
+					| R500_ALPHA_ADDRD(dest)
+					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX | R500_ALU_RGBA_ADDRD(dest);
+				break;
+			case OPCODE_MIN:
+				/* MIN takes two distinct operands: A reads SrcReg[0], B reads SrcReg[1]. */
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				src[1] = make_src(fp, fpi->SrcReg[1]);
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
+				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+					| R500_ALU_RGB_SEL_B_SRC1
+					| MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst4 = R500_ALPHA_OP_MIN
+					| R500_ALPHA_ADDRD(dest)
+					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MIN | R500_ALU_RGBA_ADDRD(dest);
+				break;
+			case OPCODE_MOV:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				/* We use MAX, but MIN, CND, and CMP also work.
+				 * Just remember to disable the OMOD! */
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU
+					| mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
+				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+					| R500_ALU_RGB_R_SWIZ_A_R | R500_ALU_RGB_G_SWIZ_A_G | R500_ALU_RGB_B_SWIZ_A_B
+					| R500_ALU_RGB_SEL_B_SRC0
+					| R500_ALU_RGB_R_SWIZ_B_R | R500_ALU_RGB_G_SWIZ_B_G | R500_ALU_RGB_B_SWIZ_B_B
+					| R500_ALU_RGB_OMOD_DISABLE;
+				fp->inst[counter].inst4 = R500_ALPHA_OP_MAX
+					| R500_ALPHA_ADDRD(dest)
+					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SEL_B_SRC0
+					| R500_ALPHA_OMOD_DISABLE;
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
+					| R500_ALU_RGBA_ADDRD(dest);
+				break;
+			case OPCODE_MUL:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				src[1] = make_src(fp, fpi->SrcReg[1]);
+				/* Variation on MAD: src0*src1+0 */
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU
+					| mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+					| R500_RGB_ADDR1(src[1]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+					| R500_ALPHA_ADDR1(src[1]);
+				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
+					| R500_ALPHA_ADDRD(dest)
+					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+					| R500_ALU_RGBA_ADDRD(dest)
+					// | R500_ALU_RGBA_SEL_C_SRC2
+					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+					// | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+				break;
+			case OPCODE_SUB:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				src[1] = make_src(fp, fpi->SrcReg[1]);
+				/* Variation on MAD: 1*src0-src1 */
+				fp->inst[counter].inst0 = R500_INST_TYPE_ALU
+					| mask;
+				fp->inst[counter].inst1 = R500_RGB_ADDR1(src[0])
+					| R500_RGB_ADDR2(src[1]);
+				fp->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0])
+					| R500_ALPHA_ADDR2(src[1]);
+				fp->inst[counter].inst3 = /* 1 */
+					MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
+				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
+					| R500_ALPHA_ADDRD(dest)
+					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
+				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+					| R500_ALU_RGBA_ADDRD(dest)
+					| R500_ALU_RGBA_SEL_C_SRC2
+					| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
+					| R500_ALU_RGBA_MOD_C_NEG
+					| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]))
+					| R500_ALU_RGBA_ALPHA_MOD_C_NEG;
+				break;
+			case OPCODE_TEX:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				fp->inst[counter].inst0 = R500_INST_TYPE_TEX | mask
+					| R500_INST_TEX_SEM_WAIT;
+				fp->inst[counter].inst1 = fpi->TexSrcUnit
+					| R500_TEX_INST_LD | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
+				fp->inst[counter].inst2 = R500_TEX_SRC_ADDR(src[0])
+					/* | MAKE_SWIZ_TEX_STRQ(make_strq_swizzle(fpi->SrcReg[0])) */
+					| R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G
+					| R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A
+					| R500_TEX_DST_ADDR(dest)
+					| R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
+					| R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
+				fp->inst[counter].inst3 = 0x0;
+				fp->inst[counter].inst4 = 0x0;
+				fp->inst[counter].inst5 = 0x0;
+				break;
+			case OPCODE_TXP:
+				src[0] = make_src(fp, fpi->SrcReg[0]);
+				fp->inst[counter].inst0 = R500_INST_TYPE_TEX | mask;
+				fp->inst[counter].inst1 = fpi->TexSrcUnit
+					| R500_TEX_INST_PROJ | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
+				fp->inst[counter].inst2 = R500_TEX_SRC_ADDR(src[0])
+					/* | MAKE_SWIZ_TEX_STRQ(make_strq_swizzle(fpi->SrcReg[0])) */
+					| R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G
+					| R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A
+					| R500_TEX_DST_ADDR(dest)
+					| R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
+					| R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
+				fp->inst[counter].inst3 = 0x0;
+				fp->inst[counter].inst4 = 0x0;
+				fp->inst[counter].inst5 = 0x0;
+				break;
+			default:
+				ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
 				break;
-			}
-		default:
-			ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
-			break;
 		}
 
+		/* Finishing touches */
+		if (fpi->SaturateMode == SATURATE_ZERO_ONE) {
+			fp->inst[counter].inst0 |= R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP;
+		}
+		if (fpi->DstReg.File == PROGRAM_OUTPUT) {
+			fp->inst[counter].inst0 |= R500_INST_TYPE_OUT
+			| R500_INST_RGB_OMASK_R | R500_INST_RGB_OMASK_G
+			| R500_INST_RGB_OMASK_B | R500_INST_ALPHA_OMASK;
+		}
+
+		counter++;
+
 		if (fp->error)
 			return GL_FALSE;
 
 	}
 
-	return GL_TRUE;
-}
-
-static void insert_wpos(struct gl_program *prog)
-{
-	static gl_state_index tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
-	};
-	struct prog_instruction *fpi;
-	GLuint window_index;
-	int i = 0;
-	GLuint tempregi = prog->NumTemporaries;
-	/* should do something else if no temps left... */
-	prog->NumTemporaries++;
-
-	fpi = _mesa_alloc_instructions(prog->NumInstructions + 3);
-	_mesa_init_instructions(fpi, prog->NumInstructions + 3);
-
-	/* perspective divide */
-	fpi[i].Opcode = OPCODE_RCP;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_W;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	fpi[i].Opcode = OPCODE_MUL;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[1].Index = tempregi;
-	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	/* viewport transformation */
-	window_index = _mesa_add_state_reference(prog->Parameters, tokens);
-
-	fpi[i].Opcode = OPCODE_MAD;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[0].Index = tempregi;
-	fpi[i].SrcReg[0].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[1].Index = window_index;
-	fpi[i].SrcReg[1].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[2].Index = window_index;
-	fpi[i].SrcReg[2].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-	i++;
+	fp->cs->nrslots = counter;
 
-	_mesa_copy_instructions(&fpi[i], prog->Instructions,
-				prog->NumInstructions);
-
-	free(prog->Instructions);
-
-	prog->Instructions = fpi;
-
-	prog->NumInstructions += i;
-	fpi = &prog->Instructions[prog->NumInstructions - 1];
-
-	assert(fpi->Opcode == OPCODE_END);
-
-	for (fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++) {
-		for (i = 0; i < 3; i++)
-			if (fpi->SrcReg[i].File == PROGRAM_INPUT &&
-			    fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS) {
-				fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
-				fpi->SrcReg[i].Index = tempregi;
-			}
+	/* Flag the final emitted slot (unless the low two bits of its inst0 are 0x2).
+	 * counter was already advanced past that slot, so it lives at counter - 1. */
+	if (counter > 0 && ((fp->inst[counter - 1].inst0 & 0x3) ^ 0x2)) {
+		fp->inst[counter - 1].inst0 |= R500_INST_TYPE_OUT
+		| R500_INST_TEX_SEM_WAIT | R500_INST_LAST;
 	}
+
+	return GL_TRUE;
 }
 
-/* - Init structures
- * - Determine what hwregs each input corresponds to
- */
-static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
+static void init_program(r300ContextPtr r300, struct r500_fragment_program *fp)
 {
 	struct r300_pfs_compile_state *cs = NULL;
 	struct gl_fragment_program *mp = &fp->mesa_program;
@@ -2096,7 +576,6 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
 	fp->translated = GL_FALSE;
 	fp->error = GL_FALSE;
 	fp->cs = cs = &(R300_CONTEXT(fp->ctx)->state.pfs_compile);
-	fp->tex.length = 0;
 	fp->cur_node = 0;
 	fp->first_node_has_tex = 0;
 	fp->const_nr = 0;
@@ -2120,6 +599,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
 	 * starting from register 0.
 	 */
 
+#if 0
 	/* Texcoords come first */
 	for (i = 0; i < fp->ctx->Const.MaxTextureUnits; i++) {
 		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
@@ -2160,6 +640,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
 			if (InputsRead & (1 << i))
 				cs->inputs[i].reg = 0;
 	}
+#endif
 
 	/* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
 	 * That way, we can free up the reg when it's no longer needed
@@ -2204,7 +685,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
 	cs->temp_in_use = temps_used;
 }
 
-static void update_params(struct r300_fragment_program *fp)
+static void update_params(struct r500_fragment_program *fp)
 {
 	struct gl_fragment_program *mp = &fp->mesa_program;
 
@@ -2214,17 +695,25 @@ static void update_params(struct r300_fragment_program *fp)
 }
 
 void r500TranslateFragmentShader(r300ContextPtr r300,
-				 struct r300_fragment_program *fp)
+				 struct r500_fragment_program *fp)
 {
+
 	struct r300_pfs_compile_state *cs = NULL;
 
 	if (!fp->translated) {
 
+		/* I need to see what I'm working with! */
+		fprintf(stderr, "Mesa program:\n");
+		fprintf(stderr, "-------------\n");
+		_mesa_print_program(&fp->mesa_program.Base);
+		fflush(stdout);
+
 		init_program(r300, fp);
 		cs = fp->cs;
 
 		if (parse_program(fp) == GL_FALSE) {
-			dump_program(fp);
+			ERROR("Huh. Couldn't parse program. There should be additional errors explaining why.\nUsing dumb shader...\n");
+			dumb_shader(fp);
 			return;
 		}
 
@@ -2235,242 +724,12 @@ void r500TranslateFragmentShader(r300ContextPtr r300,
 			fp->node[fp->cur_node].tex_end = 0;
 		fp->alu_offset = 0;
 		fp->alu_end = cs->nrslots - 1;
-		fp->tex_offset = 0;
-		fp->tex_end = fp->tex.length ? fp->tex.length - 1 : 0;
-		assert(fp->node[fp->cur_node].alu_end >= 0);
-		assert(fp->alu_end >= 0);
+		//assert(fp->node[fp->cur_node].alu_end >= 0);
+		//assert(fp->alu_end >= 0);
 
 		fp->translated = GL_TRUE;
-		if (RADEON_DEBUG & DEBUG_PIXEL)
-			dump_program(fp);
 		r300UpdateStateParameters(fp->ctx, _NEW_PROGRAM);
 	}
 
 	update_params(fp);
 }
-
-/* just some random things... */
-static void dump_program(struct r300_fragment_program *fp)
-{
-	int n, i, j;
-	static int pc = 0;
-
-	fprintf(stderr, "pc=%d*************************************\n", pc++);
-
-	fprintf(stderr, "Mesa program:\n");
-	fprintf(stderr, "-------------\n");
-	_mesa_print_program(&fp->mesa_program.Base);
-	fflush(stdout);
-
-	fprintf(stderr, "Hardware program\n");
-	fprintf(stderr, "----------------\n");
-
-	for (n = 0; n < (fp->cur_node + 1); n++) {
-		fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
-			"alu_end: %d, tex_end: %d\n", n,
-			fp->node[n].alu_offset,
-			fp->node[n].tex_offset,
-			fp->node[n].alu_end, fp->node[n].tex_end);
-
-		if (fp->tex.length) {
-			fprintf(stderr, "  TEX:\n");
-			for (i = fp->node[n].tex_offset;
-			     i <= fp->node[n].tex_offset + fp->node[n].tex_end;
-			     ++i) {
-				const char *instr;
-
-				switch ((fp->tex.
-					 inst[i] >> R300_FPITX_OPCODE_SHIFT) &
-					15) {
-				case R300_FPITX_OP_TEX:
-					instr = "TEX";
-					break;
-				case R300_FPITX_OP_KIL:
-					instr = "KIL";
-					break;
-				case R300_FPITX_OP_TXP:
-					instr = "TXP";
-					break;
-				case R300_FPITX_OP_TXB:
-					instr = "TXB";
-					break;
-				default:
-					instr = "UNKNOWN";
-				}
-
-				fprintf(stderr,
-					"    %s t%i, %c%i, texture[%i]   (%08x)\n",
-					instr,
-					(fp->tex.
-					 inst[i] >> R300_FPITX_DST_SHIFT) & 31,
-					(fp->tex.
-					 inst[i] & R300_FPITX_SRC_CONST) ? 'c' :
-					't',
-					(fp->tex.
-					 inst[i] >> R300_FPITX_SRC_SHIFT) & 31,
-					(fp->tex.
-					 inst[i] & R300_FPITX_IMAGE_MASK) >>
-					R300_FPITX_IMAGE_SHIFT,
-					fp->tex.inst[i]);
-			}
-		}
-
-		for (i = fp->node[n].alu_offset;
-		     i <= fp->node[n].alu_offset + fp->node[n].alu_end; ++i) {
-			char srcc[3][10], dstc[20];
-			char srca[3][10], dsta[20];
-			char argc[3][20];
-			char arga[3][20];
-			char flags[5], tmp[10];
-
-			for (j = 0; j < 3; ++j) {
-				int regc = fp->alu.inst[i].inst1 >> (j * 6);
-				int rega = fp->alu.inst[i].inst3 >> (j * 6);
-
-				sprintf(srcc[j], "%c%i",
-					(regc & 32) ? 'c' : 't', regc & 31);
-				sprintf(srca[j], "%c%i",
-					(rega & 32) ? 'c' : 't', rega & 31);
-			}
-
-			dstc[0] = 0;
-			sprintf(flags, "%s%s%s",
-				(fp->alu.inst[i].
-				 inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "",
-				(fp->alu.inst[i].
-				 inst1 & R300_FPI1_DSTC_REG_Y) ? "y" : "",
-				(fp->alu.inst[i].
-				 inst1 & R300_FPI1_DSTC_REG_Z) ? "z" : "");
-			if (flags[0] != 0) {
-				sprintf(dstc, "t%i.%s ",
-					(fp->alu.inst[i].
-					 inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
-					flags);
-			}
-			sprintf(flags, "%s%s%s",
-				(fp->alu.inst[i].
-				 inst1 & R300_FPI1_DSTC_OUTPUT_X) ? "x" : "",
-				(fp->alu.inst[i].
-				 inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? "y" : "",
-				(fp->alu.inst[i].
-				 inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? "z" : "");
-			if (flags[0] != 0) {
-				sprintf(tmp, "o%i.%s",
-					(fp->alu.inst[i].
-					 inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
-					flags);
-				strcat(dstc, tmp);
-			}
-
-			dsta[0] = 0;
-			if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) {
-				sprintf(dsta, "t%i.w ",
-					(fp->alu.inst[i].
-					 inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
-			}
-			if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) {
-				sprintf(tmp, "o%i.w ",
-					(fp->alu.inst[i].
-					 inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
-				strcat(dsta, tmp);
-			}
-			if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) {
-				strcat(dsta, "Z");
-			}
-
-			fprintf(stderr,
-				"%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
-				"       w: %3s %3s %3s -> %-20s (%08x)\n", i,
-				srcc[0], srcc[1], srcc[2], dstc,
-				fp->alu.inst[i].inst1, srca[0], srca[1],
-				srca[2], dsta, fp->alu.inst[i].inst3);
-
-			for (j = 0; j < 3; ++j) {
-				int regc = fp->alu.inst[i].inst0 >> (j * 7);
-				int rega = fp->alu.inst[i].inst2 >> (j * 7);
-				int d;
-				char buf[20];
-
-				d = regc & 31;
-				if (d < 12) {
-					switch (d % 4) {
-					case R300_FPI0_ARGC_SRC0C_XYZ:
-						sprintf(buf, "%s.xyz",
-							srcc[d / 4]);
-						break;
-					case R300_FPI0_ARGC_SRC0C_XXX:
-						sprintf(buf, "%s.xxx",
-							srcc[d / 4]);
-						break;
-					case R300_FPI0_ARGC_SRC0C_YYY:
-						sprintf(buf, "%s.yyy",
-							srcc[d / 4]);
-						break;
-					case R300_FPI0_ARGC_SRC0C_ZZZ:
-						sprintf(buf, "%s.zzz",
-							srcc[d / 4]);
-						break;
-					}
-				} else if (d < 15) {
-					sprintf(buf, "%s.www", srca[d - 12]);
-				} else if (d == 20) {
-					sprintf(buf, "0.0");
-				} else if (d == 21) {
-					sprintf(buf, "1.0");
-				} else if (d == 22) {
-					sprintf(buf, "0.5");
-				} else if (d >= 23 && d < 32) {
-					d -= 23;
-					switch (d / 3) {
-					case 0:
-						sprintf(buf, "%s.yzx",
-							srcc[d % 3]);
-						break;
-					case 1:
-						sprintf(buf, "%s.zxy",
-							srcc[d % 3]);
-						break;
-					case 2:
-						sprintf(buf, "%s.Wzy",
-							srcc[d % 3]);
-						break;
-					}
-				} else {
-					sprintf(buf, "%i", d);
-				}
-
-				sprintf(argc[j], "%s%s%s%s",
-					(regc & 32) ? "-" : "",
-					(regc & 64) ? "|" : "",
-					buf, (regc & 64) ? "|" : "");
-
-				d = rega & 31;
-				if (d < 9) {
-					sprintf(buf, "%s.%c", srcc[d / 3],
-						'x' + (char)(d % 3));
-				} else if (d < 12) {
-					sprintf(buf, "%s.w", srca[d - 9]);
-				} else if (d == 16) {
-					sprintf(buf, "0.0");
-				} else if (d == 17) {
-					sprintf(buf, "1.0");
-				} else if (d == 18) {
-					sprintf(buf, "0.5");
-				} else {
-					sprintf(buf, "%i", d);
-				}
-
-				sprintf(arga[j], "%s%s%s%s",
-					(rega & 32) ? "-" : "",
-					(rega & 64) ? "|" : "",
-					buf, (rega & 64) ? "|" : "");
-			}
-
-			fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
-				"       w: %8s %8s %8s    op: %08x\n",
-				argc[0], argc[1], argc[2],
-				fp->alu.inst[i].inst0, arga[0], arga[1],
-				arga[2], fp->alu.inst[i].inst2);
-		}
-	}
-}
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.h b/src/mesa/drivers/dri/r300/r500_fragprog.h
index 72fca77845..404dbf3b7c 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.h
@@ -96,7 +96,10 @@ typedef struct r300_fragment_program_swizzle {
 #define DRI_CONF_FP_OPTIMIZATION_SPEED   0
 #define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
 
-struct r300_fragment_program;
+struct r500_fragment_program;
+
+extern void r500TranslateFragmentShader(r300ContextPtr r300,
+					struct r500_fragment_program *fp);
 
 extern void r300TranslateFragmentShader(r300ContextPtr r300,
 					struct r300_fragment_program *fp);
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index dc1b8a9c8e..f9e459de91 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -147,7 +147,8 @@
 #define PCI_CHIP_RV410_5E4D		0x5E4D
 #define PCI_CHIP_RV410_5E4F		0x5E4F
 
 #define PCI_CHIP_RV530_71C4             0x71C4
+#define PCI_CHIP_RV530_71D5		0x71D5
 
 #define PCI_CHIP_RS350_7834		0x7834
 #define PCI_CHIP_RS350_7835		0x7835
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index b0b8730b39..7d6b4a7577 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -90,7 +90,7 @@ DRI_CONF_BEGIN
         DRI_CONF_COLOR_REDUCTION(DRI_CONF_COLOR_REDUCTION_DITHER)
         DRI_CONF_ROUND_MODE(DRI_CONF_ROUND_TRUNC)
         DRI_CONF_DITHER_MODE(DRI_CONF_DITHER_XERRORDIFF)
-        DRI_CONF_ALLOW_LARGE_TEXTURES(1)
+        DRI_CONF_ALLOW_LARGE_TEXTURES(2)
     DRI_CONF_SECTION_END
     DRI_CONF_SECTION_DEBUG
         DRI_CONF_NO_RAST(false)
@@ -117,7 +117,7 @@ DRI_CONF_BEGIN
         DRI_CONF_COLOR_REDUCTION(DRI_CONF_COLOR_REDUCTION_DITHER)
         DRI_CONF_ROUND_MODE(DRI_CONF_ROUND_TRUNC)
         DRI_CONF_DITHER_MODE(DRI_CONF_DITHER_XERRORDIFF)
-        DRI_CONF_ALLOW_LARGE_TEXTURES(1)
+        DRI_CONF_ALLOW_LARGE_TEXTURES(2)
         DRI_CONF_TEXTURE_BLEND_QUALITY(1.0,"0.0:1.0")
     DRI_CONF_SECTION_END
     DRI_CONF_SECTION_DEBUG
@@ -691,6 +691,11 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
       fprintf(stderr, "Warning, R520 detected, 3D HAHAHAHAHA!!.\n");
       break;
 
+   case PCI_CHIP_RV530_71D5:
+      screen->chip_family = CHIP_FAMILY_R520;
+      fprintf(stderr, "Warning, RV530 detected, all your base belong to us\n");
+      break;
+
    default:
       fprintf(stderr, "unknown chip id 0x%x, can't guess.\n",
 	      dri_priv->deviceID);
diff --git a/src/mesa/drivers/x11/glxheader.h b/src/mesa/drivers/x11/glxheader.h
index a402191f13..15e8390583 100644
--- a/src/mesa/drivers/x11/glxheader.h
+++ b/src/mesa/drivers/x11/glxheader.h
@@ -34,6 +34,7 @@
 
 #ifdef XFree86Server
 
+# include "xorg-server.h"
 # include "resource.h"
 # include "windowstr.h"
 
diff --git a/src/mesa/drivers/x11/sources b/src/mesa/drivers/x11/sources
deleted file mode 100644
index d76d65eaad..0000000000
--- a/src/mesa/drivers/x11/sources
+++ /dev/null
@@ -1,8 +0,0 @@
-# Note: only listing sources needed for X server renderer
-MESA_DRIVER_X11_SOURCES = \
-xm_api.c \
-xm_buffer.c \
-xm_dd.c \
-xm_line.c \
-xm_span.c \
-xm_tri.c
diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c
index e606bb1191..8941817531 100644
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -1303,6 +1303,67 @@ xmesa_convert_from_x_visual_type( int visualType )
 /**********************************************************************/
 
 
+#ifdef IN_DRI_DRIVER
+#define need_GL_VERSION_1_3
+#define need_GL_VERSION_1_4
+#define need_GL_VERSION_1_5
+#define need_GL_VERSION_2_0
+
+/* sw extensions for imaging */
+#define need_GL_EXT_blend_color
+#define need_GL_EXT_blend_minmax
+#define need_GL_EXT_convolution
+#define need_GL_EXT_histogram
+#define need_GL_SGI_color_table
+
+/* sw extensions not associated with some GL version */
+#define need_GL_ARB_shader_objects
+#define need_GL_ARB_vertex_program
+#define need_GL_APPLE_vertex_array_object
+#define need_GL_ATI_fragment_shader
+#define need_GL_EXT_depth_bounds_test
+#define need_GL_EXT_framebuffer_object
+#define need_GL_EXT_framebuffer_blit
+#define need_GL_EXT_gpu_program_parameters
+#define need_GL_EXT_paletted_texture
+#define need_GL_IBM_multimode_draw_arrays
+#define need_GL_MESA_resize_buffers
+#define need_GL_NV_vertex_program
+#define need_GL_NV_fragment_program
+
+#include "extension_helper.h"
+#include "utils.h"
+
+const struct dri_extension card_extensions[] =
+{
+   { "GL_VERSION_1_3",			GL_VERSION_1_3_functions },
+   { "GL_VERSION_1_4",			GL_VERSION_1_4_functions },
+   { "GL_VERSION_1_5",			GL_VERSION_1_5_functions },
+   { "GL_VERSION_2_0",			GL_VERSION_2_0_functions },
+
+   { "GL_EXT_blend_color",		GL_EXT_blend_color_functions },
+   { "GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions },
+   { "GL_EXT_convolution",		GL_EXT_convolution_functions },
+   { "GL_EXT_histogram",		GL_EXT_histogram_functions },
+   { "GL_SGI_color_table",		GL_SGI_color_table_functions },
+
+   { "GL_ARB_shader_objects",		GL_ARB_shader_objects_functions },
+   { "GL_ARB_vertex_program",		GL_ARB_vertex_program_functions },
+   { "GL_APPLE_vertex_array_object",	GL_APPLE_vertex_array_object_functions },
+   { "GL_ATI_fragment_shader",		GL_ATI_fragment_shader_functions },
+   { "GL_EXT_depth_bounds_test",	GL_EXT_depth_bounds_test_functions },
+   { "GL_EXT_framebuffer_object",	GL_EXT_framebuffer_object_functions },
+   { "GL_EXT_framebuffer_blit",		GL_EXT_framebuffer_blit_functions },
+   { "GL_EXT_gpu_program_parameters",	GL_EXT_gpu_program_parameters_functions },
+   { "GL_EXT_paletted_texture",		GL_EXT_paletted_texture_functions },
+   { "GL_IBM_multimode_draw_arrays",	GL_IBM_multimode_draw_arrays_functions },
+   { "GL_MESA_resize_buffers",		GL_MESA_resize_buffers_functions },
+   { "GL_NV_vertex_program",		GL_NV_vertex_program_functions },
+   { "GL_NV_fragment_program",		GL_NV_fragment_program_functions },
+   { NULL,				NULL }
+};
+#endif
+
 /*
  * Create a new X/Mesa visual.
  * Input:  display - X11 display
@@ -1348,6 +1409,14 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
    XMesaVisual v;
    GLint red_bits, green_bits, blue_bits, alpha_bits;
 
+#ifdef IN_DRI_DRIVER
+   /* driInitExtensions() should be called once per screen to set up extension
+    * indices.  There is no need to call it when the context is created since
+    * XMesa enables mesa sw extensions on its own.
+    */
+   driInitExtensions( NULL, card_extensions, GL_FALSE );
+#endif
+
 #ifndef XFree86Server
    /* For debugging only */
    if (_mesa_getenv("MESA_XSYNC")) {
@@ -1525,7 +1594,7 @@ XMesaContext XMesaCreateContext( XMesaVisual v, XMesaContext share_list )
    _mesa_enable_2_0_extensions(mesaCtx);
    _mesa_enable_2_1_extensions(mesaCtx);
 #if ENABLE_EXT_texure_compression_s3tc
-    if (c->Mesa_DXTn) {
+    if (mesaCtx->Mesa_DXTn) {
        _mesa_enable_extension(mesaCtx, "GL_EXT_texture_compression_s3tc");
        _mesa_enable_extension(mesaCtx, "GL_S3_s3tc");
     }
diff --git a/src/mesa/drivers/xorg/.gitignore b/src/mesa/drivers/xorg/.gitignore
new file mode 100644
index 0000000000..18a777939c
--- /dev/null
+++ b/src/mesa/drivers/xorg/.gitignore
@@ -0,0 +1,3 @@
+glxheader.h
+xmesaP.h
+xm_*
diff --git a/src/mesa/drivers/xorg/Makefile b/src/mesa/drivers/xorg/Makefile
new file mode 100644
index 0000000000..a1b417447b
--- /dev/null
+++ b/src/mesa/drivers/xorg/Makefile
@@ -0,0 +1,95 @@
+# src/mesa/drivers/xorg/Makefile
+
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = libGLcore.so
+
+SYMLINKS =		\
+	glxheader.h	\
+	xmesaP.h	\
+	xm_api.c	\
+	xm_buffer.c	\
+	xm_dd.c		\
+	xm_image.c	\
+	xm_image.h	\
+	xm_line.c	\
+	xm_span.c	\
+	xm_tri.c
+
+C_SOURCES =		\
+	xm_api.c	\
+	xm_buffer.c	\
+	xm_dd.c		\
+	xm_image.c	\
+	xm_line.c	\
+	xm_span.c	\
+	xm_tri.c	\
+	glcore.c
+
+########################################
+
+MESA_MODULES = $(TOP)/src/mesa/libmesa.a
+
+C_SOURCES += ../common/driverfuncs.c
+ifeq ("${DRIVER_DIRS}", "dri")
+C_SOURCES += ../dri/common/utils.c
+endif
+
+OBJECTS = $(C_SOURCES:.c=.o)
+
+### Include directories
+INCLUDES = \
+	-I. \
+	-I.. \
+	-I$(TOP)/include \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/mesa/main \
+	-I$(TOP)/src/mesa/glapi \
+	`pkg-config --cflags xorg-server`
+
+ifeq ("${DRIVER_DIRS}", "dri")
+INCLUDES += \
+	-I$(TOP)/src/mesa/drivers/dri/common \
+	`pkg-config --cflags libdrm`
+endif
+
+# undef 'USE_XSHM' to make it explicit that 'XFree86Server' takes precedence
+DRIVER_DEFINES = -UUSE_XSHM -DXFree86Server
+
+##### RULES #####
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@
+
+
+##### TARGETS #####
+
+default: depend symlinks $(LIBNAME)
+
+
+$(LIBNAME): $(OBJECTS) $(MESA_MODULES) Makefile
+	$(TOP)/bin/mklib -noprefix -o $@ \
+		$(OBJECTS) $(MESA_MODULES) $(GLCORE_LIB_DEPS)
+
+
+depend: $(C_SOURCES) $(SYMLINKS)
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) \
+		> /dev/null
+
+
+clean:
+	-rm -f *.o *.so $(SYMLINKS)
+	-rm -f depend depend.bak
+
+install: $(LIBNAME)
+	$(INSTALL) -d $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
+	$(INSTALL) -m 755 $(LIBNAME) $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
+
+$(SYMLINKS):
+	@[ -e $@ ] || ln -sf ../x11/$@ ./
+
+symlinks: $(SYMLINKS)
+
+include depend
diff --git a/src/mesa/drivers/xorg/glcore.c b/src/mesa/drivers/xorg/glcore.c
new file mode 100644
index 0000000000..a0199117c6
--- /dev/null
+++ b/src/mesa/drivers/xorg/glcore.c
@@ -0,0 +1,24 @@
+
+#define _NEED_GL_CORE_IF
+#include <GL/xmesa.h>
+#include <GL/internal/glcore.h>
+#include "xmesaP.h"
+
+PUBLIC
+__GLcoreModule GL_Core = {
+    XMesaCreateVisual,
+    XMesaDestroyVisual,
+
+    XMesaCreateWindowBuffer,
+    XMesaCreatePixmapBuffer,
+    XMesaDestroyBuffer,
+    XMesaSwapBuffers,
+    XMesaResizeBuffers,
+
+    XMesaCreateContext,
+    XMesaDestroyContext,
+    XMesaCopyContext,
+    XMesaMakeCurrent2,
+    XMesaForceCurrent,
+    XMesaLoseCurrent
+};
diff --git a/src/mesa/glapi/glapi.c b/src/mesa/glapi/glapi.c
index 47c5782273..36b09e68e5 100644
--- a/src/mesa/glapi/glapi.c
+++ b/src/mesa/glapi/glapi.c
@@ -50,6 +50,10 @@
 
 
 
+#ifdef HAVE_DIX_CONFIG_H
+#include <dix-config.h>
+#endif
+
 #include "glheader.h"
 #include "glapi.h"
 #include "glapioffsets.h"
diff --git a/src/mesa/glapi/glthread.c b/src/mesa/glapi/glthread.c
index 4513853f5a..92f2e5bf56 100644
--- a/src/mesa/glapi/glthread.c
+++ b/src/mesa/glapi/glthread.c
@@ -29,6 +29,10 @@
  */
 
 
+#ifdef HAVE_DIX_CONFIG_H
+#include <dix-config.h>
+#endif
+
 #include "glheader.h"
 #include "glthread.h"
 
diff --git a/src/mesa/glapi/sources b/src/mesa/glapi/sources
deleted file mode 100644
index 1d5c252821..0000000000
--- a/src/mesa/glapi/sources
+++ /dev/null
@@ -1,12 +0,0 @@
-MESA_GLAPI_SOURCES = \
-glapi.c \
-glthread.c
-
-MESA_GLAPI_HEADERS = \
-dispatch.h \
-glapi.h \
-glapioffsets.h \
-glapitable.h \
-glapitemp.h \
-glprocs.h \
-glthread.h
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index d94876e70b..733aaad030 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -121,7 +121,6 @@
 #include "version.h"
 #include "vtxfmt.h"
 #include "glapi/glthread.h"
-#include "glapi/glapioffsets.h"
 #if FEATURE_NV_vertex_program || FEATURE_NV_fragment_program
 #include "shader/program.h"
 #endif
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index 8d10d8a750..23ede7bb68 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -611,9 +611,9 @@ destroy_list(GLcontext *ctx, GLuint list)
 
 
 /*
- * Translate the nth element of list from type to GLuint.
+ * Translate the nth element of list from <type> to GLint.
  */
-static GLuint
+static GLint
 translate_id(GLsizei n, GLenum type, const GLvoid * list)
 {
    GLbyte *bptr;
@@ -627,37 +627,40 @@ translate_id(GLsizei n, GLenum type, const GLvoid * list)
    switch (type) {
    case GL_BYTE:
       bptr = (GLbyte *) list;
-      return (GLuint) *(bptr + n);
+      return (GLint) bptr[n];
    case GL_UNSIGNED_BYTE:
       ubptr = (GLubyte *) list;
-      return (GLuint) *(ubptr + n);
+      return (GLint) ubptr[n];
    case GL_SHORT:
       sptr = (GLshort *) list;
-      return (GLuint) *(sptr + n);
+      return (GLint) sptr[n];
    case GL_UNSIGNED_SHORT:
       usptr = (GLushort *) list;
-      return (GLuint) *(usptr + n);
+      return (GLint) usptr[n];
    case GL_INT:
       iptr = (GLint *) list;
-      return (GLuint) *(iptr + n);
+      return iptr[n];
    case GL_UNSIGNED_INT:
       uiptr = (GLuint *) list;
-      return (GLuint) *(uiptr + n);
+      return (GLint) uiptr[n];
    case GL_FLOAT:
       fptr = (GLfloat *) list;
-      return (GLuint) *(fptr + n);
+      return (GLint) FLOORF(fptr[n]);
    case GL_2_BYTES:
       ubptr = ((GLubyte *) list) + 2 * n;
-      return (GLuint) *ubptr * 256 + (GLuint) * (ubptr + 1);
+      return (GLint) ubptr[0] * 256
+           + (GLint) ubptr[1];
    case GL_3_BYTES:
       ubptr = ((GLubyte *) list) + 3 * n;
-      return (GLuint) * ubptr * 65536
-           + (GLuint) *(ubptr + 1) * 256 + (GLuint) * (ubptr + 2);
+      return (GLint) ubptr[0] * 65536
+           + (GLint) ubptr[1] * 256
+           + (GLint) ubptr[2];
    case GL_4_BYTES:
       ubptr = ((GLubyte *) list) + 4 * n;
-      return (GLuint) *ubptr * 16777216
-           + (GLuint) *(ubptr + 1) * 65536
-           + (GLuint) *(ubptr + 2) * 256 + (GLuint) * (ubptr + 3);
+      /* Sum as unsigned; a signed ubptr[0] * 16777216 overflows for >= 128. */
+      return (GLint) ((GLuint) ubptr[0] * 16777216u
+           + (GLuint) ubptr[1] * 65536u
+           + (GLuint) ubptr[2] * 256u + (GLuint) ubptr[3]);
    default:
       return 0;
    }
@@ -992,10 +995,10 @@ _mesa_save_CallLists(GLsizei n, GLenum type, const GLvoid * lists)
    }
 
    for (i = 0; i < n; i++) {
-      GLuint list = translate_id(i, type, lists);
+      GLint list = translate_id(i, type, lists);
       Node *n = ALLOC_INSTRUCTION(ctx, OPCODE_CALL_LIST_OFFSET, 2);
       if (n) {
-         n[1].ui = list;
+         n[1].i = list;
          n[2].b = typeErrorFlag;
       }
    }
@@ -5774,7 +5777,8 @@ execute_list(GLcontext *ctx, GLuint list)
                _mesa_error(ctx, GL_INVALID_ENUM, "glCallLists(type)");
             }
             else if (ctx->ListState.CallDepth < MAX_LIST_NESTING) {
-               execute_list(ctx, ctx->List.ListBase + n[1].ui);
+               GLuint list = (GLuint) (ctx->List.ListBase + n[1].i);
+               execute_list(ctx, list);
             }
             break;
          case OPCODE_CLEAR:
@@ -6822,7 +6826,6 @@ void GLAPIENTRY
 _mesa_CallLists(GLsizei n, GLenum type, const GLvoid * lists)
 {
    GET_CURRENT_CONTEXT(ctx);
-   GLuint list;
    GLint i;
    GLboolean save_compile_flag;
 
@@ -6854,8 +6857,8 @@ _mesa_CallLists(GLsizei n, GLenum type, const GLvoid * lists)
    ctx->CompileFlag = GL_FALSE;
 
    for (i = 0; i < n; i++) {
-      list = translate_id(i, type, lists);
-      execute_list(ctx, ctx->List.ListBase + list);
+      GLuint list = (GLuint) (ctx->List.ListBase + translate_id(i, type, lists));
+      execute_list(ctx, list);
    }
 
    ctx->CompileFlag = save_compile_flag;
diff --git a/src/mesa/main/drawpix.c b/src/mesa/main/drawpix.c
index 4f28766674..fde9338430 100644
--- a/src/mesa/main/drawpix.c
+++ b/src/mesa/main/drawpix.c
@@ -374,8 +374,9 @@ _mesa_Bitmap( GLsizei width, GLsizei height,
 
    if (ctx->RenderMode == GL_RENDER) {
       /* Truncate, to satisfy conformance tests (matches SGI's OpenGL). */
-      GLint x = IFLOOR(ctx->Current.RasterPos[0] - xorig);
-      GLint y = IFLOOR(ctx->Current.RasterPos[1] - yorig);
+      const GLfloat epsilon = 0.0001;
+      GLint x = IFLOOR(ctx->Current.RasterPos[0] + epsilon - xorig);
+      GLint y = IFLOOR(ctx->Current.RasterPos[1] + epsilon - yorig);
 
       if (ctx->Unpack.BufferObj->Name) {
          /* unpack from PBO */
diff --git a/src/mesa/main/glheader.h b/src/mesa/main/glheader.h
index bab962ad5e..c6f81fdd7e 100644
--- a/src/mesa/main/glheader.h
+++ b/src/mesa/main/glheader.h
@@ -46,11 +46,6 @@
 #ifndef GLHEADER_H
 #define GLHEADER_H
 
-/* This allows Mesa to be integrated into XFree86 */
-#ifdef HAVE_DIX_CONFIG_H
-#include "dix-config.h"
-#endif
-
 #include <assert.h>
 #include <ctype.h>
 #if defined(__alpha__) && defined(CCPML)
diff --git a/src/mesa/main/sources b/src/mesa/main/sources
deleted file mode 100644
index dfcff89e4b..0000000000
--- a/src/mesa/main/sources
+++ /dev/null
@@ -1,138 +0,0 @@
-# List of source files in this directory used for X.org xserver build
-MESA_MAIN_SOURCES = \
-accum.c \
-api_arrayelt.c \
-api_loopback.c \
-api_noop.c \
-api_validate.c \
-arrayobj.c \
-attrib.c \
-blend.c \
-bufferobj.c \
-buffers.c \
-clip.c \
-colortab.c \
-context.c \
-convolve.c \
-debug.c \
-depth.c \
-depthstencil.c \
-dlist.c \
-drawpix.c \
-enable.c \
-enums.c \
-eval.c \
-execmem.c \
-extensions.c \
-fbobject.c \
-feedback.c \
-fog.c \
-framebuffer.c \
-get.c \
-getstring.c \
-hash.c \
-hint.c \
-histogram.c \
-image.c \
-imports.c \
-light.c \
-lines.c \
-matrix.c \
-mipmap.c \
-mm.c \
-occlude.c \
-pixel.c \
-points.c \
-polygon.c \
-rastpos.c \
-rbadaptors.c \
-renderbuffer.c \
-state.c \
-stencil.c \
-texcompress.c \
-texcompress_fxt1.c \
-texcompress_s3tc.c \
-texenvprogram.c \
-texformat.c \
-teximage.c \
-texobj.c \
-texrender.c \
-texstate.c \
-texstore.c \
-varray.c \
-$(VSNPRINTF_SOURCES) \
-vtxfmt.c
-
-MESA_VSNPRINTF_SOURCES = \
-vsnprintf.c
-
-MESA_MAIN_HEADERS = \
-accum.h \
-api_arrayelt.h \
-api_eval.h \
-api_loopback.h \
-api_noop.h \
-api_validate.h \
-arrayobj.h \
-attrib.h \
-bitset.h \
-blend.h \
-bufferobj.h \
-buffers.h \
-clip.h \
-colormac.h \
-colortab.h \
-config.h \
-context.h \
-convolve.h \
-dd.h \
-debug.h \
-depth.h \
-depthstencil.h \
-dlist.h \
-drawpix.h \
-enable.h \
-enums.h \
-eval.h \
-extensions.h \
-fbobject.h \
-feedback.h \
-fog.h \
-framebuffer.h \
-get.h \
-glheader.h \
-hash.h \
-hint.h \
-histogram.h \
-image.h \
-imports.h \
-light.h \
-lines.h \
-macros.h \
-matrix.h \
-mipmap.h \
-mm.h \
-mtypes.h \
-occlude.h \
-pixel.h \
-points.h \
-polygon.h \
-rastpos.h \
-rbadaptors.h \
-renderbuffer.h \
-simple_list.h \
-state.h \
-stencil.h \
-texcompress.h \
-texenvprogram.h \
-texformat.h \
-texformat_tmp.h \
-teximage.h \
-texobj.h \
-texrender.h \
-texstate.h \
-texstore.h \
-varray.h \
-version.h \
-vtxfmt.h \
-vtxfmt_tmp.h
diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c
index 288b334eaf..626c264863 100644
--- a/src/mesa/main/texstate.c
+++ b/src/mesa/main/texstate.c
@@ -213,6 +213,9 @@ calculate_derived_texenv( struct gl_tex_env_combine_state *state,
       return;
    }
 
+   if (mode == GL_REPLACE_EXT)
+      mode = GL_REPLACE;
+
    switch (mode) {
    case GL_REPLACE:
    case GL_MODULATE:
@@ -315,7 +318,9 @@ _mesa_TexEnvfv( GLenum target, GLenum pname, const GLfloat *param )
       switch (pname) {
       case GL_TEXTURE_ENV_MODE:
          {
-            const GLenum mode = (GLenum) (GLint) *param;
+            GLenum mode = (GLenum) (GLint) *param;
+            if (mode == GL_REPLACE_EXT)
+               mode = GL_REPLACE;
 	    if (texUnit->EnvMode == mode)
 	       return;
             if (mode == GL_MODULATE ||
diff --git a/src/mesa/math/sources b/src/mesa/math/sources
deleted file mode 100644
index 7c7dcccedf..0000000000
--- a/src/mesa/math/sources
+++ /dev/null
@@ -1,25 +0,0 @@
-MESA_MATH_SOURCES = \
-m_debug_clip.c \
-m_debug_norm.c \
-m_debug_xform.c \
-m_eval.c \
-m_matrix.c \
-m_translate.c \
-m_vector.c \
-m_xform.c
-
-MESA_MATH_HEADERS = \
-m_clip_tmp.h \
-m_copy_tmp.h \
-m_debug.h \
-m_debug_util.h \
-m_dotprod_tmp.h \
-m_eval.h \
-m_matrix.h \
-m_norm_tmp.h \
-m_trans_tmp.h \
-m_translate.h \
-m_vector.h \
-m_xform.h \
-m_xform_tmp.h \
-mathmod.h
diff --git a/src/mesa/shader/grammar/sources b/src/mesa/shader/grammar/sources
deleted file mode 100644
index a6bbfd3ffd..0000000000
--- a/src/mesa/shader/grammar/sources
+++ /dev/null
@@ -1,8 +0,0 @@
-MESA_SHADER_GRAMMAR_SOURCES = \
-grammar_mesa.c
-
-MESA_SHADER_GRAMMAR_HEADERS = \
-grammar.c \
-grammar.h \
-grammar_mesa.h \
-grammar_syn.h
diff --git a/src/mesa/shader/prog_execute.c b/src/mesa/shader/prog_execute.c
index 7f9687c36f..cb17aa501c 100644
--- a/src/mesa/shader/prog_execute.c
+++ b/src/mesa/shader/prog_execute.c
@@ -1520,8 +1520,9 @@ _mesa_execute_program(GLcontext * ctx,
       case OPCODE_END:
          return GL_TRUE;
       default:
-         _mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
+         _mesa_problem(ctx, "Bad opcode %d in _mesa_execute_program",
                        inst->Opcode);
+		       assert(0);
          return GL_TRUE;        /* return value doesn't matter */
 
       }
diff --git a/src/mesa/shader/prog_parameter.c b/src/mesa/shader/prog_parameter.c
index 46d30872e4..3ad7215755 100644
--- a/src/mesa/shader/prog_parameter.c
+++ b/src/mesa/shader/prog_parameter.c
@@ -40,8 +40,7 @@
 struct gl_program_parameter_list *
 _mesa_new_parameter_list(void)
 {
-   return (struct gl_program_parameter_list *)
-      _mesa_calloc(sizeof(struct gl_program_parameter_list));
+   return CALLOC_STRUCT(gl_program_parameter_list);
 }
 
 
diff --git a/src/mesa/shader/program.c b/src/mesa/shader/program.c
index d2c9183558..c539b52720 100644
--- a/src/mesa/shader/program.c
+++ b/src/mesa/shader/program.c
@@ -230,7 +230,6 @@ _mesa_init_program_struct( GLcontext *ctx, struct gl_program *prog,
 {
    (void) ctx;
    if (prog) {
-      _mesa_bzero(prog, sizeof(*prog));
       prog->Id = id;
       prog->Target = target;
       prog->Resident = GL_TRUE;
diff --git a/src/mesa/shader/shader_api.c b/src/mesa/shader/shader_api.c
index 01a237c525..b0f79c29c1 100644
--- a/src/mesa/shader/shader_api.c
+++ b/src/mesa/shader/shader_api.c
@@ -133,6 +133,11 @@ _mesa_free_shader_program_data(GLcontext *ctx,
       _mesa_free(shProg->Shaders);
       shProg->Shaders = NULL;
    }
+
+   if (shProg->InfoLog) {
+      _mesa_free(shProg->InfoLog);
+      shProg->InfoLog = NULL;
+   }
 }
 
 
@@ -143,10 +148,7 @@ void
 _mesa_free_shader_program(GLcontext *ctx, struct gl_shader_program *shProg)
 {
    _mesa_free_shader_program_data(ctx, shProg);
-   if (shProg->Shaders) {
-      _mesa_free(shProg->Shaders);
-      shProg->Shaders = NULL;
-   }
+
    _mesa_free(shProg);
 }
 
diff --git a/src/mesa/shader/slang/sources b/src/mesa/shader/slang/sources
deleted file mode 100644
index 00d617fa8a..0000000000
--- a/src/mesa/shader/slang/sources
+++ /dev/null
@@ -1,44 +0,0 @@
-MESA_SHADER_SLANG_SOURCES = \
-slang_analyse.c \
-slang_assemble_assignment.c \
-slang_assemble.c \
-slang_assemble_conditional.c \
-slang_assemble_constructor.c \
-slang_assemble_typeinfo.c \
-slang_compile.c \
-slang_compile_function.c \
-slang_compile_operation.c \
-slang_compile_struct.c \
-slang_compile_variable.c \
-slang_execute.c \
-slang_execute_x86.c \
-slang_export.c \
-slang_library_texsample.c \
-slang_library_noise.c \
-slang_link.c \
-slang_preprocess.c \
-slang_storage.c \
-slang_utility.c
-
-MESA_SHADER_SLANG_HEADERS = \
-slang_analyse.h \
-slang_assemble.h \
-slang_assemble_assignment.h \
-slang_assemble_conditional.h \
-slang_assemble_constructor.h \
-slang_assemble_typeinfo.h \
-slang_compile.h \
-slang_compile_function.h \
-slang_compile_operation.h \
-slang_compile_struct.h \
-slang_compile_variable.h \
-slang_execute.h \
-slang_export.h \
-slang_library_noise.h \
-slang_library_texsample.h \
-slang_link.h \
-slang_mesa.h \
-slang_preprocess.h \
-slang_storage.h \
-slang_utility.h \
-traverse_wrap.h
diff --git a/src/mesa/shader/sources b/src/mesa/shader/sources
deleted file mode 100644
index 2787187276..0000000000
--- a/src/mesa/shader/sources
+++ /dev/null
@@ -1,28 +0,0 @@
-# List of source files in this directory used for X.org xserver build
-MESA_SHADER_SOURCES = \
-arbprogparse.c \
-arbprogram.c \
-atifragshader.c \
-nvfragparse.c \
-nvprogram.c \
-nvvertexec.c \
-nvvertparse.c \
-program.c \
-programopt.c \
-shaderobjects.c \
-shaderobjects_3dlabs.c
-
-MESA_SHADER_HEADERS = \
-arbprogparse.h \
-arbprogram.h \
-arbprogram_syn.h \
-atifragshader.h \
-nvfragparse.h \
-nvprogram.h \
-nvvertexec.h \
-nvvertparse.h \
-programopt.h \
-program.h \
-program_instruction.h \
-shaderobjects.h \
-shaderobjects_3dlabs.h
diff --git a/src/mesa/swrast/sources b/src/mesa/swrast/sources
deleted file mode 100644
index 9ffd4cca72..0000000000
--- a/src/mesa/swrast/sources
+++ /dev/null
@@ -1,65 +0,0 @@
-# List of source files in this directory used for X.org xserver build
-MESA_SWRAST_SOURCES = \
-s_aaline.c \
-s_aatriangle.c \
-s_accum.c \
-s_alpha.c \
-s_arbshader.c \
-s_atifragshader.c \
-s_bitmap.c \
-s_blend.c \
-s_blit.c \
-s_buffers.c \
-s_context.c \
-s_copypix.c \
-s_depth.c \
-s_drawpix.c \
-s_feedback.c \
-s_fog.c \
-s_imaging.c \
-s_lines.c \
-s_logic.c \
-s_masking.c \
-s_nvfragprog.c \
-s_points.c \
-s_readpix.c \
-s_span.c \
-s_stencil.c \
-s_texcombine.c \
-s_texfilter.c \
-s_texstore.c \
-s_triangle.c \
-s_zoom.c
-
-MESA_SWRAST_HEADERS = \
-s_aaline.h \
-s_aalinetemp.h \
-s_aatriangle.h \
-s_aatritemp.h \
-s_accum.h \
-s_alpha.h \
-s_arbshader.h \
-s_atifragshader.h \
-s_blend.h \
-s_context.h \
-s_depth.h \
-s_drawpix.h \
-s_feedback.h \
-s_fog.h \
-s_lines.h \
-s_linetemp.h \
-s_logic.h \
-s_masking.h \
-s_nvfragprog.h \
-s_points.h \
-s_pointtemp.h \
-s_span.h \
-s_spantemp.h \
-s_stencil.h \
-s_texcombine.h \
-s_texfilter.h \
-s_triangle.h \
-s_trispan.h \
-s_tritemp.h \
-s_zoom.h \
-swrast.h
diff --git a/src/mesa/swrast_setup/sources b/src/mesa/swrast_setup/sources
deleted file mode 100644
index dee14b6774..0000000000
--- a/src/mesa/swrast_setup/sources
+++ /dev/null
@@ -1,10 +0,0 @@
-MESA_SWRAST_SETUP_SOURCES = \
-ss_context.c \
-ss_triangle.c
-
-MESA_SWRAST_SETUP_HEADERS = \
-ss_context.h \
-ss_triangle.h \
-ss_tritmp.h \
-ss_vb.h \
-swrast_setup.h
diff --git a/src/mesa/tnl/sources b/src/mesa/tnl/sources
deleted file mode 100644
index a0888be11d..0000000000
--- a/src/mesa/tnl/sources
+++ /dev/null
@@ -1,34 +0,0 @@
-# List of source files in this directory used for X.org xserver build
-MESA_TNL_SOURCES = \
-t_context.c \
-t_pipeline.c \
-t_vb_arbprogram.c \
-t_vb_arbprogram_sse.c \
-t_vb_arbshader.c \
-t_vb_cull.c \
-t_vb_fog.c \
-t_vb_light.c \
-t_vb_normals.c \
-t_vb_points.c \
-t_vb_program.c \
-t_vb_render.c \
-t_vb_texgen.c \
-t_vb_texmat.c \
-t_vb_vertex.c \
-t_vertex.c \
-t_vertex_generic.c \
-t_vertex_sse.c \
-t_vp_build.c 
-
-MESA_TNL_HEADERS = \
-t_array_api.h \
-t_array_import.h \
-t_context.h \
-t_pipeline.h \
-t_vb_arbprogram.h \
-t_vb_cliptmp.h \
-t_vb_lighttmp.h \
-t_vb_rendertmp.h \
-t_vertex.h \
-t_vp_build.h \
-tnl.h
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index a52521db64..1a782da1ea 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -30,7 +30,6 @@
 #include "main/state.h"
 #include "main/api_validate.h"
 #include "main/api_noop.h"
-#include "glapi/dispatch.h"
 
 #include "vbo_context.h"