17 files changed, 81 insertions, 62 deletions
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index ad184bd696..6179b5b65a 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -98,9 +98,9 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
    /* loop over verts */
    for (i = 0; i < count; i += MAX_VERTICES) {
       const uint max_vertices = MIN2(MAX_VERTICES, count - i);
-      float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB;
-      float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB;
-      float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB;
+      PIPE_ALIGN_VAR(16, float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4]);
+      PIPE_ALIGN_VAR(16, float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4]);
+      PIPE_ALIGN_VAR(16, float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4]);
       uint attr;
 
       /* convert (up to) four input verts to SoA format */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 138d2d095b..cec5b11fd3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -51,7 +51,7 @@
  * Since it's pretty much impossible to form PPC vector immediates, load
  * them from memory here:
  */
-const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+PIPE_ALIGN_VAR(16, const float ppc_builtin_constants[]) = {
    1.0f, -128.0f, 128.0, 0.0
 };
 
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d5f5c7bbba..aa29dcb394 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -358,6 +358,7 @@ struct cell_spu_function_info
 
 
 /** This is the object passed to spe_create_thread() */
+PIPE_ALIGN_TYPE(16,
 struct cell_init_info
 {
    unsigned id;
@@ -370,7 +371,7 @@ struct cell_init_info
    uint *buffer_status;  /**< points at cell_context->buffer_status */
 
    struct cell_spu_function_info *spu_functions;
-} ALIGN16_ATTRIB;
+});
 
 
 #endif /* CELL_COMMON_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 5c3188e7f9..fa6e4f65cd 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
  */
 struct cell_buffer_list
 {
-   struct cell_fence fence ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16, struct cell_fence fence);
    struct cell_buffer_node *head;
 };
 
@@ -150,18 +150,18 @@ struct cell_context
    /** Mapped constant buffers */
    void *mapped_constants[PIPE_SHADER_TYPES];
 
-   struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16, struct cell_spu_function_info spu_functions);
 
    uint num_cells, num_spus;
 
    /** Buffers for command batches, vertex/index data */
    uint buffer_size[CELL_NUM_BUFFERS];
-   ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16, ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE]);
 
    int cur_batch;  /**< which buffer is being filled w/ commands */
 
    /** [4] to ensure 16-byte alignment for each status word */
-   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16, uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4]);
 
 
    /** Associated with each command/batch buffer is a list of pipe_buffers
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 12b855a3db..2a62db4b79 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -53,8 +53,7 @@ struct spu_vs_context draw;
 /**
  * Buffers containing dynamically generated SPU code:
  */
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
-    ALIGN16_ATTRIB;
+PIPE_ALIGN_VAR(16, static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]);
 
 
 
@@ -543,7 +542,7 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16, qword buffer[CELL_BUFFER_SIZE / 16]);
    const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]);
    uint pos;
 
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index d86d8e09a5..6db8ed419b 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -1839,10 +1839,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    /* execute declarations (interpolants) */
    if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
       for (i = 0; i < mach->NumDeclarations; i++) {
+         PIPE_ALIGN_VAR(16,
          union {
             struct tgsi_full_declaration decl;
             qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
-         } d ALIGN16_ATTRIB;
+         } d);
          unsigned ea = (unsigned) (mach->Declarations + pc);
 
          spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
@@ -1853,10 +1854,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 
    /* execute instructions, until pc is set to -1 */
    while (pc != -1) {
+      PIPE_ALIGN_VAR(16,
       union {
          struct tgsi_full_instruction inst;
          qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
-      } i ALIGN16_ATTRIB;
+      } i);
       unsigned ea = (unsigned) (mach->Instructions + pc);
 
       spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst));
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
index 8605679940..c8c6183e2e 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.h
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -98,9 +98,9 @@ struct spu_exec_machine
     * 4  internal temporaries
     * 1  address
     */
+   PIPE_ALIGN_VAR(16,
    struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
-                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1]
-       ALIGN16_ATTRIB;
+                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1]);
 
    struct spu_exec_vector       *Addrs;
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index ff3d609d25..a4e560b0a5 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -144,7 +144,7 @@ export_func(struct cell_spu_function_info *spu_functions,
 void
 return_function_info(void)
 {
-   struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16, struct cell_spu_function_info funcs);
    int tag = TAG_MISC;
 
    ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 33767e7c51..8500f1bb87 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -93,6 +93,7 @@ typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
                                                          vector float *constants);
 
 
+PIPE_ALIGN_TYPE(16,
 struct spu_framebuffer
 {
    void *color_start;              /**< addr of color surface in main memory */
@@ -107,10 +108,11 @@ struct spu_framebuffer
 
    uint zsize;                     /**< 0, 2 or 4 bytes per Z */
    float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
-} ALIGN16_ATTRIB;
+});
 
 
 /** per-texture level info */
+PIPE_ALIGN_TYPE(16,
 struct spu_texture_level
 {
    void *start;
@@ -123,20 +125,22 @@ struct spu_texture_level
    vector signed int mask_s, mask_t, mask_r;
    /** texcoord clamp limits */
    vector signed int max_s, max_t, max_r;
-} ALIGN16_ATTRIB;
+});
 
 
+PIPE_ALIGN_TYPE(16,
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
    uint max_level;
    uint target;  /**< PIPE_TEXTURE_x */
-} ALIGN16_ATTRIB;
+});
 
 
 /**
  * All SPU global/context state will be in a singleton object of this type:
  */
+PIPE_ALIGN_TYPE(16,
 struct spu_global
 {
    /** One-time init/constant info */
@@ -155,8 +159,8 @@ struct spu_global
    struct vertex_info vertex_info;
 
    /** Current color and Z tiles */
-   tile_t ctile ALIGN16_ATTRIB;
-   tile_t ztile ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16, tile_t ctile);
+   PIPE_ALIGN_VAR(16, tile_t ztile);
 
    /** Read depth/stencil tiles? */
    boolean read_depth_stencil;
@@ -165,8 +169,8 @@ struct spu_global
    ubyte cur_ctile_status, cur_ztile_status;
 
    /** Status of all tiles in framebuffer */
-   ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-   ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16, ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE]);
+   PIPE_ALIGN_VAR(16, ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE]);
 
    /** Current fragment ops machine code, at 8-byte boundary */
    uint *fragment_ops_code;
@@ -175,7 +179,7 @@ struct spu_global
    spu_fragment_ops_func fragment_ops[2];
 
    /** Current fragment program machine code, at 8-byte boundary */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
+   PIPE_ALIGN_VAR(8, uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]);
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
@@ -187,7 +191,7 @@ struct spu_global
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
 
-} ALIGN16_ATTRIB;
+});
 
 
 extern struct spu_global spu;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5ffb7073ab..b13fe3184f 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -169,7 +169,7 @@ void
 cmd_render(const struct cell_command_render *render, uint *pos_incr)
 {
    /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16, ubyte vertex_data[CELL_BUFFER_SIZE]);
    const uint vertex_size = render->vertex_size; /* in bytes */
    /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
    uint index_bytes;
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 03375d84a5..43600dfe0b 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -43,7 +43,7 @@ typedef void (*spu_fetch_func)(qword *out, const qword *in,
 			       const qword *shuffle_data);
 
 
-static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = {
+PIPE_ALIGN_VAR(16, static const qword fetch_shuffle_data[5]) = {
    /* Shuffle used by CVT_64_FLOAT
     */
    {
@@ -110,7 +110,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       unsigned idx;
       const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
       const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
-      qword in[2 * 4] ALIGN16_ATTRIB;
+      PIPE_ALIGN_VAR(16, qword in[2 * 4]);
 
 
       /* Fetch four attributes for four vertices.  
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index fbe5b34d39..49938a8001 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -107,8 +107,8 @@ run_vertex_program(struct spu_vs_context *draw,
    struct spu_exec_machine *machine = &draw->machine;
    unsigned int j;
 
-   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+   PIPE_ALIGN_VAR(16, struct spu_exec_vector inputs[PIPE_MAX_ATTRIBS]);
+   PIPE_ALIGN_VAR(16, struct spu_exec_vector outputs[PIPE_MAX_ATTRIBS]);
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
@@ -119,8 +119,8 @@ run_vertex_program(struct spu_vs_context *draw,
    ASSERT_ALIGN16(draw->constants);
    machine->Consts = (float (*)[4]) draw->constants;
 
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
-   machine->Outputs = ALIGN16_ASSIGN(outputs);
+   machine->Inputs = inputs;
+   machine->Outputs = outputs;
 
    spu_vertex_fetch( draw, machine, elts, count );
 
@@ -132,8 +132,9 @@ run_vertex_program(struct spu_vs_context *draw,
    for (j = 0; j < count; j++) {
       unsigned slot;
       float x, y, z, w;
+      PIPE_ALIGN_VAR(16,
       unsigned char buffer[sizeof(struct vertex_header)
-          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+          + MAX_VERTEX_SIZE]);
       struct vertex_header *const tmpOut =
           (struct vertex_header *) buffer;
       const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
@@ -186,8 +187,8 @@ run_vertex_program(struct spu_vs_context *draw,
 }
 
 
-unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
-    ALIGN16_ATTRIB;
+PIPE_ALIGN_VAR(16,
+unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]);
 
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_quad.h b/src/gallium/drivers/llvmpipe/lp_quad.h
index 7eb05de77a..eb285e355e 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad.h
+++ b/src/gallium/drivers/llvmpipe/lp_quad.h
@@ -31,6 +31,7 @@
 #ifndef LP_QUAD_H
 #define LP_QUAD_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_exec.h"
 
@@ -83,7 +84,7 @@ struct quad_header_inout
 struct quad_header_output
 {
    /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
-   float ALIGN16_ATTRIB color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
+   PIPE_ALIGN_VAR(16, float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE]);
 };
 
 
@@ -92,9 +93,9 @@ struct quad_header_output
  */
 struct quad_interp_coef
 {
-   float ALIGN16_ATTRIB a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   float ALIGN16_ATTRIB dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   float ALIGN16_ATTRIB dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16, float a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS]);
+   PIPE_ALIGN_VAR(16, float dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS]);
+   PIPE_ALIGN_VAR(16, float dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS]);
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index b18f17c0cd..29033a067c 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -130,7 +130,7 @@ shade_quads(struct llvmpipe_context *llvmpipe,
    uint8_t *tile;
    uint8_t *color;
    void *depth;
-   uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16, uint32_t mask[4][NUM_CHANNELS]);
    unsigned chan_index;
    unsigned q;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 29fff91981..de8f872e58 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -531,11 +531,11 @@ test_one(unsigned verbose,
    success = TRUE;
    for(i = 0; i < n && success; ++i) {
       if(mode == AoS) {
-         ALIGN16_ATTRIB uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16, uint8_t src[LP_NATIVE_VECTOR_WIDTH/8]);
+         PIPE_ALIGN_VAR(16, uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8]);
+         PIPE_ALIGN_VAR(16, uint8_t con[LP_NATIVE_VECTOR_WIDTH/8]);
+         PIPE_ALIGN_VAR(16, uint8_t res[LP_NATIVE_VECTOR_WIDTH/8]);
+         PIPE_ALIGN_VAR(16, uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8]);
          int64_t start_counter = 0;
          int64_t end_counter = 0;
 
@@ -596,11 +596,11 @@ test_one(unsigned verbose,
 
       if(mode == SoA) {
          const unsigned stride = type.length*type.width/8;
-         ALIGN16_ATTRIB uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16, uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8]);
+         PIPE_ALIGN_VAR(16, uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8]);
+         PIPE_ALIGN_VAR(16, uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8]);
+         PIPE_ALIGN_VAR(16, uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8]);
+         PIPE_ALIGN_VAR(16, uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8]);
          int64_t start_counter = 0;
          int64_t end_counter = 0;
          boolean mismatch;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index faddfb9677..3a6353b916 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -230,8 +230,8 @@ test_one(unsigned verbose,
    for(i = 0; i < n && success; ++i) {
       unsigned src_stride = src_type.length*src_type.width/8;
       unsigned dst_stride = dst_type.length*dst_type.width/8;
-      ALIGN16_ATTRIB uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
-      ALIGN16_ATTRIB uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(16, uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]);
+      PIPE_ALIGN_VAR(16, uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]);
       double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       int64_t start_counter = 0;
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 26a940593f..80610a07b6 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -139,22 +139,33 @@ typedef unsigned char boolean;
 
 
 
+/* Macros for data alignment. */
 #if defined(__GNUC__)
-#define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) ))
-#define ALIGN16_ASSIGN(NAME) NAME##___aligned
-#define ALIGN16_ATTRIB  __attribute__(( aligned( 16 ) ))
-#define ALIGN8_ATTRIB  __attribute__(( aligned( 8 ) ))
+/* Variadic: commas in a struct body (e.g. "int a, b;") split macro arguments.
+ * See http://gcc.gnu.org/onlinedocs/gcc-4.4.2/gcc/Type-Attributes.html */
+#define PIPE_ALIGN_TYPE(_alignment, ...) __VA_ARGS__ __attribute__((aligned(_alignment)))
+
+/* See http://gcc.gnu.org/onlinedocs/gcc-4.4.2/gcc/Variable-Attributes.html */
+#define PIPE_ALIGN_VAR(_alignment, _decl) _decl __attribute__((aligned(_alignment)))
+
 #if (__GNUC__ > 4 || (__GNUC__ == 4 &&__GNUC_MINOR__>1)) && !defined(PIPE_ARCH_X86_64)
 #define ALIGN_STACK __attribute__((force_align_arg_pointer))
 #else
 #define ALIGN_STACK
 #endif
-#else
-#define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___unaligned[SIZE + 1]
-#define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned)
-#define ALIGN16_ATTRIB
-#define ALIGN8_ATTRIB
+
+#elif defined(_MSC_VER)
+
+/* See http://msdn.microsoft.com/en-us/library/83ythb65.aspx */
+#define PIPE_ALIGN_TYPE(_alignment, ...) __declspec(align(_alignment)) __VA_ARGS__
+#define PIPE_ALIGN_VAR(_alignment, _decl) __declspec(align(_alignment)) _decl
+
 #define ALIGN_STACK
+
+#else
+
+#error "Unsupported compiler"
+
 #endif