/**************************************************************************
 * 
 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 **************************************************************************/

/* Author:
 *    Brian Paul
 *    Keith Whitwell
 */


/** TEMP */
#include "main/context.h"
#include "main/macros.h"

#include "pipe/p_defines.h"
#include "pipe/p_context.h"
#include "pipe/p_winsys.h"


#include "sp_context.h"
#include "sp_state.h"

#include "pipe/draw/draw_private.h"
#include "pipe/draw/draw_context.h"
#include "pipe/draw/draw_prim.h"

#include "pipe/tgsi/core/tgsi_exec.h"
#include "pipe/tgsi/core/tgsi_build.h"
#include "pipe/tgsi/core/tgsi_util.h"


/*
 * Helpers for declaring a local array that must be 16-byte aligned.
 *
 * With GCC the array itself is declared with the aligned(16) attribute and
 * ALIGN16_ASSIGN() simply yields the array.
 *
 * Otherwise the array is declared with one extra element of padding and
 * ALIGN16_ASSIGN() passes it through align16() (defined elsewhere;
 * presumably rounds the pointer up to a 16-byte boundary -- confirm).
 * In that configuration the pointer returned by ALIGN16_ASSIGN() may differ
 * from the array's own address, so the storage should only be accessed
 * through that pointer, never through the declared array name.
 */
#if defined __GNUC__
#define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME[SIZE] __attribute__(( aligned( 16 ) ))
#define ALIGN16_ASSIGN(P) P
#else
#define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME[SIZE + 1]
#define ALIGN16_ASSIGN(P) align16(P)
#endif


/**
 * Build the clip mask for one clip-space position.
 * A flag is set for every plane of the view volume (-w <= x,y,z <= w)
 * that the point lies outside of.
 * \param cx  clip-space x
 * \param cy  clip-space y
 * \param cz  clip-space z
 * \param cw  clip-space w
 * \return  bitwise OR of the CLIP_* flags of the violated planes, 0 if none
 */
static INLINE unsigned
compute_clipmask(float cx, float cy, float cz, float cw)
{
#if defined(macintosh) || defined(__powerpc__)
   /* on powerpc cliptest is 17% faster in this way. */
   unsigned planes = ((cw <  cx) << CLIP_RIGHT_SHIFT);

   planes |= ((cw < -cx) << CLIP_LEFT_SHIFT);
   planes |= ((cw <  cy) << CLIP_TOP_SHIFT);
   planes |= ((cw < -cy) << CLIP_BOTTOM_SHIFT);
   planes |= ((cw <  cz) << CLIP_FAR_SHIFT);
   planes |= ((cw < -cz) << CLIP_NEAR_SHIFT);

   return planes;
#else /* !defined(macintosh)) */
   unsigned planes = 0x0;

   /* x against the right/left planes */
   if (-cx + cw < 0) {
      planes |= CLIP_RIGHT_BIT;
   }
   if ( cx + cw < 0) {
      planes |= CLIP_LEFT_BIT;
   }

   /* y against the top/bottom planes */
   if (-cy + cw < 0) {
      planes |= CLIP_TOP_BIT;
   }
   if ( cy + cw < 0) {
      planes |= CLIP_BOTTOM_BIT;
   }

   /* z against the far/near planes */
   if (-cz + cw < 0) {
      planes |= CLIP_FAR_BIT;
   }
   if ( cz + cw < 0) {
      planes |= CLIP_NEAR_BIT;
   }

   return planes;
#endif /* defined(macintosh) */
}


/**
 * Fetch a float[4] vertex attribute from memory, doing format/type
 * conversion as needed.
 * Components that the source format does not provide take the default
 * values (0, 0, 0, 1).
 * XXX this might be a temporary thing.
 * \param ptr  address of the attribute's first component
 * \param format  source format; only the PIPE_FORMAT_R32*_FLOAT formats
 *                listed below are handled
 * \param attrib  returns the expanded X, Y, Z, W values
 */
static void
fetch_attrib4(const void *ptr, unsigned format, float attrib[4])
{
   const float *src = (const float *) ptr;

   /* defaults.  attrib[0] is included so that an unsupported format in a
    * build with assertions disabled still yields defined values instead
    * of leaving X uninitialized for the caller to read.
    */
   attrib[0] = 0.0f;
   attrib[1] = 0.0f;
   attrib[2] = 0.0f;
   attrib[3] = 1.0f;

   /* each case adds one component and falls into the next smaller format */
   switch (format) {
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      attrib[3] = src[3];
      /* fall-through */
   case PIPE_FORMAT_R32G32B32_FLOAT:
      attrib[2] = src[2];
      /* fall-through */
   case PIPE_FORMAT_R32G32_FLOAT:
      attrib[1] = src[1];
      /* fall-through */
   case PIPE_FORMAT_R32_FLOAT:
      attrib[0] = src[0];
      break;
   default:
      /* unsupported vertex format */
      assert(0);
      break;
   }
}


/**
 * Transform vertices with the current vertex program/shader
 * Up to four vertices can be shaded at a time.
 * \param vbuffer  the input vertex data
 * \param elts  indexes of four input vertices
 * \param count  number of vertices to shade [1..4]
 * \param vOut  array of pointers to four output vertices
 */
static void
run_vertex_program(struct draw_context *draw,
                   const void *vbuffer, unsigned elts[4], unsigned count,
                   struct vertex_header *vOut[])
{
   struct softpipe_context *sp = softpipe_context(draw->pipe);
   struct tgsi_exec_machine machine;
   unsigned int j;

   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
   const float *scale = draw->viewport.scale;
   const float *trans = draw->viewport.translate;

   assert(count <= 4);

#ifdef DEBUG
   memset( &machine, 0, sizeof( machine ) );
#endif

   /* init machine state */
   tgsi_exec_machine_init(
                          &machine,
                          sp->vs.tokens,
                          PIPE_MAX_SAMPLERS,
                          NULL /*samplers*/ );

   /* Consts does not require 16 byte alignment. */
   machine.Consts = sp->vs.constants->constant;

   machine.Inputs = ALIGN16_ASSIGN(inputs);
   machine.Outputs = ALIGN16_ASSIGN(outputs);


   if (0)
   {
      unsigned attr;
      for (attr = 0; attr < 16; attr++) {
         if (sp->vs.inputs_read & (1 << attr)) {
            printf("attr %d: buf_off %d  src_off %d  pitch %d\n",
                   attr, 
                   draw->vertex_buffer[attr].buffer_offset,
                   draw->vertex_element[attr].src_offset,
                   draw->vertex_buffer[attr].pitch);
         }
      }
   }

   /* load machine inputs */
   for (j = 0; j < count; j++) {
      unsigned attr;
      for (attr = 0; attr < 16; attr++) {
         if (sp->vs.inputs_read & (1 << attr)) {
            const void *src
               = (const void *) ((const ubyte *) vbuffer
                                 + draw->vertex_buffer[attr].buffer_offset
                                 + draw->vertex_element[attr].src_offset
                                 + elts[j] * draw->vertex_buffer[attr].pitch);
            float p[4];

            fetch_attrib4(src, draw->vertex_element[attr].src_format, p);

            machine.Inputs[attr].xyzw[0].f[j] = p[0]; /*X*/
            machine.Inputs[attr].xyzw[1].f[j] = p[1]; /*Y*/
            machine.Inputs[attr].xyzw[2].f[j] = p[2]; /*Z*/
            machine.Inputs[attr].xyzw[3].f[j] = p[3]; /*W*/
#if 0
            if (attr == 0) {
               printf("Input vertex %d: %f %f %f\n",
                      j, p[0], p[1], p[2]);
            }
#endif
         }
      }
   }

#if 0
   printf("Consts:\n");
   for (i = 0; i < 4; i++) {
      printf(" %d: %f %f %f %f\n", i,
             machine.Consts[i][0],
             machine.Consts[i][1],
             machine.Consts[i][2],
             machine.Consts[i][3]);
   }
#endif

   /* run shader */
   tgsi_exec_machine_run( &machine );

#if 0
   printf("VS result: %f %f %f %f\n",
          outputs[0].xyzw[0].f[0],
          outputs[0].xyzw[1].f[0],
          outputs[0].xyzw[2].f[0],
          outputs[0].xyzw[3].f[0]);
#endif

   /* store machine results */
   assert(sp->vs.outputs_written & (1 << VERT_RESULT_HPOS));
   for (j = 0; j < count; j++) {
      unsigned attr, slot;
      float x, y, z, w;

      /* Handle attr[0] (position) specially: */
      x = vOut[j]->clip[0] = outputs[0].xyzw[0].f[j];
      y = vOut[j]->clip[1] = outputs[0].xyzw[1].f[j];
      z = vOut[j]->clip[2] = outputs[0].xyzw[2].f[j];
      w = vOut[j]->clip[3] = outputs[0].xyzw[3].f[j];

      vOut[j]->clipmask = compute_clipmask(x, y, z, w);
      vOut[j]->edgeflag = 1;

      /* divide by w */
      w = 1.0 / w;
      x *= w;
      y *= w;
      z *= w;

      /* Viewport mapping */
      vOut[j]->data[0][0] = x * scale[0] + trans[0];
      vOut[j]->data[0][1] = y * scale[1] + trans[1];
      vOut[j]->data[0][2] = z * scale[2] + trans[2];
      vOut[j]->data[0][3] = w;
#if 0
      printf("wincoord: %f %f %f\n",
             vOut[j]->data[0][0],
             vOut[j]->data[0][1],
             vOut[j]->data[0][2]);
#endif

      /* remaining attributes: */
      /* pack into sequential post-transform attrib slots */
      slot = 1;
      for (attr = 1; attr < VERT_RESULT_MAX; attr++) {
         if (sp->vs.outputs_written & (1 << attr)) {
            assert(slot < draw->nr_attrs - 2);
            vOut[j]->data[slot][0] = outputs[attr].xyzw[0].f[j];
            vOut[j]->data[slot][1] = outputs[attr].xyzw[1].f[j];
            vOut[j]->data[slot][2] = outputs[attr].xyzw[2].f[j];
            vOut[j]->data[slot][3] = outputs[attr].xyzw[3].f[j];
            slot++;
         }
      }
   }

#if 0
   memcpy(
      quad->outputs.color,
      &machine.Outputs[1].xyzw[0].f[0],
      sizeof( quad->outputs.color ) );
#endif
}


/**
 * Stand-in for actual vertex program execution
 * XXX this will probably live in a new file, like "sp_vs.c"
 *
 * NOTE: this function is compiled out (#if 0).  It shares its name with
 * the shader-based run_vertex_program() earlier in this file but takes a
 * single element index and a single output vertex, so the two cannot be
 * enabled at the same time.
 *
 * It transforms the position read from vertex buffer/element 0 by the
 * current Mesa context's _ModelProjectMatrix, applies the perspective
 * divide and viewport mapping, and copies three floats from vertex
 * buffer/element 3 as the color.
 *
 * \param draw  the drawing context
 * \param vbuffer  the mapped vertex buffer pointer
 * \param elem  which element of the vertex buffer to use as input
 * \param vOut  the output vertex
 */
#if 0
static void
run_vertex_program(struct draw_context *draw,
                   const void *vbuffer, unsigned elem,
                   struct vertex_header *vOut)
{
   const float *vIn, *cIn;
   const float *scale = draw->viewport.scale;
   const float *trans = draw->viewport.translate;
   const void *mapped = vbuffer;

   /* XXX temporary hack: */
   GET_CURRENT_CONTEXT(ctx);
   const float *m = ctx->_ModelProjectMatrix.m;

   /* position input: attribute 0 of vertex 'elem' */
   vIn = (const float *) ((const ubyte *) mapped
                          + draw->vertex_buffer[0].buffer_offset
                          + draw->vertex_element[0].src_offset
                          + elem * draw->vertex_buffer[0].pitch);

   /* color input: attribute 3 of vertex 'elem' */
   cIn = (const float *) ((const ubyte *) mapped
                          + draw->vertex_buffer[3].buffer_offset
                          + draw->vertex_element[3].src_offset
                          + elem * draw->vertex_buffer[3].pitch);

   {
      float x = vIn[0];
      float y = vIn[1];
      float z = vIn[2];
      float w = 1.0;

      /* no clip testing is done here; the mask is always empty */
      vOut->clipmask = 0x0;
      vOut->edgeflag = 0;
      /* MVP */
      vOut->clip[0] = m[0] * x + m[4] * y + m[ 8] * z + m[12] * w;
      vOut->clip[1] = m[1] * x + m[5] * y + m[ 9] * z + m[13] * w;
      vOut->clip[2] = m[2] * x + m[6] * y + m[10] * z + m[14] * w;
      vOut->clip[3] = m[3] * x + m[7] * y + m[11] * z + m[15] * w;

      /* divide by w */
      x = vOut->clip[0] / vOut->clip[3];
      y = vOut->clip[1] / vOut->clip[3];
      z = vOut->clip[2] / vOut->clip[3];
      w = 1.0 / vOut->clip[3];

      /* Viewport */
      vOut->data[0][0] = scale[0] * x + trans[0];
      vOut->data[0][1] = scale[1] * y + trans[1];
      vOut->data[0][2] = scale[2] * z + trans[2];
      vOut->data[0][3] = w;

      /* color */
      vOut->data[1][0] = cIn[0];
      vOut->data[1][1] = cIn[1];
      vOut->data[1][2] = cIn[2];
      vOut->data[1][3] = 1.0;
   }
}
#endif


/**
 * Called by the draw module when the vertex cache needs to be flushed.
 * This involves running the vertex shader.
 *
 * The queued vertices are shaded in groups of up to four and the queue is
 * then marked empty.
 * \param draw  the drawing context whose vs queue is to be processed
 */
static void vs_flush( struct draw_context *draw )
{
   unsigned i, j;

   /* run the vertex shader on the queued entries, four per invocation */
   for (i = 0; i < draw->vs.queue_nr; i += 4) {
      struct vertex_header *dests[4] = { NULL, NULL, NULL, NULL };
      unsigned elts[4] = { 0, 0, 0, 0 };
      /* the last group may hold fewer than four vertices */
      const unsigned n = MIN2(4, draw->vs.queue_nr - i);

      assert(n > 0);
      assert(n <= 4);

      /* Gather only the entries that were actually queued; reading
       * queue[i + j] for j >= n would touch stale slots (or run past the
       * end of the queue).
       */
      for (j = 0; j < n; j++) {
         elts[j] = draw->vs.queue[i + j].elt;
         dests[j] = draw->vs.queue[i + j].dest;
      }

      run_vertex_program(draw, draw->mapped_vbuffer, elts, n, dests);
   }
   draw->vs.queue_nr = 0;
}


/**
 * Draw a range of non-indexed vertices.
 *
 * Updates derived state if dirty, maps the surfaces and vertex buffer 0,
 * pushes the vertices through the draw module (with vs_flush installed as
 * its vertex-shader flush callback), then unmaps everything again.
 * The order of the calls below matters: buffers are mapped before drawing
 * and unmapped only after the final draw_flush().
 *
 * \param pipe  the softpipe context
 * \param mode  primitive type, handed to draw_set_prim()
 * \param start  first vertex, handed to draw_prim()
 * \param count  number of vertices, handed to draw_prim()
 */
void
softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
                     unsigned start, unsigned count)
{
   struct softpipe_context *sp = softpipe_context(pipe);
   struct draw_context *draw = sp->draw;
   struct pipe_buffer_handle *buf;

   if (sp->dirty)
      softpipe_update_derived( sp );

   softpipe_map_surfaces(sp);

   /*
    * Map vertex buffers
    */
   /* NOTE(review): only vertex_buffer[0] is mapped, and vs_flush hands this
    * single mapping to run_vertex_program() as the base address for every
    * attribute -- this appears to assume all attributes live in buffer 0.
    * The result of buffer_map() is not checked.
    */
   buf = sp->vertex_buffer[0].buffer;
   assert(buf);
   draw->mapped_vbuffer
      = pipe->winsys->buffer_map(pipe->winsys, buf, PIPE_BUFFER_FLAG_READ);


   /* tell drawing pipeline we're beginning drawing */
   draw->pipeline.first->begin( draw->pipeline.first );

   /* hook up the vertex shader callback and the context it needs */
   draw->vs_flush = vs_flush;
   draw->pipe = pipe;  /* XXX pass pipe to draw_create() */

   draw_invalidate_vcache( draw );

   draw_set_element_buffer(draw, 0, NULL);  /* no index/element buffer */
   draw_set_prim( draw, mode );

   /* XXX draw_prim_info() and TRIM here */
   draw_prim(draw, start, count);

   /* draw any left-over buffered prims */
   draw_flush(draw);

   /* tell drawing pipeline we're done drawing */
   draw->pipeline.first->end( draw->pipeline.first );

   /*
    * unmap vertex buffer
    */
   pipe->winsys->buffer_unmap(pipe->winsys, buf);

   softpipe_unmap_surfaces(sp);
}


/*
 * Append one attribute to the vertex layout of 'draw'.
 *
 * Relies on a variable named 'draw' (struct draw_context *) being in scope
 * at the point of use.  VF_ATTR is expanded twice, so it should not have
 * side effects.
 *
 * The attribute is recorded in draw->attrs[], nr_attrs is incremented and
 * SIZE is added to vertex_size.  From the third emitted attribute onwards
 * (nr_attrs >= 2) vf_attr_to_slot[VF_ATTR] is set to nr_attrs - 2, i.e.
 * the first two attributes emitted get no slot and the third maps to
 * slot 0.
 */
#define EMIT_ATTR( VF_ATTR, STYLE, SIZE )			\
do {								\
   if (draw->nr_attrs >= 2)					\
      draw->vf_attr_to_slot[VF_ATTR] = draw->nr_attrs - 2;	\
   draw->attrs[draw->nr_attrs].attrib = VF_ATTR;		\
   draw->attrs[draw->nr_attrs].format = STYLE;			\
   draw->nr_attrs++;						\
   draw->vertex_size += SIZE;					\
} while (0)


/**
 * Rebuild the draw context's post-transform vertex layout.
 * XXX very similar to same func in draw_vb.c (which will go away)
 *
 * The layout always begins with the vertex header, the clip-space position
 * and the viewport-mapped position; the caller's remaining attributes
 * follow as plain 4-float values.
 *
 * \param draw  the drawing context to update
 * \param slot_to_vf_attr  VF_ATTRIB_x value for each slot; entry 0 must be
 *                         VF_ATTRIB_POS
 * \param nr_attrs  number of entries in slot_to_vf_attr
 */
void
draw_set_vertex_attributes2( struct draw_context *draw,
                             const unsigned *slot_to_vf_attr,
                             unsigned nr_attrs )
{
   unsigned slot = 1;

   /* start from an empty layout */
   draw->vertex_size = 0;
   draw->nr_attrs = 0;
   memset(draw->vf_attr_to_slot, 0, sizeof(draw->vf_attr_to_slot));

   /* fixed prefix: header, then clip-space position */
   EMIT_ATTR(VF_ATTRIB_VERTEX_HEADER, EMIT_1F, 1);
   EMIT_ATTR(VF_ATTRIB_CLIP_POS, EMIT_4F, 4);

   /* slot 0 is the position, emitted with the viewport transform */
   assert(slot_to_vf_attr[0] == VF_ATTRIB_POS);
   EMIT_ATTR(slot_to_vf_attr[0], EMIT_4F_VIEWPORT, 4);

   /* everything after the position (color, texcoords, etc) */
   while (slot < nr_attrs) {
      EMIT_ATTR(slot_to_vf_attr[slot], EMIT_4F, 4);
      slot++;
   }

   /* vertex_size was accumulated in floats; convert to bytes */
   draw->vertex_size *= 4;
}