Add finite sampling for better simulation accuracy

author: Thomas White <taw@physics.org> 2010-02-19 17:44:13 +0100
committer: Thomas White <taw@physics.org> 2010-02-19 17:44:13 +0100
commit: ab3573ec308bd8d4e4790ab2c7e54ad4b6b81710 (patch)
tree: 34c2105bcbcf980c2317cf3f1f32988fdd41d2cb
parent: 632c9738cac050bb6802ab839b07f3a7d3685d11 (diff)
3 files changed, 63 insertions, 23 deletions
diff --git a/data/diffraction.cl b/data/diffraction.cl
index 82a4adcf..babfc6b7 100644
--- a/data/diffraction.cl
+++ b/data/diffraction.cl
@@ -46,7 +46,7 @@ float4 quat_rot(float4 q, float4 z)
 
 
 float4 get_q(int x, int y, float cx, float cy, float res, float clen, float k,
-             float *ttp, float4 z)
+             float *ttp, float4 z, int sampling)
 {
 	float rx, ry, r;
 	float ttx, tty, tt;
@@ -131,19 +131,43 @@ kernel void diffraction(global float2 *diff, global float *tt, float k,
                        int w, float cx, float cy,
                        float res, float clen, float16 cell,
                        global float2 *sfacs, float4 z, int4 ncells,
-                       int xmin, int ymin)
+                       int xmin, int ymin, int sampling, local float2 *tmp)
 {
 	float ttv;
 	const int x = get_global_id(0) + xmin;
 	const int y = get_global_id(1) + ymin;
 	float f_lattice;
 	float2 f_molecule;
+	float4 q;
+	const int lx = get_local_id(0);
+	const int ly = get_local_id(1);
+	const int ax = x / sampling;
+	const int ay = y / sampling;
 
-	float4 q = get_q(x, y, cx, cy, res, clen, k, &ttv, z);
-
+	/* Calculate value */
+	q = get_q(x, y, cx, cy, res, clen, k, &ttv, z, sampling);
 	f_lattice = lattice_factor(cell, q, ncells);
 	f_molecule = get_sfac(sfacs, cell, q);
 
-	diff[x+w*y] = f_molecule * f_lattice;
-	tt[x+w*y] = ttv;
+	/* Write the value to local memory */
+	tmp[lx+sampling*ly] = f_molecule * f_lattice;
+
+	/* Memory fence */
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Leader thread sums the values */
+	if ( lx + ly == 0 ) {
+
+		int i;
+		float2 sum = (0.0, 0.0);
+
+		for ( i=0; i<sampling*sampling; i++ ) sum += tmp[i];
+
+		diff[ax+w*ay] = sum / (sampling*sampling);
+
+		/* Leader thread also records 2theta value.
+		 * This should really be averaged across all pixels, but
+		 * I strongly suspect this would be a waste of time. */
+		tt[ax+w*ay] = ttv;
+	}
 }
diff --git a/src/diffraction-gpu.c b/src/diffraction-gpu.c
index 344da6c6..9057d3cc 100644
--- a/src/diffraction-gpu.c
+++ b/src/diffraction-gpu.c
@@ -24,9 +24,9 @@
 #include "sfac.h"
 
 
-#define SAMPLING (5)
-#define BWSAMPLING (10)
-#define BANDWIDTH (1.0 / 100.0)
+#define SAMPLING (4)
+#define BWSAMPLING (1)
+#define BANDWIDTH (0.0 / 100.0)
 
 
 struct gpu_context
@@ -150,7 +150,6 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 	double bx, by, bz;
 	double cx, cy, cz;
 	float kc;
-	size_t dims[2];
 	cl_event *event;
 	int p;
 	float *tt_ptr;
@@ -159,6 +158,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 	float *diff_ptr;
 	cl_float4 orientation;
 	cl_int4 ncells;
+	const int sampling = SAMPLING;
 
 	cell_get_cartesian(image->molecule->cell, &ax, &ay, &az,
 		                                  &bx, &by, &bz,
@@ -221,30 +221,49 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 		ERROR("Couldn't set arg 11: %s\n", clError(err));
 		return;
 	}
+	clSetKernelArg(gctx->kern, 14, sizeof(cl_int), &sampling);
+	if ( err != CL_SUCCESS ) {
+		ERROR("Couldn't set arg 14: %s\n", clError(err));
+		return;
+	}
+	/* Local memory for reduction */
+	clSetKernelArg(gctx->kern, 15, SAMPLING*SAMPLING*2*sizeof(cl_float),
+	               NULL);
+	if ( err != CL_SUCCESS ) {
+		ERROR("Couldn't set arg 15: %s\n", clError(err));
+		return;
+	}
 
 	/* Iterate over panels */
 	event = malloc(image->det.n_panels * sizeof(cl_event));
 	for ( p=0; p<image->det.n_panels; p++ ) {
 
+		size_t dims[2];
+		size_t ldims[2] = {SAMPLING, SAMPLING};
+		cl_float res = image->det.panels[p].res * SAMPLING;
+		int offsx = image->det.panels[p].min_x * SAMPLING;
+		int offsy = image->det.panels[p].min_y * SAMPLING;
+		cl_float centx = image->det.panels[p].cx * SAMPLING;
+		cl_float centy = image->det.panels[p].cy * SAMPLING;
+
 		/* In a future version of OpenCL, this could be done
 		 * with a global work offset.  But not yet... */
 		dims[0] = image->det.panels[0].max_x-image->det.panels[0].min_x;
 		dims[1] = image->det.panels[0].max_y-image->det.panels[0].min_y;
+		dims[0] *= SAMPLING;
+		dims[1] *= SAMPLING;
 
-		clSetKernelArg(gctx->kern, 4, sizeof(cl_float),
-		               &image->det.panels[p].cx);
+		clSetKernelArg(gctx->kern, 4, sizeof(cl_float), &centx);
 		if ( err != CL_SUCCESS ) {
 			ERROR("Couldn't set arg 4: %s\n", clError(err));
 			return;
 		}
-		clSetKernelArg(gctx->kern, 5, sizeof(cl_float),
-		               &image->det.panels[p].cy);
+		clSetKernelArg(gctx->kern, 5, sizeof(cl_float), &centy);
 		if ( err != CL_SUCCESS ) {
 			ERROR("Couldn't set arg 5: %s\n", clError(err));
 			return;
 		}
-		clSetKernelArg(gctx->kern, 6, sizeof(cl_float),
-		               &image->det.panels[p].res);
+		clSetKernelArg(gctx->kern, 6, sizeof(cl_float), &res);
 		if ( err != CL_SUCCESS ) {
 			ERROR("Couldn't set arg 6: %s\n", clError(err));
 			return;
@@ -255,22 +274,19 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 			ERROR("Couldn't set arg 7: %s\n", clError(err));
 			return;
 		}
-
-		clSetKernelArg(gctx->kern, 12, sizeof(cl_int),
-		               &image->det.panels[p].min_x);
+		clSetKernelArg(gctx->kern, 12, sizeof(cl_int), &offsx);
 		if ( err != CL_SUCCESS ) {
 			ERROR("Couldn't set arg 12: %s\n", clError(err));
 			return;
 		}
-		clSetKernelArg(gctx->kern, 13, sizeof(cl_int),
-		               &image->det.panels[p].min_y);
+		clSetKernelArg(gctx->kern, 13, sizeof(cl_int), &offsy);
 		if ( err != CL_SUCCESS ) {
 			ERROR("Couldn't set arg 13: %s\n", clError(err));
 			return;
 		}
 
 		err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 2, NULL,
-		                             dims, NULL, 0, NULL, &event[p]);
+		                             dims, ldims, 0, NULL, &event[p]);
 		if ( err != CL_SUCCESS ) {
 			ERROR("Couldn't enqueue diffraction kernel: %s\n",
 			      clError(err));
diff --git a/src/diffraction.c b/src/diffraction.c
index a66c2310..876fac5d 100644
--- a/src/diffraction.c
+++ b/src/diffraction.c
@@ -23,7 +23,7 @@
 #include "sfac.h"
 
 
-#define SAMPLING (1)
+#define SAMPLING (4)
 #define BWSAMPLING (1)
 #define BANDWIDTH (0.0 / 100.0)
author	Thomas White <taw@physics.org>	2010-02-19 17:44:13 +0100
committer	Thomas White <taw@physics.org>	2010-02-19 17:44:13 +0100
commit	ab3573ec308bd8d4e4790ab2c7e54ad4b6b81710 (patch)
tree	34c2105bcbcf980c2317cf3f1f32988fdd41d2cb
parent	632c9738cac050bb6802ab839b07f3a7d3685d11 (diff)