Use a lookup table for sinc values in GPU calculation

author: Thomas White <taw@physics.org> 2010-03-01 17:10:41 +0100
committer: Thomas White <taw@physics.org> 2010-03-01 17:52:03 +0100
commit: f213d4dc787d4d6edb8981c41138f4ace1e2f324 (patch)
tree: b9e3b4d59993474a6c5e4c769d3d4f75638b04c1
parent: 8c01976e4bc6670c6f824479357c42caa18fb2ff (diff)
5 files changed, 115 insertions, 50 deletions
diff --git a/data/diffraction.cl b/data/diffraction.cl
index ebacb416..cc3e91bd 100644
--- a/data/diffraction.cl
+++ b/data/diffraction.cl
@@ -15,6 +15,15 @@
 #define M_PI ((float)(3.14159265))
 #endif
 
+
+const sampler_t sampler_a = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_REPEAT
+                             | CLK_FILTER_LINEAR;
+const sampler_t sampler_b = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_REPEAT
+                             | CLK_FILTER_LINEAR;
+const sampler_t sampler_c = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_REPEAT
+                             | CLK_FILTER_LINEAR;
+
+
 float4 quat_rot(float4 q, float4 z)
 {
 	float4 res;
@@ -80,36 +89,23 @@ float range(float a)
 }
 
 
-float lattice_factor(float16 cell, float4 q, int4 ncells)
+float lattice_factor(float16 cell, float4 q,
+                     read_only image2d_t func_a,
+                     read_only image2d_t func_b,
+                     read_only image2d_t func_c)
 {
 	float f1, f2, f3, v;
 	float4 Udotq;
-	const int na = ncells.s0;
-	const int nb = ncells.s1;
-	const int nc = ncells.s2;
 
 	Udotq.x = cell.s0*q.x + cell.s1*q.y + cell.s2*q.z;
 	Udotq.y = cell.s3*q.x + cell.s4*q.y + cell.s5*q.z;
 	Udotq.z = cell.s6*q.x + cell.s7*q.y + cell.s8*q.z;
 
-	/* At exact Bragg condition, f1 = na */
-	v = M_PI*Udotq.x;
-	f1 = sin(v*(float)na) / sin(v);
-	f1 = isnan(f1) ? na : f1;
-
-	/* At exact Bragg condition, f2 = nb */
-	v = M_PI*Udotq.y;
-	f2 = sin(v*(float)nb) / sin(v);
-	f2 = isnan(f2) ? nb : f2;
-
-	/* At exact Bragg condition, f3 = nc */
-	v = M_PI*Udotq.z;
-	f3 = sin(v*(float)nc) / sin(v);
-	f3 = isnan(f3) ? nc : f3;
+	/* Look up values from precalculated sinc() table */
+	f1 = read_imagef(func_a, sampler_a, (float2)(Udotq.x, 0.0)).s0;
+	f2 = read_imagef(func_b, sampler_b, (float2)(Udotq.y, 0.0)).s0;
+	f3 = read_imagef(func_c, sampler_c, (float2)(Udotq.z, 0.0)).s0;
 
-	/* At exact Bragg condition, this will multiply the molecular
-	 * part of the structure factor by the number of unit cells,
-	 * as desired (more scattering from bigger crystal!) */
 	return f1 * f2 * f3;
 }
 
@@ -148,9 +144,12 @@ float2 get_sfac(global float2 *sfacs, float16 cell, float4 q)
 kernel void diffraction(global float *diff, global float *tt, float klow,
                        int w, float cx, float cy,
                        float res, float clen, float16 cell,
-                       global float2 *sfacs, float4 z, int4 ncells,
+                       global float2 *sfacs, float4 z,
                        int xmin, int ymin, int sampling, local float *tmp,
-                       float kstep)
+                       float kstep,
+                       read_only image2d_t func_a,
+                       read_only image2d_t func_b,
+                       read_only image2d_t func_c)
 {
 	float ttv;
 	const int x = get_global_id(0) + (xmin*sampling);
@@ -169,7 +168,7 @@ kernel void diffraction(global float *diff, global float *tt, float klow,
 
 	/* Calculate value */
 	q = get_q(x, y, cx, cy, res, clen, k, &ttv, z, sampling);
-	f_lattice = lattice_factor(cell, q, ncells);
+	f_lattice = lattice_factor(cell, q, func_a, func_b, func_c);
 	f_molecule = get_sfac(sfacs, cell, q);
 
 	/* Write the value to local memory */
diff --git a/src/diffraction-gpu.c b/src/diffraction-gpu.c
index 69d21912..c54b9aea 100644
--- a/src/diffraction-gpu.c
+++ b/src/diffraction-gpu.c
@@ -29,6 +29,8 @@
 #define BWSAMPLING (10)
 #define BANDWIDTH (1.0 / 100.0)
 
+#define SINC_LUT_ELEMENTS (4096)
+
 
 struct gpu_context
 {
@@ -43,6 +45,13 @@ struct gpu_context
 
 	cl_mem diff;
 	size_t diff_size;
+
+	cl_mem func_a;
+	cl_float *func_a_ptr;
+	cl_mem func_b;
+	cl_float *func_b_ptr;
+	cl_mem func_c;
+	cl_float *func_c_ptr;
 };
 
 
@@ -128,27 +137,22 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 		ERROR("Couldn't set arg 10: %s\n", clError(err));
 		return;
 	}
-	clSetKernelArg(gctx->kern, 11, sizeof(cl_int4), &ncells);
-	if ( err != CL_SUCCESS ) {
-		ERROR("Couldn't set arg 11: %s\n", clError(err));
-		return;
-	}
-	clSetKernelArg(gctx->kern, 14, sizeof(cl_int), &sampling);
+	clSetKernelArg(gctx->kern, 13, sizeof(cl_int), &sampling);
 	if ( err != CL_SUCCESS ) {
-		ERROR("Couldn't set arg 14: %s\n", clError(err));
+		ERROR("Couldn't set arg 13: %s\n", clError(err));
 		return;
 	}
 	/* Local memory for reduction */
-	clSetKernelArg(gctx->kern, 15,
+	clSetKernelArg(gctx->kern, 14,
 	               BWSAMPLING*SAMPLING*SAMPLING*sizeof(cl_float), NULL);
 	if ( err != CL_SUCCESS ) {
-		ERROR("Couldn't set arg 15: %s\n", clError(err));
+		ERROR("Couldn't set arg 14: %s\n", clError(err));
 		return;
 	}
 	/* Bandwidth sampling step */
-	clSetKernelArg(gctx->kern, 16, sizeof(cl_float), &bwstep);
+	clSetKernelArg(gctx->kern, 15, sizeof(cl_float), &bwstep);
 	if ( err != CL_SUCCESS ) {
-		ERROR("Couldn't set arg 16: %s\n", clError(err));
+		ERROR("Couldn't set arg 15: %s\n", clError(err));
 		return;
 	}
 
@@ -191,16 +195,16 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 			ERROR("Couldn't set arg 7: %s\n", clError(err));
 			return;
 		}
-		clSetKernelArg(gctx->kern, 12, sizeof(cl_int),
+		clSetKernelArg(gctx->kern, 11, sizeof(cl_int),
 		               &image->det.panels[p].min_x);
 		if ( err != CL_SUCCESS ) {
-			ERROR("Couldn't set arg 12: %s\n", clError(err));
+			ERROR("Couldn't set arg 11: %s\n", clError(err));
 			return;
 		}
-		clSetKernelArg(gctx->kern, 13, sizeof(cl_int),
+		clSetKernelArg(gctx->kern, 12, sizeof(cl_int),
 		               &image->det.panels[p].min_y);
 		if ( err != CL_SUCCESS ) {
-			ERROR("Couldn't set arg 13: %s\n", clError(err));
+			ERROR("Couldn't set arg 12: %s\n", clError(err));
 			return;
 		}
 
@@ -263,7 +267,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 
 /* Setup the OpenCL stuff, create buffers, load the structure factor table */
 struct gpu_context *setup_gpu(int no_sfac, struct image *image,
-                              struct molecule *molecule)
+                              struct molecule *molecule, int na, int nb, int nc)
 {
 	struct gpu_context *gctx;
 	cl_uint nplat;
@@ -274,6 +278,9 @@ struct gpu_context *setup_gpu(int no_sfac, struct image *image,
 	size_t sfac_size;
 	float *sfac_ptr;
 	size_t maxwgsize;
+	size_t sinc_lut_size;
+	cl_image_format fmt;
+	int i;
 
 	if ( molecule == NULL ) return NULL;
 
@@ -332,13 +339,11 @@ struct gpu_context *setup_gpu(int no_sfac, struct image *image,
 	sfac_size = IDIM*IDIM*IDIM*sizeof(cl_float)*2; /* complex */
 	sfac_ptr = malloc(sfac_size);
 	if ( !no_sfac ) {
-		int i;
 		for ( i=0; i<IDIM*IDIM*IDIM; i++ ) {
 			sfac_ptr[2*i+0] = creal(molecule->reflections[i]);
 			sfac_ptr[2*i+1] = cimag(molecule->reflections[i]);
 		}
 	} else {
-		int i;
 		for ( i=0; i<IDIM*IDIM*IDIM; i++ ) {
 			sfac_ptr[2*i+0] = 10000.0;
 			sfac_ptr[2*i+1] = 0.0;
@@ -377,6 +382,67 @@ struct gpu_context *setup_gpu(int no_sfac, struct image *image,
 		return NULL;
 	}
 
+	fmt.image_channel_order = CL_INTENSITY;
+	fmt.image_channel_data_type = CL_FLOAT;
+	sinc_lut_size = SINC_LUT_ELEMENTS*sizeof(cl_float);
+
+	/* Set up sinc LUT for a* direction */
+	gctx->func_a_ptr = malloc(sinc_lut_size);
+	gctx->func_a_ptr[0] = na;
+	for ( i=1; i<SINC_LUT_ELEMENTS; i++ ) {
+		double x, val;
+		x = (double)i/SINC_LUT_ELEMENTS;
+		val = sin(M_PI*na*x)/sin(M_PI*x);
+		gctx->func_a_ptr[i] = val;
+	}
+	gctx->func_a = clCreateImage2D(gctx->ctx,
+	                               CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+	                               &fmt, SINC_LUT_ELEMENTS, 1, 0,
+	                               gctx->func_a_ptr, &err);
+	clSetKernelArg(gctx->kern, 16, sizeof(cl_mem), &gctx->func_a);
+	if ( err != CL_SUCCESS ) {
+		ERROR("Couldn't set arg 16: %s\n", clError(err));
+		return NULL;
+	}
+
+	/* Set up sinc LUT for b* direction */
+	gctx->func_b_ptr = malloc(sinc_lut_size);
+	gctx->func_b_ptr[0] = nb;
+	for ( i=1; i<SINC_LUT_ELEMENTS; i++ ) {
+		double x, val;
+		x = (double)i/SINC_LUT_ELEMENTS;
+		val = sin(M_PI*nb*x)/sin(M_PI*x);
+		gctx->func_b_ptr[i] = val;
+	}
+	gctx->func_b = clCreateImage2D(gctx->ctx,
+	                               CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+	                               &fmt, SINC_LUT_ELEMENTS, 1, 0,
+	                               gctx->func_b_ptr, &err);
+	clSetKernelArg(gctx->kern, 17, sizeof(cl_mem), &gctx->func_b);
+	if ( err != CL_SUCCESS ) {
+		ERROR("Couldn't set arg 17: %s\n", clError(err));
+		return NULL;
+	}
+
+	/* Set up sinc LUT for c* direction */
+	gctx->func_c_ptr = malloc(sinc_lut_size);
+	gctx->func_c_ptr[0] = nc;
+	for ( i=1; i<SINC_LUT_ELEMENTS; i++ ) {
+		double x, val;
+		x = (double)i/SINC_LUT_ELEMENTS;
+		val = sin(M_PI*nc*x)/sin(M_PI*x);
+		gctx->func_c_ptr[i] = val;
+	}
+	gctx->func_c = clCreateImage2D(gctx->ctx,
+	                               CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+	                               &fmt, SINC_LUT_ELEMENTS, 1, 0,
+	                               gctx->func_c_ptr, &err);
+	clSetKernelArg(gctx->kern, 18, sizeof(cl_mem), &gctx->func_c);
+	if ( err != CL_SUCCESS ) {
+		ERROR("Couldn't set arg 18: %s\n", clError(err));
+		return NULL;
+	}
+
 	STATUS("done\n");
 
 	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE,
diff --git a/src/diffraction-gpu.h b/src/diffraction-gpu.h
index f3bed8bf..05f393a1 100644
--- a/src/diffraction-gpu.h
+++ b/src/diffraction-gpu.h
@@ -23,10 +23,10 @@ struct gpu_context;
 
 #if HAVE_OPENCL
 
-extern void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
-                                int na, int nb, int nc);
+extern void get_diffraction_gpu(struct gpu_context *gctx, struct image *image);
 extern struct gpu_context *setup_gpu(int no_sfac, struct image *image,
-                                     struct molecule *molecule);
+                                     struct molecule *molecule,
+                                     int na, int nb, int nc);
 extern void cleanup_gpu(struct gpu_context *gctx);
 
 #else
diff --git a/src/indexamajig.c b/src/indexamajig.c
index 5033b73c..22bc2ea4 100644
--- a/src/indexamajig.c
+++ b/src/indexamajig.c
@@ -242,10 +242,10 @@ int main(int argc, char *argv[])
 				if ( config_gpu ) {
 					if ( gctx == NULL ) {
 						gctx = setup_gpu(0, &image,
-						                image.molecule);
+						                image.molecule,
+						                24, 24, 40);
 					}
-					get_diffraction_gpu(gctx, &image,
-					                    24, 24, 40);
+					get_diffraction_gpu(gctx, &image);
 				} else {
 					get_diffraction(&image, 8, 8, 8, 0, 0);
 				}
diff --git a/src/pattern_sim.c b/src/pattern_sim.c
index 65e020ac..4eee7c57 100644
--- a/src/pattern_sim.c
+++ b/src/pattern_sim.c
@@ -276,9 +276,9 @@ int main(int argc, char *argv[])
 		if ( config_gpu ) {
 			if ( gctx == NULL ) {
 				gctx = setup_gpu(config_nosfac, &image,
-				                 image.molecule);
+				                 image.molecule, na, nb, nc);
 			}
-			get_diffraction_gpu(gctx, &image, na, nb, nc);
+			get_diffraction_gpu(gctx, &image);
 		} else {
 			get_diffraction(&image, na, nb, nc, config_nosfac,
 			                !config_nowater);
author	Thomas White <taw@physics.org>	2010-03-01 17:10:41 +0100
committer	Thomas White <taw@physics.org>	2010-03-01 17:52:03 +0100
commit	f213d4dc787d4d6edb8981c41138f4ace1e2f324 (patch)
tree	b9e3b4d59993474a6c5e4c769d3d4f75638b04c1
parent	8c01976e4bc6670c6f824479357c42caa18fb2ff (diff)