4 files changed, 56 insertions, 204 deletions
diff --git a/data/diffraction.cl b/data/diffraction.cl
index f8be5179..f6318378 100644
--- a/data/diffraction.cl
+++ b/data/diffraction.cl
@@ -13,7 +13,7 @@
 /* Maxmimum index to hold values up to (can be increased if necessary)
  * WARNING: Altering this value constitutes an ABI change, and means you must
  * update src/pattern_sim.h then recompile and reinstall everything. */
-#define INDMAX 200
+#define INDMAX 120
 #define IDIM (INDMAX*2 +1)
 
 #ifndef M_PI
@@ -34,15 +34,13 @@ const sampler_t sampler_c = CLK_NORMALIZED_COORDS_TRUE
 
 float4 get_q(float fs, float ss, float res, float clen, float k,
              float *ttp, float corner_x, float corner_y,
-             float fsx, float fsy, float ssx, float ssy,
-             float xdiv, float ydiv)
+             float fsx, float fsy, float ssx, float ssy)
 {
 	float rx, ry, r;
 	float az, tt;
 	float4 q;
 	float xs, ys;
 	float kx, ky, kz;
-	float kxn, kyn, kzn;
 
 	xs = fs*fsx + ss*ssx;
 	ys = fs*fsy + ss*ssy;
@@ -57,20 +55,9 @@ float4 get_q(float fs, float ss, float res, float clen, float k,
 
 	az = atan2(ry, rx);
 
-	kxn = k*native_sin(tt)*native_cos(az);
-	kyn = k*native_sin(tt)*native_sin(az);
-	kzn = k*(native_cos(tt)-1.0);
-
-	/* x divergence */
-	kx =  kxn*native_cos(xdiv) +kzn*native_sin(xdiv);
-	ky =  kyn;
-	kz = -kxn*native_sin(xdiv) +kzn*native_cos(xdiv);
-	kxn = kx;  kyn = ky;  kzn = kz;
-
-	/* y divergence */
-	kx =  kxn;
-	ky =  kyn*cos(ydiv) +kzn*sin(ydiv);
-	kz = -kyn*sin(ydiv) +kzn*cos(ydiv);
+	kx = k*native_sin(tt)*native_cos(az);
+	ky = k*native_sin(tt)*native_sin(az);
+	kz = k*(native_cos(tt)-1.0);
 
 	q = (float4)(kx, ky, kz, 0.0);
 
@@ -224,81 +211,39 @@ float molecule_factor(global float *intensities, global float *flags,
 }
 
 
-kernel void diffraction(global float *diff, global float *tt, float klow,
+kernel void diffraction(global float *diff, global float *ttp, float k,
                         int w, float corner_x, float corner_y,
                         float res, float clen, float16 cell,
                         global float *intensities,
-                        int min_fs, int min_ss, int sampling, local float *tmp,
-                        float kstep,
                         read_only image2d_t func_a,
                         read_only image2d_t func_b,
                         read_only image2d_t func_c,
                         global float *flags,
-                        float fsx, float fsy, float ssx, float ssy,
-                        float divxlow, float divxstep, int divxsamp,
-                        float divylow, float divystep, int divysamp)
+                        float fsx, float fsy, float ssx, float ssy)
 {
-	float ttv;
+	float tt;
 	float fs, ss;
 	float f_lattice, I_lattice;
 	float I_molecule;
 	float4 q;
-	float k = klow + kstep * get_local_id(2);
 	float intensity;
-	const int ls0 = get_local_size(0);
-	const int ls1 = get_local_size(1);
-	const int ls2 = get_local_size(2) / (divxsamp*divysamp);
-	const int ls3 = divxsamp;
-	const int ls4 = divysamp;
-	const int li0 = get_local_id(0);
-	const int li1 = get_local_id(1);
-	const int li234 = get_local_id(2);
-	const int li2 = li234 / (ls3*ls4);
-	const int li234leftover = li234 % (ls3*ls4);
-	const int li3 = li234leftover / ls4;
-	const int li4 = li234leftover % ls4;
-	const int ls = ls0 * ls1 * ls2 * ls3 * ls4;
-	float xdiv = divxlow + divxstep*ls4;
-	float ydiv = divylow + divystep*ls3;
+	int idx;
 
 	/* Calculate fractional coordinates in fs/ss */
-	fs = convert_float(get_global_id(0)) / convert_float(sampling);
-	ss = convert_float(get_global_id(1)) / convert_float(sampling);
+	fs = convert_float(get_global_id(0));
+	ss = convert_float(get_global_id(1));
 
 	/* Get the scattering vector */
-	q = get_q(fs, ss, res, clen, k, &ttv,
-	          corner_x, corner_y, fsx, fsy, ssx, ssy, xdiv, ydiv);
+	q = get_q(fs, ss, res, clen, k, &tt,
+	          corner_x, corner_y, fsx, fsy, ssx, ssy);
 
 	/* Calculate the diffraction */
 	f_lattice = lattice_factor(cell, q, func_a, func_b, func_c);
 	I_molecule = molecule_factor(intensities, flags, cell, q);
 	I_lattice = pow(f_lattice, 2.0f);
 
-	/* Write the value to local memory */
-	intensity = I_molecule * I_lattice;
-	tmp[li0 + ls0*li1 + ls0*ls1*li2 + ls0*ls1*ls2*li3
-	        + ls0*ls1*ls2*ls3*li4] = intensity;
-
-	/* Memory fence */
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	/* Leader thread sums the values */
-	if ( li0 + li1 + li2 + li3 + li4 == 0 ) {
-
-		int i;
-		float sum = 0.0;
-		float val;
-		int idx;
-
-		idx = convert_int_rtz(fs) + w*convert_int_rtz(ss);
-
-		for ( i=0; i<ls; i++ ) sum += tmp[i];
-
-		val = sum / convert_float(ls);
-		diff[idx] = val;
-
-		/* Leader thread also records the 2theta value */
-		tt[idx] = ttv;
-
-	}
+	/* Write the value to memory */
+	idx = convert_int_rtz(fs) + w*convert_int_rtz(ss);
+	diff[idx] = I_molecule * I_lattice;
+	ttp[idx] = tt;
 }
diff --git a/src/diffraction-gpu.c b/src/diffraction-gpu.c
index b3950570..464adf02 100644
--- a/src/diffraction-gpu.c
+++ b/src/diffraction-gpu.c
@@ -34,9 +34,6 @@
 #include "pattern_sim.h"
 
 
-#define SAMPLING (4)
-#define BWSAMPLING (10)
-#define DIVSAMPLING (1)
 #define SINC_LUT_ELEMENTS (4096)
 
 
@@ -160,18 +157,12 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 	double ax, ay, az;
 	double bx, by, bz;
 	double cx, cy, cz;
-	float klow, khigh;
 	int i;
 	cl_float16 cell;
 	cl_int4 ncells;
-	const int sampling = SAMPLING;
-	cl_float bwstep;
 	int n_inf = 0;
 	int n_neg = 0;
-	cl_float divxlow, divxstep;
-	cl_float divylow, divystep;
 	int n_nan = 0;
-	int sprod;
 
 	if ( gctx == NULL ) {
 		ERROR("GPU setup failed.\n");
@@ -183,17 +174,6 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 	cell.s[3] = bx;  cell.s[4] = by;  cell.s[5] = bz;
 	cell.s[6] = cx;  cell.s[7] = cy;  cell.s[8] = cz;
 
-	/* Calculate wavelength */
-	klow = 1.0/(image->lambda*(1.0 + image->beam->bandwidth/2.0));
-	khigh = 1.0/(image->lambda*(1.0 - image->beam->bandwidth/2.0));
-	bwstep = (khigh-klow) / BWSAMPLING;
-
-	/* Calculate divergence stuff */
-	divxlow = -image->beam->divergence/2.0;
-	divylow = -image->beam->divergence/2.0;
-	divxstep = image->beam->divergence / DIVSAMPLING;
-	divystep = image->beam->divergence / DIVSAMPLING;
-
 	ncells.s[0] = na;
 	ncells.s[1] = nb;
 	ncells.s[2] = nc;
@@ -204,20 +184,12 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 	check_sinc_lut(gctx, nb);
 	check_sinc_lut(gctx, nc);
 
-	if ( set_arg_float(gctx, 2, klow) ) return;
+	if ( set_arg_float(gctx, 2, 1.0/image->lambda) ) return;
 	if ( set_arg_mem(gctx, 9, gctx->intensities) ) return;
-	if ( set_arg_int(gctx, 12, sampling) ) return;
-	if ( set_arg_float(gctx, 14, bwstep) ) return;
-	if ( set_arg_mem(gctx, 15, gctx->sinc_luts[na-1]) ) return;
-	if ( set_arg_mem(gctx, 16, gctx->sinc_luts[nb-1]) ) return;
-	if ( set_arg_mem(gctx, 17, gctx->sinc_luts[nc-1]) ) return;
-	if ( set_arg_mem(gctx, 18, gctx->flags) ) return;
-	if ( set_arg_float(gctx, 23, divxlow) ) return;
-	if ( set_arg_float(gctx, 24, divxstep) ) return;
-	if ( set_arg_int(gctx, 25, DIVSAMPLING) ) return;
-	if ( set_arg_float(gctx, 26, divylow) ) return;
-	if ( set_arg_float(gctx, 27, divystep) ) return;
-	if ( set_arg_int(gctx, 28, DIVSAMPLING) ) return;
+	if ( set_arg_mem(gctx, 10, gctx->sinc_luts[na-1]) ) return;
+	if ( set_arg_mem(gctx, 11, gctx->sinc_luts[nb-1]) ) return;
+	if ( set_arg_mem(gctx, 12, gctx->sinc_luts[nc-1]) ) return;
+	if ( set_arg_mem(gctx, 13, gctx->flags) ) return;
 
 	/* Unit cell */
 	err = clSetKernelArg(gctx->kern, 8, sizeof(cl_float16), &cell);
@@ -226,14 +198,6 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 		return;
 	}
 
-	/* Local memory for reduction */
-	sprod = BWSAMPLING*SAMPLING*SAMPLING*DIVSAMPLING*DIVSAMPLING;
-	err = clSetKernelArg(gctx->kern, 13, sprod*sizeof(cl_float), NULL);
-	if ( err != CL_SUCCESS ) {
-		ERROR("Couldn't set local memory: %s\n", clError(err));
-		return;
-	}
-
 	/* Allocate memory for the result */
 	image->data = calloc(image->width * image->height, sizeof(float));
 	image->twotheta = calloc(image->width * image->height, sizeof(double));
@@ -241,9 +205,8 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 	/* Iterate over panels */
 	for ( i=0; i<image->det->n_panels; i++ ) {
 
-		size_t dims[3];
-		size_t ldims[3] = {SAMPLING, SAMPLING,
-		                   BWSAMPLING * DIVSAMPLING * DIVSAMPLING};
+		size_t dims[2];
+		size_t ldims[2] = {1, 1};
 		struct panel *p;
 		cl_mem tt;
 		size_t tt_size;
@@ -282,18 +245,15 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 		if ( set_arg_float(gctx, 5, p->cny) ) return;
 		if ( set_arg_float(gctx, 6, p->res) ) return;
 		if ( set_arg_float(gctx, 7, p->clen) ) return;
-		if ( set_arg_int(gctx, 10, p->min_fs) ) return;
-		if ( set_arg_int(gctx, 11, p->min_ss) ) return;
-		if ( set_arg_float(gctx, 19, p->fsx) ) return;
-		if ( set_arg_float(gctx, 20, p->fsy) ) return;
-		if ( set_arg_float(gctx, 21, p->ssx) ) return;
-		if ( set_arg_float(gctx, 22, p->ssy) ) return;
-
-		dims[0] = pan_width * SAMPLING;
-		dims[1] = pan_height * SAMPLING;
-		dims[2] = BWSAMPLING * DIVSAMPLING * DIVSAMPLING;
-
-		err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 3, NULL,
+		if ( set_arg_float(gctx, 14, p->fsx) ) return;
+		if ( set_arg_float(gctx, 15, p->fsy) ) return;
+		if ( set_arg_float(gctx, 16, p->ssx) ) return;
+		if ( set_arg_float(gctx, 17, p->ssy) ) return;
+
+		dims[0] = pan_width;
+		dims[1] = pan_height;
+
+		err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 2, NULL,
 		                             dims, ldims, 0, NULL, NULL);
 		if ( err != CL_SUCCESS ) {
 			ERROR("Couldn't enqueue diffraction kernel: %s\n",
diff --git a/src/diffraction.c b/src/diffraction.c
index de994133..a37c2fcb 100644
--- a/src/diffraction.c
+++ b/src/diffraction.c
@@ -27,9 +27,6 @@
 #include "pattern_sim.h"
 
 
-#define SAMPLING (4)
-#define BWSAMPLING (10)
-#define DIVSAMPLING (1)
 #define SINC_LUT_ELEMENTS (4096)
 
 
@@ -355,11 +352,9 @@ void get_diffraction(struct image *image, int na, int nb, int nc,
 	double ax, ay, az;
 	double bx, by, bz;
 	double cx, cy, cz;
-	float klow, khigh, bwstep;
 	double *lut_a;
 	double *lut_b;
 	double *lut_c;
-	double divxlow, divylow, divxstep, divystep;
 
 	cell_get_cartesian(cell, &ax, &ay, &az, &bx, &by, &bz, &cx, &cy, &cz);
 
@@ -369,15 +364,6 @@ void get_diffraction(struct image *image, int na, int nb, int nc,
 	/* Needed later for Lorentz calculation */
 	image->twotheta = malloc(image->width * image->height * sizeof(double));
 
-	klow = 1.0/(image->lambda*(1.0 + image->beam->bandwidth/2.0));
-	khigh = 1.0/(image->lambda*(1.0 - image->beam->bandwidth/2.0));
-	bwstep = (khigh-klow) / BWSAMPLING;
-
-	divxlow = -image->beam->divergence/2.0;
-	divylow = -image->beam->divergence/2.0;
-	divxstep = image->beam->divergence / DIVSAMPLING;
-	divystep = image->beam->divergence / DIVSAMPLING;
-
 	lut_a = get_sinc_lut(na);
 	lut_b = get_sinc_lut(nb);
 	lut_c = get_sinc_lut(nc);
@@ -385,73 +371,34 @@ void get_diffraction(struct image *image, int na, int nb, int nc,
 	for ( fs=0; fs<image->width; fs++ ) {
 	for ( ss=0; ss<image->height; ss++ ) {
 
-		int fs_step, ss_step, kstep;
-		int divxval, divyval;
-		int idx = fs + image->width*ss;
-
-		for ( fs_step=0; fs_step<SAMPLING; fs_step++ ) {
-		for ( ss_step=0; ss_step<SAMPLING; ss_step++ ) {
-		for ( kstep=0; kstep<BWSAMPLING; kstep++ ) {
-		for ( divxval=0; divxval<DIVSAMPLING; divxval++ ) {
-		for ( divyval=0; divyval<DIVSAMPLING; divyval++ ) {
-
-			double k;
-			double intensity;
-			double f_lattice, I_lattice;
-			double I_molecule;
-			struct rvec q, qn;
-			double twotheta;
-			const double dfs = (double)fs
-			                    + ((double)fs_step / SAMPLING);
-			const double dss = (double)ss
-			                    + ((double)ss_step / SAMPLING);
-
-			double xdiv = divxlow + divxstep*(double)divxval;
-			double ydiv = divylow + divystep*(double)divyval;
-
-			/* Calculate k this time round */
-			k = klow + (double)kstep * bwstep;
-
-			qn = get_q(image, dfs, dss, &twotheta, k);
-
-			/* x divergence */
-			q.u =  qn.u*cos(xdiv) +qn.w*sin(xdiv);
-			q.v =  qn.v;
-			q.w = -qn.u*sin(xdiv) +qn.w*cos(xdiv);
+		int idx;
+		double k;
+		double intensity;
+		double f_lattice, I_lattice;
+		double I_molecule;
+		struct rvec q;
+		double twotheta;
 
-			qn = q;
+		/* Calculate k this time round */
+		k = 1.0/image->lambda;
 
-			/* y divergence */
-			q.v =  qn.v*cos(ydiv) +qn.w*sin(ydiv);
-			q.w = -qn.v*sin(ydiv) +qn.w*cos(ydiv);
+		q = get_q(image, fs, ss, &twotheta, k);
 
-			f_lattice = lattice_factor(q, ax, ay, az,
-			                              bx, by, bz,
-			                              cx, cy, cz,
-			                              lut_a, lut_b, lut_c);
+		f_lattice = lattice_factor(q, ax, ay, az,
+		                              bx, by, bz,
+		                              cx, cy, cz,
+		                              lut_a, lut_b, lut_c);
 
-			I_molecule = molecule_factor(intensities,
-			                             phases, flags, q,
-			                             ax,ay,az,bx,by,bz,cx,cy,cz,
-			                             m, sym);
+		I_molecule = molecule_factor(intensities,
+		                             phases, flags, q,
+		                             ax,ay,az,bx,by,bz,cx,cy,cz,
+		                             m, sym);
 
-			I_lattice = pow(f_lattice, 2.0);
-			intensity = I_lattice * I_molecule;
-
-			image->data[idx] += intensity;
-
-			if ( fs_step + ss_step + kstep == 0 ) {
-				image->twotheta[idx] = twotheta;
-			}
-
-		}
-		}
-		}
-		}
-		}
+		I_lattice = pow(f_lattice, 2.0);
 
-		image->data[idx] /= (SAMPLING*SAMPLING*BWSAMPLING
-		                     *DIVSAMPLING*DIVSAMPLING);
+		idx = fs + image->width*ss;
+		image->data[idx] = I_lattice * I_molecule;
+		image->twotheta[idx] = twotheta;
 
 
 	}
diff --git a/src/pattern_sim.h b/src/pattern_sim.h
index fb376596..5d31dadf 100644
--- a/src/pattern_sim.h
+++ b/src/pattern_sim.h
@@ -20,7 +20,7 @@
 /* Maxmimum index to hold values up to (can be increased if necessary)
  * WARNING: Altering this value constitutes an ABI change, and means you must
  * update data/diffraction.cl then recompile and reinstall everything. */
-#define INDMAX 200
+#define INDMAX 120
 
 /* Array size */
 #define IDIM (INDMAX*2 +1)