aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas White <taw@physics.org>2010-02-19 19:12:55 +0100
committerThomas White <taw@physics.org>2010-02-19 19:12:55 +0100
commitc7e0450702ab668cdcda34541e6bf815d50be8a8 (patch)
tree085d35a5b5b846572b9c043bbe7b15f01c0e3325
parentd8115c2fc3bc1c69b751f907e323acc45f6a758a (diff)
Add bandwidth to GPU calculation
Also: alter CPU version to be cleaner and give exactly the same results at GPU, and fix an indexing bug
-rw-r--r--data/diffraction.cl14
-rw-r--r--src/diffraction-gpu.c34
-rw-r--r--src/diffraction.c13
3 files changed, 38 insertions, 23 deletions
diff --git a/data/diffraction.cl b/data/diffraction.cl
index 138af028..41d331b3 100644
--- a/data/diffraction.cl
+++ b/data/diffraction.cl
@@ -127,11 +127,12 @@ float2 get_sfac(global float2 *sfacs, float16 cell, float4 q)
}
-kernel void diffraction(global float2 *diff, global float *tt, float k,
+kernel void diffraction(global float2 *diff, global float *tt, float klow,
int w, float cx, float cy,
float res, float clen, float16 cell,
global float2 *sfacs, float4 z, int4 ncells,
- int xmin, int ymin, int sampling, local float2 *tmp)
+ int xmin, int ymin, int sampling, local float2 *tmp,
+ float kstep)
{
float ttv;
const int x = get_global_id(0) + (xmin*sampling);
@@ -141,6 +142,8 @@ kernel void diffraction(global float2 *diff, global float *tt, float k,
float4 q;
const int lx = get_local_id(0);
const int ly = get_local_id(1);
+ const int lb = get_local_id(2);
+ float k = klow + kstep * get_local_id(2);
const int ax = x / sampling;
const int ay = y / sampling;
@@ -150,7 +153,7 @@ kernel void diffraction(global float2 *diff, global float *tt, float k,
f_molecule = get_sfac(sfacs, cell, q);
/* Write the value to local memory */
- tmp[lx+sampling*ly] = f_molecule * f_lattice;
+ tmp[lx+sampling*ly+sampling*sampling*lb] = f_molecule * f_lattice;
/* Memory fence */
barrier(CLK_LOCAL_MEM_FENCE);
@@ -161,9 +164,10 @@ kernel void diffraction(global float2 *diff, global float *tt, float k,
int i;
float2 sum = (0.0, 0.0);
- for ( i=0; i<sampling*sampling; i++ ) sum += tmp[i];
+ for ( i=0; i<sampling*sampling*get_local_size(2); i++ )
+ sum += tmp[i];
- diff[ax+w*ay] = sum / (sampling*sampling);
+ diff[ax+w*ay] = sum / (sampling*sampling*get_local_size(2));
/* Leader thread also records the 2theta value.
* This should really be averaged across all pixels, but
diff --git a/src/diffraction-gpu.c b/src/diffraction-gpu.c
index b9485241..133c0bc1 100644
--- a/src/diffraction-gpu.c
+++ b/src/diffraction-gpu.c
@@ -25,8 +25,8 @@
#define SAMPLING (4)
-#define BWSAMPLING (1)
-#define BANDWIDTH (0.0 / 100.0)
+#define BWSAMPLING (10)
+#define BANDWIDTH (1.0 / 100.0)
struct gpu_context
@@ -150,7 +150,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
double ax, ay, az;
double bx, by, bz;
double cx, cy, cz;
- float kc;
+ float k, klow;
cl_event *event;
int p;
float *tt_ptr;
@@ -160,6 +160,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
cl_float4 orientation;
cl_int4 ncells;
const int sampling = SAMPLING;
+ cl_float bwstep;
cell_get_cartesian(image->molecule->cell, &ax, &ay, &az,
&bx, &by, &bz,
@@ -169,7 +170,9 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
cell[6] = cx; cell[7] = cy; cell[8] = cz;
/* Calculate wavelength */
- kc = 1.0/image->lambda; /* Centre value */
+ k = 1.0/image->lambda; /* Centre value */
+ klow = k - k*(BANDWIDTH/2.0); /* Lower value */
+ bwstep = k * BANDWIDTH / BWSAMPLING;
/* Orientation */
orientation[0] = image->orientation.w;
@@ -192,7 +195,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
ERROR("Couldn't set arg 1: %s\n", clError(err));
return;
}
- clSetKernelArg(gctx->kern, 2, sizeof(cl_float), &kc);
+ clSetKernelArg(gctx->kern, 2, sizeof(cl_float), &klow);
if ( err != CL_SUCCESS ) {
ERROR("Couldn't set arg 2: %s\n", clError(err));
return;
@@ -228,26 +231,33 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
return;
}
/* Local memory for reduction */
- clSetKernelArg(gctx->kern, 15, SAMPLING*SAMPLING*2*sizeof(cl_float),
- NULL);
+ clSetKernelArg(gctx->kern, 15,
+ BWSAMPLING*SAMPLING*SAMPLING*2*sizeof(cl_float), NULL);
if ( err != CL_SUCCESS ) {
ERROR("Couldn't set arg 15: %s\n", clError(err));
return;
}
+ /* Bandwidth sampling step */
+ clSetKernelArg(gctx->kern, 16, sizeof(cl_float), &bwstep);
+ if ( err != CL_SUCCESS ) {
+ ERROR("Couldn't set arg 16: %s\n", clError(err));
+ return;
+ }
/* Iterate over panels */
event = malloc(image->det.n_panels * sizeof(cl_event));
for ( p=0; p<image->det.n_panels; p++ ) {
- size_t dims[2];
- size_t ldims[2] = {SAMPLING, SAMPLING};
+ size_t dims[3];
+ size_t ldims[3] = {SAMPLING, SAMPLING, BWSAMPLING};
/* In a future version of OpenCL, this could be done
* with a global work offset. But not yet... */
- dims[0] = image->det.panels[0].max_x-image->det.panels[0].min_x;
- dims[1] = image->det.panels[0].max_y-image->det.panels[0].min_y;
+ dims[0] = 1+image->det.panels[0].max_x-image->det.panels[0].min_x;
+ dims[1] = 1+image->det.panels[0].max_y-image->det.panels[0].min_y;
dims[0] *= SAMPLING;
dims[1] *= SAMPLING;
+ dims[2] = BWSAMPLING;
clSetKernelArg(gctx->kern, 4, sizeof(cl_float),
&image->det.panels[p].cx);
@@ -286,7 +296,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
return;
}
- err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 2, NULL,
+ err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 3, NULL,
dims, ldims, 0, NULL, &event[p]);
if ( err != CL_SUCCESS ) {
ERROR("Couldn't enqueue diffraction kernel: %s\n",
diff --git a/src/diffraction.c b/src/diffraction.c
index 876fac5d..6df00164 100644
--- a/src/diffraction.c
+++ b/src/diffraction.c
@@ -24,8 +24,8 @@
#define SAMPLING (4)
-#define BWSAMPLING (1)
-#define BANDWIDTH (0.0 / 100.0)
+#define BWSAMPLING (10)
+#define BANDWIDTH (1.0 / 100.0)
static double lattice_factor(struct rvec q, double ax, double ay, double az,
@@ -184,7 +184,7 @@ void get_diffraction(struct image *image, int na, int nb, int nc, int no_sfac)
double ax, ay, az;
double bx, by, bz;
double cx, cy, cz;
- float kc;
+ float k, klow, bwstep;
if ( image->molecule == NULL ) return;
@@ -206,7 +206,9 @@ void get_diffraction(struct image *image, int na, int nb, int nc, int no_sfac)
/* Needed later for Lorentz calculation */
image->twotheta = malloc(image->width * image->height * sizeof(double));
- kc = 1.0/image->lambda; /* Centre value */
+ k = 1.0/image->lambda; /* Centre value */
+ klow = k - k*(BANDWIDTH/2.0); /* Lower value */
+ bwstep = k * BANDWIDTH / BWSAMPLING;
for ( xs=0; xs<image->width*SAMPLING; xs++ ) {
for ( ys=0; ys<image->height*SAMPLING; ys++ ) {
@@ -229,8 +231,7 @@ void get_diffraction(struct image *image, int na, int nb, int nc, int no_sfac)
double complex val;
/* Calculate k this time round */
- k = kc + (kstep-(BWSAMPLING/2)) *
- kc*(BANDWIDTH/BWSAMPLING);
+ k = klow + kstep * bwstep;
q = get_q(image, xs, ys, SAMPLING, &twotheta, k);
image->twotheta[x + image->width*y] = twotheta;