/* * diffraction-gpu.c * * Calculate diffraction patterns by Fourier methods (GPU version) * * Copyright © 2012-2020 Deutsches Elektronen-Synchrotron DESY, * a research centre of the Helmholtz Association. * * Authors: * 2009-2015 Thomas White * 2013 Alexandra Tolstikova * 2013-2014 Chun Hong Yoon * * This file is part of CrystFEL. * * CrystFEL is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * CrystFEL is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with CrystFEL. If not, see . * */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #ifdef HAVE_CL_CL_H #include #else #include #endif #include "image.h" #include "utils.h" #include "cell.h" #include "diffraction.h" #include "cl-utils.h" #include "pattern_sim.h" #include "diffraction.cl.h" #define SINC_LUT_ELEMENTS (4096) struct gpu_context { cl_context ctx; cl_command_queue cq; cl_program prog; cl_kernel kern; cl_mem intensities; cl_mem flags; /* Array of sinc LUTs */ cl_mem *sinc_luts; cl_float **sinc_lut_ptrs; int max_sinc_lut; /* Number of LUTs, i.e. one greater than the maximum * index. This equals the highest allowable "n". */ }; static void check_sinc_lut(struct gpu_context *gctx, int n, int no_fringes, int flat) { cl_int err; cl_image_format fmt; int i; if ( n > gctx->max_sinc_lut ) { gctx->sinc_luts = realloc(gctx->sinc_luts, n*sizeof(*gctx->sinc_luts)); gctx->sinc_lut_ptrs = realloc(gctx->sinc_lut_ptrs, n*sizeof(*gctx->sinc_lut_ptrs)); for ( i=gctx->max_sinc_lut; isinc_lut_ptrs[i] = NULL; } gctx->max_sinc_lut = n; } if ( gctx->sinc_lut_ptrs[n-1] != NULL ) return; /* Create a new sinc LUT */ gctx->sinc_lut_ptrs[n-1] = malloc(SINC_LUT_ELEMENTS*sizeof(cl_float)); gctx->sinc_lut_ptrs[n-1][0] = n; if ( n == 1 ) { for ( i=1; isinc_lut_ptrs[n-1][i] = 1.0; } } else { for ( i=1; i 1.0/n) && (1.0-x > 1.0/n) ) { val = 0.0; } else if ( flat ) { val = n; } else { val = fabs(sin(M_PI*n*x)/sin(M_PI*x)); } gctx->sinc_lut_ptrs[n-1][i] = val; } } fmt.image_channel_order = CL_INTENSITY; fmt.image_channel_data_type = CL_FLOAT; gctx->sinc_luts[n-1] = clCreateImage2D(gctx->ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &fmt, SINC_LUT_ELEMENTS, 1, 0, gctx->sinc_lut_ptrs[n-1], &err); if ( err != CL_SUCCESS ) { ERROR("Couldn't create LUT for %i\n", n); return; } } static int set_arg_float(struct gpu_context *gctx, int idx, float val) { cl_int err; err = clSetKernelArg(gctx->kern, idx, sizeof(cl_float), &val); if ( err != CL_SUCCESS ) { ERROR("Couldn't set kernel argument %i: %s\n", idx, clError(err)); return 1; } return 0; } static int set_arg_int(struct gpu_context *gctx, int idx, int val) { cl_int err; err = clSetKernelArg(gctx->kern, idx, sizeof(cl_int), &val); if ( err != CL_SUCCESS ) { ERROR("Couldn't set kernel argument %i: %s\n", idx, clError(err)); return 1; } return 0; } static int set_arg_mem(struct gpu_context *gctx, int idx, cl_mem val) { cl_int err; err = clSetKernelArg(gctx->kern, idx, sizeof(cl_mem), &val); if ( err != CL_SUCCESS ) { ERROR("Couldn't set kernel argument %i: %s\n", idx, clError(err)); return 1; } return 0; } static int do_panels(struct gpu_context *gctx, struct image *image, double k, double weight, int *n_inf, int *n_neg, int *n_nan) { int i; const int sampling = 4; /* This, squared, number of samples / pixel */ if ( set_arg_float(gctx, 1, k) ) return 1; if ( set_arg_float(gctx, 2, weight) ) return 1; /* Iterate over panels */ for ( i=0; idet->n_panels; i++ ) { size_t dims[2]; size_t ldims[2]; struct panel *p; cl_mem diff; size_t diff_size; float *diff_ptr; int fs, ss; cl_int err; p = &image->det->panels[i]; /* Buffer for the results of this panel */ diff_size = p->w * p->h * sizeof(cl_float); diff = clCreateBuffer(gctx->ctx, CL_MEM_WRITE_ONLY, diff_size, NULL, &err); if ( err != CL_SUCCESS ) { ERROR("Couldn't allocate diffraction memory\n"); return 1; } if ( set_arg_mem(gctx, 0, diff) ) return 1; if ( set_arg_int(gctx, 3, p->w) ) return 1; if ( set_arg_float(gctx, 4, p->cnx) ) return 1; if ( set_arg_float(gctx, 5, p->cny) ) return 1; if ( set_arg_float(gctx, 6, p->fsx) ) return 1; if ( set_arg_float(gctx, 7, p->fsy) ) return 1; if ( set_arg_float(gctx, 8, p->fsz) ) return 1; if ( set_arg_float(gctx, 9, p->ssx) ) return 1; if ( set_arg_float(gctx, 10, p->ssy) ) return 1; if ( set_arg_float(gctx, 11, p->ssz) ) return 1; if ( set_arg_float(gctx, 12, p->res) ) return 1; if ( set_arg_float(gctx, 13, p->clen) ) return 1; dims[0] = p->w * sampling; dims[1] = p->h * sampling; ldims[0] = sampling; ldims[1] = sampling; err = clSetKernelArg(gctx->kern, 20, sampling*sampling*sizeof(cl_float), NULL); if ( err != CL_SUCCESS ) { ERROR("Couldn't set local memory: %s\n", clError(err)); return 1; } err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 2, NULL, dims, ldims, 0, NULL, NULL); if ( err != CL_SUCCESS ) { ERROR("Couldn't enqueue diffraction kernel: %s\n", clError(err)); return 1; } clFinish(gctx->cq); diff_ptr = clEnqueueMapBuffer(gctx->cq, diff, CL_TRUE, CL_MAP_READ, 0, diff_size, 0, NULL, NULL, &err); if ( err != CL_SUCCESS ) { ERROR("Couldn't map diffraction buffer: %s\n", clError(err)); return 1; } for ( ss=0; ssh; ss++ ) { for ( fs=0; fsw; fs++ ) { float val; val = diff_ptr[fs + p->w*ss]; if ( isinf(val) ) (*n_inf)++; if ( val < 0.0 ) (*n_neg)++; if ( isnan(val) ) (*n_nan)++; image->dp[i][fs + p->w*ss] += val; } } clEnqueueUnmapMemObject(gctx->cq, diff, diff_ptr, 0, NULL, NULL); clReleaseMemObject(diff); } return 0; } int get_diffraction_gpu(struct gpu_context *gctx, struct image *image, int na, int nb, int nc, UnitCell *ucell, int no_fringes, int flat, int n_samples) { double ax, ay, az; double bx, by, bz; double cx, cy, cz; cl_float16 cell; cl_int err; int n_inf = 0; int n_neg = 0; int n_nan = 0; int i; double kmin, kmax, step, norm; if ( gctx == NULL ) { ERROR("GPU setup failed.\n"); return 1; } /* Ensure all required LUTs are available */ check_sinc_lut(gctx, na, no_fringes, flat); check_sinc_lut(gctx, nb, no_fringes, flat); check_sinc_lut(gctx, nc, no_fringes, flat); /* Unit cell */ cell_get_cartesian(ucell, &ax, &ay, &az, &bx, &by, &bz, &cx, &cy, &cz); cell.s[0] = ax; cell.s[1] = ay; cell.s[2] = az; cell.s[3] = bx; cell.s[4] = by; cell.s[5] = bz; cell.s[6] = cx; cell.s[7] = cy; cell.s[8] = cz; err = clSetKernelArg(gctx->kern, 14, sizeof(cl_float16), &cell); if ( err != CL_SUCCESS ) { ERROR("Couldn't set unit cell: %s\n", clError(err)); return 1; } if ( set_arg_mem(gctx, 15, gctx->intensities) ) return 1; if ( set_arg_mem(gctx, 16, gctx->flags) ) return 1; if ( set_arg_mem(gctx, 17, gctx->sinc_luts[na-1]) ) return 1; if ( set_arg_mem(gctx, 18, gctx->sinc_luts[nb-1]) ) return 1; if ( set_arg_mem(gctx, 19, gctx->sinc_luts[nc-1]) ) return 1; /* Allocate memory for the result */ image->dp = malloc(image->det->n_panels * sizeof(float *)); if ( image->dp == NULL ) { ERROR("Couldn't allocate memory for result.\n"); return 1; } for ( i=0; idet->n_panels; i++ ) { struct panel *p = &image->det->panels[i]; image->dp[i] = calloc(p->w * p->h, sizeof(float)); if ( image->dp[i] == NULL ) { ERROR("Couldn't allocate memory for panel %i\n", i); return 1; } } spectrum_get_range(image->spectrum, &kmin, &kmax); step = (kmax-kmin)/(n_samples+1); /* Determine normalisation factor such that weights add up to 1 after * sampling (bins must have constant width) */ norm = 0.0; for ( i=1; i<=n_samples; i++ ) { double k = kmin + i*step; norm += spectrum_get_density_at_k(image->spectrum, k); } for ( i=1; i<=n_samples; i++ ) { double k = kmin + i*step; double prob; /* Probability = p.d.f. times step width */ prob = spectrum_get_density_at_k(image->spectrum, k)/norm; STATUS("Wavelength: %e m, weight = %.5f\n", 1.0/k, prob); err = do_panels(gctx, image, k, prob, &n_inf, &n_neg, &n_nan); if ( err ) return 1; } if ( n_neg + n_inf + n_nan ) { ERROR("WARNING: The GPU calculation produced %i negative" " values, %i infinities and %i NaNs.\n", n_neg, n_inf, n_nan); } return 0; } /* Setup the OpenCL stuff, create buffers, load the structure factor table */ struct gpu_context *setup_gpu(int no_sfac, const double *intensities, unsigned char *flags, const char *sym, int dev_num) { struct gpu_context *gctx; cl_uint nplat; cl_platform_id platforms[8]; cl_context_properties prop[3]; cl_int err; cl_device_id dev; size_t intensities_size; float *intensities_ptr; size_t flags_size; float *flags_ptr; size_t maxwgsize; int i; int have_ctx = 0; char cflags[512] = ""; char *insert_stuff = NULL; STATUS("Setting up GPU...\n"); err = clGetPlatformIDs(8, platforms, &nplat); if ( err != CL_SUCCESS ) { ERROR("Couldn't get platform IDs: %i\n", err); return NULL; } if ( nplat == 0 ) { ERROR("Couldn't find at least one platform!\n"); return NULL; } /* Find a GPU platform in the list */ for ( i=0; ictx = clCreateContextFromType(prop, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); if ( err != CL_SUCCESS ) { if ( err == CL_DEVICE_NOT_FOUND ) { /* No GPU device in this platform */ continue; /* Try next platform */ } else { ERROR("Couldn't create OpenCL context: %s (%i)\n", clError(err), err); free(gctx); return NULL; } } else { STATUS("Using OpenCL platform %i (%i total)\n", i, nplat); have_ctx = 1; break; } } if ( !have_ctx ) { ERROR("Couldn't find a GPU device in any platform.\n"); return NULL; } dev = get_cl_dev(gctx->ctx, dev_num); gctx->cq = clCreateCommandQueue(gctx->ctx, dev, 0, &err); if ( err != CL_SUCCESS ) { ERROR("Couldn't create OpenCL command queue\n"); free(gctx); return NULL; } /* Create a single-precision version of the scattering factors */ intensities_size = IDIM*IDIM*IDIM*sizeof(cl_float); intensities_ptr = malloc(intensities_size); if ( intensities != NULL ) { for ( i=0; iintensities = clCreateBuffer(gctx->ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, intensities_size, intensities_ptr, &err); if ( err != CL_SUCCESS ) { ERROR("Couldn't allocate intensities memory\n"); free(gctx); return NULL; } free(intensities_ptr); if ( sym != NULL ) { int i, n; SymOpList *pg; size_t islen = 0; insert_stuff = malloc(16384); if ( insert_stuff == NULL ) return NULL; insert_stuff[0] = '\0'; pg = get_pointgroup(sym); n = num_equivs(pg, NULL); for ( i=0; i 16383 ) { ERROR("Too many symmetry operators.\n"); return NULL; } strcat(insert_stuff, line); } free_symoplist(pg); printf("Inserting --->%s<---\n", insert_stuff); } else { if ( intensities != NULL ) { ERROR("You gave me an intensities file but no point" " group. I'm assuming '1'.\n"); strncat(cflags, "-DPG1 ", 511-strlen(cflags)); } } /* Create a flag array */ flags_size = IDIM*IDIM*IDIM*sizeof(cl_float); flags_ptr = malloc(flags_size); if ( flags != NULL ) { for ( i=0; iflags = clCreateBuffer(gctx->ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, flags_size, flags_ptr, &err); if ( err != CL_SUCCESS ) { ERROR("Couldn't allocate flag buffer\n"); free(gctx); return NULL; } free(flags_ptr); gctx->prog = load_program_from_string((char *)data_diffraction_cl, data_diffraction_cl_len, gctx->ctx, dev, &err, cflags, insert_stuff); if ( err != CL_SUCCESS ) { free(gctx); return NULL; } gctx->kern = clCreateKernel(gctx->prog, "diffraction", &err); if ( err != CL_SUCCESS ) { ERROR("Couldn't create kernel\n"); free(gctx); return NULL; } gctx->max_sinc_lut = 0; gctx->sinc_lut_ptrs = NULL; gctx->sinc_luts = NULL; clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxwgsize, NULL); STATUS("Maximum work group size = %lli\n", (long long int)maxwgsize); return gctx; } void cleanup_gpu(struct gpu_context *gctx) { int i; clReleaseProgram(gctx->prog); clReleaseMemObject(gctx->intensities); /* Release LUTs */ for ( i=1; i<=gctx->max_sinc_lut; i++ ) { if ( gctx->sinc_lut_ptrs[i-1] != NULL ) { clReleaseMemObject(gctx->sinc_luts[i-1]); free(gctx->sinc_lut_ptrs[i-1]); } } free(gctx->sinc_luts); free(gctx->sinc_lut_ptrs); clReleaseCommandQueue(gctx->cq); clReleaseContext(gctx->ctx); free(gctx); }