/* CUDA 2-D convolution filter for interleaved 3-channel (B, G, R) 16-bit image buffers. */

#include <malloc.h>
#include <memory.h>
#include <stdio.h>
#include "convolution.h"

/* Thread-block width (threads along x) used when launching the convolution kernel. */
#define THREADS_X 32
/* Thread-block height (threads along y) used when launching the convolution kernel. */
#define THREADS_Y 16

/* Smaller of two values.  Each argument may be evaluated twice, so do not
   pass expressions with side effects. */
#define MIN(a,b) ((a) < (b) ? (a) : (b))

/* Limit a normalized channel value to the [0, 255] range. */
static __device__ float
convolution_clamp_channel (float value)
{
  if (value < 0) value = 0;
  if (value > 255) value = 255;
  return value;
}

/*
 * Convolve one pixel of an interleaved 3-channel image.
 *
 * Launch layout: a 2-D grid in which each thread handles the pixel at
 * (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y);
 * threads that land outside the width x height image exit immediately.
 *
 * in_rgb / out_rgb  device buffers holding 3 shorts per pixel, rows of
 *                   `width` pixels; channels are stored as B, G, R.
 * kernel            device array of kernelWidth * kernelHeight weights,
 *                   one row of kernelWidth weights after another.
 *
 * Neighbours outside the image are skipped, and the result is divided by the
 * sum of the weights that were actually used (or by 1 when that sum is not
 * positive) before being clamped to [0, 255].
 */
__global__ void
convolution_kernel(short *in_rgb, short *out_rgb, unsigned int width, unsigned int height,
                float *kernel, unsigned int kernelWidth, unsigned int kernelHeight)
{
  const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

  /* Surplus threads of the edge blocks have no pixel to compute. */
  if (col >= width || row >= height)
    return;

  const unsigned int halfWidth = kernelWidth / 2;
  const unsigned int halfHeight = kernelHeight / 2;

  float redAcc = 0, greenAcc = 0, blueAcc = 0, weightAcc = 0;

  for (unsigned int kx = 0; kx < kernelWidth; kx++)
    {
      /* Unsigned wrap-around followed by the conversion to int yields the
         signed neighbour column. */
      const int srcX = col + (kx - halfWidth);

      /* A column outside the image contributes nothing for any kernel row. */
      if (srcX < 0 || srcX >= width)
        continue;

      for (unsigned int ky = 0; ky < kernelHeight; ky++)
        {
          const int srcY = row + (ky - halfHeight);

          if (srcY >= 0 && srcY < height)
            {
              const unsigned int srcBase = 3 * (width * srcY + srcX);
              const short blue = in_rgb[srcBase + 0];
              const short green = in_rgb[srcBase + 1];
              const short red = in_rgb[srcBase + 2];

              const float weight = kernel[ky * kernelWidth + kx];

              redAcc += red * weight;
              greenAcc += green * weight;
              blueAcc += blue * weight;

              weightAcc += weight;
            }
        }
    }

  /* Avoid dividing by a zero or negative weight total. */
  if (weightAcc <= 0) weightAcc = 1;

  const unsigned int dstBase = 3 * (width * row + col);
  out_rgb[dstBase + 0] = (short) convolution_clamp_channel (blueAcc / weightAcc);
  out_rgb[dstBase + 1] = (short) convolution_clamp_channel (greenAcc / weightAcc);
  out_rgb[dstBase + 2] = (short) convolution_clamp_channel (redAcc / weightAcc);
}

/*
 * Compute the launch geometry needed to give every pixel of a
 * width x height image its own thread, using THREADS_X x THREADS_Y blocks.
 *
 * num_blocks_x  receives the number of blocks along x (columns).
 * num_blocks_y  receives the number of blocks along y (rows).
 * num_threads   receives the number of threads in one block.
 *
 * Both block counts are rounded up so a partially filled block still covers
 * the trailing pixels; a zero dimension yields a zero block count.
 */
void
get_grid_topology(unsigned int width, unsigned int height,
                  unsigned int *num_blocks_x, unsigned int *num_blocks_y,
                  unsigned int *num_threads)
{
  /* The quotient-plus-remainder form cannot overflow, unlike
     (n + THREADS - 1) / THREADS for n close to UINT_MAX. */
  *num_blocks_x = width / THREADS_X + (width % THREADS_X ? 1 : 0);
  /* The row count must come from the image height, not its width. */
  *num_blocks_y = height / THREADS_Y + (height % THREADS_Y ? 1 : 0);
  *num_threads = THREADS_X * THREADS_Y;
}

/* Report a failed CUDA runtime call on stderr.  Returns nonzero when STATUS
   is an error, zero when it is cudaSuccess. */
static int
convolution_cuda_failed (cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return 0;

  fprintf (stderr, "convolution_apply: %s failed: %s\n",
           what, cudaGetErrorString (status));
  return 1;
}

/*
 * Convolve an image in place on the GPU.
 *
 * rgb           host buffer of 3 * width * height shorts (3 interleaved
 *               channels per pixel); overwritten with the filtered image
 *               once all device work has succeeded.
 * width/height  image dimensions in pixels.
 * kernel        array of kernelHeight row pointers, each pointing at
 *               kernelWidth float weights.
 *
 * Returns 0 on success (including an empty image, which needs no work) and
 * -1 when an argument is invalid, the image is too large for the device
 * code's 32-bit indexing, or any CUDA call fails.  Device memory is released
 * on every path.
 */
int
convolution_apply (short *rgb, unsigned int width, unsigned int height,
                   float **kernel,
                   unsigned int kernelWidth, unsigned int kernelHeight)
{
  const unsigned int max_index = (unsigned int) -1;
  size_t rgb_size, kernel_size, kernel_row_size;
  unsigned int y;
  short *cuda_in_rgb = NULL, *cuda_out_rgb = NULL;
  float *cuda_kernel = NULL;
  unsigned int num_blocks_x, num_blocks_y, num_threads;
  int result = -1;

  if (rgb == NULL || kernel == NULL)
    return -1;

  /* An empty image has no pixels to filter, and a zero-sized grid is not a
     valid launch configuration. */
  if (width == 0 || height == 0)
    return 0;

  /* The device code indexes samples with 32-bit unsigned arithmetic, so
     refuse images whose 3 * width * height sample count would wrap. */
  if (width > (max_index / 3) / height)
    return -1;

  rgb_size = (size_t) 3 * width * height * sizeof (short);
  kernel_row_size = (size_t) kernelWidth * sizeof (float);
  kernel_size = kernel_row_size * kernelHeight;

  /* Input RGB buffer */
  if (convolution_cuda_failed (cudaMalloc ((void**)&cuda_in_rgb, rgb_size),
                               "cudaMalloc (input image)"))
    goto cleanup;
  if (convolution_cuda_failed (cudaMemcpy (cuda_in_rgb, rgb, rgb_size,
                                           cudaMemcpyHostToDevice),
                               "cudaMemcpy (input image)"))
    goto cleanup;

  /* Output RGB buffer, seeded with the input so that any pixel the launch
     does not write keeps its original value.  Copying on the device avoids
     a second host-to-device transfer. */
  if (convolution_cuda_failed (cudaMalloc ((void**)&cuda_out_rgb, rgb_size),
                               "cudaMalloc (output image)"))
    goto cleanup;
  if (convolution_cuda_failed (cudaMemcpy (cuda_out_rgb, cuda_in_rgb, rgb_size,
                                           cudaMemcpyDeviceToDevice),
                               "cudaMemcpy (output image seed)"))
    goto cleanup;

  /* Pass kernel to CUDA.  The host rows are separate allocations, so they
     are packed one after another into a single device array. */
  if (convolution_cuda_failed (cudaMalloc ((void**)&cuda_kernel, kernel_size),
                               "cudaMalloc (kernel)"))
    goto cleanup;
  for (y = 0; y < kernelHeight; ++y)
    {
      if (convolution_cuda_failed (cudaMemcpy (cuda_kernel + ((size_t) y * kernelWidth),
                                               kernel[y], kernel_row_size,
                                               cudaMemcpyHostToDevice),
                                   "cudaMemcpy (kernel row)"))
        goto cleanup;
    }

  /* Run kernels */
  get_grid_topology (width, height, &num_blocks_x, &num_blocks_y, &num_threads);
  {
    dim3 dim_grid(num_blocks_x, num_blocks_y);
    dim3 dim_block(THREADS_X, THREADS_Y);
    convolution_kernel <<< dim_grid, dim_block >>> (cuda_in_rgb, cuda_out_rgb, width, height,
       cuda_kernel, kernelWidth, kernelHeight);
  }
  /* A launch never returns a status itself; configuration errors are only
     visible through cudaGetLastError. */
  if (convolution_cuda_failed (cudaGetLastError (), "convolution_kernel launch"))
    goto cleanup;

  /* This blocking copy also waits for the kernel and reports any fault that
     occurred while it ran. */
  if (convolution_cuda_failed (cudaMemcpy (rgb, cuda_out_rgb, rgb_size,
                                           cudaMemcpyDeviceToHost),
                               "cudaMemcpy (result)"))
    goto cleanup;

  result = 0;

cleanup:
  /* Free memory; cudaFree accepts pointers that were never allocated (NULL). */
  cudaFree (cuda_in_rgb);
  cudaFree (cuda_out_rgb);
  cudaFree (cuda_kernel);

  return result;
}