/*
 * Paste-site page header (not part of the program):
 *   Psst.. new poll here.
 *   Psst.. new forums here.
 *   Microsoft is blocking us again (TY IP Reputation!) so just use oauth login instead. :)
 *   Paste
 *   Pasted as C by nazgul (16 years ago)
 */
#include <malloc.h>
#include <memory.h>
#include <stdio.h>
#include "convolution.h"
#define THREADS_X 32
#define THREADS_Y 16
#define MIN(a,b) ((a) < (b) ? (a) : (b))
/*
 * Divides an accumulated channel sum by the accumulated kernel weight,
 * clamps the quotient to [0, 255] and truncates it to a short.
 */
static __device__ short
convolution_finish_channel(float channelSum, float weight)
{
    float value = channelSum / weight;
    value = (value < 0) ? 0.0f : value;
    value = (value > 255) ? 255.0f : value;
    return (short)value;
}

/*
 * Convolves one pixel of an interleaved 3-channel image per thread.
 *
 * Launch layout: 2-D grid of 2-D blocks; the thread with global index
 * (x, y) produces output pixel (x, y). Threads whose index falls outside
 * width x height do nothing, so the grid may overshoot the image.
 * No shared memory is used.
 *
 * in_rgb / out_rgb  - 3 shorts per pixel, rows of `width` pixels. The three
 *                     channels are processed identically; the local names
 *                     treat offsets 0/1/2 as blue/green/red.
 * kernel            - kernelWidth * kernelHeight weights, indexed as
 *                     kernel[ky * kernelWidth + kx].
 *
 * Neighbours that fall outside the image are skipped, and only the weights
 * of the neighbours actually used contribute to the normalisation factor.
 * A non-positive weight total is replaced by 1 before dividing.
 */
__global__ void
convolution_kernel(short *in_rgb, short *out_rgb, unsigned int width, unsigned int height,
                   float *kernel, unsigned int kernelWidth, unsigned int kernelHeight)
{
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < width && row < height)
    {
        const unsigned int halfW = kernelWidth / 2;
        const unsigned int halfH = kernelHeight / 2;
        float sumB = 0.0f, sumG = 0.0f, sumR = 0.0f;
        float sumW = 0.0f;

        for (unsigned int kx = 0; kx < kernelWidth; ++kx)
        {
            /* Unsigned wrap-around followed by the int conversion yields the
               signed neighbour coordinate (negative left of the image). */
            const int srcX = col + (kx - halfW);
            const bool colInside = (srcX >= 0) && ((unsigned int)srcX < width);

            for (unsigned int ky = 0; ky < kernelHeight; ++ky)
            {
                const int srcY = row + (ky - halfH);
                const bool rowInside = (srcY >= 0) && ((unsigned int)srcY < height);

                if (colInside && rowInside)
                {
                    const unsigned int src = 3 * (width * srcY + srcX);
                    const float w = kernel[ky * kernelWidth + kx];

                    sumR += in_rgb[src + 2] * w;
                    sumG += in_rgb[src + 1] * w;
                    sumB += in_rgb[src + 0] * w;
                    sumW += w;
                }
            }
        }

        /* Avoid dividing by zero (or flipping signs) when the used weights
           do not add up to a positive value. */
        const float norm = (sumW <= 0) ? 1.0f : sumW;
        const unsigned int dst = 3 * (width * row + col);

        out_rgb[dst + 0] = convolution_finish_channel(sumB, norm);
        out_rgb[dst + 1] = convolution_finish_channel(sumG, norm);
        out_rgb[dst + 2] = convolution_finish_channel(sumR, norm);
    }
}
/*
 * Computes the launch geometry needed to cover a width x height image with
 * THREADS_X x THREADS_Y thread blocks.
 *
 * width, height - image dimensions in pixels.
 * num_blocks_x  - out: blocks along X, i.e. ceil(width  / THREADS_X).
 * num_blocks_y  - out: blocks along Y, i.e. ceil(height / THREADS_Y).
 * num_threads   - out: threads per block (THREADS_X * THREADS_Y).
 *
 * The Y block count is derived from `height`; deriving it from `width`
 * leaves the bottom rows of tall images uncovered and launches idle
 * blocks for wide ones.
 */
void
get_grid_topology(unsigned int width, unsigned int height,
                  unsigned int *num_blocks_x, unsigned int *num_blocks_y,
                  unsigned int *num_threads)
{
    /* Quotient-plus-remainder form of ceil-div: cannot overflow, unlike
       (n + d - 1) / d for n close to UINT_MAX. */
    *num_blocks_x = width / THREADS_X + (width % THREADS_X ? 1 : 0);
    *num_blocks_y = height / THREADS_Y + (height % THREADS_Y ? 1 : 0);
    *num_threads = THREADS_X * THREADS_Y;
}
/*
 * Convolves an interleaved 3-channel image with a 2-D kernel on the GPU and
 * writes the result back into `rgb`.
 *
 * rgb           - host buffer of 3 * width * height shorts; read as the
 *                 input and overwritten with the result on success.
 * width, height - image dimensions in pixels.
 * kernel        - host array of kernelHeight row pointers, each pointing to
 *                 kernelWidth floats.
 * kernelWidth, kernelHeight - kernel dimensions.
 *
 * Returns 0 on success (including the trivial empty-image case) and -1 if
 * any CUDA call fails; in the failure case a message is written to stderr
 * and all device allocations made so far are released.
 */
int
convolution_apply (short *rgb, unsigned int width, unsigned int height,
                   float **kernel,
                   unsigned int kernelWidth, unsigned int kernelHeight)
{
    size_t rgb_size, kernel_row_size;
    unsigned int y;
    short *cuda_in_rgb = NULL, *cuda_out_rgb = NULL;
    float *cuda_kernel = NULL;
    unsigned int num_blocks_x, num_blocks_y, num_threads;
    cudaError_t err;

    /* An empty image has nothing to process, and a grid with a zero
       dimension is not a valid launch configuration. */
    if (width == 0 || height == 0)
        return 0;

    /* Compute byte sizes in size_t so large images do not wrap around. */
    rgb_size = (size_t)3 * width * height * sizeof(short);
    kernel_row_size = (size_t)kernelWidth * sizeof(float);

    /* Input RGB buffer */
    if ((err = cudaMalloc ((void**)&cuda_in_rgb, rgb_size)) != cudaSuccess)
        goto cleanup;
    if ((err = cudaMemcpy (cuda_in_rgb, rgb, rgb_size,
                           cudaMemcpyHostToDevice)) != cudaSuccess)
        goto cleanup;

    /* Output RGB buffer, pre-filled with the input so that any pixel the
       kernel does not write keeps its original value. */
    if ((err = cudaMalloc ((void**)&cuda_out_rgb, rgb_size)) != cudaSuccess)
        goto cleanup;
    if ((err = cudaMemcpy (cuda_out_rgb, rgb, rgb_size,
                           cudaMemcpyHostToDevice)) != cudaSuccess)
        goto cleanup;

    /* Pass kernel to CUDA: the host kernel is an array of separate rows, so
       it is flattened row by row into one contiguous device array. */
    if ((err = cudaMalloc ((void**)&cuda_kernel,
                           kernel_row_size * kernelHeight)) != cudaSuccess)
        goto cleanup;
    for (y = 0; y < kernelHeight; ++y)
    {
        err = cudaMemcpy (cuda_kernel + ((size_t)y * kernelWidth), kernel[y],
                          kernel_row_size, cudaMemcpyHostToDevice);
        if (err != cudaSuccess)
            goto cleanup;
    }

    /* Run kernels */
    get_grid_topology (width, height, &num_blocks_x, &num_blocks_y, &num_threads);
    {
        dim3 dim_grid(num_blocks_x, num_blocks_y);
        dim3 dim_block(THREADS_X, THREADS_Y);
        convolution_kernel <<< dim_grid, dim_block >>> (cuda_in_rgb, cuda_out_rgb, width, height,
                                                        cuda_kernel, kernelWidth, kernelHeight);
    }
    /* A launch does not return a status; configuration errors surface here. */
    if ((err = cudaGetLastError ()) != cudaSuccess)
        goto cleanup;

    /* Blocking copy: waits for the kernel and reports any execution error. */
    err = cudaMemcpy (rgb, cuda_out_rgb, rgb_size, cudaMemcpyDeviceToHost);

cleanup:
    /* Free memory (cudaFree accepts NULL for buffers never allocated). */
    cudaFree (cuda_in_rgb);
    cudaFree (cuda_out_rgb);
    cudaFree (cuda_kernel);

    if (err != cudaSuccess)
    {
        fprintf (stderr, "convolution_apply: CUDA error: %s\n",
                 cudaGetErrorString (err));
        return -1;
    }
    return 0;
}
/* Paste-site page footer (not part of the program): Revise this Paste */