Psst.. new poll here.
Psst.. new forums here.
Microsoft is blocking us again (TY IP Reputation!) so just use oauth login instead. :)
Paste
Pasted as Access logs by xzcv ( 7 years ago )
#define DIM_SIGNAL 10u
#define DIM_TEMPLATE 3u
/*
 * Host-only reference implementation of 1-D template matching using
 * normalized cross-correlation.
 *
 * For every offset t at which the template fits inside the signal
 * (t = 0 .. DIM_SIGNAL - DIM_TEMPLATE, i.e. DIM_SIGNAL - DIM_TEMPLATE + 1
 * offsets) it computes
 *
 *     corr[t] = sum((s_i - mean_s) * (p_i - mean_p))
 *               / sqrt(sum((s_i - mean_s)^2) * sum((p_i - mean_p)^2))
 *
 * where s_i are the signal samples in the window starting at t and p_i are
 * the template samples. The results are printed on stdout.
 *
 * Returns 0.
 */
int main(void)
{
    const unsigned int dimSignal = DIM_SIGNAL;
    const unsigned int dimTemplate = DIM_TEMPLATE;
    /* One output per valid window position, hence the "+ 1". */
    const unsigned int dimCorr = DIM_SIGNAL - DIM_TEMPLATE + 1u;

    signed int signal[DIM_SIGNAL] = { -1, 0, 1, 2, 3, 7, 5, 2, 7, 8 };
    signed int templa[DIM_TEMPLATE] = { 1, 2, 3 };
    double medSignal = 0.0;
    double medTemplate = 0.0;
    double corr[DIM_SIGNAL - DIM_TEMPLATE + 1u];

    // Compute the template mean.
    /* Parallelization note:
       the template size is a power of 2 ->> 8,
       but the mean is computed by a single thread. */
    for (unsigned int i = 0; i < dimTemplate; ++i)
    {
        medTemplate += templa[i];
    }
    medTemplate /= (double)dimTemplate;

    // Template matching with cross-correlation.
    /* Parallelization plan:
       each thread computes one offset t;
       each thread computes the signal mean of its window in parallel:
       round 0:
         t0 - accesses signal[t+0]
         t1 - accesses signal[t+1]
         t2 - accesses signal[t+2]
         t3 - [t+3]
         t4 - [t+4]
         ..
         t7 - [t+7]
         ____
         8 threads
       for:
         DIM_SIGNAL = 10
         DIM_TEMPLATE = 3
         -> DIM_SIGNAL - DIM_TEMPLATE + 1 threads.
       ____
       How many rounds?
         3, for DIM_TEMPLATE = 3;
         so for a signal of 4096 = 2^12
         and a template of 16 = 2^4
         => 16 rounds, 4081 threads.

       Optimization: use a signal of 4096 - 1 + dim_template = 4111
         -> 4111 - 16 + 1 = 4096 threads -> 128 warps of 32 threads each.

       Each thread continues and computes the signal mean,
       then does the same for the next loop, with l_S.
       It already has the mean saved; in DIM_TEMPLATE rounds it computes corr[t].
       Optimization: use a stream to send the template mean after it has been
       computed on the host.

       ________________________________________________________________________
       IMPLEMENTATION (commented-out sketch, kept as written)
       ________________________________________________________________________
       NOTE(review): problems visible in the sketch below, to fix before it is
       un-commented:
         - the kernel's mean loop reads sig[thrd_offs] without "+ i" and then
           divides by dimSig instead of dimTem;
         - the denominator uses rsqrt() (reciprocal square root) where a
           correlation coefficient needs sqrt();
         - the hos_* buffers are allocated with cudaMalloc and freed with
           cudaFree, yet they are written and read directly by host code;
         - *dev_medTemplate dereferences a device pointer on the host in the
           kernel launch; hos_medTemplate can be passed by value instead;
         - "unsigned int thrd_per_block = 32 * 8" is missing its semicolon;
         - cudaEventElapsedTime takes a float*, not a double*;
         - the allocation error messages mention cudaSetDevice;
         - doubleSignedRand, Rand, MIN and MAX are not defined in this file.

    // 4111 - 16 + 1 = corr[4096]. 4096 threads.
    #define DIM_SIGNAL 4111
    #define DIM_TEMPLATE 16
    __global__ void xCorr_Template_Matching(double* corr, double* sig, double* tem, double medTem, unsigned int dimTem, unsigned int dimSig)
    {
        unsigned int thrd_offs = blockDim.x * blockIdx.x + threadIdx.x;
        double medSig = 0.0;
        for(unsigned int i = 0; i < dimTem; ++i)
        {
            medSig += sig[thrd_offs];
        }
        medSig /= (double)dimSig;
        double sumNumarator = 0.0;
        double sumNumitor_S = 0.0;
        double sumNumitor_T = 0.0;
        for(unsigned int i = 0; i < dimTem; ++i)
        {
            double l_S = sig[thrd_offs + i] - medSig;
            double l_T = tem[i] - medTem;
            sumNumarator += l_S * l_T;
            sumNumitor_S += l_S * l_S;
            sumNumitor_T += l_T * l_T;
        }
        corr[thrd_offs] = sumNumarator / (rsqrt(sumNumitor_S) * rsqrt(sumNumitor_T));
    }
    int main(void)
    {
        cudaError_t cudaStatus;
        unsigned int dimSignal = DIM_SIGNAL;
        unsigned int dimTemplate = DIM_TEMPLATE;
        unsigned int dimCorr = dimSignal - dimTemplate + 1;
        double * dev_signal, *hos_signal;
        double * dev_templa, *hos_templa;
        double * dev_corr, *hos_corr;
        double hos_medTemplate = 0.0;
        double *dev_medTemplate;
        // Allocate memory for the vectors: signal, template, result.
        cudaStatus = cudaMalloc((void**)&hos_signal, sizeof(double)*dimSignal);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**)&dev_signal, sizeof(double)*dimSignal);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**)&hos_templa, sizeof(double)*dimTemplate);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**)&dev_templa, sizeof(double)*dimTemplate);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**)&hos_corr, sizeof(double)*dimCorr);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**)&dev_corr, sizeof(double)*dimCorr);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**)&dev_medTemplate, sizeof(double));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        // Initialize the signal with random values; initialize the template
        // with values in (-1, 1) (normalized).
        for(unsigned int i = 0; i < dimSignal; i++)
        {
            hos_signal[i] = doubleSignedRand(MIN, MAX);
        }
        for(unsigned int i = 0; i < dimTemplate; i++)
        {
            hos_templa[i] = Rand(-1, 1);
            hos_medTemplate += hos_templa[i];
        }
        hos_medTemplate /= (double)dimTemplate;
        // Send the input data to the device (GPU) and initialize the device
        // memory areas.
        cudaStatus = cudaMemcpy(dev_signal, hos_signal, sizeof(double)*dimSignal, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        cudaStatus = cudaMemcpy(dev_templa, hos_templa, sizeof(double)*dimTemplate, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        cudaStatus = cudaMemcpy(dev_corr, hos_corr, sizeof(double)*dimCorr, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        cudaStatus = cudaMemcpy(dev_medTemplate, &hos_medTemplate, sizeof(double), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        // 32 * 8 * 16 = 4096 threads in total.
        unsigned int thrd_per_block = 32 * 8 // 8 warps
        unsigned int bollocks = dimSignal / thrd_per_block; // 16
        dim3 dimGrid(bollocks);
        dim3 dimBlock(thrd_per_block);
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        xCorr_Template_Matching <<<dimGrid, dimBlock>>>(dev_corr, dev_signal, dev_templa, *dev_medTemplate, dimTemplate, dimSignal);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaThreadSynchronize();
        cudaStatus = cudaMemcpy(hos_corr, dev_corr, sizeof(double)*dimCorr, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        for(unsigned int i = 0; i < dimCorr; ++i)
        {
            printf("%f ", hos_corr[i]);
        }
        printf("\n\n");
        double elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
        printf("Timp de executie varianta cu RAM: %f\n", elapsedTime);
        goto Succes;
    Error:
        printf("Eroare!\n\n");
    Succes:
        cudaFree(dev_corr);
        cudaFree(dev_signal);
        cudaFree(dev_templa);
        cudaFree(hos_corr);
        cudaFree(hos_signal);
        cudaFree(hos_templa);
        cudaFree(dev_medTemplate);
        return 0;
    }
       ________________________________________________________________________
       ________________________________________________________________________
    */
    for (unsigned int t = 0; t < dimCorr; ++t)
    {
        // Mean of the dimTemplate signal samples covered by the window at t.
        medSignal = 0.0;
        for (unsigned int i = 0; i < dimTemplate; ++i)
        {
            medSignal += signal[i + t];
        }
        // The window holds dimTemplate samples, so that is the divisor.
        medSignal /= (double)dimTemplate;

        double sumNumarator = 0.0;  // sum of products of the centered samples
        double sumNumitor_S = 0.0;  // sum of squared centered signal samples
        double sumNumitor_T = 0.0;  // sum of squared centered template samples
        for (unsigned int i = 0; i < dimTemplate; ++i)
        {
            const double l_S = signal[i + t] - medSignal;
            const double l_T = templa[i] - medTemplate;
            sumNumarator += l_S * l_T;
            sumNumitor_S += l_S * l_S;
            sumNumitor_T += l_T * l_T;
        }

        // Normalize by the product of the two norms. A constant window or a
        // constant template has zero variance, which leaves the coefficient
        // undefined; report 0 in that case instead of dividing by zero.
        const double numitor = sqrt(sumNumitor_S * sumNumitor_T);
        corr[t] = (numitor > 0.0) ? (sumNumarator / numitor) : 0.0;
    }

    fprintf(stdout, "corelatie:\n");
    for (unsigned int t = 0; t < dimCorr; ++t)
    {
        fprintf(stdout, "%f ", corr[t]);
    }
    fprintf(stdout, "\n");
    return 0;
}
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define NR_ITER 100
//#define TIMP_EXEC__VAR_RAM 1
#define TIMP_EXEC__VAR_REG 1
#ifdef TIMP_EXEC__VAR_RAM
#ifdef TIMP_EXEC__VAR_REG
#error "Alege RAM sau REG."
#endif
#endif
#ifndef TIMP_EXEC__VAR_RAM
#ifndef TIMP_EXEC__VAR_REG
#error "Alege RAM sau REG."
#endif
#endif
// Note: xcor = sum[x] = sum(sig[x] - mean(sig[x], start, stop))
/*
 * Reverses an array of ints: the element at global index g of d_in is written
 * to d_out[total - 1 - g], where total = blockDim.x * gridDim.x.
 *
 * Before the store, each thread increments and then decrements its element
 * NR_ITER times. The build-time switch selects where that happens:
 *   - TIMP_EXEC__VAR_RAM: directly on d_in[in], through the pointer;
 *   - TIMP_EXEC__VAR_REG: on a local copy (presumably kept in a register).
 * Either way the value stored in d_out equals the value originally read.
 *
 * Launch requirements: 1-D grid and 1-D blocks. There is no bounds check, so
 * the launch must cover exactly as many threads as d_in and d_out have
 * elements.
 *
 * d_out: destination array, written once per thread.
 * d_in:  source array; also written in the TIMP_EXEC__VAR_RAM variant.
 */
__global__ void inversareArray(int *d_out, int *d_in)
{
    // First input index handled by this block.
    int inOffset = blockDim.x * blockIdx.x;
    // One past the last output index handled by this block (mirrored position).
    int outOffset = blockDim.x * gridDim.x - blockIdx.x * blockDim.x;
    int in = inOffset + threadIdx.x;
    int out = outOffset - threadIdx.x - 1;
#if defined(TIMP_EXEC__VAR_RAM)
    for (int i = 0; i < NR_ITER; ++i)
    {
        d_in[in]++;
        d_in[in]--;
    }
    d_out[out] = d_in[in];
#elif defined(TIMP_EXEC__VAR_REG)
    int loc_d_in = d_in[in];
    for (int i = 0; i < NR_ITER; ++i)
    {
        loc_d_in++;
        loc_d_in--;
    }
    d_out[out] = loc_d_in;
#endif
}
/* Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess. */
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Fills a 256K-element int array with 0..dimA-1, reverses it on the GPU with
 * inversareArray, prints the kernel time measured with CUDA events and checks
 * that the result is the reversed sequence.
 *
 * Returns 0 when the reversed array is correct, 1 on a CUDA error or on a
 * verification mismatch.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // Every variable is declared before the first goto so that no jump to
    // Cleanup crosses an initialization.
    const int dimA = 256 * 1024;         // 256K elements (1MB total)
    const int numThreadsPerBlock = 256;  // block size
    // Number of blocks needed for the array size and block size.
    const int numBlocks = dimA / numThreadsPerBlock;
    const size_t memSize = (size_t)numBlocks * numThreadsPerBlock * sizeof(int);

    int *h_a = NULL;                 // pointer to host memory
    int *d_in = NULL, *d_out = NULL; // pointers to device memory
    cudaEvent_t start = NULL, stop = NULL;
    float elapsedTime = 0.0f;
    bool finished = false;           // set once every CUDA call has succeeded
    bool correct = true;             // cleared on the first wrong element
    const dim3 dimGrid(numBlocks);
    const dim3 dimBlock(numThreadsPerBlock);

    // Allocate host and device memory.
    if (!cudaCallOk(cudaMallocHost((void**)&h_a, memSize), "cudaMallocHost")) goto Cleanup;
    if (!cudaCallOk(cudaMalloc((void**)&d_in, memSize), "cudaMalloc(d_in)")) goto Cleanup;
    if (!cudaCallOk(cudaMalloc((void**)&d_out, memSize), "cudaMalloc(d_out)")) goto Cleanup;

    // Initialize the input array on the host.
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // Copy the host array to the device.
    if (!cudaCallOk(cudaMemcpy(d_in, h_a, memSize, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)")) goto Cleanup;

    if (!cudaCallOk(cudaEventCreate(&start), "cudaEventCreate(start)")) goto Cleanup;
    if (!cudaCallOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) goto Cleanup;

    // Launch the kernel between the two timing events.
    if (!cudaCallOk(cudaEventRecord(start, 0), "cudaEventRecord(start)")) goto Cleanup;
    inversareArray <<< dimGrid, dimBlock >>>(d_out, d_in);
    // A launch does not return a status; launch errors are read back here.
    if (!cudaCallOk(cudaGetLastError(), "inversareArray launch")) goto Cleanup;
    if (!cudaCallOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")) goto Cleanup;
    if (!cudaCallOk(cudaEventSynchronize(stop), "cudaEventSynchronize")) goto Cleanup;
    // Block until the device has completed; also surfaces execution errors.
    if (!cudaCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) goto Cleanup;

    if (!cudaCallOk(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime")) goto Cleanup;
#if defined(TIMP_EXEC__VAR_RAM)
    printf("Timp de executie varianta cu RAM: %f\n", elapsedTime);
#elif defined(TIMP_EXEC__VAR_REG)
    printf("Timp de executie varianta cu REG: %f\n", elapsedTime);
#endif

    // Device to host copy.
    if (!cudaCallOk(cudaMemcpy(h_a, d_out, memSize, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)")) goto Cleanup;

    // Verify that the data returned to the host is the reversed sequence.
    for (int i = 0; i < dimA; i++)
    {
        if (h_a[i] != (dimA - 1 - i))
        {
            printf("Gresit!\n");
            correct = false;
            break;
        }
    }
    finished = true;

Cleanup:
    // Release whatever was acquired, on the success and the error path alike.
    if (start != NULL) cudaEventDestroy(start);
    if (stop != NULL) cudaEventDestroy(stop);
    if (d_in != NULL) cudaFree(d_in);
    if (d_out != NULL) cudaFree(d_out);
    if (h_a != NULL) cudaFreeHost(h_a);

    if (!finished)
    {
        printf("Eroare!\n");
        return 1;
    }
    if (correct)
    {
        printf("Correct!\n");
    }
    return correct ? 0 : 1;
}
Revise this Paste