// Pasted code

#include <math.h>
#include <stdio.h>

#define DIM_SIGNAL 10u
#define DIM_TEMPLATE 3u

/*
 * DESIGN NOTES (planned CUDA parallelization of the CPU reference below)
 *
 * Template mean:
 *   The template size is a power of 2 (e.g. 8), but the mean is computed by
 *   a single thread.
 *
 * Template matching with cross-correlation:
 *   Each thread computes one window offset t, and every thread computes the
 *   mean of its own signal window in parallel:
 *     round 0:
 *       t0 - reads signal[t+0]
 *       t1 - reads signal[t+1]
 *       t2 - reads signal[t+2]
 *       t3 - [t+3]
 *       t4 - [t+4]
 *       ..
 *       t7 - [t+7]
 *   -> 8 threads for DIM_SIGNAL = 10, DIM_TEMPLATE = 3,
 *      i.e. DIM_SIGNAL - DIM_TEMPLATE + 1 threads.
 *
 *   How many rounds? 3, for DIM_TEMPLATE = 3.
 *
 *   So for a signal of 4096 = 2^12 samples and a template of 16 = 2^4:
 *     => 16 rounds, 4081 threads.
 *
 *   Optimization: use a signal of 4096 - 1 + dim_template = 4111 samples
 *     -> 4111 - 16 + 1 = 4096 threads -> 128 warps of 32 threads each.
 *
 *   Each thread goes on to compute the mean of its signal window, then does
 *   the same for the next loop (the one using l_S). The mean is already
 *   stored, so corr[t] is computed in DIM_TEMPLATE rounds.
 *
 *   Optimization: use a stream to send the template mean once it has been
 *   computed on the host.
 */
#if 0
// ---------------------------------------------------------------------------
// IMPLEMENTATION (draft of the CUDA port; disabled, never compiled or run).
// 4111 - 16 + 1 = corr[4096] -> 4096 threads.
// ---------------------------------------------------------------------------
#define DIM_SIGNAL 4111
#define DIM_TEMPLATE 16

// One thread per window offset; corr must hold dimSig - dimTem + 1 values.
__global__ void xCorr_Template_Matching(double* corr, const double* sig, const double* tem, double medTem, unsigned int dimTem, unsigned int dimSig)
{
	unsigned int thrd_offs = blockDim.x * blockIdx.x + threadIdx.x;

	// Threads beyond the last complete window have nothing to compute.
	if (dimTem == 0u || dimSig < dimTem || thrd_offs > dimSig - dimTem)
	{
		return;
	}

	// Mean of this thread's window (dimTem samples starting at thrd_offs).
	double medSig = 0.0;
	for (unsigned int i = 0; i < dimTem; ++i)
	{
		medSig += sig[thrd_offs + i];
	}
	medSig /= (double)dimTem;

	double sumNumarator = 0.0;
	double sumNumitor_S = 0.0;
	double sumNumitor_T = 0.0;

	for (unsigned int i = 0; i < dimTem; ++i)
	{
		double l_S = sig[thrd_offs + i] - medSig;
		double l_T = tem[i] - medTem;

		sumNumarator += l_S * l_T;
		sumNumitor_S += l_S * l_S;
		sumNumitor_T += l_T * l_T;
	}

	// Normalize by the product of the two norms (sqrt, not rsqrt).
	double numitor = sqrt(sumNumitor_S) * sqrt(sumNumitor_T);
	corr[thrd_offs] = (numitor > 0.0) ? (sumNumarator / numitor) : 0.0;
}

int main(void)
{
	cudaError_t cudaStatus;
	int rezultat = 1;

	unsigned int dimSignal = DIM_SIGNAL;
	unsigned int dimTemplate = DIM_TEMPLATE;
	unsigned int dimCorr = dimSignal - dimTemplate + 1;
	double *dev_signal = NULL, *hos_signal = NULL;
	double *dev_templa = NULL, *hos_templa = NULL;
	double *dev_corr = NULL, *hos_corr = NULL;
	double hos_medTemplate = 0.0;

	// 32 * 8 * 16 = 4096 threads in total.
	unsigned int thrd_per_block = 32 * 8; // 8 warps
	unsigned int numBlocks = (dimCorr + thrd_per_block - 1) / thrd_per_block; // 16
	dim3 dimGrid(numBlocks);
	dim3 dimBlock(thrd_per_block);

	cudaEvent_t start = NULL, stop = NULL;
	float elapsedTime = 0.0f;

	// Allocate memory for the vectors: signal, template, result.
	// Host buffers come from malloc; only device buffers use cudaMalloc.
	hos_signal = (double*)malloc(sizeof(double) * dimSignal);
	hos_templa = (double*)malloc(sizeof(double) * dimTemplate);
	hos_corr = (double*)malloc(sizeof(double) * dimCorr);
	if (hos_signal == NULL || hos_templa == NULL || hos_corr == NULL) {
		fprintf(stderr, "malloc failed!");
		goto Error;
	}

	cudaStatus = cudaMalloc((void**)&dev_signal, sizeof(double) * dimSignal);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}

	cudaStatus = cudaMalloc((void**)&dev_templa, sizeof(double) * dimTemplate);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}

	cudaStatus = cudaMalloc((void**)&dev_corr, sizeof(double) * dimCorr);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}

	// Initialize the signal with random values; initialize the template with
	// values in (-1, 1) (normalized).
	// TODO: doubleSignedRand, Rand, MIN and MAX are not defined in this file.
	for (unsigned int i = 0; i < dimSignal; i++)
	{
		hos_signal[i] = doubleSignedRand(MIN, MAX);
	}

	for (unsigned int i = 0; i < dimTemplate; i++)
	{
		hos_templa[i] = Rand(-1, 1);
		hos_medTemplate += hos_templa[i];
	}
	hos_medTemplate /= (double)dimTemplate;

	// Send the input data to the device (GPU) + initialize device memory.
	cudaStatus = cudaMemcpy(dev_signal, hos_signal, sizeof(double) * dimSignal, cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	cudaStatus = cudaMemcpy(dev_templa, hos_templa, sizeof(double) * dimTemplate, cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	cudaStatus = cudaMemset(dev_corr, 0, sizeof(double) * dimCorr);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemset failed!");
		goto Error;
	}

	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start, 0);

	// The template mean is a kernel argument passed by value, so it needs no
	// device buffer and no separate copy.
	xCorr_Template_Matching <<<dimGrid, dimBlock>>>(dev_corr, dev_signal, dev_templa, hos_medTemplate, dimTemplate, dimSignal);

	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "kernel launch failed!");
		goto Error;
	}

	cudaEventRecord(stop, 0);
	cudaStatus = cudaEventSynchronize(stop);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaEventSynchronize failed!");
		goto Error;
	}

	cudaStatus = cudaMemcpy(hos_corr, dev_corr, sizeof(double) * dimCorr, cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	for (unsigned int i = 0; i < dimCorr; ++i)
	{
		printf("%f ", hos_corr[i]);
	}
	printf("\n\n");

	cudaEventElapsedTime(&elapsedTime, start, stop);
	printf("Timp de executie varianta cu RAM: %f\n", elapsedTime);
	rezultat = 0;
	goto Succes;
Error:
	printf("Eroare!\n\n");
Succes:
	if (start != NULL) cudaEventDestroy(start);
	if (stop != NULL) cudaEventDestroy(stop);
	cudaFree(dev_corr);
	cudaFree(dev_signal);
	cudaFree(dev_templa);
	free(hos_corr);
	free(hos_signal);
	free(hos_templa);

	return rezultat;
}
#endif

/*
 * Normalized cross-correlation between one signal window and the template.
 *
 * window      - first sample of the window; dimTemplate samples are read
 * tmpl        - template samples; dimTemplate samples are read
 * dimTemplate - number of samples in both the window and the template
 * medTemplate - precomputed mean of the template
 *
 * Returns sum(l_S * l_T) / (sqrt(sum(l_S^2)) * sqrt(sum(l_T^2))), where l_S
 * and l_T are the mean-removed window and template samples. Returns 0.0 when
 * the denominator is zero (constant window or constant template) instead of
 * dividing by zero.
 */
static double windowCorrelation(const signed int *window, const signed int *tmpl,
                                unsigned int dimTemplate, double medTemplate)
{
	// Mean of the window: the sum covers dimTemplate samples, so that is
	// also the divisor.
	double medSignal = 0.0;
	for (unsigned int i = 0; i < dimTemplate; ++i)
	{
		medSignal += window[i];
	}
	medSignal /= (double)dimTemplate;

	double sumNumarator = 0.0;
	double sumNumitor_S = 0.0;
	double sumNumitor_T = 0.0;

	for (unsigned int i = 0; i < dimTemplate; ++i)
	{
		double l_S = window[i] - medSignal;
		double l_T = tmpl[i] - medTemplate;

		sumNumarator += l_S * l_T;
		sumNumitor_S += l_S * l_S;
		sumNumitor_T += l_T * l_T;
	}

	// Divide by the product of the norms; rsqrt would yield their reciprocals.
	double numitor = sqrt(sumNumitor_S) * sqrt(sumNumitor_T);
	return (numitor > 0.0) ? (sumNumarator / numitor) : 0.0;
}

/*
 * CPU reference: slides the template over a fixed test signal and prints the
 * normalized cross-correlation for each of the
 * DIM_SIGNAL - DIM_TEMPLATE + 1 window offsets.
 */
int main(void)
{
	unsigned int dimSignal = DIM_SIGNAL;
	unsigned int dimTemplate = DIM_TEMPLATE;
	signed int signal[DIM_SIGNAL] = { -1, 0, 1, 2, 3, 7, 5, 2, 7, 8 };
	signed int templa[DIM_TEMPLATE] = { 1, 2, 3 };

	double medTemplate = 0.0;
	// One result per window offset, including the last offset
	// dimSignal - dimTemplate, hence the + 1.
	double corr[DIM_SIGNAL - DIM_TEMPLATE + 1];
	unsigned int dimCorr = dimSignal - dimTemplate + 1u;

	// Template mean.
	for (unsigned int i = 0; i < dimTemplate; ++i)
	{
		medTemplate += templa[i];
	}
	medTemplate /= (double)dimTemplate;

	// Template matching with cross-correlation.
	for (unsigned int t = 0; t < dimCorr; ++t)
	{
		corr[t] = windowCorrelation(&signal[t], templa, dimTemplate, medTemplate);
	}

	fprintf(stdout, "corelatie:\n");

	for (unsigned int t = 0; t < dimCorr; ++t)
	{
		fprintf(stdout, "%f    ", corr[t]);
	}

	fprintf(stdout, "\n");

	return 0;
}


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Number of increment/decrement pairs each thread executes in the timed loop.
#define NR_ITER	100

// Benchmark variant selection: exactly one of the two macros must be defined.
//#define TIMP_EXEC__VAR_RAM 1
#define TIMP_EXEC__VAR_REG 1

// Reject the configuration where both variants are selected.
#if defined(TIMP_EXEC__VAR_RAM) && defined(TIMP_EXEC__VAR_REG)
#error "Alege RAM sau REG."
#endif

// Reject the configuration where no variant is selected.
#if !defined(TIMP_EXEC__VAR_RAM) && !defined(TIMP_EXEC__VAR_REG)
#error "Alege RAM sau REG."
#endif

// Note: xcor = sum[x] = sum(sig[x] - mean(sig[x], start, stop))


/*
 * Reverses an array across the whole grid: the element at global index `in`
 * of d_in is written to index (total - 1 - in) of d_out, where
 * total = gridDim.x * blockDim.x.
 *
 * Launch layout: 1-D grid of 1-D blocks. There is no bounds check, so both
 * buffers must hold at least gridDim.x * blockDim.x ints.
 *
 * Before the copy each thread executes NR_ITER increment/decrement pairs,
 * which leave the value unchanged:
 *   - TIMP_EXEC__VAR_RAM: directly on d_in[in];
 *   - TIMP_EXEC__VAR_REG: on a local copy (presumably kept in a register).
 *
 * NOTE(review): an optimizing compiler may collapse either loop, which would
 * make the two variants indistinguishable in timing; confirm against the
 * generated code before drawing conclusions from the measurements.
 */
__global__ void inversareArray(int *d_out, int *d_in)
{
	int total = blockDim.x * gridDim.x;
	int in = blockDim.x * blockIdx.x + threadIdx.x;
	// Mirror position of `in` within the whole array.
	int out = total - 1 - in;

	// Variant selection tests definedness, matching the file-level guard that
	// requires exactly one of the two macros to be defined.
#if defined(TIMP_EXEC__VAR_RAM)

	for (int i = 0; i < NR_ITER; ++i)
	{
		d_in[in]++;
		d_in[in]--;
	}

	d_out[out] = d_in[in];

#elif defined(TIMP_EXEC__VAR_REG)

	int loc_d_in = d_in[in];
	for (int i = 0; i < NR_ITER; ++i)
	{
		loc_d_in++;
		loc_d_in--;
	}

	d_out[out] = loc_d_in;

#endif
}

/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when `status` is cudaSuccess, false otherwise.
 */
static bool checkCuda(cudaError_t status, const char *what)
{
	if (status == cudaSuccess)
	{
		return true;
	}
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return false;
}

/*
 * Times one launch of inversareArray on a 256K-element array and verifies
 * that the result is the reversed input.
 *
 * Returns 0 when the run succeeded and the output is correct, 1 on any CUDA
 * error or on a verification mismatch. All resources are released on every
 * path.
 */
int main(int argc, char** argv)
{
	(void)argc;
	(void)argv;

	// Every variable is declared before the first goto so that no jump
	// bypasses an initialization.
	int exitCode = 1;

	// pointer to host memory
	int *h_a = NULL;
	// size of array
	int dimA = 256 * 1024; // 256K elements (1MB total)
	// pointers to device memory
	int *d_in = NULL, *d_out = NULL;
	// define block size
	int numThreadsPerBlock = 256;
	// number of blocks needed for the array size and block size
	int numBlocks = dimA / numThreadsPerBlock;
	size_t memSize = (size_t)numBlocks * numThreadsPerBlock * sizeof(int);

	dim3 dimGrid(numBlocks);
	dim3 dimBlock(numThreadsPerBlock);

	cudaEvent_t start = NULL, stop = NULL;
	float elapsedTime = 0.0f;
	bool corect = true;

	// allocate host and device memory
	if (!checkCuda(cudaMallocHost((void**)&h_a, memSize), "cudaMallocHost")) goto Error;
	if (!checkCuda(cudaMalloc((void**)&d_in, memSize), "cudaMalloc")) goto Error;
	if (!checkCuda(cudaMalloc((void**)&d_out, memSize), "cudaMalloc")) goto Error;

	// Initialize input array on host
	for (int i = 0; i < dimA; ++i)
	{
		h_a[i] = i;
	}

	// Copy host array to device array
	if (!checkCuda(cudaMemcpy(d_in, h_a, memSize, cudaMemcpyHostToDevice), "cudaMemcpy")) goto Error;

	if (!checkCuda(cudaEventCreate(&start), "cudaEventCreate")) goto Error;
	if (!checkCuda(cudaEventCreate(&stop), "cudaEventCreate")) goto Error;

	// launch kernel, bracketed by the two timing events
	if (!checkCuda(cudaEventRecord(start, 0), "cudaEventRecord")) goto Error;
	inversareArray <<< dimGrid, dimBlock >>>(d_out, d_in);
	// A launch does not return a status; launch errors surface here.
	if (!checkCuda(cudaGetLastError(), "inversareArray launch")) goto Error;
	if (!checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord")) goto Error;

	// Blocks until the kernel and the stop event have completed, so no
	// additional device-wide synchronization is needed.
	if (!checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize")) goto Error;

	if (!checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime")) goto Error;

#if defined(TIMP_EXEC__VAR_RAM)
	printf("Timp de executie varianta cu RAM: %f\n", elapsedTime);
#elif defined(TIMP_EXEC__VAR_REG)
	printf("Timp de executie varianta cu REG: %f\n", elapsedTime);
#endif

	// device to host copy
	if (!checkCuda(cudaMemcpy(h_a, d_out, memSize, cudaMemcpyDeviceToHost), "cudaMemcpy")) goto Error;

	// verify the data returned to the host is correct
	for (int i = 0; i < dimA; i++)
	{
		if (h_a[i] != (dimA - 1 - i))
		{
			printf("Gresit!\n");
			corect = false;
			break;
		}
	}

	// Report success only when every element matched.
	if (corect)
	{
		printf("Correct!\n");
		exitCode = 0;
	}
	goto Sfarsit;
Error:
	printf("Eroare!\n");
Sfarsit:
	// Release whatever was acquired, on success and failure alike.
	if (start != NULL) cudaEventDestroy(start);
	if (stop != NULL) cudaEventDestroy(stop);

	// free device memory (cudaFree accepts a null pointer)
	cudaFree(d_in);
	cudaFree(d_out);

	// free host memory
	if (h_a != NULL) cudaFreeHost(h_a);

	return exitCode;
}