05 May 2018

(1) Use both the CPU and the GPU to compute the matrix product A*B = C, where A, B and C are square matrices of size 1000*1000 or another size such as 500*500 or 20*20. The program must report the maximum error, the average error and the GFLOPS achieved.

(2) To reduce the errors, apply Kahan's summation formula and show all of the results mentioned above (a minimal sketch of the formula follows this list).

(3) To raise the GFLOPS, copy the matrices from main memory to global memory with the pitched-memory routines and stage the data in per-block shared memory, then show all of the results mentioned above again.
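
For requirement (1), the GFLOPS figure is simply 2*n^3 floating-point operations divided by the elapsed time in seconds and by 10^9. For requirement (2), Kahan's summation formula keeps a small compensation value that carries the low-order bits lost in each floating-point addition. A minimal scalar sketch of the idea (the helper name kahan_dot is only for illustration and is not part of my program):

float kahan_dot(const float* x, const float* y, int n)
{
    float sum  = 0.0f;   /* running sum */
    float comp = 0.0f;   /* compensation: low-order bits lost so far */
    for (int i = 0; i < n; i++)
    {
        float term = x[i] * y[i] - comp;   /* correct the new term by the accumulated error */
        float t    = sum + term;           /* big + small: low-order bits of term are lost here */
        comp = (t - sum) - term;           /* recover what was just lost */
        sum  = t;
    }
    return sum;
}

The kernels matMultCUDA_2 onward below implement this same compensated sum, only with the subtraction folded into the compensation variable.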

My CUDA program below runs, but the output is not so good -

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <time.h>

#include <windows.h>

#include <cuda_runtime.h>

#include <device_launch_parameters.h>

#define BLOCK_SIZE 16

#define NUM_THREADS 256
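
/* Kernel 1: naive multiply - each thread computes one element of C from a row of A and a column of B */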

__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

const int tid = threadIdx.x;

const int bid = blockIdx.x;

const int idx = bid * blockDim.x + tid;

const int row = idx / n;

const int column = idx % n;

int i;

if (row < n && column < n)

{

float t = 0;

for (i = 0; i < n; i++)

{

t += a[row * lda + i] * b[i * ldb + column];

}

c[row * ldc + column] = t;

}

}

clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

cudaMalloc((void**)&ac, sizeof(float)* n * n);

cudaMalloc((void**)&bc, sizeof(float)* n * n);

cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;

matMultCUDA<<<blocks * n, NUM_THREADS>>>(ac, n, bc, n, cc, n, n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
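
/* Kernel 2: same thread mapping as kernel 1, but the inner product is accumulated with Kahan's compensated summation */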

__global__ static void matMultCUDA_2(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

const int tid = threadIdx.x;

const int bid = blockIdx.x;

const int idx = bid * blockDim.x + tid;

const int row = idx / n;

const int column = idx % n;

int i;

if (row < n && column < n)

{

float t = 0;

float y = 0;

for (i = 0; i < n; i++)

{

float r;

y -= a[row * lda + i] * b[i * ldb + column];

r = t - y;

y = (r - t) + y;

t = r;

}

c[row * ldc + column] = t;

}

}

clock_t matmultCUDA_2(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

cudaMalloc((void**)&ac, sizeof(float)* n * n);

cudaMalloc((void**)&bc, sizeof(float)* n * n);

cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;

matMultCUDA_2<<<blocks * n, NUM_THREADS>>>(ac, n, bc, n, cc, n, n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
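
/* Kernel 3: one block per row of C; the row of A is staged in dynamically sized shared memory and reused by all threads in the block */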

__global__ static void matMultCUDA_3(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

extern __shared__ float data[];

const int tid = threadIdx.x;

const int row = blockIdx.x;

int i, j;

for (i = tid; i < n; i += blockDim.x)

{

data[i] = a[row * lda + i];

}

__syncthreads();

for (j = tid; j < n; j += blockDim.x)

{

float t = 0;

float y = 0;

for (i = 0; i < n; i++) {

float r;

y -= data[i] * b[i * ldb + j];

r = t - y;

y = (r - t) + y;

t = r;

}

c[row * ldc + j] = t;

}

}

clock_t matmultCUDA_3(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

cudaMalloc((void**)&ac, sizeof(float)* n * n);

cudaMalloc((void**)&bc, sizeof(float)* n * n);

cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;

matMultCUDA_3<<<n, NUM_THREADS, sizeof(float) * n>>>(ac, n, bc, n, cc, n, n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
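
/* Kernel 4: same kernel body as kernel 3; the difference is that the host wrapper below allocates the device matrices with cudaMallocPitch */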

__global__ static void matMultCUDA_4(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

extern __shared__ float data[];

const int tid = threadIdx.x;

const int row = blockIdx.x;

int i, j;

for (i = tid; i < n; i += blockDim.x)

{

data[i] = a[row * lda + i];

}

__syncthreads();

for (j = tid; j < n; j += blockDim.x)

{

float t = 0;

float y = 0;

for (i = 0; i < n; i++)

{

float r;

y -= data[i] * b[i * ldb + j];

r = t - y;

y = (r - t) + y;

t = r;

}

c[row * ldc + j] = t;

}

}

clock_t matmultCUDA_4(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

size_t pitch_a, pitch_b, pitch_c;

cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);

cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);

cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;

matMultCUDA_4<<<n, NUM_THREADS, sizeof(float) * n>>>(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
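
/* Kernel 5: 2-D grid; BLOCK_SIZE x BLOCK_SIZE tiles of A and B are staged in shared memory, still with Kahan compensation in the inner loop */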

__global__ static void matMultCUDA_5(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];

__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];

const int tidc = threadIdx.x;

const int tidr = threadIdx.y;

const int bidc = blockIdx.x * BLOCK_SIZE;

const int bidr = blockIdx.y * BLOCK_SIZE;

int i, j;

float results = 0;

float comp = 0;

for (j = 0; j < n; j += BLOCK_SIZE)

{

if (tidr + bidr < n && tidc + j < n)

{

matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];

}

else

{

matA[tidr][tidc] = 0;

}

if (tidr + j < n && tidc + bidc < n)

{

matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];

}

else

{

matB[tidr][tidc] = 0;

}

__syncthreads();

for (i = 0; i < BLOCK_SIZE; i++)

{

float t;

comp -= matA[tidr][i] * matB[i][tidc];

t = results - comp;

comp = (t - results) + comp;

results = t;

}

__syncthreads();

}

if (tidr + bidr < n && tidc + bidc < n)

{

c[(tidr + bidr) * ldc + tidc + bidc] = results;

}

}

clock_t matmultCUDA_5(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

size_t pitch_a, pitch_b, pitch_c;

cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);

cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);

cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

dim3 blocks(bx, bx);

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

matMultCUDA_5<<<blocks, threads>>>(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
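
/* CPU reference multiply with a double accumulator, used to check the GPU results */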

void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

int i, j, k;

for (i = 0; i < n; i++)

{

for (j = 0; j < n; j++)

{

double t = 0;

for (k = 0; k < n; k++)

{

t += a[i * lda + k] * b[k * ldb + j];

}

c[i * ldc + j] = t;

}

}

}
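
/* Fill an n x n matrix with random values roughly in [0, 1] */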

void matgen(float* a, int lda, int n)

{

int i, j;

for (i = 0; i < n; i++)

{

for (j = 0; j < n; j++) {

a[i * lda + j] = (float)rand() / RAND_MAX + (float)rand() / (RAND_MAX * RAND_MAX);

}

}

}
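
/* Compare matrix a against reference matrix b and print the maximum and average relative error */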

void compare_mat(const float* a, int lda, const float* b, int ldb, int n)

{

float max_err = 0;

float average_err = 0;

int i, j;

for (i = 0; i < n; i++)

{

for (j = 0; j < n; j++)

{

if (b[i * ldb + j] != 0)

{

float err = fabs((a[i * lda + j] - b[i * ldb + j]) / b[i * ldb + j]);

if (max_err < err) max_err = err;

average_err += err;

}

}

}

printf("Max error: %g Average error: %g\n", max_err, average_err / (n * n));

}
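
/* Select the first CUDA-capable device; return false if none is found */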

bool InitCUDA()

{

int count;

cudaGetDeviceCount(&count);

if (count == 0)

{

fprintf(stderr, "There is no device.\n");

return false;

}

int i;

for (i = 0; i < count; i++)

{

cudaDeviceProp prop;

if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)

{

if (prop.major >= 1)

{

break;

}

}

}

if (i == count)

{

fprintf(stderr, "There is no device supporting CUDA 1.x.\n");

return false;

}

cudaSetDevice(i);

return true;

}
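
/* Driver: run the CPU reference, then each GPU version, reporting time, GFLOPS and errors */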

int main()

{

int n;

printf("please input maxter number :");

scanf("%d",&n);

float *a, *b, *c, *d;

if (!InitCUDA())

{

return 0;

}

a = (float*)malloc(sizeof(float)* n * n);

b = (float*)malloc(sizeof(float)* n * n);

c = (float*)malloc(sizeof(float)* n * n);

d = (float*)malloc(sizeof(float)* n * n);

srand(0);

matgen(a, n, n);

matgen(b, n, n);

LARGE_INTEGER timeStart;

LARGE_INTEGER timeEnd;

LARGE_INTEGER frequency;

QueryPerformanceFrequency(&frequency);

double quadpart = (double)frequency.QuadPart;

QueryPerformanceCounter(&timeStart);

matmult(a, n, b, n, d, n, n);

QueryPerformanceCounter(&timeEnd);

double elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;

printf("\n(CPU) Time used is: %f ms\n", elapsed);

clock_t time = matmultCUDA(a, n, b, n, c, n, n);

double sec = (double)time / CLOCKS_PER_SEC;

printf("\n(1) Before reduce errors:");

printf("\n(GPU) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

compare_mat(c, n, d, n, n);

time = matmultCUDA_2(a, n, b, n, c, n, n);

sec = (double)time / CLOCKS_PER_SEC;

printf("\n(2) After reduce errors:");

printf("\n(GPU2) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

time = matmultCUDA_3(a, n, b, n, c, n, n);

sec = (double)time / CLOCKS_PER_SEC;

printf("\n(GPU3) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

time = matmultCUDA_4(a, n, b, n, c, n, n);

sec = (double)time / CLOCKS_PER_SEC;

printf("\n(GPU4) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

compare_mat(c, n, d, n, n);

QueryPerformanceFrequency(&frequency);

quadpart = (double)frequency.QuadPart;

QueryPerformanceCounter(&timeStart);

time = matmultCUDA_5(a, n, b, n, c, n, n);

QueryPerformanceCounter(&timeEnd);

elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;

printf("\n(CPU5) Time used is: %f ms\n", elapsed);

sec = ((double)elapsed / CLOCKS_PER_SEC)*1000 ;

printf("\n(3) Apply pitch and shard memory:");

printf("\n(GPU) Time used is: %f ms (%f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

free(a);

free(b);

free(c);

free(d);

system("\n pause");

return 0;

}

Could you kindly help me with modifications such as this line -

printf("\n(GPU) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

Is there some mistake in it?
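
I suspect the mistake is a unit mix-up: sec holds seconds (clock ticks divided by CLOCKS_PER_SEC), but the label says "ms", and "%.f" drops all decimal places, so short times print as 0. In the GPU5 section, elapsed is already in milliseconds, so dividing it by CLOCKS_PER_SEC mixes the units again. A rough sketch of what I think the consistent version of that part should look like (assuming clock() is meant to time the whole call, including the copies):

clock_t ticks  = matmultCUDA(a, n, b, n, c, n, n);
double seconds = (double)ticks / CLOCKS_PER_SEC;       /* ticks -> seconds */
double msec    = seconds * 1000.0;                     /* same interval in milliseconds */
double gflops  = 2.0 * n * n * n / (seconds * 1e9);    /* 2*n^3 flops for an n x n multiply */
printf("\n(GPU) Time used is: %.3f ms (%.3f GFLOPS)\n", msec, gflops);

Is that right, or is there some other mistake above?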
