05 May 2018

(1) Use both the CPU and the GPU to compute the matrix product A*B = C, where A, B and C are square matrices of size 1000*1000 or another size such as 500*500 or 20*20. The program must report the maximum error, the average error and the GFLOPS achieved.

(2) To reduce the errors, apply Kahan's summation formula and show all of the results mentioned above (a minimal sketch of the formula follows this list).

(3) To raise the GFLOPS, copy the matrices from main memory to global memory with the pitched-memory routines and stage the data in per-block shared memory, then show all of the results mentioned above again.
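
For requirement (1), the GFLOPS figure is simply 2*n^3 floating-point operations divided by the elapsed time in seconds and by 10^9. For requirement (2), Kahan's summation formula keeps a small compensation value that carries the low-order bits lost in each floating-point addition. A minimal scalar sketch of the idea (the helper name kahan_dot is only for illustration and is not part of my program):

float kahan_dot(const float* x, const float* y, int n)
{
    float sum  = 0.0f;   /* running sum */
    float comp = 0.0f;   /* compensation: low-order bits lost so far */
    for (int i = 0; i < n; i++)
    {
        float term = x[i] * y[i] - comp;   /* correct the new term by the accumulated error */
        float t    = sum + term;           /* big + small: low-order bits of term are lost here */
        comp = (t - sum) - term;           /* recover what was just lost */
        sum  = t;
    }
    return sum;
}

The kernels matMultCUDA_2 onward below implement this same compensated sum, only with the subtraction folded into the compensation variable.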

My CUDA program below runs, but the output is not so good -

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <time.h>

#include <windows.h>

#include <cuda_runtime.h>

#include <device_launch_parameters.h>

#define BLOCK_SIZE 16

#define NUM_THREADS 256
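
/* Kernel 1: naive multiply - each thread computes one element of C from a row of A and a column of B */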

__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

const int tid = threadIdx.x;

const int bid = blockIdx.x;

const int idx = bid * blockDim.x + tid;

const int row = idx / n;

const int column = idx % n;

int i;

if (row < n && column < n)

{

float t = 0;

for (i = 0; i < n; i++)

{

t += a[row * lda + i] * b[i * ldb + column];

}

c[row * ldc + column] = t;

}

}

clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

cudaMalloc((void**)&ac, sizeof(float)* n * n);

cudaMalloc((void**)&bc, sizeof(float)* n * n);

cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;

matMultCUDA<<<blocks * n, NUM_THREADS>>>(ac, n, bc, n, cc, n, n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
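
/* Kernel 2: same thread mapping as kernel 1, but the inner product is accumulated with Kahan's compensated summation */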

__global__ static void matMultCUDA_2(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

const int tid = threadIdx.x;

const int bid = blockIdx.x;

const int idx = bid * blockDim.x + tid;

const int row = idx / n;

const int column = idx % n;

int i;

if (row < n && column < n)

{

float t = 0;

float y = 0;

for (i = 0; i < n; i++)

{

float r;

y -= a[row * lda + i] * b[i * ldb + column];

r = t - y;

y = (r - t) + y;

t = r;

}

c[row * ldc + column] = t;

}

}

clock_t matmultCUDA_2(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

cudaMalloc((void**)&ac, sizeof(float)* n * n);

cudaMalloc((void**)&bc, sizeof(float)* n * n);

cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;

matMultCUDA_2<<<blocks * n, NUM_THREADS>>>(ac, n, bc, n, cc, n, n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
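
/* Kernel 3: one block per row of C; the row of A is staged in dynamically sized shared memory and reused by all threads in the block */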

__global__ static void matMultCUDA_3(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

extern __shared__ float data[];

const int tid = threadIdx.x;

const int row = blockIdx.x;

int i, j;

for (i = tid; i < n; i += blockDim.x)

{

data[i] = a[row * lda + i];

}

__syncthreads();

for (j = tid; j < n; j += blockDim.x)

{

float t = 0;

float y = 0;

for (i = 0; i < n; i++) {

float r;

y -= data[i] * b[i * ldb + j];

r = t - y;

y = (r - t) + y;

t = r;

}

c[row * ldc + j] = t;

}

}

clock_t matmultCUDA_3(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

cudaMalloc((void**)&ac, sizeof(float)* n * n);

cudaMalloc((void**)&bc, sizeof(float)* n * n);

cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;

matMultCUDA_3<<<n, NUM_THREADS, sizeof(float) * n>>>(ac, n, bc, n, cc, n, n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
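
/* Kernel 4: same kernel body as kernel 3; the difference is that the host wrapper below allocates the device matrices with cudaMallocPitch */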

__global__ static void matMultCUDA_4(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

extern __shared__ float data[];

const int tid = threadIdx.x;

const int row = blockIdx.x;

int i, j;

for (i = tid; i < n; i += blockDim.x)

{

data[i] = a[row * lda + i];

}

__syncthreads();

for (j = tid; j < n; j += blockDim.x)

{

float t = 0;

float y = 0;

for (i = 0; i < n; i++)

{

float r;

y -= data[i] * b[i * ldb + j];

r = t - y;

y = (r - t) + y;

t = r;

}

c[row * ldc + j] = t;

}

}

clock_t matmultCUDA_4(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

size_t pitch_a, pitch_b, pitch_c;

cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);

cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);

cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;

matMultCUDA_4<<<n, NUM_THREADS, sizeof(float) * n>>>(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
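
/* Kernel 5: 2-D grid; BLOCK_SIZE x BLOCK_SIZE tiles of A and B are staged in shared memory, still with Kahan compensation in the inner loop */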

__global__ static void matMultCUDA_5(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)

{

__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];

__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];

const int tidc = threadIdx.x;

const int tidr = threadIdx.y;

const int bidc = blockIdx.x * BLOCK_SIZE;

const int bidr = blockIdx.y * BLOCK_SIZE;

int i, j;

float results = 0;

float comp = 0;

for (j = 0; j < n; j += BLOCK_SIZE)

{

if (tidr + bidr < n && tidc + j < n)

{

matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];

}

else

{

matA[tidr][tidc] = 0;

}

if (tidr + j < n && tidc + bidc < n)

{

matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];

}

else

{

matB[tidr][tidc] = 0;

}

__syncthreads();

for (i = 0; i < BLOCK_SIZE; i++)

{

float t;

comp -= matA[tidr][i] * matB[i][tidc];

t = results - comp;

comp = (t - results) + comp;

results = t;

}

__syncthreads();

}

if (tidr + bidr < n && tidc + bidc < n)

{

c[(tidr + bidr) * ldc + tidc + bidc] = results;

}

}

clock_t matmultCUDA_5(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

float *ac, *bc, *cc;

clock_t start, end;

start = clock();

size_t pitch_a, pitch_b, pitch_c;

cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);

cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);

cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);

cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

dim3 blocks(bx, bx);

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

matMultCUDA_5<<<blocks, threads>>>(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);

cudaFree(bc);

cudaFree(cc);

end = clock();

return end - start;

}
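
/* CPU reference multiply with a double accumulator, used to check the GPU results */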

void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)

{

int i, j, k;

for (i = 0; i < n; i++)

{

for (j = 0; j < n; j++)

{

double t = 0;

for (k = 0; k < n; k++)

{

t += a[i * lda + k] * b[k * ldb + j];

}

c[i * ldc + j] = t;

}

}

}
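
/* Fill an n x n matrix with random values roughly in [0, 1] */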

void matgen(float* a, int lda, int n)

{

int i, j;

for (i = 0; i < n; i++)

{

for (j = 0; j < n; j++) {

a[i * lda + j] = (float)rand() / RAND_MAX + (float)rand() / (RAND_MAX * RAND_MAX);

}

}

}
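
/* Compare matrix a against reference matrix b and print the maximum and average relative error */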

void compare_mat(const float* a, int lda, const float* b, int ldb, int n)

{

float max_err = 0;

float average_err = 0;

int i, j;

for (i = 0; i < n; i++)

{

for (j = 0; j < n; j++)

{

if (b[i * ldb + j] != 0)

{

float err = fabs((a[i * lda + j] - b[i * ldb + j]) / b[i * ldb + j]);

if (max_err < err) max_err = err;

average_err += err;

}

}

}

printf("Max error: %g Average error: %g\n", max_err, average_err / (n * n));

}
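
/* Select the first CUDA-capable device; return false if none is found */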

bool InitCUDA()

{

int count;

cudaGetDeviceCount(&count);

if (count == 0)

{

fprintf(stderr, "There is no device.\n");

return false;

}

int i;

for (i = 0; i < count; i++)

{

cudaDeviceProp prop;

if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)

{

if (prop.major >= 1)

{

break;

}

}

}

if (i == count)

{

fprintf(stderr, "There is no device supporting CUDA 1.x.\n");

return false;

}

cudaSetDevice(i);

return true;

}
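
/* Driver: run the CPU reference, then each GPU version, reporting time, GFLOPS and errors */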

int main()

{

int n;

printf("please input maxter number :");

scanf("%d",&n);

float *a, *b, *c, *d;

if (!InitCUDA())

{

return 0;

}

a = (float*)malloc(sizeof(float)* n * n);

b = (float*)malloc(sizeof(float)* n * n);

c = (float*)malloc(sizeof(float)* n * n);

d = (float*)malloc(sizeof(float)* n * n);

srand(0);

matgen(a, n, n);

matgen(b, n, n);

LARGE_INTEGER timeStart;

LARGE_INTEGER timeEnd;

LARGE_INTEGER frequency;

QueryPerformanceFrequency(&frequency);

double quadpart = (double)frequency.QuadPart;

QueryPerformanceCounter(&timeStart);

matmult(a, n, b, n, d, n, n);

QueryPerformanceCounter(&timeEnd);

double elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;

printf("\n(CPU) Time used is: %f ms\n", elapsed);

clock_t time = matmultCUDA(a, n, b, n, c, n, n);

double sec = (double)time / CLOCKS_PER_SEC;

printf("\n(1) Before reduce errors:");

printf("\n(GPU) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

compare_mat(c, n, d, n, n);

time = matmultCUDA_2(a, n, b, n, c, n, n);

sec = (double)time / CLOCKS_PER_SEC;

printf("\n(2) After reduce errors:");

printf("\n(GPU2) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

time = matmultCUDA_3(a, n, b, n, c, n, n);

sec = (double)time / CLOCKS_PER_SEC;

printf("\n(GPU3) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

time = matmultCUDA_4(a, n, b, n, c, n, n);

sec = (double)time / CLOCKS_PER_SEC;

printf("\n(GPU4) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

compare_mat(c, n, d, n, n);

QueryPerformanceFrequency(&frequency);

quadpart = (double)frequency.QuadPart;

QueryPerformanceCounter(&timeStart);

time = matmultCUDA_5(a, n, b, n, c, n, n);

QueryPerformanceCounter(&timeEnd);

elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;

printf("\n(CPU5) Time used is: %f ms\n", elapsed);

sec = ((double)elapsed / CLOCKS_PER_SEC)*1000 ;

printf("\n(3) Apply pitch and shard memory:");

printf("\n(GPU) Time used is: %f ms (%f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

free(a);

free(b);

free(c);

free(d);

system("\n pause");

return 0;

}

Could you kindly help me with modifications such as this line -

printf("\n(GPU) Time used is: %.f ms (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

Is there some mistake in it?
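
I suspect the mistake is a unit mix-up: sec holds seconds (clock ticks divided by CLOCKS_PER_SEC), but the label says "ms", and "%.f" drops all decimal places, so short times print as 0. In the GPU5 section, elapsed is already in milliseconds, so dividing it by CLOCKS_PER_SEC mixes the units again. A rough sketch of what I think the consistent version of that part should look like (assuming clock() is meant to time the whole call, including the copies):

clock_t ticks  = matmultCUDA(a, n, b, n, c, n, n);
double seconds = (double)ticks / CLOCKS_PER_SEC;       /* ticks -> seconds */
double msec    = seconds * 1000.0;                     /* same interval in milliseconds */
double gflops  = 2.0 * n * n * n / (seconds * 1e9);    /* 2*n^3 flops for an n x n multiply */
printf("\n(GPU) Time used is: %.3f ms (%.3f GFLOPS)\n", msec, gflops);

Is that right, or is there some other mistake above?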
