c++ - Matrix manipulation using CUDA -
i trying write program matrix calculations using c/cuda. have following program:
in main.cu
#include <cuda.h> #include <iostream> #include "teste.cuh" using std::cout; int main(void) { const int ndofs = 2; const int nel = 4; double *gh = new double[ndofs*nel*ndofs*nel]; double *gg; cudamalloc((void**)& gg, sizeof(double)*ndofs*nel*ndofs*nel); (int ii = 0; ii < ndofs*nel*ndofs*nel; ii++) gh[ii] = 0.; cudamemcpy(gh, gg, sizeof(double)*ndofs*nel*ndofs*nel, cudamemcpyhosttodevice); integrag<<<256, 256>>>(nel, gg); cudamemcpy(gg, gh, sizeof(double)*ndofs*nel*ndofs*nel, cudamemcpydevicetohost); (int ii = 0; ii < ndofs*nel*ndofs*nel; ii++) cout << ii + 1 << " " << gh[ii] << "\n"; return 0; }
in mtrx.cuh
#ifndef teste_cuh_ #define teste_cuh_ __global__ void integrag(const int n, double* g) { const int szmodel = 2*n; int idx = threadidx.x + blockidx.x*blockdim.x; int idy = threadidx.y + blockidx.y*blockdim.y; int offset = idx + idy*blockdim.x*griddim.x; int posinit = szmodel*offset; g[posinit + 0] = 1; g[posinit + 1] = 1; g[posinit + 2] = 1; g[posinit + 3] = 1; } #endif
the result (which supposed matrix filled 1's) copied host array; problem is: nothing happens! apparently, program not calling gpu kernel, , still getting array full of zeros.
i new cuda programming , using cuda example (jason sanders) reference book.
my questions are:
- what wrong code?
- is best way deal matrices using gpu, using matrices vectorized form?
- is there reference can provide more examples on matrices using gpu's?
these questions:
what wrong code?
is best way deal matrices using gpu, using matrices vectorized form?
is there reference can provide more examples on matrices using gpu's?
for first question. first of all, problem should explicitly defined. want code? sort of calculations want on matrix?
try check errors this way so. there obvious bugs in code well. of bugs:
- you're passing wrong address pointers cudamemcpy, pointers passed source , destination have swapped each other, check here
change them to:
"ndofsnelndofs*nel" shows you're interested in value of first 64 numbers of array, why calling 256 threads , 256 blocks?
this part of code:
int idx = threadidx.x + blockidx.xblockdim.x; int idy = threadidx.y + blockidx.yblockdim.y;
shows want use 2-dim threads , blocks; so, need use dim type.
by making following changes:
cudamemcpy(gg, gh, sizeof(double)*ndofs*nel*ndofs*nel, cudamemcpyhosttodevice); //here dim3 block(2,2); //here dim3 thread(4,4); //here integrag<<<block, thread>>>(nel, gg); //here cudamemcpy(gh, gg, sizeof(double)*ndofs*nel*ndofs*nel, cudamemcpydevicetohost); //here
you'll result following:
1 1 2 1 3 1 4 1 5 0 6 0 7 0 8 0 9 1 10 1 11 1 12 1 . . . 57 1 58 1 59 1 60 1 61 0 62 0 63 0 64 0
anyway, if state problem , goal more clearly, better suggestions can provided you.
regarding last 2 questions:
in opinion cuda c programming guide , cuda c best practices guide 2 must documents read when starting cuda, , include examples on matrix calculations well.
Comments
Post a Comment