// headers used by the excerpts below
#include <iostream>     // std::cerr, std::cout
#include <vector>       // std::vector for the Lanczos coefficients
#include <cstdio>       // fprintf in CUDA_SAFE_CALL
#include <cstdlib>      // exit, rand
#include <cmath>        // ceil, sqrt, fabs
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define NUMTHREADS 128    // threads per block
#define GRIDSIZE 65535    // largest grid dimension used when launching kernels
// check the result of a CUDA runtime call and abort with a message on failure
#define CUDA_SAFE_CALL( call) { \
    cudaError err = call; \
    if( cudaSuccess != err) { \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString( err) ); \
        exit(EXIT_FAILURE); \
    } \
}
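
// The excerpts below call Fortran BLAS/LAPACK routines directly. Their prototypes
// are not part of the quoted lines; the standard declarations they rely on are:
extern "C" {
    double ddot_(int *n, double *x, int *incx, double *y, int *incy);
    void dscal_(int *n, double *alpha, double *x, int *incx);
    void dstev_(char *jobz, int *n, double *d, double *e, double *z, int *ldz,
                double *work, int *info);
}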
// Sparse matrix-vector product y = alpha*y + H*x. The diagonal is stored in Umat;
// the off-diagonal part is split into a spin-down block (Down_data/Down_ind), read
// directly from global memory, and a spin-up block (Up_data/Up_ind) that is first
// staged in shared memory because neighbouring threads reuse the same rows.
__global__ void gpu_mvprod(double *x, double *y, double alpha, int NumUp, int NumDown, int dim, double *Umat, double *Down_data, unsigned int *Down_ind, int size_Down, double *Up_data, unsigned int *Up_ind, int size_Up, int rows_shared)
{
    // linear index of this thread in the two-dimensional grid
    int index = threadIdx.x + blockDim.x * blockIdx.x + blockIdx.y * blockDim.x * gridDim.x;

    int sv = index / NumDown; // spin-up part of the basis state
    int id = index % NumDown; // spin-down part of the basis state

    // dynamic shared memory: size_Up*rows_shared matrix elements, followed by
    // the corresponding column indices
    extern __shared__ double shared[];
    unsigned int *shared_ind = (unsigned int *) &shared[size_Up * rows_shared];

    // first spin-up row touched by this block
    int s_sv = (blockDim.x * blockIdx.x + blockIdx.y * blockDim.x * gridDim.x) / NumDown;

    // cooperatively stage the spin-up rows used by this block
    if(threadIdx.x < rows_shared && (s_sv + threadIdx.x) < NumUp)
        for(int i = 0; i < size_Up; i++)
        {
            shared[i*rows_shared + threadIdx.x] = Up_data[s_sv + threadIdx.x + i*NumUp];
            shared_ind[i*rows_shared + threadIdx.x] = Up_ind[s_sv + threadIdx.x + i*NumUp];
        }

    // the barrier and the bounds check are not in the quoted lines, but both are
    // needed: other threads read the staged data, and the last block can run past dim
    __syncthreads();

    if(index < dim)
    {
        // diagonal contribution
        double result = Umat[index] * x[index];

        // spin-up contribution, read from shared memory
        for(int i = 0; i < size_Up; i++)
            result += shared[sv - s_sv + i*rows_shared] * x[id + NumDown*shared_ind[sv - s_sv + i*rows_shared]];

        // spin-down contribution, read from global memory
        for(int i = 0; i < size_Down; i++)
            result += Down_data[id + i*NumDown] * x[sv*NumDown + Down_ind[id + i*NumDown]];

        y[index] = alpha * y[index] + result;
    }
}
int NumUp = T::baseUp.size();
int NumDown = T::baseDown.size();
int dim = NumUp*NumDown;

// number of distinct spin-up rows a block of NUMTHREADS consecutive indices can
// touch, plus one because a block does not have to start on a row boundary
int rows_shared = ceil(NUMTHREADS*1.0/NumDown) + 1;
// dynamic shared memory: matrix elements plus column indices for those rows
size_t sharedmem = T::size_Up * rows_shared * (sizeof(double) + sizeof(unsigned int));

// numblocks is set up in lines not quoted here; a possible form is sketched below
gpu_mvprod<<<numblocks,NUMTHREADS,sharedmem>>>(x,y,alpha,NumUp,NumDown,dim,Umat_gpu,Down_data_gpu,Down_ind_gpu,T::size_Down,Up_data_gpu,Up_ind_gpu,T::size_Up,rows_shared);
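
// A minimal sketch of how numblocks could be built (assumption, not quoted code):
// the linear block count is folded into two grid dimensions so that neither
// exceeds GRIDSIZE, matching the index arithmetic used in gpu_mvprod.
//
//     int totalblocks = (dim + NUMTHREADS - 1)/NUMTHREADS;             // ceil(dim/NUMTHREADS)
//     dim3 numblocks(totalblocks < GRIDSIZE ? totalblocks : GRIDSIZE,  // blocks along x
//                    (totalblocks + GRIDSIZE - 1)/GRIDSIZE);           // rows of blocks along y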
// check whether the problem fits on the selected device
int device;
cudaGetDevice( &device );

cudaDeviceProp prop;
cudaGetDeviceProperties( &prop, device );

int NumUp = T::baseUp.size();
int NumDown = T::baseDown.size();

// global memory needed: the diagonal, both sparse blocks (elements + indices)
// and two Lanczos vectors
size_t neededmem = T::getDim()*sizeof(double) +
                   NumUp*T::size_Up*(sizeof(double) + sizeof(unsigned int)) +
                   NumDown*T::size_Down*(sizeof(double) + sizeof(unsigned int)) +
                   2*T::dim*sizeof(double);

if(neededmem > prop.totalGlobalMem)
    std::cerr << "Houston, we have a memory problem!" << std::endl;

// the ceil(dim/NUMTHREADS) blocks have to fit in a maxGridSize[0] x maxGridSize[1] grid
if( ceil(T::dim*1.0/NUMTHREADS) > (1.0*prop.maxGridSize[0]*prop.maxGridSize[1]) )
    std::cerr << "Houston, we have a grid size problem!" << std::endl;

// the per-block staging buffer for the spin-up rows has to fit in shared memory
if( T::size_Up * (ceil(NUMTHREADS*1.0/NumDown)+1) * (sizeof(double) + sizeof(unsigned int)) > prop.sharedMemPerBlock )
    std::cerr << "Houston, we have a shared memory size problem!" << std::endl;
// copy the diagonal to the device
double *Umat = T::Umatrix();
CUDA_SAFE_CALL(cudaMemcpy(Umat_gpu,Umat,T::dim*sizeof(double),cudaMemcpyHostToDevice));

// allocate and upload the sparse spin-up block: matrix elements and column indices
CUDA_SAFE_CALL(cudaMalloc(&Up_data_gpu,NumUp*T::size_Up*sizeof(double)));
CUDA_SAFE_CALL(cudaMalloc(&Up_ind_gpu,NumUp*T::size_Up*sizeof(unsigned int)));

CUDA_SAFE_CALL(cudaMemcpy(Up_data_gpu,T::Up_data,NumUp*T::size_Up*sizeof(double),cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(Up_ind_gpu,T::Up_ind,NumUp*T::size_Up*sizeof(unsigned int),cudaMemcpyHostToDevice));

// idem for the sparse spin-down block
CUDA_SAFE_CALL(cudaMalloc(&Down_data_gpu,NumDown*T::size_Down*sizeof(double)));
CUDA_SAFE_CALL(cudaMalloc(&Down_ind_gpu,NumDown*T::size_Down*sizeof(unsigned int)));

CUDA_SAFE_CALL(cudaMemcpy(Down_data_gpu,T::Down_data,NumDown*T::size_Down*sizeof(double),cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(Down_ind_gpu,T::Down_ind,NumDown*T::size_Down*sizeof(unsigned int),cudaMemcpyHostToDevice));
// diagonal (a) and off-diagonal (b) elements of the Lanczos tridiagonal matrix
std::vector<double> a(m,0);
std::vector<double> b(m,0);

// the two Lanczos vectors on the host
double *qa = new double [T::dim];
double *qb = new double [T::dim];

// random start vector
for(int i=0;i<T::dim;i++)
    qb[i] = (rand()*10.0/RAND_MAX);

// normalize the start vector with BLAS
double norm = 1.0/sqrt(ddot_(&dim,qb,&incx,qb,&incx));

dscal_(&dim,&norm,qb,&incx);

// upload both Lanczos vectors (qa is initialised in lines not quoted here)
CUDA_SAFE_CALL(cudaMemcpy(qa_gpu,qa,T::dim*sizeof(double),cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(qb_gpu,qb,T::dim*sizeof(double),cudaMemcpyHostToDevice));
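
// NOTE (assumption): the cudaMalloc calls for Umat_gpu, qa_gpu and qb_gpu fall in
// lines that are not quoted; judging by the copies above they would look like
//
//     CUDA_SAFE_CALL(cudaMalloc(&Umat_gpu, T::dim*sizeof(double)));
//     CUDA_SAFE_CALL(cudaMalloc(&qa_gpu, T::dim*sizeof(double)));
//     CUDA_SAFE_CALL(cudaMalloc(&qb_gpu, T::dim*sizeof(double)));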
// cuBLAS handle for the vector operations in the Lanczos iteration
cublasHandle_t handle;
cublasCreate(&handle);

// working copies: dstev_ overwrites its input arrays
std::vector<double> acopy(a);
std::vector<double> bcopy(b);

// time the Lanczos loop with CUDA events
cudaEvent_t start, stop;

cudaEventCreate( &start );
cudaEventCreate( &stop );

cudaEventRecord(start, 0);

// iterate until the lowest eigenvalue of the tridiagonal matrix (acopy[0]) agrees
// with the estimate E from the previous pass to within 1e-4
while(fabs(E-acopy[0]) > 1e-4)
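
// The loop body is only partially quoted. Read together with the calls below, one
// Lanczos step amounts to the standard three-term recursion (f1 and f2 hold the
// previous and the current Lanczos vector on the device and are swapped at the end
// of every step in lines not quoted):
//
//     f1    <- -b[i-1]*f1 + H*f2     (cublasDscal followed by the gpu_mvprod kernel)
//     a[i-1] = <f1, f2>              (cublasDdot)
//     f1    <- f1 - a[i-1]*f2        (cublasDaxpy)
//     b[i]   = ||f1||                (cublasDdot, square root taken afterwards)
//     f1    <- f1 / b[i]             (cublasDscal, unless b[i] signals a breakdown)
//
// After a batch of steps the tridiagonal matrix built from a[] and b[] is
// diagonalised with dstev_ and its lowest eigenvalue becomes the new estimate E.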
cublasDscal(handle,T::dim,&alpha,f1,1);        // f1 <- alpha*f1, with alpha = -b[i-1]

cublasDdot(handle,T::dim,f1,1,f2,1,&a[i-1]);   // a[i-1] = <f1, f2>

cublasDaxpy(handle,T::dim,&alpha,f2,1,f1,1);   // f1 <- f1 - a[i-1]*f2, with alpha = -a[i-1]

cublasDdot(handle,T::dim,f1,1,f1,1,&b[i]);     // b[i] = <f1, f1>, square root taken afterwards

if( fabs(b[i]) < 1e-10 )                       // breakdown: an invariant subspace was found

cublasDscal(handle,T::dim,&alpha,f1,1);        // normalize: f1 <- f1/b[i], with alpha = 1/b[i]

// diagonalize the tridiagonal matrix; jobz is 'N' (eigenvalues only), so the z and
// work arguments are never referenced and a dummy (&alpha) is passed for both
dstev_(&jobz,&m,acopy.data(),&bcopy[1],&alpha,&m,&alpha,&info);
if(info != 0)   // info != 0 flags a LAPACK failure
    std::cerr << "Error in Lanczos" << std::endl;

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

float exeTime;
cudaEventElapsedTime( &exeTime, start, stop );

// m grows by 10 every pass of the outer loop (in lines not quoted), hence the m-10
std::cout << "Done in " << m-10 << " Iterations" << std::endl;
std::cout << "Cuda time: " << exeTime << " ms" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);

cublasDestroy(handle);
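
// The quoted listing ends here. Presumably the device buffers and host arrays
// allocated above are released afterwards, along the lines of (sketch, not quoted):
//
//     CUDA_SAFE_CALL(cudaFree(Up_data_gpu));
//     CUDA_SAFE_CALL(cudaFree(Up_ind_gpu));
//     CUDA_SAFE_CALL(cudaFree(Down_data_gpu));
//     CUDA_SAFE_CALL(cudaFree(Down_ind_gpu));
//     CUDA_SAFE_CALL(cudaFree(Umat_gpu));
//     CUDA_SAFE_CALL(cudaFree(qa_gpu));
//     CUDA_SAFE_CALL(cudaFree(qb_gpu));
//     delete [] qa;
//     delete [] qb;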