HubbardGPU
Hubbard diagonalisation on the GPU (and CPU)
hamgpu.cu
/* Copyright (C) 2012 Ward Poelmans

This file is part of Hubbard-GPU.

Hubbard-GPU is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Hubbard-GPU is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Hubbard-GPU. If not, see <http://www.gnu.org/licenses/>.
*/

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <vector>
#include <cublas_v2.h>
#include "hamgpu.h"
#include "hamsparse.h"
#include "hamsparse2D.h"

// number of threads in a block (must be a multiple of 32)
#define NUMTHREADS 128

// the maximum size of a grid dimension
#define GRIDSIZE 65535

// Helper macro to check CUDA return values
#define CUDA_SAFE_CALL( call) { \
    cudaError err = call; \
    if( cudaSuccess != err) { \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString( err) ); \
        exit(EXIT_FAILURE); \
    } }

// Constructor for the 1D system: Ns sites, Nu up and Nd down electrons, hopping J, on-site interaction U.
template<>
GPUHamiltonian<SparseHamiltonian>::GPUHamiltonian(int Ns, int Nu, int Nd, double J, double U)
    : SparseHamiltonian(Ns,Nu,Nd,J,U)
{
}

// Constructor for the 2D system: an L x D lattice, delegating to SparseHamiltonian2D.
template<>
GPUHamiltonian<SparseHamiltonian2D>::GPUHamiltonian(int L, int D, int Nu, int Nd, double J, double U)
    : SparseHamiltonian2D(L,D,Nu,Nd,J,U)
{
}

// Destructor: nothing to clean up here, the GPU memory is freed at the end of LanczosDiagonalize.
template<class T>
GPUHamiltonian<T>::~GPUHamiltonian()
{
}

/**
 * The matrix-vector product kernel: y = alpha * y + H * x, where H is the Hubbard Hamiltonian
 * stored as a diagonal (Umat) plus the sparse hopping matrices of the up and down spins.
 */
__global__ void gpu_mvprod(double *x, double *y, double alpha, int NumUp, int NumDown, int dim, double *Umat, double *Down_data, unsigned int *Down_ind, int size_Down, double *Up_data, unsigned int *Up_ind, int size_Up, int rows_shared)
{
    int index = threadIdx.x + blockDim.x * blockIdx.x + blockIdx.y * blockDim.x * gridDim.x;

    if(index < dim)
    {
        // diagonal (interaction) part
        double result = Umat[index] * x[index];

        // split the global basis index into an up-spin and a down-spin basis index
        int sv = index / NumDown; //__fdividef(index,NumDown);
        int id = index % NumDown; // index - sv*NumDown;

        // the up-spin hopping rows used by this block are cached in shared memory
        extern __shared__ double shared[];

        unsigned int *shared_ind = (unsigned int *) &shared[size_Up * rows_shared];

        // first up-spin row needed by this block
        int s_sv = (blockDim.x * blockIdx.x + blockIdx.y * blockDim.x * gridDim.x)/NumDown;

        if(threadIdx.x < rows_shared && (s_sv + threadIdx.x) < NumUp)
            for(int i=0;i<size_Up;i++)
            {
                shared[i*rows_shared+threadIdx.x] = Up_data[s_sv + threadIdx.x + i*NumUp];

                shared_ind[i*rows_shared+threadIdx.x] = Up_ind[s_sv + threadIdx.x + i*NumUp];
            }

        __syncthreads();

        // hopping of the up spins (read from shared memory)
        for(int i=0;i<size_Up;i++)
            // result += Up_data[sv+i*NumUp] * x[id + NumDown*Up_ind[sv+i*NumUp]];
            result += shared[sv-s_sv+i*rows_shared] * x[id + NumDown*shared_ind[sv-s_sv+i*rows_shared]];

        // hopping of the down spins (read directly from global memory)
        for(int i=0;i<size_Down;i++)
            result += Down_data[id+i*NumDown] * x[sv*NumDown + Down_ind[id+i*NumDown]];

        y[index] = alpha * y[index] + result;
    }
}
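
/*
 * Illustration of the index mapping used above (the numbers are purely an example, not taken
 * from the code): with NumDown = 4 down-spin basis states, the global basis index 10 splits
 * into sv = 10 / 4 = 2 (up-spin part) and id = 10 % 4 = 2 (down-spin part), so that
 * index = sv * NumDown + id = 2*4 + 2 = 10. The kernel uses sv to address rows of the up-spin
 * hopping matrix and id to address rows of the down-spin one.
 */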

/**
 * Host wrapper for the matrix-vector product y = alpha * y + H * x:
 * launches the gpu_mvprod kernel with enough blocks to cover the full basis dimension.
 */
template<class T>
void GPUHamiltonian<T>::mvprod(double *x, double *y, double alpha) const
{
    int NumUp = T::baseUp.size();
    int NumDown = T::baseDown.size();
    int dim = NumUp*NumDown;
    dim3 numblocks(ceil(dim*1.0/NUMTHREADS));
    // number of up-spin rows a single block can touch (+1 for the row split across a block boundary)
    int rows_shared = ceil(NUMTHREADS*1.0/NumDown) + 1;
    size_t sharedmem = T::size_Up * rows_shared * (sizeof(double) + sizeof(unsigned int));

    // a grid dimension is limited to GRIDSIZE blocks: split into a 2D grid if needed
    if(numblocks.x > GRIDSIZE)
    {
        numblocks.x = GRIDSIZE;
        numblocks.y = ceil(ceil(dim*1.0/NUMTHREADS)*1.0/GRIDSIZE);
    }

    cudaGetLastError(); // clear any pending error before the launch
    gpu_mvprod<<<numblocks,NUMTHREADS,sharedmem>>>(x,y,alpha,NumUp,NumDown,dim,Umat_gpu,Down_data_gpu,Down_ind_gpu,T::size_Down,Up_data_gpu,Up_ind_gpu,T::size_Up,rows_shared);
    CUDA_SAFE_CALL(cudaGetLastError());
}
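
/*
 * Grid-sizing example (the numbers are illustrative only, not taken from the code): for a basis
 * of dim = 10 000 000 states and NUMTHREADS = 128, ceil(dim/128) = 78125 blocks are needed,
 * which exceeds GRIDSIZE = 65535. The launch then uses a 2D grid of
 * (65535, ceil(78125/65535)) = (65535, 2) blocks, and the kernel reconstructs the flat thread
 * index from blockIdx.x and blockIdx.y.
 */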

// Calculates the lowest eigenvalue with the Lanczos algorithm; m is the initial size of the
// Krylov space (grown by 10 until convergence). Returns the converged lowest eigenvalue.
template<class T>
double GPUHamiltonian<T>::LanczosDiagonalize(int m)
{
    if(!m)
        m = 10;

    int device;
    cudaGetDevice( &device );

    cudaDeviceProp prop;
    cudaGetDeviceProperties( &prop, device );

    int NumUp = T::baseUp.size();
    int NumDown = T::baseDown.size();

    // global memory needed: the diagonal, the two sparse hopping matrices and the two Lanczos vectors
    size_t neededmem = T::getDim()*sizeof(double) +
        NumUp*T::size_Up*(sizeof(double)+sizeof(unsigned int)) +
        NumDown*T::size_Down*(sizeof(double)+sizeof(unsigned int)) +
        2*T::dim*sizeof(double);

    if(neededmem > prop.totalGlobalMem)
    {
        std::cerr << "Houston, we have a memory problem!" << std::endl;
        return 0;
    }

    if( ceil(T::dim*1.0/NUMTHREADS) > (1.0*prop.maxGridSize[0]*prop.maxGridSize[1]) ) // convert all to doubles to avoid int overflow
    {
        std::cerr << "Houston, we have a grid size problem!" << std::endl;
        return 0;
    }

    if( T::size_Up * (ceil(NUMTHREADS*1.0/NumDown)+1) * (sizeof(double) + sizeof(unsigned int)) > prop.sharedMemPerBlock )
    {
        std::cerr << "Houston, we have a shared memory size problem!" << std::endl;
        return 0;
    }
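
    /*
     * Rough size of the memory estimate above (numbers illustrative only): for a half-filled
     * 10-site chain, NumUp = NumDown = C(10,5) = 252 and dim = 252*252 = 63504. Assuming
     * size_Up = size_Down = 10 nonzeros per row, neededmem is about
     * 63504*8 + 2 * 252*10*(8+4) + 2*63504*8 = 1 584 576 bytes, i.e. roughly 1.5 MB,
     * which fits easily; larger systems grow combinatorially and can exhaust global memory.
     */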

    // allocate the diagonal (Umat) on the GPU and copy it over
    double *Umat = T::Umatrix();
    CUDA_SAFE_CALL(cudaMalloc(&Umat_gpu, T::dim*sizeof(double)));
    CUDA_SAFE_CALL(cudaMemcpy(Umat_gpu,Umat,T::dim*sizeof(double),cudaMemcpyHostToDevice));

    delete [] Umat;

    // copy the sparse up-spin hopping matrix (values and index arrays) to the GPU
    CUDA_SAFE_CALL(cudaMalloc(&Up_data_gpu,NumUp*T::size_Up*sizeof(double)));
    CUDA_SAFE_CALL(cudaMalloc(&Up_ind_gpu,NumUp*T::size_Up*sizeof(unsigned int)));

    CUDA_SAFE_CALL(cudaMemcpy(Up_data_gpu,T::Up_data,NumUp*T::size_Up*sizeof(double),cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(Up_ind_gpu,T::Up_ind,NumUp*T::size_Up*sizeof(unsigned int),cudaMemcpyHostToDevice));

    // and the same for the down-spin hopping matrix
    CUDA_SAFE_CALL(cudaMalloc(&Down_data_gpu,NumDown*T::size_Down*sizeof(double)));
    CUDA_SAFE_CALL(cudaMalloc(&Down_ind_gpu,NumDown*T::size_Down*sizeof(unsigned int)));

    CUDA_SAFE_CALL(cudaMemcpy(Down_data_gpu,T::Down_data,NumDown*T::size_Down*sizeof(double),cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(Down_ind_gpu,T::Down_ind,NumDown*T::size_Down*sizeof(unsigned int),cudaMemcpyHostToDevice));

    // the diagonal (a) and off-diagonal (b) elements of the Lanczos tridiagonal matrix
    std::vector<double> a(m,0);
    std::vector<double> b(m,0);

    // the two Lanczos vectors: qa starts at zero, qb is a normalized random vector
    double *qa = new double [T::dim];
    double *qb = new double [T::dim];

    double *qa_gpu;
    double *qb_gpu;
    CUDA_SAFE_CALL(cudaMalloc(&qa_gpu,T::dim*sizeof(double)));
    CUDA_SAFE_CALL(cudaMalloc(&qb_gpu,T::dim*sizeof(double)));

    srand(time(0));

    for(int i=0;i<T::dim;i++)
    {
        qa[i] = 0;
        qb[i] = (rand()*10.0/RAND_MAX);
    }

    int incx = 1;
    int dim = T::dim;

    double norm = 1.0/sqrt(ddot_(&dim,qb,&incx,qb,&incx));

    dscal_(&dim,&norm,qb,&incx);

    CUDA_SAFE_CALL(cudaMemcpy(qa_gpu,qa,T::dim*sizeof(double),cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(qb_gpu,qb,T::dim*sizeof(double),cudaMemcpyHostToDevice));

    delete [] qa;
    delete [] qb;

    norm = 1;

    double *f1 = qa_gpu;
    double *f2 = qb_gpu;
    double *tmp;

    double alpha = 0;

    cublasHandle_t handle;
    cublasCreate(&handle);

//    cublasPointerMode_t mode = CUBLAS_POINTER_MODE_DEVICE;
//    cublasSetPointerMode(handle,mode);

    int i=1;

    // working copies of a and b: dstev_ overwrites its input arrays
    std::vector<double> acopy(a);
    std::vector<double> bcopy(b);

    double E = 1;

    // CUDA events to time the Lanczos iterations
    cudaEvent_t start, stop;
    float exeTime;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord(start, 0);

    // enlarge the Krylov space until successive estimates of the lowest eigenvalue agree to 1e-4
    while(fabs(E-acopy[0]) > 1e-4)
    {
        E = acopy[0];

        for(;i<m;i++)
        {
            // f1 = H*f2 - b[i-1]*f1 ; a[i-1] = <f1|f2>
            alpha = -b[i-1];
            cublasDscal(handle,T::dim,&alpha,f1,1);

            mvprod(f2,f1,norm);

            cublasDdot(handle,T::dim,f1,1,f2,1,&a[i-1]);

            // orthogonalize against f2 and normalize: b[i] = ||f1||
            alpha = -a[i-1];
            cublasDaxpy(handle,T::dim,&alpha,f2,1,f1,1);

            cublasDdot(handle,T::dim,f1,1,f1,1,&b[i]);
            b[i] = sqrt(b[i]);

            if( fabs(b[i]) < 1e-10 )
                break;

            alpha = 1.0/b[i];

            cublasDscal(handle,T::dim,&alpha,f1,1);

            tmp = f2;
            f2 = f1;
            f1 = tmp;
        }

        acopy = a;
        bcopy = b;

        char jobz = 'N'; // only eigenvalues are needed
        int info;

        // diagonalize the tridiagonal matrix; with jobz='N' the z and work arguments are not
        // referenced, so a dummy pointer (&alpha) is passed for them
        dstev_(&jobz,&m,acopy.data(),&bcopy.data()[1],&alpha,&m,&alpha,&info);

        if(info != 0)
            std::cerr << "Error in Lanczos" << std::endl;

        // enlarge the Krylov space in case another pass is needed
        m += 10;
        a.resize(m);
        b.resize(m);
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime( &exeTime, start, stop );

    std::cout << "Done in " << m-10 << " Iterations" << std::endl;
    std::cout << "Cuda time: " << exeTime << " ms" << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cublasDestroy(handle);

    // the converged lowest eigenvalue
    alpha = acopy[0];

    // free all GPU memory again
    CUDA_SAFE_CALL(cudaFree(qa_gpu));
    CUDA_SAFE_CALL(cudaFree(qb_gpu));

    CUDA_SAFE_CALL(cudaFree(Up_data_gpu));
    CUDA_SAFE_CALL(cudaFree(Up_ind_gpu));
    CUDA_SAFE_CALL(cudaFree(Down_data_gpu));
    CUDA_SAFE_CALL(cudaFree(Down_ind_gpu));

    CUDA_SAFE_CALL(cudaFree(Umat_gpu));

    CUDA_SAFE_CALL(cudaDeviceReset());

    return alpha;
}
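
/*
 * What the loop above implements is the standard Lanczos three-term recurrence (a sketch, in the
 * notation of the code: f2 holds the current Krylov vector q_k, f1 accumulates the next one):
 *
 *     a_k = <q_k| H |q_k>
 *     r   = H q_k - a_k q_k - b_{k-1} q_{k-1}
 *     b_k = ||r||,   q_{k+1} = r / b_k
 *
 * The a_k and b_k fill the tridiagonal matrix that dstev_ diagonalizes; its lowest eigenvalue
 * converges to the lowest eigenvalue of H as the Krylov space grows.
 */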

// Explicitly instantiate the template class with the possible template parameters
template class GPUHamiltonian<SparseHamiltonian>;
template class GPUHamiltonian<SparseHamiltonian2D>;
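
/*
 * Example of how this class might be driven from a main() (a sketch only: BuildBase() and
 * BuildSparseHam() are assumed names for the base-class methods that construct the basis and
 * the sparse hopping matrices; only the constructors, mvprod() and LanczosDiagonalize() are
 * defined in this file):
 *
 *     #include <iostream>
 *     #include "hamgpu.h"
 *     #include "hamsparse.h"
 *
 *     int main(void)
 *     {
 *         // 4-site chain, 2 up and 2 down electrons, hopping J = 1, interaction U = 4
 *         GPUHamiltonian<SparseHamiltonian> ham(4, 2, 2, 1.0, 4.0);
 *         ham.BuildBase();        // assumed base-class call
 *         ham.BuildSparseHam();   // assumed base-class call
 *
 *         double E0 = ham.LanczosDiagonalize(0); // 0 -> start with a Krylov space of size 10
 *         std::cout << "Lowest eigenvalue: " << E0 << std::endl;
 *
 *         return 0;
 *     }
 */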

/* vim: set ts=8 sw=4 tw=0 expandtab :*/