EMAN2
emdata_cuda.cpp
1/*
2 * Author: Steven Ludtke, 04/10/2003 (sludtke@bcm.edu)
3 * Copyright (c) 2000-2006 Baylor College of Medicine
4 *
5 * This software is issued under a joint BSD/GNU license. You may use the
6 * source code in this file under either license. However, note that the
7 * complete EMAN2 and SPARX software packages have some GPL dependencies,
8 * so you are responsible for compliance with the licenses of these packages
9 * if you opt to use BSD licensing. The warranty disclaimer below holds
10 * in either instance.
11 *
12 * This complete copyright notice must be included in any revised version of the
13 * source code. Additional authorship citations may be added, but existing
14 * author citations must be preserved.
15 *
16 * This program is free software; you can redistribute it and/or modify
17 * it under the terms of the GNU General Public License as published by
18 * the Free Software Foundation; either version 2 of the License, or
19 * (at your option) any later version.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
25 *
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 *
30 * */
31
32/*
33 * This code implements a CUDA memory management scheme. EMData rdata arrays are copied to CUDA device (DDRAM) memory via
34 * copy_to_cuda() and to texture memory via copy_to_cudaro(). EMData is copied back using copy_from_device(),
35 * and device data can be freed via rw_free() and ro_free(). When data is copied to CUDA device memory, the memory is managed
36 * via a static doubly linked list. When copy_to_cuda() is called, there is first a check to ensure that enough
37 * memory is available. If so, the copy is made and addtolist() is called, adding this EMData item to the
38 * static doubly linked list. If there is not enough memory, the function freeup_devicemem() is called and the
39 * last item on the linked list is removed. If there is still not enough room, the next-to-last item is removed, and so on.
40 * If there is still no room after the last item is removed, no copy is made (and everything on the list has been removed).
41 * Items are removed from the list via removefromlist(). Used in this manner the memory management policy is FILO (first in, last out);
42 * however, application code can call elementaccessed(), which moves the item to the top of the list.
43 * When that is done, the memory management policy becomes LRU (least recently used), which should give better results in
44 * almost all cases. As a side note, to actually use texture memory a call to bindcudaarrayA() or bindcudaarrayB() should be made when needed.
45 * A corresponding call to unbindcudaarryA() or unbindcudaarryB() must be made once texture memory is no longer needed. These operations do not actually
46 * move data around; they just bind it to a texture object, and texture objects are very limited resources. There are just two such texture objects,
47 * known as texA and texB. These can be used in the actual CUDA code that nvcc compiles (in directory libEM/cuda).
48 * Note that elementaccessed() is called every time getcudarwdata(), getcudarodata() or isrodataongpu() is called. Hence LRU is used by default,
49 * and you are forced to use these getter functions because cudarwdata and cudarodata are private. You could get around this inside EMData
50 * member functions, though.
51 * Note that concurrency issues can arise because, when data is copied between the host and the GPU, two copies exist.
52 * To account for this, CUDA functions can call setdirtybit(), which causes a copy back from GPU to host whenever
53 * get_data() is called (this function is the getter for EMData's rdata). Currently this mechanism is not in use because it has not been
54 * debugged, so whenever get_data() is called and there is data on the GPU, a copy from GPU to CPU is made irrespective
55 * of whether or not the data on the CPU and GPU are the same.
56*/
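/*
 * Usage sketch: a minimal round trip that stages an image on the device, lets GPU code
 * work on it, and brings the result back. copy_to_cuda()/copy_from_device() are defined
 * below; the image size and the kernel launch are assumptions made only for illustration.
 *
 *   EMData img;
 *   img.set_size(128, 128, 128);                  // host-side rdata allocation
 *   if (EMData::usecuda && img.copy_to_cuda()) {  // stage an RW device copy and register it in the list
 *       // ... launch CUDA kernels on the device copy here ...
 *       img.copy_from_device();                   // copy the result back and release the device copy
 *   }
 */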
57
58#ifdef EMAN2_USING_CUDA
59
60#include "emdata.h"
61#include "exception.h"
62#include <cuda_runtime_api.h>
63#include <driver_functions.h>
64#include <cuda.h>
65#include <cuda/cuda_util.h>
66#include <cuda/cuda_emfft.h>
67
68using namespace EMAN;
69
70// Static init
71const EMData* EMData::firstinlist = 0;
72const EMData* EMData::lastinlist = 0;
73int EMData::memused = 0;
74int EMData::fudgemem = 1.024E8; //let's leave ~100 MB of 'fudge' memory on the device
75int EMData::cudadevicenum = -1;
76bool EMData::usecuda = 0;
77bool EMData::nocudainit = (getenv("NOCUDAINIT") == NULL) ? 0 : bool(atoi(getenv("NOCUDAINIT")));
78
79bool EMData::copy_to_cuda() const
80{
81 //cout << "copying from host to device RW" << " " << num_bytes << endl;
82 if(rw_alloc()) {
83 memused += num_bytes;
84 cudaError_t error = cudaMemcpy(cudarwdata,rdata,num_bytes,cudaMemcpyHostToDevice);
85 if ( error != cudaSuccess) {
86 //cout << rdata << " " << cudarwdata << endl;
87 throw UnexpectedBehaviorException("CudaMemcpy (host to device) failed:" + string(cudaGetErrorString(error)));
88 }
89 }else{return false;}
90 //setdirtybit() //uncomment this line if you want to ensure that only one effective copy exists on either the host or GPU
91
92 return true;
93}
94
95bool EMData::copy_to_cudaro() const
96{
97
98 //cout << "copying from host to device RO" << " " << num_bytes << endl;
99 if(ro_alloc()) {
100 memused += num_bytes;
101 copy_to_array(rdata, cudarodata, nx, ny, nz, cudaMemcpyHostToDevice);
102 }else{return false;}
103 //setdirtybit() //uncomment this line if you want to ensure that only one effective copy exists on either the host or GPU
104
105 return true;
106}
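/*
 * The read-only path stores the image in a cudaArray so it can later be bound to a texture.
 * get_cuda_array() and copy_to_array() are helpers compiled by nvcc in libEM/cuda and are not
 * shown on this page; a rough sketch of an equivalent 3D allocation and host-to-array copy
 * using the raw runtime API (an assumption, not the actual helper code):
 *
 *   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
 *   cudaExtent ext = make_cudaExtent(nx, ny, nz);      // width is in elements for cudaArrays
 *   cudaArray *arr = 0;
 *   cudaMalloc3DArray(&arr, &desc, ext);
 *
 *   cudaMemcpy3DParms p = {0};
 *   p.srcPtr = make_cudaPitchedPtr((void*)rdata, nx*sizeof(float), nx, ny);
 *   p.dstArray = arr;
 *   p.extent = ext;
 *   p.kind = cudaMemcpyHostToDevice;
 *   cudaMemcpy3D(&p);
 */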
107
108bool EMData::rw_alloc() const
109{
110 if(cudarwdata){return true;} // already exists
111 num_bytes = nxyz*sizeof(float);
112 if(!freeup_devicemem(num_bytes)){return false;}
113 cudaError_t error = cudaMalloc((void**)&cudarwdata,num_bytes);
114 if ( error != cudaSuccess){return false;}
115 if(!cudarodata){
116 addtolist();
117 }else{
118 elementaccessed();
119 }
120 return true;
121}
122
123bool EMData::ro_alloc() const
124{
125 if(cudarodata){return true;} // already exists
126 num_bytes = nxyz*sizeof(float);
127 if(!freeup_devicemem(num_bytes)){return false;}
128 cudarodata = get_cuda_array(nx, ny, nz);
129 if(cudarodata == 0) throw UnexpectedBehaviorException("Bad Array alloc");
130 if(!cudarwdata){
131 addtolist();
132 }else{
133 elementaccessed();
134 }
135
136 return true;
137
138}
139
140void EMData::bindcudaarrayA(const bool intp_mode) const
141{
142 if(cudarodata == 0){throw UnexpectedBehaviorException( "Cuda Array not allocated!!");}
143 if(nz > 1){
144 bind_cuda_array_to_textureA(cudarodata, 3, intp_mode);
145 }else{
146 bind_cuda_array_to_textureA(cudarodata, 2, intp_mode);
147 }
148
149}
150
151void EMData::unbindcudaarryA() const
152{
153
154 if(nz > 1){
155 unbind_cuda_textureA(3);
156 }else{
157 unbind_cuda_textureA(2);
158 }
159
160}
161
162void EMData::bindcudaarrayB(const bool intp_mode) const
163{
164 if(cudarodata == 0){throw UnexpectedBehaviorException( "Cuda Array not allocated!!");}
165 if(nz > 1){
166 bind_cuda_array_to_textureB(cudarodata, 3, intp_mode);
167 }else{
168 bind_cuda_array_to_textureB(cudarodata, 2, intp_mode);
169 }
170
171}
172
173void EMData::unbindcudaarryB() const
174{
175
176 if(nz > 1){
177 unbind_cuda_textureB(3);
178 }else{
179 unbind_cuda_textureB(2);
180 }
181
182}
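/*
 * On the nvcc side (libEM/cuda), texA and texB are texture references that kernels read
 * through after bindcudaarrayA()/bindcudaarrayB() have been called. A sketch of what that
 * could look like with the legacy texture-reference API (illustrative only; the real
 * declarations and kernels live in libEM/cuda and are not shown on this page):
 *
 *   texture<float, 3, cudaReadModeElementType> texA;   // bound via cudaBindTextureToArray(texA, array)
 *
 *   __global__ void copy_through_texA(float *out, int nx, int ny, int nz)
 *   {
 *       int x = blockIdx.x*blockDim.x + threadIdx.x;
 *       int y = blockIdx.y*blockDim.y + threadIdx.y;
 *       int z = blockIdx.z;
 *       if (x < nx && y < ny && z < nz)
 *           out[((size_t)z*ny + y)*nx + x] = tex3D(texA, x + 0.5f, y + 0.5f, z + 0.5f);
 *   }
 */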
183
184bool EMData::copy_from_device(const bool rocpy)
185{
186 if(cudarwdata && !rocpy){
187 if(rdata == 0){rdata = (float*)malloc(num_bytes);} //allocate space if needed, assumes the size hasn't changed (which it hasn't so far)
188 cudaError_t error = cudaMemcpy(rdata,cudarwdata,num_bytes,cudaMemcpyDeviceToHost);
189 if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaMemcpy (device to host) failed:" + string(cudaGetErrorString(error)));
190 rw_free(); //we have the data on either the host or device, not both (prevents concurrency issues)
191 if(cudarodata) ro_free(); // clear any RO data, for call safety
192 } else if (cudarodata && rocpy) {
193 if(rdata == 0){rdata = (float*)malloc(num_bytes);} //allocate space if needed
194 if (nz > 1){
195 cudaExtent extent;
196 extent.width = nx;
197 extent.height = ny;
198 extent.depth = nz;
199 cudaMemcpy3DParms copyParams = {0};
200 copyParams.srcArray = cudarodata;
201 copyParams.dstPtr = make_cudaPitchedPtr((void*)rdata, extent.width*sizeof(float), extent.width, extent.height);
202 copyParams.extent = extent;
203 copyParams.kind = cudaMemcpyDeviceToHost;
204 cudaError_t error = cudaMemcpy3D(&copyParams);
205 if ( error != cudaSuccess) throw UnexpectedBehaviorException( "RO CudaMemcpy (device to host) failed:" + string(cudaGetErrorString(error)));
206 } else{
207 cudaError_t error = cudaMemcpyFromArray(rdata,cudarodata,0,0,num_bytes,cudaMemcpyDeviceToHost);
208 if ( error != cudaSuccess) throw UnexpectedBehaviorException( "RO CudaMemcpy (device to host) failed:" + string(cudaGetErrorString(error)));
209 }
210 ro_free(); //we have the data on either the host or device, not both (prevents concurrency issues)
211 if(cudarwdata) rw_free(); // clear any RW data, for call safety
212 } else {
213 return false;
214 }
215
216 update();
217 return true;
218}
219
220bool EMData::copy_rw_to_ro() const
221{
222
223 if(cudarwdata == 0){return false;}
224
225 if(cudarodata == 0){
226 if(!freeup_devicemem(num_bytes)){return false;}
227 cudarodata = get_cuda_array(nx, ny, nz);
228 if(cudarodata == 0) throw UnexpectedBehaviorException("Bad Array alloc");
229 memused += num_bytes;
230 }
231 //this will copy over any pre-existing data (saves a malloc), though it is not always a safe call
232 copy_to_array(cudarwdata, cudarodata, nx, ny, nz, cudaMemcpyDeviceToDevice);
233 roneedsupdate = 0; //just copied, so no longer need an update
234 elementaccessed(); //Move the image to the top of the list to prevent deletion before use (if it sat at the bottom, it could be evicted when another image is added)
235 return true;
236
237}
238
239// The policy here is that when an EMData object is created, cudarwdata is set to 0 and no memory is allocated. It is
240//only when cudarwdata points to allocated data that the EMData object goes on the list. cudarwdata should NEVER be set directly.
241void EMData::runcuda(float * results) const
242{
243
244 if(results == 0){throw UnexpectedBehaviorException( "Cuda failed!!!");}
245 if(cudarwdata != 0){
246 //rw_free();} //could delete the old data here, but we just overwrite it instead (saves a cudaFree)
247 } else {
248 addtolist(); // now that we are using memory add to the list
249 }
250 cudarwdata = results;
251
252}
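/*
 * runcuda() adopts a device pointer produced elsewhere, typically the output buffer of a
 * routine in libEM/cuda. A hypothetical caller (variable names are assumptions made for
 * illustration) would hand over a freshly allocated buffer like this:
 *
 *   float *d_result = 0;
 *   cudaMalloc((void**)&d_result, result.get_xsize()*result.get_ysize()*result.get_zsize()*sizeof(float));
 *   // ... launch a kernel that writes into d_result ...
 *   result.runcuda(d_result);   // the EMData object now owns and tracks d_result
 */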
253
254void EMData::rw_free() const
255{
256 cudaError_t error = cudaFree(cudarwdata);
257 if ( error != cudaSuccess){
258 cout << rdata << " " << cudarwdata << endl;
259 throw UnexpectedBehaviorException( "CudaFree failed:" + string(cudaGetErrorString(error)));
260 }
261 cudarwdata = 0;
262 memused -= num_bytes;
263 if(!cudarodata){removefromlist();}
264
265}
266
267void EMData::ro_free() const
268{
269 cudaError_t error = cudaFreeArray(cudarodata);
270 if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaFreeArray failed:" + string(cudaGetErrorString(error)));
271 cudarodata = 0;
272 memused -= num_bytes;
273 if(!cudarwdata){removefromlist();}
274
275}
276
277bool EMData::isrodataongpu() const
278{
279 if(cudarodata != 0 && !roneedsupdate){
280 elementaccessed(); //cudarodata is already known to be non-zero here
281 return true;
282 }
283 if(cudarwdata != 0){
284 if(copy_rw_to_ro()){
285 return true;
286 } else {
287 return false;
288 }
289 }else{
290 return false;
291 }
292
293}
294
295bool EMData::freeup_devicemem(const int& num_bytes) const
296{
297 size_t freemem=0, totalmem=0; //initialize to prevent undefined behaviour
298 cudaMemGetInfo(&freemem, &totalmem);
299 //cout << "memusage" << " " << freemem << " " << totalmem << endl;
300 if ((ptrdiff_t(freemem) - ptrdiff_t(fudgemem)) > ptrdiff_t(num_bytes)){
301 return true;
302 }else{
303 //keep on removing stuff until enough memory is available
304 while(lastinlist != 0){
305 if(lastinlist->cudarwdata){
306 //screw the constness, always copy from GPU to host rather than throwing stuff away!!!
307 const_cast<EMData*>(lastinlist)->copy_from_device();
308 }
309 if(lastinlist->cudarodata){
310 const_cast<EMData*>(lastinlist)->copy_from_device(1);
311 }
312 cudaMemGetInfo(&freemem, &totalmem); //update free memory
313 if((ptrdiff_t(freemem) - ptrdiff_t(fudgemem)) > ptrdiff_t(num_bytes)){return true;} //return as soon as enough memory has been freed
314 }
315 }
316
317 return false; //if we failed :(
318}
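/*
 * The budget check above can be read in isolation as: keep at least fudgemem bytes free for
 * CUDA itself, and only report success when the request fits under that margin. A standalone
 * restatement of the check (a sketch, not EMAN2 API):
 *
 *   static bool device_has_room(size_t want_bytes, size_t fudge_bytes)
 *   {
 *       size_t freemem = 0, totalmem = 0;
 *       if (cudaMemGetInfo(&freemem, &totalmem) != cudaSuccess) return false;
 *       return freemem > want_bytes + fudge_bytes;   // leave headroom on the device
 *   }
 */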
319
320void EMData::setdirtybit() const
321{
322 cudadirtybit = 1;
323}
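/*
 * Under the dirty-bit scheme described in the file header, get_data() would only refresh
 * rdata when a device-side routine has flagged a change. A sketch of that logic (not the
 * current behaviour, which copies back whenever device data exists):
 *
 *   // inside a hypothetical get_data():
 *   if (cudadirtybit && (cudarwdata || cudarodata)) {
 *       copy_from_device();    // the device copy is newer; refresh rdata
 *       cudadirtybit = 0;
 *   }
 *   return rdata;
 */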
324
325void EMData::addtolist() const
326{
327 //Adds item to top of list
328 if(firstinlist == 0){ //if this is the first item in the list (first object in list), then make a new list
329 firstinlist = this;
330 lastinlist = this;
331 nextlistitem = 0;
332 prevlistitem = 0;
333 }else{
334 //we add to top of list
335 firstinlist->nextlistitem = this;
336 prevlistitem = firstinlist;
337 nextlistitem = 0;
338 firstinlist = this;
339 }
340
341}
342
343void EMData::elementaccessed() const
344{
345 //Do not move the item to the top of the list if it is already there
346 if(firstinlist == this){return;}
347 removefromlist();
348 addtolist();
349}
350
351void EMData::removefromlist() const
352{
353
354 //remove from list
355 if(firstinlist == lastinlist){ //last item in list....
356 firstinlist = 0;
357 lastinlist = 0;
358 nextlistitem = 0;
359 prevlistitem = 0;
360 return;
361 }
362 if(nextlistitem !=0){
363 nextlistitem->prevlistitem = prevlistitem; //this object is not first in the list
364 }else{
365 firstinlist = prevlistitem;
366 }
367 if(prevlistitem !=0){
368 prevlistitem->nextlistitem = nextlistitem; //this item is not last in the list
369 }else{
370 lastinlist = nextlistitem;
371 }
372 nextlistitem = 0;
373 prevlistitem = 0;
374
375}
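/*
 * The three list functions above implement an intrusive doubly linked list in which
 * firstinlist is the most recently used item and lastinlist is the eviction candidate.
 * The same pointer bookkeeping, restated on a plain struct (a sketch with illustrative
 * names, not EMAN2 API):
 *
 *   struct Node { Node *next; Node *prev; };    // next -> newer item, prev -> older item
 *
 *   struct MruList {
 *       Node *first;                            // newest (firstinlist)
 *       Node *last;                             // oldest (lastinlist), evicted first
 *       MruList() : first(0), last(0) {}
 *       void push_front(Node *n) {              // addtolist()
 *           n->next = 0; n->prev = first;
 *           if (first) first->next = n; else last = n;
 *           first = n;
 *       }
 *       void remove(Node *n) {                  // removefromlist()
 *           if (n->next) n->next->prev = n->prev; else first = n->prev;
 *           if (n->prev) n->prev->next = n->next; else last = n->next;
 *           n->next = 0; n->prev = 0;
 *       }
 *       void touch(Node *n) {                   // elementaccessed()
 *           if (n != first) { remove(n); push_front(n); }
 *       }
 *   };
 */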
376
377void EMData::switchoncuda()
378{
379 EMData::usecuda = 1;
380}
381
382void EMData::switchoffcuda()
383{
384 EMData::usecuda = 0;
385}
386
387void EMData::cuda_cleanup()
388{
389 do_cuda_fft_cache_destroy();
390 //Clean up any object mess.... CUDA has OCD
391 while(lastinlist){
392 if(lastinlist->cudarwdata) lastinlist->rw_free();
393 if(lastinlist && lastinlist->cudarodata) lastinlist->ro_free();
394 }
395 //Exit CUDA threads
396 cudaThreadExit();
397 //Free the CUDA device lock
398 if(EMData::cudadevicenum >= 0)
399 {
400 char filename[64];
401 snprintf(filename,sizeof(filename),"%s%d",cudalockfile,EMData::cudadevicenum); //Only works for Linux
402 remove(filename);
403 }
404
405}
406
407bool EMData::cuda_initialize()
408{
409 if(EMData::nocudainit) return 0;
410 int device = device_init();
411
412 if(device != -1)
413 {
414 EMData::cudadevicenum = device;
415 switchoncuda();
416 return 1;
417 } else {
418 switchoffcuda();
419 return 0;
420 }
421}
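/*
 * Typical application-level usage (a sketch): initialize once at startup and release the
 * device lock at shutdown.
 *
 *   int main(int argc, char *argv[])
 *   {
 *       EMData::cuda_initialize();   // sets usecuda and cudadevicenum on success
 *       // ... processing; EMData objects migrate to and from the GPU as needed ...
 *       EMData::cuda_cleanup();      // flush the list, exit CUDA, remove the lock file
 *       return 0;
 *   }
 */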
422
423const char* EMData::getcudalock()
424{
425 return cudalockfile;
426}
427
428#endif //EMAN2_USING_CUDA