tworking on multi-GPU call for interact routine - sphere - GPU-based 3D discret… | |
git clone git://src.adamsgaard.dk/sphere | |
Log | |
Files | |
Refs | |
LICENSE | |
--- | |
commit 983734d17bb48ec0d5191c2e18e66808584afc95 | |
parent b620d26c6032c277e46b2e68fc566449bcdc015b | |
Author: Anders Damsgaard <[email protected]> | |
Date: Mon, 30 Jun 2014 12:43:30 +0200 | |
working on multi-GPU call for interact routine | |
Diffstat: | |
M src/device.cu | 113 +++++++++++++++++++++++++++++… | |
M src/sphere.h | 37 +++++++++++++++++++++++++++++… | |
2 files changed, 148 insertions(+), 2 deletions(-) | |
--- | |
diff --git a/src/device.cu b/src/device.cu | |
t@@ -340,6 +340,119 @@ __host__ void DEM::allocateGlobalDeviceMemory(void) | |
std::cout << "Done" << std::endl; | |
} | |
+// Allocate global memory on other devices required for "interact" function. | |
+// The values of domain_size[ndevices] must be set beforehand. | |
+__host__ void DEM::allocateHelperDeviceMemory(void) | |
+{ | |
+ // Particle memory size | |
+ unsigned int memSizeF4 = sizeof(Float4) * np; | |
+ | |
+ // Initialize pointers to per-GPU arrays | |
+ hdev_gridParticleIndex = (unsigned**)malloc(ndevices*sizeof(unsigned*)); | |
+ hdev_gridCellStart = (unsigned**)malloc(ndevices*sizeof(unsigned*)); | |
+ hdev_gridCellEnd = (unsigned**)malloc(ndevices*sizeof(unsigned*)); | |
+ hdev_x = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_x_sorted = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_vel = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_vel_sorted = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_angvel = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_angvel_sorted = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_walls_nx = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_walls_mvfd = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_distmod = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ | |
+ hdev_force = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_torque = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_delta_t = (Float4**)malloc(ndevices*sizeof(Float4*)); | |
+ hdev_es_dot = (Float**)malloc(ndevices*sizeof(Float*)); | |
+ hdev_es = (Float**)malloc(ndevices*sizeof(Float*)); | |
+ hdev_ev_dot = (Float**)malloc(ndevices*sizeof(Float*)); | |
+ hdev_ev = (Float**)malloc(ndevices*sizeof(Float*)); | |
+ hdev_p = (Float**)malloc(ndevices*sizeof(Float*)); | |
+ hdev_walls_force_pp = (Float**)malloc(ndevices*sizeof(Float*)); | |
+ hdev_contacts = (unsigned**)malloc(ndevices*sizeof(unsigned*)); | |
+ | |
+ for (int d=0; d<ndevices; d++) { | |
+ | |
+ // do not allocate memory on primary GPU | |
+ if (d == device) | |
+ continue; | |
+ | |
+ cudaSetDevice(d); | |
+ | |
+ // allocate space for full input arrays for interact() | |
+ cudaMalloc((void**)&hdev_gridParticleIndex[d], sizeof(unsigned int)*np… | |
+ cudaMalloc((void**)&hdev_gridCellStart[d], sizeof(unsigned int) | |
+ *grid.num[0]*grid.num[1]*grid.num[2]); | |
+ cudaMalloc((void**)&hdev_gridCellEnd[d], sizeof(unsigned int) | |
+ *grid.num[0]*grid.num[1]*grid.num[2]); | |
+ cudaMalloc((void**)&hdev_x[d], memSizeF4); | |
+ cudaMalloc((void**)&hdev_x_sorted[d], memSizeF4); | |
+ cudaMalloc((void**)&hdev_vel[d], memSizeF4); | |
+ cudaMalloc((void**)&hdev_vel_sorted[d], memSizeF4); | |
+ cudaMalloc((void**)&hdev_angvel[d], memSizeF4); | |
+ cudaMalloc((void**)&hdev_angvel_sorted[d], memSizeF4); | |
+ cudaMalloc((void**)&hdev_walls_nx[d], sizeof(Float4)*walls.nw); | |
+ cudaMalloc((void**)&hdev_walls_mvfd[d], sizeof(Float4)*walls.nw); | |
+ cudaMalloc((void**)&hdev_distmod[d], memSizeF4*NC); | |
+ | |
+ // allocate space for partial output arrays for interact() | |
+ cudaMalloc((void**)&hdev_force[d], sizeof(Float4)*domain_size[d]); | |
+ cudaMalloc((void**)&hdev_torque[d], sizeof(Float4)*domain_size[d]); | |
+ cudaMalloc((void**)&hdev_es_dot[d], sizeof(Float)*domain_size[d]); | |
+ cudaMalloc((void**)&hdev_ev_dot[d], sizeof(Float)*domain_size[d]); | |
+ cudaMalloc((void**)&hdev_es[d], sizeof(Float)*domain_size[d]); | |
+ cudaMalloc((void**)&hdev_ev[d], sizeof(Float)*domain_size[d]); | |
+ cudaMalloc((void**)&hdev_p[d], sizeof(Float)*domain_size[d]); | |
+ cudaMalloc((void**)&hdev_walls_force_pp[d], | |
+ sizeof(Float)*domain_size[d]*walls.nw); | |
+ cudaMalloc((void**)&hdev_contacts[d], | |
+ sizeof(unsigned)*domain_size[d]*NC); | |
+ cudaMalloc((void**)&hdev_delta_t[d], sizeof(Float4)*domain_size[d]*NC); | |
+ | |
+ checkForCudaErrors("During allocateGlobalDeviceMemoryOtherDevices"); | |
+ } | |
+ cudaSetDevice(device); // select main GPU | |
+} | |
+ | |
+__host__ void DEM::freeHelperDeviceMemory() | |
+{ | |
+ for (int d=0; d<ndevices; d++) { | |
+ | |
+ // do not allocate memory on primary GPU | |
+ if (d == device) | |
+ continue; | |
+ | |
+ cudaSetDevice(d); | |
+ | |
+ cudaFree(hdev_gridParticleIndex[d]); | |
+ cudaFree(hdev_gridCellStart[d]); | |
+ cudaFree(hdev_gridCellEnd[d]); | |
+ cudaFree(hdev_x[d]); | |
+ cudaFree(hdev_vel[d]); | |
+ cudaFree(hdev_vel_sorted[d]); | |
+ cudaFree(hdev_angvel[d]); | |
+ cudaFree(hdev_angvel_sorted[d]); | |
+ cudaFree(hdev_walls_nx[d]); | |
+ cudaFree(hdev_walls_mvfd[d]); | |
+ cudaFree(hdev_distmod[d]); | |
+ | |
+ cudaFree(hdev_force[d]); | |
+ cudaFree(hdev_torque[d]); | |
+ cudaFree(hdev_es_dot[d]); | |
+ cudaFree(hdev_ev_dot[d]); | |
+ cudaFree(hdev_es[d]); | |
+ cudaFree(hdev_ev[d]); | |
+ cudaFree(hdev_p[d]); | |
+ cudaFree(hdev_walls_force_pp[d]); | |
+ cudaFree(hdev_contacts[d]); | |
+ cudaFree(hdev_delta_t[d]); | |
+ | |
+ checkForCudaErrors("During helper device cudaFree calls"); | |
+ } | |
+ cudaSetDevice(device); // select primary GPU | |
+} | |
+ | |
__host__ void DEM::freeGlobalDeviceMemory() | |
{ | |
if (verbose == 1) | |
diff --git a/src/sphere.h b/src/sphere.h | |
t@@ -53,8 +53,10 @@ class DEM { | |
unsigned int width; | |
unsigned int height; | |
- int ndevices; // number of CUDA GPUs | |
- int device; // primary GPU | |
+ // Device management | |
+ int ndevices; // number of CUDA GPUs | |
+ int device; // primary GPU | |
+ int* domain_size; // elements per GPU | |
// DEVICE ARRAYS | |
t@@ -137,6 +139,10 @@ class DEM { | |
void allocateGlobalDeviceMemory(); | |
void rt_allocateGlobalDeviceMemory(); | |
+ // Allocate global memory on helper devices | |
+ void allocateHelperDeviceMemory(); | |
+ void freeHelperDeviceMemory(); | |
+ | |
// Free dynamically allocated global device memory | |
void freeGlobalDeviceMemory(); | |
void rt_freeGlobalDeviceMemory(); | |
t@@ -208,6 +214,33 @@ class DEM { | |
Float* dev_ns_div_tau_z; // div(tau) on z-face | |
Float3* dev_ns_f_pf; // Interaction force on particles | |
+ // Helper device arrays, input | |
+ unsigned int** hdev_gridParticleIndex; | |
+ unsigned int** hdev_gridCellStart; | |
+ unsigned int** hdev_gridCellEnd; | |
+ Float4** hdev_x; | |
+ Float4** hdev_x_sorted; | |
+ Float4** hdev_vel; | |
+ Float4** hdev_vel_sorted; | |
+ Float4** hdev_angvel; | |
+ Float4** hdev_angvel_sorted; | |
+ Float4** hdev_walls_nx; | |
+ Float4** hdev_walls_mvfd; | |
+ Float4** hdev_distmod; | |
+ | |
+ // Helper device arrays, output | |
+ Float4** hdev_force; | |
+ Float4** hdev_torque; | |
+ Float4** hdev_delta_t; | |
+ Float** hdev_es_dot; | |
+ Float** hdev_es; | |
+ Float** hdev_ev_dot; | |
+ Float** hdev_ev; | |
+ Float** hdev_p; | |
+ Float** hdev_walls_force_pp; | |
+ unsigned int** hdev_contacts; | |
+ | |
+ | |
//// Navier Stokes functions | |
// Memory allocation |