Introduction
Introduction Statistics Contact Development Disclaimer Help
working on multi-GPU call for interact routine - sphere - GPU-based 3D discret…
git clone git://src.adamsgaard.dk/sphere
Log
Files
Refs
LICENSE
---
commit 983734d17bb48ec0d5191c2e18e66808584afc95
parent b620d26c6032c277e46b2e68fc566449bcdc015b
Author: Anders Damsgaard <[email protected]>
Date: Mon, 30 Jun 2014 12:43:30 +0200
working on multi-GPU call for interact routine
Diffstat:
M src/device.cu | 113 +++++++++++++++++++++++++++++…
M src/sphere.h | 37 +++++++++++++++++++++++++++++…
2 files changed, 148 insertions(+), 2 deletions(-)
---
diff --git a/src/device.cu b/src/device.cu
@@ -340,6 +340,119 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
std::cout << "Done" << std::endl;
}
+// Allocate global memory on every CUDA device other than the primary one (`device`) for the arrays used by the "interact" function. The host-side hdev_* tables (one device pointer per GPU) are allocated here too.
+// Precondition: domain_size[d] must be set for each device index d < ndevices beforehand, because it sizes the partial output arrays below.
+__host__ void DEM::allocateHelperDeviceMemory(void)
+{
+ // Particle memory size: number of bytes for np Float4 elements
+ unsigned int memSizeF4 = sizeof(Float4) * np;
+
+ // Allocate host-side tables with ndevices entries each, one device pointer per GPU. malloc() leaves the entries uninitialized and its result is not checked here.
+ hdev_gridParticleIndex = (unsigned**)malloc(ndevices*sizeof(unsigned*));
+ hdev_gridCellStart = (unsigned**)malloc(ndevices*sizeof(unsigned*));
+ hdev_gridCellEnd = (unsigned**)malloc(ndevices*sizeof(unsigned*));
+ hdev_x = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_x_sorted = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_vel = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_vel_sorted = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_angvel = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_angvel_sorted = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_walls_nx = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_walls_mvfd = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_distmod = (Float4**)malloc(ndevices*sizeof(Float4*));
+
+ hdev_force = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_torque = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_delta_t = (Float4**)malloc(ndevices*sizeof(Float4*));
+ hdev_es_dot = (Float**)malloc(ndevices*sizeof(Float*));
+ hdev_es = (Float**)malloc(ndevices*sizeof(Float*));
+ hdev_ev_dot = (Float**)malloc(ndevices*sizeof(Float*));
+ hdev_ev = (Float**)malloc(ndevices*sizeof(Float*));
+ hdev_p = (Float**)malloc(ndevices*sizeof(Float*));
+ hdev_walls_force_pp = (Float**)malloc(ndevices*sizeof(Float*));
+ hdev_contacts = (unsigned**)malloc(ndevices*sizeof(unsigned*));
+
+ for (int d=0; d<ndevices; d++) {
+
+ // do not allocate memory on primary GPU; its entries in the hdev_* tables are left unset by this function
+ if (d == device)
+ continue;
+
+ cudaSetDevice(d); // the cudaMalloc calls below allocate on device d
+
+ // allocate space for full input arrays for interact(): np-sized particle arrays, one entry per grid cell (grid.num[0]*grid.num[1]*grid.num[2]), and walls.nw wall entries
+ cudaMalloc((void**)&hdev_gridParticleIndex[d], sizeof(unsigned int)*np…
+ cudaMalloc((void**)&hdev_gridCellStart[d], sizeof(unsigned int)
+ *grid.num[0]*grid.num[1]*grid.num[2]);
+ cudaMalloc((void**)&hdev_gridCellEnd[d], sizeof(unsigned int)
+ *grid.num[0]*grid.num[1]*grid.num[2]);
+ cudaMalloc((void**)&hdev_x[d], memSizeF4);
+ cudaMalloc((void**)&hdev_x_sorted[d], memSizeF4);
+ cudaMalloc((void**)&hdev_vel[d], memSizeF4);
+ cudaMalloc((void**)&hdev_vel_sorted[d], memSizeF4);
+ cudaMalloc((void**)&hdev_angvel[d], memSizeF4);
+ cudaMalloc((void**)&hdev_angvel_sorted[d], memSizeF4);
+ cudaMalloc((void**)&hdev_walls_nx[d], sizeof(Float4)*walls.nw);
+ cudaMalloc((void**)&hdev_walls_mvfd[d], sizeof(Float4)*walls.nw);
+ cudaMalloc((void**)&hdev_distmod[d], memSizeF4*NC); // NC Float4 entries per particle; NC is defined outside this function
+
+ // allocate space for partial output arrays for interact(), sized by this device's domain_size[d] rather than np
+ cudaMalloc((void**)&hdev_force[d], sizeof(Float4)*domain_size[d]);
+ cudaMalloc((void**)&hdev_torque[d], sizeof(Float4)*domain_size[d]);
+ cudaMalloc((void**)&hdev_es_dot[d], sizeof(Float)*domain_size[d]);
+ cudaMalloc((void**)&hdev_ev_dot[d], sizeof(Float)*domain_size[d]);
+ cudaMalloc((void**)&hdev_es[d], sizeof(Float)*domain_size[d]);
+ cudaMalloc((void**)&hdev_ev[d], sizeof(Float)*domain_size[d]);
+ cudaMalloc((void**)&hdev_p[d], sizeof(Float)*domain_size[d]);
+ cudaMalloc((void**)&hdev_walls_force_pp[d],
+ sizeof(Float)*domain_size[d]*walls.nw);
+ cudaMalloc((void**)&hdev_contacts[d],
+ sizeof(unsigned)*domain_size[d]*NC);
+ cudaMalloc((void**)&hdev_delta_t[d], sizeof(Float4)*domain_size[d]*NC);
+
+ checkForCudaErrors("During allocateGlobalDeviceMemoryOtherDevices"); // NOTE(review): the individual cudaMalloc return values are ignored, and this label does not match the function name
+ }
+ cudaSetDevice(device); // select main GPU
+}
+
+// Release the global memory that allocateHelperDeviceMemory() reserved on
+// every CUDA device other than the primary one (`device`).
+//
+// Each buffer allocated there is freed here, including hdev_x_sorted, which
+// was previously leaked on each helper GPU. The host-side hdev_* pointer
+// tables themselves are not released by this function.
+// The primary GPU is selected again before returning.
+__host__ void DEM::freeHelperDeviceMemory()
+{
+    for (int d=0; d<ndevices; d++) {
+
+        // nothing was allocated on the primary GPU, so nothing to free there
+        if (d == device)
+            continue;
+
+        // cudaFree must be issued with the owning device selected
+        cudaSetDevice(d);
+
+        // full input arrays for interact()
+        cudaFree(hdev_gridParticleIndex[d]);
+        cudaFree(hdev_gridCellStart[d]);
+        cudaFree(hdev_gridCellEnd[d]);
+        cudaFree(hdev_x[d]);
+        cudaFree(hdev_x_sorted[d]);
+        cudaFree(hdev_vel[d]);
+        cudaFree(hdev_vel_sorted[d]);
+        cudaFree(hdev_angvel[d]);
+        cudaFree(hdev_angvel_sorted[d]);
+        cudaFree(hdev_walls_nx[d]);
+        cudaFree(hdev_walls_mvfd[d]);
+        cudaFree(hdev_distmod[d]);
+
+        // partial output arrays for interact()
+        cudaFree(hdev_force[d]);
+        cudaFree(hdev_torque[d]);
+        cudaFree(hdev_es_dot[d]);
+        cudaFree(hdev_ev_dot[d]);
+        cudaFree(hdev_es[d]);
+        cudaFree(hdev_ev[d]);
+        cudaFree(hdev_p[d]);
+        cudaFree(hdev_walls_force_pp[d]);
+        cudaFree(hdev_contacts[d]);
+        cudaFree(hdev_delta_t[d]);
+
+        checkForCudaErrors("During helper device cudaFree calls");
+    }
+    cudaSetDevice(device); // select primary GPU
+}
+
__host__ void DEM::freeGlobalDeviceMemory()
{
if (verbose == 1)
diff --git a/src/sphere.h b/src/sphere.h
@@ -53,8 +53,10 @@ class DEM {
unsigned int width;
unsigned int height;
- int ndevices; // number of CUDA GPUs
- int device; // primary GPU
+ // Device management
+ int ndevices; // number of CUDA GPUs
+ int device; // primary GPU
+ int* domain_size; // elements per GPU
// DEVICE ARRAYS
@@ -137,6 +139,10 @@ class DEM {
void allocateGlobalDeviceMemory();
void rt_allocateGlobalDeviceMemory();
+ // Allocate and free global memory on helper devices (every GPU except the primary one)
+ void allocateHelperDeviceMemory();
+ void freeHelperDeviceMemory();
+
// Free dynamically allocated global device memory
void freeGlobalDeviceMemory();
void rt_freeGlobalDeviceMemory();
@@ -208,6 +214,33 @@ class DEM {
Float* dev_ns_div_tau_z; // div(tau) on z-face
Float3* dev_ns_f_pf; // Interaction force on particles
+ // Helper device arrays, input: each member is a host table holding one device pointer per GPU, indexed by device number
+ unsigned int** hdev_gridParticleIndex;
+ unsigned int** hdev_gridCellStart;
+ unsigned int** hdev_gridCellEnd;
+ Float4** hdev_x;
+ Float4** hdev_x_sorted;
+ Float4** hdev_vel;
+ Float4** hdev_vel_sorted;
+ Float4** hdev_angvel;
+ Float4** hdev_angvel_sorted;
+ Float4** hdev_walls_nx;
+ Float4** hdev_walls_mvfd;
+ Float4** hdev_distmod;
+
+ // Helper device arrays, output: same one-pointer-per-GPU layout; the device buffers are sized from domain_size[d]
+ Float4** hdev_force;
+ Float4** hdev_torque;
+ Float4** hdev_delta_t;
+ Float** hdev_es_dot;
+ Float** hdev_es;
+ Float** hdev_ev_dot;
+ Float** hdev_ev;
+ Float** hdev_p;
+ Float** hdev_walls_force_pp;
+ unsigned int** hdev_contacts;
+
+
//// Navier Stokes functions
// Memory allocation
You are viewing proxied material from mx1.adamsgaard.dk. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.