tFurther work on implementing OO - sphere - GPU-based 3D discrete element metho… | |
git clone git://src.adamsgaard.dk/sphere | |
Log | |
Files | |
Refs | |
LICENSE | |
--- | |
commit 287f0082450f6709173703f7a43f708676d4f29e | |
parent bd8883f07cd1e54218c7cde17e6376d52918727c | |
Author: Anders Damsgaard Christensen <[email protected]> | |
Date: Fri, 26 Oct 2012 23:05:19 +0200 | |
Further work on implementing OO | |
Diffstat: | |
M src/constants.cuh | 1 + | |
M src/contactsearch.cuh | 20 +++++++++----------- | |
M src/datatypes.h | 31 ++++++++++++++++++++++++-----… | |
M src/device.cu | 512 ++++++++++++++---------------… | |
M src/integration.cuh | 4 ++-- | |
M src/sorting.cuh | 19 ++++++++----------- | |
M src/sphere.h | 31 ++++++++++++++++-------------… | |
7 files changed, 297 insertions(+), 321 deletions(-) | |
--- | |
diff --git a/src/constants.cuh b/src/constants.cuh | |
t@@ -10,6 +10,7 @@ | |
__constant__ unsigned int devC_nd; // Number of dimensions | |
__constant__ unsigned int devC_np; // Number of particles | |
__constant__ int devC_nc; // Max. number of contacts a particle can… | |
+__constant__ Float devC_dt; // Computational time step length | |
// Device constant memory structures | |
__constant__ Params devC_params; | |
diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh | |
t@@ -87,7 +87,6 @@ __device__ void findAndProcessContactsInCell(int3 targetCell, | |
Float* es_dot, Float* ev_dot, | |
Float* p, | |
Float4* dev_x_sorted, | |
- Float* dev_radius_sorted, | |
Float4* dev_vel_sorted, | |
Float4* dev_angvel_sorted, | |
unsigned int* dev_cellStart, | |
t@@ -125,7 +124,7 @@ __device__ void findAndProcessContactsInCell(int3 targetCe… | |
// Fetch position and velocity of particle B. | |
Float4 x_b = dev_x_sorted[idx_b]; | |
- Float radius_b = dev_radius_sorted[idx_b]; | |
+ Float radius_b = x_b.w; | |
Float kappa = devC_params.kappa; | |
// Distance between particle centers (Float4 -> Float3) | |
t@@ -183,7 +182,6 @@ __device__ void findContactsInCell(int3 targetCell, | |
unsigned int idx_a, | |
Float4 x_a, Float radius_a, | |
Float4* dev_x_sorted, | |
- Float* dev_radius_sorted, | |
unsigned int* dev_cellStart, | |
unsigned int* dev_cellEnd, | |
unsigned int* dev_gridParticleIndex, | |
t@@ -226,7 +224,7 @@ __device__ void findContactsInCell(int3 targetCell, | |
// Fetch position and radius of particle B. | |
Float4 x_b = dev_x_sorted[idx_b]; | |
- Float radius_b = dev_radius_sorted[idx_b]; | |
+ Float radius_b = x_b.w; | |
// Read the original index of particle B | |
unsigned int idx_b_orig = dev_gridParticleIndex[idx_b]; | |
t@@ -316,7 +314,7 @@ __device__ void findContactsInCell(int3 targetCell, | |
__global__ void topology(unsigned int* dev_cellStart, | |
unsigned int* dev_cellEnd, // Input: Particles in… | |
unsigned int* dev_gridParticleIndex, // Input: Unsort… | |
- Float4* dev_x_sorted, Float* dev_radius_sorted, | |
+ Float4* dev_x_sorted, | |
unsigned int* dev_contacts, | |
Float4* dev_distmod) | |
{ | |
t@@ -325,7 +323,7 @@ __global__ void topology(unsigned int* dev_cellStart, | |
if (idx_a < devC_params.np) { | |
// Fetch particle data in global read | |
Float4 x_a = dev_x_sorted[idx_a]; | |
- Float radius_a = dev_radius_sorted[idx_a]; | |
+ Float radius_a = x_a.w; | |
// Count the number of contacts in this time step | |
int nc = 0; | |
t@@ -346,7 +344,7 @@ __global__ void topology(unsigned int* dev_cellStart, | |
for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis | |
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim); | |
findContactsInCell(targetPos, idx_a, x_a, radius_a, | |
- dev_x_sorted, dev_radius_sorted, | |
+ dev_x_sorted, | |
dev_cellStart, dev_cellEnd, | |
dev_gridParticleIndex, | |
&nc, dev_contacts, dev_distmod); | |
t@@ -371,8 +369,8 @@ __global__ void topology(unsigned int* dev_cellStart, | |
__global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsort… | |
unsigned int* dev_cellStart, | |
unsigned int* dev_cellEnd, | |
- Float4* dev_x, Float* dev_radius, | |
- Float4* dev_x_sorted, Float* dev_radius_sorted, | |
+ Float4* dev_x, | |
+ Float4* dev_x_sorted, | |
Float4* dev_vel_sorted, Float4* dev_angvel_sorted, | |
Float4* dev_vel, Float4* dev_angvel, | |
Float4* dev_force, Float4* dev_torque, | |
t@@ -395,7 +393,7 @@ __global__ void interact(unsigned int* dev_gridParticleInd… | |
// Fetch particle data in global read | |
unsigned int idx_a_orig = dev_gridParticleIndex[idx_a]; | |
Float4 x_a = dev_x_sorted[idx_a]; | |
- Float radius_a = dev_radius_sorted[idx_a]; | |
+ Float radius_a = x_a.w; | |
// Fetch wall data in global read | |
Float4 w_up_nx = dev_w_nx[0]; | |
t@@ -521,7 +519,7 @@ __global__ void interact(unsigned int* dev_gridParticleInd… | |
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim); | |
findAndProcessContactsInCell(targetPos, idx_a, x_a, radius_a, | |
&F, &T, &es_dot, &ev_dot, &p, | |
- dev_x_sorted, dev_radius_sorted, | |
+ dev_x_sorted, | |
dev_vel_sorted, dev_angvel_sorted, | |
dev_cellStart, dev_cellEnd, | |
dev_w_nx, dev_w_mvfd); | |
diff --git a/src/datatypes.h b/src/datatypes.h | |
t@@ -15,13 +15,18 @@ | |
// Structure containing kinematic particle values | |
struct Kinematics { | |
- Float4 *x; // Positions + radii (w) | |
- Float2 *xysum; // Horizontal distance traveled | |
- Float4 *vel; // Translational velocities + fixvels (w) | |
- Float4 *force; // Sums of forces | |
- Float4 *angpos; // Angular positions | |
- Float4 *angvel; // Angular velocities | |
- Float4 *torque; // Sums of torques | |
+ Float4 *x; // Positions + radii (w) | |
+ Float2 *xysum; // Horizontal distance traveled | |
+ Float4 *vel; // Translational velocities + fixvels (w) | |
+ Float4 *acc; // Translational accelerations | |
+ Float4 *force; // Sums of forces | |
+ Float4 *angpos; // Angular positions | |
+ Float4 *angvel; // Angular velocities | |
+ Float4 *angacc; // Angular accelerations | |
+ Float4 *torque; // Sums of torques | |
+ unsigned int *contacts; // List of contacts per particle | |
+ Float4 *distmod; // Distance modifiers for contacts across periodic… | |
+ Float4 *delta_t; // Accumulated shear distance of contacts | |
}; | |
// Structure containing individual physical particle parameters | |
t@@ -42,6 +47,17 @@ struct Grid { | |
int periodic; // Behavior of boundaries at 1st and 2nd world … | |
}; | |
+struct Sorting { | |
+ Float4 *x_sorted; // Positions + radii (w) (sorted) | |
+ Float4 *vel_sorted; // Translational velocities + fixvels (w) (sort… | |
+ Float4 *angvel_sorted; // Angular velocities (sorted) | |
+ unsigned int *gridParticleCellID; // Hash key (cell index) from position in … | |
+ unsigned int *gridParticleIndex; // Original indexes of particles | |
+ unsigned int *cellStart; // First index of sorted idx'es in cells | |
+ unsigned int *cellEnd; // Last index of sorted idx'es in cells | |
+}; | |
+ | |
+ | |
// Structure containing time parameters | |
struct Time { | |
Float dt; // Computational time step length | |
t@@ -76,6 +92,7 @@ struct Walls { | |
int wmode[MAXWALLS]; // Wall modes | |
Float4* nx; // Wall normal and position | |
Float4* mvfd; // Wall mass, velocity, force and dev. stress | |
+ Float* force; // Resulting forces on walls per particle | |
Float gamma_wn; // Wall normal viscosity | |
Float gamma_wt; // Wall tangential viscosity | |
Float gamma_wr; // Wall rolling viscosity | |
diff --git a/src/device.cu b/src/device.cu | |
t@@ -197,6 +197,7 @@ __host__ void DEM::transferToConstantDeviceMemory() | |
cudaMemcpyToSymbol("devC_nd", &nd, sizeof(nd)); | |
cudaMemcpyToSymbol("devC_np", &np, sizeof(np)); | |
cudaMemcpyToSymbol("devC_nc", &NC, sizeof(int)); | |
+ cudaMemcpyToSymbol("devC_dt", &time.dt, sizeof(Float)); | |
cudaMemcpyToSymbol(devC_grid, grid, sizeof(Grid)); | |
cudaMemcpyToSymbol(devC_params, params, sizeof(Params)); | |
t@@ -205,149 +206,178 @@ __host__ void DEM::transferToConstantDeviceMemory() | |
if (verbose == 1) | |
cout << "Done\n"; | |
- checkConstantMemory(); | |
+ //checkConstantMemory(); | |
} | |
-//extern "C" | |
-__host__ void DEM::startTime() | |
+// Allocate device memory for particle variables, | |
+// tied to previously declared pointers in structures | |
+__host__ void DEM::allocateGlobalDeviceMemory(void) | |
{ | |
+ // Particle memory size | |
+ unsigned int memSizeF = sizeof(Float) * np; | |
+ unsigned int memSizeF4 = sizeof(Float4) * np; | |
- using std::cout; // Namespace directive | |
+ if (verbose == 1) | |
+ std::cout << " Allocating device memory: "; | |
- // Copy data to constant global device memory | |
- transferToConstantDeviceMemory(); | |
+ // Particle arrays | |
+ cudaMalloc((void**)&dev_k.x, memSizeF4); | |
+ cudaMalloc((void**)&dev_sort.x_sorted, memSizeF4); | |
+ cudaMalloc((void**)&dev_k.vel, memSizeF4); | |
+ cudaMalloc((void**)&dev_sort.vel_sorted, memSizeF4); | |
+ cudaMalloc((void**)&dev_k.angvel, memSizeF4); | |
+ cudaMalloc((void**)&dev_sort.angvel_sorted, memSizeF4); | |
+ cudaMalloc((void**)&dev_k.acc, memSizeF4); | |
+ k.acc = new Float4[np]; | |
+ cudaMalloc((void**)&dev_k.angacc, memSizeF4); | |
+ k.angacc = new Float4[np]; | |
+ cudaMalloc((void**)&dev_k.force, memSizeF4); | |
+ cudaMalloc((void**)&dev_k.torque, memSizeF4); | |
+ cudaMalloc((void**)&dev_k.angpos, memSizeF4); | |
+ cudaMalloc((void**)&dev_e.es_dot, memSizeF); | |
+ cudaMalloc((void**)&dev_e.ev_dot, memSizeF); | |
+ cudaMalloc((void**)&dev_e.es, memSizeF); | |
+ cudaMalloc((void**)&dev_e.ev, memSizeF); | |
+ cudaMalloc((void**)&dev_e.p, memSizeF); | |
- Float* dev_w_force; // Resulting force on wall per particle | |
- Float* dev_w_force_partial; // Partial sum from block of threads | |
- | |
- // Memory for sorted particle data | |
- Float4* dev_x_sorted; | |
- Float4* dev_vel_sorted; | |
- Float4* dev_angvel_sorted; | |
- Float* dev_radius_sorted; | |
- | |
- // Grid-particle array pointers | |
- unsigned int* dev_gridParticleCellID; | |
- unsigned int* dev_gridParticleIndex; | |
- unsigned int* dev_cellStart; | |
- unsigned int* dev_cellEnd; | |
- | |
- // Particle contact bookkeeping | |
- unsigned int* dev_contacts; | |
- unsigned int* host_contacts = new unsigned int[p.np*NC]; | |
- // Particle pair distance correction across periodic boundaries | |
- Float4* dev_distmod; | |
- Float4* host_distmod = new Float4[p.np*NC]; | |
- // x,y,z contains the interparticle vector, corrected if contact | |
- // is across a periodic boundary. | |
- Float4* dev_delta_t; // Accumulated shear distance of contact | |
- Float4* host_delta_t = new Float4[p.np*NC]; | |
+ // Cell-related arrays | |
+ cudaMalloc((void**)&dev_sort.gridParticleCellID, sizeof(unsigned int)*np); | |
+ cudaMalloc((void**)&dev_sort.gridParticleIndex, sizeof(unsigned int)*np); | |
+ cudaMalloc((void**)&dev_sort.cellStart, sizeof(unsigned int)*grid.num[0]*gri… | |
+ cudaMalloc((void**)&dev_sort.cellEnd, sizeof(unsigned int)*grid.num[0]*grid.… | |
- // Particle memory size | |
- unsigned int memSizeF = sizeof(Float) * p.np; | |
- unsigned int memSizeF4 = sizeof(Float4) * p.np; | |
+ // Particle contact bookkeeping arrays | |
+ cudaMalloc((void**)&dev_k.contacts, sizeof(unsigned int)*np*NC); // Max NC c… | |
+ cudaMalloc((void**)&dev_k.distmod, sizeof(Float4)*np*NC); | |
+ cudaMalloc((void**)&dev_k.delta_t, sizeof(Float4)*np*NC); | |
+ | |
+ // Host contact bookkeeping arrays | |
+ k.contacts = new unsigned int[np*NC]; | |
+ // Initialize contacts lists to np | |
+ for (unsigned int i=0; i<(np*NC); ++i) | |
+ k.contacts[i] = np; | |
+ k.distmod = new Float4[np*NC]; | |
+ k.delta_t = new Float4[np*NC]; | |
- // Allocate device memory for particle variables, | |
- // tied to previously declared pointers in structures | |
- allocateGlobalDeviceMemory(); | |
- cout << " Allocating device memory: "; | |
+ // Wall arrays | |
+ cudaMalloc((void**)&dev_walls.nx, sizeof(Float4)*walls.nw); | |
+ cudaMalloc((void**)&dev_walls.mvfd, sizeof(Float4)*walls.nw); | |
+ cudaMalloc((void**)&dev_walls.force, sizeof(Float)*walls.nw*np); | |
+ // dev_w_force_partial allocated later | |
+ checkForCudaErrors("End of allocateGlobalDeviceMemory"); | |
+ if (verbose == 1) | |
+ std::cout << "Done\n"; | |
+} | |
+ | |
+__host__ void DEM::freeGlobalDeviceMemory() | |
+{ | |
+ if (verbose == 1) | |
+ printf("\nLiberating device memory: "); | |
// Particle arrays | |
- cudaMalloc((void**)&dev_x, memSizeF4); | |
- cudaMalloc((void**)&dev_x_sorted, memSizeF4); | |
- cudaMalloc((void**)&dev_vel, memSizeF4); | |
- cudaMalloc((void**)&dev_vel_sorted, memSizeF4); | |
- cudaMalloc((void**)&dev_angvel, memSizeF4); | |
- cudaMalloc((void**)&dev_angvel_sorted, memSizeF4); | |
- cudaMalloc((void**)&dev_acc, memSizeF4); | |
- cudaMalloc((void**)&dev_angacc, memSizeF4); | |
- cudaMalloc((void**)&dev_force, memSizeF4); | |
- cudaMalloc((void**)&dev_torque, memSizeF4); | |
- cudaMalloc((void**)&dev_angpos, memSizeF4); | |
- cudaMalloc((void**)&dev_radius, memSizeF); | |
- cudaMalloc((void**)&dev_radius_sorted, memSizeF); | |
- cudaMalloc((void**)&dev_es_dot, memSizeF); | |
- cudaMalloc((void**)&dev_ev_dot, memSizeF); | |
- cudaMalloc((void**)&dev_es, memSizeF); | |
- cudaMalloc((void**)&dev_ev, memSizeF); | |
- cudaMalloc((void**)&dev_p, memSizeF); | |
- //cudaMalloc((void**)&dev_bonds, sizeof(uint4) * p.np); | |
- //cudaMalloc((void**)&dev_bonds_sorted, sizeof(uint4) * p.np); | |
+ cudaFree(dev_k.x); | |
+ cudaFree(dev_sort.x_sorted); | |
+ cudaFree(dev_k.vel); | |
+ cudaFree(dev_sort.vel_sorted); | |
+ cudaFree(dev_k.angvel); | |
+ cudaFree(dev_sort.angvel_sorted); | |
+ cudaFree(dev_k.acc); | |
+ cudaFree(dev_k.angacc); | |
+ cudaFree(dev_k.force); | |
+ cudaFree(dev_k.torque); | |
+ cudaFree(dev_k.angpos); | |
+ cudaFree(dev_e.es_dot); | |
+ cudaFree(dev_e.ev_dot); | |
+ cudaFree(dev_e.es); | |
+ cudaFree(dev_e.ev); | |
+ cudaFree(dev_e.p); | |
+ cudaFree(dev_k.contacts); | |
+ cudaFree(dev_k.distmod); | |
+ cudaFree(dev_k.delta_t); | |
// Cell-related arrays | |
- cudaMalloc((void**)&dev_gridParticleCellID, sizeof(unsigned int)*p.np); | |
- cudaMalloc((void**)&dev_gridParticleIndex, sizeof(unsigned int)*p.np); | |
- cudaMalloc((void**)&dev_cellStart, sizeof(unsigned int)*grid.num[0]*grid.num… | |
- cudaMalloc((void**)&dev_cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1… | |
- | |
- // Particle contact bookkeeping arrays | |
- cudaMalloc((void**)&dev_contacts, sizeof(unsigned int)*p.np*NC); // Max NC c… | |
- cudaMalloc((void**)&dev_distmod, sizeof(Float4)*p.np*NC); | |
- cudaMalloc((void**)&dev_delta_t, sizeof(Float4)*p.np*NC); | |
+ cudaFree(dev_sort.gridParticleIndex); | |
+ cudaFree(dev_sort.cellStart); | |
+ cudaFree(dev_sort.cellEnd); | |
// Wall arrays | |
- cudaMalloc((void**)&dev_w_nx, sizeof(Float)*params.nw*4); | |
- cudaMalloc((void**)&dev_w_mvfd, sizeof(Float)*params.nw*4); | |
- cudaMalloc((void**)&dev_w_force, sizeof(Float)*params.nw*p.np); | |
+ cudaFree(dev_walls.nx); | |
+ cudaFree(dev_walls.mvfd); | |
+ cudaFree(dev_walls.force); | |
+ cudaFree(dev_w_force_partial); | |
+ | |
+ if (verbose == 1) | |
+ printf("Done\n"); | |
+} | |
- checkForCudaErrors("Post device memory allocation"); | |
- cout << "Done\n"; | |
- // Transfer data from host to gpu device memory | |
+__host__ void DEM::transferToGlobalDeviceMemory() | |
+{ | |
+ if (verbose == 1) | |
+ std::cout << " Transfering data to the device: "; | |
+ | |
+ // Copy structure data from host to global device memory | |
+ /*cudaMemcpy(dev_k, k, sizeof(Kinematics), cudaMemcpyHostToDevice); | |
+ cudaMemcpy(dev_e, e, sizeof(Energies), cudaMemcpyHostToDevice); | |
+ cudaMemcpy(dev_time, time, sizeof(Time), cudaMemcpyHostToDevice); | |
+ cudaMemcpy(dev_walls, walls, sizeof(Walls), cudaMemcpyHostToDevice);*/ | |
+ cudaMemcpy(dev_k, k, sizeof(k), cudaMemcpyHostToDevice); | |
+ cudaMemcpy(dev_e, e, sizeof(e), cudaMemcpyHostToDevice); | |
+ cudaMemcpy(dev_time, time, sizeof(time), cudaMemcpyHostToDevice); | |
+ cudaMemcpy(dev_walls, walls, sizeof(walls), cudaMemcpyHostToDevice); | |
+ | |
+ checkForCudaErrors("End of transferToGlobalDeviceMemory"); | |
+ if (verbose == 1) | |
+ std::cout << "Done\n"; | |
+} | |
+ | |
+__host__ void DEM::transferToGlobalDeviceMemory() | |
+{ | |
cout << " Transfering data to the device: "; | |
- // Particle data | |
- cudaMemcpy(dev_x, host_x, memSizeF4, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_vel, host_vel, memSizeF4, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_acc, host_acc, memSizeF4, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_angvel, host_angvel, memSizeF4, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_angacc, host_angacc, memSizeF4, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_force, host_force, memSizeF4, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_torque, host_torque, memSizeF4, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_angpos, host_angpos, memSizeF4, cudaMemcpyHostToDevice); | |
- //cudaMemcpy(dev_bonds, host_bonds, sizeof(uint4) * p.np, cudaMemcpyHostToDe… | |
- cudaMemcpy(dev_radius, p.radius, memSizeF, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_es_dot, p.es_dot, memSizeF, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_ev_dot, p.ev_dot, memSizeF, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_es, p.es, memSizeF, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_ev, p.ev, memSizeF, cudaMemcpyHostToDevice); | |
- cudaMemcpy(dev_p, p.p, memSizeF, cudaMemcpyHostToDevice); | |
- | |
- // Wall data (wall mass and number in constant memory) | |
- cudaMemcpy(dev_w_nx, host_w_nx, sizeof(Float)*params.nw*4, cudaMemcpyHostToD… | |
- cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(Float)*params.nw*4, cudaMemcpyHos… | |
- | |
- // Initialize contacts lists to p.np | |
- unsigned int* npu = new unsigned int[p.np*NC]; | |
- for (unsigned int i=0; i<(p.np*NC); ++i) | |
- npu[i] = p.np; | |
- cudaMemcpy(dev_contacts, npu, sizeof(unsigned int)*p.np*NC, cudaMemcpyHostTo… | |
- delete[] npu; | |
- | |
- // Create array of 0.0 values on the host and transfer these to the distance | |
- // modifier and shear displacement arrays | |
- Float4* zerosF4 = new Float4[p.np*NC]; | |
- for (unsigned int i=0; i<(p.np*NC); ++i) | |
- zerosF4[i] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f); | |
- cudaMemcpy(dev_distmod, zerosF4, sizeof(Float4)*p.np*NC, cudaMemcpyHostToDev… | |
- cudaMemcpy(dev_delta_t, zerosF4, sizeof(Float4)*p.np*NC, cudaMemcpyHostToDev… | |
- delete[] zerosF4; | |
- | |
- checkForCudaErrors("Post memcopy"); | |
- cout << "Done\n"; | |
+ // Copy structure data from host to global device memory | |
+ cudaMemcpy(k, dev_k, sizeof(k), cudaMemcpyDeviceToHost); | |
+ cudaMemcpy(e, dev_e, sizeof(e), cudaMemcpyDeviceToHost); | |
+ cudaMemcpy(time, dev_time, sizeof(time), cudaMemcpyDeviceToHost); | |
+ cudaMemcpy(walls, dev_walls, sizeof(walls), cudaMemcpyDeviceToHost); | |
+ | |
+ checkForCudaErrors("End of transferFromGlobalDeviceMemory"); | |
+ if (verbose == 1) | |
+ std::cout << "Done\n"; | |
+} | |
+ | |
+ | |
+// Iterate through time by explicit time integration | |
+__host__ void DEM::startTime() | |
+{ | |
+ | |
+ using std::cout; // Namespace directive | |
+ char file[200]; // Output filename | |
+ FILE *fp; | |
+ | |
+ // Copy data to constant global device memory | |
+ transferToConstantDeviceMemory(); | |
+ | |
+ | |
+ // Particle memory size | |
+ unsigned int memSizeF = sizeof(Float) * np; | |
+ unsigned int memSizeF4 = sizeof(Float4) * np; | |
+ | |
+ // Allocate device memory for particle variables, | |
+ // tied to previously declared pointers in structures | |
+ allocateGlobalDeviceMemory(); | |
+ | |
+ // Transfer data from host to gpu device memory | |
+ transferToGlobalDeviceMemory(); | |
// Synchronization point | |
cudaThreadSynchronize(); | |
- checkForCudaErrors("Start of mainLoop()"); | |
+ checkForCudaErrors("Start of startTime()"); | |
// Model world variables | |
float tic, toc, filetimeclock, time_spent, dev_time_spent; | |
- // File output | |
- FILE* fp; | |
- char file[1000]; // Complete path+filename variable | |
- | |
// Start CPU clock | |
tic = clock(); | |
t@@ -360,22 +390,25 @@ __host__ void DEM::startTime() | |
// Shared memory per block | |
unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1); | |
+ Float* dev_w_force_partial; | |
cudaMalloc((void**)&dev_w_force_partial, sizeof(Float)*dimGrid.x); | |
// Report to stdout | |
- cout << "\n Device memory allocation and transfer complete.\n" | |
- << " - Blocks per grid: " | |
- << dimGrid.x << "*" << dimGrid.y << "*" << dimGrid.z << "\n" | |
- << " - Threads per block: " | |
- << dimBlock.x << "*" << dimBlock.y << "*" << dimBlock.z << "\n" | |
- << " - Shared memory required per block: " << smemSize << " bytes\n"; | |
+ if (verbose == 1) { | |
+ cout << "\n Device memory allocation and transfer complete.\n" | |
+ << " - Blocks per grid: " | |
+ << dimGrid.x << "*" << dimGrid.y << "*" << dimGrid.z << "\n" | |
+ << " - Threads per block: " | |
+ << dimBlock.x << "*" << dimBlock.y << "*" << dimBlock.z << "\n" | |
+ << " - Shared memory required per block: " << smemSize << " bytes\n"; | |
+ } | |
// Initialize counter variable values | |
filetimeclock = 0.0; | |
long iter = 0; | |
// Create first status.dat | |
- sprintf(file,"%s/output/%s.status.dat", cwd, inputbin); | |
+ sprintf(file,"output/%s.status.dat", inputbin); | |
fp = fopen(file, "w"); | |
fprintf(fp,"%2.4e %2.4e %d\n", | |
time.current, | |
t@@ -384,23 +417,17 @@ __host__ void DEM::startTime() | |
fclose(fp); | |
// Write first output data file: output0.bin, thus testing writing of bin fi… | |
- sprintf(file,"%s/output/%s.output0.bin", cwd, inputbin); | |
- if (fwritebin(file, &p, host_x, host_vel, | |
- host_angvel, host_force, | |
- host_torque, host_angpos, | |
- host_bonds, | |
- &grid, &time, ¶ms, | |
- host_w_nx, host_w_mvfd) != 0) { | |
- std::cerr << "\n Problem during fwritebin \n"; | |
- exit(EXIT_FAILURE); | |
+ sprintf(file,"output/%s.output0.bin", inputbin); | |
+ writebin(file); | |
+ | |
+ if (verbose == 1) { | |
+ cout << "\n Entering the main calculation time loop...\n\n" | |
+ << " IMPORTANT: Do not close this terminal, doing so will \n" | |
+ << " terminate this SPHERE process. Follow the \n" | |
+ << " progress by executing:\n" | |
+ << " $ ./sphere_status " << inputbin << "\n\n"; | |
} | |
- cout << "\n Entering the main calculation time loop...\n\n" | |
- << " IMPORTANT: Do not close this terminal, doing so will \n" | |
- << " terminate this SPHERE process. Follow the \n" | |
- << " progress by executing:\n" | |
- << " $ ./sphere_status " << inputbin << "\n\n"; | |
- | |
// Start GPU clock | |
cudaEvent_t dev_tic, dev_toc; | |
t@@ -447,9 +474,9 @@ __host__ void DEM::startTime() | |
// in the fine, uniform and homogenous grid. | |
if (PROFILING == 1) | |
startTimer(&kernel_tic); | |
- calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID, | |
- dev_gridParticleIndex, | |
- dev_x); | |
+ calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort.gridParticleCellID, | |
+ dev_sort.gridParticleIndex, | |
+ dev_k.x); | |
// Synchronization point | |
cudaThreadSynchronize(); | |
t@@ -461,19 +488,19 @@ __host__ void DEM::startTime() | |
// Sort particle (key, particle ID) pairs by hash key with Thrust radix so… | |
if (PROFILING == 1) | |
startTimer(&kernel_tic); | |
- thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID), | |
- thrust::device_ptr<uint>(dev_gridParticleCellID + p.np… | |
- thrust::device_ptr<uint>(dev_gridParticleIndex)); | |
+ thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort.gridParticleCellID), | |
+ thrust::device_ptr<uint>(dev_sort.gridParticleCellID +… | |
+ thrust::device_ptr<uint>(dev_sort.gridParticleIndex)); | |
cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implic… | |
if (PROFILING == 1) | |
stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort); | |
- checkForCudaErrors("Post calcParticleCellID"); | |
+ checkForCudaErrors("Post thrust::sort_by_key"); | |
// Zero cell array values by setting cellStart to its highest possible val… | |
// specified with pointer value 0xffffffff, which for a 32 bit unsigned int | |
// is 4294967295. | |
- cudaMemset(dev_cellStart, 0xffffffff, | |
+ cudaMemset(dev_sort.cellStart, 0xffffffff, | |
grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int)); | |
cudaThreadSynchronize(); | |
checkForCudaErrors("Post cudaMemset"); | |
t@@ -482,18 +509,15 @@ __host__ void DEM::startTime() | |
// coherent memory access. Save ordered configurations in new arrays (*_so… | |
if (PROFILING == 1) | |
startTimer(&kernel_tic); | |
- reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart, | |
- dev_cellEnd, | |
- dev_gridParticleCellID, | |
- dev_gridParticleIndex, | |
- dev_x, dev_vel, | |
- dev_angvel, dev_radius, | |
- //dev_bonds, | |
- dev_x_sorted, | |
- dev_vel_sorted, | |
- dev_angvel_sorted, | |
- dev_radius_sorted); | |
- //dev_bonds_sorted); | |
+ reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_sort.cellStart, | |
+ dev_sort.cellEnd, | |
+ dev_sort.gridParticleCellID… | |
+ dev_sort.gridParticleIndex, | |
+ dev_k.x, dev_k.vel, | |
+ dev_k.angvel, | |
+ dev_sort.x_sorted, | |
+ dev_sort.vel_sorted, | |
+ dev_sort.angvel_sorted); | |
// Synchronization point | |
cudaThreadSynchronize(); | |
t@@ -504,17 +528,16 @@ __host__ void DEM::startTime() | |
// The contact search in topology() is only necessary for determining | |
// the accumulated shear distance needed in the linear elastic | |
// and nonlinear contact force model | |
- if (params.shearmodel == 2 || params.shearmodel == 3) { | |
+ if (params.contactmodel == 2 || params.contactmodel == 3) { | |
// For each particle: Search contacts in neighbor cells | |
if (PROFILING == 1) | |
startTimer(&kernel_tic); | |
- topology<<<dimGrid, dimBlock>>>(dev_cellStart, | |
- dev_cellEnd, | |
- dev_gridParticleIndex, | |
- dev_x_sorted, | |
- dev_radius_sorted, | |
- dev_contacts, | |
- dev_distmod); | |
+ topology<<<dimGrid, dimBlock>>>(dev_sort.cellStart, | |
+ dev_sort.cellEnd, | |
+ dev_sort.gridParticleIndex, | |
+ dev_sort.x_sorted, | |
+ dev_k.contacts, | |
+ dev_k.distmod); | |
// Synchronization point | |
t@@ -528,21 +551,26 @@ __host__ void DEM::startTime() | |
// For each particle: Process collisions and compute resulting forces. | |
if (PROFILING == 1) | |
startTimer(&kernel_tic); | |
- interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex, | |
- dev_cellStart, | |
- dev_cellEnd, | |
- dev_x, dev_radius, | |
- dev_x_sorted, dev_radius_sorted, | |
- dev_vel_sorted, dev_angvel_sorted, | |
- dev_vel, dev_angvel, | |
- dev_force, dev_torque, | |
- dev_es_dot, dev_ev_dot, | |
- dev_es, dev_ev, dev_p, | |
- dev_w_nx, dev_w_mvfd, dev_w_force, | |
- //dev_bonds_sorted, | |
- dev_contacts, | |
- dev_distmod, | |
- dev_delta_t); | |
+ interact<<<dimGrid, dimBlock>>>(dev_sort.gridParticleIndex, | |
+ dev_sort.cellStart, | |
+ dev_sort.cellEnd, | |
+ dev_k.x, | |
+ dev_sort.x_sorted, | |
+ dev_sort.vel_sorted, | |
+ dev_sort.angvel_sorted, | |
+ dev_k.vel, | |
+ dev_k.angvel, | |
+ dev_k.force, | |
+ dev_k.torque, | |
+ dev_e.es_dot, | |
+ dev_e.ev_dot, | |
+ dev_e.es, dev_e.ev, dev_e.p, | |
+ dev_walls.nx, | |
+ dev_walls.mvfd, | |
+ dev_walls.force, | |
+ dev_k.contacts, | |
+ dev_k.distmod, | |
+ dev_k.delta_t); | |
// Synchronization point | |
t@@ -554,11 +582,16 @@ __host__ void DEM::startTime() | |
// Update particle kinematics | |
if (PROFILING == 1) | |
startTimer(&kernel_tic); | |
- integrate<<<dimGrid, dimBlock>>>(dev_x_sorted, dev_vel_sorted, | |
- dev_angvel_sorted, dev_radius_sorted, | |
- dev_x, dev_vel, dev_angvel, | |
- dev_force, dev_torque, dev_angpos, | |
- dev_gridParticleIndex); | |
+ integrate<<<dimGrid, dimBlock>>>(dev_sort.x_sorted, | |
+ dev_sort.vel_sorted, | |
+ dev_sort.angvel_sorted, | |
+ dev_k.x, | |
+ dev_k.vel, | |
+ dev_k.angvel, | |
+ dev_k.force, | |
+ dev_k.torque, | |
+ dev_k.angpos, | |
+ dev_sort.gridParticleIndex); | |
cudaThreadSynchronize(); | |
if (PROFILING == 1) | |
t@@ -567,7 +600,7 @@ __host__ void DEM::startTime() | |
// Summation of forces on wall | |
if (PROFILING == 1) | |
startTimer(&kernel_tic); | |
- summation<<<dimGrid, dimBlock>>>(dev_w_force, dev_w_force_partial); | |
+ summation<<<dimGrid, dimBlock>>>(dev_walls.force, dev_w_force_partial); | |
// Synchronization point | |
cudaThreadSynchronize(); | |
t@@ -578,10 +611,10 @@ __host__ void DEM::startTime() | |
// Update wall kinematics | |
if (PROFILING == 1) | |
startTimer(&kernel_tic); | |
- integrateWalls<<< 1, params.nw>>>(dev_w_nx, | |
- dev_w_mvfd, | |
- dev_w_force_partial, | |
- blocksPerGrid); | |
+ integrateWalls<<< 1, walls.nw>>>(dev_walls.nx, | |
+ dev_walls.mvfd, | |
+ dev_w_force_partial, | |
+ blocksPerGrid); | |
// Synchronization point | |
cudaThreadSynchronize(); | |
t@@ -591,12 +624,14 @@ __host__ void DEM::startTime() | |
// Update timers and counters | |
- time.current += time.dt; | |
- filetimeclock += time.dt; | |
+ time.current += time.dt; | |
+ filetimeclock += time.dt; | |
// Report time to console | |
- cout << "\r Current simulation time: " | |
- << time.current << " s. ";// << std::flush; | |
+ if (verbose == 1) { | |
+ cout << "\r Current simulation time: " | |
+ << time.current << " s. ";// << std::flush; | |
+ } | |
// Produce output binary if the time interval | |
t@@ -608,52 +643,16 @@ __host__ void DEM::startTime() | |
checkForCudaErrors("Beginning of file output section"); | |
//// Copy device data to host memory | |
- | |
- // Particle data | |
- cudaMemcpy(host_x, dev_x, memSizeF4, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(host_vel, dev_vel, memSizeF4, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(host_acc, dev_acc, memSizeF4, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(host_angvel, dev_angvel, memSizeF4, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(host_force, dev_force, memSizeF4, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(host_torque, dev_torque, memSizeF4, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(host_angpos, dev_angpos, memSizeF4, cudaMemcpyDeviceToHost); | |
- //cudaMemcpy(host_bonds, dev_bonds, sizeof(uint4) * p.np, cudaMemcpyDevi… | |
- cudaMemcpy(p.es_dot, dev_es_dot, memSizeF, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(p.ev_dot, dev_ev_dot, memSizeF, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(p.es, dev_es, memSizeF, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(p.ev, dev_ev, memSizeF, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(p.p, dev_p, memSizeF, cudaMemcpyDeviceToHost); | |
- | |
- // Wall data | |
- cudaMemcpy(host_w_nx, dev_w_nx, | |
- sizeof(Float)*params.nw*4, cudaMemcpyDeviceToHost); | |
- cudaMemcpy(host_w_mvfd, dev_w_mvfd, | |
- sizeof(Float)*params.nw*4, cudaMemcpyDeviceToHost); | |
- | |
- | |
- // Contact information | |
- if (CONTACTINFO == 1) { | |
- cudaMemcpy(host_contacts, dev_contacts, sizeof(unsigned int)*p.np*NC, … | |
- cudaMemcpy(host_delta_t, dev_delta_t, memSizeF4*NC, cudaMemcpyDeviceTo… | |
- cudaMemcpy(host_distmod, dev_distmod, memSizeF4*NC, cudaMemcpyDeviceTo… | |
- } | |
+ transferFromGlobalDeviceMemory(); | |
// Pause the CPU thread until all CUDA calls previously issued are compl… | |
cudaThreadSynchronize(); | |
// Write binary output file | |
time.step_count += 1; | |
- sprintf(file,"%s/output/%s.output%d.bin", cwd, inputbin, time.step_count… | |
- | |
- if (fwritebin(file, &p, host_x, host_vel, | |
- host_angvel, host_force, | |
- host_torque, host_angpos, | |
- host_bonds, | |
- &grid, &time, ¶ms, | |
- host_w_nx, host_w_mvfd) != 0) { | |
- cout << "\n Error during fwritebin() in main loop\n"; | |
- exit(EXIT_FAILURE); | |
- } | |
+ sprintf(file,"output/%s.output%d.bin", inputbin, time.step_count); | |
+ writebin(file); | |
+ | |
if (CONTACTINFO == 1) { | |
// Write contact information to stdout | |
t@@ -686,7 +685,7 @@ __host__ void DEM::startTime() | |
} | |
// Update status.dat at the interval of filetime | |
- sprintf(file,"%s/output/%s.status.dat", cwd, inputbin); | |
+ sprintf(file,"output/%s.status.dat", inputbin); | |
fp = fopen(file, "w"); | |
fprintf(fp,"%2.4e %2.4e %d\n", | |
time.current, | |
t@@ -747,48 +746,11 @@ __host__ void DEM::startTime() | |
// Free GPU device memory | |
- printf("\nLiberating device memory: "); | |
- | |
- // Particle arrays | |
- cudaFree(dev_x); | |
- cudaFree(dev_x_sorted); | |
- cudaFree(dev_vel); | |
- cudaFree(dev_vel_sorted); | |
- cudaFree(dev_angvel); | |
- cudaFree(dev_angvel_sorted); | |
- cudaFree(dev_acc); | |
- cudaFree(dev_angacc); | |
- cudaFree(dev_force); | |
- cudaFree(dev_torque); | |
- cudaFree(dev_angpos); | |
- cudaFree(dev_radius); | |
- cudaFree(dev_radius_sorted); | |
- cudaFree(dev_es_dot); | |
- cudaFree(dev_ev_dot); | |
- cudaFree(dev_es); | |
- cudaFree(dev_ev); | |
- cudaFree(dev_p); | |
- //cudaFree(dev_bonds); | |
- //cudaFree(dev_bonds_sorted); | |
- cudaFree(dev_contacts); | |
- cudaFree(dev_distmod); | |
- cudaFree(dev_delta_t); | |
- | |
- // Cell-related arrays | |
- cudaFree(dev_gridParticleIndex); | |
- cudaFree(dev_cellStart); | |
- cudaFree(dev_cellEnd); | |
- | |
- // Wall arrays | |
- cudaFree(dev_w_nx); | |
- cudaFree(dev_w_mvfd); | |
- cudaFree(dev_w_force); | |
- cudaFree(dev_w_force_partial); | |
+ freeGlobalDeviceMemory(); | |
// Contact info arrays | |
delete[] host_contacts; | |
delete[] host_distmod; | |
delete[] host_delta_t; | |
- printf("Done\n"); | |
} /* EOF */ | |
diff --git a/src/integration.cuh b/src/integration.cuh | |
t@@ -8,7 +8,7 @@ | |
// Second order integration scheme based on Taylor expansion of particle kinem… | |
// Kernel executed on device, and callable from host only. | |
__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Inp… | |
- Float4* dev_angvel_sorted, Float* dev_radius_sorted,… | |
+ Float4* dev_angvel_sorted, | |
Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, … | |
Float4* dev_force, Float4* dev_torque, Float4* dev_a… | |
unsigned int* dev_gridParticleIndex) // Input: Sorte… | |
t@@ -32,7 +32,7 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_… | |
Float4 x = dev_x_sorted[idx]; | |
Float4 vel = dev_vel_sorted[idx]; | |
Float4 angvel = dev_angvel_sorted[idx]; | |
- Float radius = dev_radius_sorted[idx]; | |
+ Float radius = x.w; | |
// Coherent read from constant memory to registers | |
Float dt = devC_params.dt; | |
diff --git a/src/sorting.cuh b/src/sorting.cuh | |
t@@ -43,15 +43,16 @@ __global__ void calcParticleCellID(unsigned int* dev_gridP… | |
// Reorder particle data into sorted order, and find the start and end particl… | |
// of each cell in the sorted hash array. | |
// Kernel executed on device, and callable from host only. | |
-__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_c… | |
+__global__ void reorderArrays(unsigned int* dev_cellStart, | |
+ unsigned int* dev_cellEnd, | |
unsigned int* dev_gridParticleCellID, | |
unsigned int* dev_gridParticleIndex, | |
- Float4* dev_x, Float4* dev_vel, | |
- Float4* dev_angvel, Float* dev_radius, | |
- //uint4* dev_bonds, | |
- Float4* dev_x_sorted, Float4* dev_vel_sorted, | |
- Float4* dev_angvel_sorted, Float* dev_radius_sor… | |
- //uint4* dev_bonds_sorted) | |
+ Float4* dev_x, | |
+ Float4* dev_vel, | |
+ Float4* dev_angvel, | |
+ Float4* dev_x_sorted, | |
+ Float4* dev_vel_sorted, | |
+ Float4* dev_angvel_sorted) | |
{ | |
// Create hash array in shared on-chip memory. The size of the array | |
t@@ -107,16 +108,12 @@ __global__ void reorderArrays(unsigned int* dev_cellStar… | |
Float4 x = dev_x[sortedIndex]; | |
Float4 vel = dev_vel[sortedIndex]; | |
Float4 angvel = dev_angvel[sortedIndex]; | |
- Float radius = dev_radius[sortedIndex]; | |
- //uint4 bonds = dev_bonds[sortedIndex]; | |
__syncthreads(); | |
// Write sorted data to global memory | |
dev_x_sorted[idx] = x; | |
dev_vel_sorted[idx] = vel; | |
dev_angvel_sorted[idx] = angvel; | |
- dev_radius_sorted[idx] = radius; | |
- //dev_bonds_sorted[idx] = bonds; | |
} | |
} // End of reorderArrays(...) | |
diff --git a/src/sphere.h b/src/sphere.h | |
t@@ -20,31 +20,29 @@ class DEM { | |
unsigned int np; | |
// Structure containing individual particle kinematics | |
- Kinematics k; | |
- Kinematics dev_k; | |
+ Kinematics k; // host | |
+ Kinematics dev_k; // device | |
// Structure containing energy values | |
- Energies e; | |
- Energies dev_e; | |
+ Energies e; // host | |
+ Energies dev_e; // device | |
// Structure of global parameters | |
- Params params; | |
- Params dev_params; | |
+ Params params; // host | |
// Structure containing spatial parameters | |
- Grid grid; | |
- Grid dev_grid; | |
+ Grid grid; // host | |
+ | |
+ // Structure containing sorting arrays | |
+ Sorting dev_sort; // device | |
// Structure of temporal parameters | |
- Time time; | |
- Time dev_time; | |
+ Time time; // host | |
+ Time *dev_time; // device | |
// Structure of wall parameters | |
- Walls walls; | |
- Walls dev_walls; | |
- | |
- // Wall force arrays | |
- Float* dev_w_force; | |
+ Walls walls; // host | |
+ Walls dev_walls; // device | |
// GPU initialization, must be called before startTime() | |
void initializeGPU(void); | |
t@@ -58,6 +56,9 @@ class DEM { | |
// Allocate global device memory to hold data | |
void allocateGlobalDeviceMemory(void); | |
+ // Free dynamically allocated global device memory | |
+ void freeGlobalDeviceMemory(void); | |
+ | |
// Copy non-constant data to global GPU memory | |
void transferToGlobalDeviceMemory(void); | |