GopherProxy

	tFurther work on implementing OO - sphere - GPU-based 3D discrete element metho…
	git clone git://src.adamsgaard.dk/sphere
	Log
	Files
	Refs
	LICENSE
	---
	commit 287f0082450f6709173703f7a43f708676d4f29e
	parent bd8883f07cd1e54218c7cde17e6376d52918727c
	Author: Anders Damsgaard Christensen <[email protected]>
	Date: Fri, 26 Oct 2012 23:05:19 +0200

	Further work on implementing OO

	Diffstat:
	M src/constants.cuh \| 1 +
	M src/contactsearch.cuh \| 20 +++++++++-----------
	M src/datatypes.h \| 31 ++++++++++++++++++++++++-----…
	M src/device.cu \| 512 ++++++++++++++---------------…
	M src/integration.cuh \| 4 ++--
	M src/sorting.cuh \| 19 ++++++++-----------
	M src/sphere.h \| 31 ++++++++++++++++-------------…

	7 files changed, 297 insertions(+), 321 deletions(-)
	---
	diff --git a/src/constants.cuh b/src/constants.cuh
	t@@ -10,6 +10,7 @@
	__constant__ unsigned int devC_nd; // Number of dimensions
	__constant__ unsigned int devC_np; // Number of particles
	__constant__ int devC_nc; // Max. number of contacts a particle can…
	+__constant__ Float devC_dt; // Computational time step length

	// Device constant memory structures
	__constant__ Params devC_params;
	diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
	t@@ -87,7 +87,6 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
	Float* es_dot, Float* ev_dot,
	Float* p,
	Float4* dev_x_sorted,
	- Float* dev_radius_sorted,
	Float4* dev_vel_sorted,
	Float4* dev_angvel_sorted,
	unsigned int* dev_cellStart,
	t@@ -125,7 +124,7 @@ __device__ void findAndProcessContactsInCell(int3 targetCe…

	// Fetch position and velocity of particle B.
	Float4 x_b = dev_x_sorted[idx_b];
	- Float radius_b = dev_radius_sorted[idx_b];
	+ Float radius_b = x_b.w;
	Float kappa = devC_params.kappa;

	// Distance between particle centers (Float4 -> Float3)
	t@@ -183,7 +182,6 @@ __device__ void findContactsInCell(int3 targetCell,
	unsigned int idx_a,
	Float4 x_a, Float radius_a,
	Float4* dev_x_sorted,
	- Float* dev_radius_sorted,
	unsigned int* dev_cellStart,
	unsigned int* dev_cellEnd,
	unsigned int* dev_gridParticleIndex,
	t@@ -226,7 +224,7 @@ __device__ void findContactsInCell(int3 targetCell,

	// Fetch position and radius of particle B.
	Float4 x_b = dev_x_sorted[idx_b];
	- Float radius_b = dev_radius_sorted[idx_b];
	+ Float radius_b = x_b.w;

	// Read the original index of particle B
	unsigned int idx_b_orig = dev_gridParticleIndex[idx_b];
	t@@ -316,7 +314,7 @@ __device__ void findContactsInCell(int3 targetCell,
	__global__ void topology(unsigned int* dev_cellStart,
	unsigned int* dev_cellEnd, // Input: Particles in…
	unsigned int* dev_gridParticleIndex, // Input: Unsort…
	- Float4* dev_x_sorted, Float* dev_radius_sorted,
	+ Float4* dev_x_sorted,
	unsigned int* dev_contacts,
	Float4* dev_distmod)
	{
	t@@ -325,7 +323,7 @@ __global__ void topology(unsigned int* dev_cellStart,
	if (idx_a < devC_params.np) {
	// Fetch particle data in global read
	Float4 x_a = dev_x_sorted[idx_a];
	- Float radius_a = dev_radius_sorted[idx_a];
	+ Float radius_a = x_a.w;

	// Count the number of contacts in this time step
	int nc = 0;
	t@@ -346,7 +344,7 @@ __global__ void topology(unsigned int* dev_cellStart,
	for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
	targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
	findContactsInCell(targetPos, idx_a, x_a, radius_a,
	- dev_x_sorted, dev_radius_sorted,
	+ dev_x_sorted,
	dev_cellStart, dev_cellEnd,
	dev_gridParticleIndex,
	&nc, dev_contacts, dev_distmod);
	t@@ -371,8 +369,8 @@ __global__ void topology(unsigned int* dev_cellStart,
	__global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsort…
	unsigned int* dev_cellStart,
	unsigned int* dev_cellEnd,
	- Float4* dev_x, Float* dev_radius,
	- Float4* dev_x_sorted, Float* dev_radius_sorted,
	+ Float4* dev_x,
	+ Float4* dev_x_sorted,
	Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
	Float4* dev_vel, Float4* dev_angvel,
	Float4* dev_force, Float4* dev_torque,
	t@@ -395,7 +393,7 @@ __global__ void interact(unsigned int* dev_gridParticleInd…
	// Fetch particle data in global read
	unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
	Float4 x_a = dev_x_sorted[idx_a];
	- Float radius_a = dev_radius_sorted[idx_a];
	+ Float radius_a = x_a.w;

	// Fetch wall data in global read
	Float4 w_up_nx = dev_w_nx[0];
	t@@ -521,7 +519,7 @@ __global__ void interact(unsigned int* dev_gridParticleInd…
	targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
	findAndProcessContactsInCell(targetPos, idx_a, x_a, radius_a,
	&F, &T, &es_dot, &ev_dot, &p,
	- dev_x_sorted, dev_radius_sorted,
	+ dev_x_sorted,
	dev_vel_sorted, dev_angvel_sorted,
	dev_cellStart, dev_cellEnd,
	dev_w_nx, dev_w_mvfd);
	diff --git a/src/datatypes.h b/src/datatypes.h
	t@@ -15,13 +15,18 @@

	// Structure containing kinematic particle values
	struct Kinematics {
	- Float4 *x; // Positions + radii (w)
	- Float2 *xysum; // Horizontal distance traveled
	- Float4 *vel; // Translational velocities + fixvels (w)
	- Float4 *force; // Sums of forces
	- Float4 *angpos; // Angular positions
	- Float4 *angvel; // Angular velocities
	- Float4 *torque; // Sums of torques
	+ Float4 *x; // Positions + radii (w)
	+ Float2 *xysum; // Horizontal distance traveled
	+ Float4 *vel; // Translational velocities + fixvels (w)
	+ Float4 *acc; // Translational accelerations
	+ Float4 *force; // Sums of forces
	+ Float4 *angpos; // Angular positions
	+ Float4 *angvel; // Angular velocities
	+ Float4 *angacc; // Angular accelerations
	+ Float4 *torque; // Sums of torques
	+ unsigned int *contacts; // List of contacts per particle
	+ Float4 *distmod; // Distance modifiers for contacts across periodic…
	+ Float4 *delta_t; // Accumulated shear distance of contacts
	};

	// Structure containing individual physical particle parameters
	t@@ -42,6 +47,17 @@ struct Grid {
	int periodic; // Behavior of boundaries at 1st and 2nd world …
	};

	+struct Sorting {
	+ Float4 *x_sorted; // Positions + radii (w) (sorted)
	+ Float4 *vel_sorted; // Translational velocities + fixvels (w) (sort…
	+ Float4 *angvel_sorted; // Angular velocities (sorted)
	+ unsigned int *gridParticleCellID; // Hash key (cell index) from position in …
	+ unsigned int *gridParticleIndex; // Original indexes of particles
	+ unsigned int *cellStart; // First index of sorted idx'es in cells
	+ unsigned int *cellEnd; // Last index of sorted idx'es in cells
	+};
	+
	+
	// Structure containing time parameters
	struct Time {
	Float dt; // Computational time step length
	t@@ -76,6 +92,7 @@ struct Walls {
	int wmode[MAXWALLS]; // Wall modes
	Float4* nx; // Wall normal and position
	Float4* mvfd; // Wall mass, velocity, force and dev. stress
	+ Float* force; // Resulting forces on walls per particle
	Float gamma_wn; // Wall normal viscosity
	Float gamma_wt; // Wall tangential viscosity
	Float gamma_wr; // Wall rolling viscosity
	diff --git a/src/device.cu b/src/device.cu
	t@@ -197,6 +197,7 @@ __host__ void DEM::transferToConstantDeviceMemory()
	cudaMemcpyToSymbol("devC_nd", &nd, sizeof(nd));
	cudaMemcpyToSymbol("devC_np", &np, sizeof(np));
	cudaMemcpyToSymbol("devC_nc", &NC, sizeof(int));
	+ cudaMemcpyToSymbol("devC_dt", &time.dt, sizeof(Float));
	cudaMemcpyToSymbol(devC_grid, grid, sizeof(Grid));
	cudaMemcpyToSymbol(devC_params, params, sizeof(Params));

	t@@ -205,149 +206,178 @@ __host__ void DEM::transferToConstantDeviceMemory()
	if (verbose == 1)
	cout << "Done\n";

	- checkConstantMemory();
	+ //checkConstantMemory();
	}


	-//extern "C"
	-__host__ void DEM::startTime()
	+// Allocate device memory for particle variables,
	+// tied to previously declared pointers in structures
	+__host__ void DEM::allocateGlobalDeviceMemory(void)
	{
	+ // Particle memory size
	+ unsigned int memSizeF = sizeof(Float) * np;
	+ unsigned int memSizeF4 = sizeof(Float4) * np;

	- using std::cout; // Namespace directive
	+ if (verbose == 1)
	+ std::cout << " Allocating device memory: ";

	- // Copy data to constant global device memory
	- transferToConstantDeviceMemory();
	+ // Particle arrays
	+ cudaMalloc((void**)&dev_k.x, memSizeF4);
	+ cudaMalloc((void**)&dev_sort.x_sorted, memSizeF4);
	+ cudaMalloc((void**)&dev_k.vel, memSizeF4);
	+ cudaMalloc((void**)&dev_sort.vel_sorted, memSizeF4);
	+ cudaMalloc((void**)&dev_k.angvel, memSizeF4);
	+ cudaMalloc((void**)&dev_sort.angvel_sorted, memSizeF4);
	+ cudaMalloc((void**)&dev_k.acc, memSizeF4);
	+ k.acc = new Float4[np];
	+ cudaMalloc((void**)&dev_k.angacc, memSizeF4);
	+ k.angacc = new Float4[np];
	+ cudaMalloc((void**)&dev_k.force, memSizeF4);
	+ cudaMalloc((void**)&dev_k.torque, memSizeF4);
	+ cudaMalloc((void**)&dev_k.angpos, memSizeF4);
	+ cudaMalloc((void**)&dev_e.es_dot, memSizeF);
	+ cudaMalloc((void**)&dev_e.ev_dot, memSizeF);
	+ cudaMalloc((void**)&dev_e.es, memSizeF);
	+ cudaMalloc((void**)&dev_e.ev, memSizeF);
	+ cudaMalloc((void**)&dev_e.p, memSizeF);

	- Float* dev_w_force; // Resulting force on wall per particle
	- Float* dev_w_force_partial; // Partial sum from block of threads
	-
	- // Memory for sorted particle data
	- Float4* dev_x_sorted;
	- Float4* dev_vel_sorted;
	- Float4* dev_angvel_sorted;
	- Float* dev_radius_sorted;
	-
	- // Grid-particle array pointers
	- unsigned int* dev_gridParticleCellID;
	- unsigned int* dev_gridParticleIndex;
	- unsigned int* dev_cellStart;
	- unsigned int* dev_cellEnd;
	-
	- // Particle contact bookkeeping
	- unsigned int* dev_contacts;
	- unsigned int* host_contacts = new unsigned int[p.np*NC];
	- // Particle pair distance correction across periodic boundaries
	- Float4* dev_distmod;
	- Float4* host_distmod = new Float4[p.np*NC];
	- // x,y,z contains the interparticle vector, corrected if contact
	- // is across a periodic boundary.
	- Float4* dev_delta_t; // Accumulated shear distance of contact
	- Float4* host_delta_t = new Float4[p.np*NC];
	+ // Cell-related arrays
	+ cudaMalloc((void*)&dev_sort.gridParticleCellID, sizeof(unsigned int)np);
	+ cudaMalloc((void*)&dev_sort.gridParticleIndex, sizeof(unsigned int)np);
	+ cudaMalloc((void*)&dev_sort.cellStart, sizeof(unsigned int)grid.num[0]*gri…
	+ cudaMalloc((void*)&dev_sort.cellEnd, sizeof(unsigned int)grid.num[0]*grid.…

	- // Particle memory size
	- unsigned int memSizeF = sizeof(Float) * p.np;
	- unsigned int memSizeF4 = sizeof(Float4) * p.np;
	+ // Particle contact bookkeeping arrays
	+ cudaMalloc((void*)&dev_k.contacts, sizeof(unsigned int)np*NC); // Max NC c…
	+ cudaMalloc((void*)&dev_k.distmod, sizeof(Float4)np*NC);
	+ cudaMalloc((void*)&dev_k.delta_t, sizeof(Float4)np*NC);
	+
	+ // Host contact bookkeeping arrays
	+ k.contacts = new unsigned int[np*NC];
	+ // Initialize contacts lists to np
	+ for (unsigned int i=0; i<(np*NC); ++i)
	+ k.contacts[i] = np;
	+ k.distmod = new Float4[np*NC];
	+ k.delta_t = new Float4[np*NC];

	- // Allocate device memory for particle variables,
	- // tied to previously declared pointers in structures
	- allocateGlobalDeviceMemory();
	- cout << " Allocating device memory: ";
	+ // Wall arrays
	+ cudaMalloc((void*)&dev_walls.nx, sizeof(Float4)walls.nw);
	+ cudaMalloc((void*)&dev_walls.mvfd, sizeof(Float4)walls.nw);
	+ cudaMalloc((void*)&dev_walls.force, sizeof(Float)walls.nw*np);
	+ // dev_w_force_partial allocated later

	+ checkForCudaErrors("End of allocateGlobalDeviceMemory");
	+ if (verbose == 1)
	+ std::cout << "Done\n";
	+}
	+
	+__host__ void DEM::freeGlobalDeviceMemory()
	+{
	+ if (verbose == 1)
	+ printf("\nLiberating device memory: ");
	// Particle arrays
	- cudaMalloc((void**)&dev_x, memSizeF4);
	- cudaMalloc((void**)&dev_x_sorted, memSizeF4);
	- cudaMalloc((void**)&dev_vel, memSizeF4);
	- cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
	- cudaMalloc((void**)&dev_angvel, memSizeF4);
	- cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
	- cudaMalloc((void**)&dev_acc, memSizeF4);
	- cudaMalloc((void**)&dev_angacc, memSizeF4);
	- cudaMalloc((void**)&dev_force, memSizeF4);
	- cudaMalloc((void**)&dev_torque, memSizeF4);
	- cudaMalloc((void**)&dev_angpos, memSizeF4);
	- cudaMalloc((void**)&dev_radius, memSizeF);
	- cudaMalloc((void**)&dev_radius_sorted, memSizeF);
	- cudaMalloc((void**)&dev_es_dot, memSizeF);
	- cudaMalloc((void**)&dev_ev_dot, memSizeF);
	- cudaMalloc((void**)&dev_es, memSizeF);
	- cudaMalloc((void**)&dev_ev, memSizeF);
	- cudaMalloc((void**)&dev_p, memSizeF);
	- //cudaMalloc((void*)&dev_bonds, sizeof(uint4) p.np);
	- //cudaMalloc((void*)&dev_bonds_sorted, sizeof(uint4) p.np);
	+ cudaFree(dev_k.x);
	+ cudaFree(dev_sort.x_sorted);
	+ cudaFree(dev_k.vel);
	+ cudaFree(dev_sort.vel_sorted);
	+ cudaFree(dev_k.angvel);
	+ cudaFree(dev_sort.angvel_sorted);
	+ cudaFree(dev_k.acc);
	+ cudaFree(dev_k.angacc);
	+ cudaFree(dev_k.force);
	+ cudaFree(dev_k.torque);
	+ cudaFree(dev_k.angpos);
	+ cudaFree(dev_e.es_dot);
	+ cudaFree(dev_e.ev_dot);
	+ cudaFree(dev_e.es);
	+ cudaFree(dev_e.ev);
	+ cudaFree(dev_e.p);
	+ cudaFree(dev_k.contacts);
	+ cudaFree(dev_k.distmod);
	+ cudaFree(dev_k.delta_t);

	// Cell-related arrays
	- cudaMalloc((void*)&dev_gridParticleCellID, sizeof(unsigned int)p.np);
	- cudaMalloc((void*)&dev_gridParticleIndex, sizeof(unsigned int)p.np);
	- cudaMalloc((void*)&dev_cellStart, sizeof(unsigned int)grid.num[0]*grid.num…
	- cudaMalloc((void*)&dev_cellEnd, sizeof(unsigned int)grid.num[0]*grid.num[1…
	-
	- // Particle contact bookkeeping arrays
	- cudaMalloc((void*)&dev_contacts, sizeof(unsigned int)p.np*NC); // Max NC c…
	- cudaMalloc((void*)&dev_distmod, sizeof(Float4)p.np*NC);
	- cudaMalloc((void*)&dev_delta_t, sizeof(Float4)p.np*NC);
	+ cudaFree(dev_sort.gridParticleIndex);
	+ cudaFree(dev_sort.cellStart);
	+ cudaFree(dev_sort.cellEnd);

	// Wall arrays
	- cudaMalloc((void*)&dev_w_nx, sizeof(Float)params.nw*4);
	- cudaMalloc((void*)&dev_w_mvfd, sizeof(Float)params.nw*4);
	- cudaMalloc((void*)&dev_w_force, sizeof(Float)params.nw*p.np);
	+ cudaFree(dev_walls.nx);
	+ cudaFree(dev_walls.mvfd);
	+ cudaFree(dev_walls.force);
	+ cudaFree(dev_w_force_partial);
	+
	+ if (verbose == 1)
	+ printf("Done\n");
	+}

	- checkForCudaErrors("Post device memory allocation");
	- cout << "Done\n";

	- // Transfer data from host to gpu device memory
	+__host__ void DEM::transferToGlobalDeviceMemory()
	+{
	+ if (verbose == 1)
	+ std::cout << " Transfering data to the device: ";
	+
	+ // Copy structure data from host to global device memory
	+ /*cudaMemcpy(dev_k, k, sizeof(Kinematics), cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_e, e, sizeof(Energies), cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_time, time, sizeof(Time), cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_walls, walls, sizeof(Walls), cudaMemcpyHostToDevice);*/
	+ cudaMemcpy(dev_k, k, sizeof(k), cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_e, e, sizeof(e), cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_time, time, sizeof(time), cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_walls, walls, sizeof(walls), cudaMemcpyHostToDevice);
	+
	+ checkForCudaErrors("End of transferToGlobalDeviceMemory");
	+ if (verbose == 1)
	+ std::cout << "Done\n";
	+}
	+
	+__host__ void DEM::transferToGlobalDeviceMemory()
	+{
	cout << " Transfering data to the device: ";

	- // Particle data
	- cudaMemcpy(dev_x, host_x, memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_vel, host_vel, memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_acc, host_acc, memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_angvel, host_angvel, memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_angacc, host_angacc, memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_force, host_force, memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_torque, host_torque, memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_angpos, host_angpos, memSizeF4, cudaMemcpyHostToDevice);
	- //cudaMemcpy(dev_bonds, host_bonds, sizeof(uint4) * p.np, cudaMemcpyHostToDe…
	- cudaMemcpy(dev_radius, p.radius, memSizeF, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_es_dot, p.es_dot, memSizeF, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_ev_dot, p.ev_dot, memSizeF, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_es, p.es, memSizeF, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_ev, p.ev, memSizeF, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_p, p.p, memSizeF, cudaMemcpyHostToDevice);
	-
	- // Wall data (wall mass and number in constant memory)
	- cudaMemcpy(dev_w_nx, host_w_nx, sizeof(Float)params.nw4, cudaMemcpyHostToD…
	- cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(Float)params.nw4, cudaMemcpyHos…
	-
	- // Initialize contacts lists to p.np
	- unsigned int* npu = new unsigned int[p.np*NC];
	- for (unsigned int i=0; i<(p.np*NC); ++i)
	- npu[i] = p.np;
	- cudaMemcpy(dev_contacts, npu, sizeof(unsigned int)p.npNC, cudaMemcpyHostTo…
	- delete[] npu;
	-
	- // Create array of 0.0 values on the host and transfer these to the distance
	- // modifier and shear displacement arrays
	- Float4* zerosF4 = new Float4[p.np*NC];
	- for (unsigned int i=0; i<(p.np*NC); ++i)
	- zerosF4[i] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
	- cudaMemcpy(dev_distmod, zerosF4, sizeof(Float4)p.npNC, cudaMemcpyHostToDev…
	- cudaMemcpy(dev_delta_t, zerosF4, sizeof(Float4)p.npNC, cudaMemcpyHostToDev…
	- delete[] zerosF4;
	-
	- checkForCudaErrors("Post memcopy");
	- cout << "Done\n";
	+ // Copy structure data from host to global device memory
	+ cudaMemcpy(k, dev_k, sizeof(k), cudaMemcpyDeviceToHost);
	+ cudaMemcpy(e, dev_e, sizeof(e), cudaMemcpyDeviceToHost);
	+ cudaMemcpy(time, dev_time, sizeof(time), cudaMemcpyDeviceToHost);
	+ cudaMemcpy(walls, dev_walls, sizeof(walls), cudaMemcpyDeviceToHost);
	+
	+ checkForCudaErrors("End of transferFromGlobalDeviceMemory");
	+ if (verbose == 1)
	+ std::cout << "Done\n";
	+}
	+
	+
	+// Iterate through time by explicit time integration
	+__host__ void DEM::startTime()
	+{
	+
	+ using std::cout; // Namespace directive
	+ char file[200]; // Output filename
	+ FILE *fp;
	+
	+ // Copy data to constant global device memory
	+ transferToConstantDeviceMemory();
	+
	+
	+ // Particle memory size
	+ unsigned int memSizeF = sizeof(Float) * np;
	+ unsigned int memSizeF4 = sizeof(Float4) * np;
	+
	+ // Allocate device memory for particle variables,
	+ // tied to previously declared pointers in structures
	+ allocateGlobalDeviceMemory();
	+
	+ // Transfer data from host to gpu device memory
	+ transferToGlobalDeviceMemory();

	// Synchronization point
	cudaThreadSynchronize();
	- checkForCudaErrors("Start of mainLoop()");
	+ checkForCudaErrors("Start of startTime()");

	// Model world variables
	float tic, toc, filetimeclock, time_spent, dev_time_spent;

	- // File output
	- FILE* fp;
	- char file[1000]; // Complete path+filename variable
	-
	// Start CPU clock
	tic = clock();

	t@@ -360,22 +390,25 @@ __host__ void DEM::startTime()
	// Shared memory per block
	unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);

	+ Float* dev_w_force_partial;
	cudaMalloc((void*)&dev_w_force_partial, sizeof(Float)dimGrid.x);

	// Report to stdout
	- cout << "\n Device memory allocation and transfer complete.\n"
	- << " - Blocks per grid: "
	- << dimGrid.x << "" << dimGrid.y << "" << dimGrid.z << "\n"
	- << " - Threads per block: "
	- << dimBlock.x << "" << dimBlock.y << "" << dimBlock.z << "\n"
	- << " - Shared memory required per block: " << smemSize << " bytes\n";
	+ if (verbose == 1) {
	+ cout << "\n Device memory allocation and transfer complete.\n"
	+ << " - Blocks per grid: "
	+ << dimGrid.x << "" << dimGrid.y << "" << dimGrid.z << "\n"
	+ << " - Threads per block: "
	+ << dimBlock.x << "" << dimBlock.y << "" << dimBlock.z << "\n"
	+ << " - Shared memory required per block: " << smemSize << " bytes\n";
	+ }

	// Initialize counter variable values
	filetimeclock = 0.0;
	long iter = 0;

	// Create first status.dat
	- sprintf(file,"%s/output/%s.status.dat", cwd, inputbin);
	+ sprintf(file,"output/%s.status.dat", inputbin);
	fp = fopen(file, "w");
	fprintf(fp,"%2.4e %2.4e %d\n",
	time.current,
	t@@ -384,23 +417,17 @@ __host__ void DEM::startTime()
	fclose(fp);

	// Write first output data file: output0.bin, thus testing writing of bin fi…
	- sprintf(file,"%s/output/%s.output0.bin", cwd, inputbin);
	- if (fwritebin(file, &p, host_x, host_vel,
	- host_angvel, host_force,
	- host_torque, host_angpos,
	- host_bonds,
	- &grid, &time, &params,
	- host_w_nx, host_w_mvfd) != 0) {
	- std::cerr << "\n Problem during fwritebin \n";
	- exit(EXIT_FAILURE);
	+ sprintf(file,"output/%s.output0.bin", inputbin);
	+ writebin(file);
	+
	+ if (verbose == 1) {
	+ cout << "\n Entering the main calculation time loop...\n\n"
	+ << " IMPORTANT: Do not close this terminal, doing so will \n"
	+ << " terminate this SPHERE process. Follow the \n"
	+ << " progress by executing:\n"
	+ << " $ ./sphere_status " << inputbin << "\n\n";
	}

	- cout << "\n Entering the main calculation time loop...\n\n"
	- << " IMPORTANT: Do not close this terminal, doing so will \n"
	- << " terminate this SPHERE process. Follow the \n"
	- << " progress by executing:\n"
	- << " $ ./sphere_status " << inputbin << "\n\n";
	-

	// Start GPU clock
	cudaEvent_t dev_tic, dev_toc;
	t@@ -447,9 +474,9 @@ __host__ void DEM::startTime()
	// in the fine, uniform and homogenous grid.
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
	- dev_gridParticleIndex,
	- dev_x);
	+ calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort.gridParticleCellID,
	+ dev_sort.gridParticleIndex,
	+ dev_k.x);

	// Synchronization point
	cudaThreadSynchronize();
	t@@ -461,19 +488,19 @@ __host__ void DEM::startTime()
	// Sort particle (key, particle ID) pairs by hash key with Thrust radix so…
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID),
	- thrust::device_ptr<uint>(dev_gridParticleCellID + p.np…
	- thrust::device_ptr<uint>(dev_gridParticleIndex));
	+ thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort.gridParticleCellID),
	+ thrust::device_ptr<uint>(dev_sort.gridParticleCellID +…
	+ thrust::device_ptr<uint>(dev_sort.gridParticleIndex));
	cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implic…
	if (PROFILING == 1)
	stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
	- checkForCudaErrors("Post calcParticleCellID");
	+ checkForCudaErrors("Post thrust::sort_by_key");


	// Zero cell array values by setting cellStart to its highest possible val…
	// specified with pointer value 0xffffffff, which for a 32 bit unsigned int
	// is 4294967295.
	- cudaMemset(dev_cellStart, 0xffffffff,
	+ cudaMemset(dev_sort.cellStart, 0xffffffff,
	grid.num[0]grid.num[1]grid.num[2]*sizeof(unsigned int));
	cudaThreadSynchronize();
	checkForCudaErrors("Post cudaMemset");
	t@@ -482,18 +509,15 @@ __host__ void DEM::startTime()
	// coherent memory access. Save ordered configurations in new arrays (*_so…
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
	- dev_cellEnd,
	- dev_gridParticleCellID,
	- dev_gridParticleIndex,
	- dev_x, dev_vel,
	- dev_angvel, dev_radius,
	- //dev_bonds,
	- dev_x_sorted,
	- dev_vel_sorted,
	- dev_angvel_sorted,
	- dev_radius_sorted);
	- //dev_bonds_sorted);
	+ reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_sort.cellStart,
	+ dev_sort.cellEnd,
	+ dev_sort.gridParticleCellID…
	+ dev_sort.gridParticleIndex,
	+ dev_k.x, dev_k.vel,
	+ dev_k.angvel,
	+ dev_sort.x_sorted,
	+ dev_sort.vel_sorted,
	+ dev_sort.angvel_sorted);

	// Synchronization point
	cudaThreadSynchronize();
	t@@ -504,17 +528,16 @@ __host__ void DEM::startTime()
	// The contact search in topology() is only necessary for determining
	// the accumulated shear distance needed in the linear elastic
	// and nonlinear contact force model
	- if (params.shearmodel == 2 \|\| params.shearmodel == 3) {
	+ if (params.contactmodel == 2 \|\| params.contactmodel == 3) {
	// For each particle: Search contacts in neighbor cells
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- topology<<<dimGrid, dimBlock>>>(dev_cellStart,
	- dev_cellEnd,
	- dev_gridParticleIndex,
	- dev_x_sorted,
	- dev_radius_sorted,
	- dev_contacts,
	- dev_distmod);
	+ topology<<<dimGrid, dimBlock>>>(dev_sort.cellStart,
	+ dev_sort.cellEnd,
	+ dev_sort.gridParticleIndex,
	+ dev_sort.x_sorted,
	+ dev_k.contacts,
	+ dev_k.distmod);


	// Synchronization point
	t@@ -528,21 +551,26 @@ __host__ void DEM::startTime()
	// For each particle: Process collisions and compute resulting forces.
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
	- dev_cellStart,
	- dev_cellEnd,
	- dev_x, dev_radius,
	- dev_x_sorted, dev_radius_sorted,
	- dev_vel_sorted, dev_angvel_sorted,
	- dev_vel, dev_angvel,
	- dev_force, dev_torque,
	- dev_es_dot, dev_ev_dot,
	- dev_es, dev_ev, dev_p,
	- dev_w_nx, dev_w_mvfd, dev_w_force,
	- //dev_bonds_sorted,
	- dev_contacts,
	- dev_distmod,
	- dev_delta_t);
	+ interact<<<dimGrid, dimBlock>>>(dev_sort.gridParticleIndex,
	+ dev_sort.cellStart,
	+ dev_sort.cellEnd,
	+ dev_k.x,
	+ dev_sort.x_sorted,
	+ dev_sort.vel_sorted,
	+ dev_sort.angvel_sorted,
	+ dev_k.vel,
	+ dev_k.angvel,
	+ dev_k.force,
	+ dev_k.torque,
	+ dev_e.es_dot,
	+ dev_e.ev_dot,
	+ dev_e.es, dev_e.ev, dev_e.p,
	+ dev_walls.nx,
	+ dev_walls.mvfd,
	+ dev_walls.force,
	+ dev_k.contacts,
	+ dev_k.distmod,
	+ dev_k.delta_t);


	// Synchronization point
	t@@ -554,11 +582,16 @@ __host__ void DEM::startTime()
	// Update particle kinematics
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- integrate<<<dimGrid, dimBlock>>>(dev_x_sorted, dev_vel_sorted,
	- dev_angvel_sorted, dev_radius_sorted,
	- dev_x, dev_vel, dev_angvel,
	- dev_force, dev_torque, dev_angpos,
	- dev_gridParticleIndex);
	+ integrate<<<dimGrid, dimBlock>>>(dev_sort.x_sorted,
	+ dev_sort.vel_sorted,
	+ dev_sort.angvel_sorted,
	+ dev_k.x,
	+ dev_k.vel,
	+ dev_k.angvel,
	+ dev_k.force,
	+ dev_k.torque,
	+ dev_k.angpos,
	+ dev_sort.gridParticleIndex);

	cudaThreadSynchronize();
	if (PROFILING == 1)
	t@@ -567,7 +600,7 @@ __host__ void DEM::startTime()
	// Summation of forces on wall
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- summation<<<dimGrid, dimBlock>>>(dev_w_force, dev_w_force_partial);
	+ summation<<<dimGrid, dimBlock>>>(dev_walls.force, dev_w_force_partial);

	// Synchronization point
	cudaThreadSynchronize();
	t@@ -578,10 +611,10 @@ __host__ void DEM::startTime()
	// Update wall kinematics
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- integrateWalls<<< 1, params.nw>>>(dev_w_nx,
	- dev_w_mvfd,
	- dev_w_force_partial,
	- blocksPerGrid);
	+ integrateWalls<<< 1, walls.nw>>>(dev_walls.nx,
	+ dev_walls.mvfd,
	+ dev_w_force_partial,
	+ blocksPerGrid);

	// Synchronization point
	cudaThreadSynchronize();
	t@@ -591,12 +624,14 @@ __host__ void DEM::startTime()


	// Update timers and counters
	- time.current += time.dt;
	- filetimeclock += time.dt;
	+ time.current += time.dt;
	+ filetimeclock += time.dt;

	// Report time to console
	- cout << "\r Current simulation time: "
	- << time.current << " s. ";// << std::flush;
	+ if (verbose == 1) {
	+ cout << "\r Current simulation time: "
	+ << time.current << " s. ";// << std::flush;
	+ }


	// Produce output binary if the time interval
	t@@ -608,52 +643,16 @@ __host__ void DEM::startTime()
	checkForCudaErrors("Beginning of file output section");

	//// Copy device data to host memory
	-
	- // Particle data
	- cudaMemcpy(host_x, dev_x, memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_vel, dev_vel, memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_acc, dev_acc, memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_angvel, dev_angvel, memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_force, dev_force, memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_torque, dev_torque, memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_angpos, dev_angpos, memSizeF4, cudaMemcpyDeviceToHost);
	- //cudaMemcpy(host_bonds, dev_bonds, sizeof(uint4) * p.np, cudaMemcpyDevi…
	- cudaMemcpy(p.es_dot, dev_es_dot, memSizeF, cudaMemcpyDeviceToHost);
	- cudaMemcpy(p.ev_dot, dev_ev_dot, memSizeF, cudaMemcpyDeviceToHost);
	- cudaMemcpy(p.es, dev_es, memSizeF, cudaMemcpyDeviceToHost);
	- cudaMemcpy(p.ev, dev_ev, memSizeF, cudaMemcpyDeviceToHost);
	- cudaMemcpy(p.p, dev_p, memSizeF, cudaMemcpyDeviceToHost);
	-
	- // Wall data
	- cudaMemcpy(host_w_nx, dev_w_nx,
	- sizeof(Float)params.nw4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_w_mvfd, dev_w_mvfd,
	- sizeof(Float)params.nw4, cudaMemcpyDeviceToHost);
	-
	-
	- // Contact information
	- if (CONTACTINFO == 1) {
	- cudaMemcpy(host_contacts, dev_contacts, sizeof(unsigned int)p.npNC, …
	- cudaMemcpy(host_delta_t, dev_delta_t, memSizeF4*NC, cudaMemcpyDeviceTo…
	- cudaMemcpy(host_distmod, dev_distmod, memSizeF4*NC, cudaMemcpyDeviceTo…
	- }
	+ transferFromGlobalDeviceMemory();

	// Pause the CPU thread until all CUDA calls previously issued are compl…
	cudaThreadSynchronize();

	// Write binary output file
	time.step_count += 1;
	- sprintf(file,"%s/output/%s.output%d.bin", cwd, inputbin, time.step_count…
	-
	- if (fwritebin(file, &p, host_x, host_vel,
	- host_angvel, host_force,
	- host_torque, host_angpos,
	- host_bonds,
	- &grid, &time, &params,
	- host_w_nx, host_w_mvfd) != 0) {
	- cout << "\n Error during fwritebin() in main loop\n";
	- exit(EXIT_FAILURE);
	- }
	+ sprintf(file,"output/%s.output%d.bin", inputbin, time.step_count);
	+ writebin(file);
	+

	if (CONTACTINFO == 1) {
	// Write contact information to stdout
	t@@ -686,7 +685,7 @@ __host__ void DEM::startTime()
	}

	// Update status.dat at the interval of filetime
	- sprintf(file,"%s/output/%s.status.dat", cwd, inputbin);
	+ sprintf(file,"output/%s.status.dat", inputbin);
	fp = fopen(file, "w");
	fprintf(fp,"%2.4e %2.4e %d\n",
	time.current,
	t@@ -747,48 +746,11 @@ __host__ void DEM::startTime()


	// Free GPU device memory
	- printf("\nLiberating device memory: ");
	-
	- // Particle arrays
	- cudaFree(dev_x);
	- cudaFree(dev_x_sorted);
	- cudaFree(dev_vel);
	- cudaFree(dev_vel_sorted);
	- cudaFree(dev_angvel);
	- cudaFree(dev_angvel_sorted);
	- cudaFree(dev_acc);
	- cudaFree(dev_angacc);
	- cudaFree(dev_force);
	- cudaFree(dev_torque);
	- cudaFree(dev_angpos);
	- cudaFree(dev_radius);
	- cudaFree(dev_radius_sorted);
	- cudaFree(dev_es_dot);
	- cudaFree(dev_ev_dot);
	- cudaFree(dev_es);
	- cudaFree(dev_ev);
	- cudaFree(dev_p);
	- //cudaFree(dev_bonds);
	- //cudaFree(dev_bonds_sorted);
	- cudaFree(dev_contacts);
	- cudaFree(dev_distmod);
	- cudaFree(dev_delta_t);
	-
	- // Cell-related arrays
	- cudaFree(dev_gridParticleIndex);
	- cudaFree(dev_cellStart);
	- cudaFree(dev_cellEnd);
	-
	- // Wall arrays
	- cudaFree(dev_w_nx);
	- cudaFree(dev_w_mvfd);
	- cudaFree(dev_w_force);
	- cudaFree(dev_w_force_partial);
	+ freeGlobalDeviceMemory();

	// Contact info arrays
	delete[] host_contacts;
	delete[] host_distmod;
	delete[] host_delta_t;

	- printf("Done\n");
	} /* EOF */
	diff --git a/src/integration.cuh b/src/integration.cuh
	t@@ -8,7 +8,7 @@
	// Second order integration scheme based on Taylor expansion of particle kinem…
	// Kernel executed on device, and callable from host only.
	__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Inp…
	- Float4* dev_angvel_sorted, Float* dev_radius_sorted,…
	+ Float4* dev_angvel_sorted,
	Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, …
	Float4* dev_force, Float4* dev_torque, Float4* dev_a…
	unsigned int* dev_gridParticleIndex) // Input: Sorte…
	t@@ -32,7 +32,7 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_…
	Float4 x = dev_x_sorted[idx];
	Float4 vel = dev_vel_sorted[idx];
	Float4 angvel = dev_angvel_sorted[idx];
	- Float radius = dev_radius_sorted[idx];
	+ Float radius = x.w;

	// Coherent read from constant memory to registers
	Float dt = devC_params.dt;
	diff --git a/src/sorting.cuh b/src/sorting.cuh
	t@@ -43,15 +43,16 @@ __global__ void calcParticleCellID(unsigned int* dev_gridP…
	// Reorder particle data into sorted order, and find the start and end particl…
	// of each cell in the sorted hash array.
	// Kernel executed on device, and callable from host only.
	-__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_c…
	+__global__ void reorderArrays(unsigned int* dev_cellStart,
	+ unsigned int* dev_cellEnd,
	unsigned int* dev_gridParticleCellID,
	unsigned int* dev_gridParticleIndex,
	- Float4* dev_x, Float4* dev_vel,
	- Float4* dev_angvel, Float* dev_radius,
	- //uint4* dev_bonds,
	- Float4* dev_x_sorted, Float4* dev_vel_sorted,
	- Float4* dev_angvel_sorted, Float* dev_radius_sor…
	- //uint4* dev_bonds_sorted)
	+ Float4* dev_x,
	+ Float4* dev_vel,
	+ Float4* dev_angvel,
	+ Float4* dev_x_sorted,
	+ Float4* dev_vel_sorted,
	+ Float4* dev_angvel_sorted)
	{

	// Create hash array in shared on-chip memory. The size of the array
	t@@ -107,16 +108,12 @@ __global__ void reorderArrays(unsigned int* dev_cellStar…
	Float4 x = dev_x[sortedIndex];
	Float4 vel = dev_vel[sortedIndex];
	Float4 angvel = dev_angvel[sortedIndex];
	- Float radius = dev_radius[sortedIndex];
	- //uint4 bonds = dev_bonds[sortedIndex];

	__syncthreads();
	// Write sorted data to global memory
	dev_x_sorted[idx] = x;
	dev_vel_sorted[idx] = vel;
	dev_angvel_sorted[idx] = angvel;
	- dev_radius_sorted[idx] = radius;
	- //dev_bonds_sorted[idx] = bonds;
	}
	} // End of reorderArrays(...)

	diff --git a/src/sphere.h b/src/sphere.h
	t@@ -20,31 +20,29 @@ class DEM {
	unsigned int np;

	// Structure containing individual particle kinematics
	- Kinematics k;
	- Kinematics dev_k;
	+ Kinematics k; // host
	+ Kinematics dev_k; // device

	// Structure containing energy values
	- Energies e;
	- Energies dev_e;
	+ Energies e; // host
	+ Energies dev_e; // device

	// Structure of global parameters
	- Params params;
	- Params dev_params;
	+ Params params; // host

	// Structure containing spatial parameters
	- Grid grid;
	- Grid dev_grid;
	+ Grid grid; // host
	+
	+ // Structure containing sorting arrays
	+ Sorting dev_sort; // device

	// Structure of temporal parameters
	- Time time;
	- Time dev_time;
	+ Time time; // host
	+ Time *dev_time; // device

	// Structure of wall parameters
	- Walls walls;
	- Walls dev_walls;
	-
	- // Wall force arrays
	- Float* dev_w_force;
	+ Walls walls; // host
	+ Walls dev_walls; // device

	// GPU initialization, must be called before startTime()
	void initializeGPU(void);
	t@@ -58,6 +56,9 @@ class DEM {
	// Allocate global device memory to hold data
	void allocateGlobalDeviceMemory(void);

	+ // Free dynamically allocated global device memory
	+ void freeGlobalDeviceMemory(void);
	+
	// Copy non-constant data to global GPU memory
	void transferToGlobalDeviceMemory(void);