GopherProxy

	tIO now works in Python, C++ and CUDA, see python/tests.py. Now working on impl…
	git clone git://src.adamsgaard.dk/sphere
	Log
	Files
	Refs
	LICENSE
	---
	commit 472d81e85b29f2ad92df2031ff56a9ac2cdf8699
	parent c494a43f245cf27c2c498ce27636347fbbb1a909
	Author: Anders Damsgaard <[email protected]>
	Date: Fri, 2 Nov 2012 09:29:51 +0100

	IO now works in Python, C++ and CUDA, see python/tests.py. Now working on imple…

	Diffstat:
	M python/sphere.py \| 77 +++++++++++++++--------------…
	M python/tests.py \| 20 +++++++++++---------
	M src/Makefile \| 4 ++--
	M src/contactsearch.cuh \| 29 +++++++++++++++++++----------
	M src/datatypes.h \| 1 -
	M src/device.cu \| 346 ++++++++++++++++-------------…
	M src/file_io.cpp \| 1 -
	M src/integration.cuh \| 18 ++++++++++--------
	M src/sphere.cpp \| 22 ++++++++++++----------
	M src/sphere.h \| 51 +++++++++++++++++++++++++----…
	M src/utility.cu \| 4 ++--

	11 files changed, 315 insertions(+), 258 deletions(-)
	---
	diff --git a/python/sphere.py b/python/sphere.py
	t@@ -51,17 +51,21 @@ class Spherebin:
	self.ev = numpy.zeros(self.np, dtype=numpy.float64)
	self.p = numpy.zeros(self.np, dtype=numpy.float64)

	- self.g = numpy.array([0.0, 0.0, 0.0], dtype=numpy.float64)
	- self.k_n = numpy.ones(1, dtype=numpy.float64) * 1.16e9
	- self.k_t = numpy.ones(1, dtype=numpy.float64) * 1.16e9
	- self.k_r = numpy.zeros(1, dtype=numpy.float64)
	- self.gamma_n = numpy.zeros(1, dtype=numpy.float64)
	- self.gamma_t = numpy.zeros(1, dtype=numpy.float64)
	- self.gamma_r = numpy.zeros(1, dtype=numpy.float64)
	- self.mu_s = numpy.ones(1, dtype=numpy.float64)
	- self.mu_d = numpy.ones(1, dtype=numpy.float64)
	- self.mu_r = numpy.zeros(1, dtype=numpy.float64)
	- self.rho = numpy.ones(1, dtype=numpy.float64) * 2600.0
	+ self.g = numpy.array([0.0, 0.0, 0.0], dtype=numpy.float64)
	+ self.k_n = numpy.ones(1, dtype=numpy.float64) * 1.16e9
	+ self.k_t = numpy.ones(1, dtype=numpy.float64) * 1.16e9
	+ self.k_r = numpy.zeros(1, dtype=numpy.float64)
	+ self.gamma_n = numpy.zeros(1, dtype=numpy.float64)
	+ self.gamma_t = numpy.zeros(1, dtype=numpy.float64)
	+ self.gamma_r = numpy.zeros(1, dtype=numpy.float64)
	+ self.mu_s = numpy.ones(1, dtype=numpy.float64)
	+ self.mu_d = numpy.ones(1, dtype=numpy.float64)
	+ self.mu_r = numpy.zeros(1, dtype=numpy.float64)
	+ self.gamma_wn = numpy.ones(1, dtype=numpy.float64) * 1.0e3
	+ self.gamma_wt = numpy.ones(1, dtype=numpy.float64) * 1.0e3
	+ self.mu_ws = numpy.ones(1, dtype=numpy.float64)
	+ self.mu_wd = numpy.ones(1, dtype=numpy.float64)
	+ self.rho = numpy.ones(1, dtype=numpy.float64) * 2600.0
	self.contactmodel = numpy.ones(1, dtype=numpy.uint32) * 2 # contactLi…
	self.kappa = numpy.zeros(1, dtype=numpy.float64)
	self.db = numpy.zeros(1, dtype=numpy.float64)
	t@@ -70,19 +74,16 @@ class Spherebin:
	# Wall data
	self.nw = numpy.ones(1, dtype=numpy.uint32) * nw
	self.wmode = numpy.zeros(self.nw, dtype=numpy.int32)
	+
	self.w_n = numpy.zeros(self.nw*self.nd, dtype=numpy.float64).reshape(s…
	- self.w_n[nw-1,nd-1] = -1.0
	+ if (self.nw > 0):
	+ self.w_n[0,2] = -1.0
	self.w_x = numpy.ones(self.nw, dtype=numpy.float64)
	self.w_m = numpy.zeros(self.nw, dtype=numpy.float64)
	self.w_vel = numpy.zeros(self.nw, dtype=numpy.float64)
	self.w_force = numpy.zeros(self.nw, dtype=numpy.float64)
	self.w_devs = numpy.zeros(self.nw, dtype=numpy.float64)

	- # x- and y-boundary behavior
	- self.gamma_wn = numpy.ones(1, dtype=numpy.float64) * 1.0e3
	- self.gamma_wt = numpy.ones(1, dtype=numpy.float64) * 1.0e3
	- self.gamma_wr = numpy.ones(1, dtype=numpy.float64) * 1.0e3
	-
	# Compare the values of two Spherebin objects, and check
	# whether the values are identical
	def __cmp__(self, other):
	t@@ -136,8 +137,7 @@ class Spherebin:
	(self.w_force == other.w_force).all() and\
	(self.w_devs == other.w_devs).all() and\
	self.gamma_wn == other.gamma_wn and\
	- self.gamma_wt == other.gamma_wt and\
	- self.gamma_wr == other.gamma_wr\
	+ self.gamma_wt == other.gamma_wt\
	).all() == True):
	return 0 # All equal
	else:
	t@@ -224,6 +224,10 @@ class Spherebin:
	self.mu_s = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	self.mu_d = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	self.mu_r = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	+ self.gamma_wn = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	+ self.gamma_wt = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	+ self.mu_ws = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	+ self.mu_wd = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	self.rho = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	self.contactmodel = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
	self.kappa = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	t@@ -250,10 +254,6 @@ class Spherebin:
	self.w_force = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	self.w_devs = numpy.fromfile(fh, dtype=numpy.float64, count=1)

	- # x- and y-boundary behavior
	- self.gamma_wn = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	- self.gamma_wt = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	- self.gamma_wr = numpy.fromfile(fh, dtype=numpy.float64, count=1)

	fh.close()

	t@@ -325,6 +325,10 @@ class Spherebin:
	fh.write(self.mu_s.astype(numpy.float64))
	fh.write(self.mu_d.astype(numpy.float64))
	fh.write(self.mu_r.astype(numpy.float64))
	+ fh.write(self.gamma_wn.astype(numpy.float64))
	+ fh.write(self.gamma_wt.astype(numpy.float64))
	+ fh.write(self.mu_ws.astype(numpy.float64))
	+ fh.write(self.mu_wd.astype(numpy.float64))
	fh.write(self.rho.astype(numpy.float64))
	fh.write(self.contactmodel.astype(numpy.uint32))
	fh.write(self.kappa.astype(numpy.float64))
	t@@ -343,11 +347,6 @@ class Spherebin:
	fh.write(self.w_force[i].astype(numpy.float64))
	fh.write(self.w_devs[i].astype(numpy.float64))

	- # x- and y-boundary behavior
	- fh.write(self.gamma_wn.astype(numpy.float64))
	- fh.write(self.gamma_wt.astype(numpy.float64))
	- fh.write(self.gamma_wr.astype(numpy.float64))
	-
	fh.close()

	finally:
	t@@ -633,14 +632,15 @@ class Spherebin:
	self.L = self.num * cellsize

	# Initialize upper wall
	- self.wmode[0] = 0
	- self.w_n[0,2] = -1.0
	- self.w_x[0] = self.L[2]
	- self.w_m[0] = self.rho[0] * self.np * math.pi * r_max**3
	- self.w_vel[0] = 0.0
	- self.w_force[0] = 0.0
	- self.w_devs[0] = 0.0
	- self.nw = numpy.ones(1, dtype=numpy.uint32) * 1
	+ if (self.nw > 0):
	+ self.wmode[0] = 0
	+ self.w_n[0,2] = -1.0
	+ self.w_x[0] = self.L[2]
	+ self.w_m[0] = self.rho[0] * self.np * math.pi * r_max**3
	+ self.w_vel[0] = 0.0
	+ self.w_force[0] = 0.0
	+ self.w_devs[0] = 0.0
	+ self.nw = numpy.ones(1, dtype=numpy.uint32) * 1

	# Adjust grid and upper wall for consolidation under deviatoric stress
	def consolidate(self, deviatoric_stress = 10e3,
	t@@ -755,8 +755,6 @@ class Spherebin:
	# Set wall viscosities to zero
	self.gamma_wn[0] = 0.0
	self.gamma_wt[0] = 0.0
	- self.gamma_wr[0] = 0.0
	-


	def initTemporal(self, total,
	t@@ -792,7 +790,6 @@ class Spherebin:
	gamma_r = 0,
	gamma_wn = 1e3,
	gamma_wt = 1e3,
	- gamma_wr = 2e3,
	capillaryCohesion = 0):
	""" Initialize particle parameters to default values.
	Radii must be set prior to calling this function.
	t@@ -837,8 +834,6 @@ class Spherebin:
	# Wall viscosities
	self.gamma_wn[0] = gamma_wn # normal
	self.gamma_wt[0] = gamma_wt # sliding
	- self.gamma_wr[0] = gamma_wr # rolling
	-

	### Parameters related to capillary bonds

	diff --git a/python/tests.py b/python/tests.py
	t@@ -12,17 +12,13 @@ def compare(first, second, string):
	print("### Input/output tests ###")

	# Generate data in python
	-orig = Spherebin(100)
	+orig = Spherebin(np = 100, nw = 0)
	orig.generateRadii()
	orig.defaultParams()
	-orig.initRandomGridPos()
	-orig.initTemporal(total = 0.0, current = 0.0)
	-orig.xysum = numpy.ones(orig.np2, dtype=numpy.float64).reshape(orig.np, 2) 1
	-orig.vel = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, o…
	-orig.force = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np,…
	-orig.angpos = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np…
	-orig.angvel = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np…
	-orig.torque = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np…
	+orig.initRandomGridPos(g = numpy.zeros(orig.nd))
	+orig.initTemporal(current = 0.0, total = 0.0)
	+orig.time_total = 2.0*orig.time_dt;
	+orig.time_file_dt = orig.time_dt;
	orig.writebin("orig.bin", verbose=False)

	# Test Python IO routines
	t@@ -36,4 +32,10 @@ cpp = Spherebin()
	cpp.readbin("../output/orig.output0.bin", verbose=False)
	compare(orig, cpp, "C++ IO: ")

	+# Test CUDA IO routines
	+cuda = Spherebin()
	+cuda.readbin("../output/orig.output1.bin", verbose=False)
	+cuda.time_current = orig.time_current
	+cuda.time_step_count = orig.time_step_count
	+compare(orig, cuda, "CUDA IO: ")

	diff --git a/src/Makefile b/src/Makefile
	t@@ -27,8 +27,8 @@ NVCCFLAGS=--use_fast_math -O3 -m64 -gencode=arch=compute_20,…

	# Debugable code? Beware that enabling this option will
	# considerably slow down the execution.
	-CCFLAGS=-g -O0 -Wall
	-NVCCFLAGS=-g -G -O0 -m64 -gencode=arch=compute_20,code=\"sm_20,compute_20\" -X…
	+#CCFLAGS=-g -O0 -Wall
	+#NVCCFLAGS=-g -G -O0 -m64 -gencode=arch=compute_20,code=\"sm_20,compute_20\" -…

	DATE=`date +'%Y.%m.%d-%H:%M:%S'`
	BACKUPNAME=sphere.$(DATE).tar.gz
	diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
	t@@ -91,8 +91,8 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
	Float4* dev_angvel_sorted,
	unsigned int* dev_cellStart,
	unsigned int* dev_cellEnd,
	- Float4* dev_w_nx,
	- Float4* dev_w_mvfd)
	+ Float4* dev_walls_nx,
	+ Float4* dev_walls_mvfd)
	//uint4 bonds)
	{

	t@@ -382,9 +382,9 @@ __global__ void interact(unsigned int* dev_gridParticleInd…
	Float* dev_es,
	Float* dev_ev,
	Float* dev_p,
	- Float4* dev_w_nx,
	- Float4* dev_w_mvfd,
	- Float* dev_w_force, //uint4* dev_bonds_sorted,
	+ Float4* dev_walls_nx,
	+ Float4* dev_walls_mvfd,
	+ Float* dev_walls_force_pp, //uint4* dev_bonds_sorted,
	unsigned int* dev_contacts,
	Float4* dev_distmod,
	Float4* dev_delta_t)
	t@@ -399,9 +399,6 @@ __global__ void interact(unsigned int* dev_gridParticleInd…
	Float4 x_a = dev_x_sorted[idx_a];
	Float radius_a = x_a.w;

	- // Fetch wall data in global read
	- Float4 w_up_nx = dev_w_nx[0];
	- Float4 w_up_mvfd = dev_w_mvfd[0];

	// Fetch world dimensions in constant memory read
	Float3 origo = MAKE_FLOAT3(devC_grid.origo[0],
	t@@ -411,6 +408,17 @@ __global__ void interact(unsigned int* dev_gridParticleIn…
	devC_grid.L[1],
	devC_grid.L[2]);

	+ // Fetch wall data in global read
	+ Float4 w_up_nx;
	+ Float4 w_up_mvfd;
	+ if (devC_nw > 0) {
	+ w_up_nx = dev_walls_nx[0];
	+ w_up_mvfd = dev_walls_mvfd[0];
	+ } else {
	+ w_up_nx = MAKE_FLOAT4(0.0f, 0.0f, -1.0f, L.z);
	+ w_up_mvfd = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
	+ }
	+
	// Index of particle which is bonded to particle A.
	// The index is equal to the particle no (p.np)
	// if particle A is bond-less.
	t@@ -526,7 +534,7 @@ __global__ void interact(unsigned int* dev_gridParticleInd…
	dev_x_sorted,
	dev_vel_sorted, dev_angvel_sorted,
	dev_cellStart, dev_cellEnd,
	- dev_w_nx, dev_w_mvfd);
	+ dev_walls_nx, dev_walls_mvfd);
	}
	}
	}
	t@@ -629,7 +637,8 @@ __global__ void interact(unsigned int* dev_gridParticleInd…
	dev_es[orig_idx] += es_dot * devC_dt;
	dev_ev[orig_idx] += ev_dot * devC_dt;
	dev_p[orig_idx] = p;
	- dev_w_force[orig_idx] = w_force;
	+ if (devC_nw > 0)
	+ dev_walls_force_pp[orig_idx] = w_force;
	}
	} // End of interact(...)

	diff --git a/src/datatypes.h b/src/datatypes.h
	t@@ -95,7 +95,6 @@ struct Walls {
	int wmode[MAXWALLS]; // Wall modes
	Float4* nx; // Wall normal and position
	Float4* mvfd; // Wall mass, velocity, force and dev. stress
	- Float* force; // Resulting forces on walls per particle
	};

	#endif
	diff --git a/src/device.cu b/src/device.cu
	t@@ -220,38 +220,44 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
	if (verbose == 1)
	std::cout << " Allocating global device memory: ";

	- // Particle arrays
	- cudaMalloc((void**)&dev_k->x, memSizeF4);
	- cudaMalloc((void**)&dev_sort->x_sorted, memSizeF4);
	- cudaMalloc((void**)&dev_k->vel, memSizeF4);
	- cudaMalloc((void**)&dev_sort->vel_sorted, memSizeF4);
	- cudaMalloc((void**)&dev_k->angvel, memSizeF4);
	- cudaMalloc((void**)&dev_sort->angvel_sorted, memSizeF4);
	- cudaMalloc((void**)&dev_k->acc, memSizeF4);
	k.acc = new Float4[np];
	- cudaMalloc((void**)&dev_k->angacc, memSizeF4);
	k.angacc = new Float4[np];
	- cudaMalloc((void**)&dev_k->force, memSizeF4);
	- cudaMalloc((void**)&dev_k->torque, memSizeF4);
	- cudaMalloc((void**)&dev_k->angpos, memSizeF4);
	- cudaMalloc((void**)&dev_e->es_dot, memSizeF);
	- cudaMalloc((void**)&dev_e->ev_dot, memSizeF);
	- cudaMalloc((void**)&dev_e->es, memSizeF);
	- cudaMalloc((void**)&dev_e->ev, memSizeF);
	- cudaMalloc((void**)&dev_e->p, memSizeF);

	- // Cell-related arrays
	- cudaMalloc((void*)&dev_sort->gridParticleCellID, sizeof(unsigned int)np);
	- cudaMalloc((void*)&dev_sort->gridParticleIndex, sizeof(unsigned int)np);
	- cudaMalloc((void*)&dev_sort->cellStart, sizeof(unsigned int)grid.num[0]*gr…
	- cudaMalloc((void*)&dev_sort->cellEnd, sizeof(unsigned int)grid.num[0]*grid…
	+ // Kinematics arrays
	+ cudaMalloc((void**)&dev_x, memSizeF4);
	+ cudaMalloc((void**)&dev_xysum, memSizeF4);
	+ cudaMalloc((void**)&dev_vel, memSizeF4);
	+ cudaMalloc((void**)&dev_acc, memSizeF4);
	+ cudaMalloc((void**)&dev_force, memSizeF4);
	+ cudaMalloc((void**)&dev_angpos, memSizeF4);
	+ cudaMalloc((void**)&dev_angvel, memSizeF4);
	+ cudaMalloc((void**)&dev_angacc, memSizeF4);
	+ cudaMalloc((void**)&dev_torque, memSizeF4);

	// Particle contact bookkeeping arrays
	- cudaMalloc((void*)&dev_k->contacts, sizeof(unsigned int)np*NC); // Max NC …
	- cudaMalloc((void*)&dev_k->distmod, sizeof(Float4)np*NC);
	- cudaMalloc((void*)&dev_k->delta_t, sizeof(Float4)np*NC);
	+ cudaMalloc((void*)&dev_contacts, sizeof(unsigned int)np*NC); // Max NC con…
	+ cudaMalloc((void*)&dev_distmod, memSizeF4NC);
	+ cudaMalloc((void*)&dev_delta_t, memSizeF4NC);
	+
	+ // Sorted arrays
	+ cudaMalloc((void**)&dev_x_sorted, memSizeF4);
	+ cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
	+ cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
	+
	+ // Energy arrays
	+ cudaMalloc((void**)&dev_es_dot, memSizeF);
	+ cudaMalloc((void**)&dev_ev_dot, memSizeF);
	+ cudaMalloc((void**)&dev_es, memSizeF);
	+ cudaMalloc((void**)&dev_ev, memSizeF);
	+ cudaMalloc((void**)&dev_p, memSizeF);
	+
	+ // Cell-related arrays
	+ cudaMalloc((void*)&dev_gridParticleCellID, sizeof(unsigned int)np);
	+ cudaMalloc((void*)&dev_gridParticleIndex, sizeof(unsigned int)np);
	+ cudaMalloc((void*)&dev_cellStart, sizeof(unsigned int)grid.num[0]*grid.num…
	+ cudaMalloc((void*)&dev_cellEnd, sizeof(unsigned int)grid.num[0]*grid.num[1…

	- // Host contact bookkeeping arrays
	+ // Host contact bookkeeping arrays
	k.contacts = new unsigned int[np*NC];
	// Initialize contacts lists to np
	for (unsigned int i=0; i<(np*NC); ++i)
	t@@ -260,10 +266,11 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
	k.delta_t = new Float4[np*NC];

	// Wall arrays
	- cudaMalloc((void*)&dev_walls->nx, sizeof(Float4)walls.nw);
	- cudaMalloc((void*)&dev_walls->mvfd, sizeof(Float4)walls.nw);
	- cudaMalloc((void*)&dev_walls->force, sizeof(Float)walls.nw*np);
	- // dev_w_force_partial allocated later
	+ cudaMalloc((void*)&dev_walls_wmode, sizeof(int)walls.nw);
	+ cudaMalloc((void*)&dev_walls_nx, sizeof(Float4)walls.nw);
	+ cudaMalloc((void*)&dev_walls_mvfd, sizeof(Float4)walls.nw);
	+ cudaMalloc((void*)&dev_walls_force_pp, sizeof(Float)walls.nw*np);
	+ // dev_walls_force_partial allocated later

	checkForCudaErrors("End of allocateGlobalDeviceMemory");
	if (verbose == 1)
	t@@ -275,36 +282,40 @@ __host__ void DEM::freeGlobalDeviceMemory()
	if (verbose == 1)
	printf("\nLiberating device memory: ");
	// Particle arrays
	- cudaFree(dev_k->x);
	- cudaFree(dev_sort->x_sorted);
	- cudaFree(dev_k->vel);
	- cudaFree(dev_sort->vel_sorted);
	- cudaFree(dev_k->angvel);
	- cudaFree(dev_sort->angvel_sorted);
	- cudaFree(dev_k->acc);
	- cudaFree(dev_k->angacc);
	- cudaFree(dev_k->force);
	- cudaFree(dev_k->torque);
	- cudaFree(dev_k->angpos);
	- cudaFree(dev_e->es_dot);
	- cudaFree(dev_e->ev_dot);
	- cudaFree(dev_e->es);
	- cudaFree(dev_e->ev);
	- cudaFree(dev_e->p);
	- cudaFree(dev_k->contacts);
	- cudaFree(dev_k->distmod);
	- cudaFree(dev_k->delta_t);
	+ cudaFree(dev_x);
	+ cudaFree(dev_xysum);
	+ cudaFree(dev_vel);
	+ cudaFree(dev_acc);
	+ cudaFree(dev_force);
	+ cudaFree(dev_angpos);
	+ cudaFree(dev_angvel);
	+ cudaFree(dev_angacc);
	+ cudaFree(dev_torque);
	+
	+ cudaFree(dev_contacts);
	+ cudaFree(dev_distmod);
	+ cudaFree(dev_delta_t);
	+
	+ cudaFree(dev_es_dot);
	+ cudaFree(dev_es);
	+ cudaFree(dev_ev_dot);
	+ cudaFree(dev_ev);
	+ cudaFree(dev_p);
	+
	+ cudaFree(dev_x_sorted);
	+ cudaFree(dev_vel_sorted);
	+ cudaFree(dev_angvel_sorted);

	// Cell-related arrays
	- cudaFree(dev_sort->gridParticleIndex);
	- cudaFree(dev_sort->cellStart);
	- cudaFree(dev_sort->cellEnd);
	+ cudaFree(dev_gridParticleIndex);
	+ cudaFree(dev_cellStart);
	+ cudaFree(dev_cellEnd);

	// Wall arrays
	- cudaFree(dev_walls->nx);
	- cudaFree(dev_walls->mvfd);
	- cudaFree(dev_walls->force);
	- //cudaFree(dev_w_force_partial);
	+ cudaFree(dev_walls_nx);
	+ cudaFree(dev_walls_mvfd);
	+ cudaFree(dev_walls_force_partial);
	+ cudaFree(dev_walls_force_pp);

	if (verbose == 1)
	printf("Done\n");
	t@@ -321,55 +332,53 @@ __host__ void DEM::transferToGlobalDeviceMemory()
	unsigned int memSizeF4 = sizeof(Float4) * np;

	// Copy static-size structure data from host to global device memory
	- cudaMemcpy(dev_time, &time, sizeof(Time), cudaMemcpyHostToDevice);
	+ //cudaMemcpy(dev_time, &time, sizeof(Time), cudaMemcpyHostToDevice);

	// Kinematic particle values
	- cudaMemcpy( dev_k->x, k.x,
	+ cudaMemcpy( dev_x, k.x,
	memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->xysum, k.xysum,
	+ cudaMemcpy( dev_xysum, k.xysum,
	sizeof(Float2)*np, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->vel, k.vel,
	+ cudaMemcpy( dev_vel, k.vel,
	memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->acc, k.acc,
	+ cudaMemcpy( dev_acc, k.acc,
	memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->force, k.force,
	+ cudaMemcpy( dev_force, k.force,
	memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->angpos, k.angpos,
	+ cudaMemcpy( dev_angpos, k.angpos,
	memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->angvel, k.angvel,
	+ cudaMemcpy( dev_angvel, k.angvel,
	memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->angacc, k.angacc,
	+ cudaMemcpy( dev_angacc, k.angacc,
	memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->torque, k.torque,
	+ cudaMemcpy( dev_torque, k.torque,
	memSizeF4, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->contacts, k.contacts,
	+ cudaMemcpy( dev_contacts, k.contacts,
	sizeof(unsigned int)npNC, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->distmod, k.distmod,
	+ cudaMemcpy( dev_distmod, k.distmod,
	memSizeF4*NC, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_k->delta_t, k.delta_t,
	+ cudaMemcpy( dev_delta_t, k.delta_t,
	memSizeF4*NC, cudaMemcpyHostToDevice);

	// Individual particle energy values
	- cudaMemcpy( dev_e->es_dot, e.es_dot,
	+ cudaMemcpy( dev_es_dot, e.es_dot,
	memSizeF, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_e->es, e.es,
	+ cudaMemcpy( dev_es, e.es,
	memSizeF, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_e->ev_dot, e.ev_dot,
	+ cudaMemcpy( dev_ev_dot, e.ev_dot,
	memSizeF, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_e->ev, e.ev,
	+ cudaMemcpy( dev_ev, e.ev,
	memSizeF, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_e->p, e.p,
	+ cudaMemcpy( dev_p, e.p,
	memSizeF, cudaMemcpyHostToDevice);

	// Wall parameters
	- cudaMemcpy( dev_walls->wmode, walls.wmode,
	+ cudaMemcpy( dev_walls_wmode, walls.wmode,
	sizeof(int)*walls.nw, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_walls->nx, walls.nx,
	+ cudaMemcpy( dev_walls_nx, walls.nx,
	sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_walls->mvfd, walls.mvfd,
	+ cudaMemcpy( dev_walls_mvfd, walls.mvfd,
	sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
	- cudaMemcpy( dev_walls->force, walls.force,
	- memSizeF*walls.nw, cudaMemcpyHostToDevice);

	checkForCudaErrors("End of transferToGlobalDeviceMemory");
	if (verbose == 1)
	t@@ -385,55 +394,53 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
	unsigned int memSizeF4 = sizeof(Float4) * np;

	// Copy static-size structure data from host to global device memory
	- cudaMemcpy(&time, dev_time, sizeof(Time), cudaMemcpyDeviceToHost);
	+ //cudaMemcpy(&time, dev_time, sizeof(Time), cudaMemcpyDeviceToHost);

	// Kinematic particle values
	- cudaMemcpy( k.x, dev_k->x,
	+ cudaMemcpy( k.x, dev_x,
	memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.xysum, dev_k->xysum,
	+ cudaMemcpy( k.xysum, dev_xysum,
	sizeof(Float2)*np, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.vel, dev_k->vel,
	+ cudaMemcpy( k.vel, dev_vel,
	memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.acc, dev_k->acc,
	+ cudaMemcpy( k.acc, dev_acc,
	memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.force, dev_k->force,
	+ cudaMemcpy( k.force, dev_force,
	memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.angpos, dev_k->angpos,
	+ cudaMemcpy( k.angpos, dev_angpos,
	memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.angvel, dev_k->angvel,
	+ cudaMemcpy( k.angvel, dev_angvel,
	memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.angacc, dev_k->angacc,
	+ cudaMemcpy( k.angacc, dev_angacc,
	memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.torque, dev_k->torque,
	+ cudaMemcpy( k.torque, dev_torque,
	memSizeF4, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.contacts, dev_k->contacts,
	+ cudaMemcpy( k.contacts, dev_contacts,
	sizeof(unsigned int)npNC, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.distmod, dev_k->distmod,
	+ cudaMemcpy( k.distmod, dev_distmod,
	memSizeF4*NC, cudaMemcpyDeviceToHost);
	- cudaMemcpy( k.delta_t, dev_k->delta_t,
	+ cudaMemcpy( k.delta_t, dev_delta_t,
	memSizeF4*NC, cudaMemcpyDeviceToHost);

	// Individual particle energy values
	- cudaMemcpy( e.es_dot, dev_e->es_dot,
	+ cudaMemcpy( e.es_dot, dev_es_dot,
	memSizeF, cudaMemcpyDeviceToHost);
	- cudaMemcpy( e.es, dev_e->es,
	+ cudaMemcpy( e.es, dev_es,
	memSizeF, cudaMemcpyDeviceToHost);
	- cudaMemcpy( e.ev_dot, dev_e->ev_dot,
	+ cudaMemcpy( e.ev_dot, dev_ev_dot,
	memSizeF, cudaMemcpyDeviceToHost);
	- cudaMemcpy( e.ev, dev_e->ev,
	+ cudaMemcpy( e.ev, dev_ev,
	memSizeF, cudaMemcpyDeviceToHost);
	- cudaMemcpy( e.p, dev_e->p,
	+ cudaMemcpy( e.p, dev_p,
	memSizeF, cudaMemcpyDeviceToHost);

	// Wall parameters
	- cudaMemcpy( walls.wmode, dev_walls->wmode,
	+ cudaMemcpy( walls.wmode, dev_walls_wmode,
	sizeof(int)*walls.nw, cudaMemcpyDeviceToHost);
	- cudaMemcpy( walls.nx, dev_walls->nx,
	+ cudaMemcpy( walls.nx, dev_walls_nx,
	sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
	- cudaMemcpy( walls.mvfd, dev_walls->mvfd,
	+ cudaMemcpy( walls.mvfd, dev_walls_mvfd,
	sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
	- cudaMemcpy( walls.force, dev_walls->force,
	- memSizeF*walls.nw, cudaMemcpyDeviceToHost);

	checkForCudaErrors("End of transferFromGlobalDeviceMemory");
	}
	t@@ -476,8 +483,8 @@ __host__ void DEM::startTime()
	// Shared memory per block
	unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);

	- Float* dev_w_force_partial;
	- cudaMalloc((void*)&dev_w_force_partial, sizeof(Float)dimGrid.x);
	+ // Pre-sum of force per wall
	+ cudaMalloc((void*)&dev_walls_force_partial, sizeof(Float)dimGrid.x);

	// Report to stdout
	if (verbose == 1) {
	t@@ -562,9 +569,9 @@ __host__ void DEM::startTime()
	// in the fine, uniform and homogenous grid.
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort->gridParticleCellID,
	- dev_sort->gridParticleIndex,
	- dev_k->x);
	+ calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
	+ dev_gridParticleIndex,
	+ dev_x);

	// Synchronization point
	cudaThreadSynchronize();
	t@@ -576,9 +583,9 @@ __host__ void DEM::startTime()
	// Sort particle (key, particle ID) pairs by hash key with Thrust radix so…
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort->gridParticleCellID),
	- thrust::device_ptr<uint>(dev_sort->gridParticleCellID …
	- thrust::device_ptr<uint>(dev_sort->gridParticleIndex));
	+ thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID),
	+ thrust::device_ptr<uint>(dev_gridParticleCellID + np),
	+ thrust::device_ptr<uint>(dev_gridParticleIndex));
	cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implic…
	if (PROFILING == 1)
	stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
	t@@ -588,7 +595,7 @@ __host__ void DEM::startTime()
	// Zero cell array values by setting cellStart to its highest possible val…
	// specified with pointer value 0xffffffff, which for a 32 bit unsigned int
	// is 4294967295.
	- cudaMemset(dev_sort->cellStart, 0xffffffff,
	+ cudaMemset(dev_cellStart, 0xffffffff,
	grid.num[0]grid.num[1]grid.num[2]*sizeof(unsigned int));
	cudaThreadSynchronize();
	checkForCudaErrors("Post cudaMemset");
	t@@ -597,15 +604,15 @@ __host__ void DEM::startTime()
	// coherent memory access. Save ordered configurations in new arrays (*_so…
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_sort->cellStart,
	- dev_sort->cellEnd,
	- dev_sort->gridParticleCellI…
	- dev_sort->gridParticleIndex,
	- dev_k->x, dev_k->vel,
	- dev_k->angvel,
	- dev_sort->x_sorted,
	- dev_sort->vel_sorted,
	- dev_sort->angvel_sorted);
	+ reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
	+ dev_cellEnd,
	+ dev_gridParticleCellID,
	+ dev_gridParticleIndex,
	+ dev_x, dev_vel,
	+ dev_angvel,
	+ dev_x_sorted,
	+ dev_vel_sorted,
	+ dev_angvel_sorted);

	// Synchronization point
	cudaThreadSynchronize();
	t@@ -620,12 +627,12 @@ __host__ void DEM::startTime()
	// For each particle: Search contacts in neighbor cells
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- topology<<<dimGrid, dimBlock>>>(dev_sort->cellStart,
	- dev_sort->cellEnd,
	- dev_sort->gridParticleIndex,
	- dev_sort->x_sorted,
	- dev_k->contacts,
	- dev_k->distmod);
	+ topology<<<dimGrid, dimBlock>>>(dev_cellStart,
	+ dev_cellEnd,
	+ dev_gridParticleIndex,
	+ dev_x_sorted,
	+ dev_contacts,
	+ dev_distmod);


	// Synchronization point
	t@@ -639,28 +646,28 @@ __host__ void DEM::startTime()
	// For each particle: Process collisions and compute resulting forces.
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- interact<<<dimGrid, dimBlock>>>(dev_sort->gridParticleIndex,
	- dev_sort->cellStart,
	- dev_sort->cellEnd,
	- dev_k->x,
	- dev_sort->x_sorted,
	- dev_sort->vel_sorted,
	- dev_sort->angvel_sorted,
	- dev_k->vel,
	- dev_k->angvel,
	- dev_k->force,
	- dev_k->torque,
	- dev_e->es_dot,
	- dev_e->ev_dot,
	- dev_e->es,
	- dev_e->ev,
	- dev_e->p,
	- dev_walls->nx,
	- dev_walls->mvfd,
	- dev_walls->force,
	- dev_k->contacts,
	- dev_k->distmod,
	- dev_k->delta_t);
	+ interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
	+ dev_cellStart,
	+ dev_cellEnd,
	+ dev_x,
	+ dev_x_sorted,
	+ dev_vel_sorted,
	+ dev_angvel_sorted,
	+ dev_vel,
	+ dev_angvel,
	+ dev_force,
	+ dev_torque,
	+ dev_es_dot,
	+ dev_ev_dot,
	+ dev_es,
	+ dev_ev,
	+ dev_p,
	+ dev_walls_nx,
	+ dev_walls_mvfd,
	+ dev_walls_force_pp,
	+ dev_contacts,
	+ dev_distmod,
	+ dev_delta_t);


	// Synchronization point
	t@@ -672,17 +679,20 @@ __host__ void DEM::startTime()
	// Update particle kinematics
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- integrate<<<dimGrid, dimBlock>>>(dev_sort->x_sorted,
	- dev_sort->vel_sorted,
	- dev_sort->angvel_sorted,
	- dev_k->x,
	- dev_k->vel,
	- dev_k->angvel,
	- dev_k->force,
	- dev_k->torque,
	- dev_k->angpos,
	- dev_k->xysum,
	- dev_sort->gridParticleIndex);
	+ integrate<<<dimGrid, dimBlock>>>(dev_x_sorted,
	+ dev_vel_sorted,
	+ dev_angvel_sorted,
	+ dev_x,
	+ dev_vel,
	+ dev_angvel,
	+ dev_force,
	+ dev_torque,
	+ dev_angpos,
	+ dev_xysum,
	+ dev_gridParticleIndex);
	+ cudaThreadSynchronize();
	+ checkForCudaErrors("Post integrate");
	+

	cudaThreadSynchronize();
	if (PROFILING == 1)
	t@@ -691,20 +701,26 @@ __host__ void DEM::startTime()
	// Summation of forces on wall
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- summation<<<dimGrid, dimBlock>>>(dev_walls->force, dev_w_force_partial);
	-
	+ if (walls.nw > 0) {
	+ summation<<<dimGrid, dimBlock>>>(dev_walls_force_pp,
	+ dev_walls_force_partial);
	+ }
	// Synchronization point
	cudaThreadSynchronize();
	if (PROFILING == 1)
	stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_summation);
	- checkForCudaErrors("Post integrate & wall force summation");
	+ checkForCudaErrors("Post wall force summation");

	// Update wall kinematics
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- integrateWalls<<< 1, walls.nw>>>(dev_walls,
	- dev_w_force_partial,
	- blocksPerGrid);
	+ if (walls.nw > 0) {
	+ integrateWalls<<< 1, walls.nw>>>(dev_walls_nx,
	+ dev_walls_mvfd,
	+ dev_walls_wmode,
	+ dev_walls_force_partial,
	+ blocksPerGrid);
	+ }

	// Synchronization point
	cudaThreadSynchronize();
	diff --git a/src/file_io.cpp b/src/file_io.cpp
	t@@ -189,7 +189,6 @@ void DEM::readbin(const char *target)
	// Wall mass (x), velocity (y), force (z), and deviatoric stress (w)
	walls.nx = new Float4[walls.nw];
	walls.mvfd = new Float4[walls.nw];
	- walls.force = new Float[walls.nw*np];

	ifs.read(as_bytes(walls.wmode), sizeof(walls.wmode));
	for (i = 0; i<walls.nw; ++i) {
	diff --git a/src/integration.cuh b/src/integration.cuh
	t@@ -175,8 +175,10 @@ __global__ void summation(Float* in, Float *out)
	}

	// Update wall positions
	-__global__ void integrateWalls(Walls* dev_walls,
	- Float* dev_w_force_partial,
	+__global__ void integrateWalls(Float4* dev_walls_nx,
	+ Float4* dev_walls_mvfd,
	+ int* dev_walls_wmode,
	+ Float* dev_walls_force_partial,
	unsigned int blocksPerGrid)
	{
	unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
	t@@ -185,9 +187,9 @@ __global__ void integrateWalls(Walls* dev_walls,

	// Copy data to temporary arrays to avoid any potential read-after-write,
	// write-after-read, or write-after-write hazards.
	- Float4 w_nx = dev_walls->nx[idx];
	- Float4 w_mvfd = dev_walls->mvfd[idx];
	- int wmode = dev_walls->wmode[idx]; // Wall BC, 0: fixed, 1: devs, 2: vel
	+ Float4 w_nx = dev_walls_nx[idx];
	+ Float4 w_mvfd = dev_walls_mvfd[idx];
	+ int wmode = dev_walls_wmode[idx]; // Wall BC, 0: fixed, 1: devs, 2: vel
	Float acc;

	if (wmode == 0) // Wall fixed: do nothing
	t@@ -196,7 +198,7 @@ __global__ void integrateWalls(Walls* dev_walls,
	// Find the final sum of forces on wall
	w_mvfd.z = 0.0f;
	for (int i=0; i<blocksPerGrid; ++i) {
	- w_mvfd.z += dev_w_force_partial[i];
	+ w_mvfd.z += dev_walls_force_partial[i];
	}

	Float dt = devC_dt;
	t@@ -222,8 +224,8 @@ __global__ void integrateWalls(Walls* dev_walls,
	w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;

	// Store data in global memory
	- dev_walls->nx[idx] = w_nx;
	- dev_walls->mvfd[idx] = w_mvfd;
	+ dev_walls_nx[idx] = w_nx;
	+ dev_walls_mvfd[idx] = w_mvfd;
	}
	} // End of integrateWalls(...)

	diff --git a/src/sphere.cpp b/src/sphere.cpp
	t@@ -207,16 +207,18 @@ void DEM::reportValues()
	else
	cout << " - 1st and 2nd dim. boundaries: Visco-frictional walls\n";

	- cout << " - Top BC: ";
	- if (walls.wmode[0] == 0)
	- cout << "Fixed\n";
	- else if (walls.wmode[0] == 1)
	- cout << "Deviatoric stress\n";
	- else if (walls.wmode[0] == 2)
	- cout << "Velocity\n";
	- else {
	- cerr << "Top BC not recognized!\n";
	- exit(1);
	+ if (walls.nw > 0) {
	+ cout << " - Top BC: ";
	+ if (walls.wmode[0] == 0)
	+ cout << "Fixed\n";
	+ else if (walls.wmode[0] == 1)
	+ cout << "Deviatoric stress\n";
	+ else if (walls.wmode[0] == 2)
	+ cout << "Velocity\n";
	+ else {
	+ cerr << "Top BC not recognized!\n";
	+ exit(1);
	+ }
	}

	cout << " - Grid: ";
	diff --git a/src/sphere.h b/src/sphere.h
	t@@ -25,13 +25,14 @@ class DEM {
	// Number of particles
	unsigned int np;

	+ // HOST STRUCTURES
	// Structure containing individual particle kinematics
	Kinematics k; // host
	- Kinematics *dev_k; // device
	+ //Kinematics *dev_k; // device

	// Structure containing energy values
	Energies e; // host
	- Energies *dev_e; // device
	+ //Energies *dev_e; // device

	// Structure of global parameters
	Params params; // host
	t@@ -40,15 +41,46 @@ class DEM {
	Grid grid; // host

	// Structure containing sorting arrays
	- Sorting *dev_sort; // device
	+ //Sorting *dev_sort; // device

	// Structure of temporal parameters
	Time time; // host
	- Time *dev_time; // device
	+ //Time *dev_time; // device

	// Structure of wall parameters
	Walls walls; // host
	- Walls *dev_walls; // device
	+ //Walls *dev_walls; // device
	+
	+ // DEVICE ARRAYS
	+ Float4 *dev_x;
	+ Float2 *dev_xysum;
	+ Float4 *dev_vel;
	+ Float4 *dev_acc;
	+ Float4 *dev_force;
	+ Float4 *dev_angpos;
	+ Float4 *dev_angvel;
	+ Float4 *dev_angacc;
	+ Float4 *dev_torque;
	+ unsigned int *dev_contacts;
	+ Float4 *dev_distmod;
	+ Float4 *dev_delta_t;
	+ Float *dev_es_dot;
	+ Float *dev_es;
	+ Float *dev_ev_dot;
	+ Float *dev_ev;
	+ Float *dev_p;
	+ Float4 *dev_x_sorted;
	+ Float4 *dev_vel_sorted;
	+ Float4 *dev_angvel_sorted;
	+ unsigned int *dev_gridParticleCellID;
	+ unsigned int *dev_gridParticleIndex;
	+ unsigned int *dev_cellStart;
	+ unsigned int *dev_cellEnd;
	+ int *dev_walls_wmode;
	+ Float4 *dev_walls_nx; // normal, pos.
	+ Float4 *dev_walls_mvfd; // Mass, velocity, force, dev. stress
	+ Float *dev_walls_force_partial; // Pre-sum per wall
	+ Float *dev_walls_force_pp; // Force per particle per wall

	// GPU initialization, must be called before startTime()
	void initializeGPU(void);
	t@@ -101,10 +133,11 @@ class DEM {
	void startTime(void);

	// Render particles using raytracing
	- // render(const char *target,
	- // Float3 lookat,
	- // Float3 eye,
	- // Float focalLength);
	+ void render(const char *target,
	+ const Float3 lookat,
	+ const Float3 eye,
	+ const Float focalLength = 1.0,
	+ const int method = 1);

	};

	diff --git a/src/utility.cu b/src/utility.cu
	t@@ -11,7 +11,7 @@ void checkForCudaErrors(const char* checkpoint_description)
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
	std::cerr << "\nCuda error detected, checkpoint: " << checkpoint_descripti…
	- << "\nError string: " << cudaGetErrorString(err) << "\n";
	+ << "\nError string: " << cudaGetErrorString(err) << std::endl;
	exit(EXIT_FAILURE);
	}
	}
	t@@ -22,7 +22,7 @@ void checkForCudaErrors(const char* checkpoint_description, …
	if (err != cudaSuccess) {
	std::cerr << "\nCuda error detected, checkpoint: " << checkpoint_descripti…
	<< "\nduring iteration " << iteration
	- << "\nError string: " << cudaGetErrorString(err) << "\n";
	+ << "\nError string: " << cudaGetErrorString(err) << std::endl;
	exit(EXIT_FAILURE);
	}
	}