GopherProxy

	add const and __restrict__ keywords to facilitate constant memory cache usage
	git clone git://src.adamsgaard.dk/sphere
	Log
	Files
	Refs
	LICENSE
	---
	commit 21ed15ef593a831b2124a48bbc662b9a227f85ac
	parent 7626d19392ecba63c4b2e5f4d419e8423d7a90a0
	Author: Anders Damsgaard <[email protected]>
	Date: Mon, 11 Aug 2014 14:37:07 +0200

	add const and __restrict__ keywords to facilitate constant memory cache usage

	Diffstat:
	M src/cohesion.cuh | 143 ++++++++---------------------…
	M src/contactmodels.cuh | 164 ++++++++++++++++++-----------…
	M src/contactsearch.cuh | 246 ++++++++++++++++-------------…
	M src/integration.cuh | 70 ++++++++++++++++++-----------…
	M src/navierstokes.cuh | 1213 ++++++++++++++++-------------…
	M src/raytracer.cuh | 173 +++++++++++++++--------------…
	M tests/fluid_particle_interaction.py | 3 +--

	7 files changed, 996 insertions(+), 1016 deletions(-)
	---
	diff --git a/src/cohesion.cuh b/src/cohesion.cuh
	t@@ -6,14 +6,14 @@

	// Check bond pair list, apply linear contact model to pairs
	__global__ void bondsLinear(
	- uint2* dev_bonds,
	- Float4* dev_bonds_delta, // Contact displacement
	- Float4* dev_bonds_omega, // Contact rotational displacement
	- Float4* dev_x,
	- Float4* dev_vel,
	- Float4* dev_angvel,
	- Float4* dev_force,
	- Float4* dev_torque)
	+ uint2* __restrict__ dev_bonds,
	+ Float4* __restrict__ dev_bonds_delta, // Contact displacement
	+ Float4* __restrict__ dev_bonds_omega, // Contact rotational displacement
	+ const Float4* __restrict__ dev_x,
	+ const Float4* __restrict__ dev_vel,
	+ const Float4* __restrict__ dev_angvel,
	+ Float4* __restrict__ dev_force,
	+ Float4* __restrict__ dev_torque)
	{
	// Find thread index
	unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
	t@@ -35,16 +35,16 @@ __global__ void bondsLinear(
	// Convert tangential vectors to Float3's
	// Uncorrected tangential component of displacement
	Float3 delta0_t = MAKE_FLOAT3(
	- delta0_4.x,
	- delta0_4.y,
	- delta0_4.z);
	+ delta0_4.x,
	+ delta0_4.y,
	+ delta0_4.z);
	const Float delta0_n = delta0_4.w;

	// Uncorrected tangential component of rotation
	Float3 omega0_t = MAKE_FLOAT3(
	- omega0_4.x,
	- omega0_4.y,
	- omega0_4.z);
	+ omega0_4.x,
	+ omega0_4.y,
	+ omega0_4.z);
	const Float omega0_n = omega0_4.w;

	// Read particle data
	t@@ -76,9 +76,9 @@ __global__ void bondsLinear(

	// Inter-particle vector
	const Float3 x = MAKE_FLOAT3(
	- x_i.x - x_j.x,
	- x_i.y - x_j.y,
	- x_i.z - x_j.z);
	+ x_i.x - x_j.x,
	+ x_i.y - x_j.y,
	+ x_i.z - x_j.z);
	const Float x_length = length(x);

	// Find overlap (negative value if overlapping)
	t@@ -96,13 +96,13 @@ __global__ void bondsLinear(

	// Contact displacement, Luding 2008 eq. 10
	const Float3 ddelta = (
	- MAKE_FLOAT3(
	- vel_i.x - vel_j.x,
	- vel_i.y - vel_j.y,
	- vel_i.z - vel_j.z)
	- + (x_i.w + overlap/2.0) * cross(n, angvel_i)
	- + (x_j.w + overlap/2.0) * cross(n, angvel_j)
	- ) * devC_dt;
	+ MAKE_FLOAT3(
	+ vel_i.x - vel_j.x,
	+ vel_i.y - vel_j.y,
	+ vel_i.z - vel_j.z)
	+ + (x_i.w + overlap/2.0) * cross(n, angvel_i)
	+ + (x_j.w + overlap/2.0) * cross(n, angvel_j)
	+ ) * devC_dt;

	// Normal component of the displacement increment
	//const Float ddelta_n = dot(ddelta, n);
	t@@ -141,9 +141,9 @@ __global__ void bondsLinear(

	// Contact rotational velocity
	Float3 domega = MAKE_FLOAT3(
	- angvel_j.x - angvel_i.x,
	- angvel_j.y - angvel_i.y,
	- angvel_j.z - angvel_i.z) * devC_dt;
	+ angvel_j.x - angvel_i.x,
	+ angvel_j.y - angvel_i.y,
	+ angvel_j.z - angvel_i.z) * devC_dt;
	/*const Float3 domega = MAKE_FLOAT3(
	angvel_i.x - angvel_j.x,
	angvel_i.y - angvel_j.y,
	t@@ -232,89 +232,16 @@ __global__ void bondsLinear(
	}
	}

	-// Linear-elastic bond: Attractive force with normal- and shear components
	-// acting upon particle A in a bonded particle pair
	-__device__ void bondLinear_old(Float3* N, Float3* T, Float* es_dot, Float* p,
	- unsigned int idx_a, unsigned int idx_b,
	- Float4* dev_x_sorted, Float4* dev_vel_sorted,
	- Float4* dev_angvel_sorted,
	- Float radius_a, Float radius_b,
	- Float3 x_ab, Float x_ab_length,
	- Float delta_ab)
	-{
	-
	- // If particles are not overlapping, apply bond force
	- if (delta_ab > 0.0f) {
	-
	- // Allocate variables and fetch missing time=t values for particle A a…
	- Float4 vel_a = dev_vel_sorted[idx_a];
	- Float4 vel_b = dev_vel_sorted[idx_b];
	- Float4 angvel4_a = dev_angvel_sorted[idx_a];
	- Float4 angvel4_b = dev_angvel_sorted[idx_b];
	-
	- // Convert to Float3's
	- Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
	- Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
	-
	- // Normal vector of contact
	- Float3 n_ab = x_ab/x_ab_length;
	-
	- // Relative contact interface velocity, w/o rolling
	- Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
	- vel_a.y - vel_b.y,
	- vel_a.z - vel_b.z);
	-
	- // Relative contact interface velocity of particle surfaces at
	- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
	- Float3 vel_ab = vel_ab_linear
	- + radius_a * cross(n_ab, angvel_a)
	- + radius_b * cross(n_ab, angvel_b);
	-
	- // Relative contact interface rolling velocity
	- //Float3 angvel_ab = angvel_a - angvel_b;
	- //Float angvel_ab_length = length(angvel_ab);
	-
	- // Normal component of the relative contact interface velocity
	- //Float vel_n_ab = dot(vel_ab_linear, n_ab);
	-
	- // Tangential component of the relative contact interface velocity
	- // Hinrichsen and Wolf 2004, eq. 13.9
	- Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
	- //Float vel_t_ab_length = length(vel_t_ab);
	-
	- Float3 f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	- Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	-
	- // Mean radius
	- Float R_bar = (radius_a + radius_b)/2.0f;
	-
	- // Normal force component: Elastic
	- f_n = devC_params.k_n * delta_ab * n_ab;
	-
	- if (length(vel_t_ab) > 0.f) {
	- // Shear force component: Viscous
	- f_t = -1.0f * devC_params.gamma_t * vel_t_ab;
	-
	- // Shear friction production rate [W]
	- //*es_dot += -dot(vel_t_ab, f_t);
	- }
	-
	- // Add force components from this bond to total force for particle
	- *N += f_n + f_t;
	- *T += -R_bar * cross(n_ab, f_t);
	-
	- // Pressure excerted onto the particle from this bond
	- *p += length(f_n) / (4.0f * PI * radius_a*radius_a);
	-
	- }
	-} // End of bondLinear()
	-

	// Capillary cohesion after Richefeu et al. (2006)
	-__device__ void capillaryCohesion_exp(Float3* N, Float radius_a,
	- Float radius_b, Float delta_ab,
	- Float3 x_ab, Float x_ab_length,
	- Float kappa)
	+__device__ void capillaryCohesion_exp(
	+ Float3* N,
	+ const Float radius_a,
	+ const Float radius_b,
	+ const Float delta_ab,
	+ const Float3 x_ab,
	+ const Float x_ab_length,
	+ const Float kappa)
	{

	// Normal vector
	diff --git a/src/contactmodels.cuh b/src/contactmodels.cuh
	t@@ -7,11 +7,19 @@

	// Linear viscoelastic contact model for particle-wall interactions
	// with tangential friction and rolling resistance
	-__device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
	- Float* ev_dot, Float* p,
	- unsigned int idx_a, Float radius_a,
	- Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
	- Float3 n, Float delta, Float wvel)
	+__device__ Float contactLinear_wall(
	+ Float3* F,
	+ Float3* T,
	+ Float* es_dot,
	+ Float* ev_dot,
	+ Float* p,
	+ const unsigned int idx_a,
	+ const Float radius_a,
	+ const Float4* __restrict__ dev_vel_sorted,
	+ const Float4* __restrict__ dev_angvel_sorted,
	+ const Float3 n,
	+ const Float delta,
	+ const Float wvel)
	{
	// Fetch particle velocities from global memory
	Float4 vel_tmp = dev_vel_sorted[idx_a];
	t@@ -19,13 +27,13 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, …

	// Convert velocities to three-component vectors
	Float3 vel_linear = MAKE_FLOAT3(
	- vel_tmp.x,
	- vel_tmp.y,
	- vel_tmp.z);
	+ vel_tmp.x,
	+ vel_tmp.y,
	+ vel_tmp.z);
	Float3 angvel = MAKE_FLOAT3(
	- angvel_tmp.x,
	- angvel_tmp.y,
	- angvel_tmp.z);
	+ angvel_tmp.x,
	+ angvel_tmp.y,
	+ angvel_tmp.z);

	// Store the length of the angular velocity for later use
	Float angvel_length = length(angvel);
	t@@ -47,7 +55,7 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Fl…

	// Normal force component: Elastic - viscous damping
	Float3 f_n = fmax(0.0, -devC_params.k_n*delta
	- - devC_params.gamma_wn*vel_n) * n;
	+ - devC_params.gamma_wn*vel_n) * n;
	const Float f_n_length = length(f_n); // Save length for later use

	// Store the energy lost by viscous damping. See derivation in
	t@@ -103,14 +111,22 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T…

	// Linear vicoelastic contact model for particle-particle interactions
	// with tangential friction and rolling resistance
	-__device__ void contactLinearViscous(Float3* F, Float3* T,
	- Float* es_dot, Float* ev_dot, Float* p,
	- unsigned int idx_a, unsigned int idx_b,
	- Float4* dev_vel_sorted,
	- Float4* dev_angvel_sorted,
	- Float radius_a, Float radius_b,
	- Float3 x_ab, Float x_ab_length,
	- Float delta_ab, Float kappa)
	+__device__ void contactLinearViscous(
	+ Float3* F,
	+ Float3* T,
	+ Float* es_dot,
	+ Float* ev_dot,
	+ Float* p,
	+ const unsigned int idx_a,
	+ const unsigned int idx_b,
	+ const Float4* __restrict__ dev_vel_sorted,
	+ const Float4* __restrict__ dev_angvel_sorted,
	+ const Float radius_a,
	+ const Float radius_b,
	+ const Float3 x_ab,
	+ const Float x_ab_length,
	+ const Float delta_ab,
	+ const Float kappa)
	{

	// Allocate variables and fetch missing time=t values for particle A and B
	t@@ -131,8 +147,8 @@ __device__ void contactLinearViscous(Float3* F, Float3* T,

	// Relative contact interface velocity, w/o rolling
	Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
	- vel_a.y - vel_b.y,
	- vel_a.z - vel_b.z);
	+ vel_a.y - vel_b.y,
	+ vel_a.z - vel_b.z);

	// Relative contact interface velocity of particle surfaces at
	// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
	t@@ -209,18 +225,25 @@ __device__ void contactLinearViscous(Float3* F, Float3* …


	// Linear elastic contact model for particle-particle interactions
	-__device__ void contactLinear(Float3* F, Float3* T,
	- Float* es_dot, Float* ev_dot, Float* p,
	- unsigned int idx_a_orig,
	- unsigned int idx_b_orig,
	- Float4 vel_a,
	- Float4* dev_vel,
	- Float3 angvel_a,
	- Float4* dev_angvel,
	- Float radius_a, Float radius_b,
	- Float3 x, Float x_length,
	- Float delta, Float4* dev_delta_t,
	- unsigned int mempos)
	+__device__ void contactLinear(
	+ Float3* F,
	+ Float3* T,
	+ Float* es_dot,
	+ Float* ev_dot,
	+ Float* p,
	+ const unsigned int idx_a_orig,
	+ const unsigned int idx_b_orig,
	+ const Float4 vel_a,
	+ const Float4* __restrict__ dev_vel,
	+ const Float3 angvel_a,
	+ const Float4* __restrict__ dev_angvel,
	+ const Float radius_a,
	+ const Float radius_b,
	+ const Float3 x,
	+ const Float x_length,
	+ const Float delta,
	+ Float4* __restrict__ dev_delta_t,
	+ const unsigned int mempos)
	{

	// Allocate variables and fetch missing time=t values for particle A and B
	t@@ -231,15 +254,15 @@ __device__ void contactLinear(Float3* F, Float3* T,
	Float4 delta_t0_4 = dev_delta_t[mempos];

	Float3 delta_t0_uncor = MAKE_FLOAT3(
	- delta_t0_4.x,
	- delta_t0_4.y,
	- delta_t0_4.z);
	+ delta_t0_4.x,
	+ delta_t0_4.y,
	+ delta_t0_4.z);

	// Convert to Float3
	Float3 angvel_b = MAKE_FLOAT3(
	- angvel4_b.x,
	- angvel4_b.y,
	- angvel4_b.z);
	+ angvel4_b.x,
	+ angvel4_b.y,
	+ angvel4_b.z);

	// Force between grain pair decomposed into normal- and tangential part
	Float3 f_n, f_t, f_c;
	t@@ -249,9 +272,9 @@ __device__ void contactLinear(Float3* F, Float3* T,

	// Relative contact interface velocity, w/o rolling
	Float3 vel_linear = MAKE_FLOAT3(
	- vel_a.x - vel_b.x,
	- vel_a.y - vel_b.y,
	- vel_a.z - vel_b.z);
	+ vel_a.x - vel_b.x,
	+ vel_a.y - vel_b.y,
	+ vel_a.z - vel_b.z);

	// Relative contact interface velocity of particle surfaces at
	// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10,
	t@@ -335,7 +358,7 @@ __device__ void contactLinear(Float3* F, Float3* T,
	// 2008)
	delta_t = -1.0/devC_params.k_t
	* (devC_params.mu_d * length(f_n-f_c) * t
	- + devC_params.gamma_t * vel_t);
	+ + devC_params.gamma_t * vel_t);

	// Shear friction heat production rate:
	// The energy lost from the tangential spring is dissipated as heat
	t@@ -357,10 +380,10 @@ __device__ void contactLinear(Float3* F, Float3* T,

	// Store sum of tangential displacements
	dev_delta_t[mempos] = MAKE_FLOAT4(
	- delta_t.x,
	- delta_t.y,
	- delta_t.z,
	- 0.0f);
	+ delta_t.x,
	+ delta_t.y,
	+ delta_t.z,
	+ 0.0f);

	} // End of contactLinear()

	t@@ -368,18 +391,25 @@ __device__ void contactLinear(Float3* F, Float3* T,
	// Non-linear contact model for particle-particle interactions
	// Based on Hertzian and Mindlin contact theories (e.g. Hertz, 1882, Mindlin a…
	// Deresiewicz, 1953, Johnson, 1985). See Yohannes et al 2012 for example.
	-__device__ void contactHertz(Float3* F, Float3* T,
	- Float* es_dot, Float* ev_dot, Float* p,
	- unsigned int idx_a_orig,
	- unsigned int idx_b_orig,
	- Float4 vel_a,
	- Float4* dev_vel,
	- Float3 angvel_a,
	- Float4* dev_angvel,
	- Float radius_a, Float radius_b,
	- Float3 x_ab, Float x_ab_length,
	- Float delta_ab, Float4* dev_delta_t,
	- unsigned int mempos)
	+__device__ void contactHertz(
	+ Float3* F,
	+ Float3* T,
	+ Float* es_dot,
	+ Float* ev_dot,
	+ Float* p,
	+ const unsigned int idx_a_orig,
	+ const unsigned int idx_b_orig,
	+ const Float4 vel_a,
	+ const Float4* __restrict__ dev_vel,
	+ const Float3 angvel_a,
	+ const Float4* __restrict__ dev_angvel,
	+ const Float radius_a,
	+ const Float radius_b,
	+ const Float3 x_ab,
	+ const Float x_ab_length,
	+ const Float delta_ab,
	+ Float4* __restrict__ dev_delta_t,
	+ const unsigned int mempos)
	{

	// Allocate variables and fetch missing time=t values for particle A and B
	t@@ -390,8 +420,8 @@ __device__ void contactHertz(Float3* F, Float3* T,
	Float4 delta_t0_4 = dev_delta_t[mempos];

	Float3 delta_t0_uncor = MAKE_FLOAT3(delta_t0_4.x,
	- delta_t0_4.y,
	- delta_t0_4.z);
	+ delta_t0_4.y,
	+ delta_t0_4.z);

	// Convert to Float3
	Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
	t@@ -404,8 +434,8 @@ __device__ void contactHertz(Float3* F, Float3* T,

	// Relative contact interface velocity, w/o rolling
	Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
	- vel_a.y - vel_b.y,
	- vel_a.z - vel_b.z);
	+ vel_a.y - vel_b.y,
	+ vel_a.z - vel_b.z);

	// Relative contact interface velocity of particle surfaces at
	// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
	t@@ -435,7 +465,7 @@ __device__ void contactHertz(Float3* F, Float3* T,

	// Normal force component
	f_n = (-devC_params.k_n * powf(delta_ab, 3.0f/2.0f)
	- -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab)
	+ -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab)
	* n_ab;

	// Store energy dissipated in normal viscous component
	t@@ -517,9 +547,9 @@ __device__ void contactHertz(Float3* F, Float3* T,
	// New rolling resistance model
	/*T_res = -1.0f * fmin(devC_params.gamma_r * R_bar * angvel_ab_length,
	devC_params.mu_r * R_bar * f_n_length)
	- * angvel_ab/angvel_ab_length;*/
	+ * angvel_ab/angvel_ab_length;*/
	T_res = -1.0f * fmin(devC_params.gamma_r * radius_a * angvel_ab_length,
	- devC_params.mu_r * radius_a * f_n_length)
	+ devC_params.mu_r * radius_a * f_n_length)
	* angvel_ab/angvel_ab_length;
	}

	diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
	t@@ -85,19 +85,23 @@ __device__ int findDistMod(int3* targetCell, Float3* distm…
	// Used for contactmodel=1, where contact history is not needed.
	// Kernel executed on device, and callable from device only.
	// Function is called from interact().
	-__device__ void findAndProcessContactsInCell(int3 targetCell,
	- unsigned int idx_a,
	- Float4 x_a, Float radius_a,
	- Float3* F, Float3* T,
	- Float* es_dot, Float* ev_dot,
	- Float* p,
	- Float4* dev_x_sorted,
	- Float4* dev_vel_sorted,
	- Float4* dev_angvel_sorted,
	- unsigned int* dev_cellStart,
	- unsigned int* dev_cellEnd,
	- Float4* dev_walls_nx,
	- Float4* dev_walls_mvfd)
	+__device__ void findAndProcessContactsInCell(
	+ int3 targetCell,
	+ const unsigned int idx_a,
	+ const Float4 x_a,
	+ const Float radius_a,
	+ Float3* F,
	+ Float3* T,
	+ Float* es_dot,
	+ Float* ev_dot,
	+ Float* p,
	+ const Float4* __restrict__ dev_x_sorted,
	+ const Float4* __restrict__ dev_vel_sorted,
	+ const Float4* __restrict__ dev_angvel_sorted,
	+ const unsigned int* __restrict__ dev_cellStart,
	+ const unsigned int* __restrict__ dev_cellEnd,
	+ const Float4* __restrict__ dev_walls_nx,
	+ Float4* __restrict__ dev_walls_mvfd)
	//uint4 bonds)
	{

	t@@ -133,8 +137,8 @@ __device__ void findAndProcessContactsInCell(int3 targetCe…

	// Distance between particle centers (Float4 -> Float3)
	Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
	- x_a.y - x_b.y,
	- x_a.z - x_b.z);
	+ x_a.y - x_b.y,
	+ x_a.z - x_b.z);

	// Adjust interparticle vector if periodic boundary/bounda…
	// are crossed
	t@@ -147,16 +151,16 @@ __device__ void findAndProcessContactsInCell(int3 target…
	// Check for particle overlap
	if (delta_ab < 0.0f) {
	contactLinearViscous(F, T, es_dot, ev_dot, p,
	- idx_a, idx_b,
	- dev_vel_sorted,
	- dev_angvel_sorted,
	- radius_a, radius_b,
	- x_ab, x_ab_length,
	- delta_ab, kappa);
	+ idx_a, idx_b,
	+ dev_vel_sorted,
	+ dev_angvel_sorted,
	+ radius_a, radius_b,
	+ x_ab, x_ab_length,
	+ delta_ab, kappa);
	} else if (delta_ab < devC_params.db) {
	// Check wether particle distance satisfies the capill…
	capillaryCohesion_exp(F, radius_a, radius_b, delta_ab,
	- x_ab, x_ab_length, kappa);
	+ x_ab, x_ab_length, kappa);
	}

	// Check wether particles are bonded together
	t@@ -183,16 +187,18 @@ __device__ void findAndProcessContactsInCell(int3 target…
	// Used for contactmodel=2, where bookkeeping of contact history is necessary.
	// Kernel executed on device, and callable from device only.
	// Function is called from topology().
	-__device__ void findContactsInCell(int3 targetCell,
	- unsigned int idx_a,
	- Float4 x_a, Float radius_a,
	- Float4* dev_x_sorted,
	- unsigned int* dev_cellStart,
	- unsigned int* dev_cellEnd,
	- unsigned int* dev_gridParticleIndex,
	- int* nc,
	- unsigned int* dev_contacts,
	- Float4* dev_distmod)
	+__device__ void findContactsInCell(
	+ int3 targetCell,
	+ const unsigned int idx_a,
	+ const Float4 x_a,
	+ const Float radius_a,
	+ const Float4* __restrict__ dev_x_sorted,
	+ const unsigned int* __restrict__ dev_cellStart,
	+ const unsigned int* __restrict__ dev_cellEnd,
	+ const unsigned int* __restrict__ dev_gridParticleIndex,
	+ int* nc,
	+ unsigned int* __restrict__ dev_contacts,
	+ Float4* __restrict__ dev_distmod)
	{
	// Get distance modifier for interparticle
	// vector, if it crosses a periodic boundary
	t@@ -237,8 +243,8 @@ __device__ void findContactsInCell(int3 targetCell,

	// Distance between particle centers (Float4 -> Float3)
	Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
	- x_a.y - x_b.y,
	- x_a.z - x_b.z);
	+ x_a.y - x_b.y,
	+ x_a.z - x_b.z);

	// Adjust interparticle vector if periodic boundary/bounda…
	// are crossed
	t@@ -316,12 +322,13 @@ __device__ void findContactsInCell(int3 targetCell,
	// Search for neighbors to particle 'idx' inside the 27 closest cells,
	// and save the contact pairs in global memory.
	// Function is called from mainGPU loop.
	-__global__ void topology(unsigned int* dev_cellStart,
	- unsigned int* dev_cellEnd, // Input: Particles in cell
	- unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
	- Float4* dev_x_sorted,
	- unsigned int* dev_contacts,
	- Float4* dev_distmod)
	+__global__ void topology(
	+ const unsigned int* __restrict__ dev_cellStart,
	+ const unsigned int* __restrict__ dev_cellEnd,
	+ const unsigned int* __restrict__ dev_gridParticleIndex,
	+ const Float4* __restrict__ dev_x_sorted,
	+ unsigned int* __restrict__ dev_contacts,
	+ Float4* __restrict__ dev_distmod)
	{
	// Thread index equals index of particle A
	unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
	t@@ -349,10 +356,10 @@ __global__ void topology(unsigned int* dev_cellStart,
	for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
	targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
	findContactsInCell(targetPos, idx_a, x_a, radius_a,
	- dev_x_sorted,
	- dev_cellStart, dev_cellEnd,
	- dev_gridParticleIndex,
	- &nc, dev_contacts, dev_distmod);
	+ dev_x_sorted,
	+ dev_cellStart, dev_cellEnd,
	+ dev_gridParticleIndex,
	+ &nc, dev_contacts, dev_distmod);
	}
	}
	}
	t@@ -372,28 +379,28 @@ __global__ void topology(unsigned int* dev_cellStart,
	// Kernel is executed on device, and is callable from host only.
	// Function is called from mainGPU loop.
	__global__ void interact(
	- unsigned int* dev_gridParticleIndex, // in
	- unsigned int* dev_cellStart, // in
	- unsigned int* dev_cellEnd, // in
	- Float4* dev_x, // in
	- Float4* dev_x_sorted, // in
	- Float4* dev_vel_sorted, // in
	- Float4* dev_angvel_sorted, // in
	- Float4* dev_vel, // in
	- Float4* dev_angvel, // in
	- Float4* dev_force, // out
	- Float4* dev_torque, // out
	- Float* dev_es_dot, // out
	- Float* dev_ev_dot, // out
	- Float* dev_es, // out
	- Float* dev_ev, // out
	- Float* dev_p, // out
	- Float4* dev_walls_nx, // in
	- Float4* dev_walls_mvfd, // in
	- Float* dev_walls_force_pp, // out
	- unsigned int* dev_contacts, // out
	- Float4* dev_distmod, // in
	- Float4* dev_delta_t) // out
	+ const unsigned int* __restrict__ dev_gridParticleIndex, // in
	+ const unsigned int* __restrict__ dev_cellStart, // in
	+ const unsigned int* __restrict__ dev_cellEnd, // in
	+ const Float4* __restrict__ dev_x, // in
	+ const Float4* __restrict__ dev_x_sorted, // in
	+ const Float4* __restrict__ dev_vel_sorted, // in
	+ const Float4* __restrict__ dev_angvel_sorted, // in
	+ const Float4* __restrict__ dev_vel, // in
	+ const Float4* __restrict__ dev_angvel, // in
	+ Float4* __restrict__ dev_force, // out
	+ Float4* __restrict__ dev_torque, // out
	+ Float* __restrict__ dev_es_dot, // out
	+ Float* __restrict__ dev_ev_dot, // out
	+ Float* __restrict__ dev_es, // out
	+ Float* __restrict__ dev_ev, // out
	+ Float* __restrict__ dev_p, // out
	+ const Float4* __restrict__ dev_walls_nx, // in
	+ Float4* __restrict__ dev_walls_mvfd, // in
	+ Float* __restrict__ dev_walls_force_pp, // out
	+ unsigned int* __restrict__ dev_contacts, // out
	+ const Float4* __restrict__ dev_distmod, // in
	+ Float4* __restrict__ dev_delta_t) // out
	{
	// Thread index equals index of particle A
	unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
	t@@ -407,11 +414,11 @@ __global__ void interact(

	// Fetch world dimensions in constant memory read
	Float3 origo = MAKE_FLOAT3(devC_grid.origo[0],
	- devC_grid.origo[1],
	- devC_grid.origo[2]);
	+ devC_grid.origo[1],
	+ devC_grid.origo[2]);
	Float3 L = MAKE_FLOAT3(devC_grid.L[0],
	- devC_grid.L[1],
	- devC_grid.L[2]);
	+ devC_grid.L[1],
	+ devC_grid.L[2]);

	// Fetch wall data in global read
	Float4 w_0_nx, w_1_nx, w_2_nx, w_3_nx, w_4_nx;
	t@@ -497,8 +504,8 @@ __global__ void interact(

	// Inter-particle vector, corrected for periodic boundaries
	x_ab = MAKE_FLOAT3(x_a.x - x_b.x + distmod.x,
	- x_a.y - x_b.y + distmod.y,
	- x_a.z - x_b.z + distmod.z);
	+ x_a.y - x_b.y + distmod.y,
	+ x_a.z - x_b.z + distmod.z);

	x_ab_length = length(x_ab);
	delta_n = x_ab_length - (radius_a + radius_b);
	t@@ -507,28 +514,28 @@ __global__ void interact(
	if (delta_n < 0.0) {
	if (devC_params.contactmodel == 2) {
	contactLinear(&F, &T, &es_dot, &ev_dot, &p,
	- idx_a_orig,
	- idx_b_orig,
	- vel_a,
	- dev_vel,
	- angvel_a,
	- dev_angvel,
	- radius_a, radius_b,
	- x_ab, x_ab_length,
	- delta_n, dev_delta_t,
	- mempos);
	+ idx_a_orig,
	+ idx_b_orig,
	+ vel_a,
	+ dev_vel,
	+ angvel_a,
	+ dev_angvel,
	+ radius_a, radius_b,
	+ x_ab, x_ab_length,
	+ delta_n, dev_delta_t,
	+ mempos);
	} else if (devC_params.contactmodel == 3) {
	contactHertz(&F, &T, &es_dot, &ev_dot, &p,
	- idx_a_orig,
	- idx_b_orig,
	- vel_a,
	- dev_vel,
	- angvel_a,
	- dev_angvel,
	- radius_a, radius_b,
	- x_ab, x_ab_length,
	- delta_n, dev_delta_t,
	- mempos);
	+ idx_a_orig,
	+ idx_b_orig,
	+ vel_a,
	+ dev_vel,
	+ angvel_a,
	+ dev_angvel,
	+ radius_a, radius_b,
	+ x_ab, x_ab_length,
	+ delta_n, dev_delta_t,
	+ mempos);
	}
	} else {
	__syncthreads();
	t@@ -556,11 +563,11 @@ __global__ void interact(

	// Calculate address in grid from position
	gridPos.x = floor((x_a.x - devC_grid.origo[0])
	- / (devC_grid.L[0]/devC_grid.num[0]));
	+ / (devC_grid.L[0]/devC_grid.num[0]));
	gridPos.y = floor((x_a.y - devC_grid.origo[1])
	- / (devC_grid.L[1]/devC_grid.num[1]));
	+ / (devC_grid.L[1]/devC_grid.num[1]));
	gridPos.z = floor((x_a.z - devC_grid.origo[2])
	- / (devC_grid.L[2]/devC_grid.num[2]));
	+ / (devC_grid.L[2]/devC_grid.num[2]));

	// Find overlaps between particle no. idx and all particles
	// from its own cell + 26 neighbor cells.
	t@@ -570,12 +577,17 @@ __global__ void interact(
	for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
	for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
	targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
	- findAndProcessContactsInCell(targetPos, idx_a, x_a, ra…
	- &F, &T, &es_dot, &ev_dot, &p,
	- dev_x_sorted,
	- dev_vel_sorted, dev_angvel_sorted,
	- dev_cellStart, dev_cellEnd,
	- dev_walls_nx, dev_walls_mvfd);
	+ findAndProcessContactsInCell(targetPos, idx_a,
	+ x_a, radius_a,
	+ &F, &T, &es_dot,
	+ &ev_dot, &p,
	+ dev_x_sorted,
	+ dev_vel_sorted,
	+ dev_angvel_sorted,
	+ dev_cellStart,
	+ dev_cellEnd,
	+ dev_walls_nx,
	+ dev_walls_mvfd);
	}
	}
	}
	t@@ -596,8 +608,8 @@ __global__ void interact(
	w_n = MAKE_FLOAT3(w_0_nx.x, w_0_nx.y, w_0_nx.z);
	if (delta_w < 0.0f) {
	w_0_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a,
	- radius_a, dev_vel_sorted, dev_angvel_sorted, w_n, delta_w,
	- w_0_mvfd.y);
	+ radius_a, dev_vel_sorted, dev_angve…
	+ w_0_mvfd.y);
	}

	// Lower wall (force on wall not stored)
	t@@ -605,8 +617,8 @@ __global__ void interact(
	w_n = MAKE_FLOAT3(0.0f, 0.0f, 1.0f);
	if (delta_w < 0.0f) {
	(void)contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a,
	- radius_a, dev_vel_sorted, dev_angvel_sorted,
	- w_n, delta_w, 0.0f);
	+ radius_a, dev_vel_sorted, dev_angvel_sort…
	+ w_n, delta_w, 0.0f);
	}


	t@@ -617,8 +629,8 @@ __global__ void interact(
	w_n = MAKE_FLOAT3(w_1_nx.x, w_1_nx.y, w_1_nx.z);
	if (delta_w < 0.0f) {
	w_1_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
	- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
	- delta_w, w_1_mvfd.y);
	+ idx_a, radius_a, dev_vel_sorted…
	+ delta_w, w_1_mvfd.y);
	}

	// Left wall (idx 2)
	t@@ -626,8 +638,8 @@ __global__ void interact(
	w_n = MAKE_FLOAT3(w_2_nx.x, w_2_nx.y, w_2_nx.z);
	if (delta_w < 0.0f) {
	w_2_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
	- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
	- delta_w, w_2_mvfd.y);
	+ idx_a, radius_a, dev_vel_sorted…
	+ delta_w, w_2_mvfd.y);
	}

	// Back wall (idx 3)
	t@@ -635,8 +647,8 @@ __global__ void interact(
	w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z);
	if (delta_w < 0.0f) {
	w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
	- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
	- delta_w, w_3_mvfd.y);
	+ idx_a, radius_a, dev_vel_sorted…
	+ delta_w, w_3_mvfd.y);
	}

	// Front wall (idx 4)
	t@@ -644,8 +656,8 @@ __global__ void interact(
	w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z);
	if (delta_w < 0.0f) {
	w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
	- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
	- delta_w, w_4_mvfd.y);
	+ idx_a, radius_a, dev_vel_sorted…
	+ delta_w, w_4_mvfd.y);
	}

	} else if (devC_grid.periodic == 2) { // right and left walls period…
	t@@ -655,8 +667,8 @@ __global__ void interact(
	w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z);
	if (delta_w < 0.0f) {
	w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
	- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
	- delta_w, w_3_mvfd.y);
	+ idx_a, radius_a, dev_vel_sorted…
	+ delta_w, w_3_mvfd.y);
	}

	// Front wall (idx 4)
	t@@ -664,8 +676,8 @@ __global__ void interact(
	w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z);
	if (delta_w < 0.0f) {
	w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
	- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
	- delta_w, w_4_mvfd.y);
	+ idx_a, radius_a, dev_vel_sorted…
	+ delta_w, w_4_mvfd.y);
	}
	}

	diff --git a/src/integration.cuh b/src/integration.cuh
	t@@ -11,15 +11,23 @@

	// Second order integration scheme based on Taylor expansion of particle kinem…
	// Kernel executed on device, and callable from host only.
	-__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Inp…
	- Float4* dev_angvel_sorted,
	- Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output
	- Float4* dev_force, Float4* dev_torque, Float4* dev_angpos, // Input
	- Float4* dev_acc, Float4* dev_angacc,
	- Float4* dev_vel0, Float4* dev_angvel0,
	- Float4* dev_xyzsum,
	- unsigned int* dev_gridParticleIndex, // Input: Sorted-Unsorted key
	- unsigned int iter)
	+__global__ void integrate(
	+ const Float4* __restrict__ dev_x_sorted,
	+ const Float4* __restrict__ dev_vel_sorted,
	+ const Float4* __restrict__ dev_angvel_sorted,
	+ Float4* __restrict__ dev_x,
	+ Float4* __restrict__ dev_vel,
	+ Float4* __restrict__ dev_angvel,
	+ const Float4* __restrict__ dev_force,
	+ const Float4* __restrict__ dev_torque,
	+ Float4* __restrict__ dev_angpos,
	+ Float4* __restrict__ dev_acc,
	+ Float4* __restrict__ dev_angacc,
	+ Float4* __restrict__ dev_vel0,
	+ Float4* __restrict__ dev_angvel0,
	+ Float4* __restrict__ dev_xyzsum,
	+ const unsigned int* __restrict__ dev_gridParticleIndex,
	+ const unsigned int iter)
	{
	unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id

	t@@ -59,13 +67,13 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* de…
	// Coherent read from constant memory to registers
	const Float dt = devC_dt;
	const Float3 origo = MAKE_FLOAT3(
	- devC_grid.origo[0],
	- devC_grid.origo[1],
	- devC_grid.origo[2]);
	+ devC_grid.origo[0],
	+ devC_grid.origo[1],
	+ devC_grid.origo[2]);
	const Float3 L = MAKE_FLOAT3(
	- devC_grid.L[0],
	- devC_grid.L[1],
	- devC_grid.L[2]);
	+ devC_grid.L[0],
	+ devC_grid.L[1],
	+ devC_grid.L[2]);

	// Particle mass
	Float m = 4.0/3.0 * PI * radius*radius*radius * devC_params.rho;
	t@@ -179,14 +187,14 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* …
	// Truncation error O(dt^4) for positions, O(dt^3) for velocities
	// Approximate acceleration change by backwards difference:
	const Float3 dacc_dt = MAKE_FLOAT3(
	- (acc.x - acc0.x)/dt,
	- (acc.y - acc0.y)/dt,
	- (acc.z - acc0.z)/dt);
	+ (acc.x - acc0.x)/dt,
	+ (acc.y - acc0.y)/dt,
	+ (acc.z - acc0.z)/dt);

	const Float3 dangacc_dt = MAKE_FLOAT3(
	- (angacc.x - angacc0.x)/dt,
	- (angacc.y - angacc0.y)/dt,
	- (angacc.z - angacc0.z)/dt);
	+ (angacc.x - angacc0.x)/dt,
	+ (angacc.y - angacc0.y)/dt,
	+ (angacc.z - angacc0.z)/dt);

	x_new.x = x.x + vel.x*dt + 0.5*acc.x*dt*dt + 1.0/6.0*dacc_dt.x*dt*dt*d…
	x_new.y = x.y + vel.y*dt + 0.5*acc.y*dt*dt + 1.0/6.0*dacc_dt.y*dt*dt*d…
	t@@ -253,7 +261,9 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* de…


	// Reduce wall force contributions from particles to a single value per wall
	-__global__ void summation(Float* in, Float *out)
	+__global__ void summation(
	+ const Float* __restrict__ in,
	+ Float* __restrict__ out)
	{
	__shared__ Float cache[256];
	unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
	t@@ -287,14 +297,14 @@ __global__ void summation(Float* in, Float *out)

	// Update wall positions
	__global__ void integrateWalls(
	- Float4* dev_walls_nx,
	- Float4* dev_walls_mvfd,
	- int* dev_walls_wmode,
	- Float* dev_walls_force_partial,
	- Float* dev_walls_acc,
	- unsigned int blocksPerGrid,
	- Float t_current,
	- unsigned int iter)
	+ Float4* __restrict__ dev_walls_nx,
	+ Float4* __restrict__ dev_walls_mvfd,
	+ const int* __restrict__ dev_walls_wmode,
	+ const Float* __restrict__ dev_walls_force_partial,
	+ Float* __restrict__ dev_walls_acc,
	+ const unsigned int blocksPerGrid,
	+ const Float t_current,
	+ const unsigned int iter)
	{
	unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id

	diff --git a/src/navierstokes.cuh b/src/navierstokes.cuh
	t@@ -25,34 +25,34 @@ __inline__ __device__ Float hmean(Float a, Float b) {

	// Helper functions for checking whether a value is NaN or Inf
	__device__ int checkFiniteFloat(
	- const char* desc,
	- const unsigned int x,
	- const unsigned int y,
	- const unsigned int z,
	- const Float s)
	+ const char* desc,
	+ const unsigned int x,
	+ const unsigned int y,
	+ const unsigned int z,
	+ const Float s)
	{
	- __syncthreads();
	- if (!isfinite(s)) {
	- printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s);
	- return 1;
	- }
	- return 0;
	+ __syncthreads();
	+ if (!isfinite(s)) {
	+ printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s);
	+ return 1;
	+ }
	+ return 0;
	}

	__device__ int checkFiniteFloat3(
	- const char* desc,
	- const unsigned int x,
	- const unsigned int y,
	- const unsigned int z,
	- const Float3 v)
	+ const char* desc,
	+ const unsigned int x,
	+ const unsigned int y,
	+ const unsigned int z,
	+ const Float3 v)
	{
	- __syncthreads();
	- if (!isfinite(v.x) || !isfinite(v.y) || !isfinite(v.z)) {
	- printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n",
	- x, y, z, desc, v.x, v.y, v.z);
	- return 1;
	- }
	- return 0;
	+ __syncthreads();
	+ if (!isfinite(v.x) || !isfinite(v.y) || !isfinite(v.z)) {
	+ printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n",
	+ x, y, z, desc, v.x, v.y, v.z);
	+ return 1;
	+ }
	+ return 0;
	}

	// Initialize memory
	t@@ -143,10 +143,10 @@ void DEM::freeNSmemDev()
	void DEM::transferNStoGlobalDeviceMemory(int statusmsg)
	{
	checkForCudaErrors("Before attempting cudaMemcpy in "
	- "transferNStoGlobalDeviceMemory");
	+ "transferNStoGlobalDeviceMemory");

	//if (verbose == 1 && statusmsg == 1)
	- //std::cout << " Transfering fluid data to the device: ";
	+ //std::cout << " Transfering fluid data to the device: ";

	// memory size for a scalar field
	unsigned int memSizeF = sizeof(Float)*NScells();
	t@@ -162,7 +162,7 @@ void DEM::transferNStoGlobalDeviceMemory(int statusmsg)

	checkForCudaErrors("End of transferNStoGlobalDeviceMemory");
	//if (verbose == 1 && statusmsg == 1)
	- //std::cout << "Done" << std::endl;
	+ //std::cout << "Done" << std::endl;
	}

	// Transfer from device
	t@@ -194,7 +194,7 @@ void DEM::transferNSfromGlobalDeviceMemory(int statusmsg)
	void DEM::transferNSnormFromGlobalDeviceMemory()
	{
	cudaMemcpy(ns.norm, dev_ns_norm, sizeof(Float)*NScells(),
	- cudaMemcpyDeviceToHost);
	+ cudaMemcpyDeviceToHost);
	checkForCudaErrors("End of transferNSnormFromGlobalDeviceMemory");
	}

	t@@ -202,7 +202,7 @@ void DEM::transferNSnormFromGlobalDeviceMemory()
	void DEM::transferNSepsilonFromGlobalDeviceMemory()
	{
	cudaMemcpy(ns.epsilon, dev_ns_epsilon, sizeof(Float)*NScells(),
	- cudaMemcpyDeviceToHost);
	+ cudaMemcpyDeviceToHost);
	checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
	}

	t@@ -210,13 +210,13 @@ void DEM::transferNSepsilonFromGlobalDeviceMemory()
	void DEM::transferNSepsilonNewFromGlobalDeviceMemory()
	{
	cudaMemcpy(ns.epsilon_new, dev_ns_epsilon_new, sizeof(Float)*NScells(),
	- cudaMemcpyDeviceToHost);
	+ cudaMemcpyDeviceToHost);
	checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
	}

	// Get linear index from 3D grid position
	__inline__ __device__ unsigned int idx(
	- const int x, const int y, const int z)
	+ const int x, const int y, const int z)
	{
	// without ghost nodes
	//return x + dev_grid.num[0]*y + dev_grid.num[0]*dev_grid.num[1]*z;
	t@@ -229,11 +229,11 @@ __inline__ __device__ unsigned int idx(

	// Get linear index of velocity node from 3D grid position in staggered grid
	__inline__ __device__ unsigned int vidx(
	- const int x, const int y, const int z)
	+ const int x, const int y, const int z)
	{
	// without ghost nodes
	//return x + (devC_grid.num[0]+1)*y
	- //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z;
	+ //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z;

	// with ghost nodes
	// the ghost nodes are placed at x,y,z = -1 and WIDTH+1
	t@@ -246,10 +246,10 @@ __inline__ __device__ unsigned int vidx(
	// dev_ns_v or dev_ns_v_p array. This function does not set the averaged
	// velocity values in the ghost node cells.
	__global__ void findNSavgVel(
	- Float3* dev_ns_v, // out
	- Float* dev_ns_v_x, // in
	- Float* dev_ns_v_y, // in
	- Float* dev_ns_v_z) // in
	+ Float3* __restrict__ dev_ns_v, // out
	+ const Float* __restrict__ dev_ns_v_x, // in
	+ const Float* __restrict__ dev_ns_v_y, // in
	+ const Float* __restrict__ dev_ns_v_z) // in
	{

	// 3D thread index
	t@@ -272,9 +272,9 @@ __global__ void findNSavgVel(

	// Find average velocity using arithmetic means
	const Float3 v_bar = MAKE_FLOAT3(
	- amean(v_xn, v_xp),
	- amean(v_yn, v_yp),
	- amean(v_zn, v_zp));
	+ amean(v_xn, v_xp),
	+ amean(v_yn, v_yp),
	+ amean(v_zn, v_zp));

	// Save value
	__syncthreads();
	t@@ -287,10 +287,10 @@ __global__ void findNSavgVel(
	// or dev_ns_v_p array. Make sure that the averaged velocity ghost nodes are s…
	// beforehand.
	__global__ void findNScellFaceVel(
	- Float3* dev_ns_v, // in
	- Float* dev_ns_v_x, // out
	- Float* dev_ns_v_y, // out
	- Float* dev_ns_v_z) // out
	+ const Float3* __restrict__ dev_ns_v, // in
	+ Float* __restrict__ dev_ns_v_x, // out
	+ Float* __restrict__ dev_ns_v_y, // out
	+ Float* __restrict__ dev_ns_v_z) // out
	{

	// 3D thread index
	t@@ -340,7 +340,9 @@ __global__ void findNScellFaceVel(


	// Set the initial guess of the values of epsilon.
	-__global__ void setNSepsilonInterior(Float* dev_ns_epsilon, Float value)
	+__global__ void setNSepsilonInterior(
	+ Float* __restrict__ dev_ns_epsilon,
	+ const Float value)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -349,7 +351,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon…

	// check that we are not outside the fluid grid
	if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
	- z > 0 && z < devC_grid.num[2]-1) {
	+ z > 0 && z < devC_grid.num[2]-1) {
	__syncthreads();
	const unsigned int cellidx = idx(x,y,z);
	dev_ns_epsilon[cellidx] = value;
	t@@ -358,7 +360,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon…

	// The normalized residuals are given an initial value of 0, since the values …
	// the Dirichlet boundaries aren't written during the iterations.
	-__global__ void setNSnormZero(Float* dev_ns_norm)
	+__global__ void setNSnormZero(Float* __restrict__ dev_ns_norm)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -380,9 +382,9 @@ __global__ void setNSnormZero(Float* dev_ns_norm)
	// the Dirichlet boundary condition: the new value should be identical to the
	// old value, i.e. the temporal gradient is 0
	__global__ void setNSepsilonBottom(
	- Float* dev_ns_epsilon,
	- Float* dev_ns_epsilon_new,
	- const Float value)
	+ Float* __restrict__ dev_ns_epsilon,
	+ Float* __restrict__ dev_ns_epsilon_new,
	+ const Float value)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -408,9 +410,9 @@ __global__ void setNSepsilonBottom(
	// the Dirichlet boundary condition: the new value should be identical to the
	// old value, i.e. the temporal gradient is 0
	__global__ void setNSepsilonTop(
	- Float* dev_ns_epsilon,
	- Float* dev_ns_epsilon_new,
	- const Float value)
	+ Float* __restrict__ dev_ns_epsilon,
	+ Float* __restrict__ dev_ns_epsilon_new,
	+ const Float value)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -419,7 +421,7 @@ __global__ void setNSepsilonTop(

	// check that we are not outside the fluid grid, and at the upper z bounda…
	if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
	- z == devC_grid.num[2]-1) {
	+ z == devC_grid.num[2]-1) {

	__syncthreads();
	const unsigned int cellidx = idx(x,y,z);
	t@@ -428,11 +430,14 @@ __global__ void setNSepsilonTop(
	}
	}
	__device__ void copyNSvalsDev(
	- unsigned int read, unsigned int write,
	- Float* dev_ns_p,
	- Float3* dev_ns_v, Float3* dev_ns_v_p,
	- Float* dev_ns_phi, Float* dev_ns_dphi,
	- Float* dev_ns_epsilon)
	+ const unsigned int read,
	+ const unsigned int write,
	+ Float* __restrict__ dev_ns_p,
	+ Float3* __restrict__ dev_ns_v,
	+ Float3* __restrict__ dev_ns_v_p,
	+ Float* __restrict__ dev_ns_phi,
	+ Float* __restrict__ dev_ns_dphi,
	+ Float* __restrict__ dev_ns_epsilon)
	{
	// Coalesced read
	const Float p = dev_ns_p[read];
	t@@ -457,10 +462,12 @@ __device__ void copyNSvalsDev(
	// are not written since they are not read. Launch this kernel for all cells in
	// the grid
	__global__ void setNSghostNodesDev(
	- Float* dev_ns_p,
	- Float3* dev_ns_v, Float3* dev_ns_v_p,
	- Float* dev_ns_phi, Float* dev_ns_dphi,
	- Float* dev_ns_epsilon)
	+ Float* __restrict__ dev_ns_p,
	+ Float3* __restrict__ dev_ns_v,
	+ Float3* __restrict__ dev_ns_v_p,
	+ Float* __restrict__ dev_ns_phi,
	+ Float* __restrict__ dev_ns_dphi,
	+ Float* __restrict__ dev_ns_epsilon)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -484,72 +491,72 @@ __global__ void setNSghostNodesDev(
	if (x == 0) {
	writeidx = idx(nx,y,z);
	copyNSvalsDev(cellidx, writeidx,
	- dev_ns_p,
	- dev_ns_v, dev_ns_v_p,
	- dev_ns_phi, dev_ns_dphi,
	- dev_ns_epsilon);
	+ dev_ns_p,
	+ dev_ns_v, dev_ns_v_p,
	+ dev_ns_phi, dev_ns_dphi,
	+ dev_ns_epsilon);
	}
	if (x == nx-1) {
	writeidx = idx(-1,y,z);
	copyNSvalsDev(cellidx, writeidx,
	- dev_ns_p,
	- dev_ns_v, dev_ns_v_p,
	- dev_ns_phi, dev_ns_dphi,
	- dev_ns_epsilon);
	+ dev_ns_p,
	+ dev_ns_v, dev_ns_v_p,
	+ dev_ns_phi, dev_ns_dphi,
	+ dev_ns_epsilon);
	}

	if (y == 0) {
	writeidx = idx(x,ny,z);
	copyNSvalsDev(cellidx, writeidx,
	- dev_ns_p,
	- dev_ns_v, dev_ns_v_p,
	- dev_ns_phi, dev_ns_dphi,
	- dev_ns_epsilon);
	+ dev_ns_p,
	+ dev_ns_v, dev_ns_v_p,
	+ dev_ns_phi, dev_ns_dphi,
	+ dev_ns_epsilon);
	}
	if (y == ny-1) {
	writeidx = idx(x,-1,z);
	copyNSvalsDev(cellidx, writeidx,
	- dev_ns_p,
	- dev_ns_v, dev_ns_v_p,
	- dev_ns_phi, dev_ns_dphi,
	- dev_ns_epsilon);
	+ dev_ns_p,
	+ dev_ns_v, dev_ns_v_p,
	+ dev_ns_phi, dev_ns_dphi,
	+ dev_ns_epsilon);
	}

	// Z boundaries fixed
	if (z == 0) {
	writeidx = idx(x,y,-1);
	copyNSvalsDev(cellidx, writeidx,
	- dev_ns_p,
	- dev_ns_v, dev_ns_v_p,
	- dev_ns_phi, dev_ns_dphi,
	- dev_ns_epsilon);
	+ dev_ns_p,
	+ dev_ns_v, dev_ns_v_p,
	+ dev_ns_phi, dev_ns_dphi,
	+ dev_ns_epsilon);
	}
	if (z == nz-1) {
	writeidx = idx(x,y,nz);
	copyNSvalsDev(cellidx, writeidx,
	- dev_ns_p,
	- dev_ns_v, dev_ns_v_p,
	- dev_ns_phi, dev_ns_dphi,
	- dev_ns_epsilon);
	+ dev_ns_p,
	+ dev_ns_v, dev_ns_v_p,
	+ dev_ns_phi, dev_ns_dphi,
	+ dev_ns_epsilon);
	}

	// Z boundaries periodic
	/*if (z == 0) {
	- writeidx = idx(x,y,nz);
	- copyNSvalsDev(cellidx, writeidx,
	- dev_ns_p,
	- dev_ns_v, dev_ns_v_p,
	- dev_ns_phi, dev_ns_dphi,
	- dev_ns_epsilon);
	- }
	- if (z == nz-1) {
	- writeidx = idx(x,y,-1);
	- copyNSvalsDev(cellidx, writeidx,
	- dev_ns_p,
	- dev_ns_v, dev_ns_v_p,
	- dev_ns_phi, dev_ns_dphi,
	- dev_ns_epsilon);
	- }*/
	+ writeidx = idx(x,y,nz);
	+ copyNSvalsDev(cellidx, writeidx,
	+ dev_ns_p,
	+ dev_ns_v, dev_ns_v_p,
	+ dev_ns_phi, dev_ns_dphi,
	+ dev_ns_epsilon);
	+ }
	+ if (z == nz-1) {
	+ writeidx = idx(x,y,-1);
	+ copyNSvalsDev(cellidx, writeidx,
	+ dev_ns_p,
	+ dev_ns_v, dev_ns_v_p,
	+ dev_ns_phi, dev_ns_dphi,
	+ dev_ns_epsilon);
	+ }*/
	}
	}

	t@@ -557,7 +564,7 @@ __global__ void setNSghostNodesDev(
	// (diagonal) cells are not written since they are not read. Launch this kernel
	// for all cells in the grid usind setNSghostNodes<datatype><<<.. , ..>>>( .. …
	template<typename T>
	-__global__ void setNSghostNodes(T* dev_scalarfield)
	+__global__ void setNSghostNodes(T* __restrict__ dev_scalarfield)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -586,10 +593,10 @@ __global__ void setNSghostNodes(T* dev_scalarfield)

	if (z == 0)
	dev_scalarfield[idx(x,y,-1)] = val; // Dirichlet
	- //dev_scalarfield[idx(x,y,nz)] = val; // Periodic -z
	+ //dev_scalarfield[idx(x,y,nz)] = val; // Periodic -z
	if (z == nz-1)
	dev_scalarfield[idx(x,y,nz)] = val; // Dirichlet
	- //dev_scalarfield[idx(x,y,-1)] = val; // Periodic +z
	+ //dev_scalarfield[idx(x,y,-1)] = val; // Periodic +z
	}
	}

	t@@ -597,9 +604,9 @@ __global__ void setNSghostNodes(T* dev_scalarfield)
	// (diagonal) cells are not written since they are not read.
	template<typename T>
	__global__ void setNSghostNodes(
	- T* dev_scalarfield,
	- int bc_bot,
	- int bc_top)
	+ T* __restrict__ dev_scalarfield,
	+ const int bc_bot,
	+ const int bc_top)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -651,11 +658,11 @@ __global__ void setNSghostNodes(
	// According to Griebel et al. 1998 "Numerical Simulation in Fluid Dynamics"
	template<typename T>
	__global__ void setNSghostNodesFace(
	- T* dev_scalarfield_x,
	- T* dev_scalarfield_y,
	- T* dev_scalarfield_z,
	- int bc_bot,
	- int bc_top)
	+ T* __restrict__ dev_scalarfield_x,
	+ T* __restrict__ dev_scalarfield_y,
	+ T* __restrict__ dev_scalarfield_z,
	+ const int bc_bot,
	+ const int bc_top)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -849,9 +856,9 @@ __global__ void setNSghostNodesFace(
	// The edge (diagonal) cells are not written since they are not read. Launch
	// this kernel for all cells in the grid.
	__global__ void setNSghostNodes_tau(
	- Float* dev_ns_tau,
	- int bc_bot,
	- int bc_top)
	+ Float* __restrict__ dev_ns_tau,
	+ const int bc_bot,
	+ const int bc_top)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -981,108 +988,108 @@ __global__ void setNSghostNodes_tau(
	// The edge (diagonal) cells are not written since they are not read. Launch
	// this kernel for all cells in the grid.
	/*
	-__global__ void setNSghostNodesForcing(
	- Float* dev_ns_f1,
	- Float3* dev_ns_f2,
	- Float* dev_ns_f,
	- unsigned int nijac)
	-
	-{
	- // 3D thread index
	- const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	- const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
	- const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
	-
	- // Grid dimensions
	- const unsigned int nx = devC_grid.num[0];
	- const unsigned int ny = devC_grid.num[1];
	- const unsigned int nz = devC_grid.num[2];
	-
	- // 1D thread index
	- unsigned int cellidx = idx(x,y,z);
	-
	- // check that we are not outside the fluid grid
	- if (x < nx && y < ny && z < nz) {
	-
	- __syncthreads();
	- const Float f = dev_ns_f[cellidx];
	- Float f1;
	- Float3 f2;
	-
	- if (nijac == 0) {
	- __syncthreads();
	- f1 = dev_ns_f1[cellidx];
	- f2 = dev_ns_f2[cellidx];
	- }
	-
	- if (x == 0) {
	- cellidx = idx(nx,y,z);
	- dev_ns_f[cellidx] = f;
	- if (nijac == 0) {
	- dev_ns_f1[cellidx] = f1;
	- dev_ns_f2[cellidx] = f2;
	- }
	- }
	- if (x == nx-1) {
	- cellidx = idx(-1,y,z);
	- dev_ns_f[cellidx] = f;
	- if (nijac == 0) {
	- dev_ns_f1[cellidx] = f1;
	- dev_ns_f2[cellidx] = f2;
	- }
	- }
	-
	- if (y == 0) {
	- cellidx = idx(x,ny,z);
	- dev_ns_f[cellidx] = f;
	- if (nijac == 0) {
	- dev_ns_f1[cellidx] = f1;
	- dev_ns_f2[cellidx] = f2;
	- }
	- }
	- if (y == ny-1) {
	- cellidx = idx(x,-1,z);
	- dev_ns_f[cellidx] = f;
	- if (nijac == 0) {
	- dev_ns_f1[cellidx] = f1;
	- dev_ns_f2[cellidx] = f2;
	- }
	- }
	-
	- if (z == 0) {
	- cellidx = idx(x,y,nz);
	- dev_ns_f[cellidx] = f;
	- if (nijac == 0) {
	- dev_ns_f1[cellidx] = f1;
	- dev_ns_f2[cellidx] = f2;
	- }
	- }
	- if (z == nz-1) {
	- cellidx = idx(x,y,-1);
	- dev_ns_f[cellidx] = f;
	- if (nijac == 0) {
	- dev_ns_f1[cellidx] = f1;
	- dev_ns_f2[cellidx] = f2;
	- }
	- }
	- }
	-}
	+ __global__ void setNSghostNodesForcing(
	+ Float* dev_ns_f1,
	+ Float3* dev_ns_f2,
	+ Float* dev_ns_f,
	+ unsigned int nijac)
	+
	+ {
	+ // 3D thread index
	+ const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	+ const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
	+ const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
	+
	+ // Grid dimensions
	+ const unsigned int nx = devC_grid.num[0];
	+ const unsigned int ny = devC_grid.num[1];
	+ const unsigned int nz = devC_grid.num[2];
	+
	+ // 1D thread index
	+ unsigned int cellidx = idx(x,y,z);
	+
	+ // check that we are not outside the fluid grid
	+ if (x < nx && y < ny && z < nz) {
	+
	+ __syncthreads();
	+ const Float f = dev_ns_f[cellidx];
	+ Float f1;
	+ Float3 f2;
	+
	+ if (nijac == 0) {
	+ __syncthreads();
	+ f1 = dev_ns_f1[cellidx];
	+ f2 = dev_ns_f2[cellidx];
	+ }
	+
	+ if (x == 0) {
	+ cellidx = idx(nx,y,z);
	+ dev_ns_f[cellidx] = f;
	+ if (nijac == 0) {
	+ dev_ns_f1[cellidx] = f1;
	+ dev_ns_f2[cellidx] = f2;
	+ }
	+ }
	+ if (x == nx-1) {
	+ cellidx = idx(-1,y,z);
	+ dev_ns_f[cellidx] = f;
	+ if (nijac == 0) {
	+ dev_ns_f1[cellidx] = f1;
	+ dev_ns_f2[cellidx] = f2;
	+ }
	+ }
	+
	+ if (y == 0) {
	+ cellidx = idx(x,ny,z);
	+ dev_ns_f[cellidx] = f;
	+ if (nijac == 0) {
	+ dev_ns_f1[cellidx] = f1;
	+ dev_ns_f2[cellidx] = f2;
	+ }
	+ }
	+ if (y == ny-1) {
	+ cellidx = idx(x,-1,z);
	+ dev_ns_f[cellidx] = f;
	+ if (nijac == 0) {
	+ dev_ns_f1[cellidx] = f1;
	+ dev_ns_f2[cellidx] = f2;
	+ }
	+ }
	+
	+ if (z == 0) {
	+ cellidx = idx(x,y,nz);
	+ dev_ns_f[cellidx] = f;
	+ if (nijac == 0) {
	+ dev_ns_f1[cellidx] = f1;
	+ dev_ns_f2[cellidx] = f2;
	+ }
	+ }
	+ if (z == nz-1) {
	+ cellidx = idx(x,y,-1);
	+ dev_ns_f[cellidx] = f;
	+ if (nijac == 0) {
	+ dev_ns_f1[cellidx] = f1;
	+ dev_ns_f2[cellidx] = f2;
	+ }
	+ }
	+ }
	+ }
	*/

	// Find the porosity in each cell on the base of a sphere, centered at the cell
	// center.
	__global__ void findPorositiesVelocitiesDiametersSpherical(
	- const unsigned int* dev_cellStart,
	- const unsigned int* dev_cellEnd,
	- const Float4* dev_x_sorted,
	- const Float4* dev_vel_sorted,
	- Float* dev_ns_phi,
	- Float* dev_ns_dphi,
	- Float3* dev_ns_vp_avg,
	- Float* dev_ns_d_avg,
	- const unsigned int iteration,
	- const unsigned int np,
	- const Float c_phi)
	+ const unsigned int* __restrict__ dev_cellStart,
	+ const unsigned int* __restrict__ dev_cellEnd,
	+ const Float4* __restrict__ dev_x_sorted,
	+ const Float4* __restrict__ dev_vel_sorted,
	+ Float* __restrict__ dev_ns_phi,
	+ Float* __restrict__ dev_ns_dphi,
	+ Float3* __restrict__ dev_ns_vp_avg,
	+ Float* __restrict__ dev_ns_d_avg,
	+ const unsigned int iteration,
	+ const unsigned int np,
	+ const Float c_phi)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -1114,9 +1121,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…

	// Cell sphere center position
	const Float3 X = MAKE_FLOAT3(
	- x*dx + 0.5*dx,
	- y*dy + 0.5*dy,
	- z*dz + 0.5*dz);
	+ x*dx + 0.5*dx,
	+ y*dy + 0.5*dy,
	+ z*dz + 0.5*dz);

	Float d, r;
	Float phi = 1.00;
	t@@ -1143,12 +1150,12 @@ __global__ void findPorositiesVelocitiesDiametersSpher…

	// Iterate over 27 neighbor cells, R = cell width
	/*for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
	- for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
	- for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/
	+ for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
	+ for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/

	// Iterate over 27 neighbor cells, R = 2*cell width
	for (int z_dim=-2; z_dim<3; ++z_dim) { // z-axis
	- //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
	+ //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
	for (int y_dim=-2; y_dim<3; ++y_dim) { // y-axis
	for (int x_dim=-2; x_dim<3; ++x_dim) { // x-axis

	t@@ -1186,9 +1193,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…

	// Find center distance
	dist = MAKE_FLOAT3(
	- X.x - xr.x,
	- X.y - xr.y,
	- X.z - xr.z);
	+ X.x - xr.x,
	+ X.y - xr.y,
	+ X.z - xr.z);
	dist += distmod;
	d = length(dist);

	t@@ -1196,10 +1203,10 @@ __global__ void findPorositiesVelocitiesDiametersSpher…
	if ((R - r) < d && d < (R + r)) {
	void_volume -=
	1.0/(12.0*d) * (
	- M_PI*(R + r - d)*(R + r - …
	- *(d*d + 2.0*d*r - 3.0*r*r
	- + 2.0*d*R + 6.0*r*R
	- - 3.0*R*R) );
	+ M_PI*(R + r - d)*(R + r - d)
	+ *(d*d + 2.0*d*r - 3.0*r*r
	+ + 2.0*d*R + 6.0*r*R
	+ - 3.0*R*R) );
	v_avg += MAKE_FLOAT3(v.x, v.y, v.z);
	d_avg += 2.0*r;
	n++;
	t@@ -1259,8 +1266,8 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
	//Float phi = 0.5;
	//Float dphi = 0.0;
	//if (iteration == 20 && x == nx/2 && y == ny/2 && z == nz/2) {
	- //phi = 0.4;
	- //dphi = 0.1;
	+ //phi = 0.4;
	+ //dphi = 0.1;
	//}
	//dev_ns_phi[cellidx] = phi;
	//dev_ns_dphi[cellidx] = dphi;
	t@@ -1276,17 +1283,17 @@ __global__ void findPorositiesVelocitiesDiametersSpher…
	// Find the porosity in each cell on the base of a sphere, centered at the cell
	// center.
	__global__ void findPorositiesVelocitiesDiametersSphericalGradient(
	- const unsigned int* dev_cellStart,
	- const unsigned int* dev_cellEnd,
	- const Float4* dev_x_sorted,
	- const Float4* dev_vel_sorted,
	- Float* dev_ns_phi,
	- Float* dev_ns_dphi,
	- Float3* dev_ns_vp_avg,
	- Float* dev_ns_d_avg,
	- const unsigned int iteration,
	- const unsigned int ndem,
	- const unsigned int np)
	+ const unsigned int* __restrict__ dev_cellStart,
	+ const unsigned int* __restrict__ dev_cellEnd,
	+ const Float4* __restrict__ dev_x_sorted,
	+ const Float4* __restrict__ dev_vel_sorted,
	+ Float* __restrict__ dev_ns_phi,
	+ Float* __restrict__ dev_ns_dphi,
	+ Float3* __restrict__ dev_ns_vp_avg,
	+ Float* __restrict__ dev_ns_d_avg,
	+ const unsigned int iteration,
	+ const unsigned int ndem,
	+ const unsigned int np)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -1304,7 +1311,7 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
	const Float dz = devC_grid.L[2]/nz;

	// Cell sphere radius
	- const Float R = fmin(dx, fmin(dy,dz)); // diameter = 2*cell width
	+ const Float R = fmin(dx, fmin(dy,dz)); // diameter = 2*cell width

	Float4 xr; // particle pos. and radius

	t@@ -1315,9 +1322,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…

	// Cell sphere center position
	const Float3 X = MAKE_FLOAT3(
	- x*dx + 0.5*dx,
	- y*dy + 0.5*dy,
	- z*dz + 0.5*dz);
	+ x*dx + 0.5*dx,
	+ y*dy + 0.5*dy,
	+ z*dz + 0.5*dz);

	Float d, r;
	Float phi = 1.00;
	t@@ -1396,9 +1403,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…

	// Find center distance and normal vector
	x_p = MAKE_FLOAT3(
	- xr.x - X.x,
	- xr.y - X.y,
	- xr.z - X.z);
	+ xr.x - X.x,
	+ xr.y - X.y,
	+ xr.z - X.z);
	d = length(x_p);
	n_p = x_p/d;
	q = d/R;
	t@@ -1438,8 +1445,8 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
	phi = phi_0 + dphi/(ndem*devC_dt);

	//if (dot_epsilon_kk != 0.0)
	- //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n…
	- //x,y,z, dot_epsilon_kk, dphi, phi);
	+ //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n",
	+ //x,y,z, dot_epsilon_kk, dphi, phi);

	// Make sure that the porosity is in the interval [0.0;1.0]
	phi = fmin(1.00, fmax(0.00, phi));
	t@@ -1484,11 +1491,11 @@ __global__ void findPorositiesVelocitiesDiametersSpher…

	// Modulate the hydraulic pressure at the upper boundary
	__global__ void setUpperPressureNS(
	- Float* dev_ns_p,
	- Float* dev_ns_epsilon,
	- Float* dev_ns_epsilon_new,
	- Float beta,
	- const Float new_pressure)
	+ Float* __restrict__ dev_ns_p,
	+ Float* __restrict__ dev_ns_epsilon,
	+ Float* __restrict__ dev_ns_epsilon_new,
	+ const Float beta,
	+ const Float new_pressure)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -1497,8 +1504,8 @@ __global__ void setUpperPressureNS(

	// check that the thread is located at the top boundary
	if (x < devC_grid.num[0] &&
	- y < devC_grid.num[1] &&
	- z == devC_grid.num[2]-1) {
	+ y < devC_grid.num[1] &&
	+ z == devC_grid.num[2]-1) {

	const unsigned int cellidx = idx(x,y,z);

	t@@ -1524,13 +1531,13 @@ __global__ void setUpperPressureNS(
	// Find the gradient in a cell in a homogeneous, cubic 3D scalar field using
	// finite central differences
	__device__ Float3 gradient(
	- const Float* dev_scalarfield,
	- const unsigned int x,
	- const unsigned int y,
	- const unsigned int z,
	- const Float dx,
	- const Float dy,
	- const Float dz)
	+ const Float* __restrict__ dev_scalarfield,
	+ const unsigned int x,
	+ const unsigned int y,
	+ const unsigned int z,
	+ const Float dx,
	+ const Float dy,
	+ const Float dz)
	{
	// Read 6 neighbor cells
	__syncthreads();
	t@@ -1544,26 +1551,26 @@ __device__ Float3 gradient(

	//__syncthreads();
	//if (p != 0.0)
	- //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p);
	+ //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p);

	// Calculate central-difference gradients
	return MAKE_FLOAT3(
	- (xp - xn)/(2.0*dx),
	- (yp - yn)/(2.0*dy),
	- (zp - zn)/(2.0*dz));
	+ (xp - xn)/(2.0*dx),
	+ (yp - yn)/(2.0*dy),
	+ (zp - zn)/(2.0*dz));
	}

	// Find the divergence in a cell in a homogeneous, cubic, 3D vector field
	__device__ Float divergence(
	- const Float* dev_vectorfield_x,
	- const Float* dev_vectorfield_y,
	- const Float* dev_vectorfield_z,
	- const unsigned int x,
	- const unsigned int y,
	- const unsigned int z,
	- const Float dx,
	- const Float dy,
	- const Float dz)
	+ const Float* __restrict__ dev_vectorfield_x,
	+ const Float* __restrict__ dev_vectorfield_y,
	+ const Float* __restrict__ dev_vectorfield_z,
	+ const unsigned int x,
	+ const unsigned int y,
	+ const unsigned int z,
	+ const Float dx,
	+ const Float dy,
	+ const Float dz)
	{
	// Read 6 cell-face values
	__syncthreads();
	t@@ -1583,13 +1590,13 @@ __device__ Float divergence(

	// Find the divergence of a tensor field
	__device__ Float3 divergence_tensor(
	- Float* dev_tensorfield,
	- const unsigned int x,
	- const unsigned int y,
	- const unsigned int z,
	- const Float dx,
	- const Float dy,
	- const Float dz)
	+ const Float* __restrict__ dev_tensorfield,
	+ const unsigned int x,
	+ const unsigned int y,
	+ const unsigned int z,
	+ const Float dx,
	+ const Float dy,
	+ const Float dz)
	{
	__syncthreads();

	t@@ -1638,18 +1645,18 @@ __device__ Float3 divergence_tensor(

	// Calculate div(phi*tau)
	const Float3 div_tensor = MAKE_FLOAT3(
	- // x
	- (t_xx_xp - t_xx_xn)/dx +
	- (t_xy_yp - t_xy_yn)/dy +
	- (t_xz_zp - t_xz_zn)/dz,
	- // y
	- (t_xy_xp - t_xy_xn)/dx +
	- (t_yy_yp - t_yy_yn)/dy +
	- (t_yz_zp - t_yz_zn)/dz,
	- // z
	- (t_xz_xp - t_xz_xn)/dx +
	- (t_yz_yp - t_yz_yn)/dy +
	- (t_zz_zp - t_zz_zn)/dz);
	+ // x
	+ (t_xx_xp - t_xx_xn)/dx +
	+ (t_xy_yp - t_xy_yn)/dy +
	+ (t_xz_zp - t_xz_zn)/dz,
	+ // y
	+ (t_xy_xp - t_xy_xn)/dx +
	+ (t_yy_yp - t_yy_yn)/dy +
	+ (t_yz_zp - t_yz_zn)/dz,
	+ // z
	+ (t_xz_xp - t_xz_xn)/dx +
	+ (t_yz_yp - t_yz_yn)/dy +
	+ (t_zz_zp - t_zz_zn)/dz);

	#ifdef CHECK_NS_FINITE
	(void)checkFiniteFloat3("div_tensor", x, y, z, div_tensor);
	t@@ -1661,8 +1668,8 @@ __device__ Float3 divergence_tensor(
	// Find the spatial gradient in e.g. pressures per cell
	// using first order central differences
	__global__ void findNSgradientsDev(
	- Float* dev_scalarfield, // in
	- Float3* dev_vectorfield) // out
	+ const Float* __restrict__ dev_scalarfield, // in
	+ Float3* __restrict__ dev_vectorfield) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -1699,8 +1706,8 @@ __global__ void findNSgradientsDev(

	// Find the outer product of v v
	__global__ void findvvOuterProdNS(
	- Float3* dev_ns_v, // in
	- Float* dev_ns_v_prod) // out
	+ const Float3* __restrict__ dev_ns_v, // in
	+ Float* __restrict__ dev_ns_v_prod) // out
	{
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
	t@@ -1750,8 +1757,8 @@ __global__ void findvvOuterProdNS(
	// Find the fluid stress tensor. It is symmetrical, and can thus be saved in 6
	// values in 3D.
	__global__ void findNSstressTensor(
	- Float3* dev_ns_v, // in
	- Float* dev_ns_tau) // out
	+ const Float3* __restrict__ dev_ns_v, // in
	+ Float* __restrict__ dev_ns_tau) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -1809,15 +1816,15 @@ __global__ void findNSstressTensor(
	devC_params.mu*((zp.y - zn.y)/(2.0*dz) + (yp.z - yn.z)/(2.0*dy));

	/*
	- if (x == 0 && y == 0 && z == 0)
	- printf("mu = %f\n", mu);
	- if (tau_xz > 1.0e-6)
	- printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx);
	- if (tau_yz > 1.0e-6)
	- printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy);
	- if (tau_zz > 1.0e-6)
	- printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz);
	- */
	+ if (x == 0 && y == 0 && z == 0)
	+ printf("mu = %f\n", mu);
	+ if (tau_xz > 1.0e-6)
	+ printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx);
	+ if (tau_yz > 1.0e-6)
	+ printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy);
	+ if (tau_zz > 1.0e-6)
	+ printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz);
	+ */

	// Store values in global memory
	__syncthreads();
	t@@ -1842,9 +1849,9 @@ __global__ void findNSstressTensor(

	// Find the divergence of phi*v*v
	__global__ void findNSdivphiviv(
	- Float* dev_ns_phi, // in
	- Float3* dev_ns_v, // in
	- Float3* dev_ns_div_phi_vi_v) // out
	+ const Float* __restrict__ dev_ns_phi, // in
	+ const Float3* __restrict__ dev_ns_v, // in
	+ Float3* __restrict__ dev_ns_div_phi_vi_v) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -1888,58 +1895,58 @@ __global__ void findNSdivphiviv(
	// Calculate upwind coefficients
	//*
	const Float3 a = MAKE_FLOAT3(
	- copysign(1.0, v.x),
	- copysign(1.0, v.y),
	- copysign(1.0, v.z));
	+ copysign(1.0, v.x),
	+ copysign(1.0, v.y),
	+ copysign(1.0, v.z));

	// Calculate the divergence based on the upwind differences (Griebel et
	// al. 1998, eq. 3.9)
	const Float3 div_uw = MAKE_FLOAT3(
	- // x
	- ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) +
	- (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) +
	+ // x
	+ ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) +
	+ (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) +

	- ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) +
	- (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) +
	+ ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) +
	+ (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) +

	- ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) +
	- (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz),
	+ ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) +
	+ (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz),

	- // y
	- ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) +
	- (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) +
	+ // y
	+ ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) +
	+ (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) +

	- ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) +
	- (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) +
	+ ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) +
	+ (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) +

	- ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) +
	- (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz),
	+ ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) +
	+ (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz),

	- // z
	- ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) +
	- (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) +
	+ // z
	+ ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) +
	+ (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) +

	- ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) +
	- (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) +
	+ ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) +
	+ (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) +

	- ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) +
	- (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz));
	+ ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) +
	+ (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz));


	// Calculate the divergence based on the central-difference gradients
	const Float3 div_cd = MAKE_FLOAT3(
	- // x
	- (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
	- (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
	- (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
	- // y
	- (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
	- (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
	- (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
	- // z
	- (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
	- (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
	- (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
	+ // x
	+ (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
	+ (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
	+ (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
	+ // y
	+ (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
	+ (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
	+ (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
	+ // z
	+ (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
	+ (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
	+ (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));

	// Weighting parameter
	const Float tau = 0.5;
	t@@ -1951,26 +1958,26 @@ __global__ void findNSdivphiviv(
	/*
	// Calculate the divergence: div(phi*v_i*v)
	const Float3 div_phi_vi_v = MAKE_FLOAT3(
	- // x
	- (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
	- (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
	- (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
	- // y
	- (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
	- (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
	- (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
	- // z
	- (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
	- (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
	- (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
	- // */
	+ // x
	+ (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
	+ (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
	+ (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
	+ // y
	+ (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
	+ (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
	+ (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
	+ // z
	+ (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
	+ (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
	+ (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
	+ // */

	// Write divergence
	__syncthreads();
	dev_ns_div_phi_vi_v[cellidx] = div_phi_vi_v;

	//printf("div(phivv) [%d,%d,%d] = %f, %f, %f\n", x,y,z,
	- //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z);
	+ //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z);

	#ifdef CHECK_NS_FINITE
	(void)checkFiniteFloat3("div_phi_vi_v", x, y, z, div_phi_vi_v);
	t@@ -1979,8 +1986,8 @@ __global__ void findNSdivphiviv(
	}

	__global__ void findNSdivtau(
	- Float* dev_ns_tau, // in
	- Float3* dev_ns_div_tau) // out
	+ const Float* __restrict__ dev_ns_tau, // in
	+ Float3* __restrict__ dev_ns_div_tau) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2015,9 +2022,9 @@ __global__ void findNSdivtau(

	// Find the divergence of phi*tau
	__global__ void findNSdivphitau(
	- Float* dev_ns_phi, // in
	- Float* dev_ns_tau, // in
	- Float3* dev_ns_div_phi_tau) // out
	+ const Float* __restrict__ dev_ns_phi, // in
	+ const Float* __restrict__ dev_ns_tau, // in
	+ Float3* __restrict__ dev_ns_div_phi_tau) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2094,18 +2101,18 @@ __global__ void findNSdivphitau(

	// Calculate div(phi*tau)
	const Float3 div_phi_tau = MAKE_FLOAT3(
	- // x
	- (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx +
	- (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy +
	- (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz,
	- // y
	- (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx +
	- (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy +
	- (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz,
	- // z
	- (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx +
	- (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy +
	- (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz);
	+ // x
	+ (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx +
	+ (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy +
	+ (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz,
	+ // y
	+ (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx +
	+ (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy +
	+ (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz,
	+ // z
	+ (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx +
	+ (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy +
	+ (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz);

	// Write divergence
	__syncthreads();
	t@@ -2120,9 +2127,9 @@ __global__ void findNSdivphitau(
	// Find the divergence of phi v v
	// Unused
	__global__ void findNSdivphivv(
	- Float* dev_ns_v_prod, // in
	- Float* dev_ns_phi, // in
	- Float3* dev_ns_div_phi_v_v) // out
	+ const Float* __restrict__ dev_ns_v_prod, // in
	+ const Float* __restrict__ dev_ns_phi, // in
	+ Float3* __restrict__ dev_ns_div_phi_v_v) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2174,24 +2181,24 @@ __global__ void findNSdivphivv(
	// The symmetry described in findvvOuterProdNS is used
	__syncthreads();
	const Float3 div = MAKE_FLOAT3(
	- ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp
	- - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) +
	- (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp
	- - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) +
	- (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp
	- - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)),
	- ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp
	- - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) +
	- (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp
	- - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) +
	- (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp
	- - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)),
	- ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp
	- - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) +
	- (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp
	- - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) +
	- (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp
	- - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) );
	+ ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp
	+ - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) +
	+ (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp
	+ - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) +
	+ (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp
	+ - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)),
	+ ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp
	+ - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) +
	+ (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp
	+ - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) +
	+ (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp
	+ - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)),
	+ ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp
	+ - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) +
	+ (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp
	+ - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) +
	+ (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp
	+ - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) );

	//printf("div[%d,%d,%d] = %f\t%f\t%f\n", x, y, z, div.x, div.y, div.z);

	t@@ -2209,25 +2216,25 @@ __global__ void findNSdivphivv(
	// Find predicted fluid velocity
	// Launch per face.
	__global__ void findPredNSvelocities(
	- Float* dev_ns_p, // in
	- Float* dev_ns_v_x, // in
	- Float* dev_ns_v_y, // in
	- Float* dev_ns_v_z, // in
	- Float* dev_ns_phi, // in
	- Float* dev_ns_dphi, // in
	- Float* dev_ns_div_tau_x, // in
	- Float* dev_ns_div_tau_y, // in
	- Float* dev_ns_div_tau_z, // in
	- Float3* dev_ns_div_phi_vi_v, // in
	- int bc_bot, // in
	- int bc_top, // in
	- Float beta, // in
	- Float3* dev_ns_F_pf, // in
	- unsigned int ndem, // in
	- Float c_grad_p, // in
	- Float* dev_ns_v_p_x, // out
	- Float* dev_ns_v_p_y, // out
	- Float* dev_ns_v_p_z) // out
	+ const Float* __restrict__ dev_ns_p, // in
	+ const Float* __restrict__ dev_ns_v_x, // in
	+ const Float* __restrict__ dev_ns_v_y, // in
	+ const Float* __restrict__ dev_ns_v_z, // in
	+ const Float* __restrict__ dev_ns_phi, // in
	+ const Float* __restrict__ dev_ns_dphi, // in
	+ const Float* __restrict__ dev_ns_div_tau_x, // in
	+ const Float* __restrict__ dev_ns_div_tau_y, // in
	+ const Float* __restrict__ dev_ns_div_tau_z, // in
	+ const Float3* __restrict__ dev_ns_div_phi_vi_v, // in
	+ const int bc_bot, // in
	+ const int bc_top, // in
	+ const Float beta, // in
	+ const Float3* __restrict__ dev_ns_F_pf, // in
	+ const unsigned int ndem, // in
	+ const Float __restrict__ c_grad_p, // in
	+ Float* __restrict__ dev_ns_v_p_x, // out
	+ Float* __restrict__ dev_ns_v_p_y, // out
	+ Float* __restrict__ dev_ns_v_p_z) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2255,9 +2262,9 @@ __global__ void findPredNSvelocities(
	// Values that are needed for calculating the predicted velocity
	__syncthreads();
	const Float3 v = MAKE_FLOAT3(
	- dev_ns_v_x[fidx],
	- dev_ns_v_y[fidx],
	- dev_ns_v_z[fidx]);
	+ dev_ns_v_x[fidx],
	+ dev_ns_v_y[fidx],
	+ dev_ns_v_z[fidx]);

	Float3 div_tau = MAKE_FLOAT3(0.0, 0.0, 0.0);
	if (devC_params.mu > 0.0) {
	t@@ -2285,13 +2292,13 @@ __global__ void findPredNSvelocities(

	// component-wise average values
	const Float3 phi = MAKE_FLOAT3(
	- amean(phi_c, phi_xn),
	- amean(phi_c, phi_yn),
	- amean(phi_c, phi_zn));
	+ amean(phi_c, phi_xn),
	+ amean(phi_c, phi_yn),
	+ amean(phi_c, phi_zn));
	const Float3 dphi = MAKE_FLOAT3(
	- amean(dphi_c, dphi_xn),
	- amean(dphi_c, dphi_yn),
	- amean(dphi_c, dphi_zn));
	+ amean(dphi_c, dphi_xn),
	+ amean(dphi_c, dphi_yn),
	+ amean(dphi_c, dphi_zn));

	// The particle-fluid interaction force should only be incoorporated if
	// there is a fluid viscosity
	t@@ -2308,9 +2315,9 @@ __global__ void findPredNSvelocities(
	f_i_zn = MAKE_FLOAT3(0.0, 0.0, 0.0);
	}
	const Float3 f_i = MAKE_FLOAT3(
	- amean(f_i_c.x, f_i_xn.x),
	- amean(f_i_c.y, f_i_yn.y),
	- amean(f_i_c.z, f_i_zn.z));
	+ amean(f_i_c.x, f_i_xn.x),
	+ amean(f_i_c.y, f_i_yn.y),
	+ amean(f_i_c.z, f_i_zn.z));

	const Float dt = ndem*devC_dt;
	const Float rho = devC_params.rho_f;
	t@@ -2326,9 +2333,9 @@ __global__ void findPredNSvelocities(
	const Float p_yn = dev_ns_p[idx(x,y-1,z)];
	const Float p_zn = dev_ns_p[idx(x,y,z-1)];
	const Float3 grad_p = MAKE_FLOAT3(
	- (p - p_xn)/dx,
	- (p - p_yn)/dy,
	- (p - p_zn)/dz) * c_grad_p;
	+ (p - p_xn)/dx,
	+ (p - p_yn)/dy,
	+ (p - p_zn)/dz) * c_grad_p;
	#ifdef SET_1
	pressure_term = -beta*dt/(rho*phi)*grad_p;
	#endif
	t@@ -2338,16 +2345,16 @@ __global__ void findPredNSvelocities(
	}

	const Float3 div_phi_vi_v = MAKE_FLOAT3(
	- amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x),
	- amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y),
	- amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z));
	+ amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x),
	+ amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y),
	+ amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z));

	// Determine the predicted velocity
	#ifdef SET_1
	const Float3 interaction_term = -dt/(rho*phi)*f_i;
	const Float3 diffusion_term = dt/(rho*phi)*div_tau;
	const Float3 gravity_term = MAKE_FLOAT3(
	- devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
	+ devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
	const Float3 porosity_term = -1.0*v*dphi/phi;
	const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
	#endif
	t@@ -2355,7 +2362,7 @@ __global__ void findPredNSvelocities(
	const Float3 interaction_term = -dt/(rho*phi)*f_i;
	const Float3 diffusion_term = dt/rho*div_tau;
	const Float3 gravity_term = MAKE_FLOAT3(
	- devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
	+ devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
	const Float3 porosity_term = -1.0*v*dphi/phi;
	const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
	#endif
	t@@ -2376,31 +2383,31 @@ __global__ void findPredNSvelocities(

	// No slip
	/*if ((z == 0 && bc_bot == 2) || (z == nz-1 && bc_top == 2)) {
	- v_p.x = 0.0;
	- v_p.y = 0.0;
	- v_p.z = 0.0;
	- }*/
	+ v_p.x = 0.0;
	+ v_p.y = 0.0;
	+ v_p.z = 0.0;
	+ }*/


	#ifdef REPORT_V_P_COMPONENTS
	// Report velocity components to stdout for debugging
	if (z==0)
	- printf("\n[%d,%d,%d]"
	- "\tv_p = %+e %+e %+e\n"
	- "\tpres = %+e %+e %+e\n"
	- "\tinteract = %+e %+e %+e\n"
	- "\tdiff = %+e %+e %+e\n"
	- "\tgrav = %+e %+e %+e\n"
	- "\tporos = %+e %+e %+e\n"
	- "\tadv = %+e %+e %+e\n",
	- x, y, z,
	- v_p.x, v_p.y, v_p.z,
	- pressure_term.x, pressure_term.y, pressure_term.z,
	- interaction_term.x, interaction_term.y, interaction_term.z,
	- diffusion_term.x, diffusion_term.y, diffusion_term.z,
	- gravity_term.x, gravity_term.y, gravity_term.z,
	- porosity_term.x, porosity_term.y, porosity_term.z,
	- advection_term.x, advection_term.y, advection_term.z);
	+ printf("\n[%d,%d,%d]"
	+ "\tv_p = %+e %+e %+e\n"
	+ "\tpres = %+e %+e %+e\n"
	+ "\tinteract = %+e %+e %+e\n"
	+ "\tdiff = %+e %+e %+e\n"
	+ "\tgrav = %+e %+e %+e\n"
	+ "\tporos = %+e %+e %+e\n"
	+ "\tadv = %+e %+e %+e\n",
	+ x, y, z,
	+ v_p.x, v_p.y, v_p.z,
	+ pressure_term.x, pressure_term.y, pressure_term.z,
	+ interaction_term.x, interaction_term.y, interaction_term.z,
	+ diffusion_term.x, diffusion_term.y, diffusion_term.z,
	+ gravity_term.x, gravity_term.y, gravity_term.z,
	+ porosity_term.x, porosity_term.y, porosity_term.z,
	+ advection_term.x, advection_term.y, advection_term.z);
	#endif

	// Save the predicted velocity
	t@@ -2421,18 +2428,18 @@ __global__ void findPredNSvelocities(
	// At each iteration, the value of the forcing function is found as:
	// f = f1 - f2 dot grad(epsilon)
	__global__ void findNSforcing(
	- Float* dev_ns_epsilon, // in
	- Float* dev_ns_phi, // in
	- Float* dev_ns_dphi, // in
	- Float3* dev_ns_v_p, // in
	- Float* dev_ns_v_p_x, // in
	- Float* dev_ns_v_p_y, // in
	- Float* dev_ns_v_p_z, // in
	- unsigned int nijac, // in
	- unsigned int ndem, // in
	- Float* dev_ns_f1, // out
	- Float3* dev_ns_f2, // out
	- Float* dev_ns_f) // out
	+ const Float* __restrict__ dev_ns_epsilon, // in
	+ const Float* __restrict__ dev_ns_phi, // in
	+ const Float* __restrict__ dev_ns_dphi, // in
	+ const Float3* __restrict__ dev_ns_v_p, // in
	+ const Float* __restrict__ dev_ns_v_p_x, // in
	+ const Float* __restrict__ dev_ns_v_p_y, // in
	+ const Float* __restrict__ dev_ns_v_p_z, // in
	+ const unsigned int nijac, // in
	+ const unsigned int ndem, // in
	+ Float* __restrict__ dev_ns_f1, // out
	+ Float3* __restrict__ dev_ns_f2, // out
	+ Float* __restrict__ dev_ns_f) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2472,7 +2479,7 @@ __global__ void findNSforcing(
	// Calculate derivatives
	const Float div_v_p
	= divergence(dev_ns_v_p_x, dev_ns_v_p_y, dev_ns_v_p_z,
	- x, y, z, dx, dy, dz);
	+ x, y, z, dx, dy, dz);
	const Float3 grad_phi
	= gradient(dev_ns_phi, x, y, z, dx, dy, dz);

	t@@ -2497,7 +2504,7 @@ __global__ void findNSforcing(
	#ifdef REPORT_FORCING_TERMS
	// Report values terms in the forcing function for debugging
	printf("[%d,%d,%d]\tt1 = %f\tt2 = %f\tt4 = %f\n",
	- x,y,z, t1, t2, t4);
	+ x,y,z, t1, t2, t4);
	#endif

	// Save values
	t@@ -2524,7 +2531,7 @@ __global__ void findNSforcing(
	const Float t3 = -dot(f2, grad_epsilon);
	if (z >= nz-3)
	printf("[%d,%d,%d]\tf = %f\tf1 = %f\tt3 = %f\n",
	- x,y,z, f, f1, t3);
	+ x,y,z, f, f1, t3);
	#endif

	// Save forcing function value
	t@@ -2542,10 +2549,10 @@ __global__ void findNSforcing(
	// non-smoothed and smoothed values.
	template<typename T>
	__global__ void smoothing(
	- T* dev_arr,
	- const Float gamma,
	- const unsigned int bc_bot,
	- const unsigned int bc_top)
	+ T* __restrict__ dev_arr,
	+ const Float gamma,
	+ const unsigned int bc_bot,
	+ const unsigned int bc_top)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2602,13 +2609,13 @@ __global__ void smoothing(

	// Perform a single Jacobi iteration
	__global__ void jacobiIterationNS(
	- const Float* dev_ns_epsilon,
	- Float* dev_ns_epsilon_new,
	- Float* dev_ns_norm,
	- const Float* dev_ns_f,
	- const int bc_bot,
	- const int bc_top,
	- const Float theta)
	+ const Float* __restrict__ dev_ns_epsilon,
	+ Float* __restrict__ dev_ns_epsilon_new,
	+ Float* __restrict__ dev_ns_norm,
	+ const Float* __restrict__ dev_ns_f,
	+ const int bc_bot,
	+ const int bc_top,
	+ const Float theta)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2672,9 +2679,9 @@ __global__ void jacobiIterationNS(
	const Float dzdz = dz*dz;
	Float e_new
	= (-dxdx*dydy*dzdz*f
	- + dydy*dzdz*(e_xn + e_xp)
	- + dxdx*dzdz*(e_yn + e_yp)
	- + dxdx*dydy*(e_zn + e_zp))
	+ + dydy*dzdz*(e_xn + e_xp)
	+ + dxdx*dzdz*(e_yn + e_yp)
	+ + dxdx*dydy*(e_zn + e_zp))
	/(2.0*(dxdx*dydy + dxdx*dzdz + dydy*dzdz));

	// New value of epsilon in 1D update
	t@@ -2682,7 +2689,7 @@ __global__ void jacobiIterationNS(

	// Print values for debugging
	/*printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\n",
	- x,y,z, e, f, e_new);*/
	+ x,y,z, e, f, e_new);*/

	const Float res_norm = (e_new - e)*(e_new - e)/(e_new*e_new + 1.0e-16);
	const Float e_relax = e*(1.0-theta) + e_new*theta;
	t@@ -2697,7 +2704,7 @@ __global__ void jacobiIterationNS(
	//(void)checkFiniteFloat("res_norm", x, y, z, res_norm);
	if (checkFiniteFloat("res_norm", x, y, z, res_norm)) {
	printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\tres_norm = %f\n",
	- x,y,z, e, f, e_new, res_norm);
	+ x,y,z, e, f, e_new, res_norm);
	}
	#endif
	}
	t@@ -2706,8 +2713,8 @@ __global__ void jacobiIterationNS(
	// Copy all values from one array to the other
	template<typename T>
	__global__ void copyValues(
	- T* dev_read,
	- T* dev_write)
	+ const T* __restrict__ dev_read,
	+ T* __restrict__ dev_write)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2717,10 +2724,10 @@ __global__ void copyValues(
	// Internal nodes only
	if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {

	- // Internal nodes + ghost nodes
	- /*if (x <= devC_grid.num[0]+1 &&
	- y <= devC_grid.num[1]+1 &&
	- z <= devC_grid.num[2]+1) {*/
	+ // Internal nodes + ghost nodes
	+ /*if (x <= devC_grid.num[0]+1 &&
	+ y <= devC_grid.num[1]+1 &&
	+ z <= devC_grid.num[2]+1) {*/

	const unsigned int cellidx = idx(x,y,z); // without ghost nodes
	//const unsigned int cellidx = idx(x-1,y-1,z-1); // with ghost nodes
	t@@ -2730,7 +2737,7 @@ __global__ void copyValues(
	const T val = dev_read[cellidx];

	//if (z == devC_grid.num[2]-1)
	- //printf("[%d,%d,%d] = %f\n", x, y, z, val);
	+ //printf("[%d,%d,%d] = %f\n", x, y, z, val);

	// Write
	__syncthreads();
	t@@ -2740,11 +2747,11 @@ __global__ void copyValues(

	// Find and store the normalized residuals
	__global__ void findNormalizedResiduals(
	- Float* dev_ns_epsilon_old,
	- Float* dev_ns_epsilon,
	- Float* dev_ns_norm,
	- const unsigned int bc_bot,
	- const unsigned int bc_top)
	+ const Float* __restrict__ dev_ns_epsilon_old,
	+ const Float* __restrict__ dev_ns_epsilon,
	+ Float* __restrict__ dev_ns_norm,
	+ const unsigned int bc_bot,
	+ const unsigned int bc_top)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2791,9 +2798,9 @@ __global__ void findNormalizedResiduals(

	// Computes the new velocity and pressure using the corrector
	__global__ void updateNSpressure(
	- Float* dev_ns_epsilon, // in
	- Float beta, // in
	- Float* dev_ns_p) // out
	+ const Float* __restrict__ dev_ns_epsilon, // in
	+ const Float __restrict__ beta, // in
	+ Float* __restrict__ dev_ns_p) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2825,19 +2832,19 @@ __global__ void updateNSpressure(
	}

	__global__ void updateNSvelocity(
	- Float* dev_ns_v_p_x, // in
	- Float* dev_ns_v_p_y, // in
	- Float* dev_ns_v_p_z, // in
	- Float* dev_ns_phi, // in
	- Float* dev_ns_epsilon, // in
	- Float beta, // in
	- int bc_bot, // in
	- int bc_top, // in
	- unsigned int ndem, // in
	- Float c_grad_p, // in
	- Float* dev_ns_v_x, // out
	- Float* dev_ns_v_y, // out
	- Float* dev_ns_v_z) // out
	+ const Float* __restrict__ dev_ns_v_p_x, // in
	+ const Float* __restrict__ dev_ns_v_p_y, // in
	+ const Float* __restrict__ dev_ns_v_p_z, // in
	+ const Float* __restrict__ dev_ns_phi, // in
	+ const Float* __restrict__ dev_ns_epsilon, // in
	+ const Float beta, // in
	+ const int bc_bot, // in
	+ const int bc_top, // in
	+ const unsigned int ndem, // in
	+ const Float c_grad_p, // in
	+ Float* __restrict__ dev_ns_v_x, // out
	+ Float* __restrict__ dev_ns_v_y, // out
	+ Float* __restrict__ dev_ns_v_z) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -2879,15 +2886,15 @@ __global__ void updateNSvelocity(
	const Float phi_zn = dev_ns_phi[idx(x,y,z-1)];

	const Float3 phi = MAKE_FLOAT3(
	- amean(phi_c, phi_xn),
	- amean(phi_c, phi_yn),
	- amean(phi_c, phi_zn));
	+ amean(phi_c, phi_xn),
	+ amean(phi_c, phi_yn),
	+ amean(phi_c, phi_zn));

	// Find corrector gradient
	const Float3 grad_epsilon = MAKE_FLOAT3(
	- (epsilon_c - epsilon_xn)/dx,
	- (epsilon_c - epsilon_yn)/dy,
	- (epsilon_c - epsilon_zn)/dz) * c_grad_p;
	+ (epsilon_c - epsilon_xn)/dx,
	+ (epsilon_c - epsilon_yn)/dy,
	+ (epsilon_c - epsilon_zn)/dz) * c_grad_p;

	// Find new velocity
	#ifdef SET_1
	t@@ -2900,16 +2907,16 @@ __global__ void updateNSvelocity(

	// Print values for debugging
	/* if (z == 0) {
	- Float e_up = dev_ns_epsilon[idx(x,y,z+1)];
	- Float e_down = dev_ns_epsilon[idx(x,y,z-1)];
	- printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n",
	- x,y,z,
	- grad_epsilon.x,
	- grad_epsilon.y,
	- grad_epsilon.z,
	- e_up,
	- e_down);
	- }*/
	+ Float e_up = dev_ns_epsilon[idx(x,y,z+1)];
	+ Float e_down = dev_ns_epsilon[idx(x,y,z-1)];
	+ printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n",
	+ x,y,z,
	+ grad_epsilon.x,
	+ grad_epsilon.y,
	+ grad_epsilon.z,
	+ e_up,
	+ e_down);
	+ }*/

	if ((z == 0 && bc_bot == 1) || (z == nz-1 && bc_top == 1))
	v.z = 0.0;
	t@@ -2919,11 +2926,11 @@ __global__ void updateNSvelocity(

	// Check the advection term using the Courant-Friedrichs-Lewy condition
	if (v.x*ndem*devC_dt/dx
	- + v.y*ndem*devC_dt/dy
	- + v.z*ndem*devC_dt/dz > 1.0) {
	+ + v.y*ndem*devC_dt/dy
	+ + v.z*ndem*devC_dt/dz > 1.0) {
	printf("[%d,%d,%d] Warning: Advection term in fluid may be "
	- "unstable (CFL condition), v = %f,%f,%f\n",
	- x,y,z, v.x, v.y, v.z);
	+ "unstable (CFL condition), v = %f,%f,%f\n",
	+ x,y,z, v.x, v.y, v.z);
	}

	// Write new values
	t@@ -2941,12 +2948,12 @@ __global__ void updateNSvelocity(
	// Find the average particle diameter and velocity for each CFD cell.
	// UNUSED: The values are estimated in the porosity estimation function instead
	__global__ void findAvgParticleVelocityDiameter(
	- unsigned int* dev_cellStart, // in
	- unsigned int* dev_cellEnd, // in
	- Float4* dev_vel_sorted, // in
	- Float4* dev_x_sorted, // in
	- Float3* dev_ns_vp_avg, // out
	- Float* dev_ns_d_avg) // out
	+ const unsigned int* __restrict__ dev_cellStart, // in
	+ const unsigned int* __restrict__ dev_cellEnd, // in
	+ const Float4* __restrict__ dev_vel_sorted, // in
	+ const Float4* __restrict__ dev_x_sorted, // in
	+ Float3* dev_ns_vp_avg, // out
	+ Float* dev_ns_d_avg) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -3023,18 +3030,18 @@ __device__ Float dragCoefficient(Float re)
	// Find particle-fluid interaction force as outlined by Zhou et al. 2010, and
	// originally by Gidaspow 1992.
	__global__ void findInteractionForce(
	- Float4* dev_x, // in
	- Float4* dev_vel, // in
	- Float* dev_ns_phi, // in
	- Float* dev_ns_p, // in
	- Float* dev_ns_v_x, // in
	- Float* dev_ns_v_y, // in
	- Float* dev_ns_v_z, // in
	- Float* dev_ns_div_tau_x,// in
	- Float* dev_ns_div_tau_y,// in
	- Float* dev_ns_div_tau_z,// in
	- Float3* dev_ns_f_pf, // out
	- Float4* dev_force) // out
	+ const Float4* __restrict__ dev_x, // in
	+ const Float4* __restrict__ dev_vel, // in
	+ const Float* __restrict__ dev_ns_phi, // in
	+ const Float* __restrict__ dev_ns_p, // in
	+ const Float* __restrict__ dev_ns_v_x, // in
	+ const Float* __restrict__ dev_ns_v_y, // in
	+ const Float* __restrict__ dev_ns_v_z, // in
	+ const Float* __restrict__ dev_ns_div_tau_x,// in
	+ const Float* __restrict__ dev_ns_div_tau_y,// in
	+ const Float* __restrict__ dev_ns_div_tau_z,// in
	+ Float3* __restrict__ dev_ns_f_pf, // out
	+ Float4* __restrict__ dev_force) // out
	{
	unsigned int i = threadIdx.x + blockIdx.x*blockDim.x; // Particle index

	t@@ -3080,14 +3087,14 @@ __global__ void findInteractionForce(
	const Float div_tau_z_p = dev_ns_div_tau_z[vidx(i_x,i_y,i_z+1)];

	const Float3 v_f = MAKE_FLOAT3(
	- amean(v_x, v_x_p),
	- amean(v_y, v_y_p),
	- amean(v_z, v_z_p));
	+ amean(v_x, v_x_p),
	+ amean(v_y, v_y_p),
	+ amean(v_z, v_z_p));

	const Float3 div_tau = MAKE_FLOAT3(
	- amean(div_tau_x, div_tau_x_p),
	- amean(div_tau_y, div_tau_y_p),
	- amean(div_tau_z, div_tau_z_p));
	+ amean(div_tau_x, div_tau_x_p),
	+ amean(div_tau_y, div_tau_y_p),
	+ amean(div_tau_z, div_tau_z_p));

	const Float3 v_rel = v_f - v_p;
	const Float v_rel_length = length(v_rel);
	t@@ -3119,15 +3126,15 @@ __global__ void findInteractionForce(

	#ifdef CHECK_NS_FINITE
	/*
	- printf("\nfindInteractionForce %d [%d,%d,%d]\n"
	- "\tV_p = %f Re=%f Cd=%f chi=%f\n"
	- "\tf_d = %+e %+e %+e\n"
	- "\tf_p = %+e %+e %+e\n"
	- "\tf_v = %+e %+e %+e\n",
	- i, i_x, i_y, i_z, V_p, Re, Cd, chi,
	- f_d.x, f_d.y, f_d.z,
	- f_p.x, f_p.y, f_p.z,
	- f_v.x, f_v.y, f_v.z);// */
	+ printf("\nfindInteractionForce %d [%d,%d,%d]\n"
	+ "\tV_p = %f Re=%f Cd=%f chi=%f\n"
	+ "\tf_d = %+e %+e %+e\n"
	+ "\tf_p = %+e %+e %+e\n"
	+ "\tf_v = %+e %+e %+e\n",
	+ i, i_x, i_y, i_z, V_p, Re, Cd, chi,
	+ f_d.x, f_d.y, f_d.z,
	+ f_p.x, f_p.y, f_p.z,
	+ f_v.x, f_v.y, f_v.z);// */
	checkFiniteFloat3("f_d", i_x, i_y, i_z, f_d);
	checkFiniteFloat3("f_p", i_x, i_y, i_z, f_p);
	checkFiniteFloat3("f_v", i_x, i_y, i_z, f_v);
	t@@ -3149,11 +3156,11 @@ __global__ void findInteractionForce(
	// Apply the fluid-particle interaction force to the fluid cell based on the
	// interaction forces from each particle in it
	__global__ void applyInteractionForceToFluid(
	- unsigned int* dev_gridParticleIndex, // in
	- unsigned int* dev_cellStart, // in
	- unsigned int* dev_cellEnd, // in
	- Float3* dev_ns_f_pf, // in
	- Float3* dev_ns_F_pf) // out
	+ const unsigned int* __restrict__ dev_gridParticleIndex, // in
	+ const unsigned int* __restrict__ dev_cellStart, // in
	+ const unsigned int* __restrict__ dev_cellEnd, // in
	+ const Float3* __restrict__ dev_ns_f_pf, // in
	+ Float3* __restrict__ dev_ns_F_pf) // out
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -3212,10 +3219,10 @@ __global__ void applyInteractionForceToFluid(
	// Launch per cell face node.
	// Cell center ghost nodes must be set prior to call.
	__global__ void interpolateCenterToFace(
	- Float3* dev_in,
	- Float* dev_out_x,
	- Float* dev_out_y,
	- Float* dev_out_z)
	+ const Float3* __restrict__ dev_in,
	+ Float* __restrict__ dev_out_x,
	+ Float* __restrict__ dev_out_y,
	+ Float* __restrict__ dev_out_z)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -3224,8 +3231,8 @@ __global__ void interpolateCenterToFace(

	// Check that we are not outside the fluid grid
	if (x <= devC_grid.num[0]
	- && y <= devC_grid.num[1]
	- && z <= devC_grid.num[2]) {
	+ && y <= devC_grid.num[1]
	+ && z <= devC_grid.num[2]) {

	const unsigned int faceidx = vidx(x,y,z);

	t@@ -3249,10 +3256,10 @@ __global__ void interpolateCenterToFace(

	// Launch per cell center node
	__global__ void interpolateFaceToCenter(
	- Float* dev_in_x,
	- Float* dev_in_y,
	- Float* dev_in_z,
	- Float3* dev_out)
	+ Float* dev_in_x,
	+ Float* dev_in_y,
	+ Float* dev_in_z,
	+ Float3* dev_out)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -3277,9 +3284,9 @@ __global__ void interpolateFaceToCenter(
	const Float z_p = dev_in_z[vidx(x,y,z+1)];

	const Float3 val = MAKE_FLOAT3(
	- amean(x_n, x_p),
	- amean(y_n, y_p),
	- amean(z_n, z_p));
	+ amean(x_n, x_p),
	+ amean(y_n, y_p),
	+ amean(z_n, z_p));

	__syncthreads();
	//printf("[%d,%d,%d] = %f, %f, %f\n", x,y,z, val.x, val.y, val.z);
	t@@ -3292,12 +3299,12 @@ __global__ void interpolateFaceToCenter(
	// Warning: The grid-corner values will be invalid, along with the non-normal
	// components of the ghost nodes
	__global__ void findFaceDivTau(
	- Float* dev_ns_v_x,
	- Float* dev_ns_v_y,
	- Float* dev_ns_v_z,
	- Float* dev_ns_div_tau_x,
	- Float* dev_ns_div_tau_y,
	- Float* dev_ns_div_tau_z)
	+ const Float* __restrict__ dev_ns_v_x,
	+ const Float* __restrict__ dev_ns_v_y,
	+ const Float* __restrict__ dev_ns_v_z,
	+ Float* __restrict__ dev_ns_div_tau_x,
	+ Float* __restrict__ dev_ns_div_tau_y,
	+ Float* __restrict__ dev_ns_div_tau_z)
	{
	// 3D thread index
	const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	t@@ -3347,19 +3354,19 @@ __global__ void findFaceDivTau(

	const Float div_tau_x =
	devC_params.mu*(
	- (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
	- (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
	- (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
	+ (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
	+ (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
	+ (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
	const Float div_tau_y =
	devC_params.mu*(
	- (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
	- (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
	- (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
	+ (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
	+ (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
	+ (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
	const Float div_tau_z =
	devC_params.mu*(
	- (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
	- (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
	- (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));
	+ (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
	+ (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
	+ (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));

	__syncthreads();
	//printf("div_tau [%d,%d,%d] = %f, %f, %f\n", x,y,z,
	diff --git a/src/raytracer.cuh b/src/raytracer.cuh
	t@@ -6,12 +6,14 @@
	//#include "cuPrintf.cu"

	// Template for discarding the last term in four-component vector structs
	-__device__ __inline__ float3 f4_to_f3(float4 in) {
	+__device__ __inline__ float3 f4_to_f3(const float4 in) {
	return make_float3(in.x, in.y, in.z);
	}

	// Kernel for initializing image data
	-__global__ void imageInit(unsigned char* dev_img, unsigned int pixels)
	+__global__ void imageInit(
	+ unsigned char* __restrict__ dev_img,
	+ const unsigned int pixels)
	{
	// Compute pixel position from threadIdx/blockIdx
	unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
	t@@ -24,11 +26,12 @@ __global__ void imageInit(unsigned char* dev_img, unsigned…
	}

	// Calculate ray origins and directions
	-__global__ void rayInitPerspective(float4* dev_ray_origo,
	- float4* dev_ray_direction,
	- float4 eye,
	- unsigned int width,
	- unsigned int height)
	+__global__ void rayInitPerspective(
	+ float4* __restrict__ dev_ray_origo,
	+ float4* __restrict__ dev_ray_direction,
	+ const float4 eye,
	+ const unsigned int width,
	+ const unsigned int height)
	{
	// Compute pixel position from threadIdx/blockIdx
	unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
	t@@ -47,15 +50,17 @@ __global__ void rayInitPerspective(float4* dev_ray_origo,

	// Write ray origo and direction to global memory
	dev_ray_origo[mempos] = make_float4(devC_eye, 0.0f);
	- dev_ray_direction[mempos] = make_float4(-devC_ddevC_w + p_udevC_u + p_v*…
	+ dev_ray_direction[mempos] =
	+ make_float4(-devC_d*devC_w + p_u*devC_u + p_v*devC_v, 0.0f);
	}

	// Check wether the pixel's viewing ray intersects with the spheres,
	// and shade the pixel correspondingly
	-__global__ void rayIntersectSpheres(float4* dev_ray_origo,
	- float4* dev_ray_direction,
	- Float4* dev_x,
	- unsigned char* dev_img)
	+__global__ void rayIntersectSpheres(
	+ const float4* __restrict__ dev_ray_origo,
	+ const float4* __restrict__ dev_ray_direction,
	+ const Float4* __restrict__ dev_x,
	+ unsigned char* __restrict__ dev_img)
	{
	// Compute pixel position from threadIdx/blockIdx
	unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
	t@@ -67,7 +72,8 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
	float3 d = f4_to_f3(dev_ray_direction[mempos]);
	//float step = length(d);

	- // Distance, in ray steps, between object and eye initialized with a large…
	+ // Distance, in ray steps, between object and eye initialized with
	+ // a large value
	float tdist = 1e10f;

	// Surface normal at closest sphere intersection
	t@@ -86,8 +92,6 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
	float3 c = make_float3(x.x, x.y, x.z);
	float R = x.w;

	- //cuPrintf("particle %d at: %f, %f, %f, radius: %f\n", i, c.x, c.y, c.…
	-
	// Calculate the discriminant: d = B^2 - 4AC
	float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
	- 4.0f*dot(d,d) // -4A
	t@@ -98,8 +102,10 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
	if (Delta > 0.0f) {

	// Calculate roots, Shirley 2009 p. 77
	- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) …
	- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
	+ float t_minus = ((dot(-d,(e-c))
	+ - sqrt( dot(d,(e-c))*dot(d,(e-c))
	+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)))
	+ / dot(d,d));

	// Check wether intersection is closer than previous values
	if (fabs(t_minus) < tdist) {
	t@@ -137,14 +143,15 @@ __global__ void rayIntersectSpheres(float4* dev_ray_orig…

	// Check wether the pixel's viewing ray intersects with the spheres,
	// and shade the pixel correspondingly using a colormap
	-__global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
	- float4* dev_ray_direction,
	- Float4* dev_x,
	- Float4* dev_vel,
	- Float* dev_linarr,
	- float max_val,
	- float lower_cutoff,
	- unsigned char* dev_img)
	+__global__ void rayIntersectSpheresColormap(
	+ const float4* __restrict__ dev_ray_origo,
	+ const float4* __restrict__ dev_ray_direction,
	+ const Float4* __restrict__ dev_x,
	+ const Float4* __restrict__ dev_vel,
	+ const Float* __restrict__ dev_linarr,
	+ const float max_val,
	+ const float lower_cutoff,
	+ unsigned char* __restrict__ dev_img)
	{
	// Compute pixel position from threadIdx/blockIdx
	unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
	t@@ -155,7 +162,8 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ra…
	float3 e = f4_to_f3(dev_ray_origo[mempos]);
	float3 d = f4_to_f3(dev_ray_direction[mempos]);

	- // Distance, in ray steps, between object and eye initialized with a large…
	+ // Distance, in ray steps, between object and eye initialized with
	+ // a large value
	float tdist = 1e10f;

	// Surface normal at closest sphere intersection
	t@@ -181,8 +189,6 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ra…
	- 4.0f*dot(d,d) // -4A
	* (dot((e-c),(e-c)) - R*R); // C

	-
	-
	// If the determinant is positive, there are two solutions
	// One where the line enters the sphere, and one where it exits
	if (lower_cutoff > 0.0) {
	t@@ -198,8 +204,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_r…
	if (Delta > 0.0f && val > lower_cutoff && fixvel == 0.f) {

	// Calculate roots, Shirley 2009 p. 77
	- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-…
	- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
	+ float t_minus =
	+ ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c))
	+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)…
	+ / dot(d,d));

	// Check wether intersection is closer than previous values
	if (fabs(t_minus) < tdist) {
	t@@ -217,8 +225,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_r…
	if (Delta > 0.0f) {

	// Calculate roots, Shirley 2009 p. 77
	- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-…
	- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
	+ float t_minus =
	+ ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c))
	+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)…
	+ / dot(d,d));

	// Check wether intersection is closer than previous values
	if (fabs(t_minus) < tdist) {
	t@@ -269,25 +279,28 @@ __global__ void rayIntersectSpheresColormap(float4* dev_…

	// Write shading model values to pixel color channels
	dev_img[mempos4] = (unsigned char) ((k_d * I_d * dotprod
	- + k_a * I_a)*redv);
	+ + k_a * I_a)*redv);
	dev_img[mempos4 + 1] = (unsigned char) ((k_d * I_d * dotprod
	- + k_a * I_a)*greenv);
	+ + k_a * I_a)*greenv);
	dev_img[mempos4 + 2] = (unsigned char) ((k_d * I_d * dotprod
	- + k_a * I_a)*bluev);
	+ + k_a * I_a)*bluev);
	}
	}


	__host__ void DEM::cameraInit(
	- const float3 eye,
	- const float3 lookat,
	- const float imgw,
	- const float focalLength)
	+ const float3 eye,
	+ const float3 lookat,
	+ const float imgw,
	+ const float focalLength)
	{
	float hw_ratio = height/width;

	// Image dimensions in world space (l, r, b, t)
	- float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, …
	+ float4 imgplane = make_float4(
	+ -0.5f*imgw, 0.5f*imgw,
	+ -0.5f*imgw*hw_ratio,
	+ 0.5f*imgw*hw_ratio);

	// The view vector
	float3 view = eye - lookat;
	t@@ -309,15 +322,6 @@ __host__ void DEM::cameraInit(
	if (verbose == 1)
	std::cout << " Transfering camera values to constant memory: ";

	- /* Reference by string removed in cuda 5.0
	- cudaMemcpyToSymbol("devC_u", &u, sizeof(u));
	- cudaMemcpyToSymbol("devC_v", &v, sizeof(v));
	- cudaMemcpyToSymbol("devC_w", &w, sizeof(w));
	- cudaMemcpyToSymbol("devC_eye", &eye, sizeof(eye));
	- cudaMemcpyToSymbol("devC_imgplane", &imgplane, sizeof(imgplane));
	- cudaMemcpyToSymbol("devC_d", &d, sizeof(d));
	- cudaMemcpyToSymbol("devC_light", &light, sizeof(light));
	- cudaMemcpyToSymbol("devC_pixels", &pixels, sizeof(pixels));*/
	cudaMemcpyToSymbol(devC_u, &u, sizeof(u));
	cudaMemcpyToSymbol(devC_v, &v, sizeof(v));
	cudaMemcpyToSymbol(devC_w, &w, sizeof(w));
	t@@ -333,7 +337,7 @@ __host__ void DEM::cameraInit(
	}

	// Allocate global device memory
	-__host__ void DEM::rt_allocateGlobalDeviceMemory(void)
	+__host__ void DEM::rt_allocateGlobalDeviceMemory()
	{
	if (verbose == 1)
	std::cout << " Allocating device memory: ";
	t@@ -347,7 +351,7 @@ __host__ void DEM::rt_allocateGlobalDeviceMemory(void)


	// Free dynamically allocated device memory
	-__host__ void DEM::rt_freeGlobalDeviceMemory(void)
	+__host__ void DEM::rt_freeGlobalDeviceMemory()
	{
	if (verbose == 1)
	std::cout << " Freeing device memory: ";
	t@@ -360,11 +364,12 @@ __host__ void DEM::rt_freeGlobalDeviceMemory(void)
	}

	// Transfer image data from device to host
	-__host__ void DEM::rt_transferFromGlobalDeviceMemory(void)
	+__host__ void DEM::rt_transferFromGlobalDeviceMemory()
	{
	if (verbose == 1)
	std::cout << " Transfering image data: device -> host: ";
	- cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char), cudaMemcpyD…
	+ cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char),
	+ cudaMemcpyDeviceToHost);
	if (verbose == 1)
	std::cout << "Done" << std::endl;
	checkForCudaErrors("During rt_transferFromGlobalDeviceMemory()");
	t@@ -372,24 +377,13 @@ __host__ void DEM::rt_transferFromGlobalDeviceMemory(voi…

	// Wrapper for the rt kernel
	__host__ void DEM::render(
	- const int method,
	- const float maxval,
	- const float lower_cutoff,
	- const float focalLength,
	- const unsigned int img_width,
	- const unsigned int img_height)
	- /*float4* p, unsigned int np,
	- rgba* img, unsigned int width, unsigned int height,
	- f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
	- int visualize, float max_val,
	- float* fixvel,
	- float* xsum,
	- float* pres,
	- float* es_dot,
	- float* es,
	- float* vel)*/
	+ const int method,
	+ const float maxval,
	+ const float lower_cutoff,
	+ const float focalLength,
	+ const unsigned int img_width,
	+ const unsigned int img_height)
	{
	- // Namespace directives
	using std::cout;
	using std::cerr;
	using std::endl;
	t@@ -418,15 +412,15 @@ __host__ void DEM::render(
	// Look at the centre of the mean positions
	float3 lookat = make_float3(maxpos.x, maxpos.y, maxpos.z) / 2.0f;
	float3 eye = make_float3(
	- grid.L[0] * 2.3f,
	- grid.L[1] * -5.0f,
	- grid.L[2] * 1.3f);
	+ grid.L[0] * 2.3f,
	+ grid.L[1] * -5.0f,
	+ grid.L[2] * 1.3f);
	cameraInit(eye, lookat, imgw, focalLength);

	// Construct rays for perspective projection
	rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>(
	- dev_ray_origo, dev_ray_direction,
	- make_float4(eye.x, eye.y, eye.z, 0.0f), width, height);
	+ dev_ray_origo, dev_ray_direction,
	+ make_float4(eye.x, eye.y, eye.z, 0.0f), width, height);
	cudaThreadSynchronize();

	Float* linarr; // Linear array to use for color visualization
	t@@ -440,8 +434,8 @@ __host__ void DEM::render(
	// Visualize spheres without color scale overlay
	if (method == 0) {
	rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>(
	- dev_ray_origo, dev_ray_direction,
	- dev_x, dev_img);
	+ dev_ray_origo, dev_ray_direction,
	+ dev_x, dev_img);
	} else {

	if (method == 1) { // Visualize pressure
	t@@ -455,8 +449,8 @@ __host__ void DEM::render(
	#pragma omp parallel for if(np>100)
	for (i = 0; i<np; ++i) {
	linarr[i] = sqrt(k.vel[i].x*k.vel[i].x
	- + k.vel[i].y*k.vel[i].y
	- + k.vel[i].z*k.vel[i].z);
	+ + k.vel[i].y*k.vel[i].y
	+ + k.vel[i].z*k.vel[i].z);
	}
	transfer = 1;
	desc = "Linear velocity";
	t@@ -468,8 +462,8 @@ __host__ void DEM::render(
	#pragma omp parallel for if(np>100)
	for (i = 0; i<np; ++i) {
	linarr[i] = sqrt(k.angvel[i].x*k.angvel[i].x
	- + k.angvel[i].y*k.angvel[i].y
	- + k.angvel[i].z*k.angvel[i].z);
	+ + k.angvel[i].y*k.angvel[i].y
	+ + k.angvel[i].z*k.angvel[i].z);
	}
	transfer = 1;
	desc = "Angular velocity";
	t@@ -492,8 +486,8 @@ __host__ void DEM::render(
	#pragma omp parallel for if(np>100)
	for (i = 0; i<np; ++i) {
	linarr[i] = sqrt(k.angpos[i].x*k.angpos[i].x
	- + k.angpos[i].y*k.angpos[i].y
	- + k.angpos[i].z*k.angpos[i].z);
	+ + k.angpos[i].y*k.angpos[i].y
	+ + k.angpos[i].z*k.angpos[i].z);
	}
	transfer = 1;
	desc = "Angular positions";
	t@@ -504,23 +498,24 @@ __host__ void DEM::render(
	// Report color visualization method and color map range
	if (verbose == 1) {
	cout << " " << desc << " color map range: [0, "
	- << maxval << "] " << unit << endl;
	+ << maxval << "] " << unit << endl;
	}

	// Copy linarr to dev_linarr if required
	if (transfer == 1) {
	cudaMalloc((void**)&dev_linarr, np*sizeof(Float));
	checkForCudaErrors("Error during cudaMalloc of linear array");
	- cudaMemcpy(dev_linarr, linarr, np*sizeof(Float), cudaMemcpyHostToD…
	+ cudaMemcpy(dev_linarr, linarr, np*sizeof(Float),
	+ cudaMemcpyHostToDevice);
	checkForCudaErrors("Error during cudaMemcpy of linear array");
	}

	// Start raytracing kernel
	rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
	- dev_ray_origo, dev_ray_direction,
	- dev_x, dev_vel,
	- dev_linarr, maxval, lower_cutoff,
	- dev_img);
	+ dev_ray_origo, dev_ray_direction,
	+ dev_x, dev_vel,
	+ dev_linarr, maxval, lower_cutoff,
	+ dev_img);

	}

	diff --git a/tests/fluid_particle_interaction.py b/tests/fluid_particle_interac…
	t@@ -50,5 +50,4 @@ test(sim.vel[0,0] > 0.0, 'Particle 0 velocity:')
	test(sim.vel[1,0] > 0.0, 'Particle 1 velocity:')
	test(sim.vel[2,0] > 0.0, 'Particle 2 velocity:')

	-
	-sim.cleanup()
	+#sim.cleanup()