Introduction
Introduction Statistics Contact Development Disclaimer Help
tadd const and __restrict__ keywords to facilitate constant memory cache usage …
git clone git://src.adamsgaard.dk/sphere
Log
Files
Refs
LICENSE
---
commit 21ed15ef593a831b2124a48bbc662b9a227f85ac
parent 7626d19392ecba63c4b2e5f4d419e8423d7a90a0
Author: Anders Damsgaard <[email protected]>
Date: Mon, 11 Aug 2014 14:37:07 +0200
add const and __restrict__ keywords to facilitate constant memory cache usage
Diffstat:
M src/cohesion.cuh | 143 ++++++++---------------------…
M src/contactmodels.cuh | 164 ++++++++++++++++++-----------…
M src/contactsearch.cuh | 246 ++++++++++++++++-------------…
M src/integration.cuh | 70 ++++++++++++++++++-----------…
M src/navierstokes.cuh | 1213 ++++++++++++++++-------------…
M src/raytracer.cuh | 173 +++++++++++++++--------------…
M tests/fluid_particle_interaction.py | 3 +--
7 files changed, 996 insertions(+), 1016 deletions(-)
---
diff --git a/src/cohesion.cuh b/src/cohesion.cuh
t@@ -6,14 +6,14 @@
// Check bond pair list, apply linear contact model to pairs
__global__ void bondsLinear(
- uint2* dev_bonds,
- Float4* dev_bonds_delta, // Contact displacement
- Float4* dev_bonds_omega, // Contact rotational displacement
- Float4* dev_x,
- Float4* dev_vel,
- Float4* dev_angvel,
- Float4* dev_force,
- Float4* dev_torque)
+ uint2* __restrict__ dev_bonds,
+ Float4* __restrict__ dev_bonds_delta, // Contact displacement
+ Float4* __restrict__ dev_bonds_omega, // Contact rotational displacement
+ const Float4* __restrict__ dev_x,
+ const Float4* __restrict__ dev_vel,
+ const Float4* __restrict__ dev_angvel,
+ Float4* __restrict__ dev_force,
+ Float4* __restrict__ dev_torque)
{
// Find thread index
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
t@@ -35,16 +35,16 @@ __global__ void bondsLinear(
// Convert tangential vectors to Float3's
// Uncorrected tangential component of displacement
Float3 delta0_t = MAKE_FLOAT3(
- delta0_4.x,
- delta0_4.y,
- delta0_4.z);
+ delta0_4.x,
+ delta0_4.y,
+ delta0_4.z);
const Float delta0_n = delta0_4.w;
// Uncorrected tangential component of rotation
Float3 omega0_t = MAKE_FLOAT3(
- omega0_4.x,
- omega0_4.y,
- omega0_4.z);
+ omega0_4.x,
+ omega0_4.y,
+ omega0_4.z);
const Float omega0_n = omega0_4.w;
// Read particle data
t@@ -76,9 +76,9 @@ __global__ void bondsLinear(
// Inter-particle vector
const Float3 x = MAKE_FLOAT3(
- x_i.x - x_j.x,
- x_i.y - x_j.y,
- x_i.z - x_j.z);
+ x_i.x - x_j.x,
+ x_i.y - x_j.y,
+ x_i.z - x_j.z);
const Float x_length = length(x);
// Find overlap (negative value if overlapping)
t@@ -96,13 +96,13 @@ __global__ void bondsLinear(
// Contact displacement, Luding 2008 eq. 10
const Float3 ddelta = (
- MAKE_FLOAT3(
- vel_i.x - vel_j.x,
- vel_i.y - vel_j.y,
- vel_i.z - vel_j.z)
- + (x_i.w + overlap/2.0) * cross(n, angvel_i)
- + (x_j.w + overlap/2.0) * cross(n, angvel_j)
- ) * devC_dt;
+ MAKE_FLOAT3(
+ vel_i.x - vel_j.x,
+ vel_i.y - vel_j.y,
+ vel_i.z - vel_j.z)
+ + (x_i.w + overlap/2.0) * cross(n, angvel_i)
+ + (x_j.w + overlap/2.0) * cross(n, angvel_j)
+ ) * devC_dt;
// Normal component of the displacement increment
//const Float ddelta_n = dot(ddelta, n);
t@@ -141,9 +141,9 @@ __global__ void bondsLinear(
// Contact rotational velocity
Float3 domega = MAKE_FLOAT3(
- angvel_j.x - angvel_i.x,
- angvel_j.y - angvel_i.y,
- angvel_j.z - angvel_i.z) * devC_dt;
+ angvel_j.x - angvel_i.x,
+ angvel_j.y - angvel_i.y,
+ angvel_j.z - angvel_i.z) * devC_dt;
/*const Float3 domega = MAKE_FLOAT3(
angvel_i.x - angvel_j.x,
angvel_i.y - angvel_j.y,
t@@ -232,89 +232,16 @@ __global__ void bondsLinear(
}
}
-// Linear-elastic bond: Attractive force with normal- and shear components
-// acting upon particle A in a bonded particle pair
-__device__ void bondLinear_old(Float3* N, Float3* T, Float* es_dot, Float* p,
- unsigned int idx_a, unsigned int idx_b,
- Float4* dev_x_sorted, Float4* dev_vel_sorted,
- Float4* dev_angvel_sorted,
- Float radius_a, Float radius_b,
- Float3 x_ab, Float x_ab_length,
- Float delta_ab)
-{
-
- // If particles are not overlapping, apply bond force
- if (delta_ab > 0.0f) {
-
- // Allocate variables and fetch missing time=t values for particle A a…
- Float4 vel_a = dev_vel_sorted[idx_a];
- Float4 vel_b = dev_vel_sorted[idx_b];
- Float4 angvel4_a = dev_angvel_sorted[idx_a];
- Float4 angvel4_b = dev_angvel_sorted[idx_b];
-
- // Convert to Float3's
- Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
- Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
-
- // Normal vector of contact
- Float3 n_ab = x_ab/x_ab_length;
-
- // Relative contact interface velocity, w/o rolling
- Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
-
- // Relative contact interface velocity of particle surfaces at
- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
- Float3 vel_ab = vel_ab_linear
- + radius_a * cross(n_ab, angvel_a)
- + radius_b * cross(n_ab, angvel_b);
-
- // Relative contact interface rolling velocity
- //Float3 angvel_ab = angvel_a - angvel_b;
- //Float angvel_ab_length = length(angvel_ab);
-
- // Normal component of the relative contact interface velocity
- //Float vel_n_ab = dot(vel_ab_linear, n_ab);
-
- // Tangential component of the relative contact interface velocity
- // Hinrichsen and Wolf 2004, eq. 13.9
- Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
- //Float vel_t_ab_length = length(vel_t_ab);
-
- Float3 f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
- Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
-
- // Mean radius
- Float R_bar = (radius_a + radius_b)/2.0f;
-
- // Normal force component: Elastic
- f_n = devC_params.k_n * delta_ab * n_ab;
-
- if (length(vel_t_ab) > 0.f) {
- // Shear force component: Viscous
- f_t = -1.0f * devC_params.gamma_t * vel_t_ab;
-
- // Shear friction production rate [W]
- //*es_dot += -dot(vel_t_ab, f_t);
- }
-
- // Add force components from this bond to total force for particle
- *N += f_n + f_t;
- *T += -R_bar * cross(n_ab, f_t);
-
- // Pressure excerted onto the particle from this bond
- *p += length(f_n) / (4.0f * PI * radius_a*radius_a);
-
- }
-} // End of bondLinear()
-
// Capillary cohesion after Richefeu et al. (2006)
-__device__ void capillaryCohesion_exp(Float3* N, Float radius_a,
- Float radius_b, Float delta_ab,
- Float3 x_ab, Float x_ab_length,
- Float kappa)
+__device__ void capillaryCohesion_exp(
+ Float3* N,
+ const Float radius_a,
+ const Float radius_b,
+ const Float delta_ab,
+ const Float3 x_ab,
+ const Float x_ab_length,
+ const Float kappa)
{
// Normal vector
diff --git a/src/contactmodels.cuh b/src/contactmodels.cuh
t@@ -7,11 +7,19 @@
// Linear viscoelastic contact model for particle-wall interactions
// with tangential friction and rolling resistance
-__device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
- Float* ev_dot, Float* p,
- unsigned int idx_a, Float radius_a,
- Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
- Float3 n, Float delta, Float wvel)
+__device__ Float contactLinear_wall(
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const unsigned int idx_a,
+ const Float radius_a,
+ const Float4* __restrict__ dev_vel_sorted,
+ const Float4* __restrict__ dev_angvel_sorted,
+ const Float3 n,
+ const Float delta,
+ const Float wvel)
{
// Fetch particle velocities from global memory
Float4 vel_tmp = dev_vel_sorted[idx_a];
t@@ -19,13 +27,13 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, …
// Convert velocities to three-component vectors
Float3 vel_linear = MAKE_FLOAT3(
- vel_tmp.x,
- vel_tmp.y,
- vel_tmp.z);
+ vel_tmp.x,
+ vel_tmp.y,
+ vel_tmp.z);
Float3 angvel = MAKE_FLOAT3(
- angvel_tmp.x,
- angvel_tmp.y,
- angvel_tmp.z);
+ angvel_tmp.x,
+ angvel_tmp.y,
+ angvel_tmp.z);
// Store the length of the angular velocity for later use
Float angvel_length = length(angvel);
t@@ -47,7 +55,7 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Fl…
// Normal force component: Elastic - viscous damping
Float3 f_n = fmax(0.0, -devC_params.k_n*delta
- - devC_params.gamma_wn*vel_n) * n;
+ - devC_params.gamma_wn*vel_n) * n;
const Float f_n_length = length(f_n); // Save length for later use
// Store the energy lost by viscous damping. See derivation in
t@@ -103,14 +111,22 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T…
// Linear vicoelastic contact model for particle-particle interactions
// with tangential friction and rolling resistance
-__device__ void contactLinearViscous(Float3* F, Float3* T,
- Float* es_dot, Float* ev_dot, Float* p,
- unsigned int idx_a, unsigned int idx_b,
- Float4* dev_vel_sorted,
- Float4* dev_angvel_sorted,
- Float radius_a, Float radius_b,
- Float3 x_ab, Float x_ab_length,
- Float delta_ab, Float kappa)
+__device__ void contactLinearViscous(
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const unsigned int idx_a,
+ const unsigned int idx_b,
+ const Float4* __restrict__ dev_vel_sorted,
+ const Float4* __restrict__ dev_angvel_sorted,
+ const Float radius_a,
+ const Float radius_b,
+ const Float3 x_ab,
+ const Float x_ab_length,
+ const Float delta_ab,
+ const Float kappa)
{
// Allocate variables and fetch missing time=t values for particle A and B
t@@ -131,8 +147,8 @@ __device__ void contactLinearViscous(Float3* F, Float3* T,
// Relative contact interface velocity, w/o rolling
Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
+ vel_a.y - vel_b.y,
+ vel_a.z - vel_b.z);
// Relative contact interface velocity of particle surfaces at
// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
t@@ -209,18 +225,25 @@ __device__ void contactLinearViscous(Float3* F, Float3* …
// Linear elastic contact model for particle-particle interactions
-__device__ void contactLinear(Float3* F, Float3* T,
- Float* es_dot, Float* ev_dot, Float* p,
- unsigned int idx_a_orig,
- unsigned int idx_b_orig,
- Float4 vel_a,
- Float4* dev_vel,
- Float3 angvel_a,
- Float4* dev_angvel,
- Float radius_a, Float radius_b,
- Float3 x, Float x_length,
- Float delta, Float4* dev_delta_t,
- unsigned int mempos)
+__device__ void contactLinear(
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const unsigned int idx_a_orig,
+ const unsigned int idx_b_orig,
+ const Float4 vel_a,
+ const Float4* __restrict__ dev_vel,
+ const Float3 angvel_a,
+ const Float4* __restrict__ dev_angvel,
+ const Float radius_a,
+ const Float radius_b,
+ const Float3 x,
+ const Float x_length,
+ const Float delta,
+ Float4* __restrict__ dev_delta_t,
+ const unsigned int mempos)
{
// Allocate variables and fetch missing time=t values for particle A and B
t@@ -231,15 +254,15 @@ __device__ void contactLinear(Float3* F, Float3* T,
Float4 delta_t0_4 = dev_delta_t[mempos];
Float3 delta_t0_uncor = MAKE_FLOAT3(
- delta_t0_4.x,
- delta_t0_4.y,
- delta_t0_4.z);
+ delta_t0_4.x,
+ delta_t0_4.y,
+ delta_t0_4.z);
// Convert to Float3
Float3 angvel_b = MAKE_FLOAT3(
- angvel4_b.x,
- angvel4_b.y,
- angvel4_b.z);
+ angvel4_b.x,
+ angvel4_b.y,
+ angvel4_b.z);
// Force between grain pair decomposed into normal- and tangential part
Float3 f_n, f_t, f_c;
t@@ -249,9 +272,9 @@ __device__ void contactLinear(Float3* F, Float3* T,
// Relative contact interface velocity, w/o rolling
Float3 vel_linear = MAKE_FLOAT3(
- vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
+ vel_a.x - vel_b.x,
+ vel_a.y - vel_b.y,
+ vel_a.z - vel_b.z);
// Relative contact interface velocity of particle surfaces at
// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10,
t@@ -335,7 +358,7 @@ __device__ void contactLinear(Float3* F, Float3* T,
// 2008)
delta_t = -1.0/devC_params.k_t
* (devC_params.mu_d * length(f_n-f_c) * t
- + devC_params.gamma_t * vel_t);
+ + devC_params.gamma_t * vel_t);
// Shear friction heat production rate:
// The energy lost from the tangential spring is dissipated as heat
t@@ -357,10 +380,10 @@ __device__ void contactLinear(Float3* F, Float3* T,
// Store sum of tangential displacements
dev_delta_t[mempos] = MAKE_FLOAT4(
- delta_t.x,
- delta_t.y,
- delta_t.z,
- 0.0f);
+ delta_t.x,
+ delta_t.y,
+ delta_t.z,
+ 0.0f);
} // End of contactLinear()
t@@ -368,18 +391,25 @@ __device__ void contactLinear(Float3* F, Float3* T,
// Non-linear contact model for particle-particle interactions
// Based on Hertzian and Mindlin contact theories (e.g. Hertz, 1882, Mindlin a…
// Deresiewicz, 1953, Johnson, 1985). See Yohannes et al 2012 for example.
-__device__ void contactHertz(Float3* F, Float3* T,
- Float* es_dot, Float* ev_dot, Float* p,
- unsigned int idx_a_orig,
- unsigned int idx_b_orig,
- Float4 vel_a,
- Float4* dev_vel,
- Float3 angvel_a,
- Float4* dev_angvel,
- Float radius_a, Float radius_b,
- Float3 x_ab, Float x_ab_length,
- Float delta_ab, Float4* dev_delta_t,
- unsigned int mempos)
+__device__ void contactHertz(
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const unsigned int idx_a_orig,
+ const unsigned int idx_b_orig,
+ const Float4 vel_a,
+ const Float4* __restrict__ dev_vel,
+ const Float3 angvel_a,
+ const Float4* __restrict__ dev_angvel,
+ const Float radius_a,
+ const Float radius_b,
+ const Float3 x_ab,
+ const Float x_ab_length,
+ const Float delta_ab,
+ Float4* __restrict__ dev_delta_t,
+ const unsigned int mempos)
{
// Allocate variables and fetch missing time=t values for particle A and B
t@@ -390,8 +420,8 @@ __device__ void contactHertz(Float3* F, Float3* T,
Float4 delta_t0_4 = dev_delta_t[mempos];
Float3 delta_t0_uncor = MAKE_FLOAT3(delta_t0_4.x,
- delta_t0_4.y,
- delta_t0_4.z);
+ delta_t0_4.y,
+ delta_t0_4.z);
// Convert to Float3
Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
t@@ -404,8 +434,8 @@ __device__ void contactHertz(Float3* F, Float3* T,
// Relative contact interface velocity, w/o rolling
Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
+ vel_a.y - vel_b.y,
+ vel_a.z - vel_b.z);
// Relative contact interface velocity of particle surfaces at
// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
t@@ -435,7 +465,7 @@ __device__ void contactHertz(Float3* F, Float3* T,
// Normal force component
f_n = (-devC_params.k_n * powf(delta_ab, 3.0f/2.0f)
- -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab)
+ -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab)
* n_ab;
// Store energy dissipated in normal viscous component
t@@ -517,9 +547,9 @@ __device__ void contactHertz(Float3* F, Float3* T,
// New rolling resistance model
/*T_res = -1.0f * fmin(devC_params.gamma_r * R_bar * angvel_ab_length,
devC_params.mu_r * R_bar * f_n_length)
- * angvel_ab/angvel_ab_length;*/
+ * angvel_ab/angvel_ab_length;*/
T_res = -1.0f * fmin(devC_params.gamma_r * radius_a * angvel_ab_length,
- devC_params.mu_r * radius_a * f_n_length)
+ devC_params.mu_r * radius_a * f_n_length)
* angvel_ab/angvel_ab_length;
}
diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
t@@ -85,19 +85,23 @@ __device__ int findDistMod(int3* targetCell, Float3* distm…
// Used for contactmodel=1, where contact history is not needed.
// Kernel executed on device, and callable from device only.
// Function is called from interact().
-__device__ void findAndProcessContactsInCell(int3 targetCell,
- unsigned int idx_a,
- Float4 x_a, Float radius_a,
- Float3* F, Float3* T,
- Float* es_dot, Float* ev_dot,
- Float* p,
- Float4* dev_x_sorted,
- Float4* dev_vel_sorted,
- Float4* dev_angvel_sorted,
- unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd,
- Float4* dev_walls_nx,
- Float4* dev_walls_mvfd)
+__device__ void findAndProcessContactsInCell(
+ int3 targetCell,
+ const unsigned int idx_a,
+ const Float4 x_a,
+ const Float radius_a,
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const Float4* __restrict__ dev_x_sorted,
+ const Float4* __restrict__ dev_vel_sorted,
+ const Float4* __restrict__ dev_angvel_sorted,
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const Float4* __restrict__ dev_walls_nx,
+ Float4* __restrict__ dev_walls_mvfd)
//uint4 bonds)
{
t@@ -133,8 +137,8 @@ __device__ void findAndProcessContactsInCell(int3 targetCe…
// Distance between particle centers (Float4 -> Float3)
Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
- x_a.y - x_b.y,
- x_a.z - x_b.z);
+ x_a.y - x_b.y,
+ x_a.z - x_b.z);
// Adjust interparticle vector if periodic boundary/bounda…
// are crossed
t@@ -147,16 +151,16 @@ __device__ void findAndProcessContactsInCell(int3 target…
// Check for particle overlap
if (delta_ab < 0.0f) {
contactLinearViscous(F, T, es_dot, ev_dot, p,
- idx_a, idx_b,
- dev_vel_sorted,
- dev_angvel_sorted,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_ab, kappa);
+ idx_a, idx_b,
+ dev_vel_sorted,
+ dev_angvel_sorted,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_ab, kappa);
} else if (delta_ab < devC_params.db) {
// Check wether particle distance satisfies the capill…
capillaryCohesion_exp(F, radius_a, radius_b, delta_ab,
- x_ab, x_ab_length, kappa);
+ x_ab, x_ab_length, kappa);
}
// Check wether particles are bonded together
t@@ -183,16 +187,18 @@ __device__ void findAndProcessContactsInCell(int3 target…
// Used for contactmodel=2, where bookkeeping of contact history is necessary.
// Kernel executed on device, and callable from device only.
// Function is called from topology().
-__device__ void findContactsInCell(int3 targetCell,
- unsigned int idx_a,
- Float4 x_a, Float radius_a,
- Float4* dev_x_sorted,
- unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd,
- unsigned int* dev_gridParticleIndex,
- int* nc,
- unsigned int* dev_contacts,
- Float4* dev_distmod)
+__device__ void findContactsInCell(
+ int3 targetCell,
+ const unsigned int idx_a,
+ const Float4 x_a,
+ const Float radius_a,
+ const Float4* __restrict__ dev_x_sorted,
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const unsigned int* __restrict__ dev_gridParticleIndex,
+ int* nc,
+ unsigned int* __restrict__ dev_contacts,
+ Float4* __restrict__ dev_distmod)
{
// Get distance modifier for interparticle
// vector, if it crosses a periodic boundary
t@@ -237,8 +243,8 @@ __device__ void findContactsInCell(int3 targetCell,
// Distance between particle centers (Float4 -> Float3)
Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
- x_a.y - x_b.y,
- x_a.z - x_b.z);
+ x_a.y - x_b.y,
+ x_a.z - x_b.z);
// Adjust interparticle vector if periodic boundary/bounda…
// are crossed
t@@ -316,12 +322,13 @@ __device__ void findContactsInCell(int3 targetCell,
// Search for neighbors to particle 'idx' inside the 27 closest cells,
// and save the contact pairs in global memory.
// Function is called from mainGPU loop.
-__global__ void topology(unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd, // Input: Particles in cell
- unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
- Float4* dev_x_sorted,
- unsigned int* dev_contacts,
- Float4* dev_distmod)
+__global__ void topology(
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const unsigned int* __restrict__ dev_gridParticleIndex,
+ const Float4* __restrict__ dev_x_sorted,
+ unsigned int* __restrict__ dev_contacts,
+ Float4* __restrict__ dev_distmod)
{
// Thread index equals index of particle A
unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
t@@ -349,10 +356,10 @@ __global__ void topology(unsigned int* dev_cellStart,
for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
findContactsInCell(targetPos, idx_a, x_a, radius_a,
- dev_x_sorted,
- dev_cellStart, dev_cellEnd,
- dev_gridParticleIndex,
- &nc, dev_contacts, dev_distmod);
+ dev_x_sorted,
+ dev_cellStart, dev_cellEnd,
+ dev_gridParticleIndex,
+ &nc, dev_contacts, dev_distmod);
}
}
}
t@@ -372,28 +379,28 @@ __global__ void topology(unsigned int* dev_cellStart,
// Kernel is executed on device, and is callable from host only.
// Function is called from mainGPU loop.
__global__ void interact(
- unsigned int* dev_gridParticleIndex, // in
- unsigned int* dev_cellStart, // in
- unsigned int* dev_cellEnd, // in
- Float4* dev_x, // in
- Float4* dev_x_sorted, // in
- Float4* dev_vel_sorted, // in
- Float4* dev_angvel_sorted, // in
- Float4* dev_vel, // in
- Float4* dev_angvel, // in
- Float4* dev_force, // out
- Float4* dev_torque, // out
- Float* dev_es_dot, // out
- Float* dev_ev_dot, // out
- Float* dev_es, // out
- Float* dev_ev, // out
- Float* dev_p, // out
- Float4* dev_walls_nx, // in
- Float4* dev_walls_mvfd, // in
- Float* dev_walls_force_pp, // out
- unsigned int* dev_contacts, // out
- Float4* dev_distmod, // in
- Float4* dev_delta_t) // out
+ const unsigned int* __restrict__ dev_gridParticleIndex, // in
+ const unsigned int* __restrict__ dev_cellStart, // in
+ const unsigned int* __restrict__ dev_cellEnd, // in
+ const Float4* __restrict__ dev_x, // in
+ const Float4* __restrict__ dev_x_sorted, // in
+ const Float4* __restrict__ dev_vel_sorted, // in
+ const Float4* __restrict__ dev_angvel_sorted, // in
+ const Float4* __restrict__ dev_vel, // in
+ const Float4* __restrict__ dev_angvel, // in
+ Float4* __restrict__ dev_force, // out
+ Float4* __restrict__ dev_torque, // out
+ Float* __restrict__ dev_es_dot, // out
+ Float* __restrict__ dev_ev_dot, // out
+ Float* __restrict__ dev_es, // out
+ Float* __restrict__ dev_ev, // out
+ Float* __restrict__ dev_p, // out
+ const Float4* __restrict__ dev_walls_nx, // in
+ Float4* __restrict__ dev_walls_mvfd, // in
+ Float* __restrict__ dev_walls_force_pp, // out
+ unsigned int* __restrict__ dev_contacts, // out
+ const Float4* __restrict__ dev_distmod, // in
+ Float4* __restrict__ dev_delta_t) // out
{
// Thread index equals index of particle A
unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
t@@ -407,11 +414,11 @@ __global__ void interact(
// Fetch world dimensions in constant memory read
Float3 origo = MAKE_FLOAT3(devC_grid.origo[0],
- devC_grid.origo[1],
- devC_grid.origo[2]);
+ devC_grid.origo[1],
+ devC_grid.origo[2]);
Float3 L = MAKE_FLOAT3(devC_grid.L[0],
- devC_grid.L[1],
- devC_grid.L[2]);
+ devC_grid.L[1],
+ devC_grid.L[2]);
// Fetch wall data in global read
Float4 w_0_nx, w_1_nx, w_2_nx, w_3_nx, w_4_nx;
t@@ -497,8 +504,8 @@ __global__ void interact(
// Inter-particle vector, corrected for periodic boundaries
x_ab = MAKE_FLOAT3(x_a.x - x_b.x + distmod.x,
- x_a.y - x_b.y + distmod.y,
- x_a.z - x_b.z + distmod.z);
+ x_a.y - x_b.y + distmod.y,
+ x_a.z - x_b.z + distmod.z);
x_ab_length = length(x_ab);
delta_n = x_ab_length - (radius_a + radius_b);
t@@ -507,28 +514,28 @@ __global__ void interact(
if (delta_n < 0.0) {
if (devC_params.contactmodel == 2) {
contactLinear(&F, &T, &es_dot, &ev_dot, &p,
- idx_a_orig,
- idx_b_orig,
- vel_a,
- dev_vel,
- angvel_a,
- dev_angvel,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_n, dev_delta_t,
- mempos);
+ idx_a_orig,
+ idx_b_orig,
+ vel_a,
+ dev_vel,
+ angvel_a,
+ dev_angvel,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_n, dev_delta_t,
+ mempos);
} else if (devC_params.contactmodel == 3) {
contactHertz(&F, &T, &es_dot, &ev_dot, &p,
- idx_a_orig,
- idx_b_orig,
- vel_a,
- dev_vel,
- angvel_a,
- dev_angvel,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_n, dev_delta_t,
- mempos);
+ idx_a_orig,
+ idx_b_orig,
+ vel_a,
+ dev_vel,
+ angvel_a,
+ dev_angvel,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_n, dev_delta_t,
+ mempos);
}
} else {
__syncthreads();
t@@ -556,11 +563,11 @@ __global__ void interact(
// Calculate address in grid from position
gridPos.x = floor((x_a.x - devC_grid.origo[0])
- / (devC_grid.L[0]/devC_grid.num[0]));
+ / (devC_grid.L[0]/devC_grid.num[0]));
gridPos.y = floor((x_a.y - devC_grid.origo[1])
- / (devC_grid.L[1]/devC_grid.num[1]));
+ / (devC_grid.L[1]/devC_grid.num[1]));
gridPos.z = floor((x_a.z - devC_grid.origo[2])
- / (devC_grid.L[2]/devC_grid.num[2]));
+ / (devC_grid.L[2]/devC_grid.num[2]));
// Find overlaps between particle no. idx and all particles
// from its own cell + 26 neighbor cells.
t@@ -570,12 +577,17 @@ __global__ void interact(
for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
- findAndProcessContactsInCell(targetPos, idx_a, x_a, ra…
- &F, &T, &es_dot, &ev_dot, &p,
- dev_x_sorted,
- dev_vel_sorted, dev_angvel_sorted,
- dev_cellStart, dev_cellEnd,
- dev_walls_nx, dev_walls_mvfd);
+ findAndProcessContactsInCell(targetPos, idx_a,
+ x_a, radius_a,
+ &F, &T, &es_dot,
+ &ev_dot, &p,
+ dev_x_sorted,
+ dev_vel_sorted,
+ dev_angvel_sorted,
+ dev_cellStart,
+ dev_cellEnd,
+ dev_walls_nx,
+ dev_walls_mvfd);
}
}
}
t@@ -596,8 +608,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_0_nx.x, w_0_nx.y, w_0_nx.z);
if (delta_w < 0.0f) {
w_0_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a,
- radius_a, dev_vel_sorted, dev_angvel_sorted, w_n, delta_w,
- w_0_mvfd.y);
+ radius_a, dev_vel_sorted, dev_angve…
+ w_0_mvfd.y);
}
// Lower wall (force on wall not stored)
t@@ -605,8 +617,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(0.0f, 0.0f, 1.0f);
if (delta_w < 0.0f) {
(void)contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a,
- radius_a, dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, 0.0f);
+ radius_a, dev_vel_sorted, dev_angvel_sort…
+ w_n, delta_w, 0.0f);
}
t@@ -617,8 +629,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_1_nx.x, w_1_nx.y, w_1_nx.z);
if (delta_w < 0.0f) {
w_1_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
- delta_w, w_1_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted…
+ delta_w, w_1_mvfd.y);
}
// Left wall (idx 2)
t@@ -626,8 +638,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_2_nx.x, w_2_nx.y, w_2_nx.z);
if (delta_w < 0.0f) {
w_2_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
- delta_w, w_2_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted…
+ delta_w, w_2_mvfd.y);
}
// Back wall (idx 3)
t@@ -635,8 +647,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z);
if (delta_w < 0.0f) {
w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
- delta_w, w_3_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted…
+ delta_w, w_3_mvfd.y);
}
// Front wall (idx 4)
t@@ -644,8 +656,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z);
if (delta_w < 0.0f) {
w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
- delta_w, w_4_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted…
+ delta_w, w_4_mvfd.y);
}
} else if (devC_grid.periodic == 2) { // right and left walls period…
t@@ -655,8 +667,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z);
if (delta_w < 0.0f) {
w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
- delta_w, w_3_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted…
+ delta_w, w_3_mvfd.y);
}
// Front wall (idx 4)
t@@ -664,8 +676,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z);
if (delta_w < 0.0f) {
w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_…
- delta_w, w_4_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted…
+ delta_w, w_4_mvfd.y);
}
}
diff --git a/src/integration.cuh b/src/integration.cuh
t@@ -11,15 +11,23 @@
// Second order integration scheme based on Taylor expansion of particle kinem…
// Kernel executed on device, and callable from host only.
-__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Inp…
- Float4* dev_angvel_sorted,
- Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output
- Float4* dev_force, Float4* dev_torque, Float4* dev_angpos, // Input
- Float4* dev_acc, Float4* dev_angacc,
- Float4* dev_vel0, Float4* dev_angvel0,
- Float4* dev_xyzsum,
- unsigned int* dev_gridParticleIndex, // Input: Sorted-Unsorted key
- unsigned int iter)
+__global__ void integrate(
+ const Float4* __restrict__ dev_x_sorted,
+ const Float4* __restrict__ dev_vel_sorted,
+ const Float4* __restrict__ dev_angvel_sorted,
+ Float4* __restrict__ dev_x,
+ Float4* __restrict__ dev_vel,
+ Float4* __restrict__ dev_angvel,
+ const Float4* __restrict__ dev_force,
+ const Float4* __restrict__ dev_torque,
+ Float4* __restrict__ dev_angpos,
+ Float4* __restrict__ dev_acc,
+ Float4* __restrict__ dev_angacc,
+ Float4* __restrict__ dev_vel0,
+ Float4* __restrict__ dev_angvel0,
+ Float4* __restrict__ dev_xyzsum,
+ const unsigned int* __restrict__ dev_gridParticleIndex,
+ const unsigned int iter)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
t@@ -59,13 +67,13 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* de…
// Coherent read from constant memory to registers
const Float dt = devC_dt;
const Float3 origo = MAKE_FLOAT3(
- devC_grid.origo[0],
- devC_grid.origo[1],
- devC_grid.origo[2]);
+ devC_grid.origo[0],
+ devC_grid.origo[1],
+ devC_grid.origo[2]);
const Float3 L = MAKE_FLOAT3(
- devC_grid.L[0],
- devC_grid.L[1],
- devC_grid.L[2]);
+ devC_grid.L[0],
+ devC_grid.L[1],
+ devC_grid.L[2]);
// Particle mass
Float m = 4.0/3.0 * PI * radius*radius*radius * devC_params.rho;
t@@ -179,14 +187,14 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* …
// Truncation error O(dt^4) for positions, O(dt^3) for velocities
// Approximate acceleration change by backwards difference:
const Float3 dacc_dt = MAKE_FLOAT3(
- (acc.x - acc0.x)/dt,
- (acc.y - acc0.y)/dt,
- (acc.z - acc0.z)/dt);
+ (acc.x - acc0.x)/dt,
+ (acc.y - acc0.y)/dt,
+ (acc.z - acc0.z)/dt);
const Float3 dangacc_dt = MAKE_FLOAT3(
- (angacc.x - angacc0.x)/dt,
- (angacc.y - angacc0.y)/dt,
- (angacc.z - angacc0.z)/dt);
+ (angacc.x - angacc0.x)/dt,
+ (angacc.y - angacc0.y)/dt,
+ (angacc.z - angacc0.z)/dt);
x_new.x = x.x + vel.x*dt + 0.5*acc.x*dt*dt + 1.0/6.0*dacc_dt.x*dt*dt*d…
x_new.y = x.y + vel.y*dt + 0.5*acc.y*dt*dt + 1.0/6.0*dacc_dt.y*dt*dt*d…
t@@ -253,7 +261,9 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* de…
// Reduce wall force contributions from particles to a single value per wall
-__global__ void summation(Float* in, Float *out)
+__global__ void summation(
+ const Float* __restrict__ in,
+ Float* __restrict__ out)
{
__shared__ Float cache[256];
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -287,14 +297,14 @@ __global__ void summation(Float* in, Float *out)
// Update wall positions
__global__ void integrateWalls(
- Float4* dev_walls_nx,
- Float4* dev_walls_mvfd,
- int* dev_walls_wmode,
- Float* dev_walls_force_partial,
- Float* dev_walls_acc,
- unsigned int blocksPerGrid,
- Float t_current,
- unsigned int iter)
+ Float4* __restrict__ dev_walls_nx,
+ Float4* __restrict__ dev_walls_mvfd,
+ const int* __restrict__ dev_walls_wmode,
+ const Float* __restrict__ dev_walls_force_partial,
+ Float* __restrict__ dev_walls_acc,
+ const unsigned int blocksPerGrid,
+ const Float t_current,
+ const unsigned int iter)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
diff --git a/src/navierstokes.cuh b/src/navierstokes.cuh
t@@ -25,34 +25,34 @@ __inline__ __device__ Float hmean(Float a, Float b) {
// Helper functions for checking whether a value is NaN or Inf
__device__ int checkFiniteFloat(
- const char* desc,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float s)
+ const char* desc,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float s)
{
- __syncthreads();
- if (!isfinite(s)) {
- printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s);
- return 1;
- }
- return 0;
+ __syncthreads();
+ if (!isfinite(s)) {
+ printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s);
+ return 1;
+ }
+ return 0;
}
__device__ int checkFiniteFloat3(
- const char* desc,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float3 v)
+ const char* desc,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float3 v)
{
- __syncthreads();
- if (!isfinite(v.x) || !isfinite(v.y) || !isfinite(v.z)) {
- printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n",
- x, y, z, desc, v.x, v.y, v.z);
- return 1;
- }
- return 0;
+ __syncthreads();
+ if (!isfinite(v.x) || !isfinite(v.y) || !isfinite(v.z)) {
+ printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n",
+ x, y, z, desc, v.x, v.y, v.z);
+ return 1;
+ }
+ return 0;
}
// Initialize memory
t@@ -143,10 +143,10 @@ void DEM::freeNSmemDev()
void DEM::transferNStoGlobalDeviceMemory(int statusmsg)
{
checkForCudaErrors("Before attempting cudaMemcpy in "
- "transferNStoGlobalDeviceMemory");
+ "transferNStoGlobalDeviceMemory");
//if (verbose == 1 && statusmsg == 1)
- //std::cout << " Transfering fluid data to the device: ";
+ //std::cout << " Transfering fluid data to the device: ";
// memory size for a scalar field
unsigned int memSizeF = sizeof(Float)*NScells();
t@@ -162,7 +162,7 @@ void DEM::transferNStoGlobalDeviceMemory(int statusmsg)
checkForCudaErrors("End of transferNStoGlobalDeviceMemory");
//if (verbose == 1 && statusmsg == 1)
- //std::cout << "Done" << std::endl;
+ //std::cout << "Done" << std::endl;
}
// Transfer from device
t@@ -194,7 +194,7 @@ void DEM::transferNSfromGlobalDeviceMemory(int statusmsg)
void DEM::transferNSnormFromGlobalDeviceMemory()
{
cudaMemcpy(ns.norm, dev_ns_norm, sizeof(Float)*NScells(),
- cudaMemcpyDeviceToHost);
+ cudaMemcpyDeviceToHost);
checkForCudaErrors("End of transferNSnormFromGlobalDeviceMemory");
}
t@@ -202,7 +202,7 @@ void DEM::transferNSnormFromGlobalDeviceMemory()
void DEM::transferNSepsilonFromGlobalDeviceMemory()
{
cudaMemcpy(ns.epsilon, dev_ns_epsilon, sizeof(Float)*NScells(),
- cudaMemcpyDeviceToHost);
+ cudaMemcpyDeviceToHost);
checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
}
t@@ -210,13 +210,13 @@ void DEM::transferNSepsilonFromGlobalDeviceMemory()
void DEM::transferNSepsilonNewFromGlobalDeviceMemory()
{
cudaMemcpy(ns.epsilon_new, dev_ns_epsilon_new, sizeof(Float)*NScells(),
- cudaMemcpyDeviceToHost);
+ cudaMemcpyDeviceToHost);
checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
}
// Get linear index from 3D grid position
__inline__ __device__ unsigned int idx(
- const int x, const int y, const int z)
+ const int x, const int y, const int z)
{
// without ghost nodes
//return x + dev_grid.num[0]*y + dev_grid.num[0]*dev_grid.num[1]*z;
t@@ -229,11 +229,11 @@ __inline__ __device__ unsigned int idx(
// Get linear index of velocity node from 3D grid position in staggered grid
__inline__ __device__ unsigned int vidx(
- const int x, const int y, const int z)
+ const int x, const int y, const int z)
{
// without ghost nodes
//return x + (devC_grid.num[0]+1)*y
- //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z;
+ //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z;
// with ghost nodes
// the ghost nodes are placed at x,y,z = -1 and WIDTH+1
t@@ -246,10 +246,10 @@ __inline__ __device__ unsigned int vidx(
// dev_ns_v or dev_ns_v_p array. This function does not set the averaged
// velocity values in the ghost node cells.
__global__ void findNSavgVel(
- Float3* dev_ns_v, // out
- Float* dev_ns_v_x, // in
- Float* dev_ns_v_y, // in
- Float* dev_ns_v_z) // in
+ Float3* __restrict__ dev_ns_v, // out
+ const Float* __restrict__ dev_ns_v_x, // in
+ const Float* __restrict__ dev_ns_v_y, // in
+ const Float* __restrict__ dev_ns_v_z) // in
{
// 3D thread index
t@@ -272,9 +272,9 @@ __global__ void findNSavgVel(
// Find average velocity using arithmetic means
const Float3 v_bar = MAKE_FLOAT3(
- amean(v_xn, v_xp),
- amean(v_yn, v_yp),
- amean(v_zn, v_zp));
+ amean(v_xn, v_xp),
+ amean(v_yn, v_yp),
+ amean(v_zn, v_zp));
// Save value
__syncthreads();
t@@ -287,10 +287,10 @@ __global__ void findNSavgVel(
// or dev_ns_v_p array. Make sure that the averaged velocity ghost nodes are s…
// beforehand.
__global__ void findNScellFaceVel(
- Float3* dev_ns_v, // in
- Float* dev_ns_v_x, // out
- Float* dev_ns_v_y, // out
- Float* dev_ns_v_z) // out
+ const Float3* __restrict__ dev_ns_v, // in
+ Float* __restrict__ dev_ns_v_x, // out
+ Float* __restrict__ dev_ns_v_y, // out
+ Float* __restrict__ dev_ns_v_z) // out
{
// 3D thread index
t@@ -340,7 +340,9 @@ __global__ void findNScellFaceVel(
// Set the initial guess of the values of epsilon.
-__global__ void setNSepsilonInterior(Float* dev_ns_epsilon, Float value)
+__global__ void setNSepsilonInterior(
+ Float* __restrict__ dev_ns_epsilon,
+ const Float value)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -349,7 +351,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon…
// check that we are not outside the fluid grid
if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
- z > 0 && z < devC_grid.num[2]-1) {
+ z > 0 && z < devC_grid.num[2]-1) {
__syncthreads();
const unsigned int cellidx = idx(x,y,z);
dev_ns_epsilon[cellidx] = value;
t@@ -358,7 +360,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon…
// The normalized residuals are given an initial value of 0, since the values …
// the Dirichlet boundaries aren't written during the iterations.
-__global__ void setNSnormZero(Float* dev_ns_norm)
+__global__ void setNSnormZero(Float* __restrict__ dev_ns_norm)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -380,9 +382,9 @@ __global__ void setNSnormZero(Float* dev_ns_norm)
// the Dirichlet boundary condition: the new value should be identical to the
// old value, i.e. the temporal gradient is 0
__global__ void setNSepsilonBottom(
- Float* dev_ns_epsilon,
- Float* dev_ns_epsilon_new,
- const Float value)
+ Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_epsilon_new,
+ const Float value)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -408,9 +410,9 @@ __global__ void setNSepsilonBottom(
// the Dirichlet boundary condition: the new value should be identical to the
// old value, i.e. the temporal gradient is 0
__global__ void setNSepsilonTop(
- Float* dev_ns_epsilon,
- Float* dev_ns_epsilon_new,
- const Float value)
+ Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_epsilon_new,
+ const Float value)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -419,7 +421,7 @@ __global__ void setNSepsilonTop(
// check that we are not outside the fluid grid, and at the upper z bounda…
if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
- z == devC_grid.num[2]-1) {
+ z == devC_grid.num[2]-1) {
__syncthreads();
const unsigned int cellidx = idx(x,y,z);
t@@ -428,11 +430,14 @@ __global__ void setNSepsilonTop(
}
}
__device__ void copyNSvalsDev(
- unsigned int read, unsigned int write,
- Float* dev_ns_p,
- Float3* dev_ns_v, Float3* dev_ns_v_p,
- Float* dev_ns_phi, Float* dev_ns_dphi,
- Float* dev_ns_epsilon)
+ const unsigned int read,
+ const unsigned int write,
+ Float* __restrict__ dev_ns_p,
+ Float3* __restrict__ dev_ns_v,
+ Float3* __restrict__ dev_ns_v_p,
+ Float* __restrict__ dev_ns_phi,
+ Float* __restrict__ dev_ns_dphi,
+ Float* __restrict__ dev_ns_epsilon)
{
// Coalesced read
const Float p = dev_ns_p[read];
t@@ -457,10 +462,12 @@ __device__ void copyNSvalsDev(
// are not written since they are not read. Launch this kernel for all cells in
// the grid
__global__ void setNSghostNodesDev(
- Float* dev_ns_p,
- Float3* dev_ns_v, Float3* dev_ns_v_p,
- Float* dev_ns_phi, Float* dev_ns_dphi,
- Float* dev_ns_epsilon)
+ Float* __restrict__ dev_ns_p,
+ Float3* __restrict__ dev_ns_v,
+ Float3* __restrict__ dev_ns_v_p,
+ Float* __restrict__ dev_ns_phi,
+ Float* __restrict__ dev_ns_dphi,
+ Float* __restrict__ dev_ns_epsilon)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -484,72 +491,72 @@ __global__ void setNSghostNodesDev(
if (x == 0) {
writeidx = idx(nx,y,z);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
if (x == nx-1) {
writeidx = idx(-1,y,z);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
if (y == 0) {
writeidx = idx(x,ny,z);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
if (y == ny-1) {
writeidx = idx(x,-1,z);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
// Z boundaries fixed
if (z == 0) {
writeidx = idx(x,y,-1);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
if (z == nz-1) {
writeidx = idx(x,y,nz);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
// Z boundaries periodic
/*if (z == 0) {
- writeidx = idx(x,y,nz);
- copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
- }
- if (z == nz-1) {
- writeidx = idx(x,y,-1);
- copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
- }*/
+ writeidx = idx(x,y,nz);
+ copyNSvalsDev(cellidx, writeidx,
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
+ }
+ if (z == nz-1) {
+ writeidx = idx(x,y,-1);
+ copyNSvalsDev(cellidx, writeidx,
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
+ }*/
}
}
t@@ -557,7 +564,7 @@ __global__ void setNSghostNodesDev(
// (diagonal) cells are not written since they are not read. Launch this kernel
// for all cells in the grid usind setNSghostNodes<datatype><<<.. , ..>>>( .. …
template<typename T>
-__global__ void setNSghostNodes(T* dev_scalarfield)
+__global__ void setNSghostNodes(T* __restrict__ dev_scalarfield)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -586,10 +593,10 @@ __global__ void setNSghostNodes(T* dev_scalarfield)
if (z == 0)
dev_scalarfield[idx(x,y,-1)] = val; // Dirichlet
- //dev_scalarfield[idx(x,y,nz)] = val; // Periodic -z
+ //dev_scalarfield[idx(x,y,nz)] = val; // Periodic -z
if (z == nz-1)
dev_scalarfield[idx(x,y,nz)] = val; // Dirichlet
- //dev_scalarfield[idx(x,y,-1)] = val; // Periodic +z
+ //dev_scalarfield[idx(x,y,-1)] = val; // Periodic +z
}
}
t@@ -597,9 +604,9 @@ __global__ void setNSghostNodes(T* dev_scalarfield)
// (diagonal) cells are not written since they are not read.
template<typename T>
__global__ void setNSghostNodes(
- T* dev_scalarfield,
- int bc_bot,
- int bc_top)
+ T* __restrict__ dev_scalarfield,
+ const int bc_bot,
+ const int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -651,11 +658,11 @@ __global__ void setNSghostNodes(
// According to Griebel et al. 1998 "Numerical Simulation in Fluid Dynamics"
template<typename T>
__global__ void setNSghostNodesFace(
- T* dev_scalarfield_x,
- T* dev_scalarfield_y,
- T* dev_scalarfield_z,
- int bc_bot,
- int bc_top)
+ T* __restrict__ dev_scalarfield_x,
+ T* __restrict__ dev_scalarfield_y,
+ T* __restrict__ dev_scalarfield_z,
+ const int bc_bot,
+ const int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -849,9 +856,9 @@ __global__ void setNSghostNodesFace(
// The edge (diagonal) cells are not written since they are not read. Launch
// this kernel for all cells in the grid.
__global__ void setNSghostNodes_tau(
- Float* dev_ns_tau,
- int bc_bot,
- int bc_top)
+ Float* __restrict__ dev_ns_tau,
+ const int bc_bot,
+ const int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -981,108 +988,108 @@ __global__ void setNSghostNodes_tau(
// The edge (diagonal) cells are not written since they are not read. Launch
// this kernel for all cells in the grid.
/*
-__global__ void setNSghostNodesForcing(
- Float* dev_ns_f1,
- Float3* dev_ns_f2,
- Float* dev_ns_f,
- unsigned int nijac)
-
-{
- // 3D thread index
- const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
- const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
- const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
-
- // Grid dimensions
- const unsigned int nx = devC_grid.num[0];
- const unsigned int ny = devC_grid.num[1];
- const unsigned int nz = devC_grid.num[2];
-
- // 1D thread index
- unsigned int cellidx = idx(x,y,z);
-
- // check that we are not outside the fluid grid
- if (x < nx && y < ny && z < nz) {
-
- __syncthreads();
- const Float f = dev_ns_f[cellidx];
- Float f1;
- Float3 f2;
-
- if (nijac == 0) {
- __syncthreads();
- f1 = dev_ns_f1[cellidx];
- f2 = dev_ns_f2[cellidx];
- }
-
- if (x == 0) {
- cellidx = idx(nx,y,z);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
- if (x == nx-1) {
- cellidx = idx(-1,y,z);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
-
- if (y == 0) {
- cellidx = idx(x,ny,z);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
- if (y == ny-1) {
- cellidx = idx(x,-1,z);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
-
- if (z == 0) {
- cellidx = idx(x,y,nz);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
- if (z == nz-1) {
- cellidx = idx(x,y,-1);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
- }
-}
+ __global__ void setNSghostNodesForcing(
+ Float* dev_ns_f1,
+ Float3* dev_ns_f2,
+ Float* dev_ns_f,
+ unsigned int nijac)
+
+ {
+ // 3D thread index
+ const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+ const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+ const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+
+ // Grid dimensions
+ const unsigned int nx = devC_grid.num[0];
+ const unsigned int ny = devC_grid.num[1];
+ const unsigned int nz = devC_grid.num[2];
+
+ // 1D thread index
+ unsigned int cellidx = idx(x,y,z);
+
+ // check that we are not outside the fluid grid
+ if (x < nx && y < ny && z < nz) {
+
+ __syncthreads();
+ const Float f = dev_ns_f[cellidx];
+ Float f1;
+ Float3 f2;
+
+ if (nijac == 0) {
+ __syncthreads();
+ f1 = dev_ns_f1[cellidx];
+ f2 = dev_ns_f2[cellidx];
+ }
+
+ if (x == 0) {
+ cellidx = idx(nx,y,z);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+ if (x == nx-1) {
+ cellidx = idx(-1,y,z);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+
+ if (y == 0) {
+ cellidx = idx(x,ny,z);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+ if (y == ny-1) {
+ cellidx = idx(x,-1,z);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+
+ if (z == 0) {
+ cellidx = idx(x,y,nz);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+ if (z == nz-1) {
+ cellidx = idx(x,y,-1);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+ }
+ }
*/
// Find the porosity in each cell on the base of a sphere, centered at the cell
// center.
__global__ void findPorositiesVelocitiesDiametersSpherical(
- const unsigned int* dev_cellStart,
- const unsigned int* dev_cellEnd,
- const Float4* dev_x_sorted,
- const Float4* dev_vel_sorted,
- Float* dev_ns_phi,
- Float* dev_ns_dphi,
- Float3* dev_ns_vp_avg,
- Float* dev_ns_d_avg,
- const unsigned int iteration,
- const unsigned int np,
- const Float c_phi)
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const Float4* __restrict__ dev_x_sorted,
+ const Float4* __restrict__ dev_vel_sorted,
+ Float* __restrict__ dev_ns_phi,
+ Float* __restrict__ dev_ns_dphi,
+ Float3* __restrict__ dev_ns_vp_avg,
+ Float* __restrict__ dev_ns_d_avg,
+ const unsigned int iteration,
+ const unsigned int np,
+ const Float c_phi)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1114,9 +1121,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
// Cell sphere center position
const Float3 X = MAKE_FLOAT3(
- x*dx + 0.5*dx,
- y*dy + 0.5*dy,
- z*dz + 0.5*dz);
+ x*dx + 0.5*dx,
+ y*dy + 0.5*dy,
+ z*dz + 0.5*dz);
Float d, r;
Float phi = 1.00;
t@@ -1143,12 +1150,12 @@ __global__ void findPorositiesVelocitiesDiametersSpher…
// Iterate over 27 neighbor cells, R = cell width
/*for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
- for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
- for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/
+ for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
+ for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/
// Iterate over 27 neighbor cells, R = 2*cell width
for (int z_dim=-2; z_dim<3; ++z_dim) { // z-axis
- //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
+ //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
for (int y_dim=-2; y_dim<3; ++y_dim) { // y-axis
for (int x_dim=-2; x_dim<3; ++x_dim) { // x-axis
t@@ -1186,9 +1193,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
// Find center distance
dist = MAKE_FLOAT3(
- X.x - xr.x,
- X.y - xr.y,
- X.z - xr.z);
+ X.x - xr.x,
+ X.y - xr.y,
+ X.z - xr.z);
dist += distmod;
d = length(dist);
t@@ -1196,10 +1203,10 @@ __global__ void findPorositiesVelocitiesDiametersSpher…
if ((R - r) < d && d < (R + r)) {
void_volume -=
1.0/(12.0*d) * (
- M_PI*(R + r - d)*(R + r - …
- *(d*d + 2.0*d*r - 3.0*r*r
- + 2.0*d*R + 6.0*r*R
- - 3.0*R*R) );
+ M_PI*(R + r - d)*(R + r - d)
+ *(d*d + 2.0*d*r - 3.0*r*r
+ + 2.0*d*R + 6.0*r*R
+ - 3.0*R*R) );
v_avg += MAKE_FLOAT3(v.x, v.y, v.z);
d_avg += 2.0*r;
n++;
t@@ -1259,8 +1266,8 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
//Float phi = 0.5;
//Float dphi = 0.0;
//if (iteration == 20 && x == nx/2 && y == ny/2 && z == nz/2) {
- //phi = 0.4;
- //dphi = 0.1;
+ //phi = 0.4;
+ //dphi = 0.1;
//}
//dev_ns_phi[cellidx] = phi;
//dev_ns_dphi[cellidx] = dphi;
t@@ -1276,17 +1283,17 @@ __global__ void findPorositiesVelocitiesDiametersSpher…
// Find the porosity in each cell on the base of a sphere, centered at the cell
// center.
__global__ void findPorositiesVelocitiesDiametersSphericalGradient(
- const unsigned int* dev_cellStart,
- const unsigned int* dev_cellEnd,
- const Float4* dev_x_sorted,
- const Float4* dev_vel_sorted,
- Float* dev_ns_phi,
- Float* dev_ns_dphi,
- Float3* dev_ns_vp_avg,
- Float* dev_ns_d_avg,
- const unsigned int iteration,
- const unsigned int ndem,
- const unsigned int np)
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const Float4* __restrict__ dev_x_sorted,
+ const Float4* __restrict__ dev_vel_sorted,
+ Float* __restrict__ dev_ns_phi,
+ Float* __restrict__ dev_ns_dphi,
+ Float3* __restrict__ dev_ns_vp_avg,
+ Float* __restrict__ dev_ns_d_avg,
+ const unsigned int iteration,
+ const unsigned int ndem,
+ const unsigned int np)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1304,7 +1311,7 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
const Float dz = devC_grid.L[2]/nz;
// Cell sphere radius
- const Float R = fmin(dx, fmin(dy,dz)); // diameter = 2*cell width
+ const Float R = fmin(dx, fmin(dy,dz)); // diameter = 2*cell width
Float4 xr; // particle pos. and radius
t@@ -1315,9 +1322,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
// Cell sphere center position
const Float3 X = MAKE_FLOAT3(
- x*dx + 0.5*dx,
- y*dy + 0.5*dy,
- z*dz + 0.5*dz);
+ x*dx + 0.5*dx,
+ y*dy + 0.5*dy,
+ z*dz + 0.5*dz);
Float d, r;
Float phi = 1.00;
t@@ -1396,9 +1403,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
// Find center distance and normal vector
x_p = MAKE_FLOAT3(
- xr.x - X.x,
- xr.y - X.y,
- xr.z - X.z);
+ xr.x - X.x,
+ xr.y - X.y,
+ xr.z - X.z);
d = length(x_p);
n_p = x_p/d;
q = d/R;
t@@ -1438,8 +1445,8 @@ __global__ void findPorositiesVelocitiesDiametersSpheric…
phi = phi_0 + dphi/(ndem*devC_dt);
//if (dot_epsilon_kk != 0.0)
- //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n…
- //x,y,z, dot_epsilon_kk, dphi, phi);
+ //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n",
+ //x,y,z, dot_epsilon_kk, dphi, phi);
// Make sure that the porosity is in the interval [0.0;1.0]
phi = fmin(1.00, fmax(0.00, phi));
t@@ -1484,11 +1491,11 @@ __global__ void findPorositiesVelocitiesDiametersSpher…
// Modulate the hydraulic pressure at the upper boundary
__global__ void setUpperPressureNS(
- Float* dev_ns_p,
- Float* dev_ns_epsilon,
- Float* dev_ns_epsilon_new,
- Float beta,
- const Float new_pressure)
+ Float* __restrict__ dev_ns_p,
+ Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_epsilon_new,
+ const Float beta,
+ const Float new_pressure)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1497,8 +1504,8 @@ __global__ void setUpperPressureNS(
// check that the thread is located at the top boundary
if (x < devC_grid.num[0] &&
- y < devC_grid.num[1] &&
- z == devC_grid.num[2]-1) {
+ y < devC_grid.num[1] &&
+ z == devC_grid.num[2]-1) {
const unsigned int cellidx = idx(x,y,z);
t@@ -1524,13 +1531,13 @@ __global__ void setUpperPressureNS(
// Find the gradient in a cell in a homogeneous, cubic 3D scalar field using
// finite central differences
__device__ Float3 gradient(
- const Float* dev_scalarfield,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float dx,
- const Float dy,
- const Float dz)
+ const Float* __restrict__ dev_scalarfield,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float dx,
+ const Float dy,
+ const Float dz)
{
// Read 6 neighbor cells
__syncthreads();
t@@ -1544,26 +1551,26 @@ __device__ Float3 gradient(
//__syncthreads();
//if (p != 0.0)
- //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p);
+ //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p);
// Calculate central-difference gradients
return MAKE_FLOAT3(
- (xp - xn)/(2.0*dx),
- (yp - yn)/(2.0*dy),
- (zp - zn)/(2.0*dz));
+ (xp - xn)/(2.0*dx),
+ (yp - yn)/(2.0*dy),
+ (zp - zn)/(2.0*dz));
}
// Find the divergence in a cell in a homogeneous, cubic, 3D vector field
__device__ Float divergence(
- const Float* dev_vectorfield_x,
- const Float* dev_vectorfield_y,
- const Float* dev_vectorfield_z,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float dx,
- const Float dy,
- const Float dz)
+ const Float* __restrict__ dev_vectorfield_x,
+ const Float* __restrict__ dev_vectorfield_y,
+ const Float* __restrict__ dev_vectorfield_z,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float dx,
+ const Float dy,
+ const Float dz)
{
// Read 6 cell-face values
__syncthreads();
t@@ -1583,13 +1590,13 @@ __device__ Float divergence(
// Find the divergence of a tensor field
__device__ Float3 divergence_tensor(
- Float* dev_tensorfield,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float dx,
- const Float dy,
- const Float dz)
+ const Float* __restrict__ dev_tensorfield,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float dx,
+ const Float dy,
+ const Float dz)
{
__syncthreads();
t@@ -1638,18 +1645,18 @@ __device__ Float3 divergence_tensor(
// Calculate div(phi*tau)
const Float3 div_tensor = MAKE_FLOAT3(
- // x
- (t_xx_xp - t_xx_xn)/dx +
- (t_xy_yp - t_xy_yn)/dy +
- (t_xz_zp - t_xz_zn)/dz,
- // y
- (t_xy_xp - t_xy_xn)/dx +
- (t_yy_yp - t_yy_yn)/dy +
- (t_yz_zp - t_yz_zn)/dz,
- // z
- (t_xz_xp - t_xz_xn)/dx +
- (t_yz_yp - t_yz_yn)/dy +
- (t_zz_zp - t_zz_zn)/dz);
+ // x
+ (t_xx_xp - t_xx_xn)/dx +
+ (t_xy_yp - t_xy_yn)/dy +
+ (t_xz_zp - t_xz_zn)/dz,
+ // y
+ (t_xy_xp - t_xy_xn)/dx +
+ (t_yy_yp - t_yy_yn)/dy +
+ (t_yz_zp - t_yz_zn)/dz,
+ // z
+ (t_xz_xp - t_xz_xn)/dx +
+ (t_yz_yp - t_yz_yn)/dy +
+ (t_zz_zp - t_zz_zn)/dz);
#ifdef CHECK_NS_FINITE
(void)checkFiniteFloat3("div_tensor", x, y, z, div_tensor);
t@@ -1661,8 +1668,8 @@ __device__ Float3 divergence_tensor(
// Find the spatial gradient in e.g. pressures per cell
// using first order central differences
__global__ void findNSgradientsDev(
- Float* dev_scalarfield, // in
- Float3* dev_vectorfield) // out
+ const Float* __restrict__ dev_scalarfield, // in
+ Float3* __restrict__ dev_vectorfield) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1699,8 +1706,8 @@ __global__ void findNSgradientsDev(
// Find the outer product of v v
__global__ void findvvOuterProdNS(
- Float3* dev_ns_v, // in
- Float* dev_ns_v_prod) // out
+ const Float3* __restrict__ dev_ns_v, // in
+ Float* __restrict__ dev_ns_v_prod) // out
{
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
t@@ -1750,8 +1757,8 @@ __global__ void findvvOuterProdNS(
// Find the fluid stress tensor. It is symmetrical, and can thus be saved in 6
// values in 3D.
__global__ void findNSstressTensor(
- Float3* dev_ns_v, // in
- Float* dev_ns_tau) // out
+ const Float3* __restrict__ dev_ns_v, // in
+ Float* __restrict__ dev_ns_tau) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1809,15 +1816,15 @@ __global__ void findNSstressTensor(
devC_params.mu*((zp.y - zn.y)/(2.0*dz) + (yp.z - yn.z)/(2.0*dy));
/*
- if (x == 0 && y == 0 && z == 0)
- printf("mu = %f\n", mu);
- if (tau_xz > 1.0e-6)
- printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx);
- if (tau_yz > 1.0e-6)
- printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy);
- if (tau_zz > 1.0e-6)
- printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz);
- */
+ if (x == 0 && y == 0 && z == 0)
+ printf("mu = %f\n", mu);
+ if (tau_xz > 1.0e-6)
+ printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx);
+ if (tau_yz > 1.0e-6)
+ printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy);
+ if (tau_zz > 1.0e-6)
+ printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz);
+ */
// Store values in global memory
__syncthreads();
t@@ -1842,9 +1849,9 @@ __global__ void findNSstressTensor(
// Find the divergence of phi*v*v
__global__ void findNSdivphiviv(
- Float* dev_ns_phi, // in
- Float3* dev_ns_v, // in
- Float3* dev_ns_div_phi_vi_v) // out
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float3* __restrict__ dev_ns_v, // in
+ Float3* __restrict__ dev_ns_div_phi_vi_v) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1888,58 +1895,58 @@ __global__ void findNSdivphiviv(
// Calculate upwind coefficients
//*
const Float3 a = MAKE_FLOAT3(
- copysign(1.0, v.x),
- copysign(1.0, v.y),
- copysign(1.0, v.z));
+ copysign(1.0, v.x),
+ copysign(1.0, v.y),
+ copysign(1.0, v.z));
// Calculate the divergence based on the upwind differences (Griebel et
// al. 1998, eq. 3.9)
const Float3 div_uw = MAKE_FLOAT3(
- // x
- ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) +
- (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) +
+ // x
+ ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) +
+ (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) +
- ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) +
- (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) +
+ ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) +
+ (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) +
- ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) +
- (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz),
+ ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) +
+ (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz),
- // y
- ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) +
- (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) +
+ // y
+ ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) +
+ (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) +
- ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) +
- (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) +
+ ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) +
+ (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) +
- ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) +
- (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz),
+ ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) +
+ (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz),
- // z
- ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) +
- (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) +
+ // z
+ ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) +
+ (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) +
- ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) +
- (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) +
+ ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) +
+ (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) +
- ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) +
- (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz));
+ ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) +
+ (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz));
// Calculate the divergence based on the central-difference gradients
const Float3 div_cd = MAKE_FLOAT3(
- // x
- (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
- // y
- (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
- // z
- (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
+ // x
+ (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
+ // y
+ (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
+ // z
+ (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
// Weighting parameter
const Float tau = 0.5;
t@@ -1951,26 +1958,26 @@ __global__ void findNSdivphiviv(
/*
// Calculate the divergence: div(phi*v_i*v)
const Float3 div_phi_vi_v = MAKE_FLOAT3(
- // x
- (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
- // y
- (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
- // z
- (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
- // */
+ // x
+ (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
+ // y
+ (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
+ // z
+ (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
+ // */
// Write divergence
__syncthreads();
dev_ns_div_phi_vi_v[cellidx] = div_phi_vi_v;
//printf("div(phi*v*v) [%d,%d,%d] = %f, %f, %f\n", x,y,z,
- //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z);
+ //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z);
#ifdef CHECK_NS_FINITE
(void)checkFiniteFloat3("div_phi_vi_v", x, y, z, div_phi_vi_v);
t@@ -1979,8 +1986,8 @@ __global__ void findNSdivphiviv(
}
__global__ void findNSdivtau(
- Float* dev_ns_tau, // in
- Float3* dev_ns_div_tau) // out
+ const Float* __restrict__ dev_ns_tau, // in
+ Float3* __restrict__ dev_ns_div_tau) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2015,9 +2022,9 @@ __global__ void findNSdivtau(
// Find the divergence of phi*tau
__global__ void findNSdivphitau(
- Float* dev_ns_phi, // in
- Float* dev_ns_tau, // in
- Float3* dev_ns_div_phi_tau) // out
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_tau, // in
+ Float3* __restrict__ dev_ns_div_phi_tau) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2094,18 +2101,18 @@ __global__ void findNSdivphitau(
// Calculate div(phi*tau)
const Float3 div_phi_tau = MAKE_FLOAT3(
- // x
- (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx +
- (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy +
- (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz,
- // y
- (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx +
- (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy +
- (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz,
- // z
- (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx +
- (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy +
- (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz);
+ // x
+ (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx +
+ (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy +
+ (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz,
+ // y
+ (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx +
+ (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy +
+ (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz,
+ // z
+ (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx +
+ (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy +
+ (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz);
// Write divergence
__syncthreads();
t@@ -2120,9 +2127,9 @@ __global__ void findNSdivphitau(
// Find the divergence of phi v v
// Unused
__global__ void findNSdivphivv(
- Float* dev_ns_v_prod, // in
- Float* dev_ns_phi, // in
- Float3* dev_ns_div_phi_v_v) // out
+ const Float* __restrict__ dev_ns_v_prod, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ Float3* __restrict__ dev_ns_div_phi_v_v) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2174,24 +2181,24 @@ __global__ void findNSdivphivv(
// The symmetry described in findvvOuterProdNS is used
__syncthreads();
const Float3 div = MAKE_FLOAT3(
- ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp
- - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) +
- (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp
- - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) +
- (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp
- - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)),
- ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp
- - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) +
- (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp
- - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) +
- (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp
- - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)),
- ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp
- - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) +
- (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp
- - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) +
- (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp
- - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) );
+ ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp
+ - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) +
+ (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp
+ - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) +
+ (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp
+ - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)),
+ ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp
+ - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) +
+ (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp
+ - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) +
+ (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp
+ - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)),
+ ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp
+ - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) +
+ (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp
+ - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) +
+ (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp
+ - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) );
//printf("div[%d,%d,%d] = %f\t%f\t%f\n", x, y, z, div.x, div.y, div.z);
t@@ -2209,25 +2216,25 @@ __global__ void findNSdivphivv(
// Find predicted fluid velocity
// Launch per face.
__global__ void findPredNSvelocities(
- Float* dev_ns_p, // in
- Float* dev_ns_v_x, // in
- Float* dev_ns_v_y, // in
- Float* dev_ns_v_z, // in
- Float* dev_ns_phi, // in
- Float* dev_ns_dphi, // in
- Float* dev_ns_div_tau_x, // in
- Float* dev_ns_div_tau_y, // in
- Float* dev_ns_div_tau_z, // in
- Float3* dev_ns_div_phi_vi_v, // in
- int bc_bot, // in
- int bc_top, // in
- Float beta, // in
- Float3* dev_ns_F_pf, // in
- unsigned int ndem, // in
- Float c_grad_p, // in
- Float* dev_ns_v_p_x, // out
- Float* dev_ns_v_p_y, // out
- Float* dev_ns_v_p_z) // out
+ const Float* __restrict__ dev_ns_p, // in
+ const Float* __restrict__ dev_ns_v_x, // in
+ const Float* __restrict__ dev_ns_v_y, // in
+ const Float* __restrict__ dev_ns_v_z, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_dphi, // in
+ const Float* __restrict__ dev_ns_div_tau_x, // in
+ const Float* __restrict__ dev_ns_div_tau_y, // in
+ const Float* __restrict__ dev_ns_div_tau_z, // in
+ const Float3* __restrict__ dev_ns_div_phi_vi_v, // in
+ const int bc_bot, // in
+ const int bc_top, // in
+ const Float beta, // in
+ const Float3* __restrict__ dev_ns_F_pf, // in
+ const unsigned int ndem, // in
+ const Float __restrict__ c_grad_p, // in
+ Float* __restrict__ dev_ns_v_p_x, // out
+ Float* __restrict__ dev_ns_v_p_y, // out
+ Float* __restrict__ dev_ns_v_p_z) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2255,9 +2262,9 @@ __global__ void findPredNSvelocities(
// Values that are needed for calculating the predicted velocity
__syncthreads();
const Float3 v = MAKE_FLOAT3(
- dev_ns_v_x[fidx],
- dev_ns_v_y[fidx],
- dev_ns_v_z[fidx]);
+ dev_ns_v_x[fidx],
+ dev_ns_v_y[fidx],
+ dev_ns_v_z[fidx]);
Float3 div_tau = MAKE_FLOAT3(0.0, 0.0, 0.0);
if (devC_params.mu > 0.0) {
t@@ -2285,13 +2292,13 @@ __global__ void findPredNSvelocities(
// component-wise average values
const Float3 phi = MAKE_FLOAT3(
- amean(phi_c, phi_xn),
- amean(phi_c, phi_yn),
- amean(phi_c, phi_zn));
+ amean(phi_c, phi_xn),
+ amean(phi_c, phi_yn),
+ amean(phi_c, phi_zn));
const Float3 dphi = MAKE_FLOAT3(
- amean(dphi_c, dphi_xn),
- amean(dphi_c, dphi_yn),
- amean(dphi_c, dphi_zn));
+ amean(dphi_c, dphi_xn),
+ amean(dphi_c, dphi_yn),
+ amean(dphi_c, dphi_zn));
// The particle-fluid interaction force should only be incoorporated if
// there is a fluid viscosity
t@@ -2308,9 +2315,9 @@ __global__ void findPredNSvelocities(
f_i_zn = MAKE_FLOAT3(0.0, 0.0, 0.0);
}
const Float3 f_i = MAKE_FLOAT3(
- amean(f_i_c.x, f_i_xn.x),
- amean(f_i_c.y, f_i_yn.y),
- amean(f_i_c.z, f_i_zn.z));
+ amean(f_i_c.x, f_i_xn.x),
+ amean(f_i_c.y, f_i_yn.y),
+ amean(f_i_c.z, f_i_zn.z));
const Float dt = ndem*devC_dt;
const Float rho = devC_params.rho_f;
t@@ -2326,9 +2333,9 @@ __global__ void findPredNSvelocities(
const Float p_yn = dev_ns_p[idx(x,y-1,z)];
const Float p_zn = dev_ns_p[idx(x,y,z-1)];
const Float3 grad_p = MAKE_FLOAT3(
- (p - p_xn)/dx,
- (p - p_yn)/dy,
- (p - p_zn)/dz) * c_grad_p;
+ (p - p_xn)/dx,
+ (p - p_yn)/dy,
+ (p - p_zn)/dz) * c_grad_p;
#ifdef SET_1
pressure_term = -beta*dt/(rho*phi)*grad_p;
#endif
t@@ -2338,16 +2345,16 @@ __global__ void findPredNSvelocities(
}
const Float3 div_phi_vi_v = MAKE_FLOAT3(
- amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x),
- amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y),
- amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z));
+ amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x),
+ amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y),
+ amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z));
// Determine the predicted velocity
#ifdef SET_1
const Float3 interaction_term = -dt/(rho*phi)*f_i;
const Float3 diffusion_term = dt/(rho*phi)*div_tau;
const Float3 gravity_term = MAKE_FLOAT3(
- devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
+ devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
const Float3 porosity_term = -1.0*v*dphi/phi;
const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
#endif
t@@ -2355,7 +2362,7 @@ __global__ void findPredNSvelocities(
const Float3 interaction_term = -dt/(rho*phi)*f_i;
const Float3 diffusion_term = dt/rho*div_tau;
const Float3 gravity_term = MAKE_FLOAT3(
- devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
+ devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
const Float3 porosity_term = -1.0*v*dphi/phi;
const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
#endif
t@@ -2376,31 +2383,31 @@ __global__ void findPredNSvelocities(
// No slip
/*if ((z == 0 && bc_bot == 2) || (z == nz-1 && bc_top == 2)) {
- v_p.x = 0.0;
- v_p.y = 0.0;
- v_p.z = 0.0;
- }*/
+ v_p.x = 0.0;
+ v_p.y = 0.0;
+ v_p.z = 0.0;
+ }*/
#ifdef REPORT_V_P_COMPONENTS
// Report velocity components to stdout for debugging
if (z==0)
- printf("\n[%d,%d,%d]"
- "\tv_p = %+e %+e %+e\n"
- "\tpres = %+e %+e %+e\n"
- "\tinteract = %+e %+e %+e\n"
- "\tdiff = %+e %+e %+e\n"
- "\tgrav = %+e %+e %+e\n"
- "\tporos = %+e %+e %+e\n"
- "\tadv = %+e %+e %+e\n",
- x, y, z,
- v_p.x, v_p.y, v_p.z,
- pressure_term.x, pressure_term.y, pressure_term.z,
- interaction_term.x, interaction_term.y, interaction_term.z,
- diffusion_term.x, diffusion_term.y, diffusion_term.z,
- gravity_term.x, gravity_term.y, gravity_term.z,
- porosity_term.x, porosity_term.y, porosity_term.z,
- advection_term.x, advection_term.y, advection_term.z);
+ printf("\n[%d,%d,%d]"
+ "\tv_p = %+e %+e %+e\n"
+ "\tpres = %+e %+e %+e\n"
+ "\tinteract = %+e %+e %+e\n"
+ "\tdiff = %+e %+e %+e\n"
+ "\tgrav = %+e %+e %+e\n"
+ "\tporos = %+e %+e %+e\n"
+ "\tadv = %+e %+e %+e\n",
+ x, y, z,
+ v_p.x, v_p.y, v_p.z,
+ pressure_term.x, pressure_term.y, pressure_term.z,
+ interaction_term.x, interaction_term.y, interaction_term.z,
+ diffusion_term.x, diffusion_term.y, diffusion_term.z,
+ gravity_term.x, gravity_term.y, gravity_term.z,
+ porosity_term.x, porosity_term.y, porosity_term.z,
+ advection_term.x, advection_term.y, advection_term.z);
#endif
// Save the predicted velocity
t@@ -2421,18 +2428,18 @@ __global__ void findPredNSvelocities(
// At each iteration, the value of the forcing function is found as:
// f = f1 - f2 dot grad(epsilon)
__global__ void findNSforcing(
- Float* dev_ns_epsilon, // in
- Float* dev_ns_phi, // in
- Float* dev_ns_dphi, // in
- Float3* dev_ns_v_p, // in
- Float* dev_ns_v_p_x, // in
- Float* dev_ns_v_p_y, // in
- Float* dev_ns_v_p_z, // in
- unsigned int nijac, // in
- unsigned int ndem, // in
- Float* dev_ns_f1, // out
- Float3* dev_ns_f2, // out
- Float* dev_ns_f) // out
+ const Float* __restrict__ dev_ns_epsilon, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_dphi, // in
+ const Float3* __restrict__ dev_ns_v_p, // in
+ const Float* __restrict__ dev_ns_v_p_x, // in
+ const Float* __restrict__ dev_ns_v_p_y, // in
+ const Float* __restrict__ dev_ns_v_p_z, // in
+ const unsigned int nijac, // in
+ const unsigned int ndem, // in
+ Float* __restrict__ dev_ns_f1, // out
+ Float3* __restrict__ dev_ns_f2, // out
+ Float* __restrict__ dev_ns_f) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2472,7 +2479,7 @@ __global__ void findNSforcing(
// Calculate derivatives
const Float div_v_p
= divergence(dev_ns_v_p_x, dev_ns_v_p_y, dev_ns_v_p_z,
- x, y, z, dx, dy, dz);
+ x, y, z, dx, dy, dz);
const Float3 grad_phi
= gradient(dev_ns_phi, x, y, z, dx, dy, dz);
t@@ -2497,7 +2504,7 @@ __global__ void findNSforcing(
#ifdef REPORT_FORCING_TERMS
// Report values terms in the forcing function for debugging
printf("[%d,%d,%d]\tt1 = %f\tt2 = %f\tt4 = %f\n",
- x,y,z, t1, t2, t4);
+ x,y,z, t1, t2, t4);
#endif
// Save values
t@@ -2524,7 +2531,7 @@ __global__ void findNSforcing(
const Float t3 = -dot(f2, grad_epsilon);
if (z >= nz-3)
printf("[%d,%d,%d]\tf = %f\tf1 = %f\tt3 = %f\n",
- x,y,z, f, f1, t3);
+ x,y,z, f, f1, t3);
#endif
// Save forcing function value
t@@ -2542,10 +2549,10 @@ __global__ void findNSforcing(
// non-smoothed and smoothed values.
template<typename T>
__global__ void smoothing(
- T* dev_arr,
- const Float gamma,
- const unsigned int bc_bot,
- const unsigned int bc_top)
+ T* __restrict__ dev_arr,
+ const Float gamma,
+ const unsigned int bc_bot,
+ const unsigned int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2602,13 +2609,13 @@ __global__ void smoothing(
// Perform a single Jacobi iteration
__global__ void jacobiIterationNS(
- const Float* dev_ns_epsilon,
- Float* dev_ns_epsilon_new,
- Float* dev_ns_norm,
- const Float* dev_ns_f,
- const int bc_bot,
- const int bc_top,
- const Float theta)
+ const Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_epsilon_new,
+ Float* __restrict__ dev_ns_norm,
+ const Float* __restrict__ dev_ns_f,
+ const int bc_bot,
+ const int bc_top,
+ const Float theta)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2672,9 +2679,9 @@ __global__ void jacobiIterationNS(
const Float dzdz = dz*dz;
Float e_new
= (-dxdx*dydy*dzdz*f
- + dydy*dzdz*(e_xn + e_xp)
- + dxdx*dzdz*(e_yn + e_yp)
- + dxdx*dydy*(e_zn + e_zp))
+ + dydy*dzdz*(e_xn + e_xp)
+ + dxdx*dzdz*(e_yn + e_yp)
+ + dxdx*dydy*(e_zn + e_zp))
/(2.0*(dxdx*dydy + dxdx*dzdz + dydy*dzdz));
// New value of epsilon in 1D update
t@@ -2682,7 +2689,7 @@ __global__ void jacobiIterationNS(
// Print values for debugging
/*printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\n",
- x,y,z, e, f, e_new);*/
+ x,y,z, e, f, e_new);*/
const Float res_norm = (e_new - e)*(e_new - e)/(e_new*e_new + 1.0e-16);
const Float e_relax = e*(1.0-theta) + e_new*theta;
t@@ -2697,7 +2704,7 @@ __global__ void jacobiIterationNS(
//(void)checkFiniteFloat("res_norm", x, y, z, res_norm);
if (checkFiniteFloat("res_norm", x, y, z, res_norm)) {
printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\tres_norm = %f\n",
- x,y,z, e, f, e_new, res_norm);
+ x,y,z, e, f, e_new, res_norm);
}
#endif
}
t@@ -2706,8 +2713,8 @@ __global__ void jacobiIterationNS(
// Copy all values from one array to the other
template<typename T>
__global__ void copyValues(
- T* dev_read,
- T* dev_write)
+ const T* __restrict__ dev_read,
+ T* __restrict__ dev_write)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2717,10 +2724,10 @@ __global__ void copyValues(
// Internal nodes only
if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
- // Internal nodes + ghost nodes
- /*if (x <= devC_grid.num[0]+1 &&
- y <= devC_grid.num[1]+1 &&
- z <= devC_grid.num[2]+1) {*/
+ // Internal nodes + ghost nodes
+ /*if (x <= devC_grid.num[0]+1 &&
+ y <= devC_grid.num[1]+1 &&
+ z <= devC_grid.num[2]+1) {*/
const unsigned int cellidx = idx(x,y,z); // without ghost nodes
//const unsigned int cellidx = idx(x-1,y-1,z-1); // with ghost nodes
t@@ -2730,7 +2737,7 @@ __global__ void copyValues(
const T val = dev_read[cellidx];
//if (z == devC_grid.num[2]-1)
- //printf("[%d,%d,%d] = %f\n", x, y, z, val);
+ //printf("[%d,%d,%d] = %f\n", x, y, z, val);
// Write
__syncthreads();
t@@ -2740,11 +2747,11 @@ __global__ void copyValues(
// Find and store the normalized residuals
__global__ void findNormalizedResiduals(
- Float* dev_ns_epsilon_old,
- Float* dev_ns_epsilon,
- Float* dev_ns_norm,
- const unsigned int bc_bot,
- const unsigned int bc_top)
+ const Float* __restrict__ dev_ns_epsilon_old,
+ const Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_norm,
+ const unsigned int bc_bot,
+ const unsigned int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2791,9 +2798,9 @@ __global__ void findNormalizedResiduals(
// Computes the new velocity and pressure using the corrector
__global__ void updateNSpressure(
- Float* dev_ns_epsilon, // in
- Float beta, // in
- Float* dev_ns_p) // out
+ const Float* __restrict__ dev_ns_epsilon, // in
+ const Float __restrict__ beta, // in
+ Float* __restrict__ dev_ns_p) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2825,19 +2832,19 @@ __global__ void updateNSpressure(
}
__global__ void updateNSvelocity(
- Float* dev_ns_v_p_x, // in
- Float* dev_ns_v_p_y, // in
- Float* dev_ns_v_p_z, // in
- Float* dev_ns_phi, // in
- Float* dev_ns_epsilon, // in
- Float beta, // in
- int bc_bot, // in
- int bc_top, // in
- unsigned int ndem, // in
- Float c_grad_p, // in
- Float* dev_ns_v_x, // out
- Float* dev_ns_v_y, // out
- Float* dev_ns_v_z) // out
+ const Float* __restrict__ dev_ns_v_p_x, // in
+ const Float* __restrict__ dev_ns_v_p_y, // in
+ const Float* __restrict__ dev_ns_v_p_z, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_epsilon, // in
+ const Float beta, // in
+ const int bc_bot, // in
+ const int bc_top, // in
+ const unsigned int ndem, // in
+ const Float c_grad_p, // in
+ Float* __restrict__ dev_ns_v_x, // out
+ Float* __restrict__ dev_ns_v_y, // out
+ Float* __restrict__ dev_ns_v_z) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2879,15 +2886,15 @@ __global__ void updateNSvelocity(
const Float phi_zn = dev_ns_phi[idx(x,y,z-1)];
const Float3 phi = MAKE_FLOAT3(
- amean(phi_c, phi_xn),
- amean(phi_c, phi_yn),
- amean(phi_c, phi_zn));
+ amean(phi_c, phi_xn),
+ amean(phi_c, phi_yn),
+ amean(phi_c, phi_zn));
// Find corrector gradient
const Float3 grad_epsilon = MAKE_FLOAT3(
- (epsilon_c - epsilon_xn)/dx,
- (epsilon_c - epsilon_yn)/dy,
- (epsilon_c - epsilon_zn)/dz) * c_grad_p;
+ (epsilon_c - epsilon_xn)/dx,
+ (epsilon_c - epsilon_yn)/dy,
+ (epsilon_c - epsilon_zn)/dz) * c_grad_p;
// Find new velocity
#ifdef SET_1
t@@ -2900,16 +2907,16 @@ __global__ void updateNSvelocity(
// Print values for debugging
/* if (z == 0) {
- Float e_up = dev_ns_epsilon[idx(x,y,z+1)];
- Float e_down = dev_ns_epsilon[idx(x,y,z-1)];
- printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n",
- x,y,z,
- grad_epsilon.x,
- grad_epsilon.y,
- grad_epsilon.z,
- e_up,
- e_down);
- }*/
+ Float e_up = dev_ns_epsilon[idx(x,y,z+1)];
+ Float e_down = dev_ns_epsilon[idx(x,y,z-1)];
+ printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n",
+ x,y,z,
+ grad_epsilon.x,
+ grad_epsilon.y,
+ grad_epsilon.z,
+ e_up,
+ e_down);
+ }*/
if ((z == 0 && bc_bot == 1) || (z == nz-1 && bc_top == 1))
v.z = 0.0;
t@@ -2919,11 +2926,11 @@ __global__ void updateNSvelocity(
// Check the advection term using the Courant-Friedrichs-Lewy condition
if (v.x*ndem*devC_dt/dx
- + v.y*ndem*devC_dt/dy
- + v.z*ndem*devC_dt/dz > 1.0) {
+ + v.y*ndem*devC_dt/dy
+ + v.z*ndem*devC_dt/dz > 1.0) {
printf("[%d,%d,%d] Warning: Advection term in fluid may be "
- "unstable (CFL condition), v = %f,%f,%f\n",
- x,y,z, v.x, v.y, v.z);
+ "unstable (CFL condition), v = %f,%f,%f\n",
+ x,y,z, v.x, v.y, v.z);
}
// Write new values
t@@ -2941,12 +2948,12 @@ __global__ void updateNSvelocity(
// Find the average particle diameter and velocity for each CFD cell.
// UNUSED: The values are estimated in the porosity estimation function instead
__global__ void findAvgParticleVelocityDiameter(
- unsigned int* dev_cellStart, // in
- unsigned int* dev_cellEnd, // in
- Float4* dev_vel_sorted, // in
- Float4* dev_x_sorted, // in
- Float3* dev_ns_vp_avg, // out
- Float* dev_ns_d_avg) // out
+ const unsigned int* __restrict__ dev_cellStart, // in
+ const unsigned int* __restrict__ dev_cellEnd, // in
+ const Float4* __restrict__ dev_vel_sorted, // in
+ const Float4* __restrict__ dev_x_sorted, // in
+ Float3* dev_ns_vp_avg, // out
+ Float* dev_ns_d_avg) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3023,18 +3030,18 @@ __device__ Float dragCoefficient(Float re)
// Find particle-fluid interaction force as outlined by Zhou et al. 2010, and
// originally by Gidaspow 1992.
__global__ void findInteractionForce(
- Float4* dev_x, // in
- Float4* dev_vel, // in
- Float* dev_ns_phi, // in
- Float* dev_ns_p, // in
- Float* dev_ns_v_x, // in
- Float* dev_ns_v_y, // in
- Float* dev_ns_v_z, // in
- Float* dev_ns_div_tau_x,// in
- Float* dev_ns_div_tau_y,// in
- Float* dev_ns_div_tau_z,// in
- Float3* dev_ns_f_pf, // out
- Float4* dev_force) // out
+ const Float4* __restrict__ dev_x, // in
+ const Float4* __restrict__ dev_vel, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_p, // in
+ const Float* __restrict__ dev_ns_v_x, // in
+ const Float* __restrict__ dev_ns_v_y, // in
+ const Float* __restrict__ dev_ns_v_z, // in
+ const Float* __restrict__ dev_ns_div_tau_x,// in
+ const Float* __restrict__ dev_ns_div_tau_y,// in
+ const Float* __restrict__ dev_ns_div_tau_z,// in
+ Float3* __restrict__ dev_ns_f_pf, // out
+ Float4* __restrict__ dev_force) // out
{
unsigned int i = threadIdx.x + blockIdx.x*blockDim.x; // Particle index
t@@ -3080,14 +3087,14 @@ __global__ void findInteractionForce(
const Float div_tau_z_p = dev_ns_div_tau_z[vidx(i_x,i_y,i_z+1)];
const Float3 v_f = MAKE_FLOAT3(
- amean(v_x, v_x_p),
- amean(v_y, v_y_p),
- amean(v_z, v_z_p));
+ amean(v_x, v_x_p),
+ amean(v_y, v_y_p),
+ amean(v_z, v_z_p));
const Float3 div_tau = MAKE_FLOAT3(
- amean(div_tau_x, div_tau_x_p),
- amean(div_tau_y, div_tau_y_p),
- amean(div_tau_z, div_tau_z_p));
+ amean(div_tau_x, div_tau_x_p),
+ amean(div_tau_y, div_tau_y_p),
+ amean(div_tau_z, div_tau_z_p));
const Float3 v_rel = v_f - v_p;
const Float v_rel_length = length(v_rel);
t@@ -3119,15 +3126,15 @@ __global__ void findInteractionForce(
#ifdef CHECK_NS_FINITE
/*
- printf("\nfindInteractionForce %d [%d,%d,%d]\n"
- "\tV_p = %f Re=%f Cd=%f chi=%f\n"
- "\tf_d = %+e %+e %+e\n"
- "\tf_p = %+e %+e %+e\n"
- "\tf_v = %+e %+e %+e\n",
- i, i_x, i_y, i_z, V_p, Re, Cd, chi,
- f_d.x, f_d.y, f_d.z,
- f_p.x, f_p.y, f_p.z,
- f_v.x, f_v.y, f_v.z);// */
+ printf("\nfindInteractionForce %d [%d,%d,%d]\n"
+ "\tV_p = %f Re=%f Cd=%f chi=%f\n"
+ "\tf_d = %+e %+e %+e\n"
+ "\tf_p = %+e %+e %+e\n"
+ "\tf_v = %+e %+e %+e\n",
+ i, i_x, i_y, i_z, V_p, Re, Cd, chi,
+ f_d.x, f_d.y, f_d.z,
+ f_p.x, f_p.y, f_p.z,
+ f_v.x, f_v.y, f_v.z);// */
checkFiniteFloat3("f_d", i_x, i_y, i_z, f_d);
checkFiniteFloat3("f_p", i_x, i_y, i_z, f_p);
checkFiniteFloat3("f_v", i_x, i_y, i_z, f_v);
t@@ -3149,11 +3156,11 @@ __global__ void findInteractionForce(
// Apply the fluid-particle interaction force to the fluid cell based on the
// interaction forces from each particle in it
__global__ void applyInteractionForceToFluid(
- unsigned int* dev_gridParticleIndex, // in
- unsigned int* dev_cellStart, // in
- unsigned int* dev_cellEnd, // in
- Float3* dev_ns_f_pf, // in
- Float3* dev_ns_F_pf) // out
+ const unsigned int* __restrict__ dev_gridParticleIndex, // in
+ const unsigned int* __restrict__ dev_cellStart, // in
+ const unsigned int* __restrict__ dev_cellEnd, // in
+ const Float3* __restrict__ dev_ns_f_pf, // in
+ Float3* __restrict__ dev_ns_F_pf) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3212,10 +3219,10 @@ __global__ void applyInteractionForceToFluid(
// Launch per cell face node.
// Cell center ghost nodes must be set prior to call.
__global__ void interpolateCenterToFace(
- Float3* dev_in,
- Float* dev_out_x,
- Float* dev_out_y,
- Float* dev_out_z)
+ const Float3* __restrict__ dev_in,
+ Float* __restrict__ dev_out_x,
+ Float* __restrict__ dev_out_y,
+ Float* __restrict__ dev_out_z)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3224,8 +3231,8 @@ __global__ void interpolateCenterToFace(
// Check that we are not outside the fluid grid
if (x <= devC_grid.num[0]
- && y <= devC_grid.num[1]
- && z <= devC_grid.num[2]) {
+ && y <= devC_grid.num[1]
+ && z <= devC_grid.num[2]) {
const unsigned int faceidx = vidx(x,y,z);
t@@ -3249,10 +3256,10 @@ __global__ void interpolateCenterToFace(
// Launch per cell center node
__global__ void interpolateFaceToCenter(
- Float* dev_in_x,
- Float* dev_in_y,
- Float* dev_in_z,
- Float3* dev_out)
+ Float* dev_in_x,
+ Float* dev_in_y,
+ Float* dev_in_z,
+ Float3* dev_out)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3277,9 +3284,9 @@ __global__ void interpolateFaceToCenter(
const Float z_p = dev_in_z[vidx(x,y,z+1)];
const Float3 val = MAKE_FLOAT3(
- amean(x_n, x_p),
- amean(y_n, y_p),
- amean(z_n, z_p));
+ amean(x_n, x_p),
+ amean(y_n, y_p),
+ amean(z_n, z_p));
__syncthreads();
//printf("[%d,%d,%d] = %f, %f, %f\n", x,y,z, val.x, val.y, val.z);
t@@ -3292,12 +3299,12 @@ __global__ void interpolateFaceToCenter(
// Warning: The grid-corner values will be invalid, along with the non-normal
// components of the ghost nodes
__global__ void findFaceDivTau(
- Float* dev_ns_v_x,
- Float* dev_ns_v_y,
- Float* dev_ns_v_z,
- Float* dev_ns_div_tau_x,
- Float* dev_ns_div_tau_y,
- Float* dev_ns_div_tau_z)
+ const Float* __restrict__ dev_ns_v_x,
+ const Float* __restrict__ dev_ns_v_y,
+ const Float* __restrict__ dev_ns_v_z,
+ Float* __restrict__ dev_ns_div_tau_x,
+ Float* __restrict__ dev_ns_div_tau_y,
+ Float* __restrict__ dev_ns_div_tau_z)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3347,19 +3354,19 @@ __global__ void findFaceDivTau(
const Float div_tau_x =
devC_params.mu*(
- (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
- (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
- (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
+ (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
+ (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
+ (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
const Float div_tau_y =
devC_params.mu*(
- (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
- (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
- (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
+ (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
+ (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
+ (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
const Float div_tau_z =
devC_params.mu*(
- (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
- (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
- (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));
+ (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
+ (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
+ (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));
__syncthreads();
//printf("div_tau [%d,%d,%d] = %f, %f, %f\n", x,y,z,
diff --git a/src/raytracer.cuh b/src/raytracer.cuh
t@@ -6,12 +6,14 @@
//#include "cuPrintf.cu"
// Template for discarding the last term in four-component vector structs
-__device__ __inline__ float3 f4_to_f3(float4 in) {
+__device__ __inline__ float3 f4_to_f3(const float4 in) {
return make_float3(in.x, in.y, in.z);
}
// Kernel for initializing image data
-__global__ void imageInit(unsigned char* dev_img, unsigned int pixels)
+__global__ void imageInit(
+ unsigned char* __restrict__ dev_img,
+ const unsigned int pixels)
{
// Compute pixel position from threadIdx/blockIdx
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -24,11 +26,12 @@ __global__ void imageInit(unsigned char* dev_img, unsigned…
}
// Calculate ray origins and directions
-__global__ void rayInitPerspective(float4* dev_ray_origo,
- float4* dev_ray_direction,
- float4 eye,
- unsigned int width,
- unsigned int height)
+__global__ void rayInitPerspective(
+ float4* __restrict__ dev_ray_origo,
+ float4* __restrict__ dev_ray_direction,
+ const float4 eye,
+ const unsigned int width,
+ const unsigned int height)
{
// Compute pixel position from threadIdx/blockIdx
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -47,15 +50,17 @@ __global__ void rayInitPerspective(float4* dev_ray_origo,
// Write ray origo and direction to global memory
dev_ray_origo[mempos] = make_float4(devC_eye, 0.0f);
- dev_ray_direction[mempos] = make_float4(-devC_d*devC_w + p_u*devC_u + p_v*…
+ dev_ray_direction[mempos] =
+ make_float4(-devC_d*devC_w + p_u*devC_u + p_v*devC_v, 0.0f);
}
// Check wether the pixel's viewing ray intersects with the spheres,
// and shade the pixel correspondingly
-__global__ void rayIntersectSpheres(float4* dev_ray_origo,
- float4* dev_ray_direction,
- Float4* dev_x,
- unsigned char* dev_img)
+__global__ void rayIntersectSpheres(
+ const float4* __restrict__ dev_ray_origo,
+ const float4* __restrict__ dev_ray_direction,
+ const Float4* __restrict__ dev_x,
+ unsigned char* __restrict__ dev_img)
{
// Compute pixel position from threadIdx/blockIdx
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -67,7 +72,8 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
float3 d = f4_to_f3(dev_ray_direction[mempos]);
//float step = length(d);
- // Distance, in ray steps, between object and eye initialized with a large…
+ // Distance, in ray steps, between object and eye initialized with
+ // a large value
float tdist = 1e10f;
// Surface normal at closest sphere intersection
t@@ -86,8 +92,6 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
float3 c = make_float3(x.x, x.y, x.z);
float R = x.w;
- //cuPrintf("particle %d at: %f, %f, %f, radius: %f\n", i, c.x, c.y, c.…
-
// Calculate the discriminant: d = B^2 - 4AC
float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
- 4.0f*dot(d,d) // -4*A
t@@ -98,8 +102,10 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
if (Delta > 0.0f) {
// Calculate roots, Shirley 2009 p. 77
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) …
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
+ float t_minus = ((dot(-d,(e-c))
+ - sqrt( dot(d,(e-c))*dot(d,(e-c))
+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)))
+ / dot(d,d));
// Check wether intersection is closer than previous values
if (fabs(t_minus) < tdist) {
t@@ -137,14 +143,15 @@ __global__ void rayIntersectSpheres(float4* dev_ray_orig…
// Check wether the pixel's viewing ray intersects with the spheres,
// and shade the pixel correspondingly using a colormap
-__global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
- float4* dev_ray_direction,
- Float4* dev_x,
- Float4* dev_vel,
- Float* dev_linarr,
- float max_val,
- float lower_cutoff,
- unsigned char* dev_img)
+__global__ void rayIntersectSpheresColormap(
+ const float4* __restrict__ dev_ray_origo,
+ const float4* __restrict__ dev_ray_direction,
+ const Float4* __restrict__ dev_x,
+ const Float4* __restrict__ dev_vel,
+ const Float* __restrict__ dev_linarr,
+ const float max_val,
+ const float lower_cutoff,
+ unsigned char* __restrict__ dev_img)
{
// Compute pixel position from threadIdx/blockIdx
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -155,7 +162,8 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ra…
float3 e = f4_to_f3(dev_ray_origo[mempos]);
float3 d = f4_to_f3(dev_ray_direction[mempos]);
- // Distance, in ray steps, between object and eye initialized with a large…
+ // Distance, in ray steps, between object and eye initialized with
+ // a large value
float tdist = 1e10f;
// Surface normal at closest sphere intersection
t@@ -181,8 +189,6 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ra…
- 4.0f*dot(d,d) // -4*A
* (dot((e-c),(e-c)) - R*R); // C
-
-
// If the determinant is positive, there are two solutions
// One where the line enters the sphere, and one where it exits
if (lower_cutoff > 0.0) {
t@@ -198,8 +204,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_r…
if (Delta > 0.0f && val > lower_cutoff && fixvel == 0.f) {
// Calculate roots, Shirley 2009 p. 77
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-…
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
+ float t_minus =
+ ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c))
+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)…
+ / dot(d,d));
// Check wether intersection is closer than previous values
if (fabs(t_minus) < tdist) {
t@@ -217,8 +225,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_r…
if (Delta > 0.0f) {
// Calculate roots, Shirley 2009 p. 77
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-…
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
+ float t_minus =
+ ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c))
+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)…
+ / dot(d,d));
// Check wether intersection is closer than previous values
if (fabs(t_minus) < tdist) {
t@@ -269,25 +279,28 @@ __global__ void rayIntersectSpheresColormap(float4* dev_…
// Write shading model values to pixel color channels
dev_img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*redv);
+ + k_a * I_a)*redv);
dev_img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*greenv);
+ + k_a * I_a)*greenv);
dev_img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*bluev);
+ + k_a * I_a)*bluev);
}
}
__host__ void DEM::cameraInit(
- const float3 eye,
- const float3 lookat,
- const float imgw,
- const float focalLength)
+ const float3 eye,
+ const float3 lookat,
+ const float imgw,
+ const float focalLength)
{
float hw_ratio = height/width;
// Image dimensions in world space (l, r, b, t)
- float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, …
+ float4 imgplane = make_float4(
+ -0.5f*imgw, 0.5f*imgw,
+ -0.5f*imgw*hw_ratio,
+ 0.5f*imgw*hw_ratio);
// The view vector
float3 view = eye - lookat;
t@@ -309,15 +322,6 @@ __host__ void DEM::cameraInit(
if (verbose == 1)
std::cout << " Transfering camera values to constant memory: ";
- /* Reference by string removed in cuda 5.0
- cudaMemcpyToSymbol("devC_u", &u, sizeof(u));
- cudaMemcpyToSymbol("devC_v", &v, sizeof(v));
- cudaMemcpyToSymbol("devC_w", &w, sizeof(w));
- cudaMemcpyToSymbol("devC_eye", &eye, sizeof(eye));
- cudaMemcpyToSymbol("devC_imgplane", &imgplane, sizeof(imgplane));
- cudaMemcpyToSymbol("devC_d", &d, sizeof(d));
- cudaMemcpyToSymbol("devC_light", &light, sizeof(light));
- cudaMemcpyToSymbol("devC_pixels", &pixels, sizeof(pixels));*/
cudaMemcpyToSymbol(devC_u, &u, sizeof(u));
cudaMemcpyToSymbol(devC_v, &v, sizeof(v));
cudaMemcpyToSymbol(devC_w, &w, sizeof(w));
t@@ -333,7 +337,7 @@ __host__ void DEM::cameraInit(
}
// Allocate global device memory
-__host__ void DEM::rt_allocateGlobalDeviceMemory(void)
+__host__ void DEM::rt_allocateGlobalDeviceMemory()
{
if (verbose == 1)
std::cout << " Allocating device memory: ";
t@@ -347,7 +351,7 @@ __host__ void DEM::rt_allocateGlobalDeviceMemory(void)
// Free dynamically allocated device memory
-__host__ void DEM::rt_freeGlobalDeviceMemory(void)
+__host__ void DEM::rt_freeGlobalDeviceMemory()
{
if (verbose == 1)
std::cout << " Freeing device memory: ";
t@@ -360,11 +364,12 @@ __host__ void DEM::rt_freeGlobalDeviceMemory(void)
}
// Transfer image data from device to host
-__host__ void DEM::rt_transferFromGlobalDeviceMemory(void)
+__host__ void DEM::rt_transferFromGlobalDeviceMemory()
{
if (verbose == 1)
std::cout << " Transfering image data: device -> host: ";
- cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char), cudaMemcpyD…
+ cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char),
+ cudaMemcpyDeviceToHost);
if (verbose == 1)
std::cout << "Done" << std::endl;
checkForCudaErrors("During rt_transferFromGlobalDeviceMemory()");
t@@ -372,24 +377,13 @@ __host__ void DEM::rt_transferFromGlobalDeviceMemory(voi…
// Wrapper for the rt kernel
__host__ void DEM::render(
- const int method,
- const float maxval,
- const float lower_cutoff,
- const float focalLength,
- const unsigned int img_width,
- const unsigned int img_height)
- /*float4* p, unsigned int np,
- rgba* img, unsigned int width, unsigned int height,
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
- int visualize, float max_val,
- float* fixvel,
- float* xsum,
- float* pres,
- float* es_dot,
- float* es,
- float* vel)*/
+ const int method,
+ const float maxval,
+ const float lower_cutoff,
+ const float focalLength,
+ const unsigned int img_width,
+ const unsigned int img_height)
{
- // Namespace directives
using std::cout;
using std::cerr;
using std::endl;
t@@ -418,15 +412,15 @@ __host__ void DEM::render(
// Look at the centre of the mean positions
float3 lookat = make_float3(maxpos.x, maxpos.y, maxpos.z) / 2.0f;
float3 eye = make_float3(
- grid.L[0] * 2.3f,
- grid.L[1] * -5.0f,
- grid.L[2] * 1.3f);
+ grid.L[0] * 2.3f,
+ grid.L[1] * -5.0f,
+ grid.L[2] * 1.3f);
cameraInit(eye, lookat, imgw, focalLength);
// Construct rays for perspective projection
rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>(
- dev_ray_origo, dev_ray_direction,
- make_float4(eye.x, eye.y, eye.z, 0.0f), width, height);
+ dev_ray_origo, dev_ray_direction,
+ make_float4(eye.x, eye.y, eye.z, 0.0f), width, height);
cudaThreadSynchronize();
Float* linarr; // Linear array to use for color visualization
t@@ -440,8 +434,8 @@ __host__ void DEM::render(
// Visualize spheres without color scale overlay
if (method == 0) {
rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>(
- dev_ray_origo, dev_ray_direction,
- dev_x, dev_img);
+ dev_ray_origo, dev_ray_direction,
+ dev_x, dev_img);
} else {
if (method == 1) { // Visualize pressure
t@@ -455,8 +449,8 @@ __host__ void DEM::render(
#pragma omp parallel for if(np>100)
for (i = 0; i<np; ++i) {
linarr[i] = sqrt(k.vel[i].x*k.vel[i].x
- + k.vel[i].y*k.vel[i].y
- + k.vel[i].z*k.vel[i].z);
+ + k.vel[i].y*k.vel[i].y
+ + k.vel[i].z*k.vel[i].z);
}
transfer = 1;
desc = "Linear velocity";
t@@ -468,8 +462,8 @@ __host__ void DEM::render(
#pragma omp parallel for if(np>100)
for (i = 0; i<np; ++i) {
linarr[i] = sqrt(k.angvel[i].x*k.angvel[i].x
- + k.angvel[i].y*k.angvel[i].y
- + k.angvel[i].z*k.angvel[i].z);
+ + k.angvel[i].y*k.angvel[i].y
+ + k.angvel[i].z*k.angvel[i].z);
}
transfer = 1;
desc = "Angular velocity";
t@@ -492,8 +486,8 @@ __host__ void DEM::render(
#pragma omp parallel for if(np>100)
for (i = 0; i<np; ++i) {
linarr[i] = sqrt(k.angpos[i].x*k.angpos[i].x
- + k.angpos[i].y*k.angpos[i].y
- + k.angpos[i].z*k.angpos[i].z);
+ + k.angpos[i].y*k.angpos[i].y
+ + k.angpos[i].z*k.angpos[i].z);
}
transfer = 1;
desc = "Angular positions";
t@@ -504,23 +498,24 @@ __host__ void DEM::render(
// Report color visualization method and color map range
if (verbose == 1) {
cout << " " << desc << " color map range: [0, "
- << maxval << "] " << unit << endl;
+ << maxval << "] " << unit << endl;
}
// Copy linarr to dev_linarr if required
if (transfer == 1) {
cudaMalloc((void**)&dev_linarr, np*sizeof(Float));
checkForCudaErrors("Error during cudaMalloc of linear array");
- cudaMemcpy(dev_linarr, linarr, np*sizeof(Float), cudaMemcpyHostToD…
+ cudaMemcpy(dev_linarr, linarr, np*sizeof(Float),
+ cudaMemcpyHostToDevice);
checkForCudaErrors("Error during cudaMemcpy of linear array");
}
// Start raytracing kernel
rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
- dev_ray_origo, dev_ray_direction,
- dev_x, dev_vel,
- dev_linarr, maxval, lower_cutoff,
- dev_img);
+ dev_ray_origo, dev_ray_direction,
+ dev_x, dev_vel,
+ dev_linarr, maxval, lower_cutoff,
+ dev_img);
}
diff --git a/tests/fluid_particle_interaction.py b/tests/fluid_particle_interac…
t@@ -50,5 +50,4 @@ test(sim.vel[0,0] > 0.0, 'Particle 0 velocity:')
test(sim.vel[1,0] > 0.0, 'Particle 1 velocity:')
test(sim.vel[2,0] > 0.0, 'Particle 2 velocity:')
-
-sim.cleanup()
+#sim.cleanup()
You are viewing proxied material from mx1.adamsgaard.dk. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.