tadd const and __restrict__ keywords to facilitate constant memory cache usage … | |
git clone git://src.adamsgaard.dk/sphere | |
Log | |
Files | |
Refs | |
LICENSE | |
--- | |
commit 21ed15ef593a831b2124a48bbc662b9a227f85ac | |
parent 7626d19392ecba63c4b2e5f4d419e8423d7a90a0 | |
Author: Anders Damsgaard <[email protected]> | |
Date: Mon, 11 Aug 2014 14:37:07 +0200 | |
add const and __restrict__ keywords to facilitate constant memory cache usage | |
Diffstat: | |
M src/cohesion.cuh | 143 ++++++++---------------------… | |
M src/contactmodels.cuh | 164 ++++++++++++++++++-----------… | |
M src/contactsearch.cuh | 246 ++++++++++++++++-------------… | |
M src/integration.cuh | 70 ++++++++++++++++++-----------… | |
M src/navierstokes.cuh | 1213 ++++++++++++++++-------------… | |
M src/raytracer.cuh | 173 +++++++++++++++--------------… | |
M tests/fluid_particle_interaction.py | 3 +-- | |
7 files changed, 996 insertions(+), 1016 deletions(-) | |
--- | |
diff --git a/src/cohesion.cuh b/src/cohesion.cuh | |
t@@ -6,14 +6,14 @@ | |
// Check bond pair list, apply linear contact model to pairs | |
__global__ void bondsLinear( | |
- uint2* dev_bonds, | |
- Float4* dev_bonds_delta, // Contact displacement | |
- Float4* dev_bonds_omega, // Contact rotational displacement | |
- Float4* dev_x, | |
- Float4* dev_vel, | |
- Float4* dev_angvel, | |
- Float4* dev_force, | |
- Float4* dev_torque) | |
+ uint2* __restrict__ dev_bonds, | |
+ Float4* __restrict__ dev_bonds_delta, // Contact displacement | |
+ Float4* __restrict__ dev_bonds_omega, // Contact rotational displacement | |
+ const Float4* __restrict__ dev_x, | |
+ const Float4* __restrict__ dev_vel, | |
+ const Float4* __restrict__ dev_angvel, | |
+ Float4* __restrict__ dev_force, | |
+ Float4* __restrict__ dev_torque) | |
{ | |
// Find thread index | |
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; | |
t@@ -35,16 +35,16 @@ __global__ void bondsLinear( | |
// Convert tangential vectors to Float3's | |
// Uncorrected tangential component of displacement | |
Float3 delta0_t = MAKE_FLOAT3( | |
- delta0_4.x, | |
- delta0_4.y, | |
- delta0_4.z); | |
+ delta0_4.x, | |
+ delta0_4.y, | |
+ delta0_4.z); | |
const Float delta0_n = delta0_4.w; | |
// Uncorrected tangential component of rotation | |
Float3 omega0_t = MAKE_FLOAT3( | |
- omega0_4.x, | |
- omega0_4.y, | |
- omega0_4.z); | |
+ omega0_4.x, | |
+ omega0_4.y, | |
+ omega0_4.z); | |
const Float omega0_n = omega0_4.w; | |
// Read particle data | |
t@@ -76,9 +76,9 @@ __global__ void bondsLinear( | |
// Inter-particle vector | |
const Float3 x = MAKE_FLOAT3( | |
- x_i.x - x_j.x, | |
- x_i.y - x_j.y, | |
- x_i.z - x_j.z); | |
+ x_i.x - x_j.x, | |
+ x_i.y - x_j.y, | |
+ x_i.z - x_j.z); | |
const Float x_length = length(x); | |
// Find overlap (negative value if overlapping) | |
t@@ -96,13 +96,13 @@ __global__ void bondsLinear( | |
// Contact displacement, Luding 2008 eq. 10 | |
const Float3 ddelta = ( | |
- MAKE_FLOAT3( | |
- vel_i.x - vel_j.x, | |
- vel_i.y - vel_j.y, | |
- vel_i.z - vel_j.z) | |
- + (x_i.w + overlap/2.0) * cross(n, angvel_i) | |
- + (x_j.w + overlap/2.0) * cross(n, angvel_j) | |
- ) * devC_dt; | |
+ MAKE_FLOAT3( | |
+ vel_i.x - vel_j.x, | |
+ vel_i.y - vel_j.y, | |
+ vel_i.z - vel_j.z) | |
+ + (x_i.w + overlap/2.0) * cross(n, angvel_i) | |
+ + (x_j.w + overlap/2.0) * cross(n, angvel_j) | |
+ ) * devC_dt; | |
// Normal component of the displacement increment | |
//const Float ddelta_n = dot(ddelta, n); | |
t@@ -141,9 +141,9 @@ __global__ void bondsLinear( | |
// Contact rotational velocity | |
Float3 domega = MAKE_FLOAT3( | |
- angvel_j.x - angvel_i.x, | |
- angvel_j.y - angvel_i.y, | |
- angvel_j.z - angvel_i.z) * devC_dt; | |
+ angvel_j.x - angvel_i.x, | |
+ angvel_j.y - angvel_i.y, | |
+ angvel_j.z - angvel_i.z) * devC_dt; | |
/*const Float3 domega = MAKE_FLOAT3( | |
angvel_i.x - angvel_j.x, | |
angvel_i.y - angvel_j.y, | |
t@@ -232,89 +232,16 @@ __global__ void bondsLinear( | |
} | |
} | |
-// Linear-elastic bond: Attractive force with normal- and shear components | |
-// acting upon particle A in a bonded particle pair | |
-__device__ void bondLinear_old(Float3* N, Float3* T, Float* es_dot, Float* p, | |
- unsigned int idx_a, unsigned int idx_b, | |
- Float4* dev_x_sorted, Float4* dev_vel_sorted, | |
- Float4* dev_angvel_sorted, | |
- Float radius_a, Float radius_b, | |
- Float3 x_ab, Float x_ab_length, | |
- Float delta_ab) | |
-{ | |
- | |
- // If particles are not overlapping, apply bond force | |
- if (delta_ab > 0.0f) { | |
- | |
- // Allocate variables and fetch missing time=t values for particle A a… | |
- Float4 vel_a = dev_vel_sorted[idx_a]; | |
- Float4 vel_b = dev_vel_sorted[idx_b]; | |
- Float4 angvel4_a = dev_angvel_sorted[idx_a]; | |
- Float4 angvel4_b = dev_angvel_sorted[idx_b]; | |
- | |
- // Convert to Float3's | |
- Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z); | |
- Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z); | |
- | |
- // Normal vector of contact | |
- Float3 n_ab = x_ab/x_ab_length; | |
- | |
- // Relative contact interface velocity, w/o rolling | |
- Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x, | |
- vel_a.y - vel_b.y, | |
- vel_a.z - vel_b.z); | |
- | |
- // Relative contact interface velocity of particle surfaces at | |
- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10) | |
- Float3 vel_ab = vel_ab_linear | |
- + radius_a * cross(n_ab, angvel_a) | |
- + radius_b * cross(n_ab, angvel_b); | |
- | |
- // Relative contact interface rolling velocity | |
- //Float3 angvel_ab = angvel_a - angvel_b; | |
- //Float angvel_ab_length = length(angvel_ab); | |
- | |
- // Normal component of the relative contact interface velocity | |
- //Float vel_n_ab = dot(vel_ab_linear, n_ab); | |
- | |
- // Tangential component of the relative contact interface velocity | |
- // Hinrichsen and Wolf 2004, eq. 13.9 | |
- Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab)); | |
- //Float vel_t_ab_length = length(vel_t_ab); | |
- | |
- Float3 f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f); | |
- Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f); | |
- | |
- // Mean radius | |
- Float R_bar = (radius_a + radius_b)/2.0f; | |
- | |
- // Normal force component: Elastic | |
- f_n = devC_params.k_n * delta_ab * n_ab; | |
- | |
- if (length(vel_t_ab) > 0.f) { | |
- // Shear force component: Viscous | |
- f_t = -1.0f * devC_params.gamma_t * vel_t_ab; | |
- | |
- // Shear friction production rate [W] | |
- //*es_dot += -dot(vel_t_ab, f_t); | |
- } | |
- | |
- // Add force components from this bond to total force for particle | |
- *N += f_n + f_t; | |
- *T += -R_bar * cross(n_ab, f_t); | |
- | |
- // Pressure excerted onto the particle from this bond | |
- *p += length(f_n) / (4.0f * PI * radius_a*radius_a); | |
- | |
- } | |
-} // End of bondLinear() | |
- | |
// Capillary cohesion after Richefeu et al. (2006) | |
-__device__ void capillaryCohesion_exp(Float3* N, Float radius_a, | |
- Float radius_b, Float delta_ab, | |
- Float3 x_ab, Float x_ab_length, | |
- Float kappa) | |
+__device__ void capillaryCohesion_exp( | |
+ Float3* N, | |
+ const Float radius_a, | |
+ const Float radius_b, | |
+ const Float delta_ab, | |
+ const Float3 x_ab, | |
+ const Float x_ab_length, | |
+ const Float kappa) | |
{ | |
// Normal vector | |
diff --git a/src/contactmodels.cuh b/src/contactmodels.cuh | |
t@@ -7,11 +7,19 @@ | |
// Linear viscoelastic contact model for particle-wall interactions | |
// with tangential friction and rolling resistance | |
-__device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot, | |
- Float* ev_dot, Float* p, | |
- unsigned int idx_a, Float radius_a, | |
- Float4* dev_vel_sorted, Float4* dev_angvel_sorted, | |
- Float3 n, Float delta, Float wvel) | |
+__device__ Float contactLinear_wall( | |
+ Float3* F, | |
+ Float3* T, | |
+ Float* es_dot, | |
+ Float* ev_dot, | |
+ Float* p, | |
+ const unsigned int idx_a, | |
+ const Float radius_a, | |
+ const Float4* __restrict__ dev_vel_sorted, | |
+ const Float4* __restrict__ dev_angvel_sorted, | |
+ const Float3 n, | |
+ const Float delta, | |
+ const Float wvel) | |
{ | |
// Fetch particle velocities from global memory | |
Float4 vel_tmp = dev_vel_sorted[idx_a]; | |
t@@ -19,13 +27,13 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, … | |
// Convert velocities to three-component vectors | |
Float3 vel_linear = MAKE_FLOAT3( | |
- vel_tmp.x, | |
- vel_tmp.y, | |
- vel_tmp.z); | |
+ vel_tmp.x, | |
+ vel_tmp.y, | |
+ vel_tmp.z); | |
Float3 angvel = MAKE_FLOAT3( | |
- angvel_tmp.x, | |
- angvel_tmp.y, | |
- angvel_tmp.z); | |
+ angvel_tmp.x, | |
+ angvel_tmp.y, | |
+ angvel_tmp.z); | |
// Store the length of the angular velocity for later use | |
Float angvel_length = length(angvel); | |
t@@ -47,7 +55,7 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Fl… | |
// Normal force component: Elastic - viscous damping | |
Float3 f_n = fmax(0.0, -devC_params.k_n*delta | |
- - devC_params.gamma_wn*vel_n) * n; | |
+ - devC_params.gamma_wn*vel_n) * n; | |
const Float f_n_length = length(f_n); // Save length for later use | |
// Store the energy lost by viscous damping. See derivation in | |
t@@ -103,14 +111,22 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T… | |
// Linear vicoelastic contact model for particle-particle interactions | |
// with tangential friction and rolling resistance | |
-__device__ void contactLinearViscous(Float3* F, Float3* T, | |
- Float* es_dot, Float* ev_dot, Float* p, | |
- unsigned int idx_a, unsigned int idx_b, | |
- Float4* dev_vel_sorted, | |
- Float4* dev_angvel_sorted, | |
- Float radius_a, Float radius_b, | |
- Float3 x_ab, Float x_ab_length, | |
- Float delta_ab, Float kappa) | |
+__device__ void contactLinearViscous( | |
+ Float3* F, | |
+ Float3* T, | |
+ Float* es_dot, | |
+ Float* ev_dot, | |
+ Float* p, | |
+ const unsigned int idx_a, | |
+ const unsigned int idx_b, | |
+ const Float4* __restrict__ dev_vel_sorted, | |
+ const Float4* __restrict__ dev_angvel_sorted, | |
+ const Float radius_a, | |
+ const Float radius_b, | |
+ const Float3 x_ab, | |
+ const Float x_ab_length, | |
+ const Float delta_ab, | |
+ const Float kappa) | |
{ | |
// Allocate variables and fetch missing time=t values for particle A and B | |
t@@ -131,8 +147,8 @@ __device__ void contactLinearViscous(Float3* F, Float3* T, | |
// Relative contact interface velocity, w/o rolling | |
Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x, | |
- vel_a.y - vel_b.y, | |
- vel_a.z - vel_b.z); | |
+ vel_a.y - vel_b.y, | |
+ vel_a.z - vel_b.z); | |
// Relative contact interface velocity of particle surfaces at | |
// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10) | |
t@@ -209,18 +225,25 @@ __device__ void contactLinearViscous(Float3* F, Float3* … | |
// Linear elastic contact model for particle-particle interactions | |
-__device__ void contactLinear(Float3* F, Float3* T, | |
- Float* es_dot, Float* ev_dot, Float* p, | |
- unsigned int idx_a_orig, | |
- unsigned int idx_b_orig, | |
- Float4 vel_a, | |
- Float4* dev_vel, | |
- Float3 angvel_a, | |
- Float4* dev_angvel, | |
- Float radius_a, Float radius_b, | |
- Float3 x, Float x_length, | |
- Float delta, Float4* dev_delta_t, | |
- unsigned int mempos) | |
+__device__ void contactLinear( | |
+ Float3* F, | |
+ Float3* T, | |
+ Float* es_dot, | |
+ Float* ev_dot, | |
+ Float* p, | |
+ const unsigned int idx_a_orig, | |
+ const unsigned int idx_b_orig, | |
+ const Float4 vel_a, | |
+ const Float4* __restrict__ dev_vel, | |
+ const Float3 angvel_a, | |
+ const Float4* __restrict__ dev_angvel, | |
+ const Float radius_a, | |
+ const Float radius_b, | |
+ const Float3 x, | |
+ const Float x_length, | |
+ const Float delta, | |
+ Float4* __restrict__ dev_delta_t, | |
+ const unsigned int mempos) | |
{ | |
// Allocate variables and fetch missing time=t values for particle A and B | |
t@@ -231,15 +254,15 @@ __device__ void contactLinear(Float3* F, Float3* T, | |
Float4 delta_t0_4 = dev_delta_t[mempos]; | |
Float3 delta_t0_uncor = MAKE_FLOAT3( | |
- delta_t0_4.x, | |
- delta_t0_4.y, | |
- delta_t0_4.z); | |
+ delta_t0_4.x, | |
+ delta_t0_4.y, | |
+ delta_t0_4.z); | |
// Convert to Float3 | |
Float3 angvel_b = MAKE_FLOAT3( | |
- angvel4_b.x, | |
- angvel4_b.y, | |
- angvel4_b.z); | |
+ angvel4_b.x, | |
+ angvel4_b.y, | |
+ angvel4_b.z); | |
// Force between grain pair decomposed into normal- and tangential part | |
Float3 f_n, f_t, f_c; | |
t@@ -249,9 +272,9 @@ __device__ void contactLinear(Float3* F, Float3* T, | |
// Relative contact interface velocity, w/o rolling | |
Float3 vel_linear = MAKE_FLOAT3( | |
- vel_a.x - vel_b.x, | |
- vel_a.y - vel_b.y, | |
- vel_a.z - vel_b.z); | |
+ vel_a.x - vel_b.x, | |
+ vel_a.y - vel_b.y, | |
+ vel_a.z - vel_b.z); | |
// Relative contact interface velocity of particle surfaces at | |
// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10, | |
t@@ -335,7 +358,7 @@ __device__ void contactLinear(Float3* F, Float3* T, | |
// 2008) | |
delta_t = -1.0/devC_params.k_t | |
* (devC_params.mu_d * length(f_n-f_c) * t | |
- + devC_params.gamma_t * vel_t); | |
+ + devC_params.gamma_t * vel_t); | |
// Shear friction heat production rate: | |
// The energy lost from the tangential spring is dissipated as heat | |
t@@ -357,10 +380,10 @@ __device__ void contactLinear(Float3* F, Float3* T, | |
// Store sum of tangential displacements | |
dev_delta_t[mempos] = MAKE_FLOAT4( | |
- delta_t.x, | |
- delta_t.y, | |
- delta_t.z, | |
- 0.0f); | |
+ delta_t.x, | |
+ delta_t.y, | |
+ delta_t.z, | |
+ 0.0f); | |
} // End of contactLinear() | |
t@@ -368,18 +391,25 @@ __device__ void contactLinear(Float3* F, Float3* T, | |
// Non-linear contact model for particle-particle interactions | |
// Based on Hertzian and Mindlin contact theories (e.g. Hertz, 1882, Mindlin a… | |
// Deresiewicz, 1953, Johnson, 1985). See Yohannes et al 2012 for example. | |
-__device__ void contactHertz(Float3* F, Float3* T, | |
- Float* es_dot, Float* ev_dot, Float* p, | |
- unsigned int idx_a_orig, | |
- unsigned int idx_b_orig, | |
- Float4 vel_a, | |
- Float4* dev_vel, | |
- Float3 angvel_a, | |
- Float4* dev_angvel, | |
- Float radius_a, Float radius_b, | |
- Float3 x_ab, Float x_ab_length, | |
- Float delta_ab, Float4* dev_delta_t, | |
- unsigned int mempos) | |
+__device__ void contactHertz( | |
+ Float3* F, | |
+ Float3* T, | |
+ Float* es_dot, | |
+ Float* ev_dot, | |
+ Float* p, | |
+ const unsigned int idx_a_orig, | |
+ const unsigned int idx_b_orig, | |
+ const Float4 vel_a, | |
+ const Float4* __restrict__ dev_vel, | |
+ const Float3 angvel_a, | |
+ const Float4* __restrict__ dev_angvel, | |
+ const Float radius_a, | |
+ const Float radius_b, | |
+ const Float3 x_ab, | |
+ const Float x_ab_length, | |
+ const Float delta_ab, | |
+ Float4* __restrict__ dev_delta_t, | |
+ const unsigned int mempos) | |
{ | |
// Allocate variables and fetch missing time=t values for particle A and B | |
t@@ -390,8 +420,8 @@ __device__ void contactHertz(Float3* F, Float3* T, | |
Float4 delta_t0_4 = dev_delta_t[mempos]; | |
Float3 delta_t0_uncor = MAKE_FLOAT3(delta_t0_4.x, | |
- delta_t0_4.y, | |
- delta_t0_4.z); | |
+ delta_t0_4.y, | |
+ delta_t0_4.z); | |
// Convert to Float3 | |
Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z); | |
t@@ -404,8 +434,8 @@ __device__ void contactHertz(Float3* F, Float3* T, | |
// Relative contact interface velocity, w/o rolling | |
Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x, | |
- vel_a.y - vel_b.y, | |
- vel_a.z - vel_b.z); | |
+ vel_a.y - vel_b.y, | |
+ vel_a.z - vel_b.z); | |
// Relative contact interface velocity of particle surfaces at | |
// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10) | |
t@@ -435,7 +465,7 @@ __device__ void contactHertz(Float3* F, Float3* T, | |
// Normal force component | |
f_n = (-devC_params.k_n * powf(delta_ab, 3.0f/2.0f) | |
- -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab) | |
+ -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab) | |
* n_ab; | |
// Store energy dissipated in normal viscous component | |
t@@ -517,9 +547,9 @@ __device__ void contactHertz(Float3* F, Float3* T, | |
// New rolling resistance model | |
/*T_res = -1.0f * fmin(devC_params.gamma_r * R_bar * angvel_ab_length, | |
devC_params.mu_r * R_bar * f_n_length) | |
- * angvel_ab/angvel_ab_length;*/ | |
+ * angvel_ab/angvel_ab_length;*/ | |
T_res = -1.0f * fmin(devC_params.gamma_r * radius_a * angvel_ab_length, | |
- devC_params.mu_r * radius_a * f_n_length) | |
+ devC_params.mu_r * radius_a * f_n_length) | |
* angvel_ab/angvel_ab_length; | |
} | |
diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh | |
t@@ -85,19 +85,23 @@ __device__ int findDistMod(int3* targetCell, Float3* distm… | |
// Used for contactmodel=1, where contact history is not needed. | |
// Kernel executed on device, and callable from device only. | |
// Function is called from interact(). | |
-__device__ void findAndProcessContactsInCell(int3 targetCell, | |
- unsigned int idx_a, | |
- Float4 x_a, Float radius_a, | |
- Float3* F, Float3* T, | |
- Float* es_dot, Float* ev_dot, | |
- Float* p, | |
- Float4* dev_x_sorted, | |
- Float4* dev_vel_sorted, | |
- Float4* dev_angvel_sorted, | |
- unsigned int* dev_cellStart, | |
- unsigned int* dev_cellEnd, | |
- Float4* dev_walls_nx, | |
- Float4* dev_walls_mvfd) | |
+__device__ void findAndProcessContactsInCell( | |
+ int3 targetCell, | |
+ const unsigned int idx_a, | |
+ const Float4 x_a, | |
+ const Float radius_a, | |
+ Float3* F, | |
+ Float3* T, | |
+ Float* es_dot, | |
+ Float* ev_dot, | |
+ Float* p, | |
+ const Float4* __restrict__ dev_x_sorted, | |
+ const Float4* __restrict__ dev_vel_sorted, | |
+ const Float4* __restrict__ dev_angvel_sorted, | |
+ const unsigned int* __restrict__ dev_cellStart, | |
+ const unsigned int* __restrict__ dev_cellEnd, | |
+ const Float4* __restrict__ dev_walls_nx, | |
+ Float4* __restrict__ dev_walls_mvfd) | |
//uint4 bonds) | |
{ | |
t@@ -133,8 +137,8 @@ __device__ void findAndProcessContactsInCell(int3 targetCe… | |
// Distance between particle centers (Float4 -> Float3) | |
Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x, | |
- x_a.y - x_b.y, | |
- x_a.z - x_b.z); | |
+ x_a.y - x_b.y, | |
+ x_a.z - x_b.z); | |
// Adjust interparticle vector if periodic boundary/bounda… | |
// are crossed | |
t@@ -147,16 +151,16 @@ __device__ void findAndProcessContactsInCell(int3 target… | |
// Check for particle overlap | |
if (delta_ab < 0.0f) { | |
contactLinearViscous(F, T, es_dot, ev_dot, p, | |
- idx_a, idx_b, | |
- dev_vel_sorted, | |
- dev_angvel_sorted, | |
- radius_a, radius_b, | |
- x_ab, x_ab_length, | |
- delta_ab, kappa); | |
+ idx_a, idx_b, | |
+ dev_vel_sorted, | |
+ dev_angvel_sorted, | |
+ radius_a, radius_b, | |
+ x_ab, x_ab_length, | |
+ delta_ab, kappa); | |
} else if (delta_ab < devC_params.db) { | |
// Check wether particle distance satisfies the capill… | |
capillaryCohesion_exp(F, radius_a, radius_b, delta_ab, | |
- x_ab, x_ab_length, kappa); | |
+ x_ab, x_ab_length, kappa); | |
} | |
// Check wether particles are bonded together | |
t@@ -183,16 +187,18 @@ __device__ void findAndProcessContactsInCell(int3 target… | |
// Used for contactmodel=2, where bookkeeping of contact history is necessary. | |
// Kernel executed on device, and callable from device only. | |
// Function is called from topology(). | |
-__device__ void findContactsInCell(int3 targetCell, | |
- unsigned int idx_a, | |
- Float4 x_a, Float radius_a, | |
- Float4* dev_x_sorted, | |
- unsigned int* dev_cellStart, | |
- unsigned int* dev_cellEnd, | |
- unsigned int* dev_gridParticleIndex, | |
- int* nc, | |
- unsigned int* dev_contacts, | |
- Float4* dev_distmod) | |
+__device__ void findContactsInCell( | |
+ int3 targetCell, | |
+ const unsigned int idx_a, | |
+ const Float4 x_a, | |
+ const Float radius_a, | |
+ const Float4* __restrict__ dev_x_sorted, | |
+ const unsigned int* __restrict__ dev_cellStart, | |
+ const unsigned int* __restrict__ dev_cellEnd, | |
+ const unsigned int* __restrict__ dev_gridParticleIndex, | |
+ int* nc, | |
+ unsigned int* __restrict__ dev_contacts, | |
+ Float4* __restrict__ dev_distmod) | |
{ | |
// Get distance modifier for interparticle | |
// vector, if it crosses a periodic boundary | |
t@@ -237,8 +243,8 @@ __device__ void findContactsInCell(int3 targetCell, | |
// Distance between particle centers (Float4 -> Float3) | |
Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x, | |
- x_a.y - x_b.y, | |
- x_a.z - x_b.z); | |
+ x_a.y - x_b.y, | |
+ x_a.z - x_b.z); | |
// Adjust interparticle vector if periodic boundary/bounda… | |
// are crossed | |
t@@ -316,12 +322,13 @@ __device__ void findContactsInCell(int3 targetCell, | |
// Search for neighbors to particle 'idx' inside the 27 closest cells, | |
// and save the contact pairs in global memory. | |
// Function is called from mainGPU loop. | |
-__global__ void topology(unsigned int* dev_cellStart, | |
- unsigned int* dev_cellEnd, // Input: Particles in cell | |
- unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key | |
- Float4* dev_x_sorted, | |
- unsigned int* dev_contacts, | |
- Float4* dev_distmod) | |
+__global__ void topology( | |
+ const unsigned int* __restrict__ dev_cellStart, | |
+ const unsigned int* __restrict__ dev_cellEnd, | |
+ const unsigned int* __restrict__ dev_gridParticleIndex, | |
+ const Float4* __restrict__ dev_x_sorted, | |
+ unsigned int* __restrict__ dev_contacts, | |
+ Float4* __restrict__ dev_distmod) | |
{ | |
// Thread index equals index of particle A | |
unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x; | |
t@@ -349,10 +356,10 @@ __global__ void topology(unsigned int* dev_cellStart, | |
for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis | |
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim); | |
findContactsInCell(targetPos, idx_a, x_a, radius_a, | |
- dev_x_sorted, | |
- dev_cellStart, dev_cellEnd, | |
- dev_gridParticleIndex, | |
- &nc, dev_contacts, dev_distmod); | |
+ dev_x_sorted, | |
+ dev_cellStart, dev_cellEnd, | |
+ dev_gridParticleIndex, | |
+ &nc, dev_contacts, dev_distmod); | |
} | |
} | |
} | |
t@@ -372,28 +379,28 @@ __global__ void topology(unsigned int* dev_cellStart, | |
// Kernel is executed on device, and is callable from host only. | |
// Function is called from mainGPU loop. | |
__global__ void interact( | |
- unsigned int* dev_gridParticleIndex, // in | |
- unsigned int* dev_cellStart, // in | |
- unsigned int* dev_cellEnd, // in | |
- Float4* dev_x, // in | |
- Float4* dev_x_sorted, // in | |
- Float4* dev_vel_sorted, // in | |
- Float4* dev_angvel_sorted, // in | |
- Float4* dev_vel, // in | |
- Float4* dev_angvel, // in | |
- Float4* dev_force, // out | |
- Float4* dev_torque, // out | |
- Float* dev_es_dot, // out | |
- Float* dev_ev_dot, // out | |
- Float* dev_es, // out | |
- Float* dev_ev, // out | |
- Float* dev_p, // out | |
- Float4* dev_walls_nx, // in | |
- Float4* dev_walls_mvfd, // in | |
- Float* dev_walls_force_pp, // out | |
- unsigned int* dev_contacts, // out | |
- Float4* dev_distmod, // in | |
- Float4* dev_delta_t) // out | |
+ const unsigned int* __restrict__ dev_gridParticleIndex, // in | |
+ const unsigned int* __restrict__ dev_cellStart, // in | |
+ const unsigned int* __restrict__ dev_cellEnd, // in | |
+ const Float4* __restrict__ dev_x, // in | |
+ const Float4* __restrict__ dev_x_sorted, // in | |
+ const Float4* __restrict__ dev_vel_sorted, // in | |
+ const Float4* __restrict__ dev_angvel_sorted, // in | |
+ const Float4* __restrict__ dev_vel, // in | |
+ const Float4* __restrict__ dev_angvel, // in | |
+ Float4* __restrict__ dev_force, // out | |
+ Float4* __restrict__ dev_torque, // out | |
+ Float* __restrict__ dev_es_dot, // out | |
+ Float* __restrict__ dev_ev_dot, // out | |
+ Float* __restrict__ dev_es, // out | |
+ Float* __restrict__ dev_ev, // out | |
+ Float* __restrict__ dev_p, // out | |
+ const Float4* __restrict__ dev_walls_nx, // in | |
+ Float4* __restrict__ dev_walls_mvfd, // in | |
+ Float* __restrict__ dev_walls_force_pp, // out | |
+ unsigned int* __restrict__ dev_contacts, // out | |
+ const Float4* __restrict__ dev_distmod, // in | |
+ Float4* __restrict__ dev_delta_t) // out | |
{ | |
// Thread index equals index of particle A | |
unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x; | |
t@@ -407,11 +414,11 @@ __global__ void interact( | |
// Fetch world dimensions in constant memory read | |
Float3 origo = MAKE_FLOAT3(devC_grid.origo[0], | |
- devC_grid.origo[1], | |
- devC_grid.origo[2]); | |
+ devC_grid.origo[1], | |
+ devC_grid.origo[2]); | |
Float3 L = MAKE_FLOAT3(devC_grid.L[0], | |
- devC_grid.L[1], | |
- devC_grid.L[2]); | |
+ devC_grid.L[1], | |
+ devC_grid.L[2]); | |
// Fetch wall data in global read | |
Float4 w_0_nx, w_1_nx, w_2_nx, w_3_nx, w_4_nx; | |
t@@ -497,8 +504,8 @@ __global__ void interact( | |
// Inter-particle vector, corrected for periodic boundaries | |
x_ab = MAKE_FLOAT3(x_a.x - x_b.x + distmod.x, | |
- x_a.y - x_b.y + distmod.y, | |
- x_a.z - x_b.z + distmod.z); | |
+ x_a.y - x_b.y + distmod.y, | |
+ x_a.z - x_b.z + distmod.z); | |
x_ab_length = length(x_ab); | |
delta_n = x_ab_length - (radius_a + radius_b); | |
t@@ -507,28 +514,28 @@ __global__ void interact( | |
if (delta_n < 0.0) { | |
if (devC_params.contactmodel == 2) { | |
contactLinear(&F, &T, &es_dot, &ev_dot, &p, | |
- idx_a_orig, | |
- idx_b_orig, | |
- vel_a, | |
- dev_vel, | |
- angvel_a, | |
- dev_angvel, | |
- radius_a, radius_b, | |
- x_ab, x_ab_length, | |
- delta_n, dev_delta_t, | |
- mempos); | |
+ idx_a_orig, | |
+ idx_b_orig, | |
+ vel_a, | |
+ dev_vel, | |
+ angvel_a, | |
+ dev_angvel, | |
+ radius_a, radius_b, | |
+ x_ab, x_ab_length, | |
+ delta_n, dev_delta_t, | |
+ mempos); | |
} else if (devC_params.contactmodel == 3) { | |
contactHertz(&F, &T, &es_dot, &ev_dot, &p, | |
- idx_a_orig, | |
- idx_b_orig, | |
- vel_a, | |
- dev_vel, | |
- angvel_a, | |
- dev_angvel, | |
- radius_a, radius_b, | |
- x_ab, x_ab_length, | |
- delta_n, dev_delta_t, | |
- mempos); | |
+ idx_a_orig, | |
+ idx_b_orig, | |
+ vel_a, | |
+ dev_vel, | |
+ angvel_a, | |
+ dev_angvel, | |
+ radius_a, radius_b, | |
+ x_ab, x_ab_length, | |
+ delta_n, dev_delta_t, | |
+ mempos); | |
} | |
} else { | |
__syncthreads(); | |
t@@ -556,11 +563,11 @@ __global__ void interact( | |
// Calculate address in grid from position | |
gridPos.x = floor((x_a.x - devC_grid.origo[0]) | |
- / (devC_grid.L[0]/devC_grid.num[0])); | |
+ / (devC_grid.L[0]/devC_grid.num[0])); | |
gridPos.y = floor((x_a.y - devC_grid.origo[1]) | |
- / (devC_grid.L[1]/devC_grid.num[1])); | |
+ / (devC_grid.L[1]/devC_grid.num[1])); | |
gridPos.z = floor((x_a.z - devC_grid.origo[2]) | |
- / (devC_grid.L[2]/devC_grid.num[2])); | |
+ / (devC_grid.L[2]/devC_grid.num[2])); | |
// Find overlaps between particle no. idx and all particles | |
// from its own cell + 26 neighbor cells. | |
t@@ -570,12 +577,17 @@ __global__ void interact( | |
for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis | |
for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis | |
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim); | |
- findAndProcessContactsInCell(targetPos, idx_a, x_a, ra… | |
- &F, &T, &es_dot, &ev_dot, &p, | |
- dev_x_sorted, | |
- dev_vel_sorted, dev_angvel_sorted, | |
- dev_cellStart, dev_cellEnd, | |
- dev_walls_nx, dev_walls_mvfd); | |
+ findAndProcessContactsInCell(targetPos, idx_a, | |
+ x_a, radius_a, | |
+ &F, &T, &es_dot, | |
+ &ev_dot, &p, | |
+ dev_x_sorted, | |
+ dev_vel_sorted, | |
+ dev_angvel_sorted, | |
+ dev_cellStart, | |
+ dev_cellEnd, | |
+ dev_walls_nx, | |
+ dev_walls_mvfd); | |
} | |
} | |
} | |
t@@ -596,8 +608,8 @@ __global__ void interact( | |
w_n = MAKE_FLOAT3(w_0_nx.x, w_0_nx.y, w_0_nx.z); | |
if (delta_w < 0.0f) { | |
w_0_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a, | |
- radius_a, dev_vel_sorted, dev_angvel_sorted, w_n, delta_w, | |
- w_0_mvfd.y); | |
+ radius_a, dev_vel_sorted, dev_angve… | |
+ w_0_mvfd.y); | |
} | |
// Lower wall (force on wall not stored) | |
t@@ -605,8 +617,8 @@ __global__ void interact( | |
w_n = MAKE_FLOAT3(0.0f, 0.0f, 1.0f); | |
if (delta_w < 0.0f) { | |
(void)contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a, | |
- radius_a, dev_vel_sorted, dev_angvel_sorted, | |
- w_n, delta_w, 0.0f); | |
+ radius_a, dev_vel_sorted, dev_angvel_sort… | |
+ w_n, delta_w, 0.0f); | |
} | |
t@@ -617,8 +629,8 @@ __global__ void interact( | |
w_n = MAKE_FLOAT3(w_1_nx.x, w_1_nx.y, w_1_nx.z); | |
if (delta_w < 0.0f) { | |
w_1_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, | |
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_… | |
- delta_w, w_1_mvfd.y); | |
+ idx_a, radius_a, dev_vel_sorted… | |
+ delta_w, w_1_mvfd.y); | |
} | |
// Left wall (idx 2) | |
t@@ -626,8 +638,8 @@ __global__ void interact( | |
w_n = MAKE_FLOAT3(w_2_nx.x, w_2_nx.y, w_2_nx.z); | |
if (delta_w < 0.0f) { | |
w_2_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, | |
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_… | |
- delta_w, w_2_mvfd.y); | |
+ idx_a, radius_a, dev_vel_sorted… | |
+ delta_w, w_2_mvfd.y); | |
} | |
// Back wall (idx 3) | |
t@@ -635,8 +647,8 @@ __global__ void interact( | |
w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z); | |
if (delta_w < 0.0f) { | |
w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, | |
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_… | |
- delta_w, w_3_mvfd.y); | |
+ idx_a, radius_a, dev_vel_sorted… | |
+ delta_w, w_3_mvfd.y); | |
} | |
// Front wall (idx 4) | |
t@@ -644,8 +656,8 @@ __global__ void interact( | |
w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z); | |
if (delta_w < 0.0f) { | |
w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, | |
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_… | |
- delta_w, w_4_mvfd.y); | |
+ idx_a, radius_a, dev_vel_sorted… | |
+ delta_w, w_4_mvfd.y); | |
} | |
} else if (devC_grid.periodic == 2) { // right and left walls period… | |
t@@ -655,8 +667,8 @@ __global__ void interact( | |
w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z); | |
if (delta_w < 0.0f) { | |
w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, | |
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_… | |
- delta_w, w_3_mvfd.y); | |
+ idx_a, radius_a, dev_vel_sorted… | |
+ delta_w, w_3_mvfd.y); | |
} | |
// Front wall (idx 4) | |
t@@ -664,8 +676,8 @@ __global__ void interact( | |
w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z); | |
if (delta_w < 0.0f) { | |
w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, | |
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_… | |
- delta_w, w_4_mvfd.y); | |
+ idx_a, radius_a, dev_vel_sorted… | |
+ delta_w, w_4_mvfd.y); | |
} | |
} | |
diff --git a/src/integration.cuh b/src/integration.cuh | |
t@@ -11,15 +11,23 @@ | |
// Second order integration scheme based on Taylor expansion of particle kinem… | |
// Kernel executed on device, and callable from host only. | |
-__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Inp… | |
- Float4* dev_angvel_sorted, | |
- Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output | |
- Float4* dev_force, Float4* dev_torque, Float4* dev_angpos, // Input | |
- Float4* dev_acc, Float4* dev_angacc, | |
- Float4* dev_vel0, Float4* dev_angvel0, | |
- Float4* dev_xyzsum, | |
- unsigned int* dev_gridParticleIndex, // Input: Sorted-Unsorted key | |
- unsigned int iter) | |
+__global__ void integrate( | |
+ const Float4* __restrict__ dev_x_sorted, | |
+ const Float4* __restrict__ dev_vel_sorted, | |
+ const Float4* __restrict__ dev_angvel_sorted, | |
+ Float4* __restrict__ dev_x, | |
+ Float4* __restrict__ dev_vel, | |
+ Float4* __restrict__ dev_angvel, | |
+ const Float4* __restrict__ dev_force, | |
+ const Float4* __restrict__ dev_torque, | |
+ Float4* __restrict__ dev_angpos, | |
+ Float4* __restrict__ dev_acc, | |
+ Float4* __restrict__ dev_angacc, | |
+ Float4* __restrict__ dev_vel0, | |
+ Float4* __restrict__ dev_angvel0, | |
+ Float4* __restrict__ dev_xyzsum, | |
+ const unsigned int* __restrict__ dev_gridParticleIndex, | |
+ const unsigned int iter) | |
{ | |
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id | |
t@@ -59,13 +67,13 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* de… | |
// Coherent read from constant memory to registers | |
const Float dt = devC_dt; | |
const Float3 origo = MAKE_FLOAT3( | |
- devC_grid.origo[0], | |
- devC_grid.origo[1], | |
- devC_grid.origo[2]); | |
+ devC_grid.origo[0], | |
+ devC_grid.origo[1], | |
+ devC_grid.origo[2]); | |
const Float3 L = MAKE_FLOAT3( | |
- devC_grid.L[0], | |
- devC_grid.L[1], | |
- devC_grid.L[2]); | |
+ devC_grid.L[0], | |
+ devC_grid.L[1], | |
+ devC_grid.L[2]); | |
// Particle mass | |
Float m = 4.0/3.0 * PI * radius*radius*radius * devC_params.rho; | |
t@@ -179,14 +187,14 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* … | |
// Truncation error O(dt^4) for positions, O(dt^3) for velocities | |
// Approximate acceleration change by backwards difference: | |
const Float3 dacc_dt = MAKE_FLOAT3( | |
- (acc.x - acc0.x)/dt, | |
- (acc.y - acc0.y)/dt, | |
- (acc.z - acc0.z)/dt); | |
+ (acc.x - acc0.x)/dt, | |
+ (acc.y - acc0.y)/dt, | |
+ (acc.z - acc0.z)/dt); | |
const Float3 dangacc_dt = MAKE_FLOAT3( | |
- (angacc.x - angacc0.x)/dt, | |
- (angacc.y - angacc0.y)/dt, | |
- (angacc.z - angacc0.z)/dt); | |
+ (angacc.x - angacc0.x)/dt, | |
+ (angacc.y - angacc0.y)/dt, | |
+ (angacc.z - angacc0.z)/dt); | |
x_new.x = x.x + vel.x*dt + 0.5*acc.x*dt*dt + 1.0/6.0*dacc_dt.x*dt*dt*d… | |
x_new.y = x.y + vel.y*dt + 0.5*acc.y*dt*dt + 1.0/6.0*dacc_dt.y*dt*dt*d… | |
t@@ -253,7 +261,9 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* de… | |
// Reduce wall force contributions from particles to a single value per wall | |
-__global__ void summation(Float* in, Float *out) | |
+__global__ void summation( | |
+ const Float* __restrict__ in, | |
+ Float* __restrict__ out) | |
{ | |
__shared__ Float cache[256]; | |
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; | |
t@@ -287,14 +297,14 @@ __global__ void summation(Float* in, Float *out) | |
// Update wall positions | |
__global__ void integrateWalls( | |
- Float4* dev_walls_nx, | |
- Float4* dev_walls_mvfd, | |
- int* dev_walls_wmode, | |
- Float* dev_walls_force_partial, | |
- Float* dev_walls_acc, | |
- unsigned int blocksPerGrid, | |
- Float t_current, | |
- unsigned int iter) | |
+ Float4* __restrict__ dev_walls_nx, | |
+ Float4* __restrict__ dev_walls_mvfd, | |
+ const int* __restrict__ dev_walls_wmode, | |
+ const Float* __restrict__ dev_walls_force_partial, | |
+ Float* __restrict__ dev_walls_acc, | |
+ const unsigned int blocksPerGrid, | |
+ const Float t_current, | |
+ const unsigned int iter) | |
{ | |
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id | |
diff --git a/src/navierstokes.cuh b/src/navierstokes.cuh | |
t@@ -25,34 +25,34 @@ __inline__ __device__ Float hmean(Float a, Float b) { | |
// Helper functions for checking whether a value is NaN or Inf | |
__device__ int checkFiniteFloat( | |
- const char* desc, | |
- const unsigned int x, | |
- const unsigned int y, | |
- const unsigned int z, | |
- const Float s) | |
+ const char* desc, | |
+ const unsigned int x, | |
+ const unsigned int y, | |
+ const unsigned int z, | |
+ const Float s) | |
{ | |
- __syncthreads(); | |
- if (!isfinite(s)) { | |
- printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s); | |
- return 1; | |
- } | |
- return 0; | |
+ __syncthreads(); | |
+ if (!isfinite(s)) { | |
+ printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s); | |
+ return 1; | |
+ } | |
+ return 0; | |
} | |
__device__ int checkFiniteFloat3( | |
- const char* desc, | |
- const unsigned int x, | |
- const unsigned int y, | |
- const unsigned int z, | |
- const Float3 v) | |
+ const char* desc, | |
+ const unsigned int x, | |
+ const unsigned int y, | |
+ const unsigned int z, | |
+ const Float3 v) | |
{ | |
- __syncthreads(); | |
- if (!isfinite(v.x) || !isfinite(v.y) || !isfinite(v.z)) { | |
- printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n", | |
- x, y, z, desc, v.x, v.y, v.z); | |
- return 1; | |
- } | |
- return 0; | |
+ __syncthreads(); | |
+ if (!isfinite(v.x) || !isfinite(v.y) || !isfinite(v.z)) { | |
+ printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n", | |
+ x, y, z, desc, v.x, v.y, v.z); | |
+ return 1; | |
+ } | |
+ return 0; | |
} | |
// Initialize memory | |
t@@ -143,10 +143,10 @@ void DEM::freeNSmemDev() | |
void DEM::transferNStoGlobalDeviceMemory(int statusmsg) | |
{ | |
checkForCudaErrors("Before attempting cudaMemcpy in " | |
- "transferNStoGlobalDeviceMemory"); | |
+ "transferNStoGlobalDeviceMemory"); | |
//if (verbose == 1 && statusmsg == 1) | |
- //std::cout << " Transfering fluid data to the device: "; | |
+ //std::cout << " Transfering fluid data to the device: "; | |
// memory size for a scalar field | |
unsigned int memSizeF = sizeof(Float)*NScells(); | |
t@@ -162,7 +162,7 @@ void DEM::transferNStoGlobalDeviceMemory(int statusmsg) | |
checkForCudaErrors("End of transferNStoGlobalDeviceMemory"); | |
//if (verbose == 1 && statusmsg == 1) | |
- //std::cout << "Done" << std::endl; | |
+ //std::cout << "Done" << std::endl; | |
} | |
// Transfer from device | |
t@@ -194,7 +194,7 @@ void DEM::transferNSfromGlobalDeviceMemory(int statusmsg) | |
void DEM::transferNSnormFromGlobalDeviceMemory() | |
{ | |
cudaMemcpy(ns.norm, dev_ns_norm, sizeof(Float)*NScells(), | |
- cudaMemcpyDeviceToHost); | |
+ cudaMemcpyDeviceToHost); | |
checkForCudaErrors("End of transferNSnormFromGlobalDeviceMemory"); | |
} | |
t@@ -202,7 +202,7 @@ void DEM::transferNSnormFromGlobalDeviceMemory() | |
void DEM::transferNSepsilonFromGlobalDeviceMemory() | |
{ | |
cudaMemcpy(ns.epsilon, dev_ns_epsilon, sizeof(Float)*NScells(), | |
- cudaMemcpyDeviceToHost); | |
+ cudaMemcpyDeviceToHost); | |
checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory"); | |
} | |
t@@ -210,13 +210,13 @@ void DEM::transferNSepsilonFromGlobalDeviceMemory() | |
void DEM::transferNSepsilonNewFromGlobalDeviceMemory() | |
{ | |
cudaMemcpy(ns.epsilon_new, dev_ns_epsilon_new, sizeof(Float)*NScells(), | |
- cudaMemcpyDeviceToHost); | |
+ cudaMemcpyDeviceToHost); | |
checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory"); | |
} | |
// Get linear index from 3D grid position | |
__inline__ __device__ unsigned int idx( | |
- const int x, const int y, const int z) | |
+ const int x, const int y, const int z) | |
{ | |
// without ghost nodes | |
//return x + dev_grid.num[0]*y + dev_grid.num[0]*dev_grid.num[1]*z; | |
t@@ -229,11 +229,11 @@ __inline__ __device__ unsigned int idx( | |
// Get linear index of velocity node from 3D grid position in staggered grid | |
__inline__ __device__ unsigned int vidx( | |
- const int x, const int y, const int z) | |
+ const int x, const int y, const int z) | |
{ | |
// without ghost nodes | |
//return x + (devC_grid.num[0]+1)*y | |
- //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z; | |
+ //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z; | |
// with ghost nodes | |
// the ghost nodes are placed at x,y,z = -1 and WIDTH+1 | |
t@@ -246,10 +246,10 @@ __inline__ __device__ unsigned int vidx( | |
// dev_ns_v or dev_ns_v_p array. This function does not set the averaged | |
// velocity values in the ghost node cells. | |
__global__ void findNSavgVel( | |
- Float3* dev_ns_v, // out | |
- Float* dev_ns_v_x, // in | |
- Float* dev_ns_v_y, // in | |
- Float* dev_ns_v_z) // in | |
+ Float3* __restrict__ dev_ns_v, // out | |
+ const Float* __restrict__ dev_ns_v_x, // in | |
+ const Float* __restrict__ dev_ns_v_y, // in | |
+ const Float* __restrict__ dev_ns_v_z) // in | |
{ | |
// 3D thread index | |
t@@ -272,9 +272,9 @@ __global__ void findNSavgVel( | |
// Find average velocity using arithmetic means | |
const Float3 v_bar = MAKE_FLOAT3( | |
- amean(v_xn, v_xp), | |
- amean(v_yn, v_yp), | |
- amean(v_zn, v_zp)); | |
+ amean(v_xn, v_xp), | |
+ amean(v_yn, v_yp), | |
+ amean(v_zn, v_zp)); | |
// Save value | |
__syncthreads(); | |
t@@ -287,10 +287,10 @@ __global__ void findNSavgVel( | |
// or dev_ns_v_p array. Make sure that the averaged velocity ghost nodes are s… | |
// beforehand. | |
__global__ void findNScellFaceVel( | |
- Float3* dev_ns_v, // in | |
- Float* dev_ns_v_x, // out | |
- Float* dev_ns_v_y, // out | |
- Float* dev_ns_v_z) // out | |
+ const Float3* __restrict__ dev_ns_v, // in | |
+ Float* __restrict__ dev_ns_v_x, // out | |
+ Float* __restrict__ dev_ns_v_y, // out | |
+ Float* __restrict__ dev_ns_v_z) // out | |
{ | |
// 3D thread index | |
t@@ -340,7 +340,9 @@ __global__ void findNScellFaceVel( | |
// Set the initial guess of the values of epsilon. | |
-__global__ void setNSepsilonInterior(Float* dev_ns_epsilon, Float value) | |
+__global__ void setNSepsilonInterior( | |
+ Float* __restrict__ dev_ns_epsilon, | |
+ const Float value) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -349,7 +351,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon… | |
// check that we are not outside the fluid grid | |
if (x < devC_grid.num[0] && y < devC_grid.num[1] && | |
- z > 0 && z < devC_grid.num[2]-1) { | |
+ z > 0 && z < devC_grid.num[2]-1) { | |
__syncthreads(); | |
const unsigned int cellidx = idx(x,y,z); | |
dev_ns_epsilon[cellidx] = value; | |
t@@ -358,7 +360,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon… | |
// The normalized residuals are given an initial value of 0, since the values … | |
// the Dirichlet boundaries aren't written during the iterations. | |
-__global__ void setNSnormZero(Float* dev_ns_norm) | |
+__global__ void setNSnormZero(Float* __restrict__ dev_ns_norm) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -380,9 +382,9 @@ __global__ void setNSnormZero(Float* dev_ns_norm) | |
// the Dirichlet boundary condition: the new value should be identical to the | |
// old value, i.e. the temporal gradient is 0 | |
__global__ void setNSepsilonBottom( | |
- Float* dev_ns_epsilon, | |
- Float* dev_ns_epsilon_new, | |
- const Float value) | |
+ Float* __restrict__ dev_ns_epsilon, | |
+ Float* __restrict__ dev_ns_epsilon_new, | |
+ const Float value) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -408,9 +410,9 @@ __global__ void setNSepsilonBottom( | |
// the Dirichlet boundary condition: the new value should be identical to the | |
// old value, i.e. the temporal gradient is 0 | |
__global__ void setNSepsilonTop( | |
- Float* dev_ns_epsilon, | |
- Float* dev_ns_epsilon_new, | |
- const Float value) | |
+ Float* __restrict__ dev_ns_epsilon, | |
+ Float* __restrict__ dev_ns_epsilon_new, | |
+ const Float value) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -419,7 +421,7 @@ __global__ void setNSepsilonTop( | |
// check that we are not outside the fluid grid, and at the upper z bounda… | |
if (x < devC_grid.num[0] && y < devC_grid.num[1] && | |
- z == devC_grid.num[2]-1) { | |
+ z == devC_grid.num[2]-1) { | |
__syncthreads(); | |
const unsigned int cellidx = idx(x,y,z); | |
t@@ -428,11 +430,14 @@ __global__ void setNSepsilonTop( | |
} | |
} | |
__device__ void copyNSvalsDev( | |
- unsigned int read, unsigned int write, | |
- Float* dev_ns_p, | |
- Float3* dev_ns_v, Float3* dev_ns_v_p, | |
- Float* dev_ns_phi, Float* dev_ns_dphi, | |
- Float* dev_ns_epsilon) | |
+ const unsigned int read, | |
+ const unsigned int write, | |
+ Float* __restrict__ dev_ns_p, | |
+ Float3* __restrict__ dev_ns_v, | |
+ Float3* __restrict__ dev_ns_v_p, | |
+ Float* __restrict__ dev_ns_phi, | |
+ Float* __restrict__ dev_ns_dphi, | |
+ Float* __restrict__ dev_ns_epsilon) | |
{ | |
// Coalesced read | |
const Float p = dev_ns_p[read]; | |
t@@ -457,10 +462,12 @@ __device__ void copyNSvalsDev( | |
// are not written since they are not read. Launch this kernel for all cells in | |
// the grid | |
__global__ void setNSghostNodesDev( | |
- Float* dev_ns_p, | |
- Float3* dev_ns_v, Float3* dev_ns_v_p, | |
- Float* dev_ns_phi, Float* dev_ns_dphi, | |
- Float* dev_ns_epsilon) | |
+ Float* __restrict__ dev_ns_p, | |
+ Float3* __restrict__ dev_ns_v, | |
+ Float3* __restrict__ dev_ns_v_p, | |
+ Float* __restrict__ dev_ns_phi, | |
+ Float* __restrict__ dev_ns_dphi, | |
+ Float* __restrict__ dev_ns_epsilon) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -484,72 +491,72 @@ __global__ void setNSghostNodesDev( | |
if (x == 0) { | |
writeidx = idx(nx,y,z); | |
copyNSvalsDev(cellidx, writeidx, | |
- dev_ns_p, | |
- dev_ns_v, dev_ns_v_p, | |
- dev_ns_phi, dev_ns_dphi, | |
- dev_ns_epsilon); | |
+ dev_ns_p, | |
+ dev_ns_v, dev_ns_v_p, | |
+ dev_ns_phi, dev_ns_dphi, | |
+ dev_ns_epsilon); | |
} | |
if (x == nx-1) { | |
writeidx = idx(-1,y,z); | |
copyNSvalsDev(cellidx, writeidx, | |
- dev_ns_p, | |
- dev_ns_v, dev_ns_v_p, | |
- dev_ns_phi, dev_ns_dphi, | |
- dev_ns_epsilon); | |
+ dev_ns_p, | |
+ dev_ns_v, dev_ns_v_p, | |
+ dev_ns_phi, dev_ns_dphi, | |
+ dev_ns_epsilon); | |
} | |
if (y == 0) { | |
writeidx = idx(x,ny,z); | |
copyNSvalsDev(cellidx, writeidx, | |
- dev_ns_p, | |
- dev_ns_v, dev_ns_v_p, | |
- dev_ns_phi, dev_ns_dphi, | |
- dev_ns_epsilon); | |
+ dev_ns_p, | |
+ dev_ns_v, dev_ns_v_p, | |
+ dev_ns_phi, dev_ns_dphi, | |
+ dev_ns_epsilon); | |
} | |
if (y == ny-1) { | |
writeidx = idx(x,-1,z); | |
copyNSvalsDev(cellidx, writeidx, | |
- dev_ns_p, | |
- dev_ns_v, dev_ns_v_p, | |
- dev_ns_phi, dev_ns_dphi, | |
- dev_ns_epsilon); | |
+ dev_ns_p, | |
+ dev_ns_v, dev_ns_v_p, | |
+ dev_ns_phi, dev_ns_dphi, | |
+ dev_ns_epsilon); | |
} | |
// Z boundaries fixed | |
if (z == 0) { | |
writeidx = idx(x,y,-1); | |
copyNSvalsDev(cellidx, writeidx, | |
- dev_ns_p, | |
- dev_ns_v, dev_ns_v_p, | |
- dev_ns_phi, dev_ns_dphi, | |
- dev_ns_epsilon); | |
+ dev_ns_p, | |
+ dev_ns_v, dev_ns_v_p, | |
+ dev_ns_phi, dev_ns_dphi, | |
+ dev_ns_epsilon); | |
} | |
if (z == nz-1) { | |
writeidx = idx(x,y,nz); | |
copyNSvalsDev(cellidx, writeidx, | |
- dev_ns_p, | |
- dev_ns_v, dev_ns_v_p, | |
- dev_ns_phi, dev_ns_dphi, | |
- dev_ns_epsilon); | |
+ dev_ns_p, | |
+ dev_ns_v, dev_ns_v_p, | |
+ dev_ns_phi, dev_ns_dphi, | |
+ dev_ns_epsilon); | |
} | |
// Z boundaries periodic | |
/*if (z == 0) { | |
- writeidx = idx(x,y,nz); | |
- copyNSvalsDev(cellidx, writeidx, | |
- dev_ns_p, | |
- dev_ns_v, dev_ns_v_p, | |
- dev_ns_phi, dev_ns_dphi, | |
- dev_ns_epsilon); | |
- } | |
- if (z == nz-1) { | |
- writeidx = idx(x,y,-1); | |
- copyNSvalsDev(cellidx, writeidx, | |
- dev_ns_p, | |
- dev_ns_v, dev_ns_v_p, | |
- dev_ns_phi, dev_ns_dphi, | |
- dev_ns_epsilon); | |
- }*/ | |
+ writeidx = idx(x,y,nz); | |
+ copyNSvalsDev(cellidx, writeidx, | |
+ dev_ns_p, | |
+ dev_ns_v, dev_ns_v_p, | |
+ dev_ns_phi, dev_ns_dphi, | |
+ dev_ns_epsilon); | |
+ } | |
+ if (z == nz-1) { | |
+ writeidx = idx(x,y,-1); | |
+ copyNSvalsDev(cellidx, writeidx, | |
+ dev_ns_p, | |
+ dev_ns_v, dev_ns_v_p, | |
+ dev_ns_phi, dev_ns_dphi, | |
+ dev_ns_epsilon); | |
+ }*/ | |
} | |
} | |
t@@ -557,7 +564,7 @@ __global__ void setNSghostNodesDev( | |
// (diagonal) cells are not written since they are not read. Launch this kernel | |
// for all cells in the grid usind setNSghostNodes<datatype><<<.. , ..>>>( .. … | |
template<typename T> | |
-__global__ void setNSghostNodes(T* dev_scalarfield) | |
+__global__ void setNSghostNodes(T* __restrict__ dev_scalarfield) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -586,10 +593,10 @@ __global__ void setNSghostNodes(T* dev_scalarfield) | |
if (z == 0) | |
dev_scalarfield[idx(x,y,-1)] = val; // Dirichlet | |
- //dev_scalarfield[idx(x,y,nz)] = val; // Periodic -z | |
+ //dev_scalarfield[idx(x,y,nz)] = val; // Periodic -z | |
if (z == nz-1) | |
dev_scalarfield[idx(x,y,nz)] = val; // Dirichlet | |
- //dev_scalarfield[idx(x,y,-1)] = val; // Periodic +z | |
+ //dev_scalarfield[idx(x,y,-1)] = val; // Periodic +z | |
} | |
} | |
t@@ -597,9 +604,9 @@ __global__ void setNSghostNodes(T* dev_scalarfield) | |
// (diagonal) cells are not written since they are not read. | |
template<typename T> | |
__global__ void setNSghostNodes( | |
- T* dev_scalarfield, | |
- int bc_bot, | |
- int bc_top) | |
+ T* __restrict__ dev_scalarfield, | |
+ const int bc_bot, | |
+ const int bc_top) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -651,11 +658,11 @@ __global__ void setNSghostNodes( | |
// According to Griebel et al. 1998 "Numerical Simulation in Fluid Dynamics" | |
template<typename T> | |
__global__ void setNSghostNodesFace( | |
- T* dev_scalarfield_x, | |
- T* dev_scalarfield_y, | |
- T* dev_scalarfield_z, | |
- int bc_bot, | |
- int bc_top) | |
+ T* __restrict__ dev_scalarfield_x, | |
+ T* __restrict__ dev_scalarfield_y, | |
+ T* __restrict__ dev_scalarfield_z, | |
+ const int bc_bot, | |
+ const int bc_top) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -849,9 +856,9 @@ __global__ void setNSghostNodesFace( | |
// The edge (diagonal) cells are not written since they are not read. Launch | |
// this kernel for all cells in the grid. | |
__global__ void setNSghostNodes_tau( | |
- Float* dev_ns_tau, | |
- int bc_bot, | |
- int bc_top) | |
+ Float* __restrict__ dev_ns_tau, | |
+ const int bc_bot, | |
+ const int bc_top) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -981,108 +988,108 @@ __global__ void setNSghostNodes_tau( | |
// The edge (diagonal) cells are not written since they are not read. Launch | |
// this kernel for all cells in the grid. | |
/* | |
-__global__ void setNSghostNodesForcing( | |
- Float* dev_ns_f1, | |
- Float3* dev_ns_f2, | |
- Float* dev_ns_f, | |
- unsigned int nijac) | |
- | |
-{ | |
- // 3D thread index | |
- const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
- const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y; | |
- const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z; | |
- | |
- // Grid dimensions | |
- const unsigned int nx = devC_grid.num[0]; | |
- const unsigned int ny = devC_grid.num[1]; | |
- const unsigned int nz = devC_grid.num[2]; | |
- | |
- // 1D thread index | |
- unsigned int cellidx = idx(x,y,z); | |
- | |
- // check that we are not outside the fluid grid | |
- if (x < nx && y < ny && z < nz) { | |
- | |
- __syncthreads(); | |
- const Float f = dev_ns_f[cellidx]; | |
- Float f1; | |
- Float3 f2; | |
- | |
- if (nijac == 0) { | |
- __syncthreads(); | |
- f1 = dev_ns_f1[cellidx]; | |
- f2 = dev_ns_f2[cellidx]; | |
- } | |
- | |
- if (x == 0) { | |
- cellidx = idx(nx,y,z); | |
- dev_ns_f[cellidx] = f; | |
- if (nijac == 0) { | |
- dev_ns_f1[cellidx] = f1; | |
- dev_ns_f2[cellidx] = f2; | |
- } | |
- } | |
- if (x == nx-1) { | |
- cellidx = idx(-1,y,z); | |
- dev_ns_f[cellidx] = f; | |
- if (nijac == 0) { | |
- dev_ns_f1[cellidx] = f1; | |
- dev_ns_f2[cellidx] = f2; | |
- } | |
- } | |
- | |
- if (y == 0) { | |
- cellidx = idx(x,ny,z); | |
- dev_ns_f[cellidx] = f; | |
- if (nijac == 0) { | |
- dev_ns_f1[cellidx] = f1; | |
- dev_ns_f2[cellidx] = f2; | |
- } | |
- } | |
- if (y == ny-1) { | |
- cellidx = idx(x,-1,z); | |
- dev_ns_f[cellidx] = f; | |
- if (nijac == 0) { | |
- dev_ns_f1[cellidx] = f1; | |
- dev_ns_f2[cellidx] = f2; | |
- } | |
- } | |
- | |
- if (z == 0) { | |
- cellidx = idx(x,y,nz); | |
- dev_ns_f[cellidx] = f; | |
- if (nijac == 0) { | |
- dev_ns_f1[cellidx] = f1; | |
- dev_ns_f2[cellidx] = f2; | |
- } | |
- } | |
- if (z == nz-1) { | |
- cellidx = idx(x,y,-1); | |
- dev_ns_f[cellidx] = f; | |
- if (nijac == 0) { | |
- dev_ns_f1[cellidx] = f1; | |
- dev_ns_f2[cellidx] = f2; | |
- } | |
- } | |
- } | |
-} | |
+ __global__ void setNSghostNodesForcing( | |
+ Float* dev_ns_f1, | |
+ Float3* dev_ns_f2, | |
+ Float* dev_ns_f, | |
+ unsigned int nijac) | |
+ | |
+ { | |
+ // 3D thread index | |
+ const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
+ const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y; | |
+ const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z; | |
+ | |
+ // Grid dimensions | |
+ const unsigned int nx = devC_grid.num[0]; | |
+ const unsigned int ny = devC_grid.num[1]; | |
+ const unsigned int nz = devC_grid.num[2]; | |
+ | |
+ // 1D thread index | |
+ unsigned int cellidx = idx(x,y,z); | |
+ | |
+ // check that we are not outside the fluid grid | |
+ if (x < nx && y < ny && z < nz) { | |
+ | |
+ __syncthreads(); | |
+ const Float f = dev_ns_f[cellidx]; | |
+ Float f1; | |
+ Float3 f2; | |
+ | |
+ if (nijac == 0) { | |
+ __syncthreads(); | |
+ f1 = dev_ns_f1[cellidx]; | |
+ f2 = dev_ns_f2[cellidx]; | |
+ } | |
+ | |
+ if (x == 0) { | |
+ cellidx = idx(nx,y,z); | |
+ dev_ns_f[cellidx] = f; | |
+ if (nijac == 0) { | |
+ dev_ns_f1[cellidx] = f1; | |
+ dev_ns_f2[cellidx] = f2; | |
+ } | |
+ } | |
+ if (x == nx-1) { | |
+ cellidx = idx(-1,y,z); | |
+ dev_ns_f[cellidx] = f; | |
+ if (nijac == 0) { | |
+ dev_ns_f1[cellidx] = f1; | |
+ dev_ns_f2[cellidx] = f2; | |
+ } | |
+ } | |
+ | |
+ if (y == 0) { | |
+ cellidx = idx(x,ny,z); | |
+ dev_ns_f[cellidx] = f; | |
+ if (nijac == 0) { | |
+ dev_ns_f1[cellidx] = f1; | |
+ dev_ns_f2[cellidx] = f2; | |
+ } | |
+ } | |
+ if (y == ny-1) { | |
+ cellidx = idx(x,-1,z); | |
+ dev_ns_f[cellidx] = f; | |
+ if (nijac == 0) { | |
+ dev_ns_f1[cellidx] = f1; | |
+ dev_ns_f2[cellidx] = f2; | |
+ } | |
+ } | |
+ | |
+ if (z == 0) { | |
+ cellidx = idx(x,y,nz); | |
+ dev_ns_f[cellidx] = f; | |
+ if (nijac == 0) { | |
+ dev_ns_f1[cellidx] = f1; | |
+ dev_ns_f2[cellidx] = f2; | |
+ } | |
+ } | |
+ if (z == nz-1) { | |
+ cellidx = idx(x,y,-1); | |
+ dev_ns_f[cellidx] = f; | |
+ if (nijac == 0) { | |
+ dev_ns_f1[cellidx] = f1; | |
+ dev_ns_f2[cellidx] = f2; | |
+ } | |
+ } | |
+ } | |
+ } | |
*/ | |
// Find the porosity in each cell on the base of a sphere, centered at the cell | |
// center. | |
__global__ void findPorositiesVelocitiesDiametersSpherical( | |
- const unsigned int* dev_cellStart, | |
- const unsigned int* dev_cellEnd, | |
- const Float4* dev_x_sorted, | |
- const Float4* dev_vel_sorted, | |
- Float* dev_ns_phi, | |
- Float* dev_ns_dphi, | |
- Float3* dev_ns_vp_avg, | |
- Float* dev_ns_d_avg, | |
- const unsigned int iteration, | |
- const unsigned int np, | |
- const Float c_phi) | |
+ const unsigned int* __restrict__ dev_cellStart, | |
+ const unsigned int* __restrict__ dev_cellEnd, | |
+ const Float4* __restrict__ dev_x_sorted, | |
+ const Float4* __restrict__ dev_vel_sorted, | |
+ Float* __restrict__ dev_ns_phi, | |
+ Float* __restrict__ dev_ns_dphi, | |
+ Float3* __restrict__ dev_ns_vp_avg, | |
+ Float* __restrict__ dev_ns_d_avg, | |
+ const unsigned int iteration, | |
+ const unsigned int np, | |
+ const Float c_phi) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -1114,9 +1121,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric… | |
// Cell sphere center position | |
const Float3 X = MAKE_FLOAT3( | |
- x*dx + 0.5*dx, | |
- y*dy + 0.5*dy, | |
- z*dz + 0.5*dz); | |
+ x*dx + 0.5*dx, | |
+ y*dy + 0.5*dy, | |
+ z*dz + 0.5*dz); | |
Float d, r; | |
Float phi = 1.00; | |
t@@ -1143,12 +1150,12 @@ __global__ void findPorositiesVelocitiesDiametersSpher… | |
// Iterate over 27 neighbor cells, R = cell width | |
/*for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis | |
- for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis | |
- for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/ | |
+ for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis | |
+ for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/ | |
// Iterate over 27 neighbor cells, R = 2*cell width | |
for (int z_dim=-2; z_dim<3; ++z_dim) { // z-axis | |
- //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis | |
+ //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis | |
for (int y_dim=-2; y_dim<3; ++y_dim) { // y-axis | |
for (int x_dim=-2; x_dim<3; ++x_dim) { // x-axis | |
t@@ -1186,9 +1193,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric… | |
// Find center distance | |
dist = MAKE_FLOAT3( | |
- X.x - xr.x, | |
- X.y - xr.y, | |
- X.z - xr.z); | |
+ X.x - xr.x, | |
+ X.y - xr.y, | |
+ X.z - xr.z); | |
dist += distmod; | |
d = length(dist); | |
t@@ -1196,10 +1203,10 @@ __global__ void findPorositiesVelocitiesDiametersSpher… | |
if ((R - r) < d && d < (R + r)) { | |
void_volume -= | |
1.0/(12.0*d) * ( | |
- M_PI*(R + r - d)*(R + r - … | |
- *(d*d + 2.0*d*r - 3.0*r*r | |
- + 2.0*d*R + 6.0*r*R | |
- - 3.0*R*R) ); | |
+ M_PI*(R + r - d)*(R + r - d) | |
+ *(d*d + 2.0*d*r - 3.0*r*r | |
+ + 2.0*d*R + 6.0*r*R | |
+ - 3.0*R*R) ); | |
v_avg += MAKE_FLOAT3(v.x, v.y, v.z); | |
d_avg += 2.0*r; | |
n++; | |
t@@ -1259,8 +1266,8 @@ __global__ void findPorositiesVelocitiesDiametersSpheric… | |
//Float phi = 0.5; | |
//Float dphi = 0.0; | |
//if (iteration == 20 && x == nx/2 && y == ny/2 && z == nz/2) { | |
- //phi = 0.4; | |
- //dphi = 0.1; | |
+ //phi = 0.4; | |
+ //dphi = 0.1; | |
//} | |
//dev_ns_phi[cellidx] = phi; | |
//dev_ns_dphi[cellidx] = dphi; | |
t@@ -1276,17 +1283,17 @@ __global__ void findPorositiesVelocitiesDiametersSpher… | |
// Find the porosity in each cell on the base of a sphere, centered at the cell | |
// center. | |
__global__ void findPorositiesVelocitiesDiametersSphericalGradient( | |
- const unsigned int* dev_cellStart, | |
- const unsigned int* dev_cellEnd, | |
- const Float4* dev_x_sorted, | |
- const Float4* dev_vel_sorted, | |
- Float* dev_ns_phi, | |
- Float* dev_ns_dphi, | |
- Float3* dev_ns_vp_avg, | |
- Float* dev_ns_d_avg, | |
- const unsigned int iteration, | |
- const unsigned int ndem, | |
- const unsigned int np) | |
+ const unsigned int* __restrict__ dev_cellStart, | |
+ const unsigned int* __restrict__ dev_cellEnd, | |
+ const Float4* __restrict__ dev_x_sorted, | |
+ const Float4* __restrict__ dev_vel_sorted, | |
+ Float* __restrict__ dev_ns_phi, | |
+ Float* __restrict__ dev_ns_dphi, | |
+ Float3* __restrict__ dev_ns_vp_avg, | |
+ Float* __restrict__ dev_ns_d_avg, | |
+ const unsigned int iteration, | |
+ const unsigned int ndem, | |
+ const unsigned int np) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -1304,7 +1311,7 @@ __global__ void findPorositiesVelocitiesDiametersSpheric… | |
const Float dz = devC_grid.L[2]/nz; | |
// Cell sphere radius | |
- const Float R = fmin(dx, fmin(dy,dz)); // diameter = 2*cell width | |
+ const Float R = fmin(dx, fmin(dy,dz)); // diameter = 2*cell width | |
Float4 xr; // particle pos. and radius | |
t@@ -1315,9 +1322,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric… | |
// Cell sphere center position | |
const Float3 X = MAKE_FLOAT3( | |
- x*dx + 0.5*dx, | |
- y*dy + 0.5*dy, | |
- z*dz + 0.5*dz); | |
+ x*dx + 0.5*dx, | |
+ y*dy + 0.5*dy, | |
+ z*dz + 0.5*dz); | |
Float d, r; | |
Float phi = 1.00; | |
t@@ -1396,9 +1403,9 @@ __global__ void findPorositiesVelocitiesDiametersSpheric… | |
// Find center distance and normal vector | |
x_p = MAKE_FLOAT3( | |
- xr.x - X.x, | |
- xr.y - X.y, | |
- xr.z - X.z); | |
+ xr.x - X.x, | |
+ xr.y - X.y, | |
+ xr.z - X.z); | |
d = length(x_p); | |
n_p = x_p/d; | |
q = d/R; | |
t@@ -1438,8 +1445,8 @@ __global__ void findPorositiesVelocitiesDiametersSpheric… | |
phi = phi_0 + dphi/(ndem*devC_dt); | |
//if (dot_epsilon_kk != 0.0) | |
- //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n… | |
- //x,y,z, dot_epsilon_kk, dphi, phi); | |
+ //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n", | |
+ //x,y,z, dot_epsilon_kk, dphi, phi); | |
// Make sure that the porosity is in the interval [0.0;1.0] | |
phi = fmin(1.00, fmax(0.00, phi)); | |
t@@ -1484,11 +1491,11 @@ __global__ void findPorositiesVelocitiesDiametersSpher… | |
// Modulate the hydraulic pressure at the upper boundary | |
__global__ void setUpperPressureNS( | |
- Float* dev_ns_p, | |
- Float* dev_ns_epsilon, | |
- Float* dev_ns_epsilon_new, | |
- Float beta, | |
- const Float new_pressure) | |
+ Float* __restrict__ dev_ns_p, | |
+ Float* __restrict__ dev_ns_epsilon, | |
+ Float* __restrict__ dev_ns_epsilon_new, | |
+ const Float beta, | |
+ const Float new_pressure) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -1497,8 +1504,8 @@ __global__ void setUpperPressureNS( | |
// check that the thread is located at the top boundary | |
if (x < devC_grid.num[0] && | |
- y < devC_grid.num[1] && | |
- z == devC_grid.num[2]-1) { | |
+ y < devC_grid.num[1] && | |
+ z == devC_grid.num[2]-1) { | |
const unsigned int cellidx = idx(x,y,z); | |
t@@ -1524,13 +1531,13 @@ __global__ void setUpperPressureNS( | |
// Find the gradient in a cell in a homogeneous, cubic 3D scalar field using | |
// finite central differences | |
__device__ Float3 gradient( | |
- const Float* dev_scalarfield, | |
- const unsigned int x, | |
- const unsigned int y, | |
- const unsigned int z, | |
- const Float dx, | |
- const Float dy, | |
- const Float dz) | |
+ const Float* __restrict__ dev_scalarfield, | |
+ const unsigned int x, | |
+ const unsigned int y, | |
+ const unsigned int z, | |
+ const Float dx, | |
+ const Float dy, | |
+ const Float dz) | |
{ | |
// Read 6 neighbor cells | |
__syncthreads(); | |
t@@ -1544,26 +1551,26 @@ __device__ Float3 gradient( | |
//__syncthreads(); | |
//if (p != 0.0) | |
- //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p); | |
+ //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p); | |
// Calculate central-difference gradients | |
return MAKE_FLOAT3( | |
- (xp - xn)/(2.0*dx), | |
- (yp - yn)/(2.0*dy), | |
- (zp - zn)/(2.0*dz)); | |
+ (xp - xn)/(2.0*dx), | |
+ (yp - yn)/(2.0*dy), | |
+ (zp - zn)/(2.0*dz)); | |
} | |
// Find the divergence in a cell in a homogeneous, cubic, 3D vector field | |
__device__ Float divergence( | |
- const Float* dev_vectorfield_x, | |
- const Float* dev_vectorfield_y, | |
- const Float* dev_vectorfield_z, | |
- const unsigned int x, | |
- const unsigned int y, | |
- const unsigned int z, | |
- const Float dx, | |
- const Float dy, | |
- const Float dz) | |
+ const Float* __restrict__ dev_vectorfield_x, | |
+ const Float* __restrict__ dev_vectorfield_y, | |
+ const Float* __restrict__ dev_vectorfield_z, | |
+ const unsigned int x, | |
+ const unsigned int y, | |
+ const unsigned int z, | |
+ const Float dx, | |
+ const Float dy, | |
+ const Float dz) | |
{ | |
// Read 6 cell-face values | |
__syncthreads(); | |
t@@ -1583,13 +1590,13 @@ __device__ Float divergence( | |
// Find the divergence of a tensor field | |
__device__ Float3 divergence_tensor( | |
- Float* dev_tensorfield, | |
- const unsigned int x, | |
- const unsigned int y, | |
- const unsigned int z, | |
- const Float dx, | |
- const Float dy, | |
- const Float dz) | |
+ const Float* __restrict__ dev_tensorfield, | |
+ const unsigned int x, | |
+ const unsigned int y, | |
+ const unsigned int z, | |
+ const Float dx, | |
+ const Float dy, | |
+ const Float dz) | |
{ | |
__syncthreads(); | |
t@@ -1638,18 +1645,18 @@ __device__ Float3 divergence_tensor( | |
// Calculate div(phi*tau) | |
const Float3 div_tensor = MAKE_FLOAT3( | |
- // x | |
- (t_xx_xp - t_xx_xn)/dx + | |
- (t_xy_yp - t_xy_yn)/dy + | |
- (t_xz_zp - t_xz_zn)/dz, | |
- // y | |
- (t_xy_xp - t_xy_xn)/dx + | |
- (t_yy_yp - t_yy_yn)/dy + | |
- (t_yz_zp - t_yz_zn)/dz, | |
- // z | |
- (t_xz_xp - t_xz_xn)/dx + | |
- (t_yz_yp - t_yz_yn)/dy + | |
- (t_zz_zp - t_zz_zn)/dz); | |
+ // x | |
+ (t_xx_xp - t_xx_xn)/dx + | |
+ (t_xy_yp - t_xy_yn)/dy + | |
+ (t_xz_zp - t_xz_zn)/dz, | |
+ // y | |
+ (t_xy_xp - t_xy_xn)/dx + | |
+ (t_yy_yp - t_yy_yn)/dy + | |
+ (t_yz_zp - t_yz_zn)/dz, | |
+ // z | |
+ (t_xz_xp - t_xz_xn)/dx + | |
+ (t_yz_yp - t_yz_yn)/dy + | |
+ (t_zz_zp - t_zz_zn)/dz); | |
#ifdef CHECK_NS_FINITE | |
(void)checkFiniteFloat3("div_tensor", x, y, z, div_tensor); | |
t@@ -1661,8 +1668,8 @@ __device__ Float3 divergence_tensor( | |
// Find the spatial gradient in e.g. pressures per cell | |
// using first order central differences | |
__global__ void findNSgradientsDev( | |
- Float* dev_scalarfield, // in | |
- Float3* dev_vectorfield) // out | |
+ const Float* __restrict__ dev_scalarfield, // in | |
+ Float3* __restrict__ dev_vectorfield) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -1699,8 +1706,8 @@ __global__ void findNSgradientsDev( | |
// Find the outer product of v v | |
__global__ void findvvOuterProdNS( | |
- Float3* dev_ns_v, // in | |
- Float* dev_ns_v_prod) // out | |
+ const Float3* __restrict__ dev_ns_v, // in | |
+ Float* __restrict__ dev_ns_v_prod) // out | |
{ | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y; | |
t@@ -1750,8 +1757,8 @@ __global__ void findvvOuterProdNS( | |
// Find the fluid stress tensor. It is symmetrical, and can thus be saved in 6 | |
// values in 3D. | |
__global__ void findNSstressTensor( | |
- Float3* dev_ns_v, // in | |
- Float* dev_ns_tau) // out | |
+ const Float3* __restrict__ dev_ns_v, // in | |
+ Float* __restrict__ dev_ns_tau) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -1809,15 +1816,15 @@ __global__ void findNSstressTensor( | |
devC_params.mu*((zp.y - zn.y)/(2.0*dz) + (yp.z - yn.z)/(2.0*dy)); | |
/* | |
- if (x == 0 && y == 0 && z == 0) | |
- printf("mu = %f\n", mu); | |
- if (tau_xz > 1.0e-6) | |
- printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx); | |
- if (tau_yz > 1.0e-6) | |
- printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy); | |
- if (tau_zz > 1.0e-6) | |
- printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz); | |
- */ | |
+ if (x == 0 && y == 0 && z == 0) | |
+ printf("mu = %f\n", mu); | |
+ if (tau_xz > 1.0e-6) | |
+ printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx); | |
+ if (tau_yz > 1.0e-6) | |
+ printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy); | |
+ if (tau_zz > 1.0e-6) | |
+ printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz); | |
+ */ | |
// Store values in global memory | |
__syncthreads(); | |
t@@ -1842,9 +1849,9 @@ __global__ void findNSstressTensor( | |
// Find the divergence of phi*v*v | |
__global__ void findNSdivphiviv( | |
- Float* dev_ns_phi, // in | |
- Float3* dev_ns_v, // in | |
- Float3* dev_ns_div_phi_vi_v) // out | |
+ const Float* __restrict__ dev_ns_phi, // in | |
+ const Float3* __restrict__ dev_ns_v, // in | |
+ Float3* __restrict__ dev_ns_div_phi_vi_v) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -1888,58 +1895,58 @@ __global__ void findNSdivphiviv( | |
// Calculate upwind coefficients | |
//* | |
const Float3 a = MAKE_FLOAT3( | |
- copysign(1.0, v.x), | |
- copysign(1.0, v.y), | |
- copysign(1.0, v.z)); | |
+ copysign(1.0, v.x), | |
+ copysign(1.0, v.y), | |
+ copysign(1.0, v.z)); | |
// Calculate the divergence based on the upwind differences (Griebel et | |
// al. 1998, eq. 3.9) | |
const Float3 div_uw = MAKE_FLOAT3( | |
- // x | |
- ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) + | |
- (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) + | |
+ // x | |
+ ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) + | |
+ (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) + | |
- ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) + | |
- (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) + | |
+ ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) + | |
+ (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) + | |
- ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) + | |
- (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz), | |
+ ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) + | |
+ (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz), | |
- // y | |
- ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) + | |
- (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) + | |
+ // y | |
+ ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) + | |
+ (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) + | |
- ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) + | |
- (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) + | |
+ ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) + | |
+ (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) + | |
- ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) + | |
- (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz), | |
+ ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) + | |
+ (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz), | |
- // z | |
- ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) + | |
- (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) + | |
+ // z | |
+ ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) + | |
+ (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) + | |
- ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) + | |
- (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) + | |
+ ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) + | |
+ (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) + | |
- ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) + | |
- (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz)); | |
+ ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) + | |
+ (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz)); | |
// Calculate the divergence based on the central-difference gradients | |
const Float3 div_cd = MAKE_FLOAT3( | |
- // x | |
- (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) + | |
- (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) + | |
- (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz), | |
- // y | |
- (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) + | |
- (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) + | |
- (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz), | |
- // z | |
- (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) + | |
- (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) + | |
- (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz)); | |
+ // x | |
+ (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) + | |
+ (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) + | |
+ (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz), | |
+ // y | |
+ (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) + | |
+ (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) + | |
+ (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz), | |
+ // z | |
+ (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) + | |
+ (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) + | |
+ (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz)); | |
// Weighting parameter | |
const Float tau = 0.5; | |
t@@ -1951,26 +1958,26 @@ __global__ void findNSdivphiviv( | |
/* | |
// Calculate the divergence: div(phi*v_i*v) | |
const Float3 div_phi_vi_v = MAKE_FLOAT3( | |
- // x | |
- (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) + | |
- (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) + | |
- (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz), | |
- // y | |
- (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) + | |
- (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) + | |
- (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz), | |
- // z | |
- (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) + | |
- (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) + | |
- (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz)); | |
- // */ | |
+ // x | |
+ (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) + | |
+ (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) + | |
+ (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz), | |
+ // y | |
+ (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) + | |
+ (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) + | |
+ (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz), | |
+ // z | |
+ (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) + | |
+ (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) + | |
+ (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz)); | |
+ // */ | |
// Write divergence | |
__syncthreads(); | |
dev_ns_div_phi_vi_v[cellidx] = div_phi_vi_v; | |
//printf("div(phi*v*v) [%d,%d,%d] = %f, %f, %f\n", x,y,z, | |
- //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z); | |
+ //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z); | |
#ifdef CHECK_NS_FINITE | |
(void)checkFiniteFloat3("div_phi_vi_v", x, y, z, div_phi_vi_v); | |
t@@ -1979,8 +1986,8 @@ __global__ void findNSdivphiviv( | |
} | |
__global__ void findNSdivtau( | |
- Float* dev_ns_tau, // in | |
- Float3* dev_ns_div_tau) // out | |
+ const Float* __restrict__ dev_ns_tau, // in | |
+ Float3* __restrict__ dev_ns_div_tau) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2015,9 +2022,9 @@ __global__ void findNSdivtau( | |
// Find the divergence of phi*tau | |
__global__ void findNSdivphitau( | |
- Float* dev_ns_phi, // in | |
- Float* dev_ns_tau, // in | |
- Float3* dev_ns_div_phi_tau) // out | |
+ const Float* __restrict__ dev_ns_phi, // in | |
+ const Float* __restrict__ dev_ns_tau, // in | |
+ Float3* __restrict__ dev_ns_div_phi_tau) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2094,18 +2101,18 @@ __global__ void findNSdivphitau( | |
// Calculate div(phi*tau) | |
const Float3 div_phi_tau = MAKE_FLOAT3( | |
- // x | |
- (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx + | |
- (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy + | |
- (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz, | |
- // y | |
- (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx + | |
- (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy + | |
- (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz, | |
- // z | |
- (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx + | |
- (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy + | |
- (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz); | |
+ // x | |
+ (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx + | |
+ (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy + | |
+ (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz, | |
+ // y | |
+ (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx + | |
+ (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy + | |
+ (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz, | |
+ // z | |
+ (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx + | |
+ (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy + | |
+ (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz); | |
// Write divergence | |
__syncthreads(); | |
t@@ -2120,9 +2127,9 @@ __global__ void findNSdivphitau( | |
// Find the divergence of phi v v | |
// Unused | |
__global__ void findNSdivphivv( | |
- Float* dev_ns_v_prod, // in | |
- Float* dev_ns_phi, // in | |
- Float3* dev_ns_div_phi_v_v) // out | |
+ const Float* __restrict__ dev_ns_v_prod, // in | |
+ const Float* __restrict__ dev_ns_phi, // in | |
+ Float3* __restrict__ dev_ns_div_phi_v_v) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2174,24 +2181,24 @@ __global__ void findNSdivphivv( | |
// The symmetry described in findvvOuterProdNS is used | |
__syncthreads(); | |
const Float3 div = MAKE_FLOAT3( | |
- ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp | |
- - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) + | |
- (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp | |
- - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) + | |
- (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp | |
- - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)), | |
- ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp | |
- - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) + | |
- (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp | |
- - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) + | |
- (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp | |
- - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)), | |
- ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp | |
- - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) + | |
- (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp | |
- - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) + | |
- (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp | |
- - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) ); | |
+ ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp | |
+ - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) + | |
+ (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp | |
+ - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) + | |
+ (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp | |
+ - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)), | |
+ ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp | |
+ - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) + | |
+ (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp | |
+ - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) + | |
+ (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp | |
+ - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)), | |
+ ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp | |
+ - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) + | |
+ (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp | |
+ - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) + | |
+ (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp | |
+ - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) ); | |
//printf("div[%d,%d,%d] = %f\t%f\t%f\n", x, y, z, div.x, div.y, div.z); | |
t@@ -2209,25 +2216,25 @@ __global__ void findNSdivphivv( | |
// Find predicted fluid velocity | |
// Launch per face. | |
__global__ void findPredNSvelocities( | |
- Float* dev_ns_p, // in | |
- Float* dev_ns_v_x, // in | |
- Float* dev_ns_v_y, // in | |
- Float* dev_ns_v_z, // in | |
- Float* dev_ns_phi, // in | |
- Float* dev_ns_dphi, // in | |
- Float* dev_ns_div_tau_x, // in | |
- Float* dev_ns_div_tau_y, // in | |
- Float* dev_ns_div_tau_z, // in | |
- Float3* dev_ns_div_phi_vi_v, // in | |
- int bc_bot, // in | |
- int bc_top, // in | |
- Float beta, // in | |
- Float3* dev_ns_F_pf, // in | |
- unsigned int ndem, // in | |
- Float c_grad_p, // in | |
- Float* dev_ns_v_p_x, // out | |
- Float* dev_ns_v_p_y, // out | |
- Float* dev_ns_v_p_z) // out | |
+ const Float* __restrict__ dev_ns_p, // in | |
+ const Float* __restrict__ dev_ns_v_x, // in | |
+ const Float* __restrict__ dev_ns_v_y, // in | |
+ const Float* __restrict__ dev_ns_v_z, // in | |
+ const Float* __restrict__ dev_ns_phi, // in | |
+ const Float* __restrict__ dev_ns_dphi, // in | |
+ const Float* __restrict__ dev_ns_div_tau_x, // in | |
+ const Float* __restrict__ dev_ns_div_tau_y, // in | |
+ const Float* __restrict__ dev_ns_div_tau_z, // in | |
+ const Float3* __restrict__ dev_ns_div_phi_vi_v, // in | |
+ const int bc_bot, // in | |
+ const int bc_top, // in | |
+ const Float beta, // in | |
+ const Float3* __restrict__ dev_ns_F_pf, // in | |
+ const unsigned int ndem, // in | |
+ const Float __restrict__ c_grad_p, // in | |
+ Float* __restrict__ dev_ns_v_p_x, // out | |
+ Float* __restrict__ dev_ns_v_p_y, // out | |
+ Float* __restrict__ dev_ns_v_p_z) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2255,9 +2262,9 @@ __global__ void findPredNSvelocities( | |
// Values that are needed for calculating the predicted velocity | |
__syncthreads(); | |
const Float3 v = MAKE_FLOAT3( | |
- dev_ns_v_x[fidx], | |
- dev_ns_v_y[fidx], | |
- dev_ns_v_z[fidx]); | |
+ dev_ns_v_x[fidx], | |
+ dev_ns_v_y[fidx], | |
+ dev_ns_v_z[fidx]); | |
Float3 div_tau = MAKE_FLOAT3(0.0, 0.0, 0.0); | |
if (devC_params.mu > 0.0) { | |
t@@ -2285,13 +2292,13 @@ __global__ void findPredNSvelocities( | |
// component-wise average values | |
const Float3 phi = MAKE_FLOAT3( | |
- amean(phi_c, phi_xn), | |
- amean(phi_c, phi_yn), | |
- amean(phi_c, phi_zn)); | |
+ amean(phi_c, phi_xn), | |
+ amean(phi_c, phi_yn), | |
+ amean(phi_c, phi_zn)); | |
const Float3 dphi = MAKE_FLOAT3( | |
- amean(dphi_c, dphi_xn), | |
- amean(dphi_c, dphi_yn), | |
- amean(dphi_c, dphi_zn)); | |
+ amean(dphi_c, dphi_xn), | |
+ amean(dphi_c, dphi_yn), | |
+ amean(dphi_c, dphi_zn)); | |
// The particle-fluid interaction force should only be incoorporated if | |
// there is a fluid viscosity | |
t@@ -2308,9 +2315,9 @@ __global__ void findPredNSvelocities( | |
f_i_zn = MAKE_FLOAT3(0.0, 0.0, 0.0); | |
} | |
const Float3 f_i = MAKE_FLOAT3( | |
- amean(f_i_c.x, f_i_xn.x), | |
- amean(f_i_c.y, f_i_yn.y), | |
- amean(f_i_c.z, f_i_zn.z)); | |
+ amean(f_i_c.x, f_i_xn.x), | |
+ amean(f_i_c.y, f_i_yn.y), | |
+ amean(f_i_c.z, f_i_zn.z)); | |
const Float dt = ndem*devC_dt; | |
const Float rho = devC_params.rho_f; | |
t@@ -2326,9 +2333,9 @@ __global__ void findPredNSvelocities( | |
const Float p_yn = dev_ns_p[idx(x,y-1,z)]; | |
const Float p_zn = dev_ns_p[idx(x,y,z-1)]; | |
const Float3 grad_p = MAKE_FLOAT3( | |
- (p - p_xn)/dx, | |
- (p - p_yn)/dy, | |
- (p - p_zn)/dz) * c_grad_p; | |
+ (p - p_xn)/dx, | |
+ (p - p_yn)/dy, | |
+ (p - p_zn)/dz) * c_grad_p; | |
#ifdef SET_1 | |
pressure_term = -beta*dt/(rho*phi)*grad_p; | |
#endif | |
t@@ -2338,16 +2345,16 @@ __global__ void findPredNSvelocities( | |
} | |
const Float3 div_phi_vi_v = MAKE_FLOAT3( | |
- amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x), | |
- amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y), | |
- amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z)); | |
+ amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x), | |
+ amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y), | |
+ amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z)); | |
// Determine the predicted velocity | |
#ifdef SET_1 | |
const Float3 interaction_term = -dt/(rho*phi)*f_i; | |
const Float3 diffusion_term = dt/(rho*phi)*div_tau; | |
const Float3 gravity_term = MAKE_FLOAT3( | |
- devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt; | |
+ devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt; | |
const Float3 porosity_term = -1.0*v*dphi/phi; | |
const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi; | |
#endif | |
t@@ -2355,7 +2362,7 @@ __global__ void findPredNSvelocities( | |
const Float3 interaction_term = -dt/(rho*phi)*f_i; | |
const Float3 diffusion_term = dt/rho*div_tau; | |
const Float3 gravity_term = MAKE_FLOAT3( | |
- devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt; | |
+ devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt; | |
const Float3 porosity_term = -1.0*v*dphi/phi; | |
const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi; | |
#endif | |
t@@ -2376,31 +2383,31 @@ __global__ void findPredNSvelocities( | |
// No slip | |
/*if ((z == 0 && bc_bot == 2) || (z == nz-1 && bc_top == 2)) { | |
- v_p.x = 0.0; | |
- v_p.y = 0.0; | |
- v_p.z = 0.0; | |
- }*/ | |
+ v_p.x = 0.0; | |
+ v_p.y = 0.0; | |
+ v_p.z = 0.0; | |
+ }*/ | |
#ifdef REPORT_V_P_COMPONENTS | |
// Report velocity components to stdout for debugging | |
if (z==0) | |
- printf("\n[%d,%d,%d]" | |
- "\tv_p = %+e %+e %+e\n" | |
- "\tpres = %+e %+e %+e\n" | |
- "\tinteract = %+e %+e %+e\n" | |
- "\tdiff = %+e %+e %+e\n" | |
- "\tgrav = %+e %+e %+e\n" | |
- "\tporos = %+e %+e %+e\n" | |
- "\tadv = %+e %+e %+e\n", | |
- x, y, z, | |
- v_p.x, v_p.y, v_p.z, | |
- pressure_term.x, pressure_term.y, pressure_term.z, | |
- interaction_term.x, interaction_term.y, interaction_term.z, | |
- diffusion_term.x, diffusion_term.y, diffusion_term.z, | |
- gravity_term.x, gravity_term.y, gravity_term.z, | |
- porosity_term.x, porosity_term.y, porosity_term.z, | |
- advection_term.x, advection_term.y, advection_term.z); | |
+ printf("\n[%d,%d,%d]" | |
+ "\tv_p = %+e %+e %+e\n" | |
+ "\tpres = %+e %+e %+e\n" | |
+ "\tinteract = %+e %+e %+e\n" | |
+ "\tdiff = %+e %+e %+e\n" | |
+ "\tgrav = %+e %+e %+e\n" | |
+ "\tporos = %+e %+e %+e\n" | |
+ "\tadv = %+e %+e %+e\n", | |
+ x, y, z, | |
+ v_p.x, v_p.y, v_p.z, | |
+ pressure_term.x, pressure_term.y, pressure_term.z, | |
+ interaction_term.x, interaction_term.y, interaction_term.z, | |
+ diffusion_term.x, diffusion_term.y, diffusion_term.z, | |
+ gravity_term.x, gravity_term.y, gravity_term.z, | |
+ porosity_term.x, porosity_term.y, porosity_term.z, | |
+ advection_term.x, advection_term.y, advection_term.z); | |
#endif | |
// Save the predicted velocity | |
t@@ -2421,18 +2428,18 @@ __global__ void findPredNSvelocities( | |
// At each iteration, the value of the forcing function is found as: | |
// f = f1 - f2 dot grad(epsilon) | |
__global__ void findNSforcing( | |
- Float* dev_ns_epsilon, // in | |
- Float* dev_ns_phi, // in | |
- Float* dev_ns_dphi, // in | |
- Float3* dev_ns_v_p, // in | |
- Float* dev_ns_v_p_x, // in | |
- Float* dev_ns_v_p_y, // in | |
- Float* dev_ns_v_p_z, // in | |
- unsigned int nijac, // in | |
- unsigned int ndem, // in | |
- Float* dev_ns_f1, // out | |
- Float3* dev_ns_f2, // out | |
- Float* dev_ns_f) // out | |
+ const Float* __restrict__ dev_ns_epsilon, // in | |
+ const Float* __restrict__ dev_ns_phi, // in | |
+ const Float* __restrict__ dev_ns_dphi, // in | |
+ const Float3* __restrict__ dev_ns_v_p, // in | |
+ const Float* __restrict__ dev_ns_v_p_x, // in | |
+ const Float* __restrict__ dev_ns_v_p_y, // in | |
+ const Float* __restrict__ dev_ns_v_p_z, // in | |
+ const unsigned int nijac, // in | |
+ const unsigned int ndem, // in | |
+ Float* __restrict__ dev_ns_f1, // out | |
+ Float3* __restrict__ dev_ns_f2, // out | |
+ Float* __restrict__ dev_ns_f) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2472,7 +2479,7 @@ __global__ void findNSforcing( | |
// Calculate derivatives | |
const Float div_v_p | |
= divergence(dev_ns_v_p_x, dev_ns_v_p_y, dev_ns_v_p_z, | |
- x, y, z, dx, dy, dz); | |
+ x, y, z, dx, dy, dz); | |
const Float3 grad_phi | |
= gradient(dev_ns_phi, x, y, z, dx, dy, dz); | |
t@@ -2497,7 +2504,7 @@ __global__ void findNSforcing( | |
#ifdef REPORT_FORCING_TERMS | |
// Report values terms in the forcing function for debugging | |
printf("[%d,%d,%d]\tt1 = %f\tt2 = %f\tt4 = %f\n", | |
- x,y,z, t1, t2, t4); | |
+ x,y,z, t1, t2, t4); | |
#endif | |
// Save values | |
t@@ -2524,7 +2531,7 @@ __global__ void findNSforcing( | |
const Float t3 = -dot(f2, grad_epsilon); | |
if (z >= nz-3) | |
printf("[%d,%d,%d]\tf = %f\tf1 = %f\tt3 = %f\n", | |
- x,y,z, f, f1, t3); | |
+ x,y,z, f, f1, t3); | |
#endif | |
// Save forcing function value | |
t@@ -2542,10 +2549,10 @@ __global__ void findNSforcing( | |
// non-smoothed and smoothed values. | |
template<typename T> | |
__global__ void smoothing( | |
- T* dev_arr, | |
- const Float gamma, | |
- const unsigned int bc_bot, | |
- const unsigned int bc_top) | |
+ T* __restrict__ dev_arr, | |
+ const Float gamma, | |
+ const unsigned int bc_bot, | |
+ const unsigned int bc_top) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2602,13 +2609,13 @@ __global__ void smoothing( | |
// Perform a single Jacobi iteration | |
__global__ void jacobiIterationNS( | |
- const Float* dev_ns_epsilon, | |
- Float* dev_ns_epsilon_new, | |
- Float* dev_ns_norm, | |
- const Float* dev_ns_f, | |
- const int bc_bot, | |
- const int bc_top, | |
- const Float theta) | |
+ const Float* __restrict__ dev_ns_epsilon, | |
+ Float* __restrict__ dev_ns_epsilon_new, | |
+ Float* __restrict__ dev_ns_norm, | |
+ const Float* __restrict__ dev_ns_f, | |
+ const int bc_bot, | |
+ const int bc_top, | |
+ const Float theta) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2672,9 +2679,9 @@ __global__ void jacobiIterationNS( | |
const Float dzdz = dz*dz; | |
Float e_new | |
= (-dxdx*dydy*dzdz*f | |
- + dydy*dzdz*(e_xn + e_xp) | |
- + dxdx*dzdz*(e_yn + e_yp) | |
- + dxdx*dydy*(e_zn + e_zp)) | |
+ + dydy*dzdz*(e_xn + e_xp) | |
+ + dxdx*dzdz*(e_yn + e_yp) | |
+ + dxdx*dydy*(e_zn + e_zp)) | |
/(2.0*(dxdx*dydy + dxdx*dzdz + dydy*dzdz)); | |
// New value of epsilon in 1D update | |
t@@ -2682,7 +2689,7 @@ __global__ void jacobiIterationNS( | |
// Print values for debugging | |
/*printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\n", | |
- x,y,z, e, f, e_new);*/ | |
+ x,y,z, e, f, e_new);*/ | |
const Float res_norm = (e_new - e)*(e_new - e)/(e_new*e_new + 1.0e-16); | |
const Float e_relax = e*(1.0-theta) + e_new*theta; | |
t@@ -2697,7 +2704,7 @@ __global__ void jacobiIterationNS( | |
//(void)checkFiniteFloat("res_norm", x, y, z, res_norm); | |
if (checkFiniteFloat("res_norm", x, y, z, res_norm)) { | |
printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\tres_norm = %f\n", | |
- x,y,z, e, f, e_new, res_norm); | |
+ x,y,z, e, f, e_new, res_norm); | |
} | |
#endif | |
} | |
t@@ -2706,8 +2713,8 @@ __global__ void jacobiIterationNS( | |
// Copy all values from one array to the other | |
template<typename T> | |
__global__ void copyValues( | |
- T* dev_read, | |
- T* dev_write) | |
+ const T* __restrict__ dev_read, | |
+ T* __restrict__ dev_write) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2717,10 +2724,10 @@ __global__ void copyValues( | |
// Internal nodes only | |
if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) { | |
- // Internal nodes + ghost nodes | |
- /*if (x <= devC_grid.num[0]+1 && | |
- y <= devC_grid.num[1]+1 && | |
- z <= devC_grid.num[2]+1) {*/ | |
+ // Internal nodes + ghost nodes | |
+ /*if (x <= devC_grid.num[0]+1 && | |
+ y <= devC_grid.num[1]+1 && | |
+ z <= devC_grid.num[2]+1) {*/ | |
const unsigned int cellidx = idx(x,y,z); // without ghost nodes | |
//const unsigned int cellidx = idx(x-1,y-1,z-1); // with ghost nodes | |
t@@ -2730,7 +2737,7 @@ __global__ void copyValues( | |
const T val = dev_read[cellidx]; | |
//if (z == devC_grid.num[2]-1) | |
- //printf("[%d,%d,%d] = %f\n", x, y, z, val); | |
+ //printf("[%d,%d,%d] = %f\n", x, y, z, val); | |
// Write | |
__syncthreads(); | |
t@@ -2740,11 +2747,11 @@ __global__ void copyValues( | |
// Find and store the normalized residuals | |
__global__ void findNormalizedResiduals( | |
- Float* dev_ns_epsilon_old, | |
- Float* dev_ns_epsilon, | |
- Float* dev_ns_norm, | |
- const unsigned int bc_bot, | |
- const unsigned int bc_top) | |
+ const Float* __restrict__ dev_ns_epsilon_old, | |
+ const Float* __restrict__ dev_ns_epsilon, | |
+ Float* __restrict__ dev_ns_norm, | |
+ const unsigned int bc_bot, | |
+ const unsigned int bc_top) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2791,9 +2798,9 @@ __global__ void findNormalizedResiduals( | |
// Computes the new velocity and pressure using the corrector | |
__global__ void updateNSpressure( | |
- Float* dev_ns_epsilon, // in | |
- Float beta, // in | |
- Float* dev_ns_p) // out | |
+ const Float* __restrict__ dev_ns_epsilon, // in | |
+ const Float __restrict__ beta, // in | |
+ Float* __restrict__ dev_ns_p) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2825,19 +2832,19 @@ __global__ void updateNSpressure( | |
} | |
__global__ void updateNSvelocity( | |
- Float* dev_ns_v_p_x, // in | |
- Float* dev_ns_v_p_y, // in | |
- Float* dev_ns_v_p_z, // in | |
- Float* dev_ns_phi, // in | |
- Float* dev_ns_epsilon, // in | |
- Float beta, // in | |
- int bc_bot, // in | |
- int bc_top, // in | |
- unsigned int ndem, // in | |
- Float c_grad_p, // in | |
- Float* dev_ns_v_x, // out | |
- Float* dev_ns_v_y, // out | |
- Float* dev_ns_v_z) // out | |
+ const Float* __restrict__ dev_ns_v_p_x, // in | |
+ const Float* __restrict__ dev_ns_v_p_y, // in | |
+ const Float* __restrict__ dev_ns_v_p_z, // in | |
+ const Float* __restrict__ dev_ns_phi, // in | |
+ const Float* __restrict__ dev_ns_epsilon, // in | |
+ const Float beta, // in | |
+ const int bc_bot, // in | |
+ const int bc_top, // in | |
+ const unsigned int ndem, // in | |
+ const Float c_grad_p, // in | |
+ Float* __restrict__ dev_ns_v_x, // out | |
+ Float* __restrict__ dev_ns_v_y, // out | |
+ Float* __restrict__ dev_ns_v_z) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -2879,15 +2886,15 @@ __global__ void updateNSvelocity( | |
const Float phi_zn = dev_ns_phi[idx(x,y,z-1)]; | |
const Float3 phi = MAKE_FLOAT3( | |
- amean(phi_c, phi_xn), | |
- amean(phi_c, phi_yn), | |
- amean(phi_c, phi_zn)); | |
+ amean(phi_c, phi_xn), | |
+ amean(phi_c, phi_yn), | |
+ amean(phi_c, phi_zn)); | |
// Find corrector gradient | |
const Float3 grad_epsilon = MAKE_FLOAT3( | |
- (epsilon_c - epsilon_xn)/dx, | |
- (epsilon_c - epsilon_yn)/dy, | |
- (epsilon_c - epsilon_zn)/dz) * c_grad_p; | |
+ (epsilon_c - epsilon_xn)/dx, | |
+ (epsilon_c - epsilon_yn)/dy, | |
+ (epsilon_c - epsilon_zn)/dz) * c_grad_p; | |
// Find new velocity | |
#ifdef SET_1 | |
t@@ -2900,16 +2907,16 @@ __global__ void updateNSvelocity( | |
// Print values for debugging | |
/* if (z == 0) { | |
- Float e_up = dev_ns_epsilon[idx(x,y,z+1)]; | |
- Float e_down = dev_ns_epsilon[idx(x,y,z-1)]; | |
- printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n", | |
- x,y,z, | |
- grad_epsilon.x, | |
- grad_epsilon.y, | |
- grad_epsilon.z, | |
- e_up, | |
- e_down); | |
- }*/ | |
+ Float e_up = dev_ns_epsilon[idx(x,y,z+1)]; | |
+ Float e_down = dev_ns_epsilon[idx(x,y,z-1)]; | |
+ printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n", | |
+ x,y,z, | |
+ grad_epsilon.x, | |
+ grad_epsilon.y, | |
+ grad_epsilon.z, | |
+ e_up, | |
+ e_down); | |
+ }*/ | |
if ((z == 0 && bc_bot == 1) || (z == nz-1 && bc_top == 1)) | |
v.z = 0.0; | |
t@@ -2919,11 +2926,11 @@ __global__ void updateNSvelocity( | |
// Check the advection term using the Courant-Friedrichs-Lewy condition | |
if (v.x*ndem*devC_dt/dx | |
- + v.y*ndem*devC_dt/dy | |
- + v.z*ndem*devC_dt/dz > 1.0) { | |
+ + v.y*ndem*devC_dt/dy | |
+ + v.z*ndem*devC_dt/dz > 1.0) { | |
printf("[%d,%d,%d] Warning: Advection term in fluid may be " | |
- "unstable (CFL condition), v = %f,%f,%f\n", | |
- x,y,z, v.x, v.y, v.z); | |
+ "unstable (CFL condition), v = %f,%f,%f\n", | |
+ x,y,z, v.x, v.y, v.z); | |
} | |
// Write new values | |
t@@ -2941,12 +2948,12 @@ __global__ void updateNSvelocity( | |
// Find the average particle diameter and velocity for each CFD cell. | |
// UNUSED: The values are estimated in the porosity estimation function instead | |
__global__ void findAvgParticleVelocityDiameter( | |
- unsigned int* dev_cellStart, // in | |
- unsigned int* dev_cellEnd, // in | |
- Float4* dev_vel_sorted, // in | |
- Float4* dev_x_sorted, // in | |
- Float3* dev_ns_vp_avg, // out | |
- Float* dev_ns_d_avg) // out | |
+ const unsigned int* __restrict__ dev_cellStart, // in | |
+ const unsigned int* __restrict__ dev_cellEnd, // in | |
+ const Float4* __restrict__ dev_vel_sorted, // in | |
+ const Float4* __restrict__ dev_x_sorted, // in | |
+ Float3* dev_ns_vp_avg, // out | |
+ Float* dev_ns_d_avg) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -3023,18 +3030,18 @@ __device__ Float dragCoefficient(Float re) | |
// Find particle-fluid interaction force as outlined by Zhou et al. 2010, and | |
// originally by Gidaspow 1992. | |
__global__ void findInteractionForce( | |
- Float4* dev_x, // in | |
- Float4* dev_vel, // in | |
- Float* dev_ns_phi, // in | |
- Float* dev_ns_p, // in | |
- Float* dev_ns_v_x, // in | |
- Float* dev_ns_v_y, // in | |
- Float* dev_ns_v_z, // in | |
- Float* dev_ns_div_tau_x,// in | |
- Float* dev_ns_div_tau_y,// in | |
- Float* dev_ns_div_tau_z,// in | |
- Float3* dev_ns_f_pf, // out | |
- Float4* dev_force) // out | |
+ const Float4* __restrict__ dev_x, // in | |
+ const Float4* __restrict__ dev_vel, // in | |
+ const Float* __restrict__ dev_ns_phi, // in | |
+ const Float* __restrict__ dev_ns_p, // in | |
+ const Float* __restrict__ dev_ns_v_x, // in | |
+ const Float* __restrict__ dev_ns_v_y, // in | |
+ const Float* __restrict__ dev_ns_v_z, // in | |
+ const Float* __restrict__ dev_ns_div_tau_x,// in | |
+ const Float* __restrict__ dev_ns_div_tau_y,// in | |
+ const Float* __restrict__ dev_ns_div_tau_z,// in | |
+ Float3* __restrict__ dev_ns_f_pf, // out | |
+ Float4* __restrict__ dev_force) // out | |
{ | |
unsigned int i = threadIdx.x + blockIdx.x*blockDim.x; // Particle index | |
t@@ -3080,14 +3087,14 @@ __global__ void findInteractionForce( | |
const Float div_tau_z_p = dev_ns_div_tau_z[vidx(i_x,i_y,i_z+1)]; | |
const Float3 v_f = MAKE_FLOAT3( | |
- amean(v_x, v_x_p), | |
- amean(v_y, v_y_p), | |
- amean(v_z, v_z_p)); | |
+ amean(v_x, v_x_p), | |
+ amean(v_y, v_y_p), | |
+ amean(v_z, v_z_p)); | |
const Float3 div_tau = MAKE_FLOAT3( | |
- amean(div_tau_x, div_tau_x_p), | |
- amean(div_tau_y, div_tau_y_p), | |
- amean(div_tau_z, div_tau_z_p)); | |
+ amean(div_tau_x, div_tau_x_p), | |
+ amean(div_tau_y, div_tau_y_p), | |
+ amean(div_tau_z, div_tau_z_p)); | |
const Float3 v_rel = v_f - v_p; | |
const Float v_rel_length = length(v_rel); | |
t@@ -3119,15 +3126,15 @@ __global__ void findInteractionForce( | |
#ifdef CHECK_NS_FINITE | |
/* | |
- printf("\nfindInteractionForce %d [%d,%d,%d]\n" | |
- "\tV_p = %f Re=%f Cd=%f chi=%f\n" | |
- "\tf_d = %+e %+e %+e\n" | |
- "\tf_p = %+e %+e %+e\n" | |
- "\tf_v = %+e %+e %+e\n", | |
- i, i_x, i_y, i_z, V_p, Re, Cd, chi, | |
- f_d.x, f_d.y, f_d.z, | |
- f_p.x, f_p.y, f_p.z, | |
- f_v.x, f_v.y, f_v.z);// */ | |
+ printf("\nfindInteractionForce %d [%d,%d,%d]\n" | |
+ "\tV_p = %f Re=%f Cd=%f chi=%f\n" | |
+ "\tf_d = %+e %+e %+e\n" | |
+ "\tf_p = %+e %+e %+e\n" | |
+ "\tf_v = %+e %+e %+e\n", | |
+ i, i_x, i_y, i_z, V_p, Re, Cd, chi, | |
+ f_d.x, f_d.y, f_d.z, | |
+ f_p.x, f_p.y, f_p.z, | |
+ f_v.x, f_v.y, f_v.z);// */ | |
checkFiniteFloat3("f_d", i_x, i_y, i_z, f_d); | |
checkFiniteFloat3("f_p", i_x, i_y, i_z, f_p); | |
checkFiniteFloat3("f_v", i_x, i_y, i_z, f_v); | |
t@@ -3149,11 +3156,11 @@ __global__ void findInteractionForce( | |
// Apply the fluid-particle interaction force to the fluid cell based on the | |
// interaction forces from each particle in it | |
__global__ void applyInteractionForceToFluid( | |
- unsigned int* dev_gridParticleIndex, // in | |
- unsigned int* dev_cellStart, // in | |
- unsigned int* dev_cellEnd, // in | |
- Float3* dev_ns_f_pf, // in | |
- Float3* dev_ns_F_pf) // out | |
+ const unsigned int* __restrict__ dev_gridParticleIndex, // in | |
+ const unsigned int* __restrict__ dev_cellStart, // in | |
+ const unsigned int* __restrict__ dev_cellEnd, // in | |
+ const Float3* __restrict__ dev_ns_f_pf, // in | |
+ Float3* __restrict__ dev_ns_F_pf) // out | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -3212,10 +3219,10 @@ __global__ void applyInteractionForceToFluid( | |
// Launch per cell face node. | |
// Cell center ghost nodes must be set prior to call. | |
__global__ void interpolateCenterToFace( | |
- Float3* dev_in, | |
- Float* dev_out_x, | |
- Float* dev_out_y, | |
- Float* dev_out_z) | |
+ const Float3* __restrict__ dev_in, | |
+ Float* __restrict__ dev_out_x, | |
+ Float* __restrict__ dev_out_y, | |
+ Float* __restrict__ dev_out_z) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -3224,8 +3231,8 @@ __global__ void interpolateCenterToFace( | |
// Check that we are not outside the fluid grid | |
if (x <= devC_grid.num[0] | |
- && y <= devC_grid.num[1] | |
- && z <= devC_grid.num[2]) { | |
+ && y <= devC_grid.num[1] | |
+ && z <= devC_grid.num[2]) { | |
const unsigned int faceidx = vidx(x,y,z); | |
t@@ -3249,10 +3256,10 @@ __global__ void interpolateCenterToFace( | |
// Launch per cell center node | |
__global__ void interpolateFaceToCenter( | |
- Float* dev_in_x, | |
- Float* dev_in_y, | |
- Float* dev_in_z, | |
- Float3* dev_out) | |
+ Float* dev_in_x, | |
+ Float* dev_in_y, | |
+ Float* dev_in_z, | |
+ Float3* dev_out) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -3277,9 +3284,9 @@ __global__ void interpolateFaceToCenter( | |
const Float z_p = dev_in_z[vidx(x,y,z+1)]; | |
const Float3 val = MAKE_FLOAT3( | |
- amean(x_n, x_p), | |
- amean(y_n, y_p), | |
- amean(z_n, z_p)); | |
+ amean(x_n, x_p), | |
+ amean(y_n, y_p), | |
+ amean(z_n, z_p)); | |
__syncthreads(); | |
//printf("[%d,%d,%d] = %f, %f, %f\n", x,y,z, val.x, val.y, val.z); | |
t@@ -3292,12 +3299,12 @@ __global__ void interpolateFaceToCenter( | |
// Warning: The grid-corner values will be invalid, along with the non-normal | |
// components of the ghost nodes | |
__global__ void findFaceDivTau( | |
- Float* dev_ns_v_x, | |
- Float* dev_ns_v_y, | |
- Float* dev_ns_v_z, | |
- Float* dev_ns_div_tau_x, | |
- Float* dev_ns_div_tau_y, | |
- Float* dev_ns_div_tau_z) | |
+ const Float* __restrict__ dev_ns_v_x, | |
+ const Float* __restrict__ dev_ns_v_y, | |
+ const Float* __restrict__ dev_ns_v_z, | |
+ Float* __restrict__ dev_ns_div_tau_x, | |
+ Float* __restrict__ dev_ns_div_tau_y, | |
+ Float* __restrict__ dev_ns_div_tau_z) | |
{ | |
// 3D thread index | |
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; | |
t@@ -3347,19 +3354,19 @@ __global__ void findFaceDivTau( | |
const Float div_tau_x = | |
devC_params.mu*( | |
- (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) + | |
- (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) + | |
- (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz)); | |
+ (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) + | |
+ (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) + | |
+ (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz)); | |
const Float div_tau_y = | |
devC_params.mu*( | |
- (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) + | |
- (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) + | |
- (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz)); | |
+ (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) + | |
+ (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) + | |
+ (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz)); | |
const Float div_tau_z = | |
devC_params.mu*( | |
- (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) + | |
- (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) + | |
- (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz)); | |
+ (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) + | |
+ (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) + | |
+ (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz)); | |
__syncthreads(); | |
//printf("div_tau [%d,%d,%d] = %f, %f, %f\n", x,y,z, | |
diff --git a/src/raytracer.cuh b/src/raytracer.cuh | |
t@@ -6,12 +6,14 @@ | |
//#include "cuPrintf.cu" | |
// Template for discarding the last term in four-component vector structs | |
-__device__ __inline__ float3 f4_to_f3(float4 in) { | |
+__device__ __inline__ float3 f4_to_f3(const float4 in) { | |
return make_float3(in.x, in.y, in.z); | |
} | |
// Kernel for initializing image data | |
-__global__ void imageInit(unsigned char* dev_img, unsigned int pixels) | |
+__global__ void imageInit( | |
+ unsigned char* __restrict__ dev_img, | |
+ const unsigned int pixels) | |
{ | |
// Compute pixel position from threadIdx/blockIdx | |
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
t@@ -24,11 +26,12 @@ __global__ void imageInit(unsigned char* dev_img, unsigned… | |
} | |
// Calculate ray origins and directions | |
-__global__ void rayInitPerspective(float4* dev_ray_origo, | |
- float4* dev_ray_direction, | |
- float4 eye, | |
- unsigned int width, | |
- unsigned int height) | |
+__global__ void rayInitPerspective( | |
+ float4* __restrict__ dev_ray_origo, | |
+ float4* __restrict__ dev_ray_direction, | |
+ const float4 eye, | |
+ const unsigned int width, | |
+ const unsigned int height) | |
{ | |
// Compute pixel position from threadIdx/blockIdx | |
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
t@@ -47,15 +50,17 @@ __global__ void rayInitPerspective(float4* dev_ray_origo, | |
// Write ray origo and direction to global memory | |
dev_ray_origo[mempos] = make_float4(devC_eye, 0.0f); | |
- dev_ray_direction[mempos] = make_float4(-devC_d*devC_w + p_u*devC_u + p_v*… | |
+ dev_ray_direction[mempos] = | |
+ make_float4(-devC_d*devC_w + p_u*devC_u + p_v*devC_v, 0.0f); | |
} | |
// Check wether the pixel's viewing ray intersects with the spheres, | |
// and shade the pixel correspondingly | |
-__global__ void rayIntersectSpheres(float4* dev_ray_origo, | |
- float4* dev_ray_direction, | |
- Float4* dev_x, | |
- unsigned char* dev_img) | |
+__global__ void rayIntersectSpheres( | |
+ const float4* __restrict__ dev_ray_origo, | |
+ const float4* __restrict__ dev_ray_direction, | |
+ const Float4* __restrict__ dev_x, | |
+ unsigned char* __restrict__ dev_img) | |
{ | |
// Compute pixel position from threadIdx/blockIdx | |
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
t@@ -67,7 +72,8 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo, | |
float3 d = f4_to_f3(dev_ray_direction[mempos]); | |
//float step = length(d); | |
- // Distance, in ray steps, between object and eye initialized with a large… | |
+ // Distance, in ray steps, between object and eye initialized with | |
+ // a large value | |
float tdist = 1e10f; | |
// Surface normal at closest sphere intersection | |
t@@ -86,8 +92,6 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo, | |
float3 c = make_float3(x.x, x.y, x.z); | |
float R = x.w; | |
- //cuPrintf("particle %d at: %f, %f, %f, radius: %f\n", i, c.x, c.y, c.… | |
- | |
// Calculate the discriminant: d = B^2 - 4AC | |
float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2 | |
- 4.0f*dot(d,d) // -4*A | |
t@@ -98,8 +102,10 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo, | |
if (Delta > 0.0f) { | |
// Calculate roots, Shirley 2009 p. 77 | |
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) … | |
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d)); | |
+ float t_minus = ((dot(-d,(e-c)) | |
+ - sqrt( dot(d,(e-c))*dot(d,(e-c)) | |
+ - dot(d,d)*(dot((e-c),(e-c)) - R*R))) | |
+ / dot(d,d)); | |
// Check wether intersection is closer than previous values | |
if (fabs(t_minus) < tdist) { | |
t@@ -137,14 +143,15 @@ __global__ void rayIntersectSpheres(float4* dev_ray_orig… | |
// Check wether the pixel's viewing ray intersects with the spheres, | |
// and shade the pixel correspondingly using a colormap | |
-__global__ void rayIntersectSpheresColormap(float4* dev_ray_origo, | |
- float4* dev_ray_direction, | |
- Float4* dev_x, | |
- Float4* dev_vel, | |
- Float* dev_linarr, | |
- float max_val, | |
- float lower_cutoff, | |
- unsigned char* dev_img) | |
+__global__ void rayIntersectSpheresColormap( | |
+ const float4* __restrict__ dev_ray_origo, | |
+ const float4* __restrict__ dev_ray_direction, | |
+ const Float4* __restrict__ dev_x, | |
+ const Float4* __restrict__ dev_vel, | |
+ const Float* __restrict__ dev_linarr, | |
+ const float max_val, | |
+ const float lower_cutoff, | |
+ unsigned char* __restrict__ dev_img) | |
{ | |
// Compute pixel position from threadIdx/blockIdx | |
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
t@@ -155,7 +162,8 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ra… | |
float3 e = f4_to_f3(dev_ray_origo[mempos]); | |
float3 d = f4_to_f3(dev_ray_direction[mempos]); | |
- // Distance, in ray steps, between object and eye initialized with a large… | |
+ // Distance, in ray steps, between object and eye initialized with | |
+ // a large value | |
float tdist = 1e10f; | |
// Surface normal at closest sphere intersection | |
t@@ -181,8 +189,6 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ra… | |
- 4.0f*dot(d,d) // -4*A | |
* (dot((e-c),(e-c)) - R*R); // C | |
- | |
- | |
// If the determinant is positive, there are two solutions | |
// One where the line enters the sphere, and one where it exits | |
if (lower_cutoff > 0.0) { | |
t@@ -198,8 +204,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_r… | |
if (Delta > 0.0f && val > lower_cutoff && fixvel == 0.f) { | |
// Calculate roots, Shirley 2009 p. 77 | |
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-… | |
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d)); | |
+ float t_minus = | |
+ ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c)) | |
+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)… | |
+ / dot(d,d)); | |
// Check wether intersection is closer than previous values | |
if (fabs(t_minus) < tdist) { | |
t@@ -217,8 +225,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_r… | |
if (Delta > 0.0f) { | |
// Calculate roots, Shirley 2009 p. 77 | |
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-… | |
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d)); | |
+ float t_minus = | |
+ ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c)) | |
+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)… | |
+ / dot(d,d)); | |
// Check wether intersection is closer than previous values | |
if (fabs(t_minus) < tdist) { | |
t@@ -269,25 +279,28 @@ __global__ void rayIntersectSpheresColormap(float4* dev_… | |
// Write shading model values to pixel color channels | |
dev_img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*redv); | |
+ + k_a * I_a)*redv); | |
dev_img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*greenv); | |
+ + k_a * I_a)*greenv); | |
dev_img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*bluev); | |
+ + k_a * I_a)*bluev); | |
} | |
} | |
__host__ void DEM::cameraInit( | |
- const float3 eye, | |
- const float3 lookat, | |
- const float imgw, | |
- const float focalLength) | |
+ const float3 eye, | |
+ const float3 lookat, | |
+ const float imgw, | |
+ const float focalLength) | |
{ | |
float hw_ratio = height/width; | |
// Image dimensions in world space (l, r, b, t) | |
- float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, … | |
+ float4 imgplane = make_float4( | |
+ -0.5f*imgw, 0.5f*imgw, | |
+ -0.5f*imgw*hw_ratio, | |
+ 0.5f*imgw*hw_ratio); | |
// The view vector | |
float3 view = eye - lookat; | |
t@@ -309,15 +322,6 @@ __host__ void DEM::cameraInit( | |
if (verbose == 1) | |
std::cout << " Transfering camera values to constant memory: "; | |
- /* Reference by string removed in cuda 5.0 | |
- cudaMemcpyToSymbol("devC_u", &u, sizeof(u)); | |
- cudaMemcpyToSymbol("devC_v", &v, sizeof(v)); | |
- cudaMemcpyToSymbol("devC_w", &w, sizeof(w)); | |
- cudaMemcpyToSymbol("devC_eye", &eye, sizeof(eye)); | |
- cudaMemcpyToSymbol("devC_imgplane", &imgplane, sizeof(imgplane)); | |
- cudaMemcpyToSymbol("devC_d", &d, sizeof(d)); | |
- cudaMemcpyToSymbol("devC_light", &light, sizeof(light)); | |
- cudaMemcpyToSymbol("devC_pixels", &pixels, sizeof(pixels));*/ | |
cudaMemcpyToSymbol(devC_u, &u, sizeof(u)); | |
cudaMemcpyToSymbol(devC_v, &v, sizeof(v)); | |
cudaMemcpyToSymbol(devC_w, &w, sizeof(w)); | |
t@@ -333,7 +337,7 @@ __host__ void DEM::cameraInit( | |
} | |
// Allocate global device memory | |
-__host__ void DEM::rt_allocateGlobalDeviceMemory(void) | |
+__host__ void DEM::rt_allocateGlobalDeviceMemory() | |
{ | |
if (verbose == 1) | |
std::cout << " Allocating device memory: "; | |
t@@ -347,7 +351,7 @@ __host__ void DEM::rt_allocateGlobalDeviceMemory(void) | |
// Free dynamically allocated device memory | |
-__host__ void DEM::rt_freeGlobalDeviceMemory(void) | |
+__host__ void DEM::rt_freeGlobalDeviceMemory() | |
{ | |
if (verbose == 1) | |
std::cout << " Freeing device memory: "; | |
t@@ -360,11 +364,12 @@ __host__ void DEM::rt_freeGlobalDeviceMemory(void) | |
} | |
// Transfer image data from device to host | |
-__host__ void DEM::rt_transferFromGlobalDeviceMemory(void) | |
+__host__ void DEM::rt_transferFromGlobalDeviceMemory() | |
{ | |
if (verbose == 1) | |
std::cout << " Transfering image data: device -> host: "; | |
- cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char), cudaMemcpyD… | |
+ cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char), | |
+ cudaMemcpyDeviceToHost); | |
if (verbose == 1) | |
std::cout << "Done" << std::endl; | |
checkForCudaErrors("During rt_transferFromGlobalDeviceMemory()"); | |
t@@ -372,24 +377,13 @@ __host__ void DEM::rt_transferFromGlobalDeviceMemory(voi… | |
// Wrapper for the rt kernel | |
__host__ void DEM::render( | |
- const int method, | |
- const float maxval, | |
- const float lower_cutoff, | |
- const float focalLength, | |
- const unsigned int img_width, | |
- const unsigned int img_height) | |
- /*float4* p, unsigned int np, | |
- rgba* img, unsigned int width, unsigned int height, | |
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw, | |
- int visualize, float max_val, | |
- float* fixvel, | |
- float* xsum, | |
- float* pres, | |
- float* es_dot, | |
- float* es, | |
- float* vel)*/ | |
+ const int method, | |
+ const float maxval, | |
+ const float lower_cutoff, | |
+ const float focalLength, | |
+ const unsigned int img_width, | |
+ const unsigned int img_height) | |
{ | |
- // Namespace directives | |
using std::cout; | |
using std::cerr; | |
using std::endl; | |
t@@ -418,15 +412,15 @@ __host__ void DEM::render( | |
// Look at the centre of the mean positions | |
float3 lookat = make_float3(maxpos.x, maxpos.y, maxpos.z) / 2.0f; | |
float3 eye = make_float3( | |
- grid.L[0] * 2.3f, | |
- grid.L[1] * -5.0f, | |
- grid.L[2] * 1.3f); | |
+ grid.L[0] * 2.3f, | |
+ grid.L[1] * -5.0f, | |
+ grid.L[2] * 1.3f); | |
cameraInit(eye, lookat, imgw, focalLength); | |
// Construct rays for perspective projection | |
rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>( | |
- dev_ray_origo, dev_ray_direction, | |
- make_float4(eye.x, eye.y, eye.z, 0.0f), width, height); | |
+ dev_ray_origo, dev_ray_direction, | |
+ make_float4(eye.x, eye.y, eye.z, 0.0f), width, height); | |
cudaThreadSynchronize(); | |
Float* linarr; // Linear array to use for color visualization | |
t@@ -440,8 +434,8 @@ __host__ void DEM::render( | |
// Visualize spheres without color scale overlay | |
if (method == 0) { | |
rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>( | |
- dev_ray_origo, dev_ray_direction, | |
- dev_x, dev_img); | |
+ dev_ray_origo, dev_ray_direction, | |
+ dev_x, dev_img); | |
} else { | |
if (method == 1) { // Visualize pressure | |
t@@ -455,8 +449,8 @@ __host__ void DEM::render( | |
#pragma omp parallel for if(np>100) | |
for (i = 0; i<np; ++i) { | |
linarr[i] = sqrt(k.vel[i].x*k.vel[i].x | |
- + k.vel[i].y*k.vel[i].y | |
- + k.vel[i].z*k.vel[i].z); | |
+ + k.vel[i].y*k.vel[i].y | |
+ + k.vel[i].z*k.vel[i].z); | |
} | |
transfer = 1; | |
desc = "Linear velocity"; | |
t@@ -468,8 +462,8 @@ __host__ void DEM::render( | |
#pragma omp parallel for if(np>100) | |
for (i = 0; i<np; ++i) { | |
linarr[i] = sqrt(k.angvel[i].x*k.angvel[i].x | |
- + k.angvel[i].y*k.angvel[i].y | |
- + k.angvel[i].z*k.angvel[i].z); | |
+ + k.angvel[i].y*k.angvel[i].y | |
+ + k.angvel[i].z*k.angvel[i].z); | |
} | |
transfer = 1; | |
desc = "Angular velocity"; | |
t@@ -492,8 +486,8 @@ __host__ void DEM::render( | |
#pragma omp parallel for if(np>100) | |
for (i = 0; i<np; ++i) { | |
linarr[i] = sqrt(k.angpos[i].x*k.angpos[i].x | |
- + k.angpos[i].y*k.angpos[i].y | |
- + k.angpos[i].z*k.angpos[i].z); | |
+ + k.angpos[i].y*k.angpos[i].y | |
+ + k.angpos[i].z*k.angpos[i].z); | |
} | |
transfer = 1; | |
desc = "Angular positions"; | |
t@@ -504,23 +498,24 @@ __host__ void DEM::render( | |
// Report color visualization method and color map range | |
if (verbose == 1) { | |
cout << " " << desc << " color map range: [0, " | |
- << maxval << "] " << unit << endl; | |
+ << maxval << "] " << unit << endl; | |
} | |
// Copy linarr to dev_linarr if required | |
if (transfer == 1) { | |
cudaMalloc((void**)&dev_linarr, np*sizeof(Float)); | |
checkForCudaErrors("Error during cudaMalloc of linear array"); | |
- cudaMemcpy(dev_linarr, linarr, np*sizeof(Float), cudaMemcpyHostToD… | |
+ cudaMemcpy(dev_linarr, linarr, np*sizeof(Float), | |
+ cudaMemcpyHostToDevice); | |
checkForCudaErrors("Error during cudaMemcpy of linear array"); | |
} | |
// Start raytracing kernel | |
rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
- dev_ray_origo, dev_ray_direction, | |
- dev_x, dev_vel, | |
- dev_linarr, maxval, lower_cutoff, | |
- dev_img); | |
+ dev_ray_origo, dev_ray_direction, | |
+ dev_x, dev_vel, | |
+ dev_linarr, maxval, lower_cutoff, | |
+ dev_img); | |
} | |
diff --git a/tests/fluid_particle_interaction.py b/tests/fluid_particle_interac… | |
t@@ -50,5 +50,4 @@ test(sim.vel[0,0] > 0.0, 'Particle 0 velocity:') | |
test(sim.vel[1,0] > 0.0, 'Particle 1 velocity:') | |
test(sim.vel[2,0] > 0.0, 'Particle 2 velocity:') | |
- | |
-sim.cleanup() | |
+#sim.cleanup() |