GopherProxy

	tAdded LBM fluid simulation, values off - sphere - GPU-based 3D discrete elemen…
	git clone git://src.adamsgaard.dk/sphere
	Log
	Files
	Refs
	LICENSE
	---
	commit 18da38f2435da18e108b208f2054eb4c11107207
	parent d456b79cf318820c11ea7fbf63ea807591a44cc4
	Author: Anders Damsgaard <[email protected]>
	Date: Wed, 1 May 2013 17:07:48 +0200

	Added LBM fluid simulation, values off

	Diffstat:
	M CMakeLists.txt \| 7 ++++++-
	M python/sphere.py \| 110 ++++++++++++++++++++++-------…
	M src/datatypes.h \| 1 +
	M src/device.cu \| 139 ++++++++++++++++++++++++++---…
	M src/file_io.cpp \| 24 ++++++++++++++++++++++++
	M src/latticeboltzmann.cuh \| 5 ++---
	M src/sorting.cuh \| 5 ++++-
	M src/sphere.h \| 11 ++++++++++-
	M tests/bond_tests.py \| 5 +++--
	M tests/io_tests.py \| 1 -

	10 files changed, 247 insertions(+), 61 deletions(-)
	---
	diff --git a/CMakeLists.txt b/CMakeLists.txt
	t@@ -23,8 +23,13 @@ find_package(OpenMP)
	# Uncomment to enable testing
	enable_testing()

	-# Set build type
	+# Set build type = Debug
	#set(CMAKE_BUILD_TYPE Debug)
	+#if (CUDA_FOUND)
	+# set(CUDA_NVCC_FLAGS -g;-G)
	+#endif()
	+
	+# Set build type = Release
	set(CMAKE_BUILD_TYPE Release)

	# Add source directory to project.
	diff --git a/python/sphere.py b/python/sphere.py
	t@@ -108,9 +108,18 @@ class Spherebin:
	self.tau_b = numpy.ones(1, dtype=numpy.uint32) * numpy.infty
	self.bonds = numpy.zeros((self.nb0, 2), dtype=numpy.uint32)
	self.bonds_delta_n = numpy.zeros(self.nb0, dtype=numpy.float64)
	- self.bonds_delta_t = numpy.zeros((self.nb0, self.nd), dtype=numpy.floa…
	+ self.bonds_delta_t = numpy.zeros((self.nb0, self.nd),
	+ dtype=numpy.float64)
	self.bonds_omega_n = numpy.zeros(self.nb0, dtype=numpy.float64)
	- self.bonds_omega_t = numpy.zeros((self.nb0, self.nd), dtype=numpy.floa…
	+ self.bonds_omega_t = numpy.zeros((self.nb0, self.nd),
	+ dtype=numpy.float64)
	+
	+ self.nu = numpy.zeros(1, dtype=numpy.float64)
	+ self.f_v = numpy.zeros(
	+ (self.num[0] * self.num[1] * self.num[2], self.nd),
	+ dtype=numpy.float64)
	+ self.f_rho = numpy.zeros(self.num[0] * self.num[1] * self.num[2],
	+ dtype=numpy.float64)

	def __cmp__(self, other):
	""" Called when to Spherebin objects are compared.
	t@@ -176,13 +185,16 @@ class Spherebin:
	self.bonds_delta_n == other.bonds_delta_n and\
	self.bonds_delta_t == other.bonds_delta_t and\
	self.bonds_omega_n == other.bonds_omega_n and\
	- self.bonds_omega_t == other.bonds_omega_t\
	+ self.bonds_omega_t == other.bonds_omega_t and\
	+ self.nu == other.nu and\
	+ (self.f_v == other.f_v).all() and\
	+ (self.f_rho == other.f_rho).all()\
	).all() == True):
	return 0 # All equal
	- else :
	+ else:
	return 1

	- def readbin(self, targetbin, verbose = True, bonds = True, devsmod = True):
	+ def readbin(self, targetbin, verbose = True, bonds = True, devsmod = True,…
	'Reads a target SPHERE binary file'

	fh = None
	t@@ -203,20 +215,16 @@ class Spherebin:
	self.time_step_count = numpy.fromfile(fh, dtype=numpy.uint32, coun…

	# Allocate array memory for particles
	- self.x = numpy.zeros((self.np, self.nd), dtype=numpy.float64)
	- self.radius = numpy.zeros(self.np, dtype=numpy.float64)
	- self.xysum = numpy.zeros((self.np, 2), dtype=numpy.float64)
	- self.vel = numpy.zeros((self.np, self.nd), dtype=numpy.float64)
	- self.fixvel = numpy.zeros(self.np, dtype=numpy.float64)
	- #self.force = numpy.zeros((self.np, self.nd), dtype=numpy.float6…
	- #self.angpos = numpy.zeros((self.np, self.nd), dtype=numpy.float6…
	- #self.angvel = numpy.zeros((self.np, self.nd), dtype=numpy.float6…
	- #self.torque = numpy.zeros((self.np, self.nd), dtype=numpy.float6…
	- self.es_dot = numpy.zeros(self.np, dtype=numpy.float64)
	- self.es = numpy.zeros(self.np, dtype=numpy.float64)
	- self.ev_dot = numpy.zeros(self.np, dtype=numpy.float64)
	- self.ev = numpy.zeros(self.np, dtype=numpy.float64)
	- self.p = numpy.zeros(self.np, dtype=numpy.float64)
	+ self.x = numpy.empty((self.np, self.nd), dtype=numpy.float64)
	+ self.radius = numpy.empty(self.np, dtype=numpy.float64)
	+ self.xysum = numpy.empty((self.np, 2), dtype=numpy.float64)
	+ self.vel = numpy.empty((self.np, self.nd), dtype=numpy.float64)
	+ self.fixvel = numpy.empty(self.np, dtype=numpy.float64)
	+ self.es_dot = numpy.empty(self.np, dtype=numpy.float64)
	+ self.es = numpy.empty(self.np, dtype=numpy.float64)
	+ self.ev_dot = numpy.empty(self.np, dtype=numpy.float64)
	+ self.ev = numpy.empty(self.np, dtype=numpy.float64)
	+ self.p = numpy.empty(self.np, dtype=numpy.float64)

	# Read remaining data from binary
	self.origo = numpy.fromfile(fh, dtype=numpy.float64, count=self…
	t@@ -271,13 +279,13 @@ class Spherebin:

	# Wall data
	self.nw = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
	- self.wmode = numpy.zeros(self.nw, dtype=numpy.int32)
	- self.w_n = numpy.zeros(self.nw*self.nd, dtype=numpy.float64).r…
	- self.w_x = numpy.zeros(self.nw, dtype=numpy.float64)
	- self.w_m = numpy.zeros(self.nw, dtype=numpy.float64)
	- self.w_vel = numpy.zeros(self.nw, dtype=numpy.float64)
	- self.w_force = numpy.zeros(self.nw, dtype=numpy.float64)
	- self.w_devs = numpy.zeros(self.nw, dtype=numpy.float64)
	+ self.wmode = numpy.empty(self.nw, dtype=numpy.int32)
	+ self.w_n = numpy.empty(self.nw*self.nd, dtype=numpy.float64).r…
	+ self.w_x = numpy.empty(self.nw, dtype=numpy.float64)
	+ self.w_m = numpy.empty(self.nw, dtype=numpy.float64)
	+ self.w_vel = numpy.empty(self.nw, dtype=numpy.float64)
	+ self.w_force = numpy.empty(self.nw, dtype=numpy.float64)
	+ self.w_devs = numpy.empty(self.nw, dtype=numpy.float64)

	self.wmode = numpy.fromfile(fh, dtype=numpy.int32, count=self.nw)
	for i in range(self.nw):
	t@@ -298,17 +306,25 @@ class Spherebin:
	self.nb0 = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
	self.sigma_b = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	self.tau_b = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	- self.bonds = numpy.zeros((self.nb0, 2), dtype=numpy.uint32)
	+ self.bonds = numpy.empty((self.nb0, 2), dtype=numpy.uint32)
	for i in range(self.nb0):
	self.bonds[i,0] = numpy.fromfile(fh, dtype=numpy.uint32, c…
	self.bonds[i,1] = numpy.fromfile(fh, dtype=numpy.uint32, c…
	self.bonds_delta_n = numpy.fromfile(fh, dtype=numpy.float64, c…
	- self.bonds_delta_t = numpy.fromfile(fh, dtype=numpy.float64, c…
	+ self.bonds_delta_t = numpy.fromfile(fh, dtype=numpy.float64, c…
	self.bonds_omega_n = numpy.fromfile(fh, dtype=numpy.float64, c…
	- self.bonds_omega_t = numpy.fromfile(fh, dtype=numpy.float64, c…
	+ self.bonds_omega_t = numpy.fromfile(fh, dtype=numpy.float64, c…
	else:
	self.nb0 = numpy.zeros(1, dtype=numpy.uint32)

	+ if (fluid == True):
	+ ncells = self.num[0]self.num[1]self.num[2]
	+ self.nu = numpy.fromfile(fh, dtype=numpy.float64, count=1)
	+ self.f_v = numpy.empty(ncells*self.nd, dtype=numpy.float64)
	+ self.f_rho = numpy.empty(ncells, dtype=numpy.float64)
	+ self.f_v = numpy.fromfile(fh, dtype=numpy.float64, count=ncell…
	+ self.f_rho = numpy.fromfile(fh, dtype=numpy.float64, count=nce…
	+
	finally:
	if fh is not None:
	fh.close()
	t@@ -412,11 +428,18 @@ class Spherebin:
	fh.write(self.bonds_omega_n.astype(numpy.float64))
	fh.write(self.bonds_omega_t.astype(numpy.float64))

	+ fh.write(self.nu.astype(numpy.float64))
	+ fh.write(self.f_v.astype(numpy.float64))
	+ fh.write(self.f_rho.astype(numpy.float64))

	finally:
	if fh is not None:
	fh.close()

	+ def readfirst(self, verbose=True):
	+ fn = "../output/{0}.output00000.bin".format(self.sid)
	+ self.readbin(fn, verbose)
	+
	def readlast(self, verbose=True):
	lastfile = status(self.sid)
	fn = "../output/{0}.output{1:0=5}.bin".format(self.sid, lastfile)
	t@@ -508,6 +531,11 @@ class Spherebin:
	cellsize = 2 * r_max
	self.L = self.num * cellsize

	+ # Init fluid arrays
	+ self.f_v = numpy.zeros((self.num[0]self.num[1]self.num[2],self.nd), …
	+ self.f_rho = numpy.ones(self.num[0]self.num[1]self.num[2], dtype=num…
	+
	+
	# Particle positions randomly distributed without overlap
	for i in range(self.np):
	overlaps = True
	t@@ -551,6 +579,11 @@ class Spherebin:
	print("Error: The grid must be at least 3 cells in each direction")
	print(" Grid: x={}, y={}, z={}".format(self.num[0], self.num[1], s…

	+ # Init fluid arrays
	+ self.f_v = numpy.zeros((self.num[0]self.num[1]self.num[2],self.nd), …
	+ self.f_rho = numpy.ones(self.num[0]self.num[1]self.num[2], dtype=num…
	+
	+
	# Put upper wall at top boundary
	if (self.nw > 0):
	self.w_x[0] = self.L[0]
	t@@ -591,6 +624,11 @@ class Spherebin:
	print("Error: The grid must be at least 3 cells in each direction")
	print(self.num)

	+ # Init fluid arrays
	+ self.f_v = numpy.zeros((self.num[0]self.num[1]self.num[2],self.nd), …
	+ self.f_rho = numpy.ones(self.num[0]self.num[1]self.num[2], dtype=num…
	+
	+
	self.contactmodel[0] = contactmodel

	# Put upper wall at top boundary
	t@@ -656,6 +694,10 @@ class Spherebin:
	self.x[i,0] += 0.5*cellsize
	self.x[i,1] += 0.5*cellsize

	+ # Init fluid arrays
	+ self.f_v = numpy.zeros((self.num[0]self.num[1]self.num[2],self.nd), …
	+ self.f_rho = numpy.ones(self.num[0]self.num[1]self.num[2], dtype=num…
	+
	self.contactmodel[0] = contactmodel

	# Readjust grid to correct size
	t@@ -717,6 +759,10 @@ class Spherebin:
	self.num[2] = numpy.ceil(z_max/cellsize)
	self.L = self.num * cellsize

	+ # Init fluid arrays
	+ self.f_v = numpy.zeros((self.num[0]self.num[1]self.num[2],self.nd), …
	+ self.f_rho = numpy.ones(self.num[0]self.num[1]self.num[2], dtype=num…
	+
	def createBondPair(self, i, j, spacing=-0.1):
	""" Bond particles i and j. Particle j is moved adjacent to particle i,
	and oriented randomly.
	t@@ -940,7 +986,8 @@ class Spherebin:
	gamma_r = 0,
	gamma_wn = 1e4,
	gamma_wt = 1e4,
	- capillaryCohesion = 0):
	+ capillaryCohesion = 0,
	+ nu = 0.0):
	""" Initialize particle parameters to default values.
	Radii must be set prior to calling this function.
	"""
	t@@ -1007,6 +1054,9 @@ class Spherebin:
	# Debonding distance
	self.db[0] = (1.0 + theta/2.0) * self.V_b**(1.0/3.0)

	+ # Fluid dynamic viscosity
	+ self.nu[0] = nu
	+

	def bond(self, i, j):
	""" Create a bond between particles i and j """
	diff --git a/src/datatypes.h b/src/datatypes.h
	t@@ -96,6 +96,7 @@ struct Params {
	Float tau_b; // Bond shear strength
	Float devs_A; // Amplitude of modulations in deviatoric normal str…
	Float devs_f; // Frequency of modulations in deviatoric normal str…
	+ Float nu; // Fluid dynamic viscosity
	};

	// Structure containing wall parameters
	diff --git a/src/device.cu b/src/device.cu
	t@@ -25,6 +25,7 @@
	#include "contactsearch.cuh"
	#include "integration.cuh"
	#include "raytracer.cuh"
	+#include "latticeboltzmann.cuh"


	// Wrapper function for initializing the CUDA components.
	t@@ -140,7 +141,8 @@ __global__ void checkConstantValues(int* dev_equal,
	dev_params->db != devC_params.db \|\|
	dev_params->V_b != devC_params.V_b \|\|
	dev_params->lambda_bar != devC_params.lambda_bar \|\|
	- dev_params->nb0 != devC_params.nb0)
	+ dev_params->nb0 != devC_params.nb0 \|\|
	+ dev_params->nu != devC_params.nu)
	*dev_equal = 2; // Not ok

	}
	t@@ -184,11 +186,12 @@ __host__ void DEM::checkConstantMemory()
	// Are the values equal?
	if (*equal != 0) {
	std::cerr << "Error! The values in constant memory do not "
	- << "seem to be correct (" << *equal << ").\n";
	+ << "seem to be correct (" << *equal << ")." << std::endl;
	exit(1);
	} else {
	if (verbose == 1)
	- std::cout << " Constant values ok (" << *equal << ").\n";
	+ std::cout << " Constant values ok (" << *equal << ")."
	+ << std::endl;
	}
	}

	t@@ -256,7 +259,8 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
	cudaMalloc((void**)&dev_torque, memSizeF4);

	// Particle contact bookkeeping arrays
	- cudaMalloc((void*)&dev_contacts, sizeof(unsigned int)np*NC); // Max NC c…
	+ cudaMalloc((void**)&dev_contacts,
	+ sizeof(unsigned int)npNC); // Max NC contacts per particle
	cudaMalloc((void*)&dev_distmod, memSizeF4NC);
	cudaMalloc((void*)&dev_delta_t, memSizeF4NC);
	cudaMalloc((void*)&dev_bonds, sizeof(uint2)params.nb0);
	t@@ -278,8 +282,10 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
	// Cell-related arrays
	cudaMalloc((void*)&dev_gridParticleCellID, sizeof(unsigned int)np);
	cudaMalloc((void*)&dev_gridParticleIndex, sizeof(unsigned int)np);
	- cudaMalloc((void*)&dev_cellStart, sizeof(unsigned int)grid.num[0]*grid.n…
	- cudaMalloc((void*)&dev_cellEnd, sizeof(unsigned int)grid.num[0]*grid.num…
	+ cudaMalloc((void**)&dev_cellStart,
	+ sizeof(unsigned int)grid.num[0]grid.num[1]*grid.num[2]);
	+ cudaMalloc((void**)&dev_cellEnd,
	+ sizeof(unsigned int)grid.num[0]grid.num[1]*grid.num[2]);

	// Host contact bookkeeping arrays
	k.contacts = new unsigned int[np*NC];
	t@@ -298,9 +304,17 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
	cudaMalloc((void*)&dev_walls_vel0, sizeof(Float)walls.nw);
	// dev_walls_force_partial allocated later

	+ // Fluid arrays
	+ cudaMalloc((void**)&dev_f,
	+ sizeof(Float)grid.num[0]grid.num[1]grid.num[2]19);
	+ cudaMalloc((void**)&dev_f_new,
	+ sizeof(Float)grid.num[0]grid.num[1]grid.num[2]19);
	+ cudaMalloc((void**)&dev_v_rho,
	+ sizeof(Float4)grid.num[0]grid.num[1]*grid.num[2]);
	+
	checkForCudaErrors("End of allocateGlobalDeviceMemory");
	if (verbose == 1)
	- std::cout << "Done\n";
	+ std::cout << "Done" << std::endl;
	}

	__host__ void DEM::freeGlobalDeviceMemory()
	t@@ -348,10 +362,16 @@ __host__ void DEM::freeGlobalDeviceMemory()
	cudaFree(dev_walls_force_partial);
	cudaFree(dev_walls_force_pp);
	cudaFree(dev_walls_vel0);
	+
	+ // Fluid arrays
	+ cudaFree(dev_f);
	+ cudaFree(dev_f_new);
	+ cudaFree(dev_v_rho);
	+
	checkForCudaErrors("During cudaFree calls");

	if (verbose == 1)
	- printf("Done\n");
	+ std::cout << "Done" << std::endl;
	}


	t@@ -427,9 +447,18 @@ __host__ void DEM::transferToGlobalDeviceMemory()
	sizeof(Float), cudaMemcpyHostToDevice);
	}

	+ // Fluid arrays
	+ if (params.nu > 0.0) {
	+ cudaMemcpy( dev_f, f, sizeof(Float)grid.num[0]grid.num[1]*grid.num[2…
	+ cudaMemcpyHostToDevice);
	+ cudaMemcpy( dev_v_rho, v_rho,
	+ sizeof(Float)grid.num[0]grid.num[1]*grid.num[2],
	+ cudaMemcpyHostToDevice);
	+ }
	+
	checkForCudaErrors("End of transferToGlobalDeviceMemory");
	if (verbose == 1)
	- std::cout << "Done\n";
	+ std::cout << "Done" << std::endl;
	}

	__host__ void DEM::transferFromGlobalDeviceMemory()
	t@@ -495,8 +524,15 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
	cudaMemcpy( walls.mvfd, dev_walls_mvfd,
	sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);

	- // Bond parameters
	-
	+ // Fluid arrays
	+ if (params.nu > 0.0) {
	+ cudaMemcpy( f, dev_f,
	+ sizeof(Float)grid.num[0]grid.num[1]grid.num[2]19,
	+ cudaMemcpyDeviceToHost);
	+ cudaMemcpy( v_rho, dev_v_rho,
	+ sizeof(Float)grid.num[0]grid.num[1]*grid.num[2],
	+ cudaMemcpyDeviceToHost);
	+ }

	checkForCudaErrors("End of transferFromGlobalDeviceMemory");
	}
	t@@ -506,6 +542,8 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
	__host__ void DEM::startTime()
	{
	using std::cout; // Namespace directive
	+ using std::cerr; // Namespace directive
	+ using std::endl; // Namespace directive
	std::string outfile;
	char file[200];
	FILE *fp;
	t@@ -523,14 +561,34 @@ __host__ void DEM::startTime()
	// Start CPU clock
	tic = clock();

	- // GPU workload configuration
	- unsigned int threadsPerBlock = 256;
	+ //// GPU workload configuration
	+ //unsigned int threadsPerBlock = 256;
	+ unsigned int threadsPerBlock = 512;
	+
	// Create enough blocks to accomodate the particles
	unsigned int blocksPerGrid = iDivUp(np, threadsPerBlock);
	dim3 dimGrid(blocksPerGrid, 1, 1); // Blocks arranged in 1D grid
	dim3 dimBlock(threadsPerBlock, 1, 1); // Threads arranged in 1D block
	+
	unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock);
	dim3 dimGridBonds(blocksPerGridBonds, 1, 1); // Blocks arranged in 1D grid
	+
	+ // Use 3D block and grid layout for Lattice-Boltzmann fluid calculations
	+ dim3 dimBlockFluid(8, 8, 8); // 512 threads per block
	+ //dim3 dimGridFluid(
	+ //(grid.num[2]+8-1)/8, // Use x as the z
	+ //(grid.num[1]+8-1)/8,
	+ //(grid.num[0]+8-1)/8);
	+ dim3 dimGridFluid(
	+ iDivUp(grid.num[2],dimBlockFluid.x), // Use z as x
	+ iDivUp(grid.num[1],dimBlockFluid.y),
	+ iDivUp(grid.num[0],dimBlockFluid.z)); // Use x as z
	+ if (dimGridFluid.z > 64) {
	+ cerr << "Error: dimGridFluid.z > 64" << endl;
	+ exit(1);
	+ }
	+
	+
	// Shared memory per block
	unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);

	t@@ -544,7 +602,16 @@ __host__ void DEM::startTime()
	<< dimGrid.x << "" << dimGrid.y << "" << dimGrid.z << "\n"
	<< " - Threads per block: "
	<< dimBlock.x << "" << dimBlock.y << "" << dimBlock.z << "\n"
	- << " - Shared memory required per block: " << smemSize << " bytes…
	+ << " - Shared memory required per block: " << smemSize << " bytes"
	+ << endl;
	+ if (params.nu > 0.0) {
	+ cout << " - Blocks per fluid grid: "
	+ << dimGridFluid.x << "" << dimGridFluid.y << "" <<
	+ dimGridFluid.z << "\n"
	+ << " - Threads per fluid block: "
	+ << dimBlockFluid.x << "" << dimBlockFluid.y << "" <<
	+ dimBlockFluid.z << endl;
	+ }
	}

	// Initialize counter variable values
	t@@ -561,17 +628,17 @@ __host__ void DEM::startTime()
	time.step_count);
	fclose(fp);

	- // Write first output data file: output0.bin, thus testing writing of bin …
	- //outfile = "output/" + sid + ".output0.bin";
	- //sprintf(file,"output/%s.output0.bin", sid);
	- //writebin(outfile.c_str());
	+ // Initialize fluid distribution array
	+ initfluid<<< dimGridFluid, dimBlockFluid >>>(dev_v_rho, dev_f);
	+ cudaThreadSynchronize();
	+ checkForCudaErrors("Post initfluid");

	if (verbose == 1) {
	cout << "\n Entering the main calculation time loop...\n\n"
	<< " IMPORTANT: Do not close this terminal, doing so will \n"
	<< " terminate this SPHERE process. Follow the \n"
	<< " progress by executing:\n"
	- << " $ ./sphere_status " << sid << "\n\n";
	+ << " $ ./sphere_status " << sid << endl << endl;
	}


	t@@ -590,6 +657,7 @@ __host__ void DEM::startTime()
	double t_topology = 0.0;
	double t_interact = 0.0;
	double t_bondsLinear = 0.0;
	+ double t_latticeBoltzmannD3Q19 = 0.0;
	double t_integrate = 0.0;
	double t_summation = 0.0;
	double t_integrateWalls = 0.0;
	t@@ -606,8 +674,6 @@ __host__ void DEM::startTime()
	// MAIN CALCULATION TIME LOOP
	while (time.current <= time.total) {

	- // Increment iteration counter
	- ++iter;

	// Print current step number to terminal
	//printf("Step: %d\n", time.step_count);
	t@@ -733,7 +799,8 @@ __host__ void DEM::startTime()

	// Process particle pairs
	if (params.nb0 > 0) {
	- //bondsLinear<<< 1, params.nb0 >>>(
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	bondsLinear<<<dimGridBonds, dimBlock>>>(
	dev_bonds,
	dev_bonds_delta,
	t@@ -751,6 +818,31 @@ __host__ void DEM::startTime()
	checkForCudaErrors("Post bondsLinear", iter);
	}

	+ // Process fluid and particle interaction in each cell
	+ if (params.nu > 0.0) {
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	+ latticeBoltzmannD3Q19<<< dimGridFluid, dimBlockFluid>>> (
	+ dev_f,
	+ dev_f_new,
	+ dev_v_rho,
	+ dev_cellStart,
	+ dev_cellEnd,
	+ dev_x_sorted,
	+ dev_vel_sorted,
	+ dev_force,
	+ dev_gridParticleIndex);
	+ cudaThreadSynchronize();
	+ if (PROFILING == 1)
	+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
	+ &t_latticeBoltzmannD3Q19);
	+ checkForCudaErrors("Post latticeBoltzmannD3Q19", iter);
	+
	+ // Flip flop
	+ swapFloatArrays(dev_f, dev_f_new);
	+ }
	+
	+

	// Update particle kinematics
	if (PROFILING == 1)
	t@@ -822,6 +914,7 @@ __host__ void DEM::startTime()
	// Update timers and counters
	time.current += time.dt;
	filetimeclock += time.dt;
	+ ++iter;

	// Report time to console
	if (verbose == 1) {
	t@@ -938,6 +1031,8 @@ __host__ void DEM::startTime()
	<< "\t(" << 100.0*t_interact/t_sum << " %)\n"
	<< " - bondsLinear:\t" << t_bondsLinear/1000.0 << " s"
	<< "\t(" << 100.0*t_bondsLinear/t_sum << " %)\n"
	+ << " - latticeBoltzmann:\t" << t_latticeBoltzmannD3Q19/1000.0 << …
	+ << "\t(" << 100.0*t_latticeBoltzmannD3Q19/t_sum << " %)\n"
	<< " - integrate:\t\t" << t_integrate/1000.0 << " s"
	<< "\t(" << 100.0*t_integrate/t_sum << " %)\n"
	<< " - summation:\t\t" << t_summation/1000.0 << " s"
	diff --git a/src/file_io.cpp b/src/file_io.cpp
	t@@ -236,6 +236,21 @@ void DEM::readbin(const char *target)
	ifs.read(as_bytes(k.bonds_omega[i].z), sizeof(Float));
	}

	+ // Read fluid parameters
	+ ifs.read(as_bytes(params.nu), sizeof(params.nu));
	+ v_rho = new Float4[grid.num[0]grid.num[1]grid.num[2]];
	+ f = new Float[grid.num[0]grid.num[1]grid.num[2]*19];
	+ if (params.nu > 0.0) {
	+ for (i = 0; i<grid.num[0]grid.num[1]grid.num[2]; ++i) {
	+ ifs.read(as_bytes(v_rho[i].x), sizeof(Float));
	+ ifs.read(as_bytes(v_rho[i].y), sizeof(Float));
	+ ifs.read(as_bytes(v_rho[i].z), sizeof(Float));
	+ }
	+ for (i = 0; i<grid.num[0]grid.num[1]grid.num[2]; ++i)
	+ ifs.read(as_bytes(v_rho[i].w), sizeof(Float));
	+ }
	+
	+
	// Close file if it is still open
	if (ifs.is_open())
	ifs.close();
	t@@ -394,6 +409,15 @@ void DEM::writebin(const char *target)
	ofs.write(as_bytes(k.bonds_omega[i].z), sizeof(Float));
	}

	+ ofs.write(as_bytes(params.nu), sizeof(params.nu));
	+ for (i = 0; i<grid.num[0]grid.num[1]grid.num[2]; ++i) {
	+ ofs.write(as_bytes(v_rho[i].x), sizeof(Float));
	+ ofs.write(as_bytes(v_rho[i].y), sizeof(Float));
	+ ofs.write(as_bytes(v_rho[i].z), sizeof(Float));
	+ }
	+ for (i = 0; i<grid.num[0]grid.num[1]grid.num[2]; ++i)
	+ ofs.write(as_bytes(v_rho[i].w), sizeof(Float));
	+
	// Close file if it is still open
	if (ofs.is_open())
	ofs.close();
	diff --git a/src/latticeboltzmann.cuh b/src/latticeboltzmann.cuh
	t@@ -360,7 +360,7 @@ __global__ void latticeBoltzmannD3Q19(
	dev_f_new[grid2index(nx-1, 0, z, 9)] = fmax(0.0, f_9);

	// Edge 10 (+x,-y): Periodic
	- /*if (x < nx-1 && y > 0) // not at boundary
	+ if (x < nx-1 && y > 0) // not at boundary
	dev_f_new[grid2index( x+1, y-1, z, 10)] = fmax(0.0, f_10);
	else if (x < nx-1) // at -y boundary
	dev_f_new[grid2index( x+1,ny-1, z, 10)] = fmax(0.0, f_10);
	t@@ -445,11 +445,10 @@ __global__ void latticeBoltzmannD3Q19(
	else if (y < ny-1) // at -z boundary
	dev_f_new[grid2index( x, y+1, 0, 17)] = fmax(0.0, f_18);
	else if (z > 0) // at +y boundary
	- dev_f_new[grid2index( x, 0, z+1, 18)] = fmax(0.0, f_18);
	+ dev_f_new[grid2index( x, 0, z-1, 18)] = fmax(0.0, f_18);
	else // at +y-z boundary
	dev_f_new[grid2index( x, 0, 0, 17)] = fmax(0.0, f_18);

	- // */
	}
	}

	diff --git a/src/sorting.cuh b/src/sorting.cuh
	t@@ -1,3 +1,6 @@
	+#ifndef SORTING_CUH_
	+#define SORTING_CUH_
	+
	// Returns the cellID containing the particle, based cubic grid
	// See Bayraktar et al. 2009
	// Kernel is executed on the device, and is callable from the device only
	t@@ -118,5 +121,5 @@ __global__ void reorderArrays(unsigned int* dev_cellStart,
	} // End of reorderArrays(...)


	-
	+#endif
	// vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
	diff --git a/src/sphere.h b/src/sphere.h
	t@@ -134,9 +134,16 @@ class DEM {
	const Float *z_pos,
	const Float *porosity);

	+ // Lattice-Boltzmann data arrays (D3Q19)
	+ Float *f; // Fluid distribution (f0..f18)
	+ Float *dev_f; // Device equivalent
	+ Float *dev_f_new; // Device equivalent
	+ Float4 *v_rho; // Fluid velocity v (xyz), and pressure rho (w)
	+ Float4 *dev_v_rho; // Device equivalent
	+

	- // Values and functions accessible from the outside
	public:
	+ // Values and functions accessible from the outside

	// Constructor, some parameters with default values
	DEM(std::string inputbin,
	t@@ -201,5 +208,7 @@ class DEM {

	};

	+
	+
	#endif
	// vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
	diff --git a/tests/bond_tests.py b/tests/bond_tests.py
	t@@ -45,10 +45,10 @@ for d in distances:
	sb.bond(0, 1)
	sb.defaultParams(gamma_n = 0.0, gamma_t = 0.0)
	#sb.initTemporal(total=0.5, file_dt=0.01)
	-
	#sb.render(verbose=False)
	#visualize(sb.sid, "energy")

	+
	print("# Stability test")
	sb.x[0,:] = numpy.array((2.0, 2.0, 2.0))
	sb.x[1,:] = numpy.array((2.0+2.0*radii+d, 2.0, 2.0))
	t@@ -85,8 +85,9 @@ for d in distances:
	print(passed())
	compareNumpyArrays(sb.vel[:,1:2], z2_2, "vel[:,1:2]")
	compareNumpyArrays(sb.angvel, z2_3, "angvel\t")
	- #printKinematics(sb)
	+ printKinematics(sb)
	#visualize(sb.sid, "energy")
	+ continue

	print("# Normal contraction")
	sb.zeroKinematics()
	diff --git a/tests/io_tests.py b/tests/io_tests.py
	t@@ -21,7 +21,6 @@ compare(orig, py, "Python IO:")

	# Test C++ IO routines
	orig.run(verbose=False, hideinputfile=True)
	-#orig.run()
	cpp = Spherebin()
	cpp.readbin("../output/" + orig.sid + ".output00000.bin", verbose=False)
	compare(orig, cpp, "C++ IO: ")