GopherProxy

	tUpdated sources - sphere - GPU-based 3D discrete element method algorithm with…
	git clone git://src.adamsgaard.dk/sphere
	Log
	Files
	Refs
	LICENSE
	---
	commit d9c25987416182f283d170f16d97a4afa5685f41
	parent 6ff7614d0fd5bbd0306f7ed63a1ea1bfd0a9b3a0
	Author: Anders Damsgaard <[email protected]>
	Date: Wed, 22 Aug 2012 13:19:49 +0200

	Updated sources

	Diffstat:
	M raytracer/header.h \| 11 ++---------
	M raytracer/main.cpp \| 158 ++++++++++++++++-------------…
	A raytracer/o-ppm.h \| 7 +++++++
	M raytracer/rt-kernel-cpu.cpp \| 106 ++++++++++++++++++-----------…
	M raytracer/rt-kernel.h \| 16 +++++++++++-----
	M src/Makefile \| 26 ++++++++++++++++++++------
	M src/constants.cuh \| 37 +++++++++++++++++------------…
	M src/datatypes.h \| 150 +++++++++++++++++++----------…
	M src/device.cu \| 1783 ++++-------------------------…
	M src/file_io.cpp \| 388 +++++++++++++++++++++++------…
	M src/main.cpp \| 127 +++++++++++++++++------------…

	11 files changed, 849 insertions(+), 1960 deletions(-)
	---
	diff --git a/raytracer/header.h b/raytracer/header.h
	t@@ -4,6 +4,8 @@
	#ifndef HEADER_H_
	#define HEADER_H_

	+// Type declaration
	+typedef unsigned int Inttype;

	//// Structure declarations ////

	t@@ -21,13 +23,4 @@ struct rgb {
	unsigned char alpha;
	};

	-
	-
	-//// Prototype functions ////
	-
	-// io.cpp
	-int image_to_ppm(rgb* rgb_img, const char* filename,
	- const unsigned int width, const unsigned int height);
	-
	-
	#endif
	diff --git a/raytracer/main.cpp b/raytracer/main.cpp
	t@@ -5,18 +5,19 @@
	#include <cmath>
	#include <vector_functions.h>
	#include "header.h"
	-#include "rt_kernel.h"
	-#include "rt_kernel_cpu.h"
	+#include "rt-kernel.h"
	+#include "rt-kernel-cpu.h"
	+#include "colorbar.h"
	+#include "o-ppm.h"

	int main(const int argc, const char* argv[])
	{
	using std::cout;

	if (argc < 6 \|\| argc > 8 ){
	- cout << "Usage: " << argv[0] << " <GPU or CPU> <sphere-binary.bin> <width>…
	+ cout << "Usage: " << argv[0] << " <GPU or CPU> <particle-data.txt> <width>…
	<< "or\n"
	- << "Usage: " << argv[0] << " GPU <sphere-binary.bin \| colorbar> <widt…
	- << " <pressure \| pressure50 \| es \| es_dot> <max value in color range…
	+ << "Usage: " << argv[0] << " GPU <particle-data.txt \| colorbar> <widt…
	return 1;
	}

	t@@ -37,22 +38,16 @@ int main(const int argc, const char* argv[])
	// Render colorbar image
	if (strcmp(argv[2],"colorbar") == 0) {

	- for (unsigned int y=0; y<height; y++) {
	- for (unsigned int x=0; x<width; x++) {
	+ for (unsigned int y=0; y<height; ++y) {
	+ for (unsigned int x=0; x<width; ++x) {

	// Colormap value is relative to width position
	float ratio = (float) (x+1)/width;

	- // Determine Blue-White-Red color components
	- float red = fmin(1.0f, 0.209fratioratioratio - 2.49fratio*ratio …
	- float green = fmin(1.0f, -2.44fratioratio + 2.15f*ratio + 0.369f);
	- float blue = fmin(1.0f, -2.21fratioratio + 1.61f*ratio + 0.573f);
	-
	// Write pixel value to image array
	- //img[x + ywidth].r = red 250.f;
	- img[x + ywidth].r = red 250.f;
	- img[x + ywidth].g = green 250.f;
	- img[x + ywidth].b = blue 250.f;
	+ img[x + ywidth].r = red(ratio) 250.f;
	+ img[x + ywidth].g = green(ratio) 250.f;
	+ img[x + ywidth].b = blue(ratio) 250.f;

	}
	}
	t@@ -81,21 +76,17 @@ int main(const int argc, const char* argv[])

	// Allocate particle structure array
	cout << " Allocating host memory\n";
	+
	+ // Single precision arrays used for computations
	float4* p = new float4[np];
	float* fixvel = new float[np];
	float* es_dot = new float[np];
	float* es = new float[np];
	float* pres = new float[np];
	- for (unsigned int i=0; i<np; i++) {
	- fixvel[i] = 0.0f;
	- pres[i] = 0.0f; // Initialize values to zero
	- es_dot[i] = 0.0f;
	- es[i] = 0.0f;
	- }
	+ float* vel = new float[np];

	// Read temporal information
	- float dt, file_dt;
	- double current, total;
	+ double dt, file_dt, current, total;
	unsigned int step_count;
	(void)fread(&dt, sizeof(dt), 1, fin);
	(void)fread(&current, sizeof(current), 1, fin);
	t@@ -103,46 +94,77 @@ int main(const int argc, const char* argv[])
	(void)fread(&file_dt, sizeof(file_dt), 1, fin);
	(void)fread(&step_count, sizeof(step_count), 1, fin);

	+ double d; // Double precision temporary value holder
	+
	// Canonical coordinate system origo
	f3 origo;
	- (void)fread(&origo, sizeof(float)*3, 1, fin);
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ origo.x = (float)d;
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ origo.y = (float)d;
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ origo.z = (float)d;

	// Canonical coordinate system dimensions
	f3 L;
	- (void)fread(&L, sizeof(float)*3, 1, fin);
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ L.x = (float)d;
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ L.y = (float)d;
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ L.z = (float)d;
	+

	// Skip over irrelevant data
	- float blankf;
	- //f3 blankf3;
	+ double blankd;
	+ //f3 blankd3;
	unsigned int blankui;
	for (int j=0; j<3; j++) // Skip over grid.num data
	(void)fread(&blankui, sizeof(blankui), 1, fin);

	+ // Velocity vector, later converted to lenght
	+ float3 v;

	// Load data into particle array
	for (unsigned int i=0; i<np; i++) {
	- (void)fread(&p[i].x, sizeof(float), 1, fin);
	- for (int j=0; j<4; j++)
	- (void)fread(&blankf, sizeof(blankf), 1, fin);
	-
	- (void)fread(&p[i].y, sizeof(float), 1, fin);
	- for (int j=0; j<4; j++)
	- (void)fread(&blankf, sizeof(blankf), 1, fin);
	-
	- (void)fread(&p[i].z, sizeof(float), 1, fin);
	- for (int j=0; j<4; j++)
	- (void)fread(&blankf, sizeof(blankf), 1, fin);
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ p[i].x = (float)d; // Typecast to single precision
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ v.x = (float)d;
	+ for (int j=0; j<3; j++)
	+ (void)fread(&blankd, sizeof(blankd), 1, fin);
	+
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ p[i].y = (float)d;
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ v.y = (float)d;
	+ for (int j=0; j<3; j++)
	+ (void)fread(&blankd, sizeof(blankd), 1, fin);
	+
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ p[i].z = (float)d;
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ v.z = (float)d;
	+ for (int j=0; j<3; j++)
	+ (void)fread(&blankd, sizeof(blankd), 1, fin);
	+
	+ // Save velocity vector length
	+ vel[i] = sqrt(v.xv.x + v.yv.y + v.z*v.z);
	}
	for (unsigned int i=0; i<np; i++) {
	- //(void)fread(&blankf, sizeof(blankf), 1, fin); // fixvel
	- (void)fread(&fixvel[i], sizeof(float), 1, fin); // fixvel
	- (void)fread(&blankf, sizeof(blankf), 1, fin); // xsum
	- (void)fread(&p[i].w, sizeof(float), 1, fin); // radius
	+ (void)fread(&d, sizeof(d), 1, fin); // fixvel
	+ fixvel[i] = (float)d;
	+ (void)fread(&blankd, sizeof(blankd), 1, fin); // xsum
	+ (void)fread(&d, sizeof(d), 1, fin); // radius
	+ p[i].w = (float)d;
	for (int j=0; j<10; j++)
	- (void)fread(&blankf, sizeof(blankf), 1, fin);
	- (void)fread(&es_dot[i], sizeof(float), 1, fin);
	- (void)fread(&es[i], sizeof(float), 1, fin);
	- (void)fread(&pres[i], sizeof(float), 1, fin);
	+ (void)fread(&blankd, sizeof(blankd), 1, fin);
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ es_dot[i] = (float)d;
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ es[i] = (float)d;
	+ (void)fread(&d, sizeof(d), 1, fin);
	+ pres[i] = (float)d;
	}

	fclose(fin); // Binary read complete
	t@@ -151,12 +173,14 @@ int main(const int argc, const char* argv[])
	<< L.x << "" << L.y << "" << L.z << " m\n";

	// Eye position and focus point
	- //f3 eye = {0.5fL.x, -4.2L.y, 0.5f*L.z};
	- f3 eye = {2.5fL.x, -4.2L.y, 2.0f*L.z};
	- f3 lookat = {0.45fL.x, 0.5fL.y, 0.45f*L.z};
	+ //f3 eye = {0.5fL.x, -4.2fL.y, 0.5f*L.z};
	+ //f3 eye = {2.5fL.x, -4.2fL.y, 2.0f*L.z};
	+ //f3 eye = {0.5fL.x, -5.0fL.y, 0.5f*L.z};
	+ f3 eye = {2.5fL.x, -5.0fL.y, 1.5f*L.z};
	+ f3 lookat = {0.5fL.x, 0.5fL.y, 0.5f*L.z};
	if (L.z > (L.x + L.y)/1.5f) { // Render only bottom of world (for initsetu…
	- eye.x = 1.1f*L.x;
	- eye.y = 15.1*L.y;
	+ eye.x = 4.1f*L.x;
	+ eye.y = -15.1*L.y;
	eye.z = 1.1*L.z;
	/lookat.x = 0.45fL.x;
	lookat.y = 0.5f*L.y;
	t@@ -179,40 +203,19 @@ int main(const int argc, const char* argv[])
	visualize = 2; // 2: es_dot view
	if(strcmp(argv[6],"es") == 0)
	visualize = 3; // 3: es view
	- if(strcmp(argv[6],"pressure50") == 0)
	- visualize = 4; // 4: pressure50 view
	+ if(strcmp(argv[6],"vel") == 0)
	+ visualize = 4; // 4: velocity view

	// Read max. value specified in command args.
	max_val = atof(argv[7]);
	- }
	-
	- // Render colorbar image
	- if (strcmp(argv[2],"colorbar") == 0) {
	-
	- for (unsigned int x=0; x<width; x++) {
	- for (unsigned int y=0; y<height; y++) {
	-
	- // Colormap value is relative to width position
	- float ratio = (float)x/width;

	- // Determine Blue-White-Red color components
	- float red = fmin(1.0f, 0.209fratioratioratio - 2.49fratio*rati…
	- float green = fmin(1.0f, -2.44fratioratio + 2.15f*ratio + 0.369f);
	- float blue = fmin(1.0f, -2.21fratioratio + 1.61f*ratio + 0.573f);
	-
	- // Write pixel value to image array
	- img[yheight + x].r = (unsigned char) red 255;
	- img[yheight + x].g = (unsigned char) green 255;
	- img[yheight + x].b = (unsigned char) blue 255;
	-
	- }
	- }
	}

	+
	if (strcmp(argv[1],"GPU") == 0) {

	// Call cuda wrapper
	- if (rt(p, np, img, width, height, origo, L, eye, lookat, imgw, visualize…
	+ if (rt(p, np, img, width, height, origo, L, eye, lookat, imgw, visualize…
	cout << "\nError in rt()\n";
	return 1;
	}
	t@@ -234,6 +237,7 @@ int main(const int argc, const char* argv[])
	delete [] pres;
	delete [] es;
	delete [] es_dot;
	+ delete [] vel;

	}

	diff --git a/raytracer/o-ppm.h b/raytracer/o-ppm.h
	t@@ -0,0 +1,7 @@
	+#ifndef O_PPM_H_
	+#define O_PPM_H_
	+
	+int image_to_ppm(rgb* rgb_img, const char* filename,
	+ const unsigned int width, const unsigned int height);
	+
	+#endif
	diff --git a/raytracer/rt-kernel-cpu.cpp b/raytracer/rt-kernel-cpu.cpp
	t@@ -5,8 +5,9 @@
	#include <cuda.h>
	#include <cutil_math.h>
	#include <string.h>
	+#include <stdlib.h>
	#include "header.h"
	-#include "rt_kernel_cpu.h"
	+#include "rt-kernel-cpu.h"

	// Constants
	float3 constc_u;
	t@@ -35,10 +36,10 @@ __inline__ float lengthf3(float3 in)
	// Kernel for initializing image data
	void imageInit_cpu(unsigned char* _img, unsigned int pixels)
	{
	- for (unsigned int mempos=0; mempos<pixels; mempos++) {
	- _img[mempos*4] = 0; // Red channel
	- _img[mempos*4 + 1] = 0; // Green channel
	- _img[mempos*4 + 2] = 0; // Blue channel
	+ for (unsigned int mempos=0; mempos<pixels; ++mempos) {
	+ _img[mempos*4] = 255; // Red channel
	+ _img[mempos*4 + 1] = 255; // Green channel
	+ _img[mempos*4 + 2] = 255; // Blue channel
	}
	}

	t@@ -49,17 +50,19 @@ void rayInitPerspective_cpu(float3* _ray_origo,
	unsigned int width,
	unsigned int height)
	{
	- int i;
	- #pragma omp parallel for
	- for (i=0; i<width; i++) {
	- for (unsigned int j=0; j<height; j++) {
	+ int i,j;
	+ unsigned int mempos;
	+ float p_u, p_v;
	+ #pragma omp parallel for private(mempos,j,p_u,p_v)
	+ for (i=0; i<(int)width; ++i) {
	+ for (j=0; j<(int)height; ++j) {

	- unsigned int mempos = i + j*height;
	+ mempos = i + j*width;

	// Calculate pixel coordinates in image plane
	- float p_u = constc_imgplane.x + (constc_imgplane.y - constc_imgplane.x)
	+ p_u = constc_imgplane.x + (constc_imgplane.y - constc_imgplane.x)
	* (i + 0.5f) / width;
	- float p_v = constc_imgplane.z + (constc_imgplane.w - constc_imgplane.z)
	+ p_v = constc_imgplane.z + (constc_imgplane.w - constc_imgplane.z)
	* (j + 0.5f) / height;

	// Write ray origo and direction to global memory
	t@@ -74,37 +77,34 @@ void rayInitPerspective_cpu(float3* _ray_origo,
	void rayIntersectSpheres_cpu(float3* _ray_origo,
	float3* _ray_direction,
	float4* _p,
	+ float* _nuance,
	unsigned char* _img,
	unsigned int pixels,
	unsigned int np)
	{
	- int mempos;
	- #pragma omp parallel for
	- for (mempos=0; mempos<pixels; mempos++) {
	+ long int mempos;
	+ float3 e, d, n, p, c;
	+ float tdist, R, Delta, t_minus, dotprod, I_d, k_d, k_a, I_a, nuance;
	+ Inttype i, ifinal;
	+ #pragma omp parallel for private(e,d,n,p,c,tdist,R,Delta,t_minus,dotprod,I_d…
	+ for (mempos=0; mempos<pixels; ++mempos) {

	// Read ray data from global memory
	- float3 e = _ray_origo[mempos];
	- float3 d = _ray_direction[mempos];
	- //float step = lengthf3(d);
	+ e = _ray_origo[mempos];
	+ d = _ray_direction[mempos];

	// Distance, in ray steps, between object and eye initialized with a large…
	- float tdist = 1e10f;
	-
	- // Surface normal at closest sphere intersection
	- float3 n;
	-
	- // Intersection point coordinates
	- float3 p;
	+ tdist = 1e10f;

	// Iterate through all particles
	- for (unsigned int i=0; i<np; i++) {
	+ for (i=0; i<np; ++i) {

	// Read sphere coordinate and radius
	- float3 c = f4_to_f3(_p[i]);
	- float R = _p[i].w;
	+ c = f4_to_f3(_p[i]);
	+ R = _p[i].w;

	// Calculate the discriminant: d = B^2 - 4AC
	- float Delta = (2.0fdot(d,(e-c)))(2.0f*dot(d,(e-c))) // B^2
	+ Delta = (2.0fdot(d,(e-c)))(2.0f*dot(d,(e-c))) // B^2
	- 4.0fdot(d,d) // -4A
	* (dot((e-c),(e-c)) - R*R); // C

	t@@ -113,7 +113,7 @@ void rayIntersectSpheres_cpu(float3* _ray_origo,
	if (Delta > 0.0f) {

	// Calculate roots, Shirley 2009 p. 77
	- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - do…
	+ t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
	* (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));

	// Check wether intersection is closer than previous values
	t@@ -121,6 +121,7 @@ void rayIntersectSpheres_cpu(float3* _ray_origo,
	p = e + t_minus*d;
	tdist = fabs(t_minus);
	n = normalize(2.0f * (p - c)); // Surface normal
	+ ifinal = i;
	}

	} // End of solution branch
	t@@ -131,21 +132,25 @@ void rayIntersectSpheres_cpu(float3* _ray_origo,
	if (tdist < 1e10) {

	// Lambertian shading parameters
	- float dotprod = fabs(dot(n, constc_light));
	- float I_d = 40.0f; // Light intensity
	- float k_d = 5.0f; // Diffuse coefficient
	+ //float dotprod = fabs(dot(n, constc_light));
	+ dotprod = fmax(0.0f,dot(n, constc_light));
	+ I_d = 70.0f; // Light intensity
	+ k_d = 5.0f; // Diffuse coefficient

	// Ambient shading
	- float k_a = 10.0f;
	- float I_a = 5.0f;
	+ k_a = 30.0f;
	+ I_a = 5.0f;
	+
	+ // Read color nuance of grain
	+ nuance = _nuance[ifinal];

	// Write shading model values to pixel color channels
	_img[mempos4] = (unsigned char) ((k_d I_d * dotprod
	- + k_a * I_a)*0.48f);
	+ + k_a * I_a)0.48fnuance);
	_img[mempos4 + 1] = (unsigned char) ((k_d I_d * dotprod
	- + k_a * I_a)*0.41f);
	+ + k_a * I_a)0.41fnuance);
	_img[mempos4 + 2] = (unsigned char) ((k_d I_d * dotprod
	- + k_a * I_a)*0.27f);
	+ + k_a * I_a)0.27fnuance);
	}
	}
	}
	t@@ -160,9 +165,11 @@ void cameraInit_cpu(float3 eye, float3 lookat, float imgw…
	float3 view = eye - lookat;

	// Construct the camera view orthonormal base
	- float3 v = make_float3(0.0f, 1.0f, 0.0f); // v: Pointing upward
	- float3 w = -view/lengthf3(view); // w: Pointing backwards
	- float3 u = cross(make_float3(v.x, v.y, v.z), make_float3(w.x, w.y, w.z)); //…
	+ //float3 up = make_float3(0.0f, 1.0f, 0.0f); // Pointing upward along +y
	+ float3 up = make_float3(0.0f, 0.0f, 1.0f); // Pointing upward along +z
	+ float3 w = -view/length(view); // w: Pointing backwards
	+ float3 u = cross(up, w) / length(cross(up, w));
	+ float3 v = cross(w, u);

	// Focal length 20% of eye vector length
	float d = lengthf3(view)*0.8f;
	t@@ -179,6 +186,8 @@ void cameraInit_cpu(float3 eye, float3 lookat, float imgw,…
	constc_imgplane = imgplane;
	constc_d = d;
	constc_light = light;
	+
	+ std::cout << "Rendering image...";
	}


	t@@ -197,8 +206,15 @@ int rt_cpu(float4* p, unsigned int np,
	// Start timer 1
	t1_go = clock();

	+ // Generate random nuance values for all grains
	+ static float* _nuance; // Values between 0.5 and 1.0
	+ _nuance = new float[np];
	+ srand(0); // Initialize seed at same value at every run
	+ for (unsigned int i=0; i<np; ++i)
	+ _nuance[i] = (float)rand()/RAND_MAX * 0.5f + 0.5f;
	+
	// Allocate memory
	- cout << " Allocating device memory\n";
	+ cout << " Allocating memory\n";
	static unsigned char *_img; // RGBw values in image
	static float3* _ray_origo; // Ray origo (x,y,z)
	static float3* _ray_direction; // Ray direction (x,y,z)
	t@@ -230,7 +246,7 @@ int rt_cpu(float4* p, unsigned int np,
	// Find closest intersection between rays and spheres
	rayIntersectSpheres_cpu(
	_ray_origo, _ray_direction,
	- p, _img, pixels, np);
	+ p, _nuance, _img, pixels, np);

	// Stop timer 2
	t2_stop = clock();
	t@@ -238,6 +254,7 @@ int rt_cpu(float4* p, unsigned int np,
	memcpy(img, _img, sizeof(unsigned char)pixels4);

	// Free dynamically allocated device memory
	+ delete [] _nuance;
	delete [] _img;
	delete [] _ray_origo;
	delete [] _ray_direction;
	t@@ -246,7 +263,8 @@ int rt_cpu(float4* p, unsigned int np,
	t1_stop = clock();

	// Report time spent
	- cout << " Time spent on entire CPU raytracing routine: "
	+ cout << " done.\n"
	+ << " Time spent on entire CPU raytracing routine: "
	<< (t1_stop-t1_go)/CLOCKS_PER_SEC*1000.0 << " ms\n";
	cout << " - Functions: " << (t2_stop-t2_go)/CLOCKS_PER_SEC*1000.0 << " ms\n…

	diff --git a/raytracer/rt-kernel.h b/raytracer/rt-kernel.h
	t@@ -2,6 +2,8 @@
	#define RT_KERNEL_H_

	#include <vector_functions.h>
	+#include "../src/datatypes.h"
	+#include "header.h"

	// Constants
	__constant__ float4 const_u;
	t@@ -11,19 +13,23 @@ __constant__ float4 const_eye;
	__constant__ float4 const_imgplane;
	__constant__ float const_d;
	__constant__ float4 const_light;
	+__constant__ unsigned int const_pixels;
	+__constant__ Inttype const_np;


	// Host prototype functions

	extern "C"
	-void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio);
	+void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio,
	+ unsigned int pixels, Inttype np);

	extern "C"
	void checkForCudaErrors(const char* checkpoint_description);

	extern "C"
	-int rt(float4* p, const unsigned int np,
	- rgb* img, const unsigned int width, const unsigned int height,
	- f3 origo, f3 L, f3 eye, f3 lookat, float imgw);
	-
	+int rt(float4* p, Inttype np,
	+ rgb* img, unsigned int width, unsigned int height,
	+ f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
	+ int visualize, float max_val,
	+ float* fixvel, float* pres, float* es_dot, float* es, float* vel);
	#endif
	diff --git a/src/Makefile b/src/Makefile
	t@@ -1,6 +1,11 @@
	+# Cuda paths
	CUDA_INSTALL_PATH=/usr/local/cuda
	CUDA_BIN=$(CUDA_INSTALL_PATH)/bin

	+# Define the editor and optional arguments
	+EDITOR=vim -p
	+
	+# Define compilers and linker
	CC=g++
	NVCC=$(CUDA_BIN)/nvcc
	LINKER=$(CC)
	t@@ -26,7 +31,7 @@ CCFILES=main.cpp file_io.cpp
	CUFILES=device.cu utility.cu
	CCOBJECTS=$(CCFILES:.cpp=.o)
	CUOBJECTS=$(CUFILES:.cu=.o)
	- OBJECTS=$(CCOBJECTS) $(CUOBJECTS)
	+OBJECTS=$(CCOBJECTS) $(CUOBJECTS)

	# Detect OS
	OSUPPER=$(shell uname -s 2>/dev/null \| tr [:lower:] [:upper:])
	t@@ -53,7 +58,10 @@ LDFLAGS+=-L$(CUDA_INSTALL_PATH)/lib
	LDFLAGS+=-L$(SDKPATH)/../../shared/lib -L$(SDKPATH)/../lib
	LDFLAGS+=-lcutil_x86_64 -lcuda -lcudart

	-all: $(CCFILES) $(CUFILES) $(EXECUTABLE)
	+all: $(CCFILES) $(CUFILES) $(EXECUTABLE) raytracer
	+
	+raytracer:
	+ $(MAKE) -C ../raytracer/

	$(EXECUTABLE): $(OBJECTS)
	$(LINKER) $(OBJECTS) $(LDFLAGS) -o $@
	t@@ -61,20 +69,20 @@ $(EXECUTABLE): $(OBJECTS)
	utility.o: utility.cu
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@

	-file_io.o: file_io.cpp
	+file_io.o: file_io.cpp datatypes.h
	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@

	-device.o: device.cu
	+device.o: device.cu datatypes.h *.cuh
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@

	-main.o: main.cpp
	+main.o: main.cpp datatypes.h
	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@

	../sphere_status: sphere_status.cpp
	$(CC) $(CCFLAGS) sphere_status.cpp -o ../sphere_status

	edit:
	- vim -p Makefile $(CCFILES) $(CUFILES) .h .cuh
	+ $(EDITOR) Makefile $(CCFILES) $(CUFILES) .h .cuh

	backup:
	tar cvfz ../../sphere_backup/$(BACKUPNAME) ../../sphere
	t@@ -83,4 +91,10 @@ backup:
	clean:
	$(RM) $(OBJECTS)
	$(RM) ../sphere_*
	+ $(MAKE) -C ../raytracer clean
	+
	+clear:
	+ # Remove all output data and images
	+ $(RM) -rf ../img_out/*.{ppm,png}
	+ $(RM) -rf ../output/*.{dat,bin}

	diff --git a/src/constants.cuh b/src/constants.cuh
	t@@ -2,28 +2,31 @@
	#define CONSTANTS_CUH_

	// Constant memory size: 64 kb
	-__constant__ float devC_origo[ND]; // World coordinate system origo
	-__constant__ float devC_L[ND]; // World length in dimension ND
	+__constant__ Float devC_origo[ND]; // World coordinate system origo
	+__constant__ Float devC_L[ND]; // World length in dimension ND
	__constant__ unsigned int devC_num[ND]; // Number of cells in dimens…
	-__constant__ float devC_dt; // Time step length
	+__constant__ Float devC_dt; // Time step length
	__constant__ int devC_global; // Parameter properties, 1: g…
	-__constant__ float devC_g[ND]; // Gravitational acceleration …
	+__constant__ Float devC_g[ND]; // Gravitational acceleration …
	__constant__ unsigned int devC_np; // Number of particles
	__constant__ char devC_nc; // Max. number of contacts a par…
	__constant__ unsigned int devC_shearmodel; // Shear force model: 1: viscous, f…
	-__constant__ float devC_k_n; // Material normal stiffness
	-__constant__ float devC_k_s; // Material shear stiffness
	-__constant__ float devC_k_r; // Material rolling stiffness
	-__constant__ float devC_gamma_n; // Material normal viscosity
	-__constant__ float devC_gamma_s; // Material shear viscosity
	-__constant__ float devC_gamma_r; // Material rolling viscos…
	-__constant__ float devC_mu_s; // Material static shear fricti…
	-__constant__ float devC_mu_d; // Material dynamic shear frict…
	-__constant__ float devC_mu_r; // Material rolling friction …
	-__constant__ float devC_rho; // Material density
	-__constant__ float devC_kappa; // Capillary bond prefactor
	-__constant__ float devC_db; // Debonding distance
	-__constant__ float devC_V_b; // Liquid volume of capillary …
	+__constant__ Float devC_k_n; // Material normal stiffness
	+__constant__ Float devC_k_s; // Material shear stiffness
	+__constant__ Float devC_k_r; // Material rolling stiffness
	+__constant__ Float devC_gamma_n; // Material normal viscosity
	+__constant__ Float devC_gamma_s; // Material shear viscosity
	+__constant__ Float devC_gamma_r; // Material rolling viscos…
	+__constant__ Float devC_gamma_wn; // Wall normal viscosity
	+__constant__ Float devC_gamma_ws; // Wall shear viscosity
	+__constant__ Float devC_gamma_wr; // Wall rolling viscosity
	+__constant__ Float devC_mu_s; // Material static shear fricti…
	+__constant__ Float devC_mu_d; // Material dynamic shear frict…
	+__constant__ Float devC_mu_r; // Material rolling friction …
	+__constant__ Float devC_rho; // Material density
	+__constant__ Float devC_kappa; // Capillary bond prefactor
	+__constant__ Float devC_db; // Debonding distance
	+__constant__ Float devC_V_b; // Liquid volume of capillary …
	__constant__ unsigned int devC_nw; // Number of walls
	__constant__ unsigned int devC_w_n; // Dimension of orthogonal wall …
	__constant__ int devC_periodic; // Behavior of x- and y boundaries:…
	diff --git a/src/datatypes.h b/src/datatypes.h
	t@@ -4,25 +4,60 @@
	#ifndef DATATYPES_H_
	#define DATATYPES_H_

	-/////////////////////////////////
	-// STATIC VARIABLE DECLARATION //
	-/////////////////////////////////
	+#include <math.h>
	+#include "vector_functions.h"
	+//#include "vector_arithmetic.h"


	-#include "vector_functions.h"
	+// Enable profiling of kernel runtimes?
	+// 0: No
	+// 1: Yes
	+#define PROFILING 1
	+
	+
	+//////////////////////
	+// TYPE DEFINITIONS //
	+//////////////////////
	+
	+// REMEMBER: When changing the precision below,
	+// change values in typedefs.h accordingly.
	+
	+// Uncomment all five lines below for single precision
	+/*
	+typedef Float Float;
	+typedef Float3 Float3;
	+typedef Float4 Float4;
	+#define MAKE_FLOAT3(x, y, z) make_Float3(x, y, z)
	+#define MAKE_FLOAT4(x, y, z, w) make_Float4(x, y, z, w)
	+*/
	+
	+
	+// Uncomment all five lines below for double precision
	+///*
	+typedef double Float;
	+typedef double2 Float2;
	+typedef double3 Float3;
	+typedef double4 Float4;
	+#define MAKE_FLOAT3(x, y, z) make_double3(x, y, z)
	+#define MAKE_FLOAT4(x, y, z, w) make_double4(x, y, z, w)
	+//*/
	+

	-//// Symbolic constants
	+////////////////////////
	+// SYMBOLIC CONSTANTS //
	+////////////////////////

	-const float PI = 3.14159265358979f;
	+const Float PI = 3.14159265358979;

	// Number of dimensions (1 and 2 NOT functional)
	const unsigned int ND = 3;

	// Define source code version
	-const float VERS = 0.25;
	+const Float VERS = 0.25;

	// Max. number of contacts per particle
	-const char NC = 16;
	+//const char NC = 16;
	+const char NC = 32;


	///////////////////////////
	t@@ -31,39 +66,39 @@ const char NC = 16;

	// Structure containing variable particle parameters
	struct Particles {
	- float *radius;
	- float *k_n;
	- float *k_s;
	- float *k_r;
	- float *gamma_n;
	- float *gamma_s;
	- float *gamma_r;
	- float *mu_s;
	- float *mu_d;
	- float *mu_r;
	- float *rho;
	- float *es_dot;
	- float *es;
	- float *p;
	- float *m;
	- float *I;
	+ Float *radius;
	+ Float *k_n;
	+ Float *k_s;
	+ Float *k_r;
	+ Float *gamma_n;
	+ Float *gamma_s;
	+ Float *gamma_r;
	+ Float *mu_s;
	+ Float *mu_d;
	+ Float *mu_r;
	+ Float *rho;
	+ Float *es_dot;
	+ Float *es;
	+ Float *p;
	+ Float *m;
	+ Float *I;
	unsigned int np;
	};

	// Structure containing grid parameters
	struct Grid {
	unsigned int nd;
	- float *origo;
	- float *L;
	+ Float *origo;
	+ Float *L;
	unsigned int *num;
	};

	// Structure containing time parameters
	struct Time {
	- float dt;
	+ Float dt;
	double current;
	double total;
	- float file_dt;
	+ Float file_dt;
	unsigned int step_count;
	};

	t@@ -71,23 +106,26 @@ struct Time {
	struct Params {
	//bool global;
	int global;
	- float *g;
	+ Float *g;
	unsigned int np;
	unsigned int nw;
	- float dt;
	- float k_n;
	- float k_s;
	- float k_r;
	- float gamma_n;
	- float gamma_s;
	- float gamma_r;
	- float mu_s;
	- float mu_d;
	- float mu_r;
	- float rho;
	- float kappa;
	- float db;
	- float V_b;
	+ Float dt;
	+ Float k_n;
	+ Float k_s;
	+ Float k_r;
	+ Float gamma_n;
	+ Float gamma_s;
	+ Float gamma_r;
	+ Float gamma_wn;
	+ Float gamma_ws;
	+ Float gamma_wr;
	+ Float mu_s;
	+ Float mu_d;
	+ Float mu_r;
	+ Float rho;
	+ Float kappa;
	+ Float db;
	+ Float V_b;
	int periodic;
	unsigned int shearmodel;
	};
	t@@ -97,29 +135,29 @@ struct Params {
	// PROTOTYPE FUNCTIONS //
	/////////////////////////
	int fwritebin(char target, Particles p,
	- float4 host_x, float4 host_vel,
	- float4 host_angvel, float4 host_force,
	- float4 host_torque, uint4 host_bonds,
	+ Float4 host_x, Float4 host_vel,
	+ Float4 host_angvel, Float4 host_force,
	+ Float4 host_torque, uint4 host_bonds,
	Grid grid, Time time, Params *params,
	- float4 host_w_nx, float4 host_w_mvfd);
	+ Float4 host_w_nx, Float4 host_w_mvfd);

	// device.cu
	//extern "C"
	void initializeGPU(void);

	//extern "C"
	-void gpuMain(float4* host_x,
	- float4* host_vel,
	- float4* host_acc,
	- float4* host_angvel,
	- float4* host_angacc,
	- float4* host_force,
	- float4* host_torque,
	+void gpuMain(Float4* host_x,
	+ Float4* host_vel,
	+ Float4* host_acc,
	+ Float4* host_angvel,
	+ Float4* host_angacc,
	+ Float4* host_force,
	+ Float4* host_torque,
	uint4* host_bonds,
	Particles* p, Grid* grid,
	Time* time, Params* params,
	- float4* host_w_nx,
	- float4* host_w_mvfd,
	+ Float4* host_w_nx,
	+ Float4* host_w_mvfd,
	const char* cwd,
	const char* inputbin);

	diff --git a/src/device.cu b/src/device.cu
	t@@ -2,8 +2,8 @@
	#include <iostream>
	#include <cstdio>
	#include <cuda.h>
	-#include <cutil_math.h>

	+#include "vector_arithmetic.h"
	#include "thrust/device_ptr.h"
	#include "thrust/sort.h"

	t@@ -11,1488 +11,11 @@
	#include "datatypes.cuh"
	#include "constants.cuh"

	-//#include "cuPrintf.cu"
	-
	-
	-// Returns the cellID containing the particle, based cubic grid
	-// See Bayraktar et al. 2009
	-// Kernel is executed on the device, and is callable from the device only
	-__device__ int calcCellID(float3 x)
	-{
	- unsigned int i_x, i_y, i_z;
	-
	- // Calculate integral coordinates:
	- i_x = floor((x.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
	- i_y = floor((x.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
	- i_z = floor((x.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
	-
	- // Integral coordinates are converted to 1D coordinate:
	- return __umul24(__umul24(i_z, devC_num[1]),
	- devC_num[0])
	- + __umul24(i_y, devC_num[0]) + i_x;
	-
	-} // End of calcCellID(...)
	-
	-
	-// Calculate hash value for each particle, based on position in grid.
	-// Kernel executed on device, and callable from host only.
	-__global__ void calcParticleCellID(unsigned int* dev_gridParticleCellID,
	- unsigned int* dev_gridParticleIndex,
	- float4* dev_x)
	-{
	- //unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
	- unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	-
	- if (idx < devC_np) { // Condition prevents block size error
	-
	- //volatile float4 x = dev_x[idx]; // Ensure coalesced read
	- float4 x = dev_x[idx]; // Ensure coalesced read
	-
	- unsigned int cellID = calcCellID(make_float3(x.x, x.y, x.z));
	-
	- // Store values
	- dev_gridParticleCellID[idx] = cellID;
	- dev_gridParticleIndex[idx] = idx;
	-
	- }
	-} // End of calcParticleCellID(...)
	-
	-
	-// Reorder particle data into sorted order, and find the start and end particl…
	-// of each cell in the sorted hash array.
	-// Kernel executed on device, and callable from host only.
	-__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_c…
	- unsigned int* dev_gridParticleCellID,
	- unsigned int* dev_gridParticleIndex,
	- float4* dev_x, float4* dev_vel,
	- float4* dev_angvel, float* dev_radius,
	- //uint4* dev_bonds,
	- float4* dev_x_sorted, float4* dev_vel_sorted,
	- float4* dev_angvel_sorted, float* dev_radius_sor…
	- //uint4* dev_bonds_sorted)
	-{
	-
	- // Create hash array in shared on-chip memory. The size of the array
	- // (threadsPerBlock + 1) is determined at launch time (extern notation).
	- extern __shared__ unsigned int shared_data[];
	-
	- // Thread index in block
	- unsigned int tidx = threadIdx.x;
	-
	- // Thread index in grid
	- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
	- //unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	-
	- // CellID hash value of particle idx
	- unsigned int cellID;
	-
	- // Read cellID data and store it in shared memory (shared_data)
	- if (idx < devC_np) { // Condition prevents block size error
	- cellID = dev_gridParticleCellID[idx];
	-
	- // Load hash data into shared memory, allowing access to neighbor particle…
	- shared_data[tidx+1] = cellID;
	-
	- if (idx > 0 && tidx == 0) {
	- // First thread in block must load neighbor particle hash
	- shared_data[0] = dev_gridParticleCellID[idx-1];
	- }
	- }
	-
	- // Pause completed threads in this block, until all
	- // threads are done loading data into shared memory
	- __syncthreads();
	-
	- // Find lowest and highest particle index in each cell
	- if (idx < devC_np) { // Condition prevents block size error
	- // If this particle has a different cell index to the previous particle, i…
	- // particle in the cell -> Store the index of this particle in the cell.
	- // The previous particle must be the last particle in the previous cell.
	- if (idx == 0 \|\| cellID != shared_data[tidx]) {
	- dev_cellStart[cellID] = idx;
	- if (idx > 0)
	- dev_cellEnd[shared_data[tidx]] = idx;
	- }
	-
	- // Check wether the thread is the last one
	- if (idx == (devC_np - 1))
	- dev_cellEnd[cellID] = idx + 1;
	-
	-
	- // Use the sorted index to reorder the position and velocity data
	- unsigned int sortedIndex = dev_gridParticleIndex[idx];
	-
	- // Fetch from global read
	- float4 x = dev_x[sortedIndex];
	- float4 vel = dev_vel[sortedIndex];
	- float4 angvel = dev_angvel[sortedIndex];
	- float radius = dev_radius[sortedIndex];
	- //uint4 bonds = dev_bonds[sortedIndex];
	-
	- __syncthreads();
	- // Write sorted data to global memory
	- dev_x_sorted[idx] = x;
	- dev_vel_sorted[idx] = vel;
	- dev_angvel_sorted[idx] = angvel;
	- dev_radius_sorted[idx] = radius;
	- //dev_bonds_sorted[idx] = bonds;
	- }
	-} // End of reorderArrays(...)
	-
	-
	-
	-
	-// COLLISION LOCATION AND PROCESSING FUNCTIONS
	-
	-
	-// Linear viscoelastic contact model for particle-wall interactions
	-// with tangential friction and rolling resistance
	-__device__ float contactLinear_wall(float3* N, float3* T, float* es_dot, float…
	- unsigned int idx_a, float radius_a,
	- float4* dev_vel_sorted, float4* dev_angvel…
	- float3 n, float delta, float wvel)
	-{
	- // Fetch particle velocities from global memory
	- const float4 linvel_tmp = dev_vel_sorted[idx_a];
	- const float4 angvel_tmp = dev_angvel_sorted[idx_a];
	-
	- // Convert velocities to three-component vectors
	- const float3 linvel = make_float3(linvel_tmp.x,
	- linvel_tmp.y,
	- linvel_tmp.z);
	- const float3 angvel = make_float3(angvel_tmp.x,
	- angvel_tmp.y,
	- angvel_tmp.z);
	-
	- // Store the length of the angular velocity for later use
	- const float angvel_length = length(angvel);
	-
	- // Contact velocity is the sum of the linear and
	- // rotational components
	- const float3 vel = linvel + radius_a * cross(n, angvel) + wvel;
	-
	- // Normal component of the contact velocity
	- const float vel_n = dot(vel, n);
	-
	- // The tangential velocity is the contact velocity
	- // with the normal component subtracted
	- const float3 vel_t = vel - n * (dot(vel, n));
	- const float vel_t_length = length(vel_t);
	-
	- // Calculate elastic normal component
	- //float3 f_n = -devC_k_n * delta * n;
	-
	- // Normal force component: Elastic - viscous damping
	- float3 f_n = (-devC_k_n * delta - devC_gamma_n * vel_n) * n;
	-
	- // Make sure the viscous damping doesn't exceed the elastic component,
	- // i.e. the damping factor doesn't exceed the critical damping, 2sqrt(mk_n)
	- if (dot(f_n, n) < 0.0f)
	- f_n = make_float3(0.0f, 0.0f, 0.0f);
	-
	- const float f_n_length = length(f_n); // Save length for later use
	-
	- // Initialize vectors
	- float3 f_t = make_float3(0.0f, 0.0f, 0.0f);
	- float3 T_res = make_float3(0.0f, 0.0f, 0.0f);
	-
	- // Check that the tangential velocity is high enough to avoid
	- // divide by zero (producing a NaN)
	- if (vel_t_length > 0.f) {
	-
	- const float f_t_visc = devC_gamma_s * vel_t_length; // Tangential force b…
	- const float f_t_limit = devC_mu_s * f_n_length; // Max. friction
	-
	- // If the shear force component exceeds the friction,
	- // the particle slips and energy is dissipated
	- if (f_t_visc < f_t_limit) {
	- f_t = -1.0f * f_t_visc * vel_t/vel_t_length;
	-
	- } else { // Dynamic friction, friction failure
	- f_t = -1.0f * f_t_limit * vel_t/vel_t_length;
	-
	- // Shear energy production rate [W]
	- //*es_dot += -dot(vel_t, f_t);
	- }
	- }
	-
	-/* if (angvel_length > 0.f) {
	- // Apply rolling resistance (Zhou et al. 1999)
	- //T_res = -angvel_a/angvel_length * devC_mu_r * radius_a * f_n_length;
	-
	- // New rolling resistance model
	- T_res = -1.0f * fmin(devC_gamma_r * radius_a * angvel_length,
	- devC_mu_r * radius_a * f_n_length)
	- * angvel_a/angvel_length;
	- }*/
	-
	- // Total force from wall
	- *N += f_n + f_t;
	-// *N += f_n;
	-
	- // Total torque from wall
	- T += -radius_a cross(n, f_t) + T_res;
	-
	- // Pressure excerted onto particle from this contact
	- p += f_n_length / (4.0f PI * radius_a*radius_a);
	-
	- // Return force excerted onto the wall
	- //return -dot(*N, n);
	- return dot(f_n, n);
	-}
	-
	-
	-// Linear vicoelastic contact model for particle-particle interactions
	-// with tangential friction and rolling resistance
	-__device__ void contactLinearViscous(float3* N, float3* T, float* es_dot, floa…
	- unsigned int idx_a, unsigned in…
	- float4* dev_vel_sorted,
	- float4* dev_angvel_sorted,
	- float radius_a, float radius_b,
	- float3 x_ab, float x_ab_length,
	- float delta_ab, float kappa)
	-{
	-
	- // Allocate variables and fetch missing time=t values for particle A and B
	- const float4 vel_a = dev_vel_sorted[idx_a];
	- const float4 vel_b = dev_vel_sorted[idx_b];
	- const float4 angvel4_a = dev_angvel_sorted[idx_a];
	- const float4 angvel4_b = dev_angvel_sorted[idx_b];
	-
	- // Convert to float3's
	- const float3 angvel_a = make_float3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
	- const float3 angvel_b = make_float3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
	-
	- // Force between grain pair decomposed into normal- and tangential part
	- float3 f_n, f_t, f_c, T_res;
	-
	- // Normal vector of contact
	- const float3 n_ab = x_ab/x_ab_length;
	-
	- // Relative contact interface velocity, w/o rolling
	- const float3 vel_ab_linear = make_float3(vel_a.x - vel_b.x,
	- vel_a.y - vel_b.y,
	- vel_a.z - vel_b.z);
	-
	- // Relative contact interface velocity of particle surfaces at
	- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
	- const float3 vel_ab = vel_ab_linear
	- + radius_a * cross(n_ab, angvel_a)
	- + radius_b * cross(n_ab, angvel_b);
	-
	- // Relative contact interface rolling velocity
	- const float3 angvel_ab = angvel_a - angvel_b;
	- const float angvel_ab_length = length(angvel_ab);
	-
	- // Normal component of the relative contact interface velocity
	- const float vel_n_ab = dot(vel_ab_linear, n_ab);
	-
	- // Tangential component of the relative contact interface velocity
	- // Hinrichsen and Wolf 2004, eq. 13.9
	- const float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
	- const float vel_t_ab_length = length(vel_t_ab);
	-
	- // Compute the normal stiffness of the contact
	- //float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
	-
	- // Calculate rolling radius
	- const float R_bar = (radius_a + radius_b) / 2.0f;
	-
	- // Normal force component: linear-elastic approximation (Augier 2009, eq. 3)
	- // with velocity dependant damping
	- // Damping coefficient: alpha = 0.8
	- //f_n = (-k_n_ab * delta_ab + 2.0f * 0.8f * sqrtf(m_effk_n_ab) vel_ab) * …
	-
	- // Linear spring for normal component (Renzo 2004, eq. 35)
	- // Dissipation due to plastic deformation is modelled by using a different
	- // unloading spring constant (Walton and Braun 1986)
	- // Here the factor in the second term determines the relative strength of the
	- // unloading spring relative to the loading spring.
	- /* if (vel_n_ab > 0.0f) { // Loading
	- f_n = (-k_n_ab * delta_ab) * n_ab;
	- } else { // Unloading
	- f_n = (-k_n_ab * 0.90f * delta_ab) * n_ab;
	- } // f_n is OK! */
	-
	- // Normal force component: Elastic
	- //f_n = -k_n_ab * delta_ab * n_ab;
	-
	- // Normal force component: Elastic - viscous damping
	- f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
	-
	- // Make sure the viscous damping doesn't exceed the elastic component,
	- // i.e. the damping factor doesn't exceed the critical damping, 2sqrt(mk_n)
	- if (dot(f_n, n_ab) < 0.0f)
	- f_n = make_float3(0.0f, 0.0f, 0.0f);
	-
	- const float f_n_length = length(f_n);
	-
	- // Add max. capillary force
	- f_c = -kappa * sqrtf(radius_a * radius_b) * n_ab;
	-
	- // Initialize force vectors to zero
	- f_t = make_float3(0.0f, 0.0f, 0.0f);
	- T_res = make_float3(0.0f, 0.0f, 0.0f);
	-
	- // Shear force component: Nonlinear relation
	- // Coulomb's law of friction limits the tangential force to less or equal
	- // to the normal force
	- if (vel_t_ab_length > 0.f) {
	-
	- // Tangential force by viscous model
	- const float f_t_visc = devC_gamma_s * vel_t_ab_length;
	-
	- // Determine max. friction
	- float f_t_limit;
	- if (vel_t_ab_length > 0.001f) { // Dynamic
	- f_t_limit = devC_mu_d * length(f_n-f_c);
	- } else { // Static
	- f_t_limit = devC_mu_s * length(f_n-f_c);
	- }
	-
	- // If the shear force component exceeds the friction,
	- // the particle slips and energy is dissipated
	- if (f_t_visc < f_t_limit) { // Static
	- f_t = -1.0f * f_t_visc * vel_t_ab/vel_t_ab_length;
	-
	- } else { // Dynamic, friction failure
	- f_t = -1.0f * f_t_limit * vel_t_ab/vel_t_ab_length;
	-
	- // Shear friction production rate [W]
	- //*es_dot += -dot(vel_t_ab, f_t);
	- }
	- }
	-
	-/* if (angvel_ab_length > 0.f) {
	- // Apply rolling resistance (Zhou et al. 1999)
	- //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
	-
	- // New rolling resistance model
	- T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
	- devC_mu_r * R_bar * f_n_length)
	- * angvel_ab/angvel_ab_length;
	- }
	-*/
	-
	- // Add force components from this collision to total force for particle
	- *N += f_n + f_t + f_c;
	- T += -R_bar cross(n_ab, f_t) + T_res;
	-
	- // Pressure excerted onto the particle from this contact
	- p += f_n_length / (4.0f PI * radius_a*radius_a);
	-
	-} // End of contactLinearViscous()
	-
	-
	-// Linear elastic contact model for particle-particle interactions
	-__device__ void contactLinear(float3* N, float3* T,
	- float* es_dot, float* p,
	- unsigned int idx_a_orig,
	- unsigned int idx_b_orig,
	- float4 vel_a,
	- float4* dev_vel,
	- float3 angvel_a,
	- float4* dev_angvel,
	- float radius_a, float radius_b,
	- float3 x_ab, float x_ab_length,
	- float delta_ab, float4* dev_delta_t,
	- unsigned int mempos)
	-{
	-
	- // Allocate variables and fetch missing time=t values for particle A and B
	- const float4 vel_b = dev_vel[idx_b_orig];
	- const float4 angvel4_b = dev_angvel[idx_b_orig];
	-
	- // Fetch previous sum of shear displacement for the contact pair
	- const float4 delta_t0_4 = dev_delta_t[mempos];
	-
	- const float3 delta_t0_uncor = make_float3(delta_t0_4.x,
	- delta_t0_4.y,
	- delta_t0_4.z);
	-
	- // Convert to float3
	- const float3 angvel_b = make_float3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
	-
	- // Force between grain pair decomposed into normal- and tangential part
	- float3 f_n, f_t, f_c, T_res;
	-
	- // Normal vector of contact
	- const float3 n_ab = x_ab/x_ab_length;
	-
	- // Relative contact interface velocity, w/o rolling
	- const float3 vel_ab_linear = make_float3(vel_a.x - vel_b.x,
	- vel_a.y - vel_b.y,
	- vel_a.z - vel_b.z);
	-
	- // Relative contact interface velocity of particle surfaces at
	- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
	- const float3 vel_ab = vel_ab_linear
	- + radius_a * cross(n_ab, angvel_a)
	- + radius_b * cross(n_ab, angvel_b);
	-
	- // Relative contact interface rolling velocity
	- const float3 angvel_ab = angvel_a - angvel_b;
	- const float angvel_ab_length = length(angvel_ab);
	-
	- // Normal component of the relative contact interface velocity
	- const float vel_n_ab = dot(vel_ab_linear, n_ab);
	-
	- // Tangential component of the relative contact interface velocity
	- // Hinrichsen and Wolf 2004, eq. 13.9
	- const float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
	- const float vel_t_ab_length = length(vel_t_ab);
	-
	- // Correct tangential displacement vector, which is
	- // necessary if the tangential plane rotated
	- const float3 delta_t0 = delta_t0_uncor - (n_ab * dot(delta_t0_uncor, n_ab));
	- const float delta_t0_length = length(delta_t0);
	-
	- // New tangential displacement vector
	- float3 delta_t;
	-
	- // Compute the normal stiffness of the contact
	- //float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
	-
	- // Calculate rolling radius
	- const float R_bar = (radius_a + radius_b) / 2.0f;
	-
	- // Normal force component: Elastic
	- //f_n = -devC_k_n * delta_ab * n_ab;
	-
	- // Normal force component: Elastic - viscous damping
	- f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
	-
	- // Make sure the viscous damping doesn't exceed the elastic component,
	- // i.e. the damping factor doesn't exceed the critical damping, 2sqrt(mk_n)
	- if (dot(f_n, n_ab) < 0.0f)
	- f_n = make_float3(0.0f, 0.0f, 0.0f);
	-
	- const float f_n_length = length(f_n);
	-
	- // Add max. capillary force
	- f_c = -devC_kappa * sqrtf(radius_a * radius_b) * n_ab;
	-
	- // Initialize force vectors to zero
	- f_t = make_float3(0.0f, 0.0f, 0.0f);
	- T_res = make_float3(0.0f, 0.0f, 0.0f);
	-
	- // Apply a tangential force if the previous tangential displacement
	- // is non-zero, or the current sliding velocity is non-zero.
	- if (delta_t0_length > 0.f \|\| vel_t_ab_length > 0.f) {
	-
	- // Shear force: Visco-Elastic, limited by Coulomb friction
	- float3 f_t_elast = -1.0f * devC_k_s * delta_t0;
	- float3 f_t_visc = -1.0f * devC_gamma_s * vel_t_ab;
	-
	- float f_t_limit;
	-
	- if (vel_t_ab_length > 0.001f) { // Dynamic friciton
	- f_t_limit = devC_mu_d * length(f_n-f_c);
	- } else { // Static friction
	- f_t_limit = devC_mu_s * length(f_n-f_c);
	- }
	-
	- // Tangential force before friction limit correction
	- f_t = f_t_elast + f_t_visc;
	- float f_t_length = length(f_t);
	-
	- // If failure criterion is not met, contact is viscous-linear elastic.
	- // If failure criterion is met, contact force is limited,
	- // resulting in a slip and energy dissipation
	- if (f_t_length > f_t_limit) { // Dynamic case
	-
	- // Frictional force is reduced to equal the limit
	- f_t *= f_t_limit/f_t_length;
	-
	- // A slip event zeros the displacement vector
	- //delta_t = make_float3(0.0f, 0.0f, 0.0f);
	-
	- // In a slip event, the tangential spring is adjusted to a
	- // length which is consistent with Coulomb's equation
	- // (Hinrichsen and Wolf, 2004)
	- delta_t = -1.0f/devC_k_s * f_t + devC_gamma_s * vel_t_ab;
	-
	- // Shear friction heat production rate:
	- // The energy lost from the tangential spring is dissipated as heat
	- //*es_dot += -dot(vel_t_ab, f_t);
	- es_dot += length(delta_t0 - delta_t) devC_k_s / devC_dt; // Seen in E…
	-
	- } else { // Static case
	-
	- // No correction of f_t is required
	-
	- // Add tangential displacement to total tangential displacement
	- delta_t = delta_t0 + vel_t_ab * devC_dt;
	- }
	- }
	-
	- if (angvel_ab_length > 0.f) {
	- // Apply rolling resistance (Zhou et al. 1999)
	- //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
	-
	- // New rolling resistance model
	- T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
	- devC_mu_r * R_bar * f_n_length)
	- * angvel_ab/angvel_ab_length;
	- }
	-
	- // Add force components from this collision to total force for particle
	- *N += f_n + f_t + f_c;
	- T += -R_bar cross(n_ab, f_t) + T_res;
	-
	- // Pressure excerted onto the particle from this contact
	- p += f_n_length / (4.0f PI * radius_a*radius_a);
	-
	- // Store sum of tangential displacements
	- dev_delta_t[mempos] = make_float4(delta_t, 0.0f);
	-
	-} // End of contactLinear()
	-
	-
	-// Linear-elastic bond: Attractive force with normal- and shear components
	-// acting upon particle A in a bonded particle pair
	-__device__ void bondLinear(float3* N, float3* T, float* es_dot, float* p,
	- unsigned int idx_a, unsigned int idx_b,
	- float4* dev_x_sorted, float4* dev_vel_sorted,
	- float4* dev_angvel_sorted,
	- float radius_a, float radius_b,
	- float3 x_ab, float x_ab_length,
	- float delta_ab)
	-{
	-
	- // If particles are not overlapping, apply bond force
	- if (delta_ab > 0.0f) {
	-
	- // Allocate variables and fetch missing time=t values for particle A and B
	- float4 vel_a = dev_vel_sorted[idx_a];
	- float4 vel_b = dev_vel_sorted[idx_b];
	- float4 angvel4_a = dev_angvel_sorted[idx_a];
	- float4 angvel4_b = dev_angvel_sorted[idx_b];
	-
	- // Convert to float3's
	- float3 angvel_a = make_float3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
	- float3 angvel_b = make_float3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
	-
	- // Normal vector of contact
	- float3 n_ab = x_ab/x_ab_length;
	-
	- // Relative contact interface velocity, w/o rolling
	- float3 vel_ab_linear = make_float3(vel_a.x - vel_b.x,
	- vel_a.y - vel_b.y,
	- vel_a.z - vel_b.z);
	-
	- // Relative contact interface velocity of particle surfaces at
	- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
	- float3 vel_ab = vel_ab_linear
	- + radius_a * cross(n_ab, angvel_a)
	- + radius_b * cross(n_ab, angvel_b);
	-
	- // Relative contact interface rolling velocity
	- //float3 angvel_ab = angvel_a - angvel_b;
	- //float angvel_ab_length = length(angvel_ab);
	-
	- // Normal component of the relative contact interface velocity
	- //float vel_n_ab = dot(vel_ab_linear, n_ab);
	-
	- // Tangential component of the relative contact interface velocity
	- // Hinrichsen and Wolf 2004, eq. 13.9
	- float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
	- //float vel_t_ab_length = length(vel_t_ab);
	-
	- float3 f_n = make_float3(0.0f, 0.0f, 0.0f);
	- float3 f_t = make_float3(0.0f, 0.0f, 0.0f);
	-
	- // Mean radius
	- float R_bar = (radius_a + radius_b)/2.0f;
	-
	- // Normal force component: Elastic
	- f_n = devC_k_n * delta_ab * n_ab;
	-
	- if (length(vel_t_ab) > 0.f) {
	- // Shear force component: Viscous
	- f_t = -1.0f * devC_gamma_s * vel_t_ab;
	-
	- // Shear friction production rate [W]
	- //*es_dot += -dot(vel_t_ab, f_t);
	- }
	-
	- // Add force components from this bond to total force for particle
	- *N += f_n + f_t;
	- T += -R_bar cross(n_ab, f_t);
	-
	- // Pressure excerted onto the particle from this bond
	- p += length(f_n) / (4.0f PI * radius_a*radius_a);
	-
	- }
	-} // End of bondLinear()
	-
	-
	-// Capillary cohesion after Richefeu et al. (2006)
	-__device__ void capillaryCohesion_exp(float3* N, float radius_a,
	- float radius_b, float delta_ab,
	- float3 x_ab, float x_ab_length,
	- float kappa)
	-{
	-
	- // Normal vector
	- float3 n_ab = x_ab/x_ab_length;
	-
	- float3 f_c;
	- float lambda, R_geo, R_har, r, h;
	-
	- // Determine the ratio; r = max{Ri/Rj;Rj/Ri}
	- if ((radius_a/radius_b) > (radius_b/radius_a))
	- r = radius_a/radius_b;
	- else
	- r = radius_b/radius_a;
	-
	- // Exponential decay function
	- h = -sqrtf(r);
	-
	- // The harmonic mean
	- R_har = (2.0f * radius_a * radius_b) / (radius_a + radius_b);
	-
	- // The geometrical mean
	- R_geo = sqrtf(radius_a * radius_b);
	-
	- // The exponential falloff of the capillary force with distance
	- lambda = 0.9f * h * sqrtf(devC_V_b/R_har);
	-
	- // Calculate cohesional force
	- f_c = -kappa * R_geo * expf(-delta_ab/lambda) * n_ab;
	-
	- // Add force components from this collision to total force for particle
	- *N += f_c;
	-
	-} // End of capillaryCohesion_exp
	-
	-
	-
	-// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
	-// Kernel executed on device, and callable from device only.
	-__device__ void overlapsInCell(int3 targetCell,
	- unsigned int idx_a,
	- float4 x_a, float radius_a,
	- float3* N, float3* T,
	- float* es_dot, float* p,
	- float4* dev_x_sorted,
	- float* dev_radius_sorted,
	- float4* dev_vel_sorted,
	- float4* dev_angvel_sorted,
	- unsigned int* dev_cellStart,
	- unsigned int* dev_cellEnd,
	- float4* dev_w_nx,
	- float4* dev_w_mvfd)
	- //uint4 bonds)
	-{
	-
	- // Variable containing modifier for interparticle
	- // vector, if it crosses a periodic boundary
	- float3 distmod = make_float3(0.0f, 0.0f, 0.0f);
	-
	- // Check whether x- and y boundaries are to be treated as periodic
	- // 1: x- and y boundaries periodic
	- // 2: x boundaries periodic
	- if (devC_periodic == 1) {
	-
	- // Periodic x-boundary
	- if (targetCell.x < 0) {
	- targetCell.x = devC_num[0] - 1;
	- distmod += make_float3(devC_L[0], 0.0f, 0.0f);
	- }
	- if (targetCell.x == devC_num[0]) {
	- targetCell.x = 0;
	- distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
	- }
	-
	- // Periodic y-boundary
	- if (targetCell.y < 0) {
	- targetCell.y = devC_num[1] - 1;
	- distmod += make_float3(0.0f, devC_L[1], 0.0f);
	- }
	- if (targetCell.y == devC_num[1]) {
	- targetCell.y = 0;
	- distmod -= make_float3(0.0f, devC_L[1], 0.0f);
	- }
	-
	- } else if (devC_periodic == 2) {
	-
	- // Periodic x-boundary
	- if (targetCell.x < 0) {
	- targetCell.x = devC_num[0] - 1;
	- distmod += make_float3(devC_L[0], 0.0f, 0.0f);
	- }
	- if (targetCell.x == devC_num[0]) {
	- targetCell.x = 0;
	- distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
	- }
	-
	- // Hande out-of grid cases on y-axis
	- if (targetCell.y < 0 \|\| targetCell.y == devC_num[1])
	- return;
	-
	- } else {
	-
	- // Hande out-of grid cases on x- and y-axes
	- if (targetCell.x < 0 \|\| targetCell.x == devC_num[0])
	- return;
	- if (targetCell.y < 0 \|\| targetCell.y == devC_num[1])
	- return;
	- }
	-
	- // Handle out-of-grid cases on z-axis
	- if (targetCell.z < 0 \|\| targetCell.z == devC_num[2])
	- return;
	-
	-
	- //// Check and process particle-particle collisions
	-
	- // Calculate linear cell ID
	- unsigned int cellID = targetCell.x
	- + __umul24(targetCell.y, devC_num[0])
	- + __umul24(__umul24(devC_num[0],
	- devC_num[1]),
	- targetCell.z);
	-
	- // Lowest particle index in cell
	- unsigned int startIdx = dev_cellStart[cellID];
	-
	- // Make sure cell is not empty
	- if (startIdx != 0xffffffff) {
	-
	- // Highest particle index in cell + 1
	- unsigned int endIdx = dev_cellEnd[cellID];
	-
	- // Iterate over cell particles
	- for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
	- if (idx_b != idx_a) { // Do not collide particle with itself
	-
	- // Fetch position and velocity of particle B.
	- float4 x_b = dev_x_sorted[idx_b];
	- float radius_b = dev_radius_sorted[idx_b];
	- float kappa = devC_kappa;
	-
	- // Distance between particle centers (float4 -> float3)
	- float3 x_ab = make_float3(x_a.x - x_b.x,
	- x_a.y - x_b.y,
	- x_a.z - x_b.z);
	-
	- // Adjust interparticle vector if periodic boundary/boundaries
	- // are crossed
	- x_ab += distmod;
	-
	- float x_ab_length = length(x_ab);
	-
	- // Distance between particle perimeters
	- float delta_ab = x_ab_length - (radius_a + radius_b);
	-
	- // Check for particle overlap
	- if (delta_ab < 0.0f) {
	- contactLinearViscous(N, T, es_dot, p,
	- idx_a, idx_b,
	- dev_vel_sorted,
	- dev_angvel_sorted,
	- radius_a, radius_b,
	- x_ab, x_ab_length,
	- delta_ab, kappa);
	- } else if (delta_ab < devC_db) {
	- // Check wether particle distance satisfies the capillary bond dista…
	- capillaryCohesion_exp(N, radius_a, radius_b, delta_ab,
	- x_ab, x_ab_length, kappa);
	- }
	-
	- // Check wether particles are bonded together
	- /*if (bonds.x == idx_b \|\| bonds.y == idx_b \|\|
	- bonds.z == idx_b \|\| bonds.w == idx_b) {
	- bondLinear(N, T, es_dot, p,
	- idx_a, idx_b,
	- dev_x_sorted, dev_vel_sorted,
	- dev_angvel_sorted,
	- radius_a, radius_b,
	- x_ab, x_ab_length,
	- delta_ab);
	- }*/
	-
	- } // Do not collide particle with itself end
	- } // Iterate over cell particles end
	- } // Check wether cell is empty end
	-} // End of overlapsInCell(...)
	-
	-// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
	-// Write the indexes of the overlaps in array contacts[]
	-// Kernel executed on device, and callable from device only.
	-__device__ void findContactsInCell(int3 targetCell,
	- unsigned int idx_a,
	- float4 x_a, float radius_a,
	- float4* dev_x_sorted,
	- float* dev_radius_sorted,
	- unsigned int* dev_cellStart,
	- unsigned int* dev_cellEnd,
	- unsigned int* dev_gridParticleIndex,
	- int* nc,
	- unsigned int* dev_contacts,
	- float4* dev_distmod)
	-{
	- // Variable containing modifier for interparticle
	- // vector, if it crosses a periodic boundary
	- float3 distmod = make_float3(0.0f, 0.0f, 0.0f);
	-
	- // Check whether x- and y boundaries are to be treated as periodic
	- // 1: x- and y boundaries periodic
	- // 2: x boundaries periodic
	- if (devC_periodic == 1) {
	-
	- // Periodic x-boundary
	- if (targetCell.x < 0) {
	- targetCell.x = devC_num[0] - 1;
	- distmod += make_float3(devC_L[0], 0.0f, 0.0f);
	- }
	- if (targetCell.x == devC_num[0]) {
	- targetCell.x = 0;
	- distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
	- }
	-
	- // Periodic y-boundary
	- if (targetCell.y < 0) {
	- targetCell.y = devC_num[1] - 1;
	- distmod += make_float3(0.0f, devC_L[1], 0.0f);
	- }
	- if (targetCell.y == devC_num[1]) {
	- targetCell.y = 0;
	- distmod -= make_float3(0.0f, devC_L[1], 0.0f);
	- }
	-
	- } else if (devC_periodic == 2) {
	-
	- // Periodic x-boundary
	- if (targetCell.x < 0) {
	- targetCell.x = devC_num[0] - 1;
	- distmod += make_float3(devC_L[0], 0.0f, 0.0f);
	- }
	- if (targetCell.x == devC_num[0]) {
	- targetCell.x = 0;
	- distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
	- }
	-
	- // Hande out-of grid cases on y-axis
	- if (targetCell.y < 0 \|\| targetCell.y == devC_num[1])
	- return;
	-
	- } else {
	-
	- // Hande out-of grid cases on x- and y-axes
	- if (targetCell.x < 0 \|\| targetCell.x == devC_num[0])
	- return;
	- if (targetCell.y < 0 \|\| targetCell.y == devC_num[1])
	- return;
	- }
	-
	- // Handle out-of-grid cases on z-axis
	- if (targetCell.z < 0 \|\| targetCell.z == devC_num[2])
	- return;
	-
	-
	- //// Check and process particle-particle collisions
	-
	- // Calculate linear cell ID
	- unsigned int cellID = targetCell.x
	- + __umul24(targetCell.y, devC_num[0])
	- + __umul24(__umul24(devC_num[0],
	- devC_num[1]),
	- targetCell.z);
	-
	- // Lowest particle index in cell
	- unsigned int startIdx = dev_cellStart[cellID];
	-
	- // Make sure cell is not empty
	- if (startIdx != 0xffffffff) {
	-
	- // Highest particle index in cell + 1
	- unsigned int endIdx = dev_cellEnd[cellID];
	-
	- // Read the original index of particle A
	- unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
	-
	- // Iterate over cell particles
	- for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
	- if (idx_b != idx_a) { // Do not collide particle with itself
	-
	- // Fetch position and radius of particle B.
	- float4 x_b = dev_x_sorted[idx_b];
	- float radius_b = dev_radius_sorted[idx_b];
	-
	- // Read the original index of particle B
	- unsigned int idx_b_orig = dev_gridParticleIndex[idx_b];
	-
	- __syncthreads();
	-
	- // Distance between particle centers (float4 -> float3)
	- float3 x_ab = make_float3(x_a.x - x_b.x,
	- x_a.y - x_b.y,
	- x_a.z - x_b.z);
	-
	- // Adjust interparticle vector if periodic boundary/boundaries
	- // are crossed
	- x_ab += distmod;
	-
	- float x_ab_length = length(x_ab);
	-
	- // Distance between particle perimeters
	- float delta_ab = x_ab_length - (radius_a + radius_b);
	-
	- // Check for particle overlap
	- if (delta_ab < 0.0f) {
	-
	- // If the particle is not yet registered in the contact list,
	- // use the next position in the array
	- int cpos = *nc;
	- unsigned int cidx;
	-
	- // Find out, if particle is already registered in contact list
	- for (int i=0; i<devC_nc; ++i) {
	- __syncthreads();
	- cidx = dev_contacts[(unsigned int)(idx_a_orig*devC_nc+i)];
	- if (cidx == idx_b_orig)
	- cpos = i;
	- }
	-
	- __syncthreads();
	-
	- // Write the particle index to the relevant position,
	- // no matter if it already is there or not (concurrency of write)
	- dev_contacts[(unsigned int)(idx_a_orig*devC_nc+cpos)] = idx_b_orig;
	-
	-
	- // Write the interparticle vector and radius of particle B
	- //dev_x_ab_r_b[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_float4…
	- dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_float4(d…
	-
	- // Increment contact counter
	- ++*nc;
	- }
	-
	- // Write the inter-particle position vector correction and radius of p…
	- //dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_float4(d…
	-
	- // Check wether particles are bonded together
	- /*if (bonds.x == idx_b \|\| bonds.y == idx_b \|\|
	- bonds.z == idx_b \|\| bonds.w == idx_b) {
	- bondLinear(N, T, es_dot, p,
	- idx_a, idx_b,
	- dev_x_sorted, dev_vel_sorted,
	- dev_angvel_sorted,
	- radius_a, radius_b,
	- x_ab, x_ab_length,
	- delta_ab);
	- }*/
	-
	- } // Do not collide particle with itself end
	- } // Iterate over cell particles end
	- } // Check wether cell is empty end
	-} // End of findContactsInCell(...)
	-
	-
	-// For a single particle:
	-// Search for neighbors to particle 'idx' inside the 27 closest cells,
	-// and save the contact pairs in global memory.
	-__global__ void topology(unsigned int* dev_cellStart,
	- unsigned int* dev_cellEnd, // Input: Particles in…
	- unsigned int* dev_gridParticleIndex, // Input: Unsort…
	- float4* dev_x_sorted, float* dev_radius_sorted,
	- unsigned int* dev_contacts,
	- float4* dev_distmod)
	-{
	- // Thread index equals index of particle A
	- unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	- if (idx_a < devC_np) {
	- // Fetch particle data in global read
	- float4 x_a = dev_x_sorted[idx_a];
	- float radius_a = dev_radius_sorted[idx_a];
	-
	- // Count the number of contacts in this time step
	- int nc = 0;
	-
	- // Grid position of host and neighbor cells in uniform, cubic grid
	- int3 gridPos;
	- int3 targetPos;
	-
	- // Calculate cell address in grid from position of particle
	- gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
	- gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
	- gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
	-
	- // Find overlaps between particle no. idx and all particles
	- // from its own cell + 26 neighbor cells
	- for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
	- for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
	- for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
	- targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
	- findContactsInCell(targetPos, idx_a, x_a, radius_a,
	- dev_x_sorted, dev_radius_sorted,
	- dev_cellStart, dev_cellEnd,
	- dev_gridParticleIndex,
	- &nc, dev_contacts, dev_distmod);
	- }
	- }
	- }
	- }
	-} // End of topology(...)
	-
	-
	-// For a single particle:
	-// Search for neighbors to particle 'idx' inside the 27 closest cells,
	-// and compute the resulting normal- and shear force on it.
	-// Collide with top- and bottom walls, save resulting force on upper wall.
	-// Kernel is executed on device, and is callable from host only.
	-__global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsort…
	- unsigned int* dev_cellStart,
	- unsigned int* dev_cellEnd,
	- float4* dev_x, float* dev_radius,
	- float4* dev_x_sorted, float* dev_radius_sorted,
	- float4* dev_vel_sorted, float4* dev_angvel_sorted,
	- float4* dev_vel, float4* dev_angvel,
	- float4* dev_force, float4* dev_torque,
	- float* dev_es_dot, float* dev_es, float* dev_p,
	- float4* dev_w_nx, float4* dev_w_mvfd,
	- float* dev_w_force, //uint4* dev_bonds_sorted,
	- unsigned int* dev_contacts,
	- float4* dev_distmod,
	- float4* dev_delta_t)
	-{
	- // Thread index equals index of particle A
	- unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	-
	- if (idx_a < devC_np) {
	-
	- // Fetch particle data in global read
	- unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
	- float4 x_a = dev_x_sorted[idx_a];
	- float radius_a = dev_radius_sorted[idx_a];
	-
	- // Fetch wall data in global read
	- float4 w_up_nx = dev_w_nx[0];
	- float4 w_up_mvfd = dev_w_mvfd[0];
	-
	- // Fetch world dimensions in constant memory read
	- float3 origo = make_float3(devC_origo[0],
	- devC_origo[1],
	- devC_origo[2]);
	- float3 L = make_float3(devC_L[0],
	- devC_L[1],
	- devC_L[2]);
	-
	- // Index of particle which is bonded to particle A.
	- // The index is equal to the particle no (p.np)
	- // if particle A is bond-less.
	- //uint4 bonds = dev_bonds_sorted[idx_a];
	-
	- // Initiate shear friction loss rate at 0.0
	- float es_dot = 0.0f;
	-
	- // Initiate pressure on particle at 0.0
	- float p = 0.0f;
	-
	- // Allocate memory for temporal force/torque vector values
	- float3 N = make_float3(0.0f, 0.0f, 0.0f);
	- float3 T = make_float3(0.0f, 0.0f, 0.0f);
	-
	- // Apply linear elastic, frictional contact model to registered contacts
	- if (devC_shearmodel == 2) {
	- unsigned int idx_b_orig, mempos;
	- float delta_n, x_ab_length;
	- float4 x_b;
	- float4 distmod;
	- float radius_b;
	- float3 x_ab;
	- float4 vel_a = dev_vel_sorted[idx_a];
	- float4 angvel4_a = dev_angvel_sorted[idx_a];
	- float3 angvel_a = make_float3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
	-
	- // Loop over all possible contacts, and remove contacts
	- // that no longer are valid (delta_n > 0.0)
	- for (int i = 0; i<(int)devC_nc; ++i) {
	- mempos = (unsigned int)(idx_a_orig * devC_nc + i);
	- __syncthreads();
	- idx_b_orig = dev_contacts[mempos];
	- distmod = dev_distmod[mempos];
	- x_b = dev_x[idx_b_orig];
	- //radius_b = dev_radius[idx_b_orig];
	- radius_b = distmod.w;
	-
	- // Inter-particle vector, corrected for periodic boundaries
	- x_ab = make_float3(x_a.x - x_b.x + distmod.x,
	- x_a.y - x_b.y + distmod.y,
	- x_a.z - x_b.z + distmod.z);
	-
	- x_ab_length = length(x_ab);
	- delta_n = x_ab_length - (radius_a + radius_b);
	-
	-
	- if (idx_b_orig != (unsigned int)devC_np) {
	-
	- // Process collision if the particles are overlapping
	- if (delta_n < 0.0f) {
	- //cuPrintf("\nProcessing contact, idx_a_orig = %u, idx_b_orig = %u…
	- // idx_a_orig, idx_b_orig, i, delta_n);
	- contactLinear(&N, &T, &es_dot, &p,
	- idx_a_orig,
	- idx_b_orig,
	- vel_a,
	- dev_vel,
	- angvel_a,
	- dev_angvel,
	- radius_a, radius_b,
	- x_ab, x_ab_length,
	- delta_n, dev_delta_t,
	- mempos);
	- } else {
	- __syncthreads();
	- // Remove this contact (there is no particle with index=np)
	- dev_contacts[mempos] = devC_np;
	- // Zero sum of shear displacement in this position
	- dev_delta_t[mempos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
	- }
	- } else {
	- __syncthreads();
	- dev_delta_t[mempos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
	- }
	- } // Contact loop end
	-
	- } else if (devC_shearmodel == 1) {
	-
	- int3 gridPos;
	- int3 targetPos;
	-
	- // Calculate address in grid from position
	- gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
	- gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
	- gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
	-
	- // Find overlaps between particle no. idx and all particles
	- // from its own cell + 26 neighbor cells.
	- // Calculate resulting normal- and shear-force components and
	- // torque for the particle on the base of contactLinearViscous()
	- for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
	- for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
	- for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
	- targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
	- overlapsInCell(targetPos, idx_a, x_a, radius_a,
	- &N, &T, &es_dot, &p,
	- dev_x_sorted, dev_radius_sorted,
	- dev_vel_sorted, dev_angvel_sorted,
	- dev_cellStart, dev_cellEnd,
	- dev_w_nx, dev_w_mvfd);
	- }
	- }
	- }
	-
	- }
	-
	- //// Interact with walls
	- float delta_w; // Overlap distance
	- float3 w_n; // Wall surface normal
	- float w_force = 0.0f; // Force on wall from particle A
	-
	- // Upper wall (idx 0)
	- delta_w = w_up_nx.w - (x_a.z + radius_a);
	- w_n = make_float3(0.0f, 0.0f, -1.0f);
	- if (delta_w < 0.0f) {
	- w_force = contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	- dev_vel_sorted, dev_angvel_sorted,
	- w_n, delta_w, w_up_mvfd.y);
	- }
	-
	- // Lower wall (force on wall not stored)
	- delta_w = x_a.z - radius_a - origo.z;
	- w_n = make_float3(0.0f, 0.0f, 1.0f);
	- if (delta_w < 0.0f) {
	- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	- dev_vel_sorted, dev_angvel_sorted,
	- w_n, delta_w, 0.0f);
	- }
	-
	-
	- if (devC_periodic == 0) {
	-
	- // Left wall
	- delta_w = x_a.x - radius_a - origo.x;
	- w_n = make_float3(1.0f, 0.0f, 0.0f);
	- if (delta_w < 0.0f) {
	- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	- dev_vel_sorted, dev_angvel_sorted,
	- w_n, delta_w, 0.0f);
	- }
	-
	- // Right wall
	- delta_w = L.x - (x_a.x + radius_a);
	- w_n = make_float3(-1.0f, 0.0f, 0.0f);
	- if (delta_w < 0.0f) {
	- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	- dev_vel_sorted, dev_angvel_sorted,
	- w_n, delta_w, 0.0f);
	- }
	-
	- // Front wall
	- delta_w = x_a.y - radius_a - origo.y;
	- w_n = make_float3(0.0f, 1.0f, 0.0f);
	- if (delta_w < 0.0f) {
	- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	- dev_vel_sorted, dev_angvel_sorted,
	- w_n, delta_w, 0.0f);
	- }
	-
	- // Back wall
	- delta_w = L.y - (x_a.y + radius_a);
	- w_n = make_float3(0.0f, -1.0f, 0.0f);
	- if (delta_w < 0.0f) {
	- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	- dev_vel_sorted, dev_angvel_sorted,
	- w_n, delta_w, 0.0f);
	- }
	-
	- } else if (devC_periodic == 2) {
	-
	- // Front wall
	- delta_w = x_a.y - radius_a - origo.y;
	- w_n = make_float3(0.0f, 1.0f, 0.0f);
	- if (delta_w < 0.0f) {
	- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	- dev_vel_sorted, dev_angvel_sorted,
	- w_n, delta_w, 0.0f);
	- }
	-
	- // Back wall
	- delta_w = L.y - (x_a.y + radius_a);
	- w_n = make_float3(0.0f, -1.0f, 0.0f);
	- if (delta_w < 0.0f) {
	- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	- dev_vel_sorted, dev_angvel_sorted,
	- w_n, delta_w, 0.0f);
	- }
	- }
	-
	-
	- // Hold threads for coalesced write
	- __syncthreads();
	-
	- // Write force to unsorted position
	- unsigned int orig_idx = dev_gridParticleIndex[idx_a];
	- dev_force[orig_idx] = make_float4(N, 0.0f);
	- dev_torque[orig_idx] = make_float4(T, 0.0f);
	- dev_es_dot[orig_idx] = es_dot;
	- dev_es[orig_idx] += es_dot * devC_dt;
	- dev_p[orig_idx] = p;
	- dev_w_force[orig_idx] = w_force;
	- }
	-} // End of interact(...)
	-
	-
	-
	-
	-
	-// FUNCTION FOR UPDATING TRAJECTORIES
	-
	-// Second order integration scheme based on Taylor expansion of particle kinem…
	-// Kernel executed on device, and callable from host only.
	-__global__ void integrate(float4* dev_x_sorted, float4* dev_vel_sorted, // Inp…
	- float4* dev_angvel_sorted, float* dev_radius_sorted,…
	- float4* dev_x, float4* dev_vel, float4* dev_angvel, …
	- float4* dev_force, float4* dev_torque, // Input
	- unsigned int* dev_gridParticleIndex) // Input: Sorte…
	-{
	- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
	-
	- if (idx < devC_np) { // Condition prevents block size error
	-
	- // Copy data to temporary arrays to avoid any potential read-after-write,
	- // write-after-read, or write-after-write hazards.
	- unsigned int orig_idx = dev_gridParticleIndex[idx];
	- float4 force = dev_force[orig_idx];
	- float4 torque = dev_torque[orig_idx];
	-
	- // Initialize acceleration vectors to zero
	- float4 acc = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
	- float4 angacc = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
	-
	- // Fetch particle position and velocity values from global read
	- float4 x = dev_x_sorted[idx];
	- float4 vel = dev_vel_sorted[idx];
	- float4 angvel = dev_angvel_sorted[idx];
	- float radius = dev_radius_sorted[idx];
	-
	- // Coherent read from constant memory to registers
	- float dt = devC_dt;
	- float3 origo = make_float3(devC_origo[0], devC_origo[1], devC_origo[2]);
	- float3 L = make_float3(devC_L[0], devC_L[1], devC_L[2]);
	- float rho = devC_rho;
	-
	- // Particle mass
	- float m = 4.0f/3.0f * PI * radiusradiusradius * rho;
	-
	- // Update linear acceleration of particle
	- acc.x = force.x / m;
	- acc.y = force.y / m;
	- acc.z = force.z / m;
	-
	- // Update angular acceleration of particle
	- // (angacc = (total moment)/Intertia, intertia = 2/5mr^2)
	- angacc.x = torque.x * 1.0f / (2.0f/5.0f * m * radius*radius);
	- angacc.y = torque.y * 1.0f / (2.0f/5.0f * m * radius*radius);
	- angacc.z = torque.z * 1.0f / (2.0f/5.0f * m * radius*radius);
	-
	- // Add gravity
	- acc.x += devC_g[0];
	- acc.y += devC_g[1];
	- acc.z += devC_g[2];
	- //acc.z -= 9.82f;
	-
	- // Update angular velocity
	- angvel.x += angacc.x * dt;
	- angvel.y += angacc.y * dt;
	- angvel.z += angacc.z * dt;
	-
	- // Check if particle has a fixed horizontal velocity
	- if (vel.w > 0.0f) {
	-
	- // Zero horizontal acceleration and disable
	- // gravity to counteract segregation.
	- // Particles may move in the z-dimension,
	- // to allow for dilation.
	- acc.x = 0.0f;
	- acc.y = 0.0f;
	- acc.z -= devC_g[2];
	-
	- // Zero the angular acceleration and -velocity
	- angvel = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
	- }
	-
	- // Update linear velocity
	- vel.x += acc.x * dt;
	- vel.y += acc.y * dt;
	- vel.z += acc.z * dt;
	-
	- // Update position. First-order Euler's scheme:
	- //x.x += vel.x * dt;
	- //x.y += vel.y * dt;
	- //x.z += vel.z * dt;
	-
	- // Update position. Second-order scheme based on Taylor expansion
	- // (greater accuracy than the first-order Euler's scheme)
	- x.x += vel.x * dt + (acc.x * dt*dt)/2.0f;
	- x.y += vel.y * dt + (acc.y * dt*dt)/2.0f;
	- x.z += vel.z * dt + (acc.z * dt*dt)/2.0f;
	-
	-
	- // Add x-displacement for this time step to
	- // sum of x-displacements
	- x.w += vel.x * dt + (acc.x * dt*dt)/2.0f;
	-
	- // Move particle across boundary if it is periodic
	- if (devC_periodic == 1) {
	- if (x.x < origo.x)
	- x.x += L.x;
	- if (x.x > L.x)
	- x.x -= L.x;
	- if (x.y < origo.y)
	- x.y += L.y;
	- if (x.y > L.y)
	- x.y -= L.y;
	- } else if (devC_periodic == 2) {
	- if (x.x < origo.x)
	- x.x += L.x;
	- if (x.x > L.x)
	- x.x -= L.x;
	- }
	-
	- // Hold threads for coalesced write
	- __syncthreads();
	-
	- // Store data in global memory at original, pre-sort positions
	- dev_angvel[orig_idx] = angvel;
	- dev_vel[orig_idx] = vel;
	- dev_x[orig_idx] = x;
	- }
	-} // End of integrate(...)
	-
	-
	-// Reduce wall force contributions from particles to a single value per wall
	-__global__ void summation(float* in, float *out)
	-{
	- __shared__ float cache[256];
	- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
	- unsigned int cacheIdx = threadIdx.x;
	-
	- float temp = 0.0f;
	- while (idx < devC_np) {
	- temp += in[idx];
	- idx += blockDim.x * gridDim.x;
	- }
	-
	- // Set the cache values
	- cache[cacheIdx] = temp;
	-
	- __syncthreads();
	-
	- // For reductions, threadsPerBlock must be a power of two
	- // because of the following code
	- unsigned int i = blockDim.x/2;
	- while (i != 0) {
	- if (cacheIdx < i)
	- cache[cacheIdx] += cache[cacheIdx + i];
	- __syncthreads();
	- i /= 2;
	- }
	-
	- // Write sum for block to global memory
	- if (cacheIdx == 0)
	- out[blockIdx.x] = cache[0];
	-}
	-
	-// Update wall positions
	-__global__ void integrateWalls(float4* dev_w_nx,
	- float4* dev_w_mvfd,
	- float* dev_w_force_partial,
	- unsigned int blocksPerGrid)
	-{
	- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
	-
	- if (idx < devC_nw) { // Condition prevents block size error
	-
	- // Copy data to temporary arrays to avoid any potential read-after-write,
	- // write-after-read, or write-after-write hazards.
	- float4 w_nx = dev_w_nx[idx];
	- float4 w_mvfd = dev_w_mvfd[idx];
	-
	- // Find the final sum of forces on wall
	- w_mvfd.z = 0.0f;
	- for (int i=0; i<blocksPerGrid; ++i) {
	- w_mvfd.z += dev_w_force_partial[i];
	- }
	-
	- float dt = devC_dt;
	-
	- // Normal load = Deviatoric stress times wall surface area,
	- // directed downwards.
	- float N = -w_mvfd.wdevC_L[0]devC_L[1];
	-
	- // Calculate resulting acceleration of wall
	- // (Wall mass is stored in w component of position float4)
	- float acc = (w_mvfd.z+N)/w_mvfd.x;
	-
	- // Update linear velocity
	- w_mvfd.y += acc * dt;
	-
	- // Update position. Second-order scheme based on Taylor expansion
	- // (greater accuracy than the first-order Euler's scheme)
	- w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;
	-
	- // Store data in global memory
	- dev_w_nx[idx] = w_nx;
	- dev_w_mvfd[idx] = w_mvfd;
	- }
	-} // End of integrateWalls(...)
	-
	-
	+#include "sorting.cuh"
	+#include "contactmodels.cuh"
	+#include "cohesion.cuh"
	+#include "contactsearch.cuh"
	+#include "integration.cuh"


	// Wrapper function for initializing the CUDA components.
	t@@ -1542,6 +65,24 @@ __host__ void initializeGPU(void)
	checkForCudaErrors("After initializing CUDA device");
	}

	+// Start timer for kernel profiling
	+__host__ void startTimer(cudaEvent_t* kernel_tic)
	+{
	+ cudaEventRecord(*kernel_tic);
	+}
	+
	+// Stop timer for kernel profiling and time to function sum
	+__host__ void stopTimer(cudaEvent_t *kernel_tic,
	+ cudaEvent_t *kernel_toc,
	+ float *kernel_elapsed,
	+ double* sum)
	+{
	+ cudaEventRecord(*kernel_toc, 0);
	+ cudaEventSynchronize(*kernel_toc);
	+ cudaEventElapsedTime(kernel_elapsed, kernel_tic, kernel_toc);
	+ sum += kernel_elapsed;
	+}
	+
	// Copy selected constant components to constant device memory.
	//extern "C"
	__host__ void transferToConstantMemory(Particles* p,
	t@@ -1555,12 +96,12 @@ __host__ void transferToConstantMemory(Particles* p,

	cudaMemcpyToSymbol("devC_np", &p->np, sizeof(p->np));
	cudaMemcpyToSymbol("devC_nc", &NC, sizeof(char));
	- cudaMemcpyToSymbol("devC_origo", grid->origo, sizeof(float)*ND);
	- cudaMemcpyToSymbol("devC_L", grid->L, sizeof(float)*ND);
	+ cudaMemcpyToSymbol("devC_origo", grid->origo, sizeof(Float)*ND);
	+ cudaMemcpyToSymbol("devC_L", grid->L, sizeof(Float)*ND);
	cudaMemcpyToSymbol("devC_num", grid->num, sizeof(unsigned int)*ND);
	- cudaMemcpyToSymbol("devC_dt", &time->dt, sizeof(float));
	+ cudaMemcpyToSymbol("devC_dt", &time->dt, sizeof(Float));
	cudaMemcpyToSymbol("devC_global", &params->global, sizeof(int));
	- cudaMemcpyToSymbol("devC_g", params->g, sizeof(float)*ND);
	+ cudaMemcpyToSymbol("devC_g", params->g, sizeof(Float)*ND);
	cudaMemcpyToSymbol("devC_nw", &params->nw, sizeof(unsigned int));
	cudaMemcpyToSymbol("devC_periodic", &params->periodic, sizeof(int));

	t@@ -1578,19 +119,22 @@ __host__ void transferToConstantMemory(Particles* p,
	params->mu_d = p->mu_d[0];
	params->mu_r = p->mu_r[0];
	params->rho = p->rho[0];
	- cudaMemcpyToSymbol("devC_k_n", &params->k_n, sizeof(float));
	- cudaMemcpyToSymbol("devC_k_s", &params->k_s, sizeof(float));
	- cudaMemcpyToSymbol("devC_k_r", &params->k_r, sizeof(float));
	- cudaMemcpyToSymbol("devC_gamma_n", &params->gamma_n, sizeof(float));
	- cudaMemcpyToSymbol("devC_gamma_s", &params->gamma_s, sizeof(float));
	- cudaMemcpyToSymbol("devC_gamma_r", &params->gamma_r, sizeof(float));
	- cudaMemcpyToSymbol("devC_mu_s", &params->mu_s, sizeof(float));
	- cudaMemcpyToSymbol("devC_mu_d", &params->mu_d, sizeof(float));
	- cudaMemcpyToSymbol("devC_mu_r", &params->mu_r, sizeof(float));
	- cudaMemcpyToSymbol("devC_rho", &params->rho, sizeof(float));
	- cudaMemcpyToSymbol("devC_kappa", &params->kappa, sizeof(float));
	- cudaMemcpyToSymbol("devC_db", &params->db, sizeof(float));
	- cudaMemcpyToSymbol("devC_V_b", &params->V_b, sizeof(float));
	+ cudaMemcpyToSymbol("devC_k_n", &params->k_n, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_k_s", &params->k_s, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_k_r", &params->k_r, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_gamma_n", &params->gamma_n, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_gamma_s", &params->gamma_s, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_gamma_r", &params->gamma_r, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_gamma_wn", &params->gamma_wn, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_gamma_ws", &params->gamma_ws, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_gamma_wr", &params->gamma_wr, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_mu_s", &params->mu_s, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_mu_d", &params->mu_d, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_mu_r", &params->mu_r, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_rho", &params->rho, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_kappa", &params->kappa, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_db", &params->db, sizeof(Float));
	+ cudaMemcpyToSymbol("devC_V_b", &params->V_b, sizeof(Float));
	cudaMemcpyToSymbol("devC_shearmodel", &params->shearmodel, sizeof(unsigned…
	} else {
	//printf("(params.global == %d) ", params.global);
	t@@ -1609,20 +153,20 @@ __host__ void transferToConstantMemory(Particles* p,


	//extern "C"
	-__host__ void gpuMain(float4* host_x,
	- float4* host_vel,
	- float4* host_acc,
	- float4* host_angvel,
	- float4* host_angacc,
	- float4* host_force,
	- float4* host_torque,
	+__host__ void gpuMain(Float4* host_x,
	+ Float4* host_vel,
	+ Float4* host_acc,
	+ Float4* host_angvel,
	+ Float4* host_angacc,
	+ Float4* host_force,
	+ Float4* host_torque,
	uint4* host_bonds,
	Particles* p,
	Grid* grid,
	Time* time,
	Params* params,
	- float4* host_w_nx,
	- float4* host_w_mvfd,
	+ Float4* host_w_nx,
	+ Float4* host_w_mvfd,
	const char* cwd,
	const char* inputbin)
	{
	t@@ -1633,31 +177,31 @@ __host__ void gpuMain(float4* host_x,
	transferToConstantMemory(p, grid, time, params);

	// Declare pointers for particle variables on the device
	- float4* dev_x; // Particle position
	- float4* dev_vel; // Particle linear velocity
	- float4* dev_angvel; // Particle angular velocity
	- float4* dev_acc; // Particle linear acceleration
	- float4* dev_angacc; // Particle angular acceleration
	- float4* dev_force; // Sum of forces
	- float4* dev_torque; // Sum of torques
	- float* dev_radius; // Particle radius
	- float* dev_es_dot; // Current shear energy producion rate
	- float* dev_es; // Total shear energy excerted on particle
	- float* dev_p; // Pressure excerted onto particle
	+ Float4* dev_x; // Particle position
	+ Float4* dev_vel; // Particle linear velocity
	+ Float4* dev_angvel; // Particle angular velocity
	+ Float4* dev_acc; // Particle linear acceleration
	+ Float4* dev_angacc; // Particle angular acceleration
	+ Float4* dev_force; // Sum of forces
	+ Float4* dev_torque; // Sum of torques
	+ Float* dev_radius; // Particle radius
	+ Float* dev_es_dot; // Current shear energy producion rate
	+ Float* dev_es; // Total shear energy excerted on particle
	+ Float* dev_p; // Pressure excerted onto particle
	//uint4* dev_bonds; // Particle bond pairs

	// Declare pointers for wall vectors on the device
	- float4* dev_w_nx; // Wall normal (x,y,z) and position (w)
	- float4* dev_w_mvfd; // Wall mass (x), velocity (y), force (z)
	+ Float4* dev_w_nx; // Wall normal (x,y,z) and position (w)
	+ Float4* dev_w_mvfd; // Wall mass (x), velocity (y), force (z)
	// and deviatoric stress (w)
	- float* dev_w_force; // Resulting force on wall per particle
	- float* dev_w_force_partial; // Partial sum from block of threads
	+ Float* dev_w_force; // Resulting force on wall per particle
	+ Float* dev_w_force_partial; // Partial sum from block of threads

	// Memory for sorted particle data
	- float4* dev_x_sorted;
	- float4* dev_vel_sorted;
	- float4* dev_angvel_sorted;
	- float* dev_radius_sorted;
	+ Float4* dev_x_sorted;
	+ Float4* dev_vel_sorted;
	+ Float4* dev_angvel_sorted;
	+ Float* dev_radius_sorted;
	//uint4* dev_bonds_sorted;

	// Grid-particle array pointers
	t@@ -1669,35 +213,35 @@ __host__ void gpuMain(float4* host_x,
	// Particle contact bookkeeping
	unsigned int* dev_contacts;
	// Particle pair distance correction across periodic boundaries
	- float4* dev_distmod;
	+ Float4* dev_distmod;
	// x,y,z contains the interparticle vector, corrected if contact
	// is across a periodic boundary.
	- float4* dev_delta_t; // Accumulated shear distance of contact
	+ Float4* dev_delta_t; // Accumulated shear distance of contact

	// Particle memory size
	- unsigned int memSizef = sizeof(float) * p->np;
	- unsigned int memSizef4 = sizeof(float4) * p->np;
	+ unsigned int memSizeF = sizeof(Float) * p->np;
	+ unsigned int memSizeF4 = sizeof(Float4) * p->np;

	// Allocate device memory for particle variables,
	// tie to previously declared pointers
	cout << " Allocating device memory: ";

	// Particle arrays
	- cudaMalloc((void**)&dev_x, memSizef4);
	- cudaMalloc((void**)&dev_x_sorted, memSizef4);
	- cudaMalloc((void**)&dev_vel, memSizef4);
	- cudaMalloc((void**)&dev_vel_sorted, memSizef4);
	- cudaMalloc((void**)&dev_angvel, memSizef4);
	- cudaMalloc((void**)&dev_angvel_sorted, memSizef4);
	- cudaMalloc((void**)&dev_acc, memSizef4);
	- cudaMalloc((void**)&dev_angacc, memSizef4);
	- cudaMalloc((void**)&dev_force, memSizef4);
	- cudaMalloc((void**)&dev_torque, memSizef4);
	- cudaMalloc((void**)&dev_radius, memSizef);
	- cudaMalloc((void**)&dev_radius_sorted, memSizef);
	- cudaMalloc((void**)&dev_es_dot, memSizef);
	- cudaMalloc((void**)&dev_es, memSizef);
	- cudaMalloc((void**)&dev_p, memSizef);
	+ cudaMalloc((void**)&dev_x, memSizeF4);
	+ cudaMalloc((void**)&dev_x_sorted, memSizeF4);
	+ cudaMalloc((void**)&dev_vel, memSizeF4);
	+ cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
	+ cudaMalloc((void**)&dev_angvel, memSizeF4);
	+ cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
	+ cudaMalloc((void**)&dev_acc, memSizeF4);
	+ cudaMalloc((void**)&dev_angacc, memSizeF4);
	+ cudaMalloc((void**)&dev_force, memSizeF4);
	+ cudaMalloc((void**)&dev_torque, memSizeF4);
	+ cudaMalloc((void**)&dev_radius, memSizeF);
	+ cudaMalloc((void**)&dev_radius_sorted, memSizeF);
	+ cudaMalloc((void**)&dev_es_dot, memSizeF);
	+ cudaMalloc((void**)&dev_es, memSizeF);
	+ cudaMalloc((void**)&dev_p, memSizeF);
	//cudaMalloc((void*)&dev_bonds, sizeof(uint4) p->np);
	//cudaMalloc((void*)&dev_bonds_sorted, sizeof(uint4) p->np);

	t@@ -1709,13 +253,13 @@ __host__ void gpuMain(float4* host_x,

	// Particle contact bookkeeping arrays
	cudaMalloc((void*)&dev_contacts, sizeof(unsigned int)p->np*NC); // Max NC …
	- cudaMalloc((void*)&dev_distmod, sizeof(float4)p->np*NC);
	- cudaMalloc((void*)&dev_delta_t, sizeof(float4)p->np*NC);
	+ cudaMalloc((void*)&dev_distmod, sizeof(Float4)p->np*NC);
	+ cudaMalloc((void*)&dev_delta_t, sizeof(Float4)p->np*NC);

	// Wall arrays
	- cudaMalloc((void*)&dev_w_nx, sizeof(float)params->nw*4);
	- cudaMalloc((void*)&dev_w_mvfd, sizeof(float)params->nw*4);
	- cudaMalloc((void*)&dev_w_force, sizeof(float)params->nw*p->np);
	+ cudaMalloc((void*)&dev_w_nx, sizeof(Float)params->nw*4);
	+ cudaMalloc((void*)&dev_w_mvfd, sizeof(Float)params->nw*4);
	+ cudaMalloc((void*)&dev_w_force, sizeof(Float)params->nw*p->np);

	checkForCudaErrors("Post device memory allocation");
	cout << "Done\n";
	t@@ -1724,22 +268,22 @@ __host__ void gpuMain(float4* host_x,
	cout << " Transfering data to the device: ";

	// Particle data
	- cudaMemcpy(dev_x, host_x, memSizef4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_vel, host_vel, memSizef4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_acc, host_acc, memSizef4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_angvel, host_angvel, memSizef4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_angacc, host_angacc, memSizef4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_force, host_force, memSizef4, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_torque, host_torque, memSizef4, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_x, host_x, memSizeF4, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_vel, host_vel, memSizeF4, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_acc, host_acc, memSizeF4, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_angvel, host_angvel, memSizeF4, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_angacc, host_angacc, memSizeF4, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_force, host_force, memSizeF4, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_torque, host_torque, memSizeF4, cudaMemcpyHostToDevice);
	//cudaMemcpy(dev_bonds, host_bonds, sizeof(uint4) * p->np, cudaMemcpyHostToD…
	- cudaMemcpy(dev_radius, p->radius, memSizef, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_es_dot, p->es_dot, memSizef, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_es, p->es, memSizef, cudaMemcpyHostToDevice);
	- cudaMemcpy(dev_p, p->p, memSizef, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_radius, p->radius, memSizeF, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_es_dot, p->es_dot, memSizeF, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_es, p->es, memSizeF, cudaMemcpyHostToDevice);
	+ cudaMemcpy(dev_p, p->p, memSizeF, cudaMemcpyHostToDevice);

	// Wall data (wall mass and number in constant memory)
	- cudaMemcpy(dev_w_nx, host_w_nx, sizeof(float)params->nw4, cudaMemcpyHostTo…
	- cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(float)params->nw4, cudaMemcpyHo…
	+ cudaMemcpy(dev_w_nx, host_w_nx, sizeof(Float)params->nw4, cudaMemcpyHostTo…
	+ cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(Float)params->nw4, cudaMemcpyHo…

	// Initialize contacts lists to p.np
	unsigned int* npu = new unsigned int[p->np*NC];
	t@@ -1750,12 +294,12 @@ __host__ void gpuMain(float4* host_x,

	// Create array of 0.0 values on the host and transfer these to the distance
	// modifier and shear displacement arrays
	- float4* zerosf4 = new float4[p->np*NC];
	+ Float4* zerosF4 = new Float4[p->np*NC];
	for (unsigned int i=0; i<(p->np*NC); ++i)
	- zerosf4[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
	- cudaMemcpy(dev_distmod, zerosf4, sizeof(float4)p->npNC, cudaMemcpyHostToDe…
	- cudaMemcpy(dev_delta_t, zerosf4, sizeof(float4)p->npNC, cudaMemcpyHostToDe…
	- delete[] zerosf4;
	+ zerosF4[i] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
	+ cudaMemcpy(dev_distmod, zerosF4, sizeof(Float4)p->npNC, cudaMemcpyHostToDe…
	+ cudaMemcpy(dev_delta_t, zerosF4, sizeof(Float4)p->npNC, cudaMemcpyHostToDe…
	+ delete[] zerosF4;

	checkForCudaErrors("Post memcopy");
	cout << "Done\n";
	t@@ -1783,7 +327,7 @@ __host__ void gpuMain(float4* host_x,
	// Shared memory per block
	unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);

	- cudaMalloc((void*)&dev_w_force_partial, sizeof(float)dimGrid.x);
	+ cudaMalloc((void*)&dev_w_force_partial, sizeof(Float)dimGrid.x);

	// Report to stdout
	cout << "\n Device memory allocation and transfer complete.\n"
	t@@ -1821,9 +365,7 @@ __host__ void gpuMain(float4* host_x,
	cout << "\n Entering the main calculation time loop...\n\n"
	<< " IMPORTANT: Do not close this terminal, doing so will \n"
	<< " terminate this SPHERE process. Follow the \n"
	- << " progress in MATLAB using:\n"
	- << " >> status('" << inputbin << "')\n"
	- << " or in this directory by executing:\n"
	+ << " progress by executing:\n"
	<< " $ ./sphere_status " << inputbin << "\n\n";

	// Enable cuPrintf()
	t@@ -1835,6 +377,23 @@ __host__ void gpuMain(float4* host_x,
	cudaEventCreate(&dev_toc);
	cudaEventRecord(dev_tic, 0);

	+ // If profiling is enabled, initialize timers for each kernel
	+ cudaEvent_t kernel_tic, kernel_toc;
	+ float kernel_elapsed;
	+ double t_calcParticleCellID = 0.0;
	+ double t_thrustsort = 0.0;
	+ double t_reorderArrays = 0.0;
	+ double t_topology = 0.0;
	+ double t_interact = 0.0;
	+ double t_integrate = 0.0;
	+ double t_summation = 0.0;
	+ double t_integrateWalls = 0.0;
	+
	+ if (PROFILING == 1) {
	+ cudaEventCreate(&kernel_tic);
	+ cudaEventCreate(&kernel_toc);
	+ }
	+
	cout << " Current simulation time: " << time->current << " s.";


	t@@ -1855,20 +414,28 @@ __host__ void gpuMain(float4* host_x,
	// For each particle:
	// Compute hash key (cell index) from position
	// in the fine, uniform and homogenous grid.
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
	dev_gridParticleIndex,
	dev_x);

	// Synchronization point
	cudaThreadSynchronize();
	+ if (PROFILING == 1)
	+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_calcParticleCell…
	checkForCudaErrors("Post calcParticleCellID");


	// Sort particle (key, particle ID) pairs by hash key with Thrust radix so…
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID),
	thrust::device_ptr<uint>(dev_gridParticleCellID + p->n…
	thrust::device_ptr<uint>(dev_gridParticleIndex));
	cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implic…
	+ if (PROFILING == 1)
	+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
	checkForCudaErrors("Post calcParticleCellID");


	t@@ -1882,6 +449,8 @@ __host__ void gpuMain(float4* host_x,

	// Use sorted order to reorder particle arrays (position, velocities, radi…
	// coherent memory access. Save ordered configurations in new arrays (*_so…
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
	dev_cellEnd,
	dev_gridParticleCellID,
	t@@ -1897,6 +466,8 @@ __host__ void gpuMain(float4* host_x,

	// Synchronization point
	cudaThreadSynchronize();
	+ if (PROFILING == 1)
	+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_reorderArrays);
	checkForCudaErrors("Post reorderArrays", iter);

	// The contact search in topology() is only necessary for determining
	t@@ -1904,6 +475,8 @@ __host__ void gpuMain(float4* host_x,
	// shear force model
	if (params->shearmodel == 2) {
	// For each particle: Search contacts in neighbor cells
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	topology<<<dimGrid, dimBlock>>>(dev_cellStart,
	dev_cellEnd,
	dev_gridParticleIndex,
	t@@ -1918,11 +491,15 @@ __host__ void gpuMain(float4* host_x,

	// Synchronization point
	cudaThreadSynchronize();
	+ if (PROFILING == 1)
	+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_topology);
	checkForCudaErrors("Post topology: One or more particles moved outside t…
	}


	// For each particle: Process collisions and compute resulting forces.
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
	dev_cellStart,
	dev_cellEnd,
	t@@ -1944,23 +521,37 @@ __host__ void gpuMain(float4* host_x,

	// Synchronization point
	cudaThreadSynchronize();
	+ if (PROFILING == 1)
	+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_interact);
	checkForCudaErrors("Post interact - often caused if particles move outside…

	// Update particle kinematics
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	integrate<<<dimGrid, dimBlock>>>(dev_x_sorted, dev_vel_sorted,
	dev_angvel_sorted, dev_radius_sorted,
	dev_x, dev_vel, dev_angvel,
	dev_force, dev_torque,
	dev_gridParticleIndex);

	+ cudaThreadSynchronize();
	+ if (PROFILING == 1)
	+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_integrate);
	+
	// Summation of forces on wall
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	summation<<<dimGrid, dimBlock>>>(dev_w_force, dev_w_force_partial);

	// Synchronization point
	cudaThreadSynchronize();
	+ if (PROFILING == 1)
	+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_summation);
	checkForCudaErrors("Post integrate & wall force summation");

	// Update wall kinematics
	+ if (PROFILING == 1)
	+ startTimer(&kernel_tic);
	integrateWalls<<< 1, params->nw>>>(dev_w_nx,
	dev_w_mvfd,
	dev_w_force_partial,
	t@@ -1968,6 +559,8 @@ __host__ void gpuMain(float4* host_x,

	// Synchronization point
	cudaThreadSynchronize();
	+ if (PROFILING == 1)
	+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_integrateWalls);
	checkForCudaErrors("Post integrateWalls");


	t@@ -1976,7 +569,8 @@ __host__ void gpuMain(float4* host_x,
	filetimeclock += time->dt;

	// Report time to console
	- cout << "\r Current simulation time: " << time->current << " s. ";
	+ cout << "\r Current simulation time: "
	+ << time->current << " s. ";// << std::flush;


	// Produce output binary if the time interval
	t@@ -1990,22 +584,22 @@ __host__ void gpuMain(float4* host_x,
	//// Copy device data to host memory

	// Particle data
	- cudaMemcpy(host_x, dev_x, memSizef4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_vel, dev_vel, memSizef4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_acc, dev_acc, memSizef4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_angvel, dev_angvel, memSizef4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_force, dev_force, memSizef4, cudaMemcpyDeviceToHost);
	- cudaMemcpy(host_torque, dev_torque, memSizef4, cudaMemcpyDeviceToHost);
	+ cudaMemcpy(host_x, dev_x, memSizeF4, cudaMemcpyDeviceToHost);
	+ cudaMemcpy(host_vel, dev_vel, memSizeF4, cudaMemcpyDeviceToHost);
	+ cudaMemcpy(host_acc, dev_acc, memSizeF4, cudaMemcpyDeviceToHost);
	+ cudaMemcpy(host_angvel, dev_angvel, memSizeF4, cudaMemcpyDeviceToHost);
	+ cudaMemcpy(host_force, dev_force, memSizeF4, cudaMemcpyDeviceToHost);
	+ cudaMemcpy(host_torque, dev_torque, memSizeF4, cudaMemcpyDeviceToHost);
	//cudaMemcpy(host_bonds, dev_bonds, sizeof(uint4) * p->np, cudaMemcpyDev…
	- cudaMemcpy(p->es_dot, dev_es_dot, memSizef, cudaMemcpyDeviceToHost);
	- cudaMemcpy(p->es, dev_es, memSizef, cudaMemcpyDeviceToHost);
	- cudaMemcpy(p->p, dev_p, memSizef, cudaMemcpyDeviceToHost);
	+ cudaMemcpy(p->es_dot, dev_es_dot, memSizeF, cudaMemcpyDeviceToHost);
	+ cudaMemcpy(p->es, dev_es, memSizeF, cudaMemcpyDeviceToHost);
	+ cudaMemcpy(p->p, dev_p, memSizeF, cudaMemcpyDeviceToHost);

	// Wall data
	cudaMemcpy(host_w_nx, dev_w_nx,
	- sizeof(float)params->nw4, cudaMemcpyDeviceToHost);
	+ sizeof(Float)params->nw4, cudaMemcpyDeviceToHost);
	cudaMemcpy(host_w_mvfd, dev_w_mvfd,
	- sizeof(float)params->nw4, cudaMemcpyDeviceToHost);
	+ sizeof(Float)params->nw4, cudaMemcpyDeviceToHost);

	// Pause the CPU thread until all CUDA calls previously issued are compl…
	cudaThreadSynchronize();
	t@@ -2057,6 +651,23 @@ __host__ void gpuMain(float4* host_x,
	cudaEventDestroy(dev_tic);
	cudaEventDestroy(dev_toc);

	+ cudaEventDestroy(kernel_tic);
	+ cudaEventDestroy(kernel_toc);
	+
	+ // Report time spent on each kernel
	+ if (PROFILING == 1) {
	+ cout << "\nKernel profiling statistics:\n"
	+ << " - calcParticleCellID:\t" << t_calcParticleCellID/1000.0 << " s\…
	+ << " - thrustsort:\t\t" << t_thrustsort/1000.0 << " s\n"
	+ << " - reorderArrays:\t" << t_reorderArrays/1000.0 << " s\n"
	+ << " - topology:\t\t" << t_topology/1000.0 << " s\n"
	+ << " - interact:\t\t" << t_interact/1000.0 << " s\n"
	+ << " - integrate:\t\t" << t_integrate/1000.0 << " s\n"
	+ << " - summation:\t\t" << t_summation/1000.0 << " s\n"
	+ << " - integrateWalls:\t" << t_integrateWalls/1000.0 << " s\n";
	+ }
	+
	+
	// Free memory allocated to cudaPrintfInit
	//cudaPrintfEnd();

	diff --git a/src/file_io.cpp b/src/file_io.cpp
	t@@ -7,12 +7,22 @@


	// Write host variables to target binary file
	-int fwritebin(char target, Particles p, float4 host_x, float4 host_vel, fl…
	- float4 host_force, float4 host_torque,
	+// The output format should ALWAYS be double precision,
	+// so this function will typecast the data before write
	+// if it is single precision.
	+int fwritebin(char *target,
	+ Particles *p,
	+ Float4 *host_x,
	+ Float4 *host_vel,
	+ Float4 *host_angvel,
	+ Float4 *host_force,
	+ Float4 *host_torque,
	uint4 *host_bonds,
	- Grid grid, Time time,
	+ Grid *grid,
	+ Time *time,
	Params *params,
	- float4 host_w_nx, float4 host_w_mvfd)
	+ Float4 *host_w_nx,
	+ Float4 *host_w_mvfd)
	{

	FILE *fp;
	t@@ -24,110 +34,286 @@ int fwritebin(char target, Particles p, float4 *host_x…
	return 1; // Return unsuccessful exit status
	}

	- // World dimensions
	- fwrite(&grid->nd, sizeof(grid->nd), 1, fp);
	+ // If double precision: Values can be written directly
	+ if (sizeof(Float) == sizeof(double)) {

	- // Number of particles
	- fwrite(&p->np, sizeof(p->np), 1, fp);
	+ // World dimensions
	+ fwrite(&grid->nd, sizeof(grid->nd), 1, fp);

	- // Temporal parameters
	- fwrite(&time->dt, sizeof(time->dt), 1, fp);
	- fwrite(&time->current, sizeof(time->current), 1, fp);
	- fwrite(&time->total, sizeof(time->total), 1, fp);
	- fwrite(&time->file_dt, sizeof(time->file_dt), 1, fp);
	- fwrite(&time->step_count, sizeof(time->step_count), 1, fp);
	+ // Number of particles
	+ fwrite(&p->np, sizeof(p->np), 1, fp);

	- // World coordinate system origo
	- for (u=0; u<grid->nd; ++u) {
	- fwrite(&grid->origo[u], sizeof(grid->origo[u]), 1, fp);
	- }
	+ // Temporal parameters
	+ fwrite(&time->dt, sizeof(time->dt), 1, fp);
	+ fwrite(&time->current, sizeof(time->current), 1, fp);
	+ fwrite(&time->total, sizeof(time->total), 1, fp);
	+ fwrite(&time->file_dt, sizeof(time->file_dt), 1, fp);
	+ fwrite(&time->step_count, sizeof(time->step_count), 1, fp);

	- // World dimensions
	- for (u=0; u<grid->nd; ++u) {
	- fwrite(&grid->L[u], sizeof(grid->L[u]), 1, fp);
	- }
	+ // World coordinate system origo
	+ for (u=0; u<grid->nd; ++u) {
	+ fwrite(&grid->origo[u], sizeof(grid->origo[u]), 1, fp);
	+ }

	- // Grid cells along each dimension
	- for (u=0; u<grid->nd; ++u) {
	- fwrite(&grid->num[u], sizeof(grid->num[u]), 1, fp);
	- }
	+ // World dimensions
	+ for (u=0; u<grid->nd; ++u) {
	+ fwrite(&grid->L[u], sizeof(grid->L[u]), 1, fp);
	+ }

	- // Particle vectors
	- for (j=0; j<p->np; ++j) {
	- // x-axis
	- fwrite(&host_x[j].x, sizeof(float), 1, fp);
	- fwrite(&host_vel[j].x, sizeof(float), 1, fp);
	- fwrite(&host_angvel[j].x, sizeof(float), 1, fp);
	- fwrite(&host_force[j].x, sizeof(float), 1, fp);
	- fwrite(&host_torque[j].x, sizeof(float), 1, fp);
	-
	- // y-axis
	- fwrite(&host_x[j].y, sizeof(float), 1, fp);
	- fwrite(&host_vel[j].y, sizeof(float), 1, fp);
	- fwrite(&host_angvel[j].y, sizeof(float), 1, fp);
	- fwrite(&host_force[j].y, sizeof(float), 1, fp);
	- fwrite(&host_torque[j].y, sizeof(float), 1, fp);
	-
	- // z-axis
	- fwrite(&host_x[j].z, sizeof(float), 1, fp);
	- fwrite(&host_vel[j].z, sizeof(float), 1, fp);
	- fwrite(&host_angvel[j].z, sizeof(float), 1, fp);
	- fwrite(&host_force[j].z, sizeof(float), 1, fp);
	- fwrite(&host_torque[j].z, sizeof(float), 1, fp);
	- }
	-
	- // Individual particle values
	- for (j=0; j<p->np; ++j) {
	- fwrite(&host_vel[j].w, sizeof(float), 1, fp);
	- fwrite(&host_x[j].w, sizeof(float), 1, fp);
	- fwrite(&p->radius[j], sizeof(p->radius[j]), 1, fp);
	- fwrite(&p->rho[j], sizeof(p->rho[j]), 1, fp);
	- fwrite(&p->k_n[j], sizeof(p->k_n[j]), 1, fp);
	- fwrite(&p->k_s[j], sizeof(p->k_s[j]), 1, fp);
	- fwrite(&p->k_r[j], sizeof(p->k_r[j]), 1, fp);
	- fwrite(&p->gamma_n[j], sizeof(p->gamma_n[j]), 1, fp);
	- fwrite(&p->gamma_s[j], sizeof(p->gamma_s[j]), 1, fp);
	- fwrite(&p->gamma_r[j], sizeof(p->gamma_r[j]), 1, fp);
	- fwrite(&p->mu_s[j], sizeof(p->mu_s[j]), 1, fp);
	- fwrite(&p->mu_d[j], sizeof(p->mu_d[j]), 1, fp);
	- fwrite(&p->mu_r[j], sizeof(p->mu_r[j]), 1, fp);
	- fwrite(&p->es_dot[j], sizeof(p->es_dot[j]), 1, fp);
	- fwrite(&p->es[j], sizeof(p->es[j]), 1, fp);
	- fwrite(&p->p[j], sizeof(p->p[j]), 1, fp);
	- }
	+ // Grid cells along each dimension
	+ for (u=0; u<grid->nd; ++u) {
	+ fwrite(&grid->num[u], sizeof(grid->num[u]), 1, fp);
	+ }

	- // Singular parameters
	- fwrite(&params->global, sizeof(params->global), 1, fp);
	- for (u=0; u<grid->nd; ++u) {
	- fwrite(&params->g[u], sizeof(params->g[u]), 1, fp);
	- }
	- fwrite(&params->kappa, sizeof(params->kappa), 1, fp);
	- fwrite(&params->db, sizeof(params->db), 1, fp);
	- fwrite(&params->V_b, sizeof(params->V_b), 1, fp);
	- fwrite(&params->shearmodel, sizeof(params->shearmodel), 1, fp);
	-
	- // Walls
	- fwrite(&params->nw, sizeof(params->nw), 1, fp); // No. of walls
	- for (j=0; j<params->nw; ++j) {
	- // Wall normal
	- fwrite(&host_w_nx[j].x, sizeof(float), 1, fp);
	- fwrite(&host_w_nx[j].y, sizeof(float), 1, fp);
	- fwrite(&host_w_nx[j].z, sizeof(float), 1, fp);
	-
	- fwrite(&host_w_nx[j].w, sizeof(float), 1, fp); // Wall position
	- fwrite(&host_w_mvfd[j].x, sizeof(float), 1, fp); // Wall mass
	- fwrite(&host_w_mvfd[j].y, sizeof(float), 1, fp); // Wall velocity
	- fwrite(&host_w_mvfd[j].z, sizeof(float), 1, fp); // Wall force
	- fwrite(&host_w_mvfd[j].w, sizeof(float), 1, fp); // Wall deviatoric stress
	- }
	- fwrite(&params->periodic, sizeof(params->periodic), 1, fp);
	-
	- // Write bond pair values
	- for (j=0; j<p->np; ++j) {
	- fwrite(&host_bonds[j].x, sizeof(unsigned int), 1, fp);
	- fwrite(&host_bonds[j].y, sizeof(unsigned int), 1, fp);
	- fwrite(&host_bonds[j].z, sizeof(unsigned int), 1, fp);
	- fwrite(&host_bonds[j].w, sizeof(unsigned int), 1, fp);
	+ // Particle vectors
	+ for (j=0; j<p->np; ++j) {
	+ // x-axis
	+ fwrite(&host_x[j].x, sizeof(Float), 1, fp);
	+ fwrite(&host_vel[j].x, sizeof(Float), 1, fp);
	+ fwrite(&host_angvel[j].x, sizeof(Float), 1, fp);
	+ fwrite(&host_force[j].x, sizeof(Float), 1, fp);
	+ fwrite(&host_torque[j].x, sizeof(Float), 1, fp);
	+
	+ // y-axis
	+ fwrite(&host_x[j].y, sizeof(Float), 1, fp);
	+ fwrite(&host_vel[j].y, sizeof(Float), 1, fp);
	+ fwrite(&host_angvel[j].y, sizeof(Float), 1, fp);
	+ fwrite(&host_force[j].y, sizeof(Float), 1, fp);
	+ fwrite(&host_torque[j].y, sizeof(Float), 1, fp);
	+
	+ // z-axis
	+ fwrite(&host_x[j].z, sizeof(Float), 1, fp);
	+ fwrite(&host_vel[j].z, sizeof(Float), 1, fp);
	+ fwrite(&host_angvel[j].z, sizeof(Float), 1, fp);
	+ fwrite(&host_force[j].z, sizeof(Float), 1, fp);
	+ fwrite(&host_torque[j].z, sizeof(Float), 1, fp);
	+ }
	+
	+ // Individual particle values
	+ for (j=0; j<p->np; ++j) {
	+ fwrite(&host_vel[j].w, sizeof(Float), 1, fp);
	+ fwrite(&host_x[j].w, sizeof(Float), 1, fp);
	+ fwrite(&p->radius[j], sizeof(p->radius[j]), 1, fp);
	+ fwrite(&p->rho[j], sizeof(p->rho[j]), 1, fp);
	+ fwrite(&p->k_n[j], sizeof(p->k_n[j]), 1, fp);
	+ fwrite(&p->k_s[j], sizeof(p->k_s[j]), 1, fp);
	+ fwrite(&p->k_r[j], sizeof(p->k_r[j]), 1, fp);
	+ fwrite(&p->gamma_n[j], sizeof(p->gamma_n[j]), 1, fp);
	+ fwrite(&p->gamma_s[j], sizeof(p->gamma_s[j]), 1, fp);
	+ fwrite(&p->gamma_r[j], sizeof(p->gamma_r[j]), 1, fp);
	+ fwrite(&p->mu_s[j], sizeof(p->mu_s[j]), 1, fp);
	+ fwrite(&p->mu_d[j], sizeof(p->mu_d[j]), 1, fp);
	+ fwrite(&p->mu_r[j], sizeof(p->mu_r[j]), 1, fp);
	+ fwrite(&p->es_dot[j], sizeof(p->es_dot[j]), 1, fp);
	+ fwrite(&p->es[j], sizeof(p->es[j]), 1, fp);
	+ fwrite(&p->p[j], sizeof(p->p[j]), 1, fp);
	+ }
	+
	+ // Singular parameters
	+ fwrite(&params->global, sizeof(params->global), 1, fp);
	+ for (u=0; u<grid->nd; ++u) {
	+ fwrite(&params->g[u], sizeof(params->g[u]), 1, fp);
	+ }
	+ fwrite(&params->kappa, sizeof(params->kappa), 1, fp);
	+ fwrite(&params->db, sizeof(params->db), 1, fp);
	+ fwrite(&params->V_b, sizeof(params->V_b), 1, fp);
	+ fwrite(&params->shearmodel, sizeof(params->shearmodel), 1, fp);
	+
	+ // Walls
	+ fwrite(&params->nw, sizeof(params->nw), 1, fp); // No. of walls
	+ for (j=0; j<params->nw; ++j) {
	+ // Wall normal
	+ fwrite(&host_w_nx[j].x, sizeof(Float), 1, fp);
	+ fwrite(&host_w_nx[j].y, sizeof(Float), 1, fp);
	+ fwrite(&host_w_nx[j].z, sizeof(Float), 1, fp);
	+
	+ fwrite(&host_w_nx[j].w, sizeof(Float), 1, fp); // Wall position
	+ fwrite(&host_w_mvfd[j].x, sizeof(Float), 1, fp); // Wall mass
	+ fwrite(&host_w_mvfd[j].y, sizeof(Float), 1, fp); // Wall velocity
	+ fwrite(&host_w_mvfd[j].z, sizeof(Float), 1, fp); // Wall force
	+ fwrite(&host_w_mvfd[j].w, sizeof(Float), 1, fp); // Wall deviatoric stre…
	+ }
	+ fwrite(&params->periodic, sizeof(params->periodic), 1, fp);
	+ fwrite(&params->gamma_wn, sizeof(params->gamma_wn), 1, fp);
	+ fwrite(&params->gamma_ws, sizeof(params->gamma_ws), 1, fp);
	+ fwrite(&params->gamma_wr, sizeof(params->gamma_wr), 1, fp);
	+
	+
	+ // Write bond pair values
	+ for (j=0; j<p->np; ++j) {
	+ fwrite(&host_bonds[j].x, sizeof(unsigned int), 1, fp);
	+ fwrite(&host_bonds[j].y, sizeof(unsigned int), 1, fp);
	+ fwrite(&host_bonds[j].z, sizeof(unsigned int), 1, fp);
	+ fwrite(&host_bonds[j].w, sizeof(unsigned int), 1, fp);
	+ }
	+
	+ } else if (sizeof(Float) == sizeof(float)) {
	+ // Single precision: Type conversion required
	+
	+ double d; // Double precision placeholder
	+
	+ // World dimensions
	+ fwrite(&grid->nd, sizeof(grid->nd), 1, fp);
	+
	+ // Number of particles
	+ fwrite(&p->np, sizeof(p->np), 1, fp);
	+
	+ // Temporal parameters
	+ d = (double)time->dt;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ fwrite(&time->current, sizeof(time->current), 1, fp);
	+ fwrite(&time->total, sizeof(time->total), 1, fp);
	+ d = (double)time->file_dt;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ fwrite(&time->step_count, sizeof(time->step_count), 1, fp);
	+
	+ // World coordinate system origo
	+ for (u=0; u<grid->nd; ++u) {
	+ d = (double)grid->origo[u];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ }
	+
	+ // World dimensions
	+ for (u=0; u<grid->nd; ++u) {
	+ d = (double)grid->L[u];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ }
	+
	+ // Grid cells along each dimension
	+ for (u=0; u<grid->nd; ++u) {
	+ fwrite(&grid->num[u], sizeof(grid->num[u]), 1, fp);
	+ }
	+
	+ // Particle vectors
	+ for (j=0; j<p->np; ++j) {
	+ // x-axis
	+ d = (double)host_x[j].x;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_vel[j].x;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_angvel[j].x;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_force[j].x;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_torque[j].x;
	+ fwrite(&d, sizeof(d), 1, fp);
	+
	+ // y-axis
	+ d = (double)host_x[j].y;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_vel[j].y;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_angvel[j].y;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_force[j].y;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_torque[j].y;
	+ fwrite(&d, sizeof(d), 1, fp);
	+
	+ // z-axis
	+ d = (double)host_x[j].z;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_vel[j].z;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_angvel[j].z;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_force[j].z;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_torque[j].z;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ }
	+
	+ // Individual particle values
	+ for (j=0; j<p->np; ++j) {
	+ d = (double)host_vel[j].w;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_x[j].w;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->radius[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->rho[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->k_n[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->k_s[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->k_r[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->gamma_n[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->gamma_s[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->gamma_r[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->mu_s[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->mu_d[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->mu_r[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->es_dot[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->es[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)p->p[j];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ }
	+
	+ // Singular parameters
	+ fwrite(&params->global, sizeof(params->global), 1, fp);
	+ for (u=0; u<grid->nd; ++u) {
	+ d = (double)params->g[u];
	+ fwrite(&d, sizeof(d), 1, fp);
	+ }
	+ d = (double)params->kappa;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)params->db;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)params->V_b;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ fwrite(&params->shearmodel, sizeof(params->shearmodel), 1, fp);
	+
	+ // Walls
	+ fwrite(&params->nw, sizeof(params->nw), 1, fp); // No. of walls
	+ for (j=0; j<params->nw; ++j) {
	+ // Wall normal
	+ d = (double)host_w_nx[j].x;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_w_nx[j].y;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_w_nx[j].z;
	+ fwrite(&d, sizeof(d), 1, fp);
	+
	+ d = (double)host_w_nx[j].w; // Wall position
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_w_mvfd[j].x; // Wall mass
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_w_mvfd[j].y; // Wall velocity
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_w_mvfd[j].z; // Wall force
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)host_w_mvfd[j].w; // Wall deviatoric stress
	+ fwrite(&d, sizeof(d), 1, fp);
	+ }
	+ fwrite(&params->periodic, sizeof(params->periodic), 1, fp);
	+ d = (double)params->gamma_wn;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)params->gamma_ws;
	+ fwrite(&d, sizeof(d), 1, fp);
	+ d = (double)params->gamma_wr;
	+ fwrite(&d, sizeof(d), 1, fp);
	+
	+ // Write bond pair values
	+ for (j=0; j<p->np; ++j) {
	+ fwrite(&host_bonds[j].x, sizeof(unsigned int), 1, fp);
	+ fwrite(&host_bonds[j].y, sizeof(unsigned int), 1, fp);
	+ fwrite(&host_bonds[j].z, sizeof(unsigned int), 1, fp);
	+ fwrite(&host_bonds[j].w, sizeof(unsigned int), 1, fp);
	+ }
	+
	+ } else {
	+ fprintf(stderr, "Error: Chosen floating-point precision is incompatible wi…
	+ exit(1);
	}

	fclose(fp);
	diff --git a/src/main.cpp b/src/main.cpp
	t@@ -2,10 +2,10 @@
	/* SPHERE source code by Anders Damsgaard Christensen, 2010-12, */
	/* a 3D Discrete Element Method algorithm with CUDA GPU acceleration. */
	/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
	-
	+// Licence: Gnu Public License (GPL) v. 3. See license.txt.
	// See doc/sphere-doc.pdf for full documentation.
	// Compile with GNU make by typing 'make' in the src/ directory.
	-// SPHERE is called from the command line with e.g. './sphere_<architecture> p…
	+// SPHERE is called from the command line with './sphere_<architecture> projec…


	// Including library files
	t@@ -15,7 +15,7 @@
	#include <unistd.h> // UNIX only: For getcwd
	#include <string.h> // For strerror and strcmp
	#include <errno.h> // For errno
	-#include <math.h> // Basic floating point mathematical operations
	+#include <math.h> // Basic Floating point mathematical operations
	#include <time.h> // Time and date functions for timer

	// Including user files
	t@@ -139,41 +139,41 @@ int main(int argc, char *argv[])
	<< time.step_count << "\n";


	- // For spatial vectors an array of float4 vectors is chosen for best fit wit…
	+ // For spatial vectors an array of Float4 vectors is chosen for best fit wit…
	// GPU memory handling. Vector variable structure: ( x, y, z, <empty>).
	// Indexing starts from 0.

	// Allocate host arrays
	cout << "\n Allocating host memory: ";
	- grid.origo = new float[ND]; // Coordinate system origo
	- grid.L = new float[ND]; // Model world dimensions
	+ grid.origo = new Float[ND]; // Coordinate system origo
	+ grid.L = new Float[ND]; // Model world dimensions
	grid.num = new unsigned int[ND]; // Number of cells in each dimension
	- params.g = new float[ND]; // Gravitational acceleration vector
	- p.radius = new float[p.np]; // Particle radii
	- p.rho = new float[p.np]; // Particle densities
	- p.m = new float[p.np]; // Particle masses
	- p.I = new float[p.np]; // Particle moment of inertia
	- p.k_n = new float[p.np]; // Particle normal stiffnesses
	- p.k_s = new float[p.np]; // Particle shear stiffness…
	- p.k_r = new float[p.np]; // Particle rolling stiffne…
	- p.gamma_n = new float[p.np]; // Particle normal viscosity
	- p.gamma_s = new float[p.np]; // Particle shear viscosity
	- p.gamma_r = new float[p.np]; // Particle rolling viscosity
	- p.mu_s = new float[p.np]; // Inter-particle static shear contact fr…
	- p.mu_d = new float[p.np]; // Inter-particle dynamic shear co…
	- p.mu_r = new float[p.np]; // Inter-particle rolling contact …
	- p.es_dot = new float[p.np]; // Rate of shear energy dissipation
	- p.es = new float[p.np]; // Total shear energy dissipation
	- p.p = new float[p.np]; // Pressure excerted onto par…
	-
	- // Allocate float4 host arrays
	- float4 *host_x = new float4[p.np]; // Center coordinates for each part…
	- float4 *host_vel = new float4[p.np]; // Particle velocities (dotx = v)
	- float4 *host_acc = new float4[p.np]; // Particle accellerations (dotdotx…
	- float4 *host_angvel = new float4[p.np]; // Particle angular velocity vector…
	- float4 *host_angacc = new float4[p.np]; // Particle angular acceleration ve…
	- float4 *host_force = new float4[p.np]; // Particle summed force
	- float4 *host_torque = new float4[p.np]; // Particle summed torque
	+ params.g = new Float[ND]; // Gravitational acceleration vector
	+ p.radius = new Float[p.np]; // Particle radii
	+ p.rho = new Float[p.np]; // Particle densities
	+ p.m = new Float[p.np]; // Particle masses
	+ p.I = new Float[p.np]; // Particle moment of inertia
	+ p.k_n = new Float[p.np]; // Particle normal stiffnesses
	+ p.k_s = new Float[p.np]; // Particle shear stiffness…
	+ p.k_r = new Float[p.np]; // Particle rolling stiffne…
	+ p.gamma_n = new Float[p.np]; // Particle normal viscosity
	+ p.gamma_s = new Float[p.np]; // Particle shear viscosity
	+ p.gamma_r = new Float[p.np]; // Particle rolling viscosity
	+ p.mu_s = new Float[p.np]; // Inter-particle static shear contact fr…
	+ p.mu_d = new Float[p.np]; // Inter-particle dynamic shear co…
	+ p.mu_r = new Float[p.np]; // Inter-particle rolling contact …
	+ p.es_dot = new Float[p.np]; // Rate of shear energy dissipation
	+ p.es = new Float[p.np]; // Total shear energy dissipation
	+ p.p = new Float[p.np]; // Pressure excerted onto par…
	+
	+ // Allocate Float4 host arrays
	+ Float4 *host_x = new Float4[p.np]; // Center coordinates for each part…
	+ Float4 *host_vel = new Float4[p.np]; // Particle velocities (dotx = v)
	+ Float4 *host_acc = new Float4[p.np]; // Particle accellerations (dotdotx…
	+ Float4 *host_angvel = new Float4[p.np]; // Particle angular velocity vector…
	+ Float4 *host_angacc = new Float4[p.np]; // Particle angular acceleration ve…
	+ Float4 *host_force = new Float4[p.np]; // Particle summed force
	+ Float4 *host_torque = new Float4[p.np]; // Particle summed torque

	uint4 *host_bonds = new uint4[p.np]; // Bonds from particle [i] to two p…
	cout << "Done\n";
	t@@ -196,44 +196,44 @@ int main(int argc, char *argv[])
	}

	for (j=0; j<p.np; ++j) {
	- if (fread(&host_x[j].x, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_x[j].x, sizeof(Float), 1, fp) != 1)
	return 1; // Return unsuccessful exit status
	- if (fread(&host_vel[j].x, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_vel[j].x, sizeof(Float), 1, fp) != 1)
	return 1;
	- if (fread(&host_angvel[j].x, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_angvel[j].x, sizeof(Float), 1, fp) != 1)
	return 1;
	- if (fread(&host_force[j].x, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_force[j].x, sizeof(Float), 1, fp) != 1)
	return 1;
	- if (fread(&host_torque[j].x, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_torque[j].x, sizeof(Float), 1, fp) != 1)
	return 1;

	- if (fread(&host_x[j].y, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_x[j].y, sizeof(Float), 1, fp) != 1)
	return 1; // Return unsuccessful exit status
	- if (fread(&host_vel[j].y, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_vel[j].y, sizeof(Float), 1, fp) != 1)
	return 1;
	- if (fread(&host_angvel[j].y, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_angvel[j].y, sizeof(Float), 1, fp) != 1)
	return 1;
	- if (fread(&host_force[j].y, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_force[j].y, sizeof(Float), 1, fp) != 1)
	return 1;
	- if (fread(&host_torque[j].y, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_torque[j].y, sizeof(Float), 1, fp) != 1)
	return 1;

	- if (fread(&host_x[j].z, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_x[j].z, sizeof(Float), 1, fp) != 1)
	return 1; // Return unsuccessful exit status
	- if (fread(&host_vel[j].z, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_vel[j].z, sizeof(Float), 1, fp) != 1)
	return 1;
	- if (fread(&host_angvel[j].z, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_angvel[j].z, sizeof(Float), 1, fp) != 1)
	return 1;
	- if (fread(&host_force[j].z, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_force[j].z, sizeof(Float), 1, fp) != 1)
	return 1;
	- if (fread(&host_torque[j].z, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_torque[j].z, sizeof(Float), 1, fp) != 1)
	return 1;
	}

	for (j=0; j<p.np; ++j) {
	- if (fread(&host_vel[j].w, sizeof(float), 1, fp) != 1) // Fixvel
	+ if (fread(&host_vel[j].w, sizeof(Float), 1, fp) != 1) // Fixvel
	return 1; // Return unsuccessful exit status
	- if (fread(&host_x[j].w, sizeof(float), 1, fp) != 1) // xsum
	+ if (fread(&host_x[j].w, sizeof(Float), 1, fp) != 1) // xsum
	return 1;
	if (fread(&p.radius[j], sizeof(p.radius[j]), 1, fp) != 1)
	return 1;
	t@@ -288,39 +288,39 @@ int main(int argc, char *argv[])
	// Allocate host memory for walls
	// Wall normal (x,y,z), w: wall position on axis parallel to wall normal
	// Wall mass (x), velocity (y), force (z), and deviatoric stress (w)
	- float4 *host_w_nx = new float4[params.nw];
	- float4 *host_w_mvfd = new float4[params.nw];
	+ Float4 *host_w_nx = new Float4[params.nw];
	+ Float4 *host_w_mvfd = new Float4[params.nw];

	// Read wall data
	for (j=0; j<params.nw; ++j) {
	// Wall normal, x-dimension
	- if (fread(&host_w_nx[j].x, sizeof(float), 1, fp) != 1) // Read in single p…
	+ if (fread(&host_w_nx[j].x, sizeof(Float), 1, fp) != 1) // Read in single p…
	return 1;
	// Wall normal, y-dimension
	- if (fread(&host_w_nx[j].y, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_w_nx[j].y, sizeof(Float), 1, fp) != 1)
	return 1;
	// Wall normal, z-dimension
	- if (fread(&host_w_nx[j].z, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_w_nx[j].z, sizeof(Float), 1, fp) != 1)
	return 1;

	// Wall position along axis parallel to wall normal
	- if (fread(&host_w_nx[j].w, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_w_nx[j].w, sizeof(Float), 1, fp) != 1)
	return 1;

	// Wall mass
	- if (fread(&host_w_mvfd[j].x, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_w_mvfd[j].x, sizeof(Float), 1, fp) != 1)
	return 1;

	// Wall velocity along axis parallel to wall normal
	- if (fread(&host_w_mvfd[j].y, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_w_mvfd[j].y, sizeof(Float), 1, fp) != 1)
	return 1;

	// Wall force along axis parallel to wall normal
	- if (fread(&host_w_mvfd[j].z, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_w_mvfd[j].z, sizeof(Float), 1, fp) != 1)
	return 1;

	// Wall deviatoric stress
	- if (fread(&host_w_mvfd[j].w, sizeof(float), 1, fp) != 1)
	+ if (fread(&host_w_mvfd[j].w, sizeof(Float), 1, fp) != 1)
	return 1;
	}
	// x- and y boundary behavior.
	t@@ -330,6 +330,15 @@ int main(int argc, char *argv[])
	params.periodic = 0;
	}

	+ // Wall viscosities
	+ if (fread(&params.gamma_wn, sizeof(params.gamma_wn), 1, fp) != 1)
	+ return 1;
	+ if (fread(&params.gamma_ws, sizeof(params.gamma_ws), 1, fp) != 1)
	+ return 1;
	+ if (fread(&params.gamma_wr, sizeof(params.gamma_wr), 1, fp) != 1)
	+ return 1;
	+
	+
	for (i=0; i<p.np; ++i) {
	if (fread(&host_bonds[i].x, sizeof(unsigned int), 1, fp) != 1)
	return 1;