GopherProxy

	tMajor revision: Double precision, kernels split into separate sources - sphere…
	git clone git://src.adamsgaard.dk/sphere
	Log
	Files
	Refs
	LICENSE
	---
	commit cd41737e0b69a87e1dbca387031c53793b51ab56
	parent d818c179c8ce377aa14d4a4398ff6e6f9907fe75
	Author: Anders Damsgaard <[email protected]>
	Date: Mon, 20 Aug 2012 16:36:05 +0200

	Major revision: Double precision, kernels split into separate sources

	Diffstat:
	A img_out/about.txt \| 2 ++
	A input/1e3-test-shear.bin \| 0
	D mfiles/freadbin.m \| 118 -----------------------------…
	D mfiles/fwritebin.m \| 98 -----------------------------…
	D mfiles/initsetup.m \| 180 -----------------------------…
	D mfiles/plotspheres.m \| 99 -----------------------------…
	D mfiles/status.m \| 11 -----------
	D mfiles/useoutput.m \| 112 -----------------------------…
	D mfiles/visualize.m \| 379 -----------------------------…
	A python/elastic_collision.py \| 46 +++++++++++++++++++++++++++++…
	A python/inelastic_collision.py \| 49 +++++++++++++++++++++++++++++…
	A raytracer/colorbar.h \| 22 ++++++++++++++++++++++
	A raytracer/render_all_outputs_CPU.sh \| 27 +++++++++++++++++++++++++++
	A raytracer/render_all_outputs_GPU.sh \| 29 +++++++++++++++++++++++++++++
	A raytracer/rt-kernel.cu \| 442 +++++++++++++++++++++++++++++…
	A raytracer/rt_GPU_init_pres.sh \| 36 +++++++++++++++++++++++++++++…
	A raytracer/rt_GPU_pres.sh \| 36 +++++++++++++++++++++++++++++…
	A raytracer/rt_GPU_tall_pres.sh \| 36 +++++++++++++++++++++++++++++…
	A src/cohesion.cuh \| 126 +++++++++++++++++++++++++++++…
	A src/contactmodels.cuh \| 411 ++++++++++++++++++++++++++++++
	A src/contactsearch.cuh \| 625 +++++++++++++++++++++++++++++…
	A src/integration.cuh \| 207 ++++++++++++++++++++++++++++++
	A src/sorting.cuh \| 127 +++++++++++++++++++++++++++++…
	A src/vector_arithmetic.h \| 994 +++++++++++++++++++++++++++++…

	24 files changed, 3215 insertions(+), 997 deletions(-)
	---
	diff --git a/img_out/about.txt b/img_out/about.txt
	t@@ -0,0 +1,2 @@
	+This folder will contain images rendered from ../output binaries, and
	+graphs plotted with ../python/sphere.py.
	diff --git a/input/1e3-test-shear.bin b/input/1e3-test-shear.bin
	Binary files differ.
	diff --git a/mfiles/freadbin.m b/mfiles/freadbin.m
	t@@ -1,118 +0,0 @@
	-function [p, grids, time, params, walls] = freadbin(path, fn)
	- %path = '../output/'; %Target directory
	- %fn = 'output0.bin'; %Target binary
	-
	- %Open binary file for reading with little endian ordering and unicode char…
	- %fid = fopen([path fn],'rb','ieee-le','UTF-8');
	- fid = fopen([path fn], 'rb', 'ieee-le');
	-
	- % Read the number of dimensions and particles
	- grids.nd = fread(fid, 1, 'int');
	- p.np = fread(fid, 1, 'uint32');
	-
	- % Read the time variables
	- time.dt = fread(fid, 1, 'float');
	- time.current = fread(fid, 1, 'double');
	- time.total = fread(fid, 1, 'double');
	- time.file_dt = fread(fid, 1, 'float');
	- time.step_count = fread(fid, 1, 'uint32');
	-
	- %Initiate variables for faster execution
	- p.x = zeros(p.np, grids.nd); % Coordinate vector
	- p.vel = zeros(p.np, grids.nd); % Velocity vector
	- p.angvel = zeros(p.np, grids.nd); % Acceleration vector
	- p.force = zeros(p.np, grids.nd); % Linear force vector
	- p.torque = zeros(p.np, grids.nd); % Rotational torque vector
	- p.fixvel = zeros(p.np, 1); % Fix horizontal particle veloci…
	- p.xsum = zeros(p.np, 1); % Total particle displacement al…
	- p.radius = zeros(p.np, 1); % Particle radius
	- p.rho = zeros(p.np, 1); % Density
	- p.k_n = zeros(p.np, 1); % Normal stiffness
	- p.k_s = zeros(p.np, 1); % Shear stiffness
	- p.k_r = zeros(p.np, 1); % Rolling stiffness
	- p.gamma_s = zeros(p.np, 1); % Shear viscosity
	- p.gamma_r = zeros(p.np, 1); % Rolling viscosity
	- p.mu_s = zeros(p.np, 1); % Inter-particle contact static shear f…
	- p.mu_d = zeros(p.np, 1); % Inter-particle contact dynamic shear …
	- p.mu_r = zeros(p.np, 1); % Inter-particle contact rolling fricti…
	- p.C = zeros(p.np, 1); % Inter-particle cohesion
	- p.E = zeros(p.np, 1); % Young's modulus
	- p.K = zeros(p.np, 1); % Bulk modulus
	- p.nu = zeros(p.np, 1); % Poisson's ratio
	- p.es_dot = zeros(p.np, 1); % Current shear energy dissipation
	- p.es = zeros(p.np, 1); % Total shear energy dissipation
	- p.p = zeros(p.np, 1); % Pressure on particle
	- p.bonds = zeros(p.np, 2); % Inter-particle bonds
	-
	- % Read remaining data from MATLAB binary
	- grids.origo = fread(fid, [1, grids.nd], 'float');
	- grids.L = fread(fid, [1, grids.nd], 'float');
	- grids.num = fread(fid, [1, grids.nd], 'uint32');
	-
	- for j=1:p.np
	- %p.id = fread(fid, 1, 'uint32');
	-
	- for i=1:grids.nd %Coordinates, velocity, acceleration, pre-velocity
	- p.x(j,i) = fread(fid, 1, 'float');
	- p.vel(j,i) = fread(fid, 1, 'float');
	- p.angvel(j,i) = fread(fid, 1, 'float');
	- p.force(j,i) = fread(fid, 1, 'float');
	- p.torque(j,i) = fread(fid, 1, 'float');
	- end
	- end
	-
	- for j=1:p.np %Parameters with one value per particle
	- p.fixvel(j) = fread(fid, 1, 'float');
	- p.xsum(j) = fread(fid, 1, 'float');
	- p.radius(j) = fread(fid, 1, 'float');
	- p.rho(j) = fread(fid, 1, 'float');
	- p.k_n(j) = fread(fid, 1, 'float');
	- p.k_s(j) = fread(fid, 1, 'float');
	- p.k_r(j) = fread(fid, 1, 'float');
	- p.gamma_s(j) = fread(fid, 1, 'float');
	- p.gamma_r(j) = fread(fid, 1, 'float');
	- p.mu_s(j) = fread(fid, 1, 'float');
	- p.mu_d(j) = fread(fid, 1, 'float');
	- p.mu_r(j) = fread(fid, 1, 'float');
	- p.C(j) = fread(fid, 1, 'float');
	- p.E(j) = fread(fid, 1, 'float');
	- p.K(j) = fread(fid, 1, 'float');
	- p.nu(j) = fread(fid, 1, 'float');
	- p.es_dot(j) = fread(fid, 1, 'float');
	- p.es(j) = fread(fid, 1, 'float');
	- p.p(j) = fread(fid, 1, 'float');
	- end
	-
	- %Physical, constant parameters
	- %params.global = fread(fid, 1, 'ubit1');
	- params.global = fread(fid, 1, 'int');
	- params.g = fread(fid, [1, grids.nd], 'float');
	- params.kappa = fread(fid, 1, 'float');
	- params.db = fread(fid, 1, 'float');
	- params.V_b = fread(fid, 1, 'float');
	- params.shearmodel = fread(fid, 1, 'uint32');
	-
	- % Wall data
	- walls.nw = fread(fid, 1, 'uint32');
	- for j=1:walls.nw
	- for i=1:grids.nd
	- walls.n(j,i) = fread(fid, 1, 'float');
	- end
	- walls.x(j) = fread(fid, 1, 'float');
	- walls.m(j) = fread(fid, 1, 'float');
	- walls.vel(j) = fread(fid, 1, 'float');
	- walls.force(j) = fread(fid, 1, 'float');
	- walls.devs(j) = fread(fid, 1, 'float');
	- end
	- params.periodic = fread(fid, 1, 'int');
	-
	- for j=1:p.np
	- p.bonds(j,1) = fread(fid, 1, 'uint32');
	- p.bonds(j,2) = fread(fid, 1, 'uint32');
	- p.bonds(j,3) = fread(fid, 1, 'uint32');
	- p.bonds(j,4) = fread(fid, 1, 'uint32');
	- end
	-
	- fclose(fid);
	-
	-end
	diff --git a/mfiles/fwritebin.m b/mfiles/fwritebin.m
	t@@ -1,98 +0,0 @@
	-function fwritebin(path, fn, p, grids, time, params, walls)
	- %path = '../input/'; %Target directory
	- %fn = '3dtest.bin'; %Target binary
	-
	- %Open binary file for writing, little endian ordering, unicode char encodi…
	- %fid = fopen([path fn], 'wb','ieee-le','UTF-8');
	- fid = fopen([path fn], 'wb', 'ieee-le');
	-
	- %fwrite(fid, grids.nd, 'ushort'); %Unsigned short int
	- %fwrite(fid, p.np, 'ulong'); %Unsigned long int
	- fwrite(fid, grids.nd, 'int');
	- fwrite(fid, p.np, 'uint32');
	-
	- %Time parameters
	- fwrite(fid, time.dt, 'float');
	- fwrite(fid, time.current, 'double');
	- fwrite(fid, time.total, 'double');
	- fwrite(fid, time.file_dt, 'float');
	- fwrite(fid, time.step_count, 'uint32');
	-
	- for i=1:grids.nd %Grid origo
	- fwrite(fid, grids.origo(i), 'float');
	- end
	-
	- for i=1:grids.nd %Grid dimensions
	- fwrite(fid, grids.L(i), 'float');
	- end
	-
	- for i=1:grids.nd %Grid dimensions
	- fwrite(fid, grids.num(i), 'uint32');
	- end
	-
	- for j=1:p.np
	- for i=1:grids.nd %Coordinates, velocity, acceleration, pre-velocity
	- fwrite(fid, p.x(j,i), 'float');
	- fwrite(fid, p.vel(j,i), 'float');
	- fwrite(fid, p.angvel(j,i), 'float');
	- fwrite(fid, p.force(j,i), 'float');
	- fwrite(fid, p.torque(j,i), 'float');
	- end
	- end
	-
	- for j=1:p.np %Parameters with one value per particle
	- fwrite(fid, p.fixvel(j), 'float');
	- fwrite(fid, p.xsum(j), 'float');
	- fwrite(fid, p.radius(j), 'float');
	- fwrite(fid, p.rho(j), 'float');
	- fwrite(fid, p.k_n(j), 'float');
	- fwrite(fid, p.k_s(j), 'float');
	- fwrite(fid, p.k_r(j), 'float');
	- fwrite(fid, p.gamma_s(j), 'float');
	- fwrite(fid, p.gamma_r(j), 'float');
	- fwrite(fid, p.mu_s(j), 'float');
	- fwrite(fid, p.mu_d(j), 'float');
	- fwrite(fid, p.mu_r(j), 'float');
	- fwrite(fid, p.C(j), 'float');
	- fwrite(fid, p.E(j), 'float');
	- fwrite(fid, p.K(j), 'float');
	- fwrite(fid, p.nu(j), 'float');
	- fwrite(fid, p.es_dot(j), 'float');
	- fwrite(fid, p.es(j), 'float');
	- fwrite(fid, p.p(j), 'float');
	- end
	-
	- %Physical, constant parameters
	- %fwrite(fid, params.global, 'ubit1');
	- fwrite(fid, params.global, 'int');
	- for i=1:grids.nd
	- fwrite(fid, params.g(i), 'float');
	- end
	- fwrite(fid, params.kappa, 'float');
	- fwrite(fid, params.db, 'float');
	- fwrite(fid, params.V_b, 'float');
	- fwrite(fid, params.shearmodel, 'uint32');
	-
	- fwrite(fid, walls.nw, 'uint32');
	- for j=1:walls.nw
	- for i=1:grids.nd
	- fwrite(fid, walls.n(j,i), 'float'); % Wall normal
	- end
	- fwrite(fid, walls.x(j), 'float'); % Wall pos. on axis parallel to wall…
	- fwrite(fid, walls.m(j), 'float'); % Wall mass
	- fwrite(fid, walls.vel(j), 'float'); % Wall vel. on axis parallel to wa…
	- fwrite(fid, walls.force(j), 'float'); % Wall force on axis parallel to…
	- fwrite(fid, walls.devs(j), 'float'); % Deviatoric stress on wall normal
	- end
	- fwrite(fid, params.periodic, 'int');
	-
	- for j=1:p.np
	- fwrite(fid, p.bonds(i,1), 'uint32');
	- fwrite(fid, p.bonds(i,2), 'uint32');
	- fwrite(fid, p.bonds(i,3), 'uint32');
	- fwrite(fid, p.bonds(i,4), 'uint32');
	- end
	-
	- fclose(fid);
	-
	-end
	diff --git a/mfiles/initsetup.m b/mfiles/initsetup.m
	t@@ -1,180 +0,0 @@
	- function initsetup(plotfigure)
	-% initsetup() creates a model setup of particles in 3D with cubic
	-% packing. Specify PSD and desired number of particles, and the
	-% function will determine the model dimensions, and fill the space
	-% with a number of particles from a specified particle size distribution.
	-close all;
	-
	-% Simulation project name
	-simulation_name = '1e3-init-visc';
	-
	-% Plot particle assembly in MATLAB after initialization
	-if exist('plotfigure','var')
	- if plotfigure == 1
	- visualize = 1;
	- else
	- visualize = 0;
	- end
	-else
	- visualize = 0;
	-end
	-
	-% Physical, constant parameters
	-params.g = [0.0, 0.0, -9.80665]; %standard gravity, by definition 9.80665 m/s^2
	-%params.g = 0.98;
	-
	-% No. of dimensions
	-grids.nd = 3;
	-grids.origo = [0 0 0]; %Coordinate system origo
	-
	-% Number of particles
	-p.np = 1e3;
	-
	-% Create grid
	-form = 4; % For a cubic grid, use 3.
	- % For a higher grid, use 4 or more (for cubic end config. use 5).
	- % For a flatter grid, use 1 or 2.
	-grids.num(1) = ceil(nthroot(p.np, form)); % Particles in x direction
	-grids.num(2) = grids.num(1); % Particles in y direction
	-grids.num(3) = ceil(p.np/(grids.num(1)*grids.num(2))); % Particles in z direct…
	-
	-disp(['Grid dimensions: x=' num2str(grids.num(1)) ...
	- ', y=' num2str(grids.num(2)) ...
	- ', z=' num2str(grids.num(3))])
	-
	-%grids.num = ceil(nthroot(p.np,grids.nd)) * ones(grids.nd, 1); %Square/cubic
	-p.np = grids.num(1)grids.num(2)grids.num(3);
	-
	-% Particle size distribution
	-p.psd = 'logn'; %PSD: logn or uni
	-
	-p.m = 440e-6; %Mean size
	-p.v = p.m*0.00002; %Variance
	-
	-p.radius = zeros(p.np, 1);
	-p.x = zeros(p.np, grids.nd);
	-
	-p.bonds = ones(p.np, 4) * p.np; % No bonds when the values equal the no. of p…
	-
	-if strcmp(p.psd,'logn') %Log-normal PSD. Note: Keep variance small.
	- mu = log((p.m^2)/sqrt(p.v+p.m^2));
	- sigma = sqrt(log(p.v/(p.m^2)+1));
	- p.radius = lognrnd(mu,sigma,1,p.np); %Array of particle radii
	-elseif strcmp(p.psd,'uni') %Uniform PSD between rmin and rmax
	- rmin = p.m - p.v*1e5;
	- rmax = p.m + p.v*1e5;
	- %rmin = 0.1dd; rmax = 0.4dd;
	- p.radius = (rmax-rmin)*rand(p.np,1)+rmin; %Array of particle radii
	-end
	-
	-% Display PSD
	-if visualize == 1
	- figure;
	- hist(p.radius);
	- title(['PSD: ' num2str(p.np) ' particles, m = ' num2str(p.m) ' m']);
	-end
	-
	-% Friction angles
	-ang_s = 30; % Angle of static shear resistance
	-ang_d = 25; % Angle of dynamic shear resistance
	-ang_r = 35; % Angle of rolling resistance
	-
	-% Other particle parameters
	-p.vel = zeros(p.np, grids.nd); % Velocity vector
	-p.angvel = zeros(p.np, grids.nd); % Angular velocity
	-p.fixvel = zeros(p.np, 1); % 0: horizontal particle velocity free, 1…
	-p.xsum = zeros(p.np, 1); % Total displacement along x-axis
	-p.force = zeros(p.np, grids.nd); % Force vector
	-p.torque = zeros(p.np, grids.nd); % Torque vector
	-
	-params.global = 1; % 1: Physical parameters global, 0: individual per particle
	-p.rho = 3600*ones(p.np,1); % Density
	-p.k_n = 4e5*ones(p.np,1); % Normal stiffness [N/m]
	-p.k_s = p.k_n(:); % Shear stiffness [N/m]
	-p.k_r = p.k_s(:).*(10); % Rolling stiffness
	-params.shearmodel = 1; % Contact model. 1=frictional, viscou…
	-p.gamma_s = p.k_n./1e3; % Shear viscosity [Ns/m]
	-p.gamma_r = p.gamma_s(:); % Rolling viscosity [?]
	-p.mu_s = tand(ang_s)*ones(p.np,1); % Inter-particle shear contact frict…
	-p.mu_r = tand(ang_r)*ones(p.np,1); % Inter-particle rolling contact fri…
	-p.C = 0*zeros(p.np,1); % Inter-particle cohesion
	-p.E = 10e9*ones(p.np,1); % Young's modulus
	-p.K = 38e9*ones(p.np,1); % Bulk modulus
	-p.nu = 0.12.0sqrt(min(4/3pi.p.radius(:).p.radius(:).p.radius(:).*p.…
	-p.es_dot = zeros(p.np,1); % Current shear energy dissipation
	-p.es = zeros(p.np,1); % Total shear energy dissipation
	-p.p = zeros(p.np,1); % Pressure on particle
	-
	-% Parameters related to capillary bonds
	-% Disable capillary cohesion by setting kappa to zero.
	-enableCapillaryCohesion = 0;
	-theta = 0.0; % Wettability (0: perfect)
	-if (enableCapillaryCohesion == 1)
	- params.kappa = 2pip.gamma_s(1) * cos(theta); % Prefactor
	- params.V_b = 1e-12; % Liquid volume at bond
	-else
	- params.kappa = 0; % Zero capillary force
	- params.V_b = 0; % Zero liquid volume at bond
	-end
	-params.db = (1.0 + theta/2.0) * params.V_b^(1.0/3.0); % Debonding distance
	-
	-% Time parameters
	-%time.dt = 0.2min(sqrt((p.rho(:).p.radius(:).^2)./p.K(:))); % Comput…
	-%time.dt = time.dt*1e1; % Speedup factor
	-time.dt = 0.17sqrt(min(4/3pi.p.radius(:).p.radius(:).*p.radius(:).…
	-time.current = 0.0;
	-time.total = 1.500+2.0*time.dt; % Total simulation time [s]
	-time.file_dt = 0.0010; % Interval between output#.bin generation [s]
	-time.step_count = 0;
	-
	-% Calculate particle coordinates
	-% Grid unit length. Maximum particle diameter determines grid size
	-GU = 2max(p.radius)1.40; % Forty percent margin
	-grids.L = [GUgrids.num(1) GUgrids.num(2) GU*grids.num(3)];
	-
	-% Particle coordinates by filling grid.
	-x = linspace(GU/2, grids.L(1)-GU, grids.num(1));
	-y = linspace(GU/2, grids.L(2)-GU, grids.num(2));
	-z = linspace(GU/2, grids.L(3)-GU, grids.num(3));
	-
	-[X Y Z] = meshgrid(x,y,z);
	-X=X(:); Y=Y(:); Z=Z(:);
	-
	-p.x = [X Y Z];
	-
	-%Particle positions randomly modified by + 10 percent of a grid unit
	-p.x = p.x + (rand(p.np, grids.nd)0.49GU);
	-
	-% Walls with friction
	-% Note that the world boundaries already act as fricionless boundaries
	-% Upper wall
	-wno = 1;
	-walls.n(wno,:) = [0.0, 0.0, -1.0]; % Normal along negative z-axis
	-walls.x(wno) = grids.L(3); % Positioned at upper boundary
	-walls.m(wno) = p.rho(1)p.nppimax(p.radius)max(p.radius)*max(p.radius…
	-walls.vel(wno) = 0.0; % Wall velocity
	-walls.force(wno) = 0.0; % Wall force
	-walls.devs(wno) = 0.0; % Deviatoric stress
	-walls.nw = wno;
	-
	-% Define behavior of x- and y boundaries.
	-% Uncomment only one!
	-%params.periodic = 0; % Behave as frictional walls
	-params.periodic = 1; % Behave as periodic boundaries
	-%params.periodic = 2; % x: periodic, y: frictional walls
	-
	-% Write output binary
	-fwritebin('../input/', [simulation_name '.bin'], p, grids, time, params, walls…
	-disp('Writing of binary file complete.');
	-
	-% Plot particles in bubble plot
	-if visualize == 1
	- disp('Waiting for visualization.');
	- plotspheres(p, grids, 5, 0, 1);
	-end
	-
	-disp(['Project "' simulation_name '" is now ready for processing with SPHERE.'…
	-disp(['Call "' pwd '/sphere ' simulation_name '" from the system terminal ']);
	-disp(['to initiate, and check progress here in MATLAB using "status(''' simula…
	-
	-end
	diff --git a/mfiles/plotspheres.m b/mfiles/plotspheres.m
	t@@ -1,98 +0,0 @@
	-function plotspheres(p, grids, n, quiver, visible)
	-
	-if exist('visible','var')
	- if visible == 0
	- % Create a new invisible figure window
	- figure('visible','off');
	- else
	- % Create a new figure window
	- figure('Renderer','OpenGL')
	- end
	-else
	- % Create a new figure window
	- figure('Renderer','OpenGL')
	-end
	-
	-% Generate unit sphere, consisting of n-by-n faces (i.e. the resolution)
	-if exist('n', 'var')
	- [x,y,z] = sphere(n);
	-else
	- [x,y,z] = sphere(10);
	-end
	-
	-% Iterate through particle data
	-hold on
	-for i=1:p.np
	- spheresurf = surf(x*p.radius(i)+p.x(i,1), ...
	- y*p.radius(i)+p.x(i,2), ...
	- z*p.radius(i)+p.x(i,3));
	- set(spheresurf,'EdgeColor','none', ...
	- 'FaceColor',[0.96 0.64 0.38], ... % RGB
	- 'FaceLighting','phong', ...
	- 'AmbientStrength',0.3, ...
	- 'DiffuseStrength',0.8, ...
	- 'SpecularStrength',0.9, ...
	- 'SpecularExponent',25, ...
	- 'BackFaceLighting','lit');
	-end
	-camlight left;
	-hidden off
	-
	-% Optional quiver3 plot (velocity vectors)
	-if exist('quiver','var')
	- if quiver == 1
	- quiver3(p.x(:,1), p.x(:,2) , p.x(:,3), ...
	- p.vel(:,1), p.vel(:,2), p.vel(:,3), 3);
	- end
	-end
	-
	-% Draw walls from 8 vertices and connect these to 6 faces
	-vertices = [grids.origo(1) grids.origo(2) grids.origo(3); ... % 1
	- grids.origo(1) grids.L(2) grids.origo(3); ... % 2
	- grids.L(1) grids.L(2) grids.origo(3); ... % 3
	- grids.L(1) grids.origo(2) grids.origo(3); ... % 4
	- grids.origo(1) grids.origo(2) grids.L(3); ... % 5
	- grids.origo(1) grids.L(2) grids.L(3); ... % 6
	- grids.L(1) grids.L(2) grids.L(3); ... % 7
	- grids.L(1) grids.origo(2) grids.L(3)]; % 8
	-
	-faces = [1 2 3 4; ... % (observing along pos. y axis direction)
	- 2 6 7 3; ... %
	- 4 3 7 8; ... %
	- 1 5 8 4; ... %
	- 1 2 6 5; ... %
	- 5 6 7 8]; %
	-
	-patch('Faces', faces, 'Vertices', vertices, ...
	- 'FaceColor','none', 'EdgeColor','black','LineWidth',2);
	-
	-% View specifications
	-%daspect([1 1 1])
	-
	-view([grids.L(1), -2*grids.L(2), grids.L(3)])
	-grid on
	-axis equal
	-maxr = max(p.radius);
	-axis([grids.origo(1)-maxr grids.L(1)+maxr ...
	- grids.origo(2)-maxr grids.L(2)+maxr ...
	- grids.origo(3)-maxr grids.L(3)+maxr]);
	-
	-light('Position', [grids.L(1), -grids.L(2), grids.L(3)]);
	-material shiny; %shiny dull metal
	-camlight(45,45);
	-lighting gouraud; %flat gouraud phone none
	-
	-% Remove hidden lines from mesh plot (hidden on)
	-hidden off
	-
	-% Labels
	-title([num2str(p.np) ' particles']);
	-xlabel('x [m]');
	-ylabel('y [m]');
	-zlabel('z [m]');
	-
	-% Add delaunay triangulation
	-%dt = DelaunayTri(p.x(:,1), p.x(:,2), p.x(:,3));
	-%tetramesh(dt,'FaceAlpha',0.02);
	-
	-end
	-\ No newline at end of file
	diff --git a/mfiles/status.m b/mfiles/status.m
	t@@ -1,11 +0,0 @@
	-function lastfile = status(project)
	-
	-% status() writes status of current model run
	-
	-st = load(['../output/' project '.status.dat']);
	-
	-disp(['Status: time = ',num2str(st(1)),', ',num2str(st(2)),'% done, filenr = '…
	-
	-lastfile = st(3);
	-
	-end
	diff --git a/mfiles/useoutput.m b/mfiles/useoutput.m
	t@@ -1,112 +0,0 @@
	-%function useoutput(fn, plotfigure)
	-% Use particle data from target binary from output directory,
	-% create a fitting grid, and zero time variables.
	-% Final configuration is saved in a new filename in input/ directory.
	-
	-% Plot particle assembly in MATLAB after initialization
	-visualize = 0; % No
	-%visualize = 1; % Yes
	-
	-% Input binary file
	-directory = '../input/';
	-inputbin = '5e4-cons_10kPa.bin';
	-
	-% New configuration name
	-simulation_name = '5e4-shear_10kPa';
	-
	-% Read data
	-[p, grids, time, params, walls] = freadbin(directory, inputbin);
	-%[p, grids, time, params, walls] = freadbinold(directory, inputbin);
	-
	-% Change temporal parameters (comment out to use previous values"
	-time.current = 0.0; % New starting time
	-time.step_count = 0; % First output file number
	-time.total = 0.25+2*time.dt; % Total simulation time [s]
	-time.file_dt = 0.01; %Interval between output#.bin generation [s]
	-time.dt = 0.17sqrt(min(4/3pi.p.radius(:).p.radius(:).*p.radius(:).…
	-
	-% Particle stiffness
	-p.k_n = 4e5*ones(p.np,1); % Normal stiffness [N/m]
	-p.gamma_s = p.k_n./1e3; % Shear viscosity [Ns/m]
	-p.gamma_r = p.gamma_s(:); % Rolling viscosity
	-
	-% Friction angles
	-ang_s = 25; % Angle of shear resistance
	-ang_r = 35; % Angle of rolling resistance
	-p.mu_s = tand(ang_s)*ones(p.np,1); % Inter-particle shear contact friction coe…
	-p.mu_r = tand(ang_r)*ones(p.np,1); % Inter-particle rolling contact friction c…
	-
	-% Compute new grid, scaled to fit max.- & min. particle positions
	-GU = 2*max(p.radius); % Cell size
	-x_min = min(p.x(:,1));% - p.radius(:));
	-x_max = max(p.x(:,1));% + p.radius(:));%*3.0;
	-%y_min = min(p.x(:,2) - p.radius(:));
	-%y_max = max(p.x(:,2) + p.radius(:));
	-z_min = min(p.x(:,3) - p.radius(:));
	-z_max = max(p.x(:,3) + p.radius(:));
	-z_adjust = 1.2; % Specify overheightening of world to allow for shear dilatancy
	-%grids.num(1) = ceil((x_max-x_min)/GU);
	-%grids.num(2) = ceil((y_max-y_min)/GU);
	-grids.num(3) = ceil((z_max-z_min)*z_adjust/GU);
	-%grids.L = [(x_max-x_min) (y_max-y_min) (z_max-z_min)*z_adjust];
	-grids.L(3) = (z_max-z_min)*z_adjust;
	-
	-% Parameters related to capillary bonds
	-% Disable capillary cohesion by setting kappa to zero.
	-enableCapillaryCohesion = 0;
	-theta = 0.0; % Wettability (0: perfect)
	-if (enableCapillaryCohesion == 1)
	- params.kappa = 2pip.gamma_s(1) * cos(theta); % Prefactor
	- params.V_b = 1e-12; % Liquid volume at bond
	-else
	- params.kappa = 0; % Zero capillary force
	- params.V_b = 0; % Zero liquid volume at bond
	-end
	-params.db = (1.0 + theta/2.0) * params.V_b^(1.0/3.0); % Debonding distance
	-
	-% Move mobile upper wall to top of domain
	-walls.x(1) = max(p.x(:,3)+p.radius(:));
	-
	-% Define new deviatoric stress [Pa]
	-%walls.devs(1) = 0.0;
	-walls.devs(1) = 10.0e3;
	-
	-% Let x- and y boundaries be periodic
	-params.periodic = 1; % x- and y boundaries periodic
	-%params.periodic = 2; % Only x-boundaries periodic
	-
	-% By default, all particles are free to move horizontally
	-p.fixvel = zeros(p.np, 1); % Start off by defining all particles as free
	-shearing = 1; % 1: true, 0: false
	-
	-if shearing == 1
	- % Fix horizontal velocity to 0.0 of lowermost particles
	- I = find(p.x(:,3) < (z_max-z_min)*0.1); % Find the lower 10%
	- p.fixvel(I) = 1; % Fix horiz. velocity
	- p.vel(I,1) = 0.0; % x-value
	- p.vel(I,2) = 0.0; % y-value
	-
	- % Fix horizontal velocity to 0.0 of uppermost particles
	- I = find(p.x(:,3) > (z_max-z_min)*0.9); % Find the upper 10%
	- p.fixvel(I) = 1; % Fix horiz. velocity
	- p.vel(I,1) = (x_max-x_min)*1.0; % x-value: One grid length per second
	- p.vel(I,2) = 0.0; % y-value
	-end
	-
	-% Zero x-axis displacement
	-p.xsum = zeros(p.np, 1);
	-
	-% Write output binary
	-fwritebin('../input/', [simulation_name '.bin'], p, grids, time, params, walls…
	-disp('Writing of binary file complete.');
	-
	-% Plot particles in bubble plot
	-if visualize == 1
	- disp('Waiting for visualization.');
	- plotspheres(p, grids, 5, 0, 1);
	-end
	-
	-disp(['Project "' simulation_name '" is now ready for processing with SPHERE.'…
	-disp(['Call "' pwd '/sphere ' simulation_name '" from the system terminal ']);
	-disp(['to initiate, and check progress here in MATLAB using "status(''' simula…
	-
	diff --git a/mfiles/visualize.m b/mfiles/visualize.m
	t@@ -1,379 +0,0 @@
	-function visualize(project, method, file_nr)
	-
	- lastfile = status(project);
	-
	- if exist('file_nr','var')
	- filenr = file_nr;
	- else
	- filenr = lastfile;
	- end
	-
	- figure;
	-
	- % Plot sum of kinetic energy vs. time
	- if strcmpi(method, 'energy')
	-
	- disp('Energy')
	-
	- Epot = zeros(1,lastfile);
	- Ekin = zeros(1,lastfile);
	- Erot = zeros(1,lastfile);
	- Es = zeros(1,lastfile);
	- Es_dot = zeros(1,lastfile);
	- Esum = zeros(1,lastfile);
	-
	- % Load data from all output files
	- for i = 1:lastfile
	- fn = [project '.output' num2str(i) '.bin'];
	- disp(fn);
	- [p, ~, time, params, ~] = freadbin('../output/', fn);
	- for j = 1:p.np
	- r = p.radius(j);
	- m = (4/3pirrr*p.rho(j));
	- Epot(i) = Epot(i) + m * norm(params.g) * p.x(j,3);
	- Ekin(i) = Ekin(i) + 0.5 * m ...
	- * norm(p.vel(j,:)) * norm(p.vel(j,:));
	- Erot(i) = Erot(i) + 0.5 * (2/5 * m * r*r) ...
	- * norm(p.angvel(j,:)) * norm(p.angvel(j,:));
	- end
	- Es(i) = sum(p.es);
	- Es_dot(i) = sum(p.es_dot);
	-
	- Esum(i) = Epot(i) + Ekin(i) + Erot(i) + Es(i);
	- %Esum(i) = Epot(i) + Ekin(i) + Es(i);
	- end
	-
	- % Time axis
	- t = linspace(time.file_dt, time.current, length(Ekin));
	-
	- %disp(num2str(Ekin(:)));
	-
	- % Visualization, E_pot
	- subplot(2,3,1);
	- plot(t, Epot, '+-');
	- title('Potential energy');
	- xlabel('Time [s]');
	- ylabel('Total potential energy [J]');
	- box on;
	- grid on;
	-
	- % Visualization, E_kin
	- subplot(2,3,2);
	- plot(t, Ekin, '+-');
	- title('Kinetic energy');
	- xlabel('Time [s]');
	- ylabel('Total kinetic energy [J]');
	- box on;
	- grid on;
	-
	- % Visualization, E_rot
	- subplot(2,3,3);
	- plot(t, Erot, '+-');
	- title('Rotational energy');
	- xlabel('Time [s]');
	- ylabel('Total rotational energy [J]');
	- box on;
	- grid on;
	-
	- % Visualizaiton, E_s_dot (current shear energy)
	- subplot(2,3,4);
	- plot(t, Es_dot, '+-');
	- title('Shear energy rate');
	- xlabel('Time [s]');
	- ylabel('Shear energy rate [W]');
	- box on;
	- grid on;
	-
	- % Visualizaiton, E_s (total shear energy)
	- subplot(2,3,5);
	- plot(t, Es, '+-');
	- title('Total shear energy');
	- xlabel('Time [s]');
	- ylabel('Total shear energy [J]');
	- box on;
	- grid on;
	-
	- % Total energy, Esum
	- subplot(2,3,6);
	- plot(t, Esum, '+-');
	- title('Total energy: Pot + Kin + Rot + Shear');
	- xlabel('Time [s]');
	- ylabel('Total energy [J]');
	- box on;
	- grid on;
	-
	- end % End visualize energy
	-
	-
	- % Visualize wall kinematics and force
	- if strcmpi(method, 'walls')
	- disp('Walls')
	-
	- [~, ~, ~, ~, walls] = freadbin('../output/', [project '.output0.bin']);
	-
	- wforce = zeros(lastfile, walls.nw);
	- wvel = zeros(lastfile, walls.nw);
	- wpos = zeros(lastfile, walls.nw);
	- wdevs = zeros(lastfile, walls.nw);
	-
	- % Load data from all output files
	- for i = 1:lastfile
	- fn = [project '.output' num2str(i) '.bin'];
	- disp(fn);
	- [~, ~, time, ~, walls] = freadbin('../output/', fn);
	-
	- for j = 1:walls.nw
	- wforce(i,j) = walls.force(j);
	- wvel(i,j) = walls.vel(j);
	- wpos(i,j) = walls.x(j);
	- wdevs(i,j) = walls.devs(j);
	- end
	- end
	-
	- % Time axis
	- t = linspace(time.file_dt, time.current, lastfile);
	-
	- % Visualization, wall position
	- subplot(2,2,1);
	- hold on
	- for i=1:walls.nw
	- p = plot(t, wpos(:,i), '+-');
	- if i==2
	- set(p,'Color','red')
	- end
	- end
	- hold off
	- title('Wall positions');
	- xlabel('Time [s]');
	- ylabel('Position [m]');
	- box on;
	- grid on;
	-
	- % Visualization, wall force
	- subplot(2,2,2);
	- hold on
	- for i=1:walls.nw
	- p = plot(t, wforce(:,i), '+-');
	- if i==2
	- set(p,'Color','red')
	- end
	- end
	- hold off
	- title('Wall forces');
	- xlabel('Time [s]');
	- ylabel('Force [N]');
	- box on;
	- grid on;
	-
	- % Visualization, wall velocity
	- subplot(2,2,3);
	- hold on
	- for i=1:walls.nw
	- p = plot(t, wvel(:,i), '+-');
	- if i==2
	- set(p,'Color','red')
	- end
	- end
	- hold off
	- title('Wall velocities');
	- xlabel('Time [s]');
	- ylabel('Velocity [m/s]');
	- box on;
	- grid on;
	-
	- % Visualization, wall deviatoric stresses
	- subplot(2,2,4);
	- hold on
	- for i=1:walls.nw
	- p = plot(t, wdevs(:,i), '+-');
	- if i==2
	- set(p,'Color','red')
	- end
	- end
	- hold off
	- title('Wall deviatoric stresses');
	- xlabel('Time [s]');
	- ylabel('Stress [Pa]');
	- box on;
	- grid on;
	-
	- end % End visualize walls
	-
	-
	- % Visualize first output file with plotspheres
	- if strcmpi(method, 'first')
	- disp('Visualizing first output file')
	- fn = [project '.output0.bin'];
	- [p, grids, ~, ~] = freadbin('../output/', fn);
	- plotspheres(p, grids, 5, 0, 1);
	- end % End visualize last file
	-
	-
	- % Visualize last output file with plotspheres
	- if strcmpi(method, 'last')
	- disp('Visualizing last output file')
	- fn = [project '.output' num2str(lastfile) '.bin'];
	- [p, grids, ~, ~] = freadbin('../output/', fn);
	- plotspheres(p, grids, 5, 0, 1);
	- end % End visualize last file
	-
	-
	- % Visualize horizontal velocities (velocity profile)
	- if strcmpi(method, 'veloprofile')
	-
	- disp('Visualizing velocity profile');
	-
	- % Read data
	- fn = [project '.output' num2str(filenr) '.bin'];
	- disp(fn);
	- [p, ~, time, params, ~] = freadbin('../output/', fn);
	-
	- horiz_velo = zeros(2, p.np); % x,y velocities for each particle
	-
	- for j = 1:p.np
	- horiz_velo(1, j) = p.vel(j, 1); % x-velocity
	- horiz_velo(2, j) = p.vel(j, 2); % y-velocity
	- end
	-
	- % Find shear velocity (for scaling first axis)
	- fixidx = find(p.fixvel > 0.0);
	- shearvel = max(p.vel(fixidx,1));
	-
	- % Plot x- and y velocities vs. z position
	- hold on;
	- plot(horiz_velo(2,:), p.x(:,3), '.b'); % y-velocity (blue)
	- plot(horiz_velo(1,:), p.x(:,3), '.r'); % x-velocity (red)
	- axis([-0.5shearvel, shearvel1.5, min(p.x(:,3)), max(p.x(:,3))]);
	- title(['Velocity profile, t = ' num2str(time.current) ' s']);
	- xlabel('Horizontal velocity [m/s]');
	- ylabel('Vertical position [m]');
	- legend('y', 'x', 'Location', 'SouthEast');
	- box on;
	- grid on;
	- hold off;
	-
	- end % End visualize velocity profile
	-
	- if strcmpi(method, 'sheardisp')
	- disp('Visualizing shear displacement, 1D');
	-
	- % Read first datafile
	- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output0.bin']);
	-
	- % Read original z-position at t = 0 s.
	- zpos = p.x(:,3);
	-
	- % Read last datafile
	- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output' num2str(lastfil…
	-
	- % Plot
	- plot(p.xsum(:), zpos(:), 'o');
	- title(project);
	- xlabel('Shear displacement [m]');
	- ylabel('Initial vertical position [m]');
	- box on;
	- grid on;
	-
	- end % End visualize shear displacement
	-
	- if strcmpi(method, 'sheardisp2d')
	- disp('Visualizing shear displacement, 2D');
	-
	- % Read first datafile
	- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output0.bin']);
	-
	- % Read original z-position at t = 0 s.
	- zpos = p.x(:,3);
	-
	- % Read last datafile
	- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output' num2str(lastfil…
	-
	-
	-
	- % Plot
	- colormap(jet)
	- title(project);
	- xlabel('Shear displacement [m]');
	- ylabel('y [m]');
	- zlabel('z [m]');
	- box on;
	- grid on;
	- axis equal;
	- axis vis3d;
	- rotate3d on;
	-
	- end % End visualize shear displacement 2D
	-
	- if strcmpi(method, 'sheardisp3d')
	- disp('Visualizing shear displacement, 3D');
	-
	- % Read last datafile
	- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output' num2str(lastfil…
	-
	- % Plot
	- scatter3(p.xsum(:), p.x(:,2), p.x(:,3), 30, p.x(:,3), 'filled');
	- colormap(jet)
	- title(project);
	- xlabel('Shear displacement [m]');
	- ylabel('y [m]');
	- zlabel('z [m]');
	- box on;
	- grid on;
	- axis equal;
	- axis vis3d;
	- rotate3d on;
	-
	- end % End visualize shear displacement 3D
	-
	- % Visualize shear stresses
	- if strcmpi(method, 'shear')
	-
	- [p, grids, time, params, walls] = freadbin('../output/',[project '.output0…
	-
	- disp('Visualizing shear stress dynamics')
	- xdisp = zeros(1, lastfile+1); % Shear displacement
	- sigma = zeros(1, lastfile+1); % Effective normal stress
	- tau = zeros(1, lastfile+1); % Shear stress
	- dilation = zeros(1, lastfile+1); % Dilation
	-
	- % Calculate the shear velocity
	- fixedidx_all = find(p.fixvel(:) > 0.0); % All particles with a fixed horiz…
	- shearvel = max(p.vel(fixedidx_all,1));
	-
	- % Surface area
	- A = grids.L(1) * grids.L(2); % x-length * y-length
	-
	- % Load data from all output files
	- for i = 0:lastfile
	- fn = [project '.output' num2str(i) '.bin'];
	- disp(fn);
	- [p, ~, time, ~, walls] = freadbin('../output/', fn);
	-
	- xdisp(i+1) = time.current * shearvel;
	- sigma(i+1) = walls.force(1) / A;
	- tau(i+1) = shearstress(p, A);
	- dilation(i+1) = walls.x;
	- end
	-
	- % Plot stresses
	- subplot(2,1,1);
	- plot(xdisp, sigma, 'o-g', xdisp, tau, '+-r');
	- title('Stress dynamics');
	- xlabel('Shear distance [m]');
	- ylabel('Stress [Pa]');
	- legend('\sigma`','\tau');
	- box on;
	- grid on;
	-
	- % Plot dilation
	- subplot(2,1,2);
	- plot(xdisp, dilation, '+-b');
	- title('Dilation');
	- xlabel('Shear distance [m]');
	- ylabel('Upper wall pos. [m]');
	- box on;
	- grid on;
	-
	- end
	-
	-end % End function
	diff --git a/python/elastic_collision.py b/python/elastic_collision.py
	t@@ -0,0 +1,46 @@
	+#!/usr/bin/env python
	+import subprocess
	+import numpy
	+import matplotlib.pyplot as plt
	+
	+# Import sphere functionality
	+from sphere import *
	+
	+# New class
	+init = Spherebin(np = 2, nd = 3)
	+
	+#init.generateRadii(radius_mean = 0.5, radius_variance = 1e-15, histogram = 0)
	+init.radius[0] = numpy.ones(1, dtype=numpy.float64) * 0.5;
	+init.radius[1] = numpy.ones(1, dtype=numpy.float64) * 0.52;
	+
	+init.defaultparams()
	+
	+# The world should never be less than 3 cells in ANY direction, due to contact…
	+init.initsetup(gridnum = numpy.array([12,4,4]), periodic = 1, shearmodel = 2, …
	+
	+init.x[0] = numpy.array([1, 2, 2])
	+init.x[1] = numpy.array([6, 2, 2])
	+#init.x[2] = numpy.array([7, 2, 2])
	+#init.x[3] = numpy.array([8, 2, 2])
	+
	+init.nu = numpy.zeros(init.np, dtype=numpy.float64)
	+
	+#for i in range(init.np):
	+# init.x[i] = numpy.array([4+iinit.radius[i]2, 2, 2])
	+
	+init.vel[0] = numpy.array([10.0, 0.0, 0.0])
	+
	+
	+init.initTemporal(total = 1.0)
	+
	+init.writebin("../input/nc-test.bin")
	+#render("../input/nc-test.bin", out = "~/Desktop/nc-test")
	+
	+subprocess.call("cd ..; rm output/*; ./sphere_darwin_X86_64 nc-test", shell=Tr…
	+
	+visualize("nc-test", "energy", savefig=True, outformat='png')
	+#visualize("nc-test", "walls", savefig=True)
	+
	+subprocess.call("rm ../img_out/*; cd ../raytracer; ./render_all_outputs_GPU_cl…
	+
	+
	diff --git a/python/inelastic_collision.py b/python/inelastic_collision.py
	t@@ -0,0 +1,49 @@
	+#!/usr/bin/env python
	+import subprocess
	+import numpy
	+import matplotlib.pyplot as plt
	+
	+# Import sphere functionality
	+from sphere import *
	+
	+# New class
	+init = Spherebin(np = 2, nd = 3)
	+
	+init.radius = numpy.ones(init.np, dtype=numpy.float64) * 0.5;
	+
	+init.defaultparams()
	+
	+# The world should never be less than 3 cells in ANY direction, due to contact…
	+init.initsetup(gridnum = numpy.array([12,4,4]), periodic = 1, shearmodel = 2, …
	+
	+init.x[0] = numpy.array([1, 2, 2])
	+init.x[1] = numpy.array([6, 2, 2])
	+#init.x[2] = numpy.array([7, 2, 2])
	+#init.x[3] = numpy.array([8, 2, 2])
	+
	+# Set fraction of critical damping (0 = elastic, 1 = completely inelastic)
	+damping_fraction = 1.0
	+init.nu = numpy.ones(init.np, dtype=numpy.float64) \
	+ * damping_fraction * 2.0 * math.sqrt(4.0/3.0 * math.pi * init.radius…
	+ * init.rho[0] * init.k_n[0])
	+
	+
	+#for i in range(init.np):
	+# init.x[i] = numpy.array([4+iinit.radius[i]2, 2, 2])
	+
	+init.vel[0] = numpy.array([10.0, 0.0, 0.0])
	+
	+
	+init.initTemporal(total = 1.0)
	+
	+init.writebin("../input/nc-test.bin")
	+#render("../input/nc-test.bin", out = "~/Desktop/nc-test")
	+
	+subprocess.call("cd ..; rm output/*; ./sphere_darwin_X86_64 nc-test", shell=Tr…
	+
	+visualize("nc-test", "energy", savefig=True, outformat='png')
	+#visualize("nc-test", "walls", savefig=True)
	+
	+subprocess.call("rm ../img_out/*; cd ../raytracer; ./render_all_outputs_GPU_cl…
	+
	+
	diff --git a/raytracer/colorbar.h b/raytracer/colorbar.h
	t@@ -0,0 +1,22 @@
	+#ifndef COLORBAR_H_
	+#define COLORBAR_H_
	+
	+// Functions that determine red-, green- and blue color components
	+// in a blue-white-red colormap. Ratio should be between 0.0-1.0.
	+
	+__inline__ __host__ __device__ float red(float ratio)
	+{
	+ return fmin(1.0f, 0.209fratioratioratio - 2.49fratioratio + 3.0fratio …
	+};
	+
	+__inline__ __host__ __device__ float green(float ratio)
	+{
	+ return fmin(1.0f, -2.44fratioratio + 2.15f*ratio + 0.369f);
	+};
	+
	+__inline__ __host__ __device__ float blue(float ratio)
	+{
	+ return fmin(1.0f, -2.21fratioratio + 1.61f*ratio + 0.573f);
	+};
	+
	+#endif
	diff --git a/raytracer/render_all_outputs_CPU.sh b/raytracer/render_all_outputs…
	t@@ -0,0 +1,27 @@
	+#!/bin/bash
	+
	+FILES=`ls ../output/*.bin`
	+
	+XRES=800
	+YRES=800
	+
	+WORKHORSE=CPU
	+
	+echo "# Rendering PPM images and converting to JPG"
	+for F in ../output/*.bin
	+do
	+ (BASE=`basename $F`;
	+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm;
	+ convert ../img_out/$BASE.ppm ../img_out/$BASE.jpg;)
	+ #rm ../img_out/$BASE.ppm &)
	+done
	+
	+#echo "# Converting PPM files to JPG using ImageMagick in parallel"
	+#for F in ../img_out/*.ppm
	+#do
	+# (BASE=`basename $F`; convert $F $F.jpg &)
	+#done
	+
	+#echo "# Removed temporary files"
	+#rm ../img_out/*.ppm
	+
	diff --git a/raytracer/render_all_outputs_GPU.sh b/raytracer/render_all_outputs…
	t@@ -0,0 +1,29 @@
	+#!/bin/bash
	+
	+FILES=`ls ../output/*.bin`
	+
	+XRES=800
	+YRES=800
	+
	+WORKHORSE=GPU
	+
	+echo "# Rendering PPM images"
	+for F in ../output/*.bin
	+do
	+ BASE=`basename $F`
	+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null
	+ if [ $? -ne 0 ] ; then
	+ echo "Error rendering $F, trying again..."
	+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null
	+ fi
	+done
	+
	+echo "# Converting PPM files to JPEG using ImageMagick in parallel"
	+for F in ../img_out/*.ppm
	+do
	+ (BASE=`basename $F`; convert $F $F.jpg &)
	+done
	+
	+#echo "# Removed temporary files"
	+#rm ../img_out/*.ppm
	+
	diff --git a/raytracer/rt-kernel.cu b/raytracer/rt-kernel.cu
	t@@ -0,0 +1,442 @@
	+#include <iostream>
	+#include <cutil_math.h>
	+#include "header.h"
	+#include "rt-kernel.h"
	+#include "colorbar.h"
	+
	+unsigned int iDivUp (unsigned int a, unsigned int b) {
	+ return (a % b != 0) ? (a / b + 1) : (a / b);
	+}
	+
	+__inline__ __host__ __device__ float3 f4_to_f3(float4 in)
	+{
	+ return make_float3(in.x, in.y, in.z);
	+}
	+
	+__inline__ __host__ __device__ float4 f3_to_f4(float3 in)
	+{
	+ return make_float4(in.x, in.y, in.z, 0.0f);
	+}
	+
	+// Kernel for initializing image data
	+__global__ void imageInit(unsigned char* _img, unsigned int pixels)
	+{
	+ // Compute pixel position from threadIdx/blockIdx
	+ unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
	+ if (mempos > pixels)
	+ return;
	+
	+ _img[mempos*4] = 255; // Red channel
	+ _img[mempos*4 + 1] = 255; // Green channel
	+ _img[mempos*4 + 2] = 255; // Blue channel
	+}
	+
	+// Calculate ray origins and directions
	+__global__ void rayInitPerspective(float4* _ray_origo,
	+ float4* _ray_direction,
	+ float4 eye,
	+ unsigned int width,
	+ unsigned int height)
	+{
	+ // Compute pixel position from threadIdx/blockIdx
	+ unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
	+ if (mempos > width*height)
	+ return;
	+
	+ // Calculate 2D position from linear index
	+ unsigned int i = mempos % width;
	+ unsigned int j = (int)floor((float)mempos/width) % width;
	+
	+ // Calculate pixel coordinates in image plane
	+ float p_u = const_imgplane.x + (const_imgplane.y - const_imgplane.x)
	+ * (i + 0.5f) / width;
	+ float p_v = const_imgplane.z + (const_imgplane.w - const_imgplane.z)
	+ * (j + 0.5f) / height;
	+
	+ // Write ray origo and direction to global memory
	+ _ray_origo[mempos] = const_eye;
	+ _ray_direction[mempos] = -const_dconst_w + p_uconst_u + p_v*const_v;
	+}
	+
	+// Check wether the pixel's viewing ray intersects with the spheres,
	+// and shade the pixel correspondingly
	+__global__ void rayIntersectSpheres(float4* _ray_origo,
	+ float4* _ray_direction,
	+ float4* _p,
	+ unsigned char* _img)
	+{
	+ // Compute pixel position from threadIdx/blockIdx
	+ unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
	+ if (mempos > const_pixels)
	+ return;
	+
	+ // Read ray data from global memory
	+ float3 e = f4_to_f3(_ray_origo[mempos]);
	+ float3 d = f4_to_f3(_ray_direction[mempos]);
	+ //float step = length(d);
	+
	+ // Distance, in ray steps, between object and eye initialized with a large v…
	+ float tdist = 1e10f;
	+
	+ // Surface normal at closest sphere intersection
	+ float3 n;
	+
	+ // Intersection point coordinates
	+ float3 p;
	+
	+ // Iterate through all particles
	+ for (Inttype i=0; i<const_np; ++i) {
	+
	+ // Read sphere coordinate and radius
	+ float3 c = f4_to_f3(_p[i]);
	+ float R = _p[i].w;
	+
	+ // Calculate the discriminant: d = B^2 - 4AC
	+ float Delta = (2.0fdot(d,(e-c)))(2.0f*dot(d,(e-c))) // B^2
	+ - 4.0fdot(d,d) // -4A
	+ * (dot((e-c),(e-c)) - R*R); // C
	+
	+ // If the determinant is positive, there are two solutions
	+ // One where the line enters the sphere, and one where it exits
	+ if (Delta > 0.0f) {
	+
	+ // Calculate roots, Shirley 2009 p. 77
	+ float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(…
	+ * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
	+
	+ // Check wether intersection is closer than previous values
	+ if (fabs(t_minus) < tdist) {
	+ p = e + t_minus*d;
	+ tdist = fabs(t_minus);
	+ n = normalize(2.0f * (p - c)); // Surface normal
	+ }
	+
	+ } // End of solution branch
	+
	+ } // End of particle loop
	+
	+ // Write pixel color
	+ if (tdist < 1e10f) {
	+
	+ // Lambertian shading parameters
	+ float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light)));
	+ float I_d = 40.0f; // Light intensity
	+ float k_d = 5.0f; // Diffuse coefficient
	+
	+ // Ambient shading
	+ float k_a = 10.0f;
	+ float I_a = 5.0f; // 5.0 for black background
	+
	+ // Write shading model values to pixel color channels
	+ _img[mempos4] = (unsigned char) ((k_d I_d * dotprod
	+ + k_a * I_a)*0.48f);
	+ _img[mempos4 + 1] = (unsigned char) ((k_d I_d * dotprod
	+ + k_a * I_a)*0.41f);
	+ _img[mempos4 + 2] = (unsigned char) ((k_d I_d * dotprod
	+ + k_a * I_a)*0.27f);
	+
	+ }
	+}
	+
	+// Check wether the pixel's viewing ray intersects with the spheres,
	+// and shade the pixel correspondingly using a colormap
	+__global__ void rayIntersectSpheresColormap(float4* _ray_origo,
	+ float4* _ray_direction,
	+ float4* _p,
	+ float* _fixvel,
	+ float* _linarr,
	+ float max_val,
	+ unsigned char* _img)
	+{
	+ // Compute pixel position from threadIdx/blockIdx
	+ unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
	+ if (mempos > const_pixels)
	+ return;
	+
	+ // Read ray data from global memory
	+ float3 e = f4_to_f3(_ray_origo[mempos]);
	+ float3 d = f4_to_f3(_ray_direction[mempos]);
	+
	+ // Distance, in ray steps, between object and eye initialized with a large v…
	+ float tdist = 1e10f;
	+
	+ // Surface normal at closest sphere intersection
	+ float3 n;
	+
	+ // Intersection point coordinates
	+ float3 p;
	+
	+ unsigned int hitidx;
	+
	+ // Iterate through all particles
	+ for (Inttype i=0; i<const_np; ++i) {
	+
	+ // Read sphere coordinate and radius
	+ float3 c = f4_to_f3(_p[i]);
	+ float R = _p[i].w;
	+
	+ // Calculate the discriminant: d = B^2 - 4AC
	+ float Delta = (2.0fdot(d,(e-c)))(2.0f*dot(d,(e-c))) // B^2
	+ - 4.0fdot(d,d) // -4A
	+ * (dot((e-c),(e-c)) - R*R); // C
	+
	+ // If the determinant is positive, there are two solutions
	+ // One where the line enters the sphere, and one where it exits
	+ if (Delta > 0.0f) {
	+
	+ // Calculate roots, Shirley 2009 p. 77
	+ float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(…
	+ * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
	+
	+ // Check wether intersection is closer than previous values
	+ if (fabs(t_minus) < tdist) {
	+ p = e + t_minus*d;
	+ tdist = fabs(t_minus);
	+ n = normalize(2.0f * (p - c)); // Surface normal
	+ hitidx = i;
	+ }
	+
	+ } // End of solution branch
	+
	+ } // End of particle loop
	+
	+ // Write pixel color
	+ if (tdist < 1e10) {
	+
	+ // Fetch particle data used for color
	+ float ratio = _linarr[hitidx] / max_val;
	+ float fixvel = _fixvel[hitidx];
	+
	+ // Make sure the ratio doesn't exceed the 0.0-1.0 interval
	+ if (ratio < 0.01f)
	+ ratio = 0.01f;
	+ if (ratio > 0.99f)
	+ ratio = 0.99f;
	+
	+ // Lambertian shading parameters
	+ float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light)));
	+ float I_d = 40.0f; // Light intensity
	+ float k_d = 5.0f; // Diffuse coefficient
	+
	+ // Ambient shading
	+ float k_a = 10.0f;
	+ float I_a = 5.0f;
	+
	+ float redv = red(ratio);
	+ float greenv = green(ratio);
	+ float bluev = blue(ratio);
	+
	+ // Make particle dark grey if the horizontal velocity is fixed
	+ if (fixvel > 0.f) {
	+ redv = 0.5;
	+ greenv = 0.5;
	+ bluev = 0.5;
	+ }
	+
	+ // Write shading model values to pixel color channels
	+ _img[mempos4] = (unsigned char) ((k_d I_d * dotprod
	+ + k_a * I_a)*redv);
	+ _img[mempos4 + 1] = (unsigned char) ((k_d I_d * dotprod
	+ + k_a * I_a)*greenv);
	+ _img[mempos4 + 2] = (unsigned char) ((k_d I_d * dotprod
	+ + k_a * I_a)*bluev);
	+ }
	+}
	+
	+extern "C"
	+__host__ void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio,
	+ unsigned int pixels, Inttype np)
	+{
	+ // Image dimensions in world space (l, r, b, t)
	+ float4 imgplane = make_float4(-0.5fimgw, 0.5fimgw, -0.5fimgwhw_ratio, 0.…
	+
	+ // The view vector
	+ float4 view = eye - lookat;
	+
	+ // Construct the camera view orthonormal base
	+ //float4 up = make_float4(0.0f, 1.0f, 0.0f, 0.0f); // Pointing upward along…
	+ float4 up = make_float4(0.0f, 0.0f, 1.0f, 0.0f); // Pointing upward along +z
	+ float4 w = -view/length(view); // w: Pointing backwards
	+ float4 u = make_float4(cross(make_float3(up.x, up.y, up.z),
	+ make_float3(w.x, w.y, w.z)), 0.0f)
	+ / length(cross(make_float3(up.x, up.y, up.z), make_float3(w.x, w.y…
	+ float4 v = make_float4(cross(make_float3(w.x, w.y, w.z), make_float3(u.x, u.…
	+
	+ // Focal length 20% of eye vector length
	+ float d = length(view)*0.8f;
	+
	+ // Light direction (points towards light source)
	+ float4 light = normalize(-1.0feyemake_float4(1.0f, 0.2f, 0.6f, 0.0f));
	+
	+ std::cout << " Transfering camera values to constant memory\n";
	+
	+ cudaMemcpyToSymbol("const_u", &u, sizeof(u));
	+ cudaMemcpyToSymbol("const_v", &v, sizeof(v));
	+ cudaMemcpyToSymbol("const_w", &w, sizeof(w));
	+ cudaMemcpyToSymbol("const_eye", &eye, sizeof(eye));
	+ cudaMemcpyToSymbol("const_imgplane", &imgplane, sizeof(imgplane));
	+ cudaMemcpyToSymbol("const_d", &d, sizeof(d));
	+ cudaMemcpyToSymbol("const_light", &light, sizeof(light));
	+ cudaMemcpyToSymbol("const_pixels", &pixels, sizeof(pixels));
	+ cudaMemcpyToSymbol("const_np", &np, sizeof(np));
	+}
	+
	+// Check for CUDA errors
	+extern "C"
	+__host__ void checkForCudaErrors(const char* checkpoint_description)
	+{
	+ cudaError_t err = cudaGetLastError();
	+ if (err != cudaSuccess) {
	+ std::cout << "\nCuda error detected, checkpoint: " << checkpoint_descripti…
	+ << "\nError string: " << cudaGetErrorString(err) << "\n";
	+ exit(EXIT_FAILURE);
	+ }
	+}
	+
	+
	+// Wrapper for the rt kernel
	+extern "C"
	+__host__ int rt(float4* p, Inttype np,
	+ rgb* img, unsigned int width, unsigned int height,
	+ f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
	+ int visualize, float max_val,
	+ float* fixvel, float* pres, float* es_dot, float* es, float* v…
	+{
	+ using std::cout;
	+
	+ cout << "Initializing CUDA:\n";
	+
	+ // Initialize GPU timestamp recorders
	+ float t1, t2;
	+ cudaEvent_t t1_go, t2_go, t1_stop, t2_stop;
	+ cudaEventCreate(&t1_go);
	+ cudaEventCreate(&t2_go);
	+ cudaEventCreate(&t2_stop);
	+ cudaEventCreate(&t1_stop);
	+
	+ // Start timer 1
	+ cudaEventRecord(t1_go, 0);
	+
	+ // Allocate memory
	+ cout << " Allocating device memory\n";
	+ static float4 *_p; // Particle positions (x,y,z) and …
	+ static float *_fixvel; // Indicates whether a particle has a …
	+ static float *_linarr; // Array for linear values to color t…
	+ static unsigned char *_img; // RGBw values in image
	+ static float4 *_ray_origo; // Ray origo (x,y,z)
	+ static float4 *_ray_direction; // Ray direction (x,y,z)
	+ cudaMalloc((void*)&_p, npsizeof(float4));
	+ cudaMalloc((void*)&_fixvel, npsizeof(float));
	+ cudaMalloc((void*)&_linarr, npsizeof(float)); // 0 size if visualize = 0;
	+ cudaMalloc((void*)&_img, widthheight4sizeof(unsigned char));
	+ cudaMalloc((void*)&_ray_origo, widthheight*sizeof(float4));
	+ cudaMalloc((void*)&_ray_direction, widthheight*sizeof(float4));
	+
	+ // Transfer particle data
	+ cout << " Transfering particle data: host -> device\n";
	+ cudaMemcpy(_p, p, np*sizeof(float4), cudaMemcpyHostToDevice);
	+ cudaMemcpy(_fixvel, fixvel, np*sizeof(float), cudaMemcpyHostToDevice);
	+ if (visualize == 1 \|\| visualize == 4)
	+ cudaMemcpy(_linarr, pres, np*sizeof(float), cudaMemcpyHostToDevice);
	+ if (visualize == 2)
	+ cudaMemcpy(_linarr, es_dot, np*sizeof(float), cudaMemcpyHostToDevice);
	+ if (visualize == 3)
	+ cudaMemcpy(_linarr, es, np*sizeof(float), cudaMemcpyHostToDevice);
	+ if (visualize == 4)
	+ cudaMemcpy(_linarr, vel, np*sizeof(float), cudaMemcpyHostToDevice);
	+
	+ // Check for errors after memory allocation
	+ checkForCudaErrors("CUDA error after memory allocation");
	+
	+ // Arrange thread/block structure
	+ unsigned int pixels = width*height;
	+ float hw_ratio = (float)height/(float)width;
	+ const unsigned int threadsPerBlock = 256;
	+ const unsigned int blocksPerGrid = iDivUp(pixels, threadsPerBlock);
	+
	+ // Start timer 2
	+ cudaEventRecord(t2_go, 0);
	+
	+ // Initialize image to background color
	+ imageInit<<< blocksPerGrid, threadsPerBlock >>>(_img, pixels);
	+
	+ // Initialize camera
	+ cameraInit(make_float4(eye.x, eye.y, eye.z, 0.0f),
	+ make_float4(lookat.x, lookat.y, lookat.z, 0.0f),
	+ imgw, hw_ratio, pixels, np);
	+ checkForCudaErrors("CUDA error after cameraInit");
	+
	+ // Construct rays for perspective projection
	+ rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>(
	+ _ray_origo, _ray_direction,
	+ make_float4(eye.x, eye.y, eye.z, 0.0f),
	+ width, height);
	+
	+ cudaThreadSynchronize();
	+
	+ // Find closest intersection between rays and spheres
	+ if (visualize == 1) { // Visualize pressure
	+ cout << " Pressure color map range: [0, " << max_val << "] Pa\n";
	+ rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
	+ _ray_origo, _ray_direction,
	+ _p, _fixvel, _linarr, max_val, _img);
	+ } else if (visualize == 2) { // es_dot visualization
	+ cout << " Shear heat production rate color map range: [0, " << max_val <<…
	+ rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
	+ _ray_origo, _ray_direction,
	+ _p, _fixvel, _linarr, max_val, _img);
	+ } else if (visualize == 3) { // es visualization
	+ cout << " Total shear heat color map range: [0, " << max_val << "] J\n";
	+ rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
	+ _ray_origo, _ray_direction,
	+ _p, _fixvel, _linarr, max_val, _img);
	+ } else if (visualize == 4) { // velocity visualization
	+ cout << " Velocity color map range: [0, " << max_val << "] m/s\n";
	+ rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
	+ _ray_origo, _ray_direction,
	+ _p, _fixvel, _linarr, max_val, _img);
	+ } else { // Normal visualization
	+ rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>(
	+ _ray_origo, _ray_direction,
	+ _p, _img);
	+ }
	+
	+ // Make sure all threads are done before continuing CPU control sequence
	+ cudaThreadSynchronize();
	+
	+ // Check for errors
	+ checkForCudaErrors("CUDA error after kernel execution");
	+
	+ // Stop timer 2
	+ cudaEventRecord(t2_stop, 0);
	+ cudaEventSynchronize(t2_stop);
	+
	+ // Transfer image data from device to host
	+ cout << " Transfering image data: device -> host\n";
	+ cudaMemcpy(img, _img, widthheight4*sizeof(unsigned char), cudaMemcpyDevice…
	+
	+ // Free dynamically allocated device memory
	+ cudaFree(_p);
	+ cudaFree(_fixvel);
	+ cudaFree(_linarr);
	+ cudaFree(_img);
	+ cudaFree(_ray_origo);
	+ cudaFree(_ray_direction);
	+
	+ // Stop timer 1
	+ cudaEventRecord(t1_stop, 0);
	+ cudaEventSynchronize(t1_stop);
	+
	+ // Calculate time spent in t1 and t2
	+ cudaEventElapsedTime(&t1, t1_go, t1_stop);
	+ cudaEventElapsedTime(&t2, t2_go, t2_stop);
	+
	+ // Report time spent
	+ cout << " Time spent on entire GPU routine: "
	+ << t1 << " ms\n";
	+ cout << " - Kernels: " << t2 << " ms\n"
	+ << " - Memory alloc. and transfer: " << t1-t2 << " ms\n";
	+
	+ // Return successfully
	+ return 0;
	+}
	diff --git a/raytracer/rt_GPU_init_pres.sh b/raytracer/rt_GPU_init_pres.sh
	t@@ -0,0 +1,36 @@
	+#!/bin/bash
	+
	+# Pass max. pressure as command line argument
	+
	+XRES=400
	+YRES=1600
	+
	+WORKHORSE=GPU
	+
	+echo "# Rendering PPM images"
	+for F in ../output/init.bin
	+do
	+ BASE=`basename $F`
	+ OUTFILE=$BASE.ppm.jpg
	+ if [ -e ../img_out/$OUTFILE ]
	+ then
	+ echo "$OUTFILE exists." > /dev/null
	+ else
	+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
	+ if [ $? -ne 0 ] ; then
	+ echo "Error rendering $F, trying again..."
	+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/n…
	+ fi
	+ fi
	+done
	+
	+echo "# Converting PPM files to JPEG using ImageMagick in parallel"
	+for F in ../img_out/*.ppm
	+do
	+ (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
	+done
	+
	+sleep 5
	+echo "# Removing temporary PPM files"
	+rm ../img_out/*.ppm
	+
	diff --git a/raytracer/rt_GPU_pres.sh b/raytracer/rt_GPU_pres.sh
	t@@ -0,0 +1,36 @@
	+#!/bin/bash
	+
	+# Pass max. pressure as command line argument
	+
	+XRES=800
	+YRES=800
	+
	+WORKHORSE=GPU
	+
	+echo "# Rendering PPM images"
	+for F in ../output/*.bin
	+do
	+ BASE=`basename $F`
	+ OUTFILE=$BASE.ppm.jpg
	+ if [ -e ../img_out/$OUTFILE ]
	+ then
	+ echo "$OUTFILE exists." > /dev/null
	+ else
	+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
	+ if [ $? -ne 0 ] ; then
	+ echo "Error rendering $F, trying again..."
	+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/n…
	+ fi
	+ fi
	+done
	+
	+echo "# Converting PPM files to JPEG using ImageMagick in parallel"
	+for F in ../img_out/*.ppm
	+do
	+ (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
	+done
	+
	+sleep 5
	+echo "# Removing temporary PPM files"
	+rm ../img_out/*.ppm
	+
	diff --git a/raytracer/rt_GPU_tall_pres.sh b/raytracer/rt_GPU_tall_pres.sh
	t@@ -0,0 +1,36 @@
	+#!/bin/bash
	+
	+# Pass max. pressure as command line argument
	+
	+XRES=800
	+YRES=1200
	+
	+WORKHORSE=GPU
	+
	+echo "# Rendering PPM images"
	+for F in ../output/*.bin
	+do
	+ BASE=`basename $F`
	+ OUTFILE=$BASE.ppm.jpg
	+ if [ -e ../img_out/$OUTFILE ]
	+ then
	+ echo "$OUTFILE exists." > /dev/null
	+ else
	+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
	+ if [ $? -ne 0 ] ; then
	+ echo "Error rendering $F, trying again..."
	+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/n…
	+ fi
	+ fi
	+done
	+
	+echo "# Converting PPM files to JPEG using ImageMagick in parallel"
	+for F in ../img_out/*.ppm
	+do
	+ (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
	+done
	+
	+sleep 5
	+echo "# Removing temporary PPM files"
	+rm ../img_out/*.ppm
	+
	diff --git a/src/cohesion.cuh b/src/cohesion.cuh
	t@@ -0,0 +1,126 @@
	+#ifndef COHESION_CUH_
	+#define COHESION_CUH_
	+
	+// cohesion.cuh
	+// Functions governing attractive forces between contacts
	+
	+
	+// Linear-elastic bond: Attractive force with normal- and shear components
	+// acting upon particle A in a bonded particle pair
	+__device__ void bondLinear(Float3* N, Float3* T, Float* es_dot, Float* p,
	+ unsigned int idx_a, unsigned int idx_b,
	+ Float4* dev_x_sorted, Float4* dev_vel_sorted,
	+ Float4* dev_angvel_sorted,
	+ Float radius_a, Float radius_b,
	+ Float3 x_ab, Float x_ab_length,
	+ Float delta_ab)
	+{
	+
	+ // If particles are not overlapping, apply bond force
	+ if (delta_ab > 0.0f) {
	+
	+ // Allocate variables and fetch missing time=t values for particle A and B
	+ Float4 vel_a = dev_vel_sorted[idx_a];
	+ Float4 vel_b = dev_vel_sorted[idx_b];
	+ Float4 angvel4_a = dev_angvel_sorted[idx_a];
	+ Float4 angvel4_b = dev_angvel_sorted[idx_b];
	+
	+ // Convert to Float3's
	+ Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
	+ Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
	+
	+ // Normal vector of contact
	+ Float3 n_ab = x_ab/x_ab_length;
	+
	+ // Relative contact interface velocity, w/o rolling
	+ Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
	+ vel_a.y - vel_b.y,
	+ vel_a.z - vel_b.z);
	+
	+ // Relative contact interface velocity of particle surfaces at
	+ // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
	+ Float3 vel_ab = vel_ab_linear
	+ + radius_a * cross(n_ab, angvel_a)
	+ + radius_b * cross(n_ab, angvel_b);
	+
	+ // Relative contact interface rolling velocity
	+ //Float3 angvel_ab = angvel_a - angvel_b;
	+ //Float angvel_ab_length = length(angvel_ab);
	+
	+ // Normal component of the relative contact interface velocity
	+ //Float vel_n_ab = dot(vel_ab_linear, n_ab);
	+
	+ // Tangential component of the relative contact interface velocity
	+ // Hinrichsen and Wolf 2004, eq. 13.9
	+ Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
	+ //Float vel_t_ab_length = length(vel_t_ab);
	+
	+ Float3 f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+ Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ // Mean radius
	+ Float R_bar = (radius_a + radius_b)/2.0f;
	+
	+ // Normal force component: Elastic
	+ f_n = devC_k_n * delta_ab * n_ab;
	+
	+ if (length(vel_t_ab) > 0.f) {
	+ // Shear force component: Viscous
	+ f_t = -1.0f * devC_gamma_s * vel_t_ab;
	+
	+ // Shear friction production rate [W]
	+ //*es_dot += -dot(vel_t_ab, f_t);
	+ }
	+
	+ // Add force components from this bond to total force for particle
	+ *N += f_n + f_t;
	+ T += -R_bar cross(n_ab, f_t);
	+
	+ // Pressure excerted onto the particle from this bond
	+ p += length(f_n) / (4.0f PI * radius_a*radius_a);
	+
	+ }
	+} // End of bondLinear()
	+
	+
	+// Capillary cohesion after Richefeu et al. (2006)
	+__device__ void capillaryCohesion_exp(Float3* N, Float radius_a,
	+ Float radius_b, Float delta_ab,
	+ Float3 x_ab, Float x_ab_length,
	+ Float kappa)
	+{
	+
	+ // Normal vector
	+ Float3 n_ab = x_ab/x_ab_length;
	+
	+ Float3 f_c;
	+ Float lambda, R_geo, R_har, r, h;
	+
	+ // Determine the ratio; r = max{Ri/Rj;Rj/Ri}
	+ if ((radius_a/radius_b) > (radius_b/radius_a))
	+ r = radius_a/radius_b;
	+ else
	+ r = radius_b/radius_a;
	+
	+ // Exponential decay function
	+ h = -sqrtf(r);
	+
	+ // The harmonic mean
	+ R_har = (2.0f * radius_a * radius_b) / (radius_a + radius_b);
	+
	+ // The geometrical mean
	+ R_geo = sqrtf(radius_a * radius_b);
	+
	+ // The exponential falloff of the capillary force with distance
	+ lambda = 0.9f * h * sqrtf(devC_V_b/R_har);
	+
	+ // Calculate cohesional force
	+ f_c = -kappa * R_geo * expf(-delta_ab/lambda) * n_ab;
	+
	+ // Add force components from this collision to total force for particle
	+ *N += f_c;
	+
	+} // End of capillaryCohesion_exp
	+
	+
	+#endif
	diff --git a/src/contactmodels.cuh b/src/contactmodels.cuh
	t@@ -0,0 +1,411 @@
	+#ifndef CONTACTMODELS_CUH_
	+#define CONTACTMODELS_CUH_
	+
	+// contactmodels.cuh
	+// Functions governing repulsive forces between contacts
	+
	+
	+// Linear viscoelastic contact model for particle-wall interactions
	+// with tangential friction and rolling resistance
	+__device__ Float contactLinear_wall(Float3* N, Float3* T, Float* es_dot, Float…
	+ unsigned int idx_a, Float radius_a,
	+ Float4* dev_vel_sorted, Float4* dev_angvel…
	+ Float3 n, Float delta, Float wvel)
	+{
	+ // Fetch particle velocities from global memory
	+ Float4 linvel_tmp = dev_vel_sorted[idx_a];
	+ Float4 angvel_tmp = dev_angvel_sorted[idx_a];
	+
	+ // Convert velocities to three-component vectors
	+ Float3 linvel = MAKE_FLOAT3(linvel_tmp.x,
	+ linvel_tmp.y,
	+ linvel_tmp.z);
	+ Float3 angvel = MAKE_FLOAT3(angvel_tmp.x,
	+ angvel_tmp.y,
	+ angvel_tmp.z);
	+
	+ // Store the length of the angular velocity for later use
	+ Float angvel_length = length(angvel);
	+
	+ // Contact velocity is the sum of the linear and
	+ // rotational components
	+ Float3 vel = linvel + radius_a * cross(n, angvel) + wvel;
	+
	+ // Normal component of the contact velocity
	+ Float vel_n = dot(vel, n);
	+
	+ // The tangential velocity is the contact velocity
	+ // with the normal component subtracted
	+ Float3 vel_t = vel - n * (dot(vel, n));
	+ Float vel_t_length = length(vel_t);
	+
	+ // Calculate elastic normal component
	+ //Float3 f_n = -devC_k_n * delta * n;
	+
	+ // Normal force component: Elastic - viscous damping
	+ Float3 f_n = (-devC_k_n * delta - devC_gamma_wn * vel_n) * n;
	+
	+ // Make sure the viscous damping doesn't exceed the elastic component,
	+ // i.e. the damping factor doesn't exceed the critical damping, 2sqrt(mk_n)
	+ if (dot(f_n, n) < 0.0f)
	+ f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ Float f_n_length = length(f_n); // Save length for later use
	+
	+ // Initialize vectors
	+ Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+ Float3 T_res = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ // Check that the tangential velocity is high enough to avoid
	+ // divide by zero (producing a NaN)
	+ if (vel_t_length > 0.f) {
	+
	+ Float f_t_visc = devC_gamma_ws * vel_t_length; // Tangential force by vis…
	+ Float f_t_limit = devC_mu_s * f_n_length; // Max. friction
	+
	+ // If the shear force component exceeds the friction,
	+ // the particle slips and energy is dissipated
	+ if (f_t_visc < f_t_limit) {
	+ f_t = -1.0f * f_t_visc * vel_t/vel_t_length;
	+
	+ } else { // Dynamic friction, friction failure
	+ f_t = -1.0f * f_t_limit * vel_t/vel_t_length;
	+
	+ // Shear energy production rate [W]
	+ //*es_dot += -dot(vel_t, f_t);
	+ }
	+ }
	+
	+/* if (angvel_length > 0.f) {
	+ // Apply rolling resistance (Zhou et al. 1999)
	+ //T_res = -angvel_a/angvel_length * devC_mu_r * radius_a * f_n_length;
	+
	+ // New rolling resistance model
	+ T_res = -1.0f * fmin(devC_gamma_r * radius_a * angvel_length,
	+ devC_mu_r * radius_a * f_n_length)
	+ * angvel_a/angvel_length;
	+ }*/
	+
	+ // Total force from wall
	+ *N += f_n + f_t;
	+
	+ // Total torque from wall
	+ T += -radius_a cross(n, f_t) + T_res;
	+
	+ // Pressure excerted onto particle from this contact
	+ p += f_n_length / (4.0f PI * radius_a*radius_a);
	+
	+ // Return force excerted onto the wall
	+ //return -dot(*N, n);
	+ return dot(f_n, n);
	+}
	+
	+
	+// Linear vicoelastic contact model for particle-particle interactions
	+// with tangential friction and rolling resistance
	+__device__ void contactLinearViscous(Float3* N, Float3* T, Float* es_dot, Floa…
	+ unsigned int idx_a, unsigned in…
	+ Float4* dev_vel_sorted,
	+ Float4* dev_angvel_sorted,
	+ Float radius_a, Float radius_b,
	+ Float3 x_ab, Float x_ab_length,
	+ Float delta_ab, Float kappa)
	+{
	+
	+ // Allocate variables and fetch missing time=t values for particle A and B
	+ Float4 vel_a = dev_vel_sorted[idx_a];
	+ Float4 vel_b = dev_vel_sorted[idx_b];
	+ Float4 angvel4_a = dev_angvel_sorted[idx_a];
	+ Float4 angvel4_b = dev_angvel_sorted[idx_b];
	+
	+ // Convert to Float3's
	+ Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
	+ Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
	+
	+ // Force between grain pair decomposed into normal- and tangential part
	+ Float3 f_n, f_t, f_c, T_res;
	+
	+ // Normal vector of contact
	+ Float3 n_ab = x_ab/x_ab_length;
	+
	+ // Relative contact interface velocity, w/o rolling
	+ Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
	+ vel_a.y - vel_b.y,
	+ vel_a.z - vel_b.z);
	+
	+ // Relative contact interface velocity of particle surfaces at
	+ // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
	+ Float3 vel_ab = vel_ab_linear
	+ + radius_a * cross(n_ab, angvel_a)
	+ + radius_b * cross(n_ab, angvel_b);
	+
	+ // Relative contact interface rolling velocity
	+ Float3 angvel_ab = angvel_a - angvel_b;
	+ Float angvel_ab_length = length(angvel_ab);
	+
	+ // Normal component of the relative contact interface velocity
	+ Float vel_n_ab = dot(vel_ab_linear, n_ab);
	+
	+ // Tangential component of the relative contact interface velocity
	+ // Hinrichsen and Wolf 2004, eq. 13.9
	+ Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
	+ Float vel_t_ab_length = length(vel_t_ab);
	+
	+ // Compute the normal stiffness of the contact
	+ //Float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
	+
	+ // Calculate rolling radius
	+ Float R_bar = (radius_a + radius_b) / 2.0f;
	+
	+ // Normal force component: linear-elastic approximation (Augier 2009, eq. 3)
	+ // with velocity dependant damping
	+ // Damping coefficient: alpha = 0.8
	+ //f_n = (-k_n_ab * delta_ab + 2.0f * 0.8f * sqrtf(m_effk_n_ab) vel_ab) * …
	+
	+ // Linear spring for normal component (Renzo 2004, eq. 35)
	+ // Dissipation due to plastic deformation is modelled by using a different
	+ // unloading spring constant (Walton and Braun 1986)
	+ // Here the factor in the second term determines the relative strength of the
	+ // unloading spring relative to the loading spring.
	+ /* if (vel_n_ab > 0.0f) { // Loading
	+ f_n = (-k_n_ab * delta_ab) * n_ab;
	+ } else { // Unloading
	+ f_n = (-k_n_ab * 0.90f * delta_ab) * n_ab;
	+ } // f_n is OK! */
	+
	+ // Normal force component: Elastic
	+ //f_n = -k_n_ab * delta_ab * n_ab;
	+
	+ // Normal force component: Elastic - viscous damping
	+ f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
	+
	+ // Make sure the viscous damping doesn't exceed the elastic component,
	+ // i.e. the damping factor doesn't exceed the critical damping, 2sqrt(mk_n)
	+ if (dot(f_n, n_ab) < 0.0f)
	+ f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ Float f_n_length = length(f_n);
	+
	+ // Add max. capillary force
	+ f_c = -kappa * sqrtf(radius_a * radius_b) * n_ab;
	+
	+ // Initialize force vectors to zero
	+ f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+ T_res = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ // Shear force component: Nonlinear relation
	+ // Coulomb's law of friction limits the tangential force to less or equal
	+ // to the normal force
	+ if (vel_t_ab_length > 0.f) {
	+
	+ // Tangential force by viscous model
	+ Float f_t_visc = devC_gamma_s * vel_t_ab_length;
	+
	+ // Determine max. friction
	+ Float f_t_limit;
	+ if (vel_t_ab_length > 0.001f) { // Dynamic
	+ f_t_limit = devC_mu_d * length(f_n-f_c);
	+ } else { // Static
	+ f_t_limit = devC_mu_s * length(f_n-f_c);
	+ }
	+
	+ // If the shear force component exceeds the friction,
	+ // the particle slips and energy is dissipated
	+ if (f_t_visc < f_t_limit) { // Static
	+ f_t = -1.0f * f_t_visc * vel_t_ab/vel_t_ab_length;
	+
	+ } else { // Dynamic, friction failure
	+ f_t = -1.0f * f_t_limit * vel_t_ab/vel_t_ab_length;
	+
	+ // Shear friction production rate [W]
	+ //*es_dot += -dot(vel_t_ab, f_t);
	+ }
	+ }
	+
	+/* if (angvel_ab_length > 0.f) {
	+ // Apply rolling resistance (Zhou et al. 1999)
	+ //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
	+
	+ // New rolling resistance model
	+ T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
	+ devC_mu_r * R_bar * f_n_length)
	+ * angvel_ab/angvel_ab_length;
	+ }
	+*/
	+
	+ // Add force components from this collision to total force for particle
	+ *N += f_n + f_t + f_c;
	+ T += -R_bar cross(n_ab, f_t) + T_res;
	+
	+ // Pressure excerted onto the particle from this contact
	+ p += f_n_length / (4.0f PI * radius_a*radius_a);
	+
	+} // End of contactLinearViscous()
	+
	+
	+// Linear elastic contact model for particle-particle interactions
	+__device__ void contactLinear(Float3* N, Float3* T,
	+ Float* es_dot, Float* p,
	+ unsigned int idx_a_orig,
	+ unsigned int idx_b_orig,
	+ Float4 vel_a,
	+ Float4* dev_vel,
	+ Float3 angvel_a,
	+ Float4* dev_angvel,
	+ Float radius_a, Float radius_b,
	+ Float3 x_ab, Float x_ab_length,
	+ Float delta_ab, Float4* dev_delta_t,
	+ unsigned int mempos)
	+{
	+
	+ // Allocate variables and fetch missing time=t values for particle A and B
	+ Float4 vel_b = dev_vel[idx_b_orig];
	+ Float4 angvel4_b = dev_angvel[idx_b_orig];
	+
	+ // Fetch previous sum of shear displacement for the contact pair
	+ Float4 delta_t0_4 = dev_delta_t[mempos];
	+
	+ Float3 delta_t0_uncor = MAKE_FLOAT3(delta_t0_4.x,
	+ delta_t0_4.y,
	+ delta_t0_4.z);
	+
	+ // Convert to Float3
	+ Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
	+
	+ // Force between grain pair decomposed into normal- and tangential part
	+ Float3 f_n, f_t, f_c, T_res;
	+
	+ // Normal vector of contact
	+ Float3 n_ab = x_ab/x_ab_length;
	+
	+ // Relative contact interface velocity, w/o rolling
	+ Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
	+ vel_a.y - vel_b.y,
	+ vel_a.z - vel_b.z);
	+
	+ // Relative contact interface velocity of particle surfaces at
	+ // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
	+ Float3 vel_ab = vel_ab_linear
	+ + radius_a * cross(n_ab, angvel_a)
	+ + radius_b * cross(n_ab, angvel_b);
	+
	+ // Relative contact interface rolling velocity
	+ Float3 angvel_ab = angvel_a - angvel_b;
	+ Float angvel_ab_length = length(angvel_ab);
	+
	+ // Normal component of the relative contact interface velocity
	+ Float vel_n_ab = dot(vel_ab_linear, n_ab);
	+
	+ // Tangential component of the relative contact interface velocity
	+ // Hinrichsen and Wolf 2004, eq. 13.9
	+ Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
	+ Float vel_t_ab_length = length(vel_t_ab);
	+
	+ // Correct tangential displacement vector, which is
	+ // necessary if the tangential plane rotated
	+ Float3 delta_t0 = delta_t0_uncor - (n_ab * dot(delta_t0_uncor, n_ab));
	+ Float delta_t0_length = length(delta_t0);
	+
	+ // New tangential displacement vector
	+ Float3 delta_t;
	+
	+ // Compute the normal stiffness of the contact
	+ //Float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
	+
	+ // Calculate rolling radius
	+ Float R_bar = (radius_a + radius_b) / 2.0f;
	+
	+ // Normal force component: Elastic
	+ //f_n = -devC_k_n * delta_ab * n_ab;
	+
	+ // Normal force component: Elastic - viscous damping
	+ f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
	+
	+ // Make sure the viscous damping doesn't exceed the elastic component,
	+ // i.e. the damping factor doesn't exceed the critical damping, 2sqrt(mk_n)
	+ if (dot(f_n, n_ab) < 0.0f)
	+ f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ Float f_n_length = length(f_n);
	+
	+ // Add max. capillary force
	+ f_c = -devC_kappa * sqrtf(radius_a * radius_b) * n_ab;
	+
	+ // Initialize force vectors to zero
	+ f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+ T_res = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ // Apply a tangential force if the previous tangential displacement
	+ // is non-zero, or the current sliding velocity is non-zero.
	+ if (delta_t0_length > 0.f \|\| vel_t_ab_length > 0.f) {
	+
	+ // Shear force: Visco-Elastic, limited by Coulomb friction
	+ Float3 f_t_elast = -1.0f * devC_k_s * delta_t0;
	+ Float3 f_t_visc = -1.0f * devC_gamma_s * vel_t_ab;
	+
	+ Float f_t_limit;
	+
	+ if (vel_t_ab_length > 0.001f) { // Dynamic friciton
	+ f_t_limit = devC_mu_d * length(f_n-f_c);
	+ } else { // Static friction
	+ f_t_limit = devC_mu_s * length(f_n-f_c);
	+ }
	+
	+ // Tangential force before friction limit correction
	+ f_t = f_t_elast + f_t_visc;
	+ Float f_t_length = length(f_t);
	+
	+ // If failure criterion is not met, contact is viscous-linear elastic.
	+ // If failure criterion is met, contact force is limited,
	+ // resulting in a slip and energy dissipation
	+ if (f_t_length > f_t_limit) { // Dynamic case
	+
	+ // Frictional force is reduced to equal the limit
	+ f_t *= f_t_limit/f_t_length;
	+
	+ // A slip event zeros the displacement vector
	+ //delta_t = make_Float3(0.0f, 0.0f, 0.0f);
	+
	+ // In a slip event, the tangential spring is adjusted to a
	+ // length which is consistent with Coulomb's equation
	+ // (Hinrichsen and Wolf, 2004)
	+ delta_t = -1.0f/devC_k_s * f_t + devC_gamma_s * vel_t_ab;
	+
	+ // Shear friction heat production rate:
	+ // The energy lost from the tangential spring is dissipated as heat
	+ //*es_dot += -dot(vel_t_ab, f_t);
	+ es_dot += length(delta_t0 - delta_t) devC_k_s / devC_dt; // Seen in E…
	+
	+ } else { // Static case
	+
	+ // No correction of f_t is required
	+
	+ // Add tangential displacement to total tangential displacement
	+ delta_t = delta_t0 + vel_t_ab * devC_dt;
	+ }
	+ }
	+
	+ if (angvel_ab_length > 0.f) {
	+ // Apply rolling resistance (Zhou et al. 1999)
	+ //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
	+
	+ // New rolling resistance model
	+ T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
	+ devC_mu_r * R_bar * f_n_length)
	+ * angvel_ab/angvel_ab_length;
	+ }
	+
	+ // Add force components from this collision to total force for particle
	+ *N += f_n + f_t + f_c;
	+ T += -R_bar cross(n_ab, f_t) + T_res;
	+
	+ // Pressure excerted onto the particle from this contact
	+ p += f_n_length / (4.0f PI * radius_a*radius_a);
	+
	+ // Store sum of tangential displacements
	+ dev_delta_t[mempos] = MAKE_FLOAT4(delta_t.x, delta_t.y, delta_t.z, 0.0f);
	+
	+} // End of contactLinear()
	+
	+
	+#endif
	diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
	t@@ -0,0 +1,625 @@
	+#ifndef CONTACTSEARCH_CUH_
	+#define CONTACTSEARCH_CUH_
	+
	+// contactsearch.cuh
	+// Functions for identifying contacts and processing boundaries
	+
	+
	+// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
	+// Kernel executed on device, and callable from device only.
	+__device__ void overlapsInCell(int3 targetCell,
	+ unsigned int idx_a,
	+ Float4 x_a, Float radius_a,
	+ Float3* N, Float3* T,
	+ Float* es_dot, Float* p,
	+ Float4* dev_x_sorted,
	+ Float* dev_radius_sorted,
	+ Float4* dev_vel_sorted,
	+ Float4* dev_angvel_sorted,
	+ unsigned int* dev_cellStart,
	+ unsigned int* dev_cellEnd,
	+ Float4* dev_w_nx,
	+ Float4* dev_w_mvfd)
	+ //uint4 bonds)
	+{
	+
	+ // Variable containing modifier for interparticle
	+ // vector, if it crosses a periodic boundary
	+ Float3 distmod = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ // Check whether x- and y boundaries are to be treated as periodic
	+ // 1: x- and y boundaries periodic
	+ // 2: x boundaries periodic
	+ if (devC_periodic == 1) {
	+
	+ // Periodic x-boundary
	+ if (targetCell.x < 0) {
	+ targetCell.x = devC_num[0] - 1;
	+ distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
	+ }
	+ if (targetCell.x == devC_num[0]) {
	+ targetCell.x = 0;
	+ distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
	+ }
	+
	+ // Periodic y-boundary
	+ if (targetCell.y < 0) {
	+ targetCell.y = devC_num[1] - 1;
	+ distmod += MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
	+ }
	+ if (targetCell.y == devC_num[1]) {
	+ targetCell.y = 0;
	+ distmod -= MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
	+ }
	+
	+ } else if (devC_periodic == 2) {
	+
	+ // Periodic x-boundary
	+ if (targetCell.x < 0) {
	+ targetCell.x = devC_num[0] - 1;
	+ distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
	+ }
	+ if (targetCell.x == devC_num[0]) {
	+ targetCell.x = 0;
	+ distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
	+ }
	+
	+ // Hande out-of grid cases on y-axis
	+ if (targetCell.y < 0 \|\| targetCell.y == devC_num[1])
	+ return;
	+
	+ } else {
	+
	+ // Hande out-of grid cases on x- and y-axes
	+ if (targetCell.x < 0 \|\| targetCell.x == devC_num[0])
	+ return;
	+ if (targetCell.y < 0 \|\| targetCell.y == devC_num[1])
	+ return;
	+ }
	+
	+ // Handle out-of-grid cases on z-axis
	+ if (targetCell.z < 0 \|\| targetCell.z == devC_num[2])
	+ return;
	+
	+
	+ //// Check and process particle-particle collisions
	+
	+ // Calculate linear cell ID
	+ unsigned int cellID = targetCell.x
	+ + __umul24(targetCell.y, devC_num[0])
	+ + __umul24(__umul24(devC_num[0],
	+ devC_num[1]),
	+ targetCell.z);
	+
	+ // Lowest particle index in cell
	+ unsigned int startIdx = dev_cellStart[cellID];
	+
	+ // Make sure cell is not empty
	+ if (startIdx != 0xffffffff) {
	+
	+ // Highest particle index in cell + 1
	+ unsigned int endIdx = dev_cellEnd[cellID];
	+
	+ // Iterate over cell particles
	+ for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
	+ if (idx_b != idx_a) { // Do not collide particle with itself
	+
	+ // Fetch position and velocity of particle B.
	+ Float4 x_b = dev_x_sorted[idx_b];
	+ Float radius_b = dev_radius_sorted[idx_b];
	+ Float kappa = devC_kappa;
	+
	+ // Distance between particle centers (Float4 -> Float3)
	+ Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
	+ x_a.y - x_b.y,
	+ x_a.z - x_b.z);
	+
	+ // Adjust interparticle vector if periodic boundary/boundaries
	+ // are crossed
	+ x_ab += distmod;
	+
	+ Float x_ab_length = length(x_ab);
	+
	+ // Distance between particle perimeters
	+ Float delta_ab = x_ab_length - (radius_a + radius_b);
	+
	+ // Check for particle overlap
	+ if (delta_ab < 0.0f) {
	+ contactLinearViscous(N, T, es_dot, p,
	+ idx_a, idx_b,
	+ dev_vel_sorted,
	+ dev_angvel_sorted,
	+ radius_a, radius_b,
	+ x_ab, x_ab_length,
	+ delta_ab, kappa);
	+ } else if (delta_ab < devC_db) {
	+ // Check wether particle distance satisfies the capillary bond dista…
	+ capillaryCohesion_exp(N, radius_a, radius_b, delta_ab,
	+ x_ab, x_ab_length, kappa);
	+ }
	+
	+ // Check wether particles are bonded together
	+ /*if (bonds.x == idx_b \|\| bonds.y == idx_b \|\|
	+ bonds.z == idx_b \|\| bonds.w == idx_b) {
	+ bondLinear(N, T, es_dot, p,
	+ idx_a, idx_b,
	+ dev_x_sorted, dev_vel_sorted,
	+ dev_angvel_sorted,
	+ radius_a, radius_b,
	+ x_ab, x_ab_length,
	+ delta_ab);
	+ }*/
	+
	+ } // Do not collide particle with itself end
	+ } // Iterate over cell particles end
	+ } // Check wether cell is empty end
	+} // End of overlapsInCell(...)
	+
	+// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
	+// Write the indexes of the overlaps in array contacts[]
	+// Kernel executed on device, and callable from device only.
	+__device__ void findContactsInCell(int3 targetCell,
	+ unsigned int idx_a,
	+ Float4 x_a, Float radius_a,
	+ Float4* dev_x_sorted,
	+ Float* dev_radius_sorted,
	+ unsigned int* dev_cellStart,
	+ unsigned int* dev_cellEnd,
	+ unsigned int* dev_gridParticleIndex,
	+ int* nc,
	+ unsigned int* dev_contacts,
	+ Float4* dev_distmod)
	+{
	+ // Variable containing modifier for interparticle
	+ // vector, if it crosses a periodic boundary
	+ Float3 distmod = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ // Check whether x- and y boundaries are to be treated as periodic
	+ // 1: x- and y boundaries periodic
	+ // 2: x boundaries periodic
	+ if (devC_periodic == 1) {
	+
	+ // Periodic x-boundary
	+ if (targetCell.x < 0) {
	+ targetCell.x = devC_num[0] - 1;
	+ distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
	+ }
	+ if (targetCell.x == devC_num[0]) {
	+ targetCell.x = 0;
	+ distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
	+ }
	+
	+ // Periodic y-boundary
	+ if (targetCell.y < 0) {
	+ targetCell.y = devC_num[1] - 1;
	+ distmod += MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
	+ }
	+ if (targetCell.y == devC_num[1]) {
	+ targetCell.y = 0;
	+ distmod -= MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
	+ }
	+
	+ } else if (devC_periodic == 2) {
	+
	+ // Periodic x-boundary
	+ if (targetCell.x < 0) {
	+ targetCell.x = devC_num[0] - 1;
	+ distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
	+ }
	+ if (targetCell.x == devC_num[0]) {
	+ targetCell.x = 0;
	+ distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
	+ }
	+
	+ // Hande out-of grid cases on y-axis
	+ if (targetCell.y < 0 \|\| targetCell.y == devC_num[1])
	+ return;
	+
	+ } else {
	+
	+ // Hande out-of grid cases on x- and y-axes
	+ if (targetCell.x < 0 \|\| targetCell.x == devC_num[0])
	+ return;
	+ if (targetCell.y < 0 \|\| targetCell.y == devC_num[1])
	+ return;
	+ }
	+
	+ // Handle out-of-grid cases on z-axis
	+ if (targetCell.z < 0 \|\| targetCell.z == devC_num[2])
	+ return;
	+
	+
	+ //// Check and process particle-particle collisions
	+
	+ // Calculate linear cell ID
	+ unsigned int cellID = targetCell.x
	+ + __umul24(targetCell.y, devC_num[0])
	+ + __umul24(__umul24(devC_num[0],
	+ devC_num[1]),
	+ targetCell.z);
	+
	+ // Lowest particle index in cell
	+ unsigned int startIdx = dev_cellStart[cellID];
	+
	+ // Make sure cell is not empty
	+ if (startIdx != 0xffffffff) {
	+
	+ // Highest particle index in cell + 1
	+ unsigned int endIdx = dev_cellEnd[cellID];
	+
	+ // Read the original index of particle A
	+ unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
	+
	+ // Iterate over cell particles
	+ for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
	+ if (idx_b != idx_a) { // Do not collide particle with itself
	+
	+ // Fetch position and radius of particle B.
	+ Float4 x_b = dev_x_sorted[idx_b];
	+ Float radius_b = dev_radius_sorted[idx_b];
	+
	+ // Read the original index of particle B
	+ unsigned int idx_b_orig = dev_gridParticleIndex[idx_b];
	+
	+ __syncthreads();
	+
	+ // Distance between particle centers (Float4 -> Float3)
	+ Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
	+ x_a.y - x_b.y,
	+ x_a.z - x_b.z);
	+
	+ // Adjust interparticle vector if periodic boundary/boundaries
	+ // are crossed
	+ x_ab += distmod;
	+
	+ Float x_ab_length = length(x_ab);
	+
	+ // Distance between particle perimeters
	+ Float delta_ab = x_ab_length - (radius_a + radius_b);
	+
	+ // Check for particle overlap
	+ if (delta_ab < 0.0f) {
	+
	+ // If the particle is not yet registered in the contact list,
	+ // use the next position in the array
	+ int cpos = *nc;
	+ unsigned int cidx;
	+
	+ // Find out, if particle is already registered in contact list
	+ for (int i=0; i<devC_nc; ++i) {
	+ __syncthreads();
	+ cidx = dev_contacts[(unsigned int)(idx_a_orig*devC_nc+i)];
	+ if (cidx == idx_b_orig)
	+ cpos = i;
	+ }
	+
	+ __syncthreads();
	+
	+ // Write the particle index to the relevant position,
	+ // no matter if it already is there or not (concurrency of write)
	+ dev_contacts[(unsigned int)(idx_a_orig*devC_nc+cpos)] = idx_b_orig;
	+
	+
	+ // Write the interparticle vector and radius of particle B
	+ //dev_x_ab_r_b[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_Float4…
	+ dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = MAKE_FLOAT4(d…
	+
	+ // Increment contact counter
	+ ++*nc;
	+ }
	+
	+ // Write the inter-particle position vector correction and radius of p…
	+ //dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_Float4(d…
	+
	+ // Check wether particles are bonded together
	+ /*if (bonds.x == idx_b \|\| bonds.y == idx_b \|\|
	+ bonds.z == idx_b \|\| bonds.w == idx_b) {
	+ bondLinear(N, T, es_dot, p,
	+ idx_a, idx_b,
	+ dev_x_sorted, dev_vel_sorted,
	+ dev_angvel_sorted,
	+ radius_a, radius_b,
	+ x_ab, x_ab_length,
	+ delta_ab);
	+ }*/
	+
	+ } // Do not collide particle with itself end
	+ } // Iterate over cell particles end
	+ } // Check wether cell is empty end
	+} // End of findContactsInCell(...)
	+
	+
	+// For a single particle:
	+// Search for neighbors to particle 'idx' inside the 27 closest cells,
	+// and save the contact pairs in global memory.
	+__global__ void topology(unsigned int* dev_cellStart,
	+ unsigned int* dev_cellEnd, // Input: Particles in…
	+ unsigned int* dev_gridParticleIndex, // Input: Unsort…
	+ Float4* dev_x_sorted, Float* dev_radius_sorted,
	+ unsigned int* dev_contacts,
	+ Float4* dev_distmod)
	+{
	+ // Thread index equals index of particle A
	+ unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	+ if (idx_a < devC_np) {
	+ // Fetch particle data in global read
	+ Float4 x_a = dev_x_sorted[idx_a];
	+ Float radius_a = dev_radius_sorted[idx_a];
	+
	+ // Count the number of contacts in this time step
	+ int nc = 0;
	+
	+ // Grid position of host and neighbor cells in uniform, cubic grid
	+ int3 gridPos;
	+ int3 targetPos;
	+
	+ // Calculate cell address in grid from position of particle
	+ gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
	+ gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
	+ gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
	+
	+ // Find overlaps between particle no. idx and all particles
	+ // from its own cell + 26 neighbor cells
	+ for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
	+ for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
	+ for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
	+ targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
	+ findContactsInCell(targetPos, idx_a, x_a, radius_a,
	+ dev_x_sorted, dev_radius_sorted,
	+ dev_cellStart, dev_cellEnd,
	+ dev_gridParticleIndex,
	+ &nc, dev_contacts, dev_distmod);
	+ }
	+ }
	+ }
	+ }
	+} // End of topology(...)
	+
	+
	+// For a single particle:
	+// Search for neighbors to particle 'idx' inside the 27 closest cells,
	+// and compute the resulting normal- and shear force on it.
	+// Collide with top- and bottom walls, save resulting force on upper wall.
	+// Kernel is executed on device, and is callable from host only.
	+__global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsort…
	+ unsigned int* dev_cellStart,
	+ unsigned int* dev_cellEnd,
	+ Float4* dev_x, Float* dev_radius,
	+ Float4* dev_x_sorted, Float* dev_radius_sorted,
	+ Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
	+ Float4* dev_vel, Float4* dev_angvel,
	+ Float4* dev_force, Float4* dev_torque,
	+ Float* dev_es_dot, Float* dev_es, Float* dev_p,
	+ Float4* dev_w_nx, Float4* dev_w_mvfd,
	+ Float* dev_w_force, //uint4* dev_bonds_sorted,
	+ unsigned int* dev_contacts,
	+ Float4* dev_distmod,
	+ Float4* dev_delta_t)
	+{
	+ // Thread index equals index of particle A
	+ unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	+
	+ if (idx_a < devC_np) {
	+
	+ // Fetch particle data in global read
	+ unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
	+ Float4 x_a = dev_x_sorted[idx_a];
	+ Float radius_a = dev_radius_sorted[idx_a];
	+
	+ // Fetch wall data in global read
	+ Float4 w_up_nx = dev_w_nx[0];
	+ Float4 w_up_mvfd = dev_w_mvfd[0];
	+
	+ // Fetch world dimensions in constant memory read
	+ Float3 origo = MAKE_FLOAT3(devC_origo[0],
	+ devC_origo[1],
	+ devC_origo[2]);
	+ Float3 L = MAKE_FLOAT3(devC_L[0],
	+ devC_L[1],
	+ devC_L[2]);
	+
	+ // Index of particle which is bonded to particle A.
	+ // The index is equal to the particle no (p.np)
	+ // if particle A is bond-less.
	+ //uint4 bonds = dev_bonds_sorted[idx_a];
	+
	+ // Initiate shear friction loss rate at 0.0
	+ Float es_dot = 0.0f;
	+
	+ // Initiate pressure on particle at 0.0
	+ Float p = 0.0f;
	+
	+ // Allocate memory for temporal force/torque vector values
	+ Float3 N = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+ Float3 T = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
	+
	+ // Apply linear elastic, frictional contact model to registered contacts
	+ if (devC_shearmodel == 2) {
	+ unsigned int idx_b_orig, mempos;
	+ Float delta_n, x_ab_length, radius_b;
	+ Float3 x_ab;
	+ Float4 x_b, distmod;
	+ Float4 vel_a = dev_vel_sorted[idx_a];
	+ Float4 angvel4_a = dev_angvel_sorted[idx_a];
	+ Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
	+
	+ // Loop over all possible contacts, and remove contacts
	+ // that no longer are valid (delta_n > 0.0)
	+ for (int i = 0; i<(int)devC_nc; ++i) {
	+ mempos = (unsigned int)(idx_a_orig * devC_nc + i);
	+ __syncthreads();
	+ idx_b_orig = dev_contacts[mempos];
	+ distmod = dev_distmod[mempos];
	+ x_b = dev_x[idx_b_orig];
	+ //radius_b = dev_radius[idx_b_orig];
	+ radius_b = distmod.w;
	+
	+ // Inter-particle vector, corrected for periodic boundaries
	+ x_ab = MAKE_FLOAT3(x_a.x - x_b.x + distmod.x,
	+ x_a.y - x_b.y + distmod.y,
	+ x_a.z - x_b.z + distmod.z);
	+
	+ x_ab_length = length(x_ab);
	+ delta_n = x_ab_length - (radius_a + radius_b);
	+
	+
	+ if (idx_b_orig != (unsigned int)devC_np) {
	+
	+ // Process collision if the particles are overlapping
	+ if (delta_n < 0.0f) {
	+ //cuPrintf("\nProcessing contact, idx_a_orig = %u, idx_b_orig = %u…
	+ // idx_a_orig, idx_b_orig, i, delta_n);
	+ contactLinear(&N, &T, &es_dot, &p,
	+ idx_a_orig,
	+ idx_b_orig,
	+ vel_a,
	+ dev_vel,
	+ angvel_a,
	+ dev_angvel,
	+ radius_a, radius_b,
	+ x_ab, x_ab_length,
	+ delta_n, dev_delta_t,
	+ mempos);
	+ } else {
	+ __syncthreads();
	+ // Remove this contact (there is no particle with index=np)
	+ dev_contacts[mempos] = devC_np;
	+ // Zero sum of shear displacement in this position
	+ dev_delta_t[mempos] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
	+ }
	+ } else {
	+ __syncthreads();
	+ dev_delta_t[mempos] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
	+ }
	+ } // Contact loop end
	+
	+ } else if (devC_shearmodel == 1) {
	+
	+ int3 gridPos;
	+ int3 targetPos;
	+
	+ // Calculate address in grid from position
	+ gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
	+ gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
	+ gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
	+
	+ // Find overlaps between particle no. idx and all particles
	+ // from its own cell + 26 neighbor cells.
	+ // Calculate resulting normal- and shear-force components and
	+ // torque for the particle on the base of contactLinearViscous()
	+ for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
	+ for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
	+ for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
	+ targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
	+ overlapsInCell(targetPos, idx_a, x_a, radius_a,
	+ &N, &T, &es_dot, &p,
	+ dev_x_sorted, dev_radius_sorted,
	+ dev_vel_sorted, dev_angvel_sorted,
	+ dev_cellStart, dev_cellEnd,
	+ dev_w_nx, dev_w_mvfd);
	+ }
	+ }
	+ }
	+
	+ }
	+
	+ //// Interact with walls
	+ Float delta_w; // Overlap distance
	+ Float3 w_n; // Wall surface normal
	+ Float w_force = 0.0f; // Force on wall from particle A
	+
	+ // Upper wall (idx 0)
	+ delta_w = w_up_nx.w - (x_a.z + radius_a);
	+ w_n = MAKE_FLOAT3(0.0f, 0.0f, -1.0f);
	+ if (delta_w < 0.0f) {
	+ w_force = contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	+ dev_vel_sorted, dev_angvel_sorted,
	+ w_n, delta_w, w_up_mvfd.y);
	+ }
	+
	+ // Lower wall (force on wall not stored)
	+ delta_w = x_a.z - radius_a - origo.z;
	+ w_n = MAKE_FLOAT3(0.0f, 0.0f, 1.0f);
	+ if (delta_w < 0.0f) {
	+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	+ dev_vel_sorted, dev_angvel_sorted,
	+ w_n, delta_w, 0.0f);
	+ }
	+
	+
	+ if (devC_periodic == 0) {
	+
	+ // Left wall
	+ delta_w = x_a.x - radius_a - origo.x;
	+ w_n = MAKE_FLOAT3(1.0f, 0.0f, 0.0f);
	+ if (delta_w < 0.0f) {
	+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	+ dev_vel_sorted, dev_angvel_sorted,
	+ w_n, delta_w, 0.0f);
	+ }
	+
	+ // Right wall
	+ delta_w = L.x - (x_a.x + radius_a);
	+ w_n = MAKE_FLOAT3(-1.0f, 0.0f, 0.0f);
	+ if (delta_w < 0.0f) {
	+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	+ dev_vel_sorted, dev_angvel_sorted,
	+ w_n, delta_w, 0.0f);
	+ }
	+
	+ // Front wall
	+ delta_w = x_a.y - radius_a - origo.y;
	+ w_n = MAKE_FLOAT3(0.0f, 1.0f, 0.0f);
	+ if (delta_w < 0.0f) {
	+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	+ dev_vel_sorted, dev_angvel_sorted,
	+ w_n, delta_w, 0.0f);
	+ }
	+
	+ // Back wall
	+ delta_w = L.y - (x_a.y + radius_a);
	+ w_n = MAKE_FLOAT3(0.0f, -1.0f, 0.0f);
	+ if (delta_w < 0.0f) {
	+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	+ dev_vel_sorted, dev_angvel_sorted,
	+ w_n, delta_w, 0.0f);
	+ }
	+
	+ } else if (devC_periodic == 2) {
	+
	+ // Front wall
	+ delta_w = x_a.y - radius_a - origo.y;
	+ w_n = MAKE_FLOAT3(0.0f, 1.0f, 0.0f);
	+ if (delta_w < 0.0f) {
	+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	+ dev_vel_sorted, dev_angvel_sorted,
	+ w_n, delta_w, 0.0f);
	+ }
	+
	+ // Back wall
	+ delta_w = L.y - (x_a.y + radius_a);
	+ w_n = MAKE_FLOAT3(0.0f, -1.0f, 0.0f);
	+ if (delta_w < 0.0f) {
	+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
	+ dev_vel_sorted, dev_angvel_sorted,
	+ w_n, delta_w, 0.0f);
	+ }
	+ }
	+
	+
	+ // Hold threads for coalesced write
	+ __syncthreads();
	+
	+ // Write force to unsorted position
	+ unsigned int orig_idx = dev_gridParticleIndex[idx_a];
	+ dev_force[orig_idx] = MAKE_FLOAT4(N.x, N.y, N.z, 0.0f);
	+ dev_torque[orig_idx] = MAKE_FLOAT4(T.x, T.y, T.z, 0.0f);
	+ dev_es_dot[orig_idx] = es_dot;
	+ dev_es[orig_idx] += es_dot * devC_dt;
	+ dev_p[orig_idx] = p;
	+ dev_w_force[orig_idx] = w_force;
	+ }
	+} // End of interact(...)
	+
	+
	+#endif
	diff --git a/src/integration.cuh b/src/integration.cuh
	t@@ -0,0 +1,207 @@
	+#ifndef INTEGRATION_CUH_
	+#define INTEGRATION_CUH_
	+
	+// integration.cuh
	+// Functions responsible for temporal integration
	+
	+
	+// Second order integration scheme based on Taylor expansion of particle kinem…
	+// Kernel executed on device, and callable from host only.
	+__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Inp…
	+ Float4* dev_angvel_sorted, Float* dev_radius_sorted,…
	+ Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, …
	+ Float4* dev_force, Float4* dev_torque, // Input
	+ unsigned int* dev_gridParticleIndex) // Input: Sorte…
	+{
	+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
	+
	+ if (idx < devC_np) { // Condition prevents block size error
	+
	+ // Copy data to temporary arrays to avoid any potential read-after-write,
	+ // write-after-read, or write-after-write hazards.
	+ unsigned int orig_idx = dev_gridParticleIndex[idx];
	+ Float4 force = dev_force[orig_idx];
	+ Float4 torque = dev_torque[orig_idx];
	+
	+ // Initialize acceleration vectors to zero
	+ Float4 acc = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
	+ Float4 angacc = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
	+
	+ // Fetch particle position and velocity values from global read
	+ Float4 x = dev_x_sorted[idx];
	+ Float4 vel = dev_vel_sorted[idx];
	+ Float4 angvel = dev_angvel_sorted[idx];
	+ Float radius = dev_radius_sorted[idx];
	+
	+ // Coherent read from constant memory to registers
	+ Float dt = devC_dt;
	+ Float3 origo = MAKE_FLOAT3(devC_origo[0], devC_origo[1], devC_origo[2]);
	+ Float3 L = MAKE_FLOAT3(devC_L[0], devC_L[1], devC_L[2]);
	+ Float rho = devC_rho;
	+
	+ // Particle mass
	+ Float m = 4.0f/3.0f * PI * radiusradiusradius * rho;
	+
	+ // Update linear acceleration of particle
	+ acc.x = force.x / m;
	+ acc.y = force.y / m;
	+ acc.z = force.z / m;
	+
	+ // Update angular acceleration of particle
	+ // (angacc = (total moment)/Intertia, intertia = 2/5mr^2)
	+ angacc.x = torque.x * 1.0f / (2.0f/5.0f * m * radius*radius);
	+ angacc.y = torque.y * 1.0f / (2.0f/5.0f * m * radius*radius);
	+ angacc.z = torque.z * 1.0f / (2.0f/5.0f * m * radius*radius);
	+
	+ // Add gravity
	+ acc.x += devC_g[0];
	+ acc.y += devC_g[1];
	+ acc.z += devC_g[2];
	+
	+ // Check if particle has a fixed horizontal velocity
	+ if (vel.w > 0.0f) {
	+
	+ // Zero horizontal acceleration and disable
	+ // gravity to counteract segregation.
	+ // Particles may move in the z-dimension,
	+ // to allow for dilation.
	+ acc.x = 0.0f;
	+ acc.y = 0.0f;
	+ acc.z -= devC_g[2];
	+
	+ // Zero the angular acceleration
	+ angacc = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
	+ }
	+
	+ // Update angular velocity
	+ angvel.x += angacc.x * dt;
	+ angvel.y += angacc.y * dt;
	+ angvel.z += angacc.z * dt;
	+
	+ // Update linear velocity
	+ vel.x += acc.x * dt;
	+ vel.y += acc.y * dt;
	+ vel.z += acc.z * dt;
	+
	+ // Update position. First-order Euler's scheme:
	+ //x.x += vel.x * dt;
	+ //x.y += vel.y * dt;
	+ //x.z += vel.z * dt;
	+
	+ // Update position. Second-order scheme based on Taylor expansion
	+ // (greater accuracy than the first-order Euler's scheme)
	+ x.x += vel.x * dt + (acc.x * dt*dt)/2.0f;
	+ x.y += vel.y * dt + (acc.y * dt*dt)/2.0f;
	+ x.z += vel.z * dt + (acc.z * dt*dt)/2.0f;
	+
	+ // Add x-displacement for this time step to
	+ // sum of x-displacements
	+ x.w += vel.x * dt + (acc.x * dt*dt)/2.0f;
	+
	+ // Move particle across boundary if it is periodic
	+ if (devC_periodic == 1) {
	+ if (x.x < origo.x)
	+ x.x += L.x;
	+ if (x.x > L.x)
	+ x.x -= L.x;
	+ if (x.y < origo.y)
	+ x.y += L.y;
	+ if (x.y > L.y)
	+ x.y -= L.y;
	+ } else if (devC_periodic == 2) {
	+ if (x.x < origo.x)
	+ x.x += L.x;
	+ if (x.x > L.x)
	+ x.x -= L.x;
	+ }
	+
	+ // Hold threads for coalesced write
	+ __syncthreads();
	+
	+ // Store data in global memory at original, pre-sort positions
	+ dev_angvel[orig_idx] = angvel;
	+ dev_vel[orig_idx] = vel;
	+ dev_x[orig_idx] = x;
	+ }
	+} // End of integrate(...)
	+
	+
	+// Reduce wall force contributions from particles to a single value per wall
	+__global__ void summation(Float* in, Float *out)
	+{
	+ __shared__ Float cache[256];
	+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
	+ unsigned int cacheIdx = threadIdx.x;
	+
	+ Float temp = 0.0f;
	+ while (idx < devC_np) {
	+ temp += in[idx];
	+ idx += blockDim.x * gridDim.x;
	+ }
	+
	+ // Set the cache values
	+ cache[cacheIdx] = temp;
	+
	+ __syncthreads();
	+
	+ // For reductions, threadsPerBlock must be a power of two
	+ // because of the following code
	+ unsigned int i = blockDim.x/2;
	+ while (i != 0) {
	+ if (cacheIdx < i)
	+ cache[cacheIdx] += cache[cacheIdx + i];
	+ __syncthreads();
	+ i /= 2;
	+ }
	+
	+ // Write sum for block to global memory
	+ if (cacheIdx == 0)
	+ out[blockIdx.x] = cache[0];
	+}
	+
	+// Update wall positions
	+__global__ void integrateWalls(Float4* dev_w_nx,
	+ Float4* dev_w_mvfd,
	+ Float* dev_w_force_partial,
	+ unsigned int blocksPerGrid)
	+{
	+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
	+
	+ if (idx < devC_nw) { // Condition prevents block size error
	+
	+ // Copy data to temporary arrays to avoid any potential read-after-write,
	+ // write-after-read, or write-after-write hazards.
	+ Float4 w_nx = dev_w_nx[idx];
	+ Float4 w_mvfd = dev_w_mvfd[idx];
	+
	+ // Find the final sum of forces on wall
	+ w_mvfd.z = 0.0f;
	+ for (int i=0; i<blocksPerGrid; ++i) {
	+ w_mvfd.z += dev_w_force_partial[i];
	+ }
	+
	+ Float dt = devC_dt;
	+
	+ // Normal load = Deviatoric stress times wall surface area,
	+ // directed downwards.
	+ Float N = -w_mvfd.wdevC_L[0]devC_L[1];
	+
	+ // Calculate resulting acceleration of wall
	+ // (Wall mass is stored in w component of position Float4)
	+ Float acc = (w_mvfd.z+N)/w_mvfd.x;
	+
	+ // Update linear velocity
	+ w_mvfd.y += acc * dt;
	+
	+ // Update position. Second-order scheme based on Taylor expansion
	+ // (greater accuracy than the first-order Euler's scheme)
	+ w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;
	+
	+ // Store data in global memory
	+ dev_w_nx[idx] = w_nx;
	+ dev_w_mvfd[idx] = w_mvfd;
	+ }
	+} // End of integrateWalls(...)
	+
	+
	+#endif
	diff --git a/src/sorting.cuh b/src/sorting.cuh
	t@@ -0,0 +1,127 @@
	+// Returns the cellID containing the particle, based cubic grid
	+// See Bayraktar et al. 2009
	+// Kernel is executed on the device, and is callable from the device only
	+__device__ int calcCellID(Float3 x)
	+{
	+ unsigned int i_x, i_y, i_z;
	+
	+ // Calculate integral coordinates:
	+ i_x = floor((x.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
	+ i_y = floor((x.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
	+ i_z = floor((x.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
	+
	+ // Integral coordinates are converted to 1D coordinate:
	+ return __umul24(__umul24(i_z, devC_num[1]),
	+ devC_num[0])
	+ + __umul24(i_y, devC_num[0]) + i_x;
	+
	+} // End of calcCellID(...)
	+
	+
	+// Calculate hash value for each particle, based on position in grid.
	+// Kernel executed on device, and callable from host only.
	+__global__ void calcParticleCellID(unsigned int* dev_gridParticleCellID,
	+ unsigned int* dev_gridParticleIndex,
	+ Float4* dev_x)
	+{
	+ //unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
	+ unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	+
	+ if (idx < devC_np) { // Condition prevents block size error
	+
	+ //volatile Float4 x = dev_x[idx]; // Ensure coalesced read
	+ Float4 x = dev_x[idx]; // Ensure coalesced read
	+
	+ unsigned int cellID = calcCellID(MAKE_FLOAT3(x.x, x.y, x.z));
	+
	+ // Store values
	+ dev_gridParticleCellID[idx] = cellID;
	+ dev_gridParticleIndex[idx] = idx;
	+
	+ }
	+} // End of calcParticleCellID(...)
	+
	+
	+// Reorder particle data into sorted order, and find the start and end particl…
	+// of each cell in the sorted hash array.
	+// Kernel executed on device, and callable from host only.
	+__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_c…
	+ unsigned int* dev_gridParticleCellID,
	+ unsigned int* dev_gridParticleIndex,
	+ Float4* dev_x, Float4* dev_vel,
	+ Float4* dev_angvel, Float* dev_radius,
	+ //uint4* dev_bonds,
	+ Float4* dev_x_sorted, Float4* dev_vel_sorted,
	+ Float4* dev_angvel_sorted, Float* dev_radius_sor…
	+ //uint4* dev_bonds_sorted)
	+{
	+
	+ // Create hash array in shared on-chip memory. The size of the array
	+ // (threadsPerBlock + 1) is determined at launch time (extern notation).
	+ extern __shared__ unsigned int shared_data[];
	+
	+ // Thread index in block
	+ unsigned int tidx = threadIdx.x;
	+
	+ // Thread index in grid
	+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
	+ //unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	+
	+ // CellID hash value of particle idx
	+ unsigned int cellID;
	+
	+ // Read cellID data and store it in shared memory (shared_data)
	+ if (idx < devC_np) { // Condition prevents block size error
	+ cellID = dev_gridParticleCellID[idx];
	+
	+ // Load hash data into shared memory, allowing access to neighbor particle…
	+ shared_data[tidx+1] = cellID;
	+
	+ if (idx > 0 && tidx == 0) {
	+ // First thread in block must load neighbor particle hash
	+ shared_data[0] = dev_gridParticleCellID[idx-1];
	+ }
	+ }
	+
	+ // Pause completed threads in this block, until all
	+ // threads are done loading data into shared memory
	+ __syncthreads();
	+
	+ // Find lowest and highest particle index in each cell
	+ if (idx < devC_np) { // Condition prevents block size error
	+ // If this particle has a different cell index to the previous particle, i…
	+ // particle in the cell -> Store the index of this particle in the cell.
	+ // The previous particle must be the last particle in the previous cell.
	+ if (idx == 0 \|\| cellID != shared_data[tidx]) {
	+ dev_cellStart[cellID] = idx;
	+ if (idx > 0)
	+ dev_cellEnd[shared_data[tidx]] = idx;
	+ }
	+
	+ // Check wether the thread is the last one
	+ if (idx == (devC_np - 1))
	+ dev_cellEnd[cellID] = idx + 1;
	+
	+
	+ // Use the sorted index to reorder the position and velocity data
	+ unsigned int sortedIndex = dev_gridParticleIndex[idx];
	+
	+ // Fetch from global read
	+ Float4 x = dev_x[sortedIndex];
	+ Float4 vel = dev_vel[sortedIndex];
	+ Float4 angvel = dev_angvel[sortedIndex];
	+ Float radius = dev_radius[sortedIndex];
	+ //uint4 bonds = dev_bonds[sortedIndex];
	+
	+ __syncthreads();
	+ // Write sorted data to global memory
	+ dev_x_sorted[idx] = x;
	+ dev_vel_sorted[idx] = vel;
	+ dev_angvel_sorted[idx] = angvel;
	+ dev_radius_sorted[idx] = radius;
	+ //dev_bonds_sorted[idx] = bonds;
	+ }
	+} // End of reorderArrays(...)
	+
	+
	+
	diff --git a/src/vector_arithmetic.h b/src/vector_arithmetic.h
	t@@ -0,0 +1,994 @@
	+#ifndef VECTOR_ARITHMETIC_H_
	+#define VECTOR_ARITHMETIC_H_
	+
	+#include "cuda_runtime.h"
	+#include "datatypes.h"
	+
	+typedef unsigned int uint;
	+typedef unsigned short ushort;
	+
	+#ifndef __CUDACC__
	+#include <math.h>
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// host implementations of CUDA functions
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline Float fminf(Float a, Float b)
	+{
	+ return a < b ? a : b;
	+}
	+
	+inline Float fmaxf(Float a, Float b)
	+{
	+ return a > b ? a : b;
	+}
	+
	+inline int max(int a, int b)
	+{
	+ return a > b ? a : b;
	+}
	+
	+inline int min(int a, int b)
	+{
	+ return a < b ? a : b;
	+}
	+
	+inline Float rsqrtf(Float x)
	+{
	+ return 1.0f / sqrtf(x);
	+}
	+#endif
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// constructors
	+//////////////////////////////////////////////////////////////////////////////…
	+/*inline __host__ __device__ Float3 MAKE_FLOAT3(Float s)
	+{
	+ return MAKE_FLOAT3(s, s, s);
	+}
	+inline __host__ __device__ Float3 MAKE_FLOAT3(Float4 a)
	+{
	+ return MAKE_FLOAT3(a.x, a.y, a.z);
	+}
	+inline __host__ __device__ Float3 MAKE_FLOAT3(int3 a)
	+{
	+ return MAKE_FLOAT3(Float(a.x), Float(a.y), Float(a.z));
	+}
	+inline __host__ __device__ Float3 MAKE_FLOAT3(uint3 a)
	+{
	+ return MAKE_FLOAT3(Float(a.x), Float(a.y), Float(a.z));
	+}
	+
	+inline __host__ __device__ Float4 MAKE_FLOAT4(Float s)
	+{
	+ return MAKE_FLOAT4(s, s, s, s);
	+}
	+inline __host__ __device__ Float4 MAKE_FLOAT4(Float3 a)
	+{
	+ return MAKE_FLOAT4(a.x, a.y, a.z, 0.0f);
	+}
	+inline __host__ __device__ Float4 MAKE_FLOAT4(Float3 a, Float w)
	+{
	+ return MAKE_FLOAT4(a.x, a.y, a.z, w);
	+}
	+inline __host__ __device__ Float4 MAKE_FLOAT4(int4 a)
	+{
	+ return MAKE_FLOAT4(Float(a.x), Float(a.y), Float(a.z), Float(a.w));
	+}
	+inline __host__ __device__ Float4 MAKE_FLOAT4(uint4 a)
	+{
	+ return MAKE_FLOAT4(Float(a.x), Float(a.y), Float(a.z), Float(a.w));
	+}
	+*/
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// negate
	+//////////////////////////////////////////////////////////////////////////////…
	+inline __host__ __device__ Float3 operator-(Float3 &a)
	+{
	+ return MAKE_FLOAT3(-a.x, -a.y, -a.z);
	+}
	+inline __host__ __device__ Float4 operator-(Float4 &a)
	+{
	+ return MAKE_FLOAT4(-a.x, -a.y, -a.z, -a.w);
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// addition
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ int2 operator+(int2 a, int2 b)
	+{
	+ return make_int2(a.x + b.x, a.y + b.y);
	+}
	+inline __host__ __device__ void operator+=(int2 &a, int2 b)
	+{
	+ a.x += b.x; a.y += b.y;
	+}
	+inline __host__ __device__ int2 operator+(int2 a, int b)
	+{
	+ return make_int2(a.x + b, a.y + b);
	+}
	+inline __host__ __device__ int2 operator+(int b, int2 a)
	+{
	+ return make_int2(a.x + b, a.y + b);
	+}
	+inline __host__ __device__ void operator+=(int2 &a, int b)
	+{
	+ a.x += b; a.y += b;
	+}
	+
	+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
	+{
	+ return make_uint2(a.x + b.x, a.y + b.y);
	+}
	+inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
	+{
	+ a.x += b.x; a.y += b.y;
	+}
	+inline __host__ __device__ uint2 operator+(uint2 a, uint b)
	+{
	+ return make_uint2(a.x + b, a.y + b);
	+}
	+inline __host__ __device__ uint2 operator+(uint b, uint2 a)
	+{
	+ return make_uint2(a.x + b, a.y + b);
	+}
	+inline __host__ __device__ void operator+=(uint2 &a, uint b)
	+{
	+ a.x += b; a.y += b;
	+}
	+
	+
	+inline __host__ __device__ Float3 operator+(Float3 a, Float3 b)
	+{
	+ return MAKE_FLOAT3(a.x + b.x, a.y + b.y, a.z + b.z);
	+}
	+inline __host__ __device__ void operator+=(Float3 &a, Float3 b)
	+{
	+ a.x += b.x; a.y += b.y; a.z += b.z;
	+}
	+inline __host__ __device__ Float3 operator+(Float3 a, Float b)
	+{
	+ return MAKE_FLOAT3(a.x + b, a.y + b, a.z + b);
	+}
	+inline __host__ __device__ void operator+=(Float3 &a, Float b)
	+{
	+ a.x += b; a.y += b; a.z += b;
	+}
	+
	+inline __host__ __device__ int3 operator+(int3 a, int3 b)
	+{
	+ return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
	+}
	+inline __host__ __device__ void operator+=(int3 &a, int3 b)
	+{
	+ a.x += b.x; a.y += b.y; a.z += b.z;
	+}
	+inline __host__ __device__ int3 operator+(int3 a, int b)
	+{
	+ return make_int3(a.x + b, a.y + b, a.z + b);
	+}
	+inline __host__ __device__ void operator+=(int3 &a, int b)
	+{
	+ a.x += b; a.y += b; a.z += b;
	+}
	+
	+inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
	+{
	+ return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
	+}
	+inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
	+{
	+ a.x += b.x; a.y += b.y; a.z += b.z;
	+}
	+inline __host__ __device__ uint3 operator+(uint3 a, uint b)
	+{
	+ return make_uint3(a.x + b, a.y + b, a.z + b);
	+}
	+inline __host__ __device__ void operator+=(uint3 &a, uint b)
	+{
	+ a.x += b; a.y += b; a.z += b;
	+}
	+
	+inline __host__ __device__ int3 operator+(int b, int3 a)
	+{
	+ return make_int3(a.x + b, a.y + b, a.z + b);
	+}
	+inline __host__ __device__ uint3 operator+(uint b, uint3 a)
	+{
	+ return make_uint3(a.x + b, a.y + b, a.z + b);
	+}
	+inline __host__ __device__ Float3 operator+(Float b, Float3 a)
	+{
	+ return MAKE_FLOAT3(a.x + b, a.y + b, a.z + b);
	+}
	+
	+inline __host__ __device__ Float4 operator+(Float4 a, Float4 b)
	+{
	+ return MAKE_FLOAT4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
	+}
	+inline __host__ __device__ void operator+=(Float4 &a, Float4 b)
	+{
	+ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
	+}
	+inline __host__ __device__ Float4 operator+(Float4 a, Float b)
	+{
	+ return MAKE_FLOAT4(a.x + b, a.y + b, a.z + b, a.w + b);
	+}
	+inline __host__ __device__ Float4 operator+(Float b, Float4 a)
	+{
	+ return MAKE_FLOAT4(a.x + b, a.y + b, a.z + b, a.w + b);
	+}
	+inline __host__ __device__ void operator+=(Float4 &a, Float b)
	+{
	+ a.x += b; a.y += b; a.z += b; a.w += b;
	+}
	+
	+inline __host__ __device__ int4 operator+(int4 a, int4 b)
	+{
	+ return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
	+}
	+inline __host__ __device__ void operator+=(int4 &a, int4 b)
	+{
	+ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
	+}
	+inline __host__ __device__ int4 operator+(int4 a, int b)
	+{
	+ return make_int4(a.x + b, a.y + b, a.z + b, a.w + b);
	+}
	+inline __host__ __device__ int4 operator+(int b, int4 a)
	+{
	+ return make_int4(a.x + b, a.y + b, a.z + b, a.w + b);
	+}
	+inline __host__ __device__ void operator+=(int4 &a, int b)
	+{
	+ a.x += b; a.y += b; a.z += b; a.w += b;
	+}
	+
	+inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
	+{
	+ return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
	+}
	+inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
	+{
	+ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
	+}
	+inline __host__ __device__ uint4 operator+(uint4 a, uint b)
	+{
	+ return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b);
	+}
	+inline __host__ __device__ uint4 operator+(uint b, uint4 a)
	+{
	+ return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b);
	+}
	+inline __host__ __device__ void operator+=(uint4 &a, uint b)
	+{
	+ a.x += b; a.y += b; a.z += b; a.w += b;
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// subtract
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+
	+inline __host__ __device__ int2 operator-(int2 a, int2 b)
	+{
	+ return make_int2(a.x - b.x, a.y - b.y);
	+}
	+inline __host__ __device__ void operator-=(int2 &a, int2 b)
	+{
	+ a.x -= b.x; a.y -= b.y;
	+}
	+inline __host__ __device__ int2 operator-(int2 a, int b)
	+{
	+ return make_int2(a.x - b, a.y - b);
	+}
	+inline __host__ __device__ int2 operator-(int b, int2 a)
	+{
	+ return make_int2(b - a.x, b - a.y);
	+}
	+inline __host__ __device__ void operator-=(int2 &a, int b)
	+{
	+ a.x -= b; a.y -= b;
	+}
	+
	+inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
	+{
	+ return make_uint2(a.x - b.x, a.y - b.y);
	+}
	+inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
	+{
	+ a.x -= b.x; a.y -= b.y;
	+}
	+inline __host__ __device__ uint2 operator-(uint2 a, uint b)
	+{
	+ return make_uint2(a.x - b, a.y - b);
	+}
	+inline __host__ __device__ uint2 operator-(uint b, uint2 a)
	+{
	+ return make_uint2(b - a.x, b - a.y);
	+}
	+inline __host__ __device__ void operator-=(uint2 &a, uint b)
	+{
	+ a.x -= b; a.y -= b;
	+}
	+
	+inline __host__ __device__ Float3 operator-(Float3 a, Float3 b)
	+{
	+ return MAKE_FLOAT3(a.x - b.x, a.y - b.y, a.z - b.z);
	+}
	+inline __host__ __device__ void operator-=(Float3 &a, Float3 b)
	+{
	+ a.x -= b.x; a.y -= b.y; a.z -= b.z;
	+}
	+inline __host__ __device__ Float3 operator-(Float3 a, Float b)
	+{
	+ return MAKE_FLOAT3(a.x - b, a.y - b, a.z - b);
	+}
	+inline __host__ __device__ Float3 operator-(Float b, Float3 a)
	+{
	+ return MAKE_FLOAT3(b - a.x, b - a.y, b - a.z);
	+}
	+inline __host__ __device__ void operator-=(Float3 &a, Float b)
	+{
	+ a.x -= b; a.y -= b; a.z -= b;
	+}
	+
	+inline __host__ __device__ int3 operator-(int3 a, int3 b)
	+{
	+ return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
	+}
	+inline __host__ __device__ void operator-=(int3 &a, int3 b)
	+{
	+ a.x -= b.x; a.y -= b.y; a.z -= b.z;
	+}
	+inline __host__ __device__ int3 operator-(int3 a, int b)
	+{
	+ return make_int3(a.x - b, a.y - b, a.z - b);
	+}
	+inline __host__ __device__ int3 operator-(int b, int3 a)
	+{
	+ return make_int3(b - a.x, b - a.y, b - a.z);
	+}
	+inline __host__ __device__ void operator-=(int3 &a, int b)
	+{
	+ a.x -= b; a.y -= b; a.z -= b;
	+}
	+
	+inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
	+{
	+ return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
	+}
	+inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
	+{
	+ a.x -= b.x; a.y -= b.y; a.z -= b.z;
	+}
	+inline __host__ __device__ uint3 operator-(uint3 a, uint b)
	+{
	+ return make_uint3(a.x - b, a.y - b, a.z - b);
	+}
	+inline __host__ __device__ uint3 operator-(uint b, uint3 a)
	+{
	+ return make_uint3(b - a.x, b - a.y, b - a.z);
	+}
	+inline __host__ __device__ void operator-=(uint3 &a, uint b)
	+{
	+ a.x -= b; a.y -= b; a.z -= b;
	+}
	+
	+inline __host__ __device__ Float4 operator-(Float4 a, Float4 b)
	+{
	+ return MAKE_FLOAT4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
	+}
	+inline __host__ __device__ void operator-=(Float4 &a, Float4 b)
	+{
	+ a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
	+}
	+inline __host__ __device__ Float4 operator-(Float4 a, Float b)
	+{
	+ return MAKE_FLOAT4(a.x - b, a.y - b, a.z - b, a.w - b);
	+}
	+inline __host__ __device__ void operator-=(Float4 &a, Float b)
	+{
	+ a.x -= b; a.y -= b; a.z -= b; a.w -= b;
	+}
	+
	+inline __host__ __device__ int4 operator-(int4 a, int4 b)
	+{
	+ return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
	+}
	+inline __host__ __device__ void operator-=(int4 &a, int4 b)
	+{
	+ a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
	+}
	+inline __host__ __device__ int4 operator-(int4 a, int b)
	+{
	+ return make_int4(a.x - b, a.y - b, a.z - b, a.w - b);
	+}
	+inline __host__ __device__ int4 operator-(int b, int4 a)
	+{
	+ return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
	+}
	+inline __host__ __device__ void operator-=(int4 &a, int b)
	+{
	+ a.x -= b; a.y -= b; a.z -= b; a.w -= b;
	+}
	+
	+inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
	+{
	+ return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
	+}
	+inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
	+{
	+ a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
	+}
	+inline __host__ __device__ uint4 operator-(uint4 a, uint b)
	+{
	+ return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b);
	+}
	+inline __host__ __device__ uint4 operator-(uint b, uint4 a)
	+{
	+ return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
	+}
	+inline __host__ __device__ void operator-=(uint4 &a, uint b)
	+{
	+ a.x -= b; a.y -= b; a.z -= b; a.w -= b;
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// multiply
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ int2 operator*(int2 a, int2 b)
	+{
	+ return make_int2(a.x * b.x, a.y * b.y);
	+}
	+inline __host__ __device__ void operator*=(int2 &a, int2 b)
	+{
	+ a.x = b.x; a.y = b.y;
	+}
	+inline __host__ __device__ int2 operator*(int2 a, int b)
	+{
	+ return make_int2(a.x * b, a.y * b);
	+}
	+inline __host__ __device__ int2 operator*(int b, int2 a)
	+{
	+ return make_int2(b * a.x, b * a.y);
	+}
	+inline __host__ __device__ void operator*=(int2 &a, int b)
	+{
	+ a.x = b; a.y = b;
	+}
	+
	+inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
	+{
	+ return make_uint2(a.x * b.x, a.y * b.y);
	+}
	+inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
	+{
	+ a.x = b.x; a.y = b.y;
	+}
	+inline __host__ __device__ uint2 operator*(uint2 a, uint b)
	+{
	+ return make_uint2(a.x * b, a.y * b);
	+}
	+inline __host__ __device__ uint2 operator*(uint b, uint2 a)
	+{
	+ return make_uint2(b * a.x, b * a.y);
	+}
	+inline __host__ __device__ void operator*=(uint2 &a, uint b)
	+{
	+ a.x = b; a.y = b;
	+}
	+
	+inline __host__ __device__ Float3 operator*(Float3 a, Float3 b)
	+{
	+ return MAKE_FLOAT3(a.x * b.x, a.y * b.y, a.z * b.z);
	+}
	+inline __host__ __device__ void operator*=(Float3 &a, Float3 b)
	+{
	+ a.x = b.x; a.y = b.y; a.z *= b.z;
	+}
	+inline __host__ __device__ Float3 operator*(Float3 a, Float b)
	+{
	+ return MAKE_FLOAT3(a.x * b, a.y * b, a.z * b);
	+}
	+inline __host__ __device__ Float3 operator*(Float b, Float3 a)
	+{
	+ return MAKE_FLOAT3(b * a.x, b * a.y, b * a.z);
	+}
	+inline __host__ __device__ void operator*=(Float3 &a, Float b)
	+{
	+ a.x = b; a.y = b; a.z *= b;
	+}
	+
	+inline __host__ __device__ int3 operator*(int3 a, int3 b)
	+{
	+ return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
	+}
	+inline __host__ __device__ void operator*=(int3 &a, int3 b)
	+{
	+ a.x = b.x; a.y = b.y; a.z *= b.z;
	+}
	+inline __host__ __device__ int3 operator*(int3 a, int b)
	+{
	+ return make_int3(a.x * b, a.y * b, a.z * b);
	+}
	+inline __host__ __device__ int3 operator*(int b, int3 a)
	+{
	+ return make_int3(b * a.x, b * a.y, b * a.z);
	+}
	+inline __host__ __device__ void operator*=(int3 &a, int b)
	+{
	+ a.x = b; a.y = b; a.z *= b;
	+}
	+
	+inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
	+{
	+ return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
	+}
	+inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
	+{
	+ a.x = b.x; a.y = b.y; a.z *= b.z;
	+}
	+inline __host__ __device__ uint3 operator*(uint3 a, uint b)
	+{
	+ return make_uint3(a.x * b, a.y * b, a.z * b);
	+}
	+inline __host__ __device__ uint3 operator*(uint b, uint3 a)
	+{
	+ return make_uint3(b * a.x, b * a.y, b * a.z);
	+}
	+inline __host__ __device__ void operator*=(uint3 &a, uint b)
	+{
	+ a.x = b; a.y = b; a.z *= b;
	+}
	+
	+inline __host__ __device__ Float4 operator*(Float4 a, Float4 b)
	+{
	+ return MAKE_FLOAT4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
	+}
	+inline __host__ __device__ void operator*=(Float4 &a, Float4 b)
	+{
	+ a.x = b.x; a.y = b.y; a.z = b.z; a.w = b.w;
	+}
	+inline __host__ __device__ Float4 operator*(Float4 a, Float b)
	+{
	+ return MAKE_FLOAT4(a.x * b, a.y * b, a.z * b, a.w * b);
	+}
	+inline __host__ __device__ Float4 operator*(Float b, Float4 a)
	+{
	+ return MAKE_FLOAT4(b * a.x, b * a.y, b * a.z, b * a.w);
	+}
	+inline __host__ __device__ void operator*=(Float4 &a, Float b)
	+{
	+ a.x = b; a.y = b; a.z = b; a.w = b;
	+}
	+
	+inline __host__ __device__ int4 operator*(int4 a, int4 b)
	+{
	+ return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
	+}
	+inline __host__ __device__ void operator*=(int4 &a, int4 b)
	+{
	+ a.x = b.x; a.y = b.y; a.z = b.z; a.w = b.w;
	+}
	+inline __host__ __device__ int4 operator*(int4 a, int b)
	+{
	+ return make_int4(a.x * b, a.y * b, a.z * b, a.w * b);
	+}
	+inline __host__ __device__ int4 operator*(int b, int4 a)
	+{
	+ return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
	+}
	+inline __host__ __device__ void operator*=(int4 &a, int b)
	+{
	+ a.x = b; a.y = b; a.z = b; a.w = b;
	+}
	+
	+inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
	+{
	+ return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
	+}
	+inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
	+{
	+ a.x = b.x; a.y = b.y; a.z = b.z; a.w = b.w;
	+}
	+inline __host__ __device__ uint4 operator*(uint4 a, uint b)
	+{
	+ return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b);
	+}
	+inline __host__ __device__ uint4 operator*(uint b, uint4 a)
	+{
	+ return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
	+}
	+inline __host__ __device__ void operator*=(uint4 &a, uint b)
	+{
	+ a.x = b; a.y = b; a.z = b; a.w = b;
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// divide
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float3 operator/(Float3 a, Float3 b)
	+{
	+ return MAKE_FLOAT3(a.x / b.x, a.y / b.y, a.z / b.z);
	+}
	+inline __host__ __device__ void operator/=(Float3 &a, Float3 b)
	+{
	+ a.x /= b.x; a.y /= b.y; a.z /= b.z;
	+}
	+inline __host__ __device__ Float3 operator/(Float3 a, Float b)
	+{
	+ return MAKE_FLOAT3(a.x / b, a.y / b, a.z / b);
	+}
	+inline __host__ __device__ void operator/=(Float3 &a, Float b)
	+{
	+ a.x /= b; a.y /= b; a.z /= b;
	+}
	+inline __host__ __device__ Float3 operator/(Float b, Float3 a)
	+{
	+ return MAKE_FLOAT3(b / a.x, b / a.y, b / a.z);
	+}
	+
	+inline __host__ __device__ Float4 operator/(Float4 a, Float4 b)
	+{
	+ return MAKE_FLOAT4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
	+}
	+inline __host__ __device__ void operator/=(Float4 &a, Float4 b)
	+{
	+ a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
	+}
	+inline __host__ __device__ Float4 operator/(Float4 a, Float b)
	+{
	+ return MAKE_FLOAT4(a.x / b, a.y / b, a.z / b, a.w / b);
	+}
	+inline __host__ __device__ void operator/=(Float4 &a, Float b)
	+{
	+ a.x /= b; a.y /= b; a.z /= b; a.w /= b;
	+}
	+inline __host__ __device__ Float4 operator/(Float b, Float4 a){
	+ return MAKE_FLOAT4(b / a.x, b / a.y, b / a.z, b / a.w);
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// min
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float3 fminf(Float3 a, Float3 b)
	+{
	+ return MAKE_FLOAT3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
	+}
	+inline __host__ __device__ Float4 fminf(Float4 a, Float4 b)
	+{
	+ return MAKE_FLOAT4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fmi…
	+}
	+
	+inline __host__ __device__ int2 min(int2 a, int2 b)
	+{
	+ return make_int2(min(a.x,b.x), min(a.y,b.y));
	+}
	+inline __host__ __device__ int3 min(int3 a, int3 b)
	+{
	+ return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
	+}
	+inline __host__ __device__ int4 min(int4 a, int4 b)
	+{
	+ return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
	+}
	+
	+inline __host__ __device__ uint2 min(uint2 a, uint2 b)
	+{
	+ return make_uint2(min(a.x,b.x), min(a.y,b.y));
	+}
	+inline __host__ __device__ uint3 min(uint3 a, uint3 b)
	+{
	+ return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
	+}
	+inline __host__ __device__ uint4 min(uint4 a, uint4 b)
	+{
	+ return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// max
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float3 fmaxf(Float3 a, Float3 b)
	+{
	+ return MAKE_FLOAT3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
	+}
	+inline __host__ __device__ Float4 fmaxf(Float4 a, Float4 b)
	+{
	+ return MAKE_FLOAT4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fma…
	+}
	+
	+inline __host__ __device__ int2 max(int2 a, int2 b)
	+{
	+ return make_int2(max(a.x,b.x), max(a.y,b.y));
	+}
	+inline __host__ __device__ int3 max(int3 a, int3 b)
	+{
	+ return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
	+}
	+inline __host__ __device__ int4 max(int4 a, int4 b)
	+{
	+ return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
	+}
	+
	+inline __host__ __device__ uint2 max(uint2 a, uint2 b)
	+{
	+ return make_uint2(max(a.x,b.x), max(a.y,b.y));
	+}
	+inline __host__ __device__ uint3 max(uint3 a, uint3 b)
	+{
	+ return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
	+}
	+inline __host__ __device__ uint4 max(uint4 a, uint4 b)
	+{
	+ return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// lerp
	+// - linear interpolation between a and b, based on value t in [0, 1] range
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __device__ __host__ Float lerp(Float a, Float b, Float t)
	+{
	+ return a + t*(b-a);
	+}
	+inline __device__ __host__ Float3 lerp(Float3 a, Float3 b, Float t)
	+{
	+ return a + t*(b-a);
	+}
	+inline __device__ __host__ Float4 lerp(Float4 a, Float4 b, Float t)
	+{
	+ return a + t*(b-a);
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// clamp
	+// - clamp the value v to be in the range [a, b]
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __device__ __host__ Float clamp(Float f, Float a, Float b)
	+{
	+ return fmaxf(a, fminf(f, b));
	+}
	+inline __device__ __host__ int clamp(int f, int a, int b)
	+{
	+ return max(a, min(f, b));
	+}
	+inline __device__ __host__ uint clamp(uint f, uint a, uint b)
	+{
	+ return max(a, min(f, b));
	+}
	+
	+inline __device__ __host__ Float3 clamp(Float3 v, Float a, Float b)
	+{
	+ return MAKE_FLOAT3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
	+}
	+inline __device__ __host__ Float3 clamp(Float3 v, Float3 a, Float3 b)
	+{
	+ return MAKE_FLOAT3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, …
	+}
	+inline __device__ __host__ Float4 clamp(Float4 v, Float a, Float b)
	+{
	+ return MAKE_FLOAT4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), c…
	+}
	+inline __device__ __host__ Float4 clamp(Float4 v, Float4 a, Float4 b)
	+{
	+ return MAKE_FLOAT4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, …
	+}
	+
	+inline __device__ __host__ int2 clamp(int2 v, int a, int b)
	+{
	+ return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
	+}
	+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
	+{
	+ return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
	+}
	+inline __device__ __host__ int3 clamp(int3 v, int a, int b)
	+{
	+ return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
	+}
	+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
	+{
	+ return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.…
	+}
	+inline __device__ __host__ int4 clamp(int4 v, int a, int b)
	+{
	+ return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), cla…
	+}
	+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
	+{
	+ return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.…
	+}
	+
	+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
	+{
	+ return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
	+}
	+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
	+{
	+ return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
	+}
	+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
	+{
	+ return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
	+}
	+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
	+{
	+ return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a…
	+}
	+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
	+{
	+ return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), cl…
	+}
	+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
	+{
	+ return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a…
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// dot product
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float dot(Float3 a, Float3 b)
	+{
	+ return a.x * b.x + a.y * b.y + a.z * b.z;
	+}
	+inline __host__ __device__ Float dot(Float4 a, Float4 b)
	+{
	+ return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
	+}
	+
	+inline __host__ __device__ int dot(int2 a, int2 b)
	+{
	+ return a.x * b.x + a.y * b.y;
	+}
	+inline __host__ __device__ int dot(int3 a, int3 b)
	+{
	+ return a.x * b.x + a.y * b.y + a.z * b.z;
	+}
	+inline __host__ __device__ int dot(int4 a, int4 b)
	+{
	+ return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
	+}
	+
	+inline __host__ __device__ uint dot(uint2 a, uint2 b)
	+{
	+ return a.x * b.x + a.y * b.y;
	+}
	+inline __host__ __device__ uint dot(uint3 a, uint3 b)
	+{
	+ return a.x * b.x + a.y * b.y + a.z * b.z;
	+}
	+inline __host__ __device__ uint dot(uint4 a, uint4 b)
	+{
	+ return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// length
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float length(Float3 v)
	+{
	+ return sqrtf(dot(v, v));
	+}
	+inline __host__ __device__ Float length(Float4 v)
	+{
	+ return sqrtf(dot(v, v));
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// normalize
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float3 normalize(Float3 v)
	+{
	+ Float invLen = rsqrtf(dot(v, v));
	+ return v * invLen;
	+}
	+inline __host__ __device__ Float4 normalize(Float4 v)
	+{
	+ Float invLen = rsqrtf(dot(v, v));
	+ return v * invLen;
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// floor
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float3 floorf(Float3 v)
	+{
	+ return MAKE_FLOAT3(floorf(v.x), floorf(v.y), floorf(v.z));
	+}
	+inline __host__ __device__ Float4 floorf(Float4 v)
	+{
	+ return MAKE_FLOAT4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// frac - returns the fractional portion of a scalar or each vector component
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float fracf(Float v)
	+{
	+ return v - floorf(v);
	+}
	+inline __host__ __device__ Float3 fracf(Float3 v)
	+{
	+ return MAKE_FLOAT3(fracf(v.x), fracf(v.y), fracf(v.z));
	+}
	+inline __host__ __device__ Float4 fracf(Float4 v)
	+{
	+ return MAKE_FLOAT4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// fmod
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float3 fmodf(Float3 a, Float3 b)
	+{
	+ return MAKE_FLOAT3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
	+}
	+inline __host__ __device__ Float4 fmodf(Float4 a, Float4 b)
	+{
	+ return MAKE_FLOAT4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmod…
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// absolute value
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float3 fabs(Float3 v)
	+{
	+ return MAKE_FLOAT3(fabs(v.x), fabs(v.y), fabs(v.z));
	+}
	+inline __host__ __device__ Float4 fabs(Float4 v)
	+{
	+ return MAKE_FLOAT4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
	+}
	+
	+inline __host__ __device__ int2 abs(int2 v)
	+{
	+ return make_int2(abs(v.x), abs(v.y));
	+}
	+inline __host__ __device__ int3 abs(int3 v)
	+{
	+ return make_int3(abs(v.x), abs(v.y), abs(v.z));
	+}
	+inline __host__ __device__ int4 abs(int4 v)
	+{
	+ return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// reflect
	+// - returns reflection of incident ray I around surface normal N
	+// - N should be normalized, reflected vector's length is equal to length of I
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float3 reflect(Float3 i, Float3 n)
	+{
	+ return i - 2.0f * n * dot(n,i);
	+}
	+
	+//////////////////////////////////////////////////////////////////////////////…
	+// cross product
	+//////////////////////////////////////////////////////////////////////////////…
	+
	+inline __host__ __device__ Float3 cross(Float3 a, Float3 b)
	+{
	+ return MAKE_FLOAT3(a.yb.z - a.zb.y, a.zb.x - a.xb.z, a.xb.y - a.yb.x…
	+}
	+
	+#endif