| tRemoved legacy raytracer folder, moved rt doc into main doc - sphere - GPU-bas… | |
| git clone git://src.adamsgaard.dk/sphere | |
| Log | |
| Files | |
| Refs | |
| LICENSE | |
| --- | |
| commit d30540ae62abcfd984b3eda396cf534e72aaad66 | |
| parent 5cd210a9136c5c2dfee90711cedf294fc6f0484e | |
| Author: Anders Damsgaard <[email protected]> | |
| Date: Tue, 18 Dec 2012 13:58:29 +0100 | |
| Removed legacy raytracer folder, moved rt doc into main doc | |
| Diffstat: | |
| R raytracer/doc/IEEEtran.cls -> doc/… | 0 | |
| R raytracer/doc/np-performance.pdf -… | 0 | |
| R raytracer/doc/np-performance.txt -… | 0 | |
| R raytracer/doc/np500.png -> doc/ray… | 0 | |
| R raytracer/doc/np50000.png -> doc/r… | 0 | |
| R raytracer/doc/px-performance.pdf -… | 0 | |
| R raytracer/doc/px-performance.txt -… | 0 | |
| A doc/raytracer/raytracer.pdf | 0 | |
| A doc/raytracer/raytracer.tex | 264 +++++++++++++++++++++++++++++… | |
| D raytracer/Makefile | 101 -----------------------------… | |
| D raytracer/README | 43 ------------------------------ | |
| D raytracer/colorbar.h | 22 ---------------------- | |
| D raytracer/doc/.DS_Store | 0 | |
| D raytracer/doc/.raytracer.tex.swp | 0 | |
| D raytracer/doc/bloopers/black.png | 0 | |
| D raytracer/doc/bloopers/colorful.png | 0 | |
| D raytracer/doc/bloopers/cpu_wrong_c… | 0 | |
| D raytracer/doc/bloopers/twisted.png | 0 | |
| D raytracer/doc/bloopers/wrong_order… | 0 | |
| D raytracer/doc/raytracer.pdf | 0 | |
| D raytracer/doc/raytracer.tex | 317 -----------------------------… | |
| D raytracer/header.h | 26 -------------------------- | |
| D raytracer/main.cpp | 264 -----------------------------… | |
| D raytracer/o-ppm.cpp | 30 ------------------------------ | |
| D raytracer/o-ppm.h | 7 ------- | |
| D raytracer/render_all_outputs_CPU.sh | 27 --------------------------- | |
| D raytracer/render_all_outputs_GPU.sh | 29 ----------------------------- | |
| D raytracer/rt-kernel-cpu.cpp | 273 -----------------------------… | |
| D raytracer/rt-kernel-cpu.h | 14 -------------- | |
| D raytracer/rt-kernel.cu | 454 -----------------------------… | |
| D raytracer/rt-kernel.h | 35 -----------------------------… | |
| D raytracer/rt_GPU_init_pres.sh | 36 -----------------------------… | |
| D raytracer/rt_GPU_pres.sh | 36 -----------------------------… | |
| D raytracer/rt_GPU_tall_pres.sh | 36 -----------------------------… | |
| 34 files changed, 264 insertions(+), 1750 deletions(-) | |
| --- | |
| diff --git a/raytracer/doc/IEEEtran.cls b/doc/raytracer/IEEEtran.cls | |
| diff --git a/raytracer/doc/np-performance.pdf b/doc/raytracer/np-performance.pdf | |
| Binary files differ. | |
| diff --git a/raytracer/doc/np-performance.txt b/doc/raytracer/np-performance.txt | |
| diff --git a/raytracer/doc/np500.png b/doc/raytracer/np500.png | |
| Binary files differ. | |
| diff --git a/raytracer/doc/np50000.png b/doc/raytracer/np50000.png | |
| Binary files differ. | |
| diff --git a/raytracer/doc/px-performance.pdf b/doc/raytracer/px-performance.pdf | |
| Binary files differ. | |
| diff --git a/raytracer/doc/px-performance.txt b/doc/raytracer/px-performance.txt | |
| diff --git a/doc/raytracer/raytracer.pdf b/doc/raytracer/raytracer.pdf | |
| Binary files differ. | |
| diff --git a/doc/raytracer/raytracer.tex b/doc/raytracer/raytracer.tex | |
| t@@ -0,0 +1,264 @@ | |
| +\documentclass[journal]{IEEEtran} | |
| + | |
| + | |
| +\ifCLASSINFOpdf | |
| + \usepackage[pdftex]{graphicx} | |
| +\else | |
| + \usepackage[dvips]{graphicx} | |
| +\fi | |
| + | |
| +\usepackage{color} | |
| + | |
| +\usepackage{url} | |
| +%\usepackage{showkeys} | |
| +\usepackage{siunitx} | |
| + | |
| +% correct bad hyphenation here | |
| +\hyphenation{op-tical net-works semi-conduc-tor} | |
| + | |
| +\definecolor{DarkGreen}{rgb}{0,0.45,0.08} | |
| +\usepackage{listings} | |
| +\lstset{language=C++,breaklines=true,breakatwhitespace=true,basicstyle=\small,… | |
| + | |
| + | |
| +\begin{document} | |
| +\title{CUDA raytracing algorithm for visualizing\\ discrete element model outp… | |
| + | |
| +\author{Anders Damsgaard Christensen,~\IEEEmembership{20062213} | |
| +\thanks{Contact: {[email protected]}}% | |
| +\thanks{Webpage: {http://users-cs.au.dk/adc}}% | |
| +\thanks{Manuscript, last revision: \today.}} | |
| + | |
| +% The paper headers | |
| +\markboth{Data Parallel Computing 2011}{Christensen: CUDA raytracing algorithm… | |
| + | |
| +% make the title area | |
| +\maketitle | |
| + | |
| + | |
| +\begin{abstract} | |
| +%\boldmath | |
| +A raytracing algorithm is constructed using the CUDA API for visualizing outpu… | |
| +The raytracing algorithm is optimized with constant memory and compilation fla… | |
| +\end{abstract} | |
| +\begin{IEEEkeywords} | |
| +CUDA, discrete element method, raytracing | |
| +\end{IEEEkeywords} | |
| + | |
| + | |
| +\IEEEpeerreviewmaketitle | |
| + | |
| + | |
| +\section{Introduction} | |
| +\IEEEPARstart{V}{isualizing} systems containing many spheres using traditional… | |
| + | |
| +Previous studies of GPU or CUDA implementations of ray tracing algorithms repo… | |
| + | |
| +\subsection{Discrete Element Method} | |
| +The input particle data to the raytracer is the output of a custom CUDA-based … | |
| + | |
| +The discrete element method (DEM) is a subtype of molecular dynamics (MD), and… | |
| + | |
| +\subsection{Application usage} | |
| +The CUDA DEM application is a command line executable, and writes updated part… | |
| + | |
| +Both the CUDA DEM and raytracing applications are open-source\footnote{\url{ht… | |
| + | |
| +This document consists of a short introduction to the basic mathematics behind… | |
| + | |
| + | |
| +\section{Ray tracing algorithm} | |
| +The goal of the ray tracing algorithm is to compute the shading of each pixel … | |
| +\begin{lstlisting}[basicstyle=\footnotesize] | |
| +for each pixel do | |
| + compute viewing ray origin and direction | |
| + iterate through objects and find the closest hit | |
| + set pixel color to value computed from hit point, light, n | |
| +\end{lstlisting} | |
| +The implemented code does not utilize recursive rays, since the modeled materi… | |
| + | |
| +\subsection{Ray generation} | |
| +The rays are in vector form defined as: | |
| +\begin{equation} | |
| + \mathbf{p}(t) = \mathbf{e} + t(\mathbf{s} - \mathbf{e}) | |
| + \label{eq:raygen} | |
| +\end{equation} | |
| +The perspective can be either \emph{orthograpic}, where all viewing rays have … | |
| + | |
| +The ray origin $\mathbf{e}$ is the position of the eye, and is constant. The d… | |
| +\begin{equation} | |
| + \mathbf{s} - \mathbf{e} = -d \mathbf{w} + u \mathbf{u} + v \mathbf{v} | |
| + \label{eq:raydirection} | |
| +\end{equation} | |
| +where $\{\mathbf{u},\mathbf{v},\mathbf{w}\}$ are the orthonormal bases of the … | |
| +\[ u = l + (r-l)(i+0.5)/n \] | |
| +\[ v = b + (t-b)(j+0.5)/m \] | |
| +where $l$, $r$, $t$ and $b$ are the positions of the image borders (left, righ… | |
| + | |
| + | |
| +\subsection{Ray-sphere intersection} | |
| +Given a sphere with a center $\mathbf{c}$, and radius $R$, a equation can be c… | |
| +\begin{equation} | |
| + (\mathbf{p} - \mathbf{c}) \cdot (\mathbf{p} - \mathbf{c}) - R^2 = 0 | |
| + \label{eq:sphere} | |
| +\end{equation} | |
| +By substituting the points $\mathbf{p}$ with ray equation \ref{eq:raygen}, and… | |
| +\begin{equation} | |
| + (\mathbf{d}\cdot\mathbf{d})t^2 + 2\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})t | |
| + + (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2 = 0 | |
| + \label{eq:quad} | |
| +\end{equation} | |
| +The number of ray steps $t$ is the only unknown, so the number of intersection… | |
| +\begin{equation} | |
| + \Delta = (2(\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})))^2 - 4(\mathbf{d}\cdot\m… | |
| + \label{eq:Delta} | |
| +\end{equation} | |
| +A negative value denotes no intersection between the sphere and the ray, a val… | |
| +\begin{equation} | |
| + t = \frac{-\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}) \pm \sqrt{\eta} } { (\math… | |
| + \label{eq:intersect} | |
| +\end{equation} | |
| +where | |
| +\[ \eta = (\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}))^2 - (\mathbf{d}\cdot\mathbf… | |
| +Only the smallest intersection ($t_\text{minus}$) is calculated, since this ma… | |
| +\begin{equation} | |
| + \mathbf{p} = \mathbf{e} + t_\text{minus} \mathbf{d} | |
| + \label{eq:intersection} | |
| +\end{equation} | |
| +\begin{equation} | |
| + \mathbf{n} = 2(\mathbf{p}-\mathbf{c}) | |
| + \label{eq:normal} | |
| +\end{equation} | |
| +The intersection distance in vector steps ($t_\text{minus}$) is saved in order… | |
| + | |
| +\subsection{Pixel shading} | |
| +The pixel is shaded using \emph{Lambertian} shading \cite{Shirley:2009}, where… | |
| +\begin{equation} | |
| + L = k_a I_a + k_d I_d \max(0,(\mathbf{n}\cdot\mathbf{l})) | |
| + \label{eq:shading} | |
| +\end{equation} | |
| +where the $a$ and $d$ subscripts denote the ambient and diffusive (Lambertian)… | |
| + | |
| + | |
| +\subsection{Computational implementation} | |
| +The above routines were first implemented in CUDA for device execution, and af… | |
| + | |
| + | |
| +\section{CUDA implementation} | |
| +When constructing the algorithm for execution on the GPGPU device, the data-pa… | |
| + | |
| +The application starts by reading the discrete element method data from a cust… | |
| +Divergent branches in the kernel code were avoided as much as possible \cite{N… | |
| + | |
| +\begin{figure}[!t] | |
| +\centering | |
| +\includegraphics[width=0.47\textwidth]{np500.png} | |
| +\caption{Sample output of GPU raytracer rendering of \num{512} particles.} | |
| +\label{fig:np500.png} | |
| +\end{figure} | |
| + | |
| + | |
| +\begin{figure}[!t] | |
| +\centering | |
| +\includegraphics[width=0.47\textwidth]{np50000.png} | |
| +\caption{Sample output of GPU raytracer rendering of \num{50653} particles.} | |
| +\label{fig:np50000.png} | |
| +\end{figure} | |
| + | |
| +The algorithm starts by allocating memory on the device for the particle data,… | |
| + | |
| +All pixel values are initialized to $[R,G,B] = [0,0,0]$, which serves as the i… | |
| + | |
| +After all pixel values have been computed, the image data is transfered back t… | |
| + | |
| +\subsection{Thread and block layout} | |
| +The thread/block layout passed during kernel launches is arranged in the follo… | |
| +\begin{lstlisting} | |
| + dim3 threads(16, 16); | |
| + dim3 blocks((width+15)/16, (height+15)/16); | |
| +\end{lstlisting} | |
| +The image pixel position of the thread can be determined from the thread- and … | |
| +\begin{lstlisting} | |
| + int i = threadIdx.x + blockIdx.x * blockDim.x; | |
| + int j = threadIdx.y + blockIdx.y * blockDim.y; | |
| + unsigned int mempos = x + y * blockDim.x * gridDim.x; | |
| + if (mempos > pixels) | |
| + return; | |
| +\end{lstlisting} | |
| +The linear memory position (\texttt{mempos}) is used as the index when reading… | |
| + | |
| +\subsection{Image output} | |
| +After completing all pixel shading computations on the device, the image data … | |
| + | |
| +\subsection{Performance} | |
| +Since this simple raytracing algorithm generates a single non-recursive ray fo… | |
| + | |
| +The data transfer between the host and device is kept at a bare minimum, as th… | |
| + | |
| +The host execution time was profiled using a \texttt{clock()} based CPU timer … | |
| + | |
| +The device execution time was profiled using two \texttt{cudaEvent\_t} timers,… | |
| + | |
| +Figures \ref{fig:np-performance.pdf} and \ref{fig:px-performance.pdf} show the… | |
| + | |
| +\begin{figure}[!t] | |
| +\centering | |
| +\includegraphics[width=0.47\textwidth]{np-performance.pdf} | |
| +\caption{Performance scaling with varying particle numbers at image dimensions… | |
| +\label{fig:np-performance.pdf} | |
| +\end{figure} | |
| + | |
| +\begin{figure}[!t] | |
| +\centering | |
| +\includegraphics[width=0.47\textwidth]{px-performance.pdf} | |
| +\caption{Performance scaling with varying image dimensions ($n \times m$) wit… | |
| +\label{fig:px-performance.pdf} | |
| +\end{figure} | |
| + | |
| +The device memory allocation and data transfer was also profiled, and turns ou… | |
| + | |
| +The CPU time spent in the host kernels proves to be linear with the particle n… | |
| + | |
| +The ratio between CPU computational times and the sum of the device kernel exe… | |
| + | |
| +As the number of particles are not known by compilation, it is not possible to… | |
| + | |
| +Previous GPU implementations often rely on k-D trees, constructed as an sortin… | |
| + | |
| +\section{Conclusion} | |
| +This document presented the implementation of a basic ray tracing algorithm, u… | |
| +Performance tests showed the expected, linear correlation between image dimens… | |
| + | |
| +The final product will come into good use during further development and compl… | |
| + | |
| +\bibliographystyle{IEEEtran} | |
| +\bibliography{IEEEabrv,/Users/adc/Documents/University/BIB} | |
| + | |
| + | |
| + | |
| +\appendices | |
| + | |
| +\section{Test environment} | |
| +The raytracing algorithm was developed, tested and profiled on a mid 2010 Mac … | |
| + | |
| +The CUDA source code was compiled with \texttt{nvcc}, and linked to \texttt{g+… | |
| +\begin{lstlisting} | |
| +g++ -c -Wall -O3 -arch x86_64 -fopenmp ... | |
| +nvcc -c -use_fast_math -gencode arch=compute_20,code=sm_20 -m64 ... | |
| +g++ -arch x86_64 -lcuda -lcudart -fopenmp *.o -o rt | |
| +\end{lstlisting} | |
| +When profiling device code performance, the application was executed two times… | |
| + | |
| +The host system was measured to have a memory bandwidth of 4642.1 MB/s when tr… | |
| + | |
| +\ifCLASSOPTIONcaptionsoff | |
| + \newpage | |
| +\fi | |
| + | |
| + | |
| + | |
| + | |
| +% that's all folks | |
| +\end{document} | |
| + | |
| + | |
| diff --git a/raytracer/Makefile b/raytracer/Makefile | |
| t@@ -1,101 +0,0 @@ | |
| -NVCC=nvcc | |
| -CC=g++ | |
| -LD=g++ | |
| - | |
| -CCFLAGS=-c -Wall -O3 -fopenmp | |
| -LDFLAGS=-fopenmp | |
| - | |
| -# Verbose compile? | |
| -#NVCCFLAGS+=--verbose | |
| - | |
| -# Profile code? | |
| -#NVCCFLAGS+=-pg | |
| - | |
| -# Debugable code? | |
| -#CCFLAGS+=-g | |
| -#NVCCFLAGS+=-g -G | |
| - | |
| -CCFILES=main.cpp o-ppm.cpp rt-kernel-cpu.cpp | |
| -CUFILES=rt-kernel.cu | |
| -CCOBJECTS=$(CCFILES:.cpp=.o) | |
| -CUOBJECTS=$(CUFILES:.cu=.o) | |
| -OBJECTS=$(CCOBJECTS) $(CUOBJECTS) | |
| - | |
| -EXECUTABLE=rt | |
| - | |
| -# NVCC flags | |
| -NVCCFLAGS+=-use_fast_math -gencode arch=compute_20,code=sm_20 | |
| - | |
| -# Generate device code | |
| -#NVCCFLAGS+=--export-dir=$(EXECUTABLE).devcode | |
| - | |
| -INCLUDES = -I. -I../libs -I/usr/local/cuda/include | |
| -LIBS = -L/usr/local/cuda/lib64 -L/usr/local/cuda/lib -lcudart -lcuda | |
| -#GL_LIBS = -lGLEW | |
| - | |
| -# detect OS | |
| -OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) | |
| -# 'linux' is output for Linux system, 'darwin' for OS X | |
| -DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) | |
| -ifneq ($(DARWIN),) | |
| - #INCLUDES += -I/opt/local/include | |
| - #LIBS += -L/opt/local/lib | |
| - #GL_LIBS += -framework glut -framework OpenGL | |
| - CUDA_SDK=/Developer/GPU\ Computing/C | |
| -else | |
| - INCLUDES += -I/usr/include -I/usr/local/include | |
| - LIBS += -L/usr/lib -L/usr/local/lib | |
| - #GL_LIBS += -L/usr/X11/lib -lglut -lGL -lGLU | |
| - CUDA_SDK=$(HOME)/NVIDIA_GPU_Computing_SDK/C | |
| -endif | |
| - | |
| -# detect OS | |
| -ARCHUPPER = $(shell uname -m 2>/dev/null | tr [:lower:] [:upper:]) | |
| -# 'linux' is output for Linux system, 'darwin' for OS X | |
| -ARCH := $(strip $(findstring X86_64, $(ARCHUPPER))) | |
| -ifneq ($(ARCH),) | |
| - LIB_ARCH = x86_64 | |
| - NVCCFLAGS += -m64 | |
| - ifneq ($(DARWIN),) | |
| - CXX_ARCH_FLAGS += -arch x86_64 | |
| - else | |
| - CXX_ARCH_FLAGS += -m64 | |
| - endif | |
| -else | |
| - LIB_ARCH = i386 | |
| - NVCCFLAGS += -m32 | |
| - ifneq ($(DARWIN),) | |
| - CXX_ARCH_FLAGS += -arch i386 | |
| - else | |
| - CXX_ARCH_FLAGS += -m32 | |
| - endif | |
| -endif | |
| - | |
| - | |
| -INCLUDES += -I$(CUDA_SDK)/common/inc | |
| -#LIBS += -L$(CUDA_SDK)/lib #-lcutil_$(LIB_ARCH) | |
| -LIBS += -L$(CUDA_SDK)/lib -lcutil_$(LIB_ARCH) | |
| - | |
| -$(EXECUTABLE): $(OBJECTS) | |
| - $(LD) $(CXX_ARCH_FLAGS) $(LDFLAGS) $(LIBS) $(OBJECTS) -o $(EXECUTABLE) | |
| - | |
| -main.o: main.cpp | |
| - $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ | |
| - | |
| -o-ppm.o: o-ppm.cpp | |
| - $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ | |
| - | |
| -rt-kernel-cpu.o: rt-kernel-cpu.cpp | |
| - $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ | |
| - | |
| -rt-kernel.o: rt-kernel.cu | |
| - $(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@ | |
| - | |
| -clean: | |
| - rm -f doc/*.{log,aux,bbl,blg,out,gz} | |
| - rm -f *.o *.ppm *.png | |
| - rm -f $(EXECUTABLE) | |
| - | |
| -edit: | |
| - vim -p Makefile $(CCFILES) $(CUFILES) *.h | |
| - | |
| diff --git a/raytracer/README b/raytracer/README | |
| t@@ -1,43 +0,0 @@ | |
| ------------------------------------------------------------------- | |
| - This is the README file for the SPHERE ray tracing module | |
| - Last revision: May. 18., 2012 | |
| ------------------------------------------------------------------- | |
| - | |
| -BACKGROUND INFORMATION | |
| -====================== | |
| - | |
| -The ray tracing module was initially developed for the Sphere DEM | |
| -software: http://github.com/anders-dc/sphere | |
| - | |
| -This is a port for visualizing simulation output from the | |
| -SPHERE discrete element method software. | |
| - | |
| -It is developed by Anders Damsgaard Christensen, [email protected], and | |
| -is licenced under the GNU General Public Licence (GPL). | |
| - | |
| -Documentation can be found in the file doc/raytracer.pdf | |
| - | |
| - | |
| -REQUIREMENTS | |
| -============ | |
| - | |
| -For compiling the raytracer, the following software is required: | |
| - - GNU compiler collection | |
| - - CUDA developer drivers | |
| - - CUDA toolkit | |
| -When executing the compiled code, a CUDA compatible GPU is | |
| -necessary, along with appropriate developer device drivers. | |
| - | |
| - | |
| -USAGE INSTRUCTIONS | |
| -================== | |
| - | |
| -The source code is built using GNU make with the command: | |
| - ./make | |
| - | |
| -A sample run can subsequently be started with: | |
| - ./make run | |
| - | |
| -Temporary files can be removed with: | |
| - ./make clean | |
| - | |
| diff --git a/raytracer/colorbar.h b/raytracer/colorbar.h | |
| t@@ -1,22 +0,0 @@ | |
| -#ifndef COLORBAR_H_ | |
| -#define COLORBAR_H_ | |
| - | |
| -// Functions that determine red-, green- and blue color components | |
| -// in a blue-white-red colormap. Ratio should be between 0.0-1.0. | |
| - | |
| -__inline__ __host__ __device__ float red(float ratio) | |
| -{ | |
| - return fmin(1.0f, 0.209f*ratio*ratio*ratio - 2.49f*ratio*ratio + 3.0f*ratio … | |
| -}; | |
| - | |
| -__inline__ __host__ __device__ float green(float ratio) | |
| -{ | |
| - return fmin(1.0f, -2.44f*ratio*ratio + 2.15f*ratio + 0.369f); | |
| -}; | |
| - | |
| -__inline__ __host__ __device__ float blue(float ratio) | |
| -{ | |
| - return fmin(1.0f, -2.21f*ratio*ratio + 1.61f*ratio + 0.573f); | |
| -}; | |
| - | |
| -#endif | |
| diff --git a/raytracer/doc/.DS_Store b/raytracer/doc/.DS_Store | |
| Binary files differ. | |
| diff --git a/raytracer/doc/.raytracer.tex.swp b/raytracer/doc/.raytracer.tex.swp | |
| Binary files differ. | |
| diff --git a/raytracer/doc/bloopers/black.png b/raytracer/doc/bloopers/black.png | |
| Binary files differ. | |
| diff --git a/raytracer/doc/bloopers/colorful.png b/raytracer/doc/bloopers/color… | |
| Binary files differ. | |
| diff --git a/raytracer/doc/bloopers/cpu_wrong_colorful.png b/raytracer/doc/bloo… | |
| Binary files differ. | |
| diff --git a/raytracer/doc/bloopers/twisted.png b/raytracer/doc/bloopers/twiste… | |
| Binary files differ. | |
| diff --git a/raytracer/doc/bloopers/wrong_order.png b/raytracer/doc/bloopers/wr… | |
| Binary files differ. | |
| diff --git a/raytracer/doc/raytracer.pdf b/raytracer/doc/raytracer.pdf | |
| Binary files differ. | |
| diff --git a/raytracer/doc/raytracer.tex b/raytracer/doc/raytracer.tex | |
| t@@ -1,317 +0,0 @@ | |
| -\documentclass[journal]{IEEEtran} | |
| - | |
| - | |
| -\ifCLASSINFOpdf | |
| - \usepackage[pdftex]{graphicx} | |
| -\else | |
| - \usepackage[dvips]{graphicx} | |
| -\fi | |
| - | |
| -\usepackage{color} | |
| - | |
| -\usepackage{url} | |
| -%\usepackage{showkeys} | |
| -\usepackage{siunitx} | |
| - | |
| -% correct bad hyphenation here | |
| -\hyphenation{op-tical net-works semi-conduc-tor} | |
| - | |
| -\definecolor{DarkGreen}{rgb}{0,0.45,0.08} | |
| -\usepackage{listings} | |
| -\lstset{language=C++,breaklines=true,breakatwhitespace=true,basicstyle=\small,… | |
| - | |
| - | |
| -\begin{document} | |
| -\title{CUDA raytracing algorithm for visualizing\\ discrete element model outp… | |
| - | |
| -\author{Anders Damsgaard Christensen,~\IEEEmembership{20062213} | |
| -\thanks{Contact: {[email protected]}}% | |
| -\thanks{Webpage: {http://users-cs.au.dk/adc}}% | |
| -\thanks{Manuscript, last revision: \today.}} | |
| - | |
| -% The paper headers | |
| -\markboth{Data Parallel Computing 2011}{Christensen: CUDA raytracing algorithm… | |
| - | |
| -% make the title area | |
| -\maketitle | |
| - | |
| - | |
| -\begin{abstract} | |
| -%\boldmath | |
| -A raytracing algorithm is constructed using the CUDA API for visualizing outpu… | |
| -The raytracing algorithm is optimized with constant memory and compilation fla… | |
| -\end{abstract} | |
| -\begin{IEEEkeywords} | |
| -CUDA, discrete element method, raytracing | |
| -\end{IEEEkeywords} | |
| - | |
| - | |
| -\IEEEpeerreviewmaketitle | |
| - | |
| - | |
| -\section{Introduction} | |
| -\IEEEPARstart{V}{isualizing} systems containing many spheres using traditional… | |
| - | |
| -Previous studies of GPU or CUDA implementations of ray tracing algorithms repo… | |
| - | |
| -\subsection{Discrete Element Method} | |
| -The input particle data to the raytracer is the output of a custom CUDA-based … | |
| - | |
| -The discrete element method (DEM) is a subtype of molecular dynamics (MD), and… | |
| - | |
| -\subsection{Application usage} | |
| -The CUDA DEM application is a command line executable, and writes updated part… | |
| - | |
| -Both the CUDA DEM and raytracing applications are open-source\footnote{\url{ht… | |
| - | |
| -This document consists of a short introduction to the basic mathematics behind… | |
| - | |
| - | |
| -\section{Ray tracing algorithm} | |
| -The goal of the ray tracing algorithm is to compute the shading of each pixel … | |
| -\begin{lstlisting}[basicstyle=\footnotesize] | |
| -for each pixel do | |
| - compute viewing ray origin and direction | |
| - iterate through objects and find the closest hit | |
| - set pixel color to value computed from hit point, light, n | |
| -\end{lstlisting} | |
| -The implemented code does not utilize recursive rays, since the modeled materi… | |
| - | |
| -\subsection{Ray generation} | |
| -The rays are in vector form defined as: | |
| -\begin{equation} | |
| - \mathbf{p}(t) = \mathbf{e} + t(\mathbf{s} - \mathbf{e}) | |
| - \label{eq:raygen} | |
| -\end{equation} | |
| -The perspective can be either \emph{orthograpic}, where all viewing rays have … | |
| - | |
| -The ray origin $\mathbf{e}$ is the position of the eye, and is constant. The d… | |
| -\begin{equation} | |
| - \mathbf{s} - \mathbf{e} = -d \mathbf{w} + u \mathbf{u} + v \mathbf{v} | |
| - \label{eq:raydirection} | |
| -\end{equation} | |
| -where $\{\mathbf{u},\mathbf{v},\mathbf{w}\}$ are the orthonormal bases of the … | |
| -\[ u = l + (r-l)(i+0.5)/n \] | |
| -\[ v = b + (t-b)(j+0.5)/m \] | |
| -where $l$, $r$, $t$ and $b$ are the positions of the image borders (left, righ… | |
| - | |
| - | |
| -\subsection{Ray-sphere intersection} | |
| -Given a sphere with a center $\mathbf{c}$, and radius $R$, a equation can be c… | |
| -\begin{equation} | |
| - (\mathbf{p} - \mathbf{c}) \cdot (\mathbf{p} - \mathbf{c}) - R^2 = 0 | |
| - \label{eq:sphere} | |
| -\end{equation} | |
| -By substituting the points $\mathbf{p}$ with ray equation \ref{eq:raygen}, and… | |
| -\begin{equation} | |
| - (\mathbf{d}\cdot\mathbf{d})t^2 + 2\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})t | |
| - + (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2 = 0 | |
| - \label{eq:quad} | |
| -\end{equation} | |
| -The number of ray steps $t$ is the only unknown, so the number of intersection… | |
| -\begin{equation} | |
| - \Delta = (2(\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})))^2 - 4(\mathbf{d}\cdot\m… | |
| - \label{eq:Delta} | |
| -\end{equation} | |
| -A negative value denotes no intersection between the sphere and the ray, a val… | |
| -\begin{equation} | |
| - t = \frac{-\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}) \pm \sqrt{\eta} } { (\math… | |
| - \label{eq:intersect} | |
| -\end{equation} | |
| -where | |
| -\[ \eta = (\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}))^2 - (\mathbf{d}\cdot\mathbf… | |
| -Only the smallest intersection ($t_\text{minus}$) is calculated, since this ma… | |
| -\begin{equation} | |
| - \mathbf{p} = \mathbf{e} + t_\text{minus} \mathbf{d} | |
| - \label{eq:intersection} | |
| -\end{equation} | |
| -\begin{equation} | |
| - \mathbf{n} = 2(\mathbf{p}-\mathbf{c}) | |
| - \label{eq:normal} | |
| -\end{equation} | |
| -The intersection distance in vector steps ($t_\text{minus}$) is saved in order… | |
| - | |
| -\subsection{Pixel shading} | |
| -The pixel is shaded using \emph{Lambertian} shading \cite{Shirley:2009}, where… | |
| -\begin{equation} | |
| - L = k_a I_a + k_d I_d \max(0,(\mathbf{n}\cdot\mathbf{l})) | |
| - \label{eq:shading} | |
| -\end{equation} | |
| -where the $a$ and $d$ subscripts denote the ambient and diffusive (Lambertian)… | |
| - | |
| - | |
| -\subsection{Computational implementation} | |
| -The above routines were first implemented in CUDA for device execution, and af… | |
| - | |
| - | |
| -\section{CUDA implementation} | |
| -When constructing the algorithm for execution on the GPGPU device, the data-pa… | |
| - | |
| -The application starts by reading the discrete element method data from a cust… | |
| -Divergent branches in the kernel code were avoided as much as possible \cite{N… | |
| - | |
| -\begin{figure}[!t] | |
| -\centering | |
| -\includegraphics[width=0.47\textwidth]{np500.png} | |
| -\caption{Sample output of GPU raytracer rendering of \num{512} particles.} | |
| -\label{fig:np500.png} | |
| -\end{figure} | |
| - | |
| - | |
| -\begin{figure}[!t] | |
| -\centering | |
| -\includegraphics[width=0.47\textwidth]{np50000.png} | |
| -\caption{Sample output of GPU raytracer rendering of \num{50653} particles.} | |
| -\label{fig:np50000.png} | |
| -\end{figure} | |
| - | |
| -The algorithm starts by allocating memory on the device for the particle data,… | |
| - | |
| -All pixel values are initialized to $[R,G,B] = [0,0,0]$, which serves as the i… | |
| - | |
| -After all pixel values have been computed, the image data is transfered back t… | |
| - | |
| -\subsection{Thread and block layout} | |
| -The thread/block layout passed during kernel launches is arranged in the follo… | |
| -\begin{lstlisting} | |
| - dim3 threads(16, 16); | |
| - dim3 blocks((width+15)/16, (height+15)/16); | |
| -\end{lstlisting} | |
| -The image pixel position of the thread can be determined from the thread- and … | |
| -\begin{lstlisting} | |
| - int i = threadIdx.x + blockIdx.x * blockDim.x; | |
| - int j = threadIdx.y + blockIdx.y * blockDim.y; | |
| - unsigned int mempos = x + y * blockDim.x * gridDim.x; | |
| - if (mempos > pixels) | |
| - return; | |
| -\end{lstlisting} | |
| -The linear memory position (\texttt{mempos}) is used as the index when reading… | |
| - | |
| -\subsection{Image output} | |
| -After completing all pixel shading computations on the device, the image data … | |
| - | |
| -\subsection{Performance} | |
| -Since this simple raytracing algorithm generates a single non-recursive ray fo… | |
| - | |
| -The data transfer between the host and device is kept at a bare minimum, as th… | |
| - | |
| -The host execution time was profiled using a \texttt{clock()} based CPU timer … | |
| - | |
| -The device execution time was profiled using two \texttt{cudaEvent\_t} timers,… | |
| - | |
| -Figures \ref{fig:np-performance.pdf} and \ref{fig:px-performance.pdf} show the… | |
| - | |
| -\begin{figure}[!t] | |
| -\centering | |
| -\includegraphics[width=0.47\textwidth]{np-performance.pdf} | |
| -\caption{Performance scaling with varying particle numbers at image dimensions… | |
| -\label{fig:np-performance.pdf} | |
| -\end{figure} | |
| - | |
| -\begin{figure}[!t] | |
| -\centering | |
| -\includegraphics[width=0.47\textwidth]{px-performance.pdf} | |
| -\caption{Performance scaling with varying image dimensions ($n \times m$) wit… | |
| -\label{fig:px-performance.pdf} | |
| -\end{figure} | |
| - | |
| -The device memory allocation and data transfer was also profiled, and turns ou… | |
| - | |
| -The CPU time spent in the host kernels proves to be linear with the particle n… | |
| - | |
| -The ratio between CPU computational times and the sum of the device kernel exe… | |
| - | |
| -As the number of particles are not known by compilation, it is not possible to… | |
| - | |
| -Previous GPU implementations often rely on k-D trees, constructed as an sortin… | |
| - | |
| -\section{Conclusion} | |
| -This document presented the implementation of a basic ray tracing algorithm, u… | |
| -Performance tests showed the expected, linear correlation between image dimens… | |
| - | |
| -The final product will come into good use during further development and compl… | |
| - | |
| -\bibliographystyle{IEEEtran} | |
| -\bibliography{IEEEabrv,/Users/adc/Documents/University/BIB} | |
| - | |
| - | |
| - | |
| -\appendices | |
| - | |
| -\section{Test environment} | |
| -The raytracing algorithm was developed, tested and profiled on a mid 2010 Mac … | |
| - | |
| -The CUDA source code was compiled with \texttt{nvcc}, and linked to \texttt{g+… | |
| -\begin{lstlisting} | |
| -g++ -c -Wall -O3 -arch x86_64 -fopenmp ... | |
| -nvcc -c -use_fast_math -gencode arch=compute_20,code=sm_20 -m64 ... | |
| -g++ -arch x86_64 -lcuda -lcudart -fopenmp *.o -o rt | |
| -\end{lstlisting} | |
| -When profiling device code performance, the application was executed two times… | |
| - | |
| -The host system was measured to have a memory bandwidth of 4642.1 MB/s when tr… | |
| - | |
| -\ifCLASSOPTIONcaptionsoff | |
| - \newpage | |
| -\fi | |
| - | |
| - | |
| -\section{Source code} | |
| -The entire source code, as well as input data files, can be found in the follo… | |
| -\url{http://users-cs.au.dk/adc/files/sphere-rt.tar.gz} | |
| -The source code is built and run with the commands: | |
| -\begin{lstlisting} | |
| - make | |
| - make run | |
| -\end{lstlisting} | |
| -With the \texttt{make run} command, the Makefile uses ImageMagick to convert t… | |
| -\begin{lstlisting} | |
| - ./rt <CPU | GPU> <sphere-binary.bin> <width> <height> <output-image.ppm> | |
| -\end{lstlisting} | |
| -This appendix contains the following source code files: | |
| -\lstlistoflistings | |
| - | |
| -\subsection{CUDA raytracing source code} | |
| -%\subsubsection{rt\_kernel.h} | |
| -\lstinputlisting[caption={rt-kernel.h},label={rt-kernel.h},numbers=left,breakl… | |
| - | |
| -%\subsubsection{rt\_kernel.cu} | |
| -\lstinputlisting[caption={rt-kernel.cu},label={rt-kernel.cu},numbers=left,brea… | |
| - | |
| -\subsection{CPU raytracing source code} | |
| -%\subsubsection{rt\_kernel\_cpu.h} | |
| -\lstinputlisting[caption={rt-kernel-cpu.h},label={rt-kernel-cpu.h},numbers=lef… | |
| - | |
| -%\subsubsection{rt\_kernel\_cpu.cpp} | |
| -\lstinputlisting[caption={rt-kernel-cpu.cpp},label={rt-kernel-cpu.cpp},numbers… | |
| - | |
| - | |
| - | |
| -\clearpage | |
| - | |
| - | |
| - | |
| - | |
| - | |
| - | |
| - | |
| - | |
| - | |
| -% You can push biographies down or up by placing | |
| -% a \vfill before or after them. The appropriate | |
| -% use of \vfill depends on what kind of text is | |
| -% on the last page and whether or not the columns | |
| -% are being equalized. | |
| - | |
| -%\vfill | |
| - | |
| -% Can be used to pull up biographies so that the bottom of the last one | |
| -% is flush with the other column. | |
| -%\enlargethispage{-5in} | |
| - | |
| - | |
| - | |
| -% that's all folks | |
| -\end{document} | |
| - | |
| - | |
| diff --git a/raytracer/header.h b/raytracer/header.h | |
| t@@ -1,26 +0,0 @@ | |
| -// header.h -- Structure templates and function prototypes | |
| - | |
| -// Standard technique for avoiding multiple inclusions of header file | |
| -#ifndef HEADER_H_ | |
| -#define HEADER_H_ | |
| - | |
| -// Type declaration | |
| -typedef unsigned int Inttype; | |
| - | |
| -//// Structure declarations //// | |
| - | |
| -struct f3 { | |
| - float x; | |
| - float y; | |
| - float z; | |
| -}; | |
| - | |
| -// Image structure | |
| -struct rgb { | |
| - unsigned char r; | |
| - unsigned char g; | |
| - unsigned char b; | |
| - unsigned char alpha; | |
| -}; | |
| - | |
| -#endif | |
| diff --git a/raytracer/main.cpp b/raytracer/main.cpp | |
| t@@ -1,264 +0,0 @@ | |
| -#include <iostream> | |
| -#include <cstdio> | |
| -#include <cstdlib> | |
| -#include <cstring> | |
| -#include <cmath> | |
| -#include <vector_functions.h> | |
| -#include "header.h" | |
| -#include "rt-kernel.h" | |
| -#include "rt-kernel-cpu.h" | |
| -#include "colorbar.h" | |
| -#include "o-ppm.h" | |
| - | |
| -int main(const int argc, const char* argv[]) | |
| -{ | |
| - using std::cout; | |
| - | |
| - if (argc < 6 || argc > 8 ){ | |
| - cout << "Usage: " << argv[0] << " <GPU or CPU> <particle-data.txt> <width>… | |
| - << "or\n" | |
| - << "Usage: " << argv[0] << " GPU <particle-data.txt | colorbar> <widt… | |
| - return 1; | |
| - } | |
| - | |
| - cout << "\n----------------------------\n" | |
| - << "This is the SPHERE raytracer\n" | |
| - << "----------------------------\n"; | |
| - | |
| - // Allocate memory for image | |
| - unsigned int width = atoi(argv[3]); | |
| - unsigned int height = atoi(argv[4]); | |
| - if (width < 1 || height < 1) { | |
| - cout << "Image dimensions invalid.\n"; | |
| - return 1; | |
| - } | |
| - rgb* img; | |
| - img = new rgb [height*width]; | |
| - | |
| - // Render colorbar image | |
| - if (strcmp(argv[2],"colorbar") == 0) { | |
| - | |
| - for (unsigned int y=0; y<height; ++y) { | |
| - for (unsigned int x=0; x<width; ++x) { | |
| - | |
| - // Colormap value is relative to width position | |
| - float ratio = (float) (x+1)/width; | |
| - | |
| - // Write pixel value to image array | |
| - img[x + y*width].r = red(ratio) * 250.f; | |
| - img[x + y*width].g = green(ratio) * 250.f; | |
| - img[x + y*width].b = blue(ratio) * 250.f; | |
| - | |
| - } | |
| - } | |
| - } else { // Render sphere binary | |
| - | |
| - cout << "Reading binary: " << argv[2] << "\n"; | |
| - | |
| - FILE* fin; | |
| - if ((fin = fopen(argv[2], "rb")) == NULL) { | |
| - cout << " Error encountered while trying to open binary '" | |
| - << argv[2] << "'\n"; | |
| - return 1; | |
| - } | |
| - | |
| - // Read the number of dimensions and particles | |
| - unsigned int nd, np; | |
| - (void)fread(&nd, sizeof(nd), 1, fin); | |
| - if (nd != 3) { | |
| - cout << " Input binary contains " << nd | |
| - << "D data, this raytracer is made for 3D data\n"; | |
| - return 1; | |
| - } | |
| - | |
| - (void)fread(&np, sizeof(np), 1, fin); | |
| - cout << " Number of particles: " << np << "\n"; | |
| - | |
| - // Allocate particle structure array | |
| - cout << " Allocating host memory\n"; | |
| - | |
| - // Single precision arrays used for computations | |
| - float4* p = new float4[np]; | |
| - float* fixvel = new float[np]; | |
| - float* xsum = new float[np]; | |
| - float* es_dot = new float[np]; | |
| - float* ev_dot = new float[np]; | |
| - float* es = new float[np]; | |
| - float* ev = new float[np]; | |
| - float* pres = new float[np]; | |
| - float* vel = new float[np]; | |
| - | |
| - // Read temporal information | |
| - double dt, file_dt, current, total; | |
| - unsigned int step_count; | |
| - (void)fread(&dt, sizeof(dt), 1, fin); | |
| - (void)fread(¤t, sizeof(current), 1, fin); | |
| - (void)fread(&total, sizeof(total), 1, fin); | |
| - (void)fread(&file_dt, sizeof(file_dt), 1, fin); | |
| - (void)fread(&step_count, sizeof(step_count), 1, fin); | |
| - | |
| - double d; // Double precision temporary value holder | |
| - | |
| - // Canonical coordinate system origo | |
| - f3 origo; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - origo.x = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - origo.y = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - origo.z = (float)d; | |
| - | |
| - // Canonical coordinate system dimensions | |
| - f3 L; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - L.x = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - L.y = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - L.z = (float)d; | |
| - | |
| - | |
| - // Skip over irrelevant data | |
| - double blankd; | |
| - //f3 blankd3; | |
| - unsigned int blankui; | |
| - for (int j=0; j<3; j++) // Skip over grid.num data | |
| - (void)fread(&blankui, sizeof(blankui), 1, fin); | |
| - | |
| - // Velocity vector, later converted to lenght | |
| - float3 v; | |
| - | |
| - // Load data into particle array | |
| - for (unsigned int i=0; i<np; i++) { | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - p[i].x = (float)d; // Typecast to single precision | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - v.x = (float)d; | |
| - for (int j=0; j<4; j++) | |
| - (void)fread(&blankd, sizeof(blankd), 1, fin); | |
| - | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - p[i].y = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - v.y = (float)d; | |
| - for (int j=0; j<4; j++) | |
| - (void)fread(&blankd, sizeof(blankd), 1, fin); | |
| - | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - p[i].z = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - v.z = (float)d; | |
| - for (int j=0; j<4; j++) | |
| - (void)fread(&blankd, sizeof(blankd), 1, fin); | |
| - | |
| - // Save velocity vector length | |
| - vel[i] = sqrt(v.x*v.x + v.y*v.y + v.z*v.z); | |
| - } | |
| - for (unsigned int i=0; i<np; i++) { | |
| - (void)fread(&d, sizeof(d), 1, fin); // fixvel | |
| - fixvel[i] = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); // xsum | |
| - xsum[i] = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); // radius | |
| - p[i].w = (float)d; | |
| - for (int j=0; j<10; j++) | |
| - (void)fread(&blankd, sizeof(blankd), 1, fin); | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - es_dot[i] = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - ev_dot[i] = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - es[i] = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - ev[i] = (float)d; | |
| - (void)fread(&d, sizeof(d), 1, fin); | |
| - pres[i] = (float)d; | |
| - } | |
| - | |
| - fclose(fin); // Binary read complete | |
| - | |
| - cout << " Spatial dimensions: " | |
| - << L.x << "*" << L.y << "*" << L.z << " m\n"; | |
| - | |
| - // Eye position and focus point | |
| - //f3 eye = {0.5f*L.x, -4.2f*L.y, 0.5f*L.z}; | |
| - //f3 eye = {2.5f*L.x, -4.2f*L.y, 2.0f*L.z}; | |
| - //f3 eye = {0.5f*L.x, -5.0f*L.y, 0.5f*L.z}; | |
| - f3 eye = {2.5f*L.x, -5.0f*L.y, 1.5f*L.z}; | |
| - f3 lookat = {0.5f*L.x, 0.5f*L.y, 0.5f*L.z}; | |
| - if (L.z > (L.x + L.y)/1.5f) { // Render only bottom of world (for initsetu… | |
| - eye.x = 4.1f*L.x; | |
| - eye.y = -15.1*L.y; | |
| - eye.z = 1.1*L.z; | |
| - /*lookat.x = 0.45f*L.x; | |
| - lookat.y = 0.5f*L.y; | |
| - lookat.z = 0.30f*L.z;*/ | |
| - } | |
| - | |
| - // Screen width in world coordinates | |
| - //float imgw = 1.4f*L.x; | |
| - //float imgw = pow(L.x*L.y*L.z, 0.32f); | |
| - //float imgw = sqrt(L.x*L.y)*1.2f; // Adjust last float to capture entire … | |
| - float imgw = L.x*1.35f; | |
| - | |
| - // Determine visualization mode | |
| - int visualize = 0; // 0: ordinary view | |
| - float max_val = 0; | |
| - if (argc == 8) { | |
| - if(strcmp(argv[6],"pressure") == 0) | |
| - visualize = 1; // 1: pressure view | |
| - if(strcmp(argv[6],"es_dot") == 0) | |
| - visualize = 2; // 2: es_dot view | |
| - if(strcmp(argv[6],"es") == 0) | |
| - visualize = 3; // 3: es view | |
| - if(strcmp(argv[6],"vel") == 0) | |
| - visualize = 4; // 4: velocity view | |
| - if(strcmp(argv[6],"xsum") == 0) | |
| - visualize = 5; // 5: xsum view | |
| - | |
| - // Read max. value specified in command args. | |
| - max_val = atof(argv[7]); | |
| - | |
| - } | |
| - | |
| - | |
| - if (strcmp(argv[1],"GPU") == 0) { | |
| - | |
| - // Call cuda wrapper | |
| - if (rt(p, np, img, width, height, origo, L, eye, lookat, imgw, visualize… | |
| - cout << "\nError in rt()\n"; | |
| - return 1; | |
| - } | |
| - } else if (strcmp(argv[1],"CPU") == 0) { | |
| - | |
| - // Call CPU wrapper | |
| - if (rt_cpu(p, np, img, width, height, origo, L, eye, lookat, imgw) != 0)… | |
| - cout << "\nError in rt_cpu()\n"; | |
| - return 1; | |
| - } | |
| - } else { | |
| - cout << "Please specify CPU or GPU for execution\n"; | |
| - return 1; | |
| - } | |
| - | |
| - // Free dynamically allocated memory | |
| - delete [] p; | |
| - delete [] fixvel; | |
| - delete [] xsum; | |
| - delete [] pres; | |
| - delete [] es; | |
| - delete [] ev; | |
| - delete [] es_dot; | |
| - delete [] ev_dot; | |
| - delete [] vel; | |
| - | |
| - } | |
| - | |
| - // Write final image to PPM file | |
| - image_to_ppm(img, argv[5], width, height); | |
| - | |
| - delete [] img; | |
| - | |
| - cout << "Terminating successfully\n\n"; | |
| - return 0; | |
| -} | |
| diff --git a/raytracer/o-ppm.cpp b/raytracer/o-ppm.cpp | |
| t@@ -1,30 +0,0 @@ | |
| -// o-ppm.cpp | |
| -// Write images in the PPM format | |
| - | |
| -#include <iostream> | |
| -#include <cstdio> | |
| -#include "header.h" | |
| - | |
| - | |
| -// Write image structure to PPM file | |
| -int image_to_ppm(rgb* rgb_img, const char* filename, const unsigned int width,… | |
| -{ | |
| - FILE *fout = fopen(filename, "wb"); | |
| - if (fout == NULL) { | |
| - std::cout << "File error in img_write() while trying to writing " << filen… | |
| - return 1; | |
| - } | |
| - | |
| - fprintf(fout, "P6 %d %d 255\n", width, height); | |
| - | |
| - // Write pixel array to ppm image file | |
| - for (unsigned int i=0; i<height*width; ++i) { | |
| - putc(rgb_img[i].r, fout); | |
| - putc(rgb_img[i].g, fout); | |
| - putc(rgb_img[i].b, fout); | |
| - } | |
| - | |
| - fclose(fout); | |
| - return 0; | |
| -} | |
| - | |
| diff --git a/raytracer/o-ppm.h b/raytracer/o-ppm.h | |
| t@@ -1,7 +0,0 @@ | |
| -#ifndef O_PPM_H_ | |
| -#define O_PPM_H_ | |
| - | |
| -int image_to_ppm(rgb* rgb_img, const char* filename, | |
| - const unsigned int width, const unsigned int height); | |
| - | |
| -#endif | |
| diff --git a/raytracer/render_all_outputs_CPU.sh b/raytracer/render_all_outputs… | |
| t@@ -1,27 +0,0 @@ | |
| -#!/bin/bash | |
| - | |
| -FILES=`ls ../output/*.bin` | |
| - | |
| -XRES=800 | |
| -YRES=800 | |
| - | |
| -WORKHORSE=CPU | |
| - | |
| -echo "# Rendering PPM images and converting to JPG" | |
| -for F in ../output/*.bin | |
| -do | |
| - (BASE=`basename $F`; | |
| - ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm; | |
| - convert ../img_out/$BASE.ppm ../img_out/$BASE.jpg;) | |
| - #rm ../img_out/$BASE.ppm &) | |
| -done | |
| - | |
| -#echo "# Converting PPM files to JPG using ImageMagick in parallel" | |
| -#for F in ../img_out/*.ppm | |
| -#do | |
| -# (BASE=`basename $F`; convert $F $F.jpg &) | |
| -#done | |
| - | |
| -#echo "# Removed temporary files" | |
| -#rm ../img_out/*.ppm | |
| - | |
| diff --git a/raytracer/render_all_outputs_GPU.sh b/raytracer/render_all_outputs… | |
| t@@ -1,29 +0,0 @@ | |
| -#!/bin/bash | |
| - | |
| -FILES=`ls ../output/*.bin` | |
| - | |
| -XRES=800 | |
| -YRES=800 | |
| - | |
| -WORKHORSE=GPU | |
| - | |
| -echo "# Rendering PPM images" | |
| -for F in ../output/*.bin | |
| -do | |
| - BASE=`basename $F` | |
| - ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null | |
| - if [ $? -ne 0 ] ; then | |
| - echo "Error rendering $F, trying again..." | |
| - ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null | |
| - fi | |
| -done | |
| - | |
| -echo "# Converting PPM files to JPEG using ImageMagick in parallel" | |
| -for F in ../img_out/*.ppm | |
| -do | |
| - (BASE=`basename $F`; convert $F $F.jpg &) | |
| -done | |
| - | |
| -#echo "# Removed temporary files" | |
| -#rm ../img_out/*.ppm | |
| - | |
| diff --git a/raytracer/rt-kernel-cpu.cpp b/raytracer/rt-kernel-cpu.cpp | |
| t@@ -1,273 +0,0 @@ | |
| -#include <iostream> | |
| -#include <cstdio> | |
| -#include <cmath> | |
| -#include <time.h> | |
| -#include <cuda.h> | |
| -#include <cutil_math.h> | |
| -#include <string.h> | |
| -#include <stdlib.h> | |
| -#include "header.h" | |
| -#include "rt-kernel-cpu.h" | |
| - | |
| -// Constants | |
| -float3 constc_u; | |
| -float3 constc_v; | |
| -float3 constc_w; | |
| -float3 constc_eye; | |
| -float4 constc_imgplane; | |
| -float constc_d; | |
| -float3 constc_light; | |
| - | |
| -__inline__ float3 f4_to_f3(float4 in) | |
| -{ | |
| - return make_float3(in.x, in.y, in.z); | |
| -} | |
| - | |
| -__inline__ float4 f3_to_f4(float3 in) | |
| -{ | |
| - return make_float4(in.x, in.y, in.z, 0.0f); | |
| -} | |
| - | |
| -__inline__ float lengthf3(float3 in) | |
| -{ | |
| - return sqrt(in.x*in.x + in.y*in.y + in.z*in.z); | |
| -} | |
| - | |
| -// Kernel for initializing image data | |
| -void imageInit_cpu(unsigned char* _img, unsigned int pixels) | |
| -{ | |
| - for (unsigned int mempos=0; mempos<pixels; ++mempos) { | |
| - _img[mempos*4] = 255; // Red channel | |
| - _img[mempos*4 + 1] = 255; // Green channel | |
| - _img[mempos*4 + 2] = 255; // Blue channel | |
| - } | |
| -} | |
| - | |
| -// Calculate ray origins and directions | |
| -void rayInitPerspective_cpu(float3* _ray_origo, | |
| - float3* _ray_direction, | |
| - float3 eye, | |
| - unsigned int width, | |
| - unsigned int height) | |
| -{ | |
| - int i,j; | |
| - unsigned int mempos; | |
| - float p_u, p_v; | |
| - #pragma omp parallel for private(mempos,j,p_u,p_v) | |
| - for (i=0; i<(int)width; ++i) { | |
| - for (j=0; j<(int)height; ++j) { | |
| - | |
| - mempos = i + j*width; | |
| - | |
| - // Calculate pixel coordinates in image plane | |
| - p_u = constc_imgplane.x + (constc_imgplane.y - constc_imgplane.x) | |
| - * (i + 0.5f) / width; | |
| - p_v = constc_imgplane.z + (constc_imgplane.w - constc_imgplane.z) | |
| - * (j + 0.5f) / height; | |
| - | |
| - // Write ray origo and direction to global memory | |
| - _ray_origo[mempos] = constc_eye; | |
| - _ray_direction[mempos] = -constc_d*constc_w + p_u*constc_u + p_v*constc_… | |
| - } | |
| - } | |
| -} | |
| - | |
| -// Check wether the pixel's viewing ray intersects with the spheres, | |
| -// and shade the pixel correspondingly | |
| -void rayIntersectSpheres_cpu(float3* _ray_origo, | |
| - float3* _ray_direction, | |
| - float4* _p, | |
| - float* _nuance, | |
| - unsigned char* _img, | |
| - unsigned int pixels, | |
| - unsigned int np) | |
| -{ | |
| - long int mempos; | |
| - float3 e, d, n, p, c; | |
| - float tdist, R, Delta, t_minus, dotprod, I_d, k_d, k_a, I_a, nuance; | |
| - Inttype i, ifinal; | |
| - #pragma omp parallel for private(e,d,n,p,c,tdist,R,Delta,t_minus,dotprod,I_d… | |
| - for (mempos=0; mempos<pixels; ++mempos) { | |
| - | |
| - // Read ray data from global memory | |
| - e = _ray_origo[mempos]; | |
| - d = _ray_direction[mempos]; | |
| - | |
| - // Distance, in ray steps, between object and eye initialized with a large… | |
| - tdist = 1e10f; | |
| - | |
| - // Iterate through all particles | |
| - for (i=0; i<np; ++i) { | |
| - | |
| - // Read sphere coordinate and radius | |
| - c = f4_to_f3(_p[i]); | |
| - R = _p[i].w; | |
| - | |
| - // Calculate the discriminant: d = B^2 - 4AC | |
| - Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2 | |
| - - 4.0f*dot(d,d) // -4*A | |
| - * (dot((e-c),(e-c)) - R*R); // C | |
| - | |
| - // If the determinant is positive, there are two solutions | |
| - // One where the line enters the sphere, and one where it exits | |
| - if (Delta > 0.0f) { | |
| - | |
| - // Calculate roots, Shirley 2009 p. 77 | |
| - t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d) | |
| - * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d)); | |
| - | |
| - // Check wether intersection is closer than previous values | |
| - if (fabs(t_minus) < tdist) { | |
| - p = e + t_minus*d; | |
| - tdist = fabs(t_minus); | |
| - n = normalize(2.0f * (p - c)); // Surface normal | |
| - ifinal = i; | |
| - } | |
| - | |
| - } // End of solution branch | |
| - | |
| - } // End of particle loop | |
| - | |
| - // Write pixel color | |
| - if (tdist < 1e10) { | |
| - | |
| - // Lambertian shading parameters | |
| - //float dotprod = fabs(dot(n, constc_light)); | |
| - dotprod = fmax(0.0f,dot(n, constc_light)); | |
| - I_d = 70.0f; // Light intensity | |
| - k_d = 5.0f; // Diffuse coefficient | |
| - | |
| - // Ambient shading | |
| - k_a = 30.0f; | |
| - I_a = 5.0f; | |
| - | |
| - // Read color nuance of grain | |
| - nuance = _nuance[ifinal]; | |
| - | |
| - // Write shading model values to pixel color channels | |
| - _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod | |
| - + k_a * I_a)*0.48f*nuance); | |
| - _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod | |
| - + k_a * I_a)*0.41f*nuance); | |
| - _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod | |
| - + k_a * I_a)*0.27f*nuance); | |
| - } | |
| - } | |
| -} | |
| - | |
| - | |
| -void cameraInit_cpu(float3 eye, float3 lookat, float imgw, float hw_ratio) | |
| -{ | |
| - // Image dimensions in world space (l, r, b, t) | |
| - float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.… | |
| - | |
| - // The view vector | |
| - float3 view = eye - lookat; | |
| - | |
| - // Construct the camera view orthonormal base | |
| - //float3 up = make_float3(0.0f, 1.0f, 0.0f); // Pointing upward along +y | |
| - float3 up = make_float3(0.0f, 0.0f, 1.0f); // Pointing upward along +z | |
| - float3 w = -view/length(view); // w: Pointing backwards | |
| - float3 u = cross(up, w) / length(cross(up, w)); | |
| - float3 v = cross(w, u); | |
| - | |
| - // Focal length 20% of eye vector length | |
| - float d = lengthf3(view)*0.8f; | |
| - | |
| - // Light direction (points towards light source) | |
| - float3 light = normalize(-1.0f*eye*make_float3(1.0f, 0.2f, 0.6f)); | |
| - | |
| - std::cout << " Transfering camera values to constant memory\n"; | |
| - | |
| - constc_u = u; | |
| - constc_v = v; | |
| - constc_w = w; | |
| - constc_eye = eye; | |
| - constc_imgplane = imgplane; | |
| - constc_d = d; | |
| - constc_light = light; | |
| - | |
| - std::cout << "Rendering image..."; | |
| -} | |
| - | |
| - | |
| -// Wrapper for the rt algorithm | |
| -int rt_cpu(float4* p, unsigned int np, | |
| - rgb* img, unsigned int width, unsigned int height, | |
| - f3 origo, f3 L, f3 eye, f3 lookat, float imgw) { | |
| - | |
| - using std::cout; | |
| - | |
| - cout << "Initializing CPU raytracer:\n"; | |
| - | |
| - // Initialize GPU timestamp recorders | |
| - float t1_go, t2_go, t1_stop, t2_stop; | |
| - | |
| - // Start timer 1 | |
| - t1_go = clock(); | |
| - | |
| - // Generate random nuance values for all grains | |
| - static float* _nuance; // Values between 0.5 and 1.0 | |
| - _nuance = new float[np]; | |
| - srand(0); // Initialize seed at same value at every run | |
| - for (unsigned int i=0; i<np; ++i) | |
| - _nuance[i] = (float)rand()/RAND_MAX * 0.5f + 0.5f; | |
| - | |
| - // Allocate memory | |
| - cout << " Allocating memory\n"; | |
| - static unsigned char *_img; // RGBw values in image | |
| - static float3* _ray_origo; // Ray origo (x,y,z) | |
| - static float3* _ray_direction; // Ray direction (x,y,z) | |
| - _img = new unsigned char[width*height*4]; | |
| - _ray_origo = new float3[width*height]; | |
| - _ray_direction = new float3[width*height]; | |
| - | |
| - // Arrange thread/block structure | |
| - unsigned int pixels = width*height; | |
| - float hw_ratio = (float)height/(float)width; | |
| - | |
| - // Start timer 2 | |
| - t2_go = clock(); | |
| - | |
| - // Initialize image to background color | |
| - imageInit_cpu(_img, pixels); | |
| - | |
| - // Initialize camera | |
| - cameraInit_cpu(make_float3(eye.x, eye.y, eye.z), | |
| - make_float3(lookat.x, lookat.y, lookat.z), | |
| - imgw, hw_ratio); | |
| - | |
| - // Construct rays for perspective projection | |
| - rayInitPerspective_cpu( | |
| - _ray_origo, _ray_direction, | |
| - make_float3(eye.x, eye.y, eye.z), | |
| - width, height); | |
| - | |
| - // Find closest intersection between rays and spheres | |
| - rayIntersectSpheres_cpu( | |
| - _ray_origo, _ray_direction, | |
| - p, _nuance, _img, pixels, np); | |
| - | |
| - // Stop timer 2 | |
| - t2_stop = clock(); | |
| - | |
| - memcpy(img, _img, sizeof(unsigned char)*pixels*4); | |
| - | |
| - // Free dynamically allocated device memory | |
| - delete [] _nuance; | |
| - delete [] _img; | |
| - delete [] _ray_origo; | |
| - delete [] _ray_direction; | |
| - | |
| - // Stop timer 1 | |
| - t1_stop = clock(); | |
| - | |
| - // Report time spent | |
| - cout << " done.\n" | |
| - << " Time spent on entire CPU raytracing routine: " | |
| - << (t1_stop-t1_go)/CLOCKS_PER_SEC*1000.0 << " ms\n"; | |
| - cout << " - Functions: " << (t2_stop-t2_go)/CLOCKS_PER_SEC*1000.0 << " ms\n… | |
| - | |
| - // Return successfully | |
| - return 0; | |
| -} | |
| diff --git a/raytracer/rt-kernel-cpu.h b/raytracer/rt-kernel-cpu.h | |
| t@@ -1,14 +0,0 @@ | |
| -#ifndef RT_KERNEL_CPU_H_ | |
| -#define RT_KERNEL_CPU_H_ | |
| - | |
| -#include <vector_functions.h> | |
| - | |
| -// Host prototype functions | |
| - | |
| -void cameraInit(float3 eye, float3 lookat, float imgw, float hw_ratio); | |
| - | |
| -int rt_cpu(float4* p, const unsigned int np, | |
| - rgb* img, const unsigned int width, const unsigned int height, | |
| - f3 origo, f3 L, f3 eye, f3 lookat, float imgw); | |
| - | |
| -#endif | |
| diff --git a/raytracer/rt-kernel.cu b/raytracer/rt-kernel.cu | |
| t@@ -1,454 +0,0 @@ | |
| -#include <iostream> | |
| -#include <cutil_math.h> | |
| -#include "header.h" | |
| -#include "rt-kernel.h" | |
| -#include "colorbar.h" | |
| - | |
| -unsigned int iDivUp (unsigned int a, unsigned int b) { | |
| - return (a % b != 0) ? (a / b + 1) : (a / b); | |
| -} | |
| - | |
| -__inline__ __host__ __device__ float3 f4_to_f3(float4 in) | |
| -{ | |
| - return make_float3(in.x, in.y, in.z); | |
| -} | |
| - | |
| -__inline__ __host__ __device__ float4 f3_to_f4(float3 in) | |
| -{ | |
| - return make_float4(in.x, in.y, in.z, 0.0f); | |
| -} | |
| - | |
| -// Kernel for initializing image data | |
| -__global__ void imageInit(unsigned char* _img, unsigned int pixels) | |
| -{ | |
| - // Compute pixel position from threadIdx/blockIdx | |
| - unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
| - if (mempos > pixels) | |
| - return; | |
| - | |
| - _img[mempos*4] = 255; // Red channel | |
| - _img[mempos*4 + 1] = 255; // Green channel | |
| - _img[mempos*4 + 2] = 255; // Blue channel | |
| -} | |
| - | |
| -// Calculate ray origins and directions | |
| -__global__ void rayInitPerspective(float4* _ray_origo, | |
| - float4* _ray_direction, | |
| - float4 eye, | |
| - unsigned int width, | |
| - unsigned int height) | |
| -{ | |
| - // Compute pixel position from threadIdx/blockIdx | |
| - unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
| - if (mempos > width*height) | |
| - return; | |
| - | |
| - // Calculate 2D position from linear index | |
| - unsigned int i = mempos % width; | |
| - unsigned int j = (int)floor((float)mempos/width) % width; | |
| - | |
| - // Calculate pixel coordinates in image plane | |
| - float p_u = const_imgplane.x + (const_imgplane.y - const_imgplane.x) | |
| - * (i + 0.5f) / width; | |
| - float p_v = const_imgplane.z + (const_imgplane.w - const_imgplane.z) | |
| - * (j + 0.5f) / height; | |
| - | |
| - // Write ray origo and direction to global memory | |
| - _ray_origo[mempos] = const_eye; | |
| - _ray_direction[mempos] = -const_d*const_w + p_u*const_u + p_v*const_v; | |
| -} | |
| - | |
| -// Check wether the pixel's viewing ray intersects with the spheres, | |
| -// and shade the pixel correspondingly | |
| -__global__ void rayIntersectSpheres(float4* _ray_origo, | |
| - float4* _ray_direction, | |
| - float4* _p, | |
| - unsigned char* _img) | |
| -{ | |
| - // Compute pixel position from threadIdx/blockIdx | |
| - unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
| - if (mempos > const_pixels) | |
| - return; | |
| - | |
| - // Read ray data from global memory | |
| - float3 e = f4_to_f3(_ray_origo[mempos]); | |
| - float3 d = f4_to_f3(_ray_direction[mempos]); | |
| - //float step = length(d); | |
| - | |
| - // Distance, in ray steps, between object and eye initialized with a large v… | |
| - float tdist = 1e10f; | |
| - | |
| - // Surface normal at closest sphere intersection | |
| - float3 n; | |
| - | |
| - // Intersection point coordinates | |
| - float3 p; | |
| - | |
| - // Iterate through all particles | |
| - for (Inttype i=0; i<const_np; ++i) { | |
| - | |
| - // Read sphere coordinate and radius | |
| - float3 c = f4_to_f3(_p[i]); | |
| - float R = _p[i].w; | |
| - | |
| - // Calculate the discriminant: d = B^2 - 4AC | |
| - float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2 | |
| - - 4.0f*dot(d,d) // -4*A | |
| - * (dot((e-c),(e-c)) - R*R); // C | |
| - | |
| - // If the determinant is positive, there are two solutions | |
| - // One where the line enters the sphere, and one where it exits | |
| - if (Delta > 0.0f) { | |
| - | |
| - // Calculate roots, Shirley 2009 p. 77 | |
| - float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(… | |
| - * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d)); | |
| - | |
| - // Check wether intersection is closer than previous values | |
| - if (fabs(t_minus) < tdist) { | |
| - p = e + t_minus*d; | |
| - tdist = fabs(t_minus); | |
| - n = normalize(2.0f * (p - c)); // Surface normal | |
| - } | |
| - | |
| - } // End of solution branch | |
| - | |
| - } // End of particle loop | |
| - | |
| - // Write pixel color | |
| - if (tdist < 1e10f) { | |
| - | |
| - // Lambertian shading parameters | |
| - float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light))); | |
| - float I_d = 40.0f; // Light intensity | |
| - float k_d = 5.0f; // Diffuse coefficient | |
| - | |
| - // Ambient shading | |
| - float k_a = 10.0f; | |
| - float I_a = 5.0f; // 5.0 for black background | |
| - | |
| - // Write shading model values to pixel color channels | |
| - _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod | |
| - + k_a * I_a)*0.48f); | |
| - _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod | |
| - + k_a * I_a)*0.41f); | |
| - _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod | |
| - + k_a * I_a)*0.27f); | |
| - | |
| - } | |
| -} | |
| - | |
| -// Check wether the pixel's viewing ray intersects with the spheres, | |
| -// and shade the pixel correspondingly using a colormap | |
| -__global__ void rayIntersectSpheresColormap(float4* _ray_origo, | |
| - float4* _ray_direction, | |
| - float4* _p, | |
| - float* _fixvel, | |
| - float* _linarr, | |
| - float max_val, | |
| - unsigned char* _img) | |
| -{ | |
| - // Compute pixel position from threadIdx/blockIdx | |
| - unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
| - if (mempos > const_pixels) | |
| - return; | |
| - | |
| - // Read ray data from global memory | |
| - float3 e = f4_to_f3(_ray_origo[mempos]); | |
| - float3 d = f4_to_f3(_ray_direction[mempos]); | |
| - | |
| - // Distance, in ray steps, between object and eye initialized with a large v… | |
| - float tdist = 1e10f; | |
| - | |
| - // Surface normal at closest sphere intersection | |
| - float3 n; | |
| - | |
| - // Intersection point coordinates | |
| - float3 p; | |
| - | |
| - unsigned int hitidx; | |
| - | |
| - // Iterate through all particles | |
| - for (Inttype i=0; i<const_np; ++i) { | |
| - | |
| - // Read sphere coordinate and radius | |
| - float3 c = f4_to_f3(_p[i]); | |
| - float R = _p[i].w; | |
| - | |
| - // Calculate the discriminant: d = B^2 - 4AC | |
| - float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2 | |
| - - 4.0f*dot(d,d) // -4*A | |
| - * (dot((e-c),(e-c)) - R*R); // C | |
| - | |
| - // If the determinant is positive, there are two solutions | |
| - // One where the line enters the sphere, and one where it exits | |
| - if (Delta > 0.0f) { | |
| - | |
| - // Calculate roots, Shirley 2009 p. 77 | |
| - float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(… | |
| - * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d)); | |
| - | |
| - // Check wether intersection is closer than previous values | |
| - if (fabs(t_minus) < tdist) { | |
| - p = e + t_minus*d; | |
| - tdist = fabs(t_minus); | |
| - n = normalize(2.0f * (p - c)); // Surface normal | |
| - hitidx = i; | |
| - } | |
| - | |
| - } // End of solution branch | |
| - | |
| - } // End of particle loop | |
| - | |
| - // Write pixel color | |
| - if (tdist < 1e10) { | |
| - | |
| - // Fetch particle data used for color | |
| - float ratio = _linarr[hitidx] / max_val; | |
| - float fixvel = _fixvel[hitidx]; | |
| - | |
| - // Make sure the ratio doesn't exceed the 0.0-1.0 interval | |
| - if (ratio < 0.01f) | |
| - ratio = 0.01f; | |
| - if (ratio > 0.99f) | |
| - ratio = 0.99f; | |
| - | |
| - // Lambertian shading parameters | |
| - float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light))); | |
| - float I_d = 40.0f; // Light intensity | |
| - float k_d = 5.0f; // Diffuse coefficient | |
| - | |
| - // Ambient shading | |
| - float k_a = 10.0f; | |
| - float I_a = 5.0f; | |
| - | |
| - float redv = red(ratio); | |
| - float greenv = green(ratio); | |
| - float bluev = blue(ratio); | |
| - | |
| - // Make particle dark grey if the horizontal velocity is fixed | |
| - if (fixvel > 0.f) { | |
| - redv = 0.5; | |
| - greenv = 0.5; | |
| - bluev = 0.5; | |
| - } | |
| - | |
| - // Write shading model values to pixel color channels | |
| - _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod | |
| - + k_a * I_a)*redv); | |
| - _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod | |
| - + k_a * I_a)*greenv); | |
| - _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod | |
| - + k_a * I_a)*bluev); | |
| - } | |
| -} | |
| - | |
| -extern "C" | |
| -__host__ void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio, | |
| - unsigned int pixels, Inttype np) | |
| -{ | |
| - // Image dimensions in world space (l, r, b, t) | |
| - float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.… | |
| - | |
| - // The view vector | |
| - float4 view = eye - lookat; | |
| - | |
| - // Construct the camera view orthonormal base | |
| - //float4 up = make_float4(0.0f, 1.0f, 0.0f, 0.0f); // Pointing upward along… | |
| - float4 up = make_float4(0.0f, 0.0f, 1.0f, 0.0f); // Pointing upward along +z | |
| - float4 w = -view/length(view); // w: Pointing backwards | |
| - float4 u = make_float4(cross(make_float3(up.x, up.y, up.z), | |
| - make_float3(w.x, w.y, w.z)), 0.0f) | |
| - / length(cross(make_float3(up.x, up.y, up.z), make_float3(w.x, w.y… | |
| - float4 v = make_float4(cross(make_float3(w.x, w.y, w.z), make_float3(u.x, u.… | |
| - | |
| - // Focal length 20% of eye vector length | |
| - float d = length(view)*0.8f; | |
| - | |
| - // Light direction (points towards light source) | |
| - float4 light = normalize(-1.0f*eye*make_float4(1.0f, 0.2f, 0.6f, 0.0f)); | |
| - | |
| - std::cout << " Transfering camera values to constant memory\n"; | |
| - | |
| - cudaMemcpyToSymbol("const_u", &u, sizeof(u)); | |
| - cudaMemcpyToSymbol("const_v", &v, sizeof(v)); | |
| - cudaMemcpyToSymbol("const_w", &w, sizeof(w)); | |
| - cudaMemcpyToSymbol("const_eye", &eye, sizeof(eye)); | |
| - cudaMemcpyToSymbol("const_imgplane", &imgplane, sizeof(imgplane)); | |
| - cudaMemcpyToSymbol("const_d", &d, sizeof(d)); | |
| - cudaMemcpyToSymbol("const_light", &light, sizeof(light)); | |
| - cudaMemcpyToSymbol("const_pixels", &pixels, sizeof(pixels)); | |
| - cudaMemcpyToSymbol("const_np", &np, sizeof(np)); | |
| -} | |
| - | |
| -// Check for CUDA errors | |
| -extern "C" | |
| -__host__ void checkForCudaErrors(const char* checkpoint_description) | |
| -{ | |
| - cudaError_t err = cudaGetLastError(); | |
| - if (err != cudaSuccess) { | |
| - std::cout << "\nCuda error detected, checkpoint: " << checkpoint_descripti… | |
| - << "\nError string: " << cudaGetErrorString(err) << "\n"; | |
| - exit(EXIT_FAILURE); | |
| - } | |
| -} | |
| - | |
| - | |
| -// Wrapper for the rt kernel | |
| -extern "C" | |
| -__host__ int rt(float4* p, Inttype np, | |
| - rgb* img, unsigned int width, unsigned int height, | |
| - f3 origo, f3 L, f3 eye, f3 lookat, float imgw, | |
| - int visualize, float max_val, | |
| - float* fixvel, | |
| - float* xsum, | |
| - float* pres, | |
| - float* es_dot, | |
| - float* es, | |
| - float* vel) | |
| -{ | |
| - using std::cout; | |
| - | |
| - cout << "Initializing CUDA:\n"; | |
| - | |
| - // Initialize GPU timestamp recorders | |
| - float t1, t2; | |
| - cudaEvent_t t1_go, t2_go, t1_stop, t2_stop; | |
| - cudaEventCreate(&t1_go); | |
| - cudaEventCreate(&t2_go); | |
| - cudaEventCreate(&t2_stop); | |
| - cudaEventCreate(&t1_stop); | |
| - | |
| - // Start timer 1 | |
| - cudaEventRecord(t1_go, 0); | |
| - | |
| - // Allocate memory | |
| - cout << " Allocating device memory\n"; | |
| - static float4 *_p; // Particle positions (x,y,z) and … | |
| - static float *_fixvel; // Indicates whether a particle has a … | |
| - static float *_linarr; // Array for linear values to color t… | |
| - static unsigned char *_img; // RGBw values in image | |
| - static float4 *_ray_origo; // Ray origo (x,y,z) | |
| - static float4 *_ray_direction; // Ray direction (x,y,z) | |
| - cudaMalloc((void**)&_p, np*sizeof(float4)); | |
| - cudaMalloc((void**)&_fixvel, np*sizeof(float)); | |
| - cudaMalloc((void**)&_linarr, np*sizeof(float)); // 0 size if visualize = 0; | |
| - cudaMalloc((void**)&_img, width*height*4*sizeof(unsigned char)); | |
| - cudaMalloc((void**)&_ray_origo, width*height*sizeof(float4)); | |
| - cudaMalloc((void**)&_ray_direction, width*height*sizeof(float4)); | |
| - | |
| - // Transfer particle data | |
| - cout << " Transfering particle data: host -> device\n"; | |
| - cudaMemcpy(_p, p, np*sizeof(float4), cudaMemcpyHostToDevice); | |
| - cudaMemcpy(_fixvel, fixvel, np*sizeof(float), cudaMemcpyHostToDevice); | |
| - if (visualize == 1 || visualize == 4) | |
| - cudaMemcpy(_linarr, pres, np*sizeof(float), cudaMemcpyHostToDevice); | |
| - if (visualize == 2) | |
| - cudaMemcpy(_linarr, es_dot, np*sizeof(float), cudaMemcpyHostToDevice); | |
| - if (visualize == 3) | |
| - cudaMemcpy(_linarr, es, np*sizeof(float), cudaMemcpyHostToDevice); | |
| - if (visualize == 4) | |
| - cudaMemcpy(_linarr, vel, np*sizeof(float), cudaMemcpyHostToDevice); | |
| - if (visualize == 5) | |
| - cudaMemcpy(_linarr, xsum, np*sizeof(float), cudaMemcpyHostToDevice); | |
| - | |
| - // Check for errors after memory allocation | |
| - checkForCudaErrors("CUDA error after memory allocation"); | |
| - | |
| - // Arrange thread/block structure | |
| - unsigned int pixels = width*height; | |
| - float hw_ratio = (float)height/(float)width; | |
| - const unsigned int threadsPerBlock = 256; | |
| - const unsigned int blocksPerGrid = iDivUp(pixels, threadsPerBlock); | |
| - | |
| - // Start timer 2 | |
| - cudaEventRecord(t2_go, 0); | |
| - | |
| - // Initialize image to background color | |
| - imageInit<<< blocksPerGrid, threadsPerBlock >>>(_img, pixels); | |
| - | |
| - // Initialize camera | |
| - cameraInit(make_float4(eye.x, eye.y, eye.z, 0.0f), | |
| - make_float4(lookat.x, lookat.y, lookat.z, 0.0f), | |
| - imgw, hw_ratio, pixels, np); | |
| - checkForCudaErrors("CUDA error after cameraInit"); | |
| - | |
| - // Construct rays for perspective projection | |
| - rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>( | |
| - _ray_origo, _ray_direction, | |
| - make_float4(eye.x, eye.y, eye.z, 0.0f), | |
| - width, height); | |
| - | |
| - cudaThreadSynchronize(); | |
| - | |
| - // Find closest intersection between rays and spheres | |
| - if (visualize == 1) { // Visualize pressure | |
| - cout << " Pressure color map range: [0, " << max_val << "] Pa\n"; | |
| - rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
| - _ray_origo, _ray_direction, | |
| - _p, _fixvel, _linarr, max_val, _img); | |
| - } else if (visualize == 2) { // es_dot visualization | |
| - cout << " Shear heat production rate color map range: [0, " << max_val <<… | |
| - rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
| - _ray_origo, _ray_direction, | |
| - _p, _fixvel, _linarr, max_val, _img); | |
| - } else if (visualize == 3) { // es visualization | |
| - cout << " Total shear heat color map range: [0, " << max_val << "] J\n"; | |
| - rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
| - _ray_origo, _ray_direction, | |
| - _p, _fixvel, _linarr, max_val, _img); | |
| - } else if (visualize == 4) { // velocity visualization | |
| - cout << " Velocity color map range: [0, " << max_val << "] m/s\n"; | |
| - rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
| - _ray_origo, _ray_direction, | |
| - _p, _fixvel, _linarr, max_val, _img); | |
| - } else if (visualize == 5) { // xsum visualization | |
| - cout << " XSum color map range: [0, " << max_val << "] m\n"; | |
| - rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
| - _ray_origo, _ray_direction, | |
| - _p, _fixvel, _linarr, max_val, _img); | |
| - } else { // Normal visualization | |
| - rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>( | |
| - _ray_origo, _ray_direction, | |
| - _p, _img); | |
| - } | |
| - | |
| - // Make sure all threads are done before continuing CPU control sequence | |
| - cudaThreadSynchronize(); | |
| - | |
| - // Check for errors | |
| - checkForCudaErrors("CUDA error after kernel execution"); | |
| - | |
| - // Stop timer 2 | |
| - cudaEventRecord(t2_stop, 0); | |
| - cudaEventSynchronize(t2_stop); | |
| - | |
| - // Transfer image data from device to host | |
| - cout << " Transfering image data: device -> host\n"; | |
| - cudaMemcpy(img, _img, width*height*4*sizeof(unsigned char), cudaMemcpyDevice… | |
| - | |
| - // Free dynamically allocated device memory | |
| - cudaFree(_p); | |
| - cudaFree(_fixvel); | |
| - cudaFree(_linarr); | |
| - cudaFree(_img); | |
| - cudaFree(_ray_origo); | |
| - cudaFree(_ray_direction); | |
| - | |
| - // Stop timer 1 | |
| - cudaEventRecord(t1_stop, 0); | |
| - cudaEventSynchronize(t1_stop); | |
| - | |
| - // Calculate time spent in t1 and t2 | |
| - cudaEventElapsedTime(&t1, t1_go, t1_stop); | |
| - cudaEventElapsedTime(&t2, t2_go, t2_stop); | |
| - | |
| - // Report time spent | |
| - cout << " Time spent on entire GPU routine: " | |
| - << t1 << " ms\n"; | |
| - cout << " - Kernels: " << t2 << " ms\n" | |
| - << " - Memory alloc. and transfer: " << t1-t2 << " ms\n"; | |
| - | |
| - // Return successfully | |
| - return 0; | |
| -} | |
| diff --git a/raytracer/rt-kernel.h b/raytracer/rt-kernel.h | |
| t@@ -1,35 +0,0 @@ | |
| -#ifndef RT_KERNEL_H_ | |
| -#define RT_KERNEL_H_ | |
| - | |
| -#include <vector_functions.h> | |
| -#include "../src/datatypes.h" | |
| -#include "header.h" | |
| - | |
| -// Constants | |
| -__constant__ float4 const_u; | |
| -__constant__ float4 const_v; | |
| -__constant__ float4 const_w; | |
| -__constant__ float4 const_eye; | |
| -__constant__ float4 const_imgplane; | |
| -__constant__ float const_d; | |
| -__constant__ float4 const_light; | |
| -__constant__ unsigned int const_pixels; | |
| -__constant__ Inttype const_np; | |
| - | |
| - | |
| -// Host prototype functions | |
| - | |
| -extern "C" | |
| -void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio, | |
| - unsigned int pixels, Inttype np); | |
| - | |
| -extern "C" | |
| -void checkForCudaErrors(const char* checkpoint_description); | |
| - | |
| -extern "C" | |
| -int rt(float4* p, Inttype np, | |
| - rgb* img, unsigned int width, unsigned int height, | |
| - f3 origo, f3 L, f3 eye, f3 lookat, float imgw, | |
| - int visualize, float max_val, | |
| - float* fixvel, float* xsum, float* pres, float* es_dot, float* es, floa… | |
| -#endif | |
| diff --git a/raytracer/rt_GPU_init_pres.sh b/raytracer/rt_GPU_init_pres.sh | |
| t@@ -1,36 +0,0 @@ | |
| -#!/bin/bash | |
| - | |
| -# Pass max. pressure as command line argument | |
| - | |
| -XRES=400 | |
| -YRES=1600 | |
| - | |
| -WORKHORSE=GPU | |
| - | |
| -echo "# Rendering PPM images" | |
| -for F in ../output/*init*.bin | |
| -do | |
| - BASE=`basename $F` | |
| - OUTFILE=$BASE.ppm.jpg | |
| - if [ -e ../img_out/$OUTFILE ] | |
| - then | |
| - echo "$OUTFILE exists." > /dev/null | |
| - else | |
| - ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null | |
| - if [ $? -ne 0 ] ; then | |
| - echo "Error rendering $F, trying again..." | |
| - ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/n… | |
| - fi | |
| - fi | |
| -done | |
| - | |
| -echo "# Converting PPM files to JPEG using ImageMagick in parallel" | |
| -for F in ../img_out/*.ppm | |
| -do | |
| - (BASE=`basename $F`; convert $F $F.jpg > /dev/null &) | |
| -done | |
| - | |
| -sleep 5 | |
| -echo "# Removing temporary PPM files" | |
| -rm ../img_out/*.ppm | |
| - | |
| diff --git a/raytracer/rt_GPU_pres.sh b/raytracer/rt_GPU_pres.sh | |
| t@@ -1,36 +0,0 @@ | |
| -#!/bin/bash | |
| - | |
| -# Pass max. pressure as command line argument | |
| - | |
| -XRES=800 | |
| -YRES=800 | |
| - | |
| -WORKHORSE=GPU | |
| - | |
| -echo "# Rendering PPM images" | |
| -for F in ../output/*.bin | |
| -do | |
| - BASE=`basename $F` | |
| - OUTFILE=$BASE.ppm.jpg | |
| - if [ -e ../img_out/$OUTFILE ] | |
| - then | |
| - echo "$OUTFILE exists." > /dev/null | |
| - else | |
| - ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null | |
| - if [ $? -ne 0 ] ; then | |
| - echo "Error rendering $F, trying again..." | |
| - ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/n… | |
| - fi | |
| - fi | |
| -done | |
| - | |
| -echo "# Converting PPM files to JPEG using ImageMagick in parallel" | |
| -for F in ../img_out/*.ppm | |
| -do | |
| - (BASE=`basename $F`; convert $F $F.jpg > /dev/null &) | |
| -done | |
| - | |
| -sleep 5 | |
| -echo "# Removing temporary PPM files" | |
| -rm ../img_out/*.ppm | |
| - | |
| diff --git a/raytracer/rt_GPU_tall_pres.sh b/raytracer/rt_GPU_tall_pres.sh | |
| t@@ -1,36 +0,0 @@ | |
| -#!/bin/bash | |
| - | |
| -# Pass max. pressure as command line argument | |
| - | |
| -XRES=800 | |
| -YRES=1200 | |
| - | |
| -WORKHORSE=GPU | |
| - | |
| -echo "# Rendering PPM images" | |
| -for F in ../output/*.bin | |
| -do | |
| - BASE=`basename $F` | |
| - OUTFILE=$BASE.ppm.jpg | |
| - if [ -e ../img_out/$OUTFILE ] | |
| - then | |
| - echo "$OUTFILE exists." > /dev/null | |
| - else | |
| - ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null | |
| - if [ $? -ne 0 ] ; then | |
| - echo "Error rendering $F, trying again..." | |
| - ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/n… | |
| - fi | |
| - fi | |
| -done | |
| - | |
| -echo "# Converting PPM files to JPEG using ImageMagick in parallel" | |
| -for F in ../img_out/*.ppm | |
| -do | |
| - (BASE=`basename $F`; convert $F $F.jpg > /dev/null &) | |
| -done | |
| - | |
| -sleep 5 | |
| -echo "# Removing temporary PPM files" | |
| -rm ../img_out/*.ppm | |
| - |