tRemoved legacy raytracer folder, moved rt doc into main doc - sphere - GPU-bas… | |
git clone git://src.adamsgaard.dk/sphere | |
Log | |
Files | |
Refs | |
LICENSE | |
--- | |
commit d30540ae62abcfd984b3eda396cf534e72aaad66 | |
parent 5cd210a9136c5c2dfee90711cedf294fc6f0484e | |
Author: Anders Damsgaard <[email protected]> | |
Date: Tue, 18 Dec 2012 13:58:29 +0100 | |
Removed legacy raytracer folder, moved rt doc into main doc | |
Diffstat: | |
R raytracer/doc/IEEEtran.cls -> doc/… | 0 | |
R raytracer/doc/np-performance.pdf -… | 0 | |
R raytracer/doc/np-performance.txt -… | 0 | |
R raytracer/doc/np500.png -> doc/ray… | 0 | |
R raytracer/doc/np50000.png -> doc/r… | 0 | |
R raytracer/doc/px-performance.pdf -… | 0 | |
R raytracer/doc/px-performance.txt -… | 0 | |
A doc/raytracer/raytracer.pdf | 0 | |
A doc/raytracer/raytracer.tex | 264 +++++++++++++++++++++++++++++… | |
D raytracer/Makefile | 101 -----------------------------… | |
D raytracer/README | 43 ------------------------------ | |
D raytracer/colorbar.h | 22 ---------------------- | |
D raytracer/doc/.DS_Store | 0 | |
D raytracer/doc/.raytracer.tex.swp | 0 | |
D raytracer/doc/bloopers/black.png | 0 | |
D raytracer/doc/bloopers/colorful.png | 0 | |
D raytracer/doc/bloopers/cpu_wrong_c… | 0 | |
D raytracer/doc/bloopers/twisted.png | 0 | |
D raytracer/doc/bloopers/wrong_order… | 0 | |
D raytracer/doc/raytracer.pdf | 0 | |
D raytracer/doc/raytracer.tex | 317 -----------------------------… | |
D raytracer/header.h | 26 -------------------------- | |
D raytracer/main.cpp | 264 -----------------------------… | |
D raytracer/o-ppm.cpp | 30 ------------------------------ | |
D raytracer/o-ppm.h | 7 ------- | |
D raytracer/render_all_outputs_CPU.sh | 27 --------------------------- | |
D raytracer/render_all_outputs_GPU.sh | 29 ----------------------------- | |
D raytracer/rt-kernel-cpu.cpp | 273 -----------------------------… | |
D raytracer/rt-kernel-cpu.h | 14 -------------- | |
D raytracer/rt-kernel.cu | 454 -----------------------------… | |
D raytracer/rt-kernel.h | 35 -----------------------------… | |
D raytracer/rt_GPU_init_pres.sh | 36 -----------------------------… | |
D raytracer/rt_GPU_pres.sh | 36 -----------------------------… | |
D raytracer/rt_GPU_tall_pres.sh | 36 -----------------------------… | |
34 files changed, 264 insertions(+), 1750 deletions(-) | |
--- | |
diff --git a/raytracer/doc/IEEEtran.cls b/doc/raytracer/IEEEtran.cls | |
diff --git a/raytracer/doc/np-performance.pdf b/doc/raytracer/np-performance.pdf | |
Binary files differ. | |
diff --git a/raytracer/doc/np-performance.txt b/doc/raytracer/np-performance.txt | |
diff --git a/raytracer/doc/np500.png b/doc/raytracer/np500.png | |
Binary files differ. | |
diff --git a/raytracer/doc/np50000.png b/doc/raytracer/np50000.png | |
Binary files differ. | |
diff --git a/raytracer/doc/px-performance.pdf b/doc/raytracer/px-performance.pdf | |
Binary files differ. | |
diff --git a/raytracer/doc/px-performance.txt b/doc/raytracer/px-performance.txt | |
diff --git a/doc/raytracer/raytracer.pdf b/doc/raytracer/raytracer.pdf | |
Binary files differ. | |
diff --git a/doc/raytracer/raytracer.tex b/doc/raytracer/raytracer.tex | |
t@@ -0,0 +1,264 @@ | |
+\documentclass[journal]{IEEEtran} | |
+ | |
+ | |
+\ifCLASSINFOpdf | |
+ \usepackage[pdftex]{graphicx} | |
+\else | |
+ \usepackage[dvips]{graphicx} | |
+\fi | |
+ | |
+\usepackage{color} | |
+ | |
+\usepackage{url} | |
+%\usepackage{showkeys} | |
+\usepackage{siunitx} | |
+ | |
+% correct bad hyphenation here | |
+\hyphenation{op-tical net-works semi-conduc-tor} | |
+ | |
+\definecolor{DarkGreen}{rgb}{0,0.45,0.08} | |
+\usepackage{listings} | |
+\lstset{language=C++,breaklines=true,breakatwhitespace=true,basicstyle=\small,… | |
+ | |
+ | |
+\begin{document} | |
+\title{CUDA raytracing algorithm for visualizing\\ discrete element model outp… | |
+ | |
+\author{Anders Damsgaard Christensen,~\IEEEmembership{20062213} | |
+\thanks{Contact: {[email protected]}}% | |
+\thanks{Webpage: {http://users-cs.au.dk/adc}}% | |
+\thanks{Manuscript, last revision: \today.}} | |
+ | |
+% The paper headers | |
+\markboth{Data Parallel Computing 2011}{Christensen: CUDA raytracing algorithm… | |
+ | |
+% make the title area | |
+\maketitle | |
+ | |
+ | |
+\begin{abstract} | |
+%\boldmath | |
+A raytracing algorithm is constructed using the CUDA API for visualizing outpu… | |
+The raytracing algorithm is optimized with constant memory and compilation fla… | |
+\end{abstract} | |
+\begin{IEEEkeywords} | |
+CUDA, discrete element method, raytracing | |
+\end{IEEEkeywords} | |
+ | |
+ | |
+\IEEEpeerreviewmaketitle | |
+ | |
+ | |
+\section{Introduction} | |
+\IEEEPARstart{V}{isualizing} systems containing many spheres using traditional… | |
+ | |
+Previous studies of GPU or CUDA implementations of ray tracing algorithms repo… | |
+ | |
+\subsection{Discrete Element Method} | |
+The input particle data to the raytracer is the output of a custom CUDA-based … | |
+ | |
+The discrete element method (DEM) is a subtype of molecular dynamics (MD), and… | |
+ | |
+\subsection{Application usage} | |
+The CUDA DEM application is a command line executable, and writes updated part… | |
+ | |
+Both the CUDA DEM and raytracing applications are open-source\footnote{\url{ht… | |
+ | |
+This document consists of a short introduction to the basic mathematics behind… | |
+ | |
+ | |
+\section{Ray tracing algorithm} | |
+The goal of the ray tracing algorithm is to compute the shading of each pixel … | |
+\begin{lstlisting}[basicstyle=\footnotesize] | |
+for each pixel do | |
+ compute viewing ray origin and direction | |
+ iterate through objects and find the closest hit | |
+ set pixel color to value computed from hit point, light, n | |
+\end{lstlisting} | |
+The implemented code does not utilize recursive rays, since the modeled materi… | |
+ | |
+\subsection{Ray generation} | |
+The rays are in vector form defined as: | |
+\begin{equation} | |
+ \mathbf{p}(t) = \mathbf{e} + t(\mathbf{s} - \mathbf{e}) | |
+ \label{eq:raygen} | |
+\end{equation} | |
+The perspective can be either \emph{orthograpic}, where all viewing rays have … | |
+ | |
+The ray origin $\mathbf{e}$ is the position of the eye, and is constant. The d… | |
+\begin{equation} | |
+ \mathbf{s} - \mathbf{e} = -d \mathbf{w} + u \mathbf{u} + v \mathbf{v} | |
+ \label{eq:raydirection} | |
+\end{equation} | |
+where $\{\mathbf{u},\mathbf{v},\mathbf{w}\}$ are the orthonormal bases of the … | |
+\[ u = l + (r-l)(i+0.5)/n \] | |
+\[ v = b + (t-b)(j+0.5)/m \] | |
+where $l$, $r$, $t$ and $b$ are the positions of the image borders (left, righ… | |
+ | |
+ | |
+\subsection{Ray-sphere intersection} | |
+Given a sphere with a center $\mathbf{c}$, and radius $R$, a equation can be c… | |
+\begin{equation} | |
+ (\mathbf{p} - \mathbf{c}) \cdot (\mathbf{p} - \mathbf{c}) - R^2 = 0 | |
+ \label{eq:sphere} | |
+\end{equation} | |
+By substituting the points $\mathbf{p}$ with ray equation \ref{eq:raygen}, and… | |
+\begin{equation} | |
+ (\mathbf{d}\cdot\mathbf{d})t^2 + 2\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})t | |
+ + (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2 = 0 | |
+ \label{eq:quad} | |
+\end{equation} | |
+The number of ray steps $t$ is the only unknown, so the number of intersection… | |
+\begin{equation} | |
+ \Delta = (2(\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})))^2 - 4(\mathbf{d}\cdot\m… | |
+ \label{eq:Delta} | |
+\end{equation} | |
+A negative value denotes no intersection between the sphere and the ray, a val… | |
+\begin{equation} | |
+ t = \frac{-\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}) \pm \sqrt{\eta} } { (\math… | |
+ \label{eq:intersect} | |
+\end{equation} | |
+where | |
+\[ \eta = (\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}))^2 - (\mathbf{d}\cdot\mathbf… | |
+Only the smallest intersection ($t_\text{minus}$) is calculated, since this ma… | |
+\begin{equation} | |
+ \mathbf{p} = \mathbf{e} + t_\text{minus} \mathbf{d} | |
+ \label{eq:intersection} | |
+\end{equation} | |
+\begin{equation} | |
+ \mathbf{n} = 2(\mathbf{p}-\mathbf{c}) | |
+ \label{eq:normal} | |
+\end{equation} | |
+The intersection distance in vector steps ($t_\text{minus}$) is saved in order… | |
+ | |
+\subsection{Pixel shading} | |
+The pixel is shaded using \emph{Lambertian} shading \cite{Shirley:2009}, where… | |
+\begin{equation} | |
+ L = k_a I_a + k_d I_d \max(0,(\mathbf{n}\cdot\mathbf{l})) | |
+ \label{eq:shading} | |
+\end{equation} | |
+where the $a$ and $d$ subscripts denote the ambient and diffusive (Lambertian)… | |
+ | |
+ | |
+\subsection{Computational implementation} | |
+The above routines were first implemented in CUDA for device execution, and af… | |
+ | |
+ | |
+\section{CUDA implementation} | |
+When constructing the algorithm for execution on the GPGPU device, the data-pa… | |
+ | |
+The application starts by reading the discrete element method data from a cust… | |
+Divergent branches in the kernel code were avoided as much as possible \cite{N… | |
+ | |
+\begin{figure}[!t] | |
+\centering | |
+\includegraphics[width=0.47\textwidth]{np500.png} | |
+\caption{Sample output of GPU raytracer rendering of \num{512} particles.} | |
+\label{fig:np500.png} | |
+\end{figure} | |
+ | |
+ | |
+\begin{figure}[!t] | |
+\centering | |
+\includegraphics[width=0.47\textwidth]{np50000.png} | |
+\caption{Sample output of GPU raytracer rendering of \num{50653} particles.} | |
+\label{fig:np50000.png} | |
+\end{figure} | |
+ | |
+The algorithm starts by allocating memory on the device for the particle data,… | |
+ | |
+All pixel values are initialized to $[R,G,B] = [0,0,0]$, which serves as the i… | |
+ | |
+After all pixel values have been computed, the image data is transfered back t… | |
+ | |
+\subsection{Thread and block layout} | |
+The thread/block layout passed during kernel launches is arranged in the follo… | |
+\begin{lstlisting} | |
+ dim3 threads(16, 16); | |
+ dim3 blocks((width+15)/16, (height+15)/16); | |
+\end{lstlisting} | |
+The image pixel position of the thread can be determined from the thread- and … | |
+\begin{lstlisting} | |
+ int i = threadIdx.x + blockIdx.x * blockDim.x; | |
+ int j = threadIdx.y + blockIdx.y * blockDim.y; | |
+ unsigned int mempos = x + y * blockDim.x * gridDim.x; | |
+ if (mempos > pixels) | |
+ return; | |
+\end{lstlisting} | |
+The linear memory position (\texttt{mempos}) is used as the index when reading… | |
+ | |
+\subsection{Image output} | |
+After completing all pixel shading computations on the device, the image data … | |
+ | |
+\subsection{Performance} | |
+Since this simple raytracing algorithm generates a single non-recursive ray fo… | |
+ | |
+The data transfer between the host and device is kept at a bare minimum, as th… | |
+ | |
+The host execution time was profiled using a \texttt{clock()} based CPU timer … | |
+ | |
+The device execution time was profiled using two \texttt{cudaEvent\_t} timers,… | |
+ | |
+Figures \ref{fig:np-performance.pdf} and \ref{fig:px-performance.pdf} show the… | |
+ | |
+\begin{figure}[!t] | |
+\centering | |
+\includegraphics[width=0.47\textwidth]{np-performance.pdf} | |
+\caption{Performance scaling with varying particle numbers at image dimensions… | |
+\label{fig:np-performance.pdf} | |
+\end{figure} | |
+ | |
+\begin{figure}[!t] | |
+\centering | |
+\includegraphics[width=0.47\textwidth]{px-performance.pdf} | |
+\caption{Performance scaling with varying image dimensions ($n \times m$) wit… | |
+\label{fig:px-performance.pdf} | |
+\end{figure} | |
+ | |
+The device memory allocation and data transfer was also profiled, and turns ou… | |
+ | |
+The CPU time spent in the host kernels proves to be linear with the particle n… | |
+ | |
+The ratio between CPU computational times and the sum of the device kernel exe… | |
+ | |
+As the number of particles are not known by compilation, it is not possible to… | |
+ | |
+Previous GPU implementations often rely on k-D trees, constructed as an sortin… | |
+ | |
+\section{Conclusion} | |
+This document presented the implementation of a basic ray tracing algorithm, u… | |
+Performance tests showed the expected, linear correlation between image dimens… | |
+ | |
+The final product will come into good use during further development and compl… | |
+ | |
+\bibliographystyle{IEEEtran} | |
+\bibliography{IEEEabrv,/Users/adc/Documents/University/BIB} | |
+ | |
+ | |
+ | |
+\appendices | |
+ | |
+\section{Test environment} | |
+The raytracing algorithm was developed, tested and profiled on a mid 2010 Mac … | |
+ | |
+The CUDA source code was compiled with \texttt{nvcc}, and linked to \texttt{g+… | |
+\begin{lstlisting} | |
+g++ -c -Wall -O3 -arch x86_64 -fopenmp ... | |
+nvcc -c -use_fast_math -gencode arch=compute_20,code=sm_20 -m64 ... | |
+g++ -arch x86_64 -lcuda -lcudart -fopenmp *.o -o rt | |
+\end{lstlisting} | |
+When profiling device code performance, the application was executed two times… | |
+ | |
+The host system was measured to have a memory bandwidth of 4642.1 MB/s when tr… | |
+ | |
+\ifCLASSOPTIONcaptionsoff | |
+ \newpage | |
+\fi | |
+ | |
+ | |
+ | |
+ | |
+% that's all folks | |
+\end{document} | |
+ | |
+ | |
diff --git a/raytracer/Makefile b/raytracer/Makefile | |
t@@ -1,101 +0,0 @@ | |
-NVCC=nvcc | |
-CC=g++ | |
-LD=g++ | |
- | |
-CCFLAGS=-c -Wall -O3 -fopenmp | |
-LDFLAGS=-fopenmp | |
- | |
-# Verbose compile? | |
-#NVCCFLAGS+=--verbose | |
- | |
-# Profile code? | |
-#NVCCFLAGS+=-pg | |
- | |
-# Debugable code? | |
-#CCFLAGS+=-g | |
-#NVCCFLAGS+=-g -G | |
- | |
-CCFILES=main.cpp o-ppm.cpp rt-kernel-cpu.cpp | |
-CUFILES=rt-kernel.cu | |
-CCOBJECTS=$(CCFILES:.cpp=.o) | |
-CUOBJECTS=$(CUFILES:.cu=.o) | |
-OBJECTS=$(CCOBJECTS) $(CUOBJECTS) | |
- | |
-EXECUTABLE=rt | |
- | |
-# NVCC flags | |
-NVCCFLAGS+=-use_fast_math -gencode arch=compute_20,code=sm_20 | |
- | |
-# Generate device code | |
-#NVCCFLAGS+=--export-dir=$(EXECUTABLE).devcode | |
- | |
-INCLUDES = -I. -I../libs -I/usr/local/cuda/include | |
-LIBS = -L/usr/local/cuda/lib64 -L/usr/local/cuda/lib -lcudart -lcuda | |
-#GL_LIBS = -lGLEW | |
- | |
-# detect OS | |
-OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) | |
-# 'linux' is output for Linux system, 'darwin' for OS X | |
-DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) | |
-ifneq ($(DARWIN),) | |
- #INCLUDES += -I/opt/local/include | |
- #LIBS += -L/opt/local/lib | |
- #GL_LIBS += -framework glut -framework OpenGL | |
- CUDA_SDK=/Developer/GPU\ Computing/C | |
-else | |
- INCLUDES += -I/usr/include -I/usr/local/include | |
- LIBS += -L/usr/lib -L/usr/local/lib | |
- #GL_LIBS += -L/usr/X11/lib -lglut -lGL -lGLU | |
- CUDA_SDK=$(HOME)/NVIDIA_GPU_Computing_SDK/C | |
-endif | |
- | |
-# detect OS | |
-ARCHUPPER = $(shell uname -m 2>/dev/null | tr [:lower:] [:upper:]) | |
-# 'linux' is output for Linux system, 'darwin' for OS X | |
-ARCH := $(strip $(findstring X86_64, $(ARCHUPPER))) | |
-ifneq ($(ARCH),) | |
- LIB_ARCH = x86_64 | |
- NVCCFLAGS += -m64 | |
- ifneq ($(DARWIN),) | |
- CXX_ARCH_FLAGS += -arch x86_64 | |
- else | |
- CXX_ARCH_FLAGS += -m64 | |
- endif | |
-else | |
- LIB_ARCH = i386 | |
- NVCCFLAGS += -m32 | |
- ifneq ($(DARWIN),) | |
- CXX_ARCH_FLAGS += -arch i386 | |
- else | |
- CXX_ARCH_FLAGS += -m32 | |
- endif | |
-endif | |
- | |
- | |
-INCLUDES += -I$(CUDA_SDK)/common/inc | |
-#LIBS += -L$(CUDA_SDK)/lib #-lcutil_$(LIB_ARCH) | |
-LIBS += -L$(CUDA_SDK)/lib -lcutil_$(LIB_ARCH) | |
- | |
-$(EXECUTABLE): $(OBJECTS) | |
- $(LD) $(CXX_ARCH_FLAGS) $(LDFLAGS) $(LIBS) $(OBJECTS) -o $(EXECUTABLE) | |
- | |
-main.o: main.cpp | |
- $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ | |
- | |
-o-ppm.o: o-ppm.cpp | |
- $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ | |
- | |
-rt-kernel-cpu.o: rt-kernel-cpu.cpp | |
- $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ | |
- | |
-rt-kernel.o: rt-kernel.cu | |
- $(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@ | |
- | |
-clean: | |
- rm -f doc/*.{log,aux,bbl,blg,out,gz} | |
- rm -f *.o *.ppm *.png | |
- rm -f $(EXECUTABLE) | |
- | |
-edit: | |
- vim -p Makefile $(CCFILES) $(CUFILES) *.h | |
- | |
diff --git a/raytracer/README b/raytracer/README | |
t@@ -1,43 +0,0 @@ | |
------------------------------------------------------------------- | |
- This is the README file for the SPHERE ray tracing module | |
- Last revision: May. 18., 2012 | |
------------------------------------------------------------------- | |
- | |
-BACKGROUND INFORMATION | |
-====================== | |
- | |
-The ray tracing module was initially developed for the Sphere DEM | |
-software: http://github.com/anders-dc/sphere | |
- | |
-This is a port for visualizing simulation output from the | |
-SPHERE discrete element method software. | |
- | |
-It is developed by Anders Damsgaard Christensen, [email protected], and | |
-is licenced under the GNU General Public Licence (GPL). | |
- | |
-Documentation can be found in the file doc/raytracer.pdf | |
- | |
- | |
-REQUIREMENTS | |
-============ | |
- | |
-For compiling the raytracer, the following software is required: | |
- - GNU compiler collection | |
- - CUDA developer drivers | |
- - CUDA toolkit | |
-When executing the compiled code, a CUDA compatible GPU is | |
-necessary, along with appropriate developer device drivers. | |
- | |
- | |
-USAGE INSTRUCTIONS | |
-================== | |
- | |
-The source code is built using GNU make with the command: | |
- ./make | |
- | |
-A sample run can subsequently be started with: | |
- ./make run | |
- | |
-Temporary files can be removed with: | |
- ./make clean | |
- | |
diff --git a/raytracer/colorbar.h b/raytracer/colorbar.h | |
t@@ -1,22 +0,0 @@ | |
-#ifndef COLORBAR_H_ | |
-#define COLORBAR_H_ | |
- | |
-// Functions that determine red-, green- and blue color components | |
-// in a blue-white-red colormap. Ratio should be between 0.0-1.0. | |
- | |
-__inline__ __host__ __device__ float red(float ratio) | |
-{ | |
- return fmin(1.0f, 0.209f*ratio*ratio*ratio - 2.49f*ratio*ratio + 3.0f*ratio … | |
-}; | |
- | |
-__inline__ __host__ __device__ float green(float ratio) | |
-{ | |
- return fmin(1.0f, -2.44f*ratio*ratio + 2.15f*ratio + 0.369f); | |
-}; | |
- | |
-__inline__ __host__ __device__ float blue(float ratio) | |
-{ | |
- return fmin(1.0f, -2.21f*ratio*ratio + 1.61f*ratio + 0.573f); | |
-}; | |
- | |
-#endif | |
diff --git a/raytracer/doc/.DS_Store b/raytracer/doc/.DS_Store | |
Binary files differ. | |
diff --git a/raytracer/doc/.raytracer.tex.swp b/raytracer/doc/.raytracer.tex.swp | |
Binary files differ. | |
diff --git a/raytracer/doc/bloopers/black.png b/raytracer/doc/bloopers/black.png | |
Binary files differ. | |
diff --git a/raytracer/doc/bloopers/colorful.png b/raytracer/doc/bloopers/color… | |
Binary files differ. | |
diff --git a/raytracer/doc/bloopers/cpu_wrong_colorful.png b/raytracer/doc/bloo… | |
Binary files differ. | |
diff --git a/raytracer/doc/bloopers/twisted.png b/raytracer/doc/bloopers/twiste… | |
Binary files differ. | |
diff --git a/raytracer/doc/bloopers/wrong_order.png b/raytracer/doc/bloopers/wr… | |
Binary files differ. | |
diff --git a/raytracer/doc/raytracer.pdf b/raytracer/doc/raytracer.pdf | |
Binary files differ. | |
diff --git a/raytracer/doc/raytracer.tex b/raytracer/doc/raytracer.tex | |
t@@ -1,317 +0,0 @@ | |
-\documentclass[journal]{IEEEtran} | |
- | |
- | |
-\ifCLASSINFOpdf | |
- \usepackage[pdftex]{graphicx} | |
-\else | |
- \usepackage[dvips]{graphicx} | |
-\fi | |
- | |
-\usepackage{color} | |
- | |
-\usepackage{url} | |
-%\usepackage{showkeys} | |
-\usepackage{siunitx} | |
- | |
-% correct bad hyphenation here | |
-\hyphenation{op-tical net-works semi-conduc-tor} | |
- | |
-\definecolor{DarkGreen}{rgb}{0,0.45,0.08} | |
-\usepackage{listings} | |
-\lstset{language=C++,breaklines=true,breakatwhitespace=true,basicstyle=\small,… | |
- | |
- | |
-\begin{document} | |
-\title{CUDA raytracing algorithm for visualizing\\ discrete element model outp… | |
- | |
-\author{Anders Damsgaard Christensen,~\IEEEmembership{20062213} | |
-\thanks{Contact: {[email protected]}}% | |
-\thanks{Webpage: {http://users-cs.au.dk/adc}}% | |
-\thanks{Manuscript, last revision: \today.}} | |
- | |
-% The paper headers | |
-\markboth{Data Parallel Computing 2011}{Christensen: CUDA raytracing algorithm… | |
- | |
-% make the title area | |
-\maketitle | |
- | |
- | |
-\begin{abstract} | |
-%\boldmath | |
-A raytracing algorithm is constructed using the CUDA API for visualizing outpu… | |
-The raytracing algorithm is optimized with constant memory and compilation fla… | |
-\end{abstract} | |
-\begin{IEEEkeywords} | |
-CUDA, discrete element method, raytracing | |
-\end{IEEEkeywords} | |
- | |
- | |
-\IEEEpeerreviewmaketitle | |
- | |
- | |
-\section{Introduction} | |
-\IEEEPARstart{V}{isualizing} systems containing many spheres using traditional… | |
- | |
-Previous studies of GPU or CUDA implementations of ray tracing algorithms repo… | |
- | |
-\subsection{Discrete Element Method} | |
-The input particle data to the raytracer is the output of a custom CUDA-based … | |
- | |
-The discrete element method (DEM) is a subtype of molecular dynamics (MD), and… | |
- | |
-\subsection{Application usage} | |
-The CUDA DEM application is a command line executable, and writes updated part… | |
- | |
-Both the CUDA DEM and raytracing applications are open-source\footnote{\url{ht… | |
- | |
-This document consists of a short introduction to the basic mathematics behind… | |
- | |
- | |
-\section{Ray tracing algorithm} | |
-The goal of the ray tracing algorithm is to compute the shading of each pixel … | |
-\begin{lstlisting}[basicstyle=\footnotesize] | |
-for each pixel do | |
- compute viewing ray origin and direction | |
- iterate through objects and find the closest hit | |
- set pixel color to value computed from hit point, light, n | |
-\end{lstlisting} | |
-The implemented code does not utilize recursive rays, since the modeled materi… | |
- | |
-\subsection{Ray generation} | |
-The rays are in vector form defined as: | |
-\begin{equation} | |
- \mathbf{p}(t) = \mathbf{e} + t(\mathbf{s} - \mathbf{e}) | |
- \label{eq:raygen} | |
-\end{equation} | |
-The perspective can be either \emph{orthograpic}, where all viewing rays have … | |
- | |
-The ray origin $\mathbf{e}$ is the position of the eye, and is constant. The d… | |
-\begin{equation} | |
- \mathbf{s} - \mathbf{e} = -d \mathbf{w} + u \mathbf{u} + v \mathbf{v} | |
- \label{eq:raydirection} | |
-\end{equation} | |
-where $\{\mathbf{u},\mathbf{v},\mathbf{w}\}$ are the orthonormal bases of the … | |
-\[ u = l + (r-l)(i+0.5)/n \] | |
-\[ v = b + (t-b)(j+0.5)/m \] | |
-where $l$, $r$, $t$ and $b$ are the positions of the image borders (left, righ… | |
- | |
- | |
-\subsection{Ray-sphere intersection} | |
-Given a sphere with a center $\mathbf{c}$, and radius $R$, a equation can be c… | |
-\begin{equation} | |
- (\mathbf{p} - \mathbf{c}) \cdot (\mathbf{p} - \mathbf{c}) - R^2 = 0 | |
- \label{eq:sphere} | |
-\end{equation} | |
-By substituting the points $\mathbf{p}$ with ray equation \ref{eq:raygen}, and… | |
-\begin{equation} | |
- (\mathbf{d}\cdot\mathbf{d})t^2 + 2\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})t | |
- + (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2 = 0 | |
- \label{eq:quad} | |
-\end{equation} | |
-The number of ray steps $t$ is the only unknown, so the number of intersection… | |
-\begin{equation} | |
- \Delta = (2(\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})))^2 - 4(\mathbf{d}\cdot\m… | |
- \label{eq:Delta} | |
-\end{equation} | |
-A negative value denotes no intersection between the sphere and the ray, a val… | |
-\begin{equation} | |
- t = \frac{-\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}) \pm \sqrt{\eta} } { (\math… | |
- \label{eq:intersect} | |
-\end{equation} | |
-where | |
-\[ \eta = (\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}))^2 - (\mathbf{d}\cdot\mathbf… | |
-Only the smallest intersection ($t_\text{minus}$) is calculated, since this ma… | |
-\begin{equation} | |
- \mathbf{p} = \mathbf{e} + t_\text{minus} \mathbf{d} | |
- \label{eq:intersection} | |
-\end{equation} | |
-\begin{equation} | |
- \mathbf{n} = 2(\mathbf{p}-\mathbf{c}) | |
- \label{eq:normal} | |
-\end{equation} | |
-The intersection distance in vector steps ($t_\text{minus}$) is saved in order… | |
- | |
-\subsection{Pixel shading} | |
-The pixel is shaded using \emph{Lambertian} shading \cite{Shirley:2009}, where… | |
-\begin{equation} | |
- L = k_a I_a + k_d I_d \max(0,(\mathbf{n}\cdot\mathbf{l})) | |
- \label{eq:shading} | |
-\end{equation} | |
-where the $a$ and $d$ subscripts denote the ambient and diffusive (Lambertian)… | |
- | |
- | |
-\subsection{Computational implementation} | |
-The above routines were first implemented in CUDA for device execution, and af… | |
- | |
- | |
-\section{CUDA implementation} | |
-When constructing the algorithm for execution on the GPGPU device, the data-pa… | |
- | |
-The application starts by reading the discrete element method data from a cust… | |
-Divergent branches in the kernel code were avoided as much as possible \cite{N… | |
- | |
-\begin{figure}[!t] | |
-\centering | |
-\includegraphics[width=0.47\textwidth]{np500.png} | |
-\caption{Sample output of GPU raytracer rendering of \num{512} particles.} | |
-\label{fig:np500.png} | |
-\end{figure} | |
- | |
- | |
-\begin{figure}[!t] | |
-\centering | |
-\includegraphics[width=0.47\textwidth]{np50000.png} | |
-\caption{Sample output of GPU raytracer rendering of \num{50653} particles.} | |
-\label{fig:np50000.png} | |
-\end{figure} | |
- | |
-The algorithm starts by allocating memory on the device for the particle data,… | |
- | |
-All pixel values are initialized to $[R,G,B] = [0,0,0]$, which serves as the i… | |
- | |
-After all pixel values have been computed, the image data is transfered back t… | |
- | |
-\subsection{Thread and block layout} | |
-The thread/block layout passed during kernel launches is arranged in the follo… | |
-\begin{lstlisting} | |
- dim3 threads(16, 16); | |
- dim3 blocks((width+15)/16, (height+15)/16); | |
-\end{lstlisting} | |
-The image pixel position of the thread can be determined from the thread- and … | |
-\begin{lstlisting} | |
- int i = threadIdx.x + blockIdx.x * blockDim.x; | |
- int j = threadIdx.y + blockIdx.y * blockDim.y; | |
- unsigned int mempos = x + y * blockDim.x * gridDim.x; | |
- if (mempos > pixels) | |
- return; | |
-\end{lstlisting} | |
-The linear memory position (\texttt{mempos}) is used as the index when reading… | |
- | |
-\subsection{Image output} | |
-After completing all pixel shading computations on the device, the image data … | |
- | |
-\subsection{Performance} | |
-Since this simple raytracing algorithm generates a single non-recursive ray fo… | |
- | |
-The data transfer between the host and device is kept at a bare minimum, as th… | |
- | |
-The host execution time was profiled using a \texttt{clock()} based CPU timer … | |
- | |
-The device execution time was profiled using two \texttt{cudaEvent\_t} timers,… | |
- | |
-Figures \ref{fig:np-performance.pdf} and \ref{fig:px-performance.pdf} show the… | |
- | |
-\begin{figure}[!t] | |
-\centering | |
-\includegraphics[width=0.47\textwidth]{np-performance.pdf} | |
-\caption{Performance scaling with varying particle numbers at image dimensions… | |
-\label{fig:np-performance.pdf} | |
-\end{figure} | |
- | |
-\begin{figure}[!t] | |
-\centering | |
-\includegraphics[width=0.47\textwidth]{px-performance.pdf} | |
-\caption{Performance scaling with varying image dimensions ($n \times m$) wit… | |
-\label{fig:px-performance.pdf} | |
-\end{figure} | |
- | |
-The device memory allocation and data transfer was also profiled, and turns ou… | |
- | |
-The CPU time spent in the host kernels proves to be linear with the particle n… | |
- | |
-The ratio between CPU computational times and the sum of the device kernel exe… | |
- | |
-As the number of particles are not known by compilation, it is not possible to… | |
- | |
-Previous GPU implementations often rely on k-D trees, constructed as an sortin… | |
- | |
-\section{Conclusion} | |
-This document presented the implementation of a basic ray tracing algorithm, u… | |
-Performance tests showed the expected, linear correlation between image dimens… | |
- | |
-The final product will come into good use during further development and compl… | |
- | |
-\bibliographystyle{IEEEtran} | |
-\bibliography{IEEEabrv,/Users/adc/Documents/University/BIB} | |
- | |
- | |
- | |
-\appendices | |
- | |
-\section{Test environment} | |
-The raytracing algorithm was developed, tested and profiled on a mid 2010 Mac … | |
- | |
-The CUDA source code was compiled with \texttt{nvcc}, and linked to \texttt{g+… | |
-\begin{lstlisting} | |
-g++ -c -Wall -O3 -arch x86_64 -fopenmp ... | |
-nvcc -c -use_fast_math -gencode arch=compute_20,code=sm_20 -m64 ... | |
-g++ -arch x86_64 -lcuda -lcudart -fopenmp *.o -o rt | |
-\end{lstlisting} | |
-When profiling device code performance, the application was executed two times… | |
- | |
-The host system was measured to have a memory bandwidth of 4642.1 MB/s when tr… | |
- | |
-\ifCLASSOPTIONcaptionsoff | |
- \newpage | |
-\fi | |
- | |
- | |
-\section{Source code} | |
-The entire source code, as well as input data files, can be found in the follo… | |
-\url{http://users-cs.au.dk/adc/files/sphere-rt.tar.gz} | |
-The source code is built and run with the commands: | |
-\begin{lstlisting} | |
- make | |
- make run | |
-\end{lstlisting} | |
-With the \texttt{make run} command, the Makefile uses ImageMagick to convert t… | |
-\begin{lstlisting} | |
- ./rt <CPU | GPU> <sphere-binary.bin> <width> <height> <output-image.ppm> | |
-\end{lstlisting} | |
-This appendix contains the following source code files: | |
-\lstlistoflistings | |
- | |
-\subsection{CUDA raytracing source code} | |
-%\subsubsection{rt\_kernel.h} | |
-\lstinputlisting[caption={rt-kernel.h},label={rt-kernel.h},numbers=left,breakl… | |
- | |
-%\subsubsection{rt\_kernel.cu} | |
-\lstinputlisting[caption={rt-kernel.cu},label={rt-kernel.cu},numbers=left,brea… | |
- | |
-\subsection{CPU raytracing source code} | |
-%\subsubsection{rt\_kernel\_cpu.h} | |
-\lstinputlisting[caption={rt-kernel-cpu.h},label={rt-kernel-cpu.h},numbers=lef… | |
- | |
-%\subsubsection{rt\_kernel\_cpu.cpp} | |
-\lstinputlisting[caption={rt-kernel-cpu.cpp},label={rt-kernel-cpu.cpp},numbers… | |
- | |
- | |
- | |
-\clearpage | |
- | |
- | |
- | |
- | |
- | |
- | |
- | |
- | |
- | |
-% You can push biographies down or up by placing | |
-% a \vfill before or after them. The appropriate | |
-% use of \vfill depends on what kind of text is | |
-% on the last page and whether or not the columns | |
-% are being equalized. | |
- | |
-%\vfill | |
- | |
-% Can be used to pull up biographies so that the bottom of the last one | |
-% is flush with the other column. | |
-%\enlargethispage{-5in} | |
- | |
- | |
- | |
-% that's all folks | |
-\end{document} | |
- | |
- | |
diff --git a/raytracer/header.h b/raytracer/header.h | |
t@@ -1,26 +0,0 @@ | |
-// header.h -- Structure templates and function prototypes | |
- | |
-// Standard technique for avoiding multiple inclusions of header file | |
-#ifndef HEADER_H_ | |
-#define HEADER_H_ | |
- | |
-// Type declaration | |
-typedef unsigned int Inttype; | |
- | |
-//// Structure declarations //// | |
- | |
-struct f3 { | |
- float x; | |
- float y; | |
- float z; | |
-}; | |
- | |
-// Image structure | |
-struct rgb { | |
- unsigned char r; | |
- unsigned char g; | |
- unsigned char b; | |
- unsigned char alpha; | |
-}; | |
- | |
-#endif | |
diff --git a/raytracer/main.cpp b/raytracer/main.cpp | |
t@@ -1,264 +0,0 @@ | |
-#include <iostream> | |
-#include <cstdio> | |
-#include <cstdlib> | |
-#include <cstring> | |
-#include <cmath> | |
-#include <vector_functions.h> | |
-#include "header.h" | |
-#include "rt-kernel.h" | |
-#include "rt-kernel-cpu.h" | |
-#include "colorbar.h" | |
-#include "o-ppm.h" | |
- | |
-int main(const int argc, const char* argv[]) | |
-{ | |
- using std::cout; | |
- | |
- if (argc < 6 || argc > 8 ){ | |
- cout << "Usage: " << argv[0] << " <GPU or CPU> <particle-data.txt> <width>… | |
- << "or\n" | |
- << "Usage: " << argv[0] << " GPU <particle-data.txt | colorbar> <widt… | |
- return 1; | |
- } | |
- | |
- cout << "\n----------------------------\n" | |
- << "This is the SPHERE raytracer\n" | |
- << "----------------------------\n"; | |
- | |
- // Allocate memory for image | |
- unsigned int width = atoi(argv[3]); | |
- unsigned int height = atoi(argv[4]); | |
- if (width < 1 || height < 1) { | |
- cout << "Image dimensions invalid.\n"; | |
- return 1; | |
- } | |
- rgb* img; | |
- img = new rgb [height*width]; | |
- | |
- // Render colorbar image | |
- if (strcmp(argv[2],"colorbar") == 0) { | |
- | |
- for (unsigned int y=0; y<height; ++y) { | |
- for (unsigned int x=0; x<width; ++x) { | |
- | |
- // Colormap value is relative to width position | |
- float ratio = (float) (x+1)/width; | |
- | |
- // Write pixel value to image array | |
- img[x + y*width].r = red(ratio) * 250.f; | |
- img[x + y*width].g = green(ratio) * 250.f; | |
- img[x + y*width].b = blue(ratio) * 250.f; | |
- | |
- } | |
- } | |
- } else { // Render sphere binary | |
- | |
- cout << "Reading binary: " << argv[2] << "\n"; | |
- | |
- FILE* fin; | |
- if ((fin = fopen(argv[2], "rb")) == NULL) { | |
- cout << " Error encountered while trying to open binary '" | |
- << argv[2] << "'\n"; | |
- return 1; | |
- } | |
- | |
- // Read the number of dimensions and particles | |
- unsigned int nd, np; | |
- (void)fread(&nd, sizeof(nd), 1, fin); | |
- if (nd != 3) { | |
- cout << " Input binary contains " << nd | |
- << "D data, this raytracer is made for 3D data\n"; | |
- return 1; | |
- } | |
- | |
- (void)fread(&np, sizeof(np), 1, fin); | |
- cout << " Number of particles: " << np << "\n"; | |
- | |
- // Allocate particle structure array | |
- cout << " Allocating host memory\n"; | |
- | |
- // Single precision arrays used for computations | |
- float4* p = new float4[np]; | |
- float* fixvel = new float[np]; | |
- float* xsum = new float[np]; | |
- float* es_dot = new float[np]; | |
- float* ev_dot = new float[np]; | |
- float* es = new float[np]; | |
- float* ev = new float[np]; | |
- float* pres = new float[np]; | |
- float* vel = new float[np]; | |
- | |
- // Read temporal information | |
- double dt, file_dt, current, total; | |
- unsigned int step_count; | |
- (void)fread(&dt, sizeof(dt), 1, fin); | |
- (void)fread(¤t, sizeof(current), 1, fin); | |
- (void)fread(&total, sizeof(total), 1, fin); | |
- (void)fread(&file_dt, sizeof(file_dt), 1, fin); | |
- (void)fread(&step_count, sizeof(step_count), 1, fin); | |
- | |
- double d; // Double precision temporary value holder | |
- | |
- // Canonical coordinate system origo | |
- f3 origo; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- origo.x = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- origo.y = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- origo.z = (float)d; | |
- | |
- // Canonical coordinate system dimensions | |
- f3 L; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- L.x = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- L.y = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- L.z = (float)d; | |
- | |
- | |
- // Skip over irrelevant data | |
- double blankd; | |
- //f3 blankd3; | |
- unsigned int blankui; | |
- for (int j=0; j<3; j++) // Skip over grid.num data | |
- (void)fread(&blankui, sizeof(blankui), 1, fin); | |
- | |
- // Velocity vector, later converted to lenght | |
- float3 v; | |
- | |
- // Load data into particle array | |
- for (unsigned int i=0; i<np; i++) { | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- p[i].x = (float)d; // Typecast to single precision | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- v.x = (float)d; | |
- for (int j=0; j<4; j++) | |
- (void)fread(&blankd, sizeof(blankd), 1, fin); | |
- | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- p[i].y = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- v.y = (float)d; | |
- for (int j=0; j<4; j++) | |
- (void)fread(&blankd, sizeof(blankd), 1, fin); | |
- | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- p[i].z = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- v.z = (float)d; | |
- for (int j=0; j<4; j++) | |
- (void)fread(&blankd, sizeof(blankd), 1, fin); | |
- | |
- // Save velocity vector length | |
- vel[i] = sqrt(v.x*v.x + v.y*v.y + v.z*v.z); | |
- } | |
- for (unsigned int i=0; i<np; i++) { | |
- (void)fread(&d, sizeof(d), 1, fin); // fixvel | |
- fixvel[i] = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); // xsum | |
- xsum[i] = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); // radius | |
- p[i].w = (float)d; | |
- for (int j=0; j<10; j++) | |
- (void)fread(&blankd, sizeof(blankd), 1, fin); | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- es_dot[i] = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- ev_dot[i] = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- es[i] = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- ev[i] = (float)d; | |
- (void)fread(&d, sizeof(d), 1, fin); | |
- pres[i] = (float)d; | |
- } | |
- | |
- fclose(fin); // Binary read complete | |
- | |
- cout << " Spatial dimensions: " | |
- << L.x << "*" << L.y << "*" << L.z << " m\n"; | |
- | |
- // Eye position and focus point | |
- //f3 eye = {0.5f*L.x, -4.2f*L.y, 0.5f*L.z}; | |
- //f3 eye = {2.5f*L.x, -4.2f*L.y, 2.0f*L.z}; | |
- //f3 eye = {0.5f*L.x, -5.0f*L.y, 0.5f*L.z}; | |
- f3 eye = {2.5f*L.x, -5.0f*L.y, 1.5f*L.z}; | |
- f3 lookat = {0.5f*L.x, 0.5f*L.y, 0.5f*L.z}; | |
- if (L.z > (L.x + L.y)/1.5f) { // Render only bottom of world (for initsetu… | |
- eye.x = 4.1f*L.x; | |
- eye.y = -15.1*L.y; | |
- eye.z = 1.1*L.z; | |
- /*lookat.x = 0.45f*L.x; | |
- lookat.y = 0.5f*L.y; | |
- lookat.z = 0.30f*L.z;*/ | |
- } | |
- | |
- // Screen width in world coordinates | |
- //float imgw = 1.4f*L.x; | |
- //float imgw = pow(L.x*L.y*L.z, 0.32f); | |
- //float imgw = sqrt(L.x*L.y)*1.2f; // Adjust last float to capture entire … | |
- float imgw = L.x*1.35f; | |
- | |
- // Determine visualization mode | |
- int visualize = 0; // 0: ordinary view | |
- float max_val = 0; | |
- if (argc == 8) { | |
- if(strcmp(argv[6],"pressure") == 0) | |
- visualize = 1; // 1: pressure view | |
- if(strcmp(argv[6],"es_dot") == 0) | |
- visualize = 2; // 2: es_dot view | |
- if(strcmp(argv[6],"es") == 0) | |
- visualize = 3; // 3: es view | |
- if(strcmp(argv[6],"vel") == 0) | |
- visualize = 4; // 4: velocity view | |
- if(strcmp(argv[6],"xsum") == 0) | |
- visualize = 5; // 5: xsum view | |
- | |
- // Read max. value specified in command args. | |
- max_val = atof(argv[7]); | |
- | |
- } | |
- | |
- | |
- if (strcmp(argv[1],"GPU") == 0) { | |
- | |
- // Call cuda wrapper | |
- if (rt(p, np, img, width, height, origo, L, eye, lookat, imgw, visualize… | |
- cout << "\nError in rt()\n"; | |
- return 1; | |
- } | |
- } else if (strcmp(argv[1],"CPU") == 0) { | |
- | |
- // Call CPU wrapper | |
- if (rt_cpu(p, np, img, width, height, origo, L, eye, lookat, imgw) != 0)… | |
- cout << "\nError in rt_cpu()\n"; | |
- return 1; | |
- } | |
- } else { | |
- cout << "Please specify CPU or GPU for execution\n"; | |
- return 1; | |
- } | |
- | |
- // Free dynamically allocated memory | |
- delete [] p; | |
- delete [] fixvel; | |
- delete [] xsum; | |
- delete [] pres; | |
- delete [] es; | |
- delete [] ev; | |
- delete [] es_dot; | |
- delete [] ev_dot; | |
- delete [] vel; | |
- | |
- } | |
- | |
- // Write final image to PPM file | |
- image_to_ppm(img, argv[5], width, height); | |
- | |
- delete [] img; | |
- | |
- cout << "Terminating successfully\n\n"; | |
- return 0; | |
-} | |
diff --git a/raytracer/o-ppm.cpp b/raytracer/o-ppm.cpp | |
t@@ -1,30 +0,0 @@ | |
-// o-ppm.cpp | |
-// Write images in the PPM format | |
- | |
-#include <iostream> | |
-#include <cstdio> | |
-#include "header.h" | |
- | |
- | |
-// Write image structure to PPM file | |
-int image_to_ppm(rgb* rgb_img, const char* filename, const unsigned int width,… | |
-{ | |
- FILE *fout = fopen(filename, "wb"); | |
- if (fout == NULL) { | |
- std::cout << "File error in img_write() while trying to writing " << filen… | |
- return 1; | |
- } | |
- | |
- fprintf(fout, "P6 %d %d 255\n", width, height); | |
- | |
- // Write pixel array to ppm image file | |
- for (unsigned int i=0; i<height*width; ++i) { | |
- putc(rgb_img[i].r, fout); | |
- putc(rgb_img[i].g, fout); | |
- putc(rgb_img[i].b, fout); | |
- } | |
- | |
- fclose(fout); | |
- return 0; | |
-} | |
- | |
diff --git a/raytracer/o-ppm.h b/raytracer/o-ppm.h | |
t@@ -1,7 +0,0 @@ | |
-#ifndef O_PPM_H_ | |
-#define O_PPM_H_ | |
- | |
-int image_to_ppm(rgb* rgb_img, const char* filename, | |
- const unsigned int width, const unsigned int height); | |
- | |
-#endif | |
diff --git a/raytracer/render_all_outputs_CPU.sh b/raytracer/render_all_outputs… | |
t@@ -1,27 +0,0 @@ | |
-#!/bin/bash | |
- | |
-FILES=`ls ../output/*.bin` | |
- | |
-XRES=800 | |
-YRES=800 | |
- | |
-WORKHORSE=CPU | |
- | |
-echo "# Rendering PPM images and converting to JPG" | |
-for F in ../output/*.bin | |
-do | |
- (BASE=`basename $F`; | |
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm; | |
- convert ../img_out/$BASE.ppm ../img_out/$BASE.jpg;) | |
- #rm ../img_out/$BASE.ppm &) | |
-done | |
- | |
-#echo "# Converting PPM files to JPG using ImageMagick in parallel" | |
-#for F in ../img_out/*.ppm | |
-#do | |
-# (BASE=`basename $F`; convert $F $F.jpg &) | |
-#done | |
- | |
-#echo "# Removed temporary files" | |
-#rm ../img_out/*.ppm | |
- | |
diff --git a/raytracer/render_all_outputs_GPU.sh b/raytracer/render_all_outputs… | |
t@@ -1,29 +0,0 @@ | |
-#!/bin/bash | |
- | |
-FILES=`ls ../output/*.bin` | |
- | |
-XRES=800 | |
-YRES=800 | |
- | |
-WORKHORSE=GPU | |
- | |
-echo "# Rendering PPM images" | |
-for F in ../output/*.bin | |
-do | |
- BASE=`basename $F` | |
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null | |
- if [ $? -ne 0 ] ; then | |
- echo "Error rendering $F, trying again..." | |
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null | |
- fi | |
-done | |
- | |
-echo "# Converting PPM files to JPEG using ImageMagick in parallel" | |
-for F in ../img_out/*.ppm | |
-do | |
- (BASE=`basename $F`; convert $F $F.jpg &) | |
-done | |
- | |
-#echo "# Removed temporary files" | |
-#rm ../img_out/*.ppm | |
- | |
diff --git a/raytracer/rt-kernel-cpu.cpp b/raytracer/rt-kernel-cpu.cpp | |
t@@ -1,273 +0,0 @@ | |
-#include <iostream> | |
-#include <cstdio> | |
-#include <cmath> | |
-#include <time.h> | |
-#include <cuda.h> | |
-#include <cutil_math.h> | |
-#include <string.h> | |
-#include <stdlib.h> | |
-#include "header.h" | |
-#include "rt-kernel-cpu.h" | |
- | |
-// Constants | |
-float3 constc_u; | |
-float3 constc_v; | |
-float3 constc_w; | |
-float3 constc_eye; | |
-float4 constc_imgplane; | |
-float constc_d; | |
-float3 constc_light; | |
- | |
-__inline__ float3 f4_to_f3(float4 in) | |
-{ | |
- return make_float3(in.x, in.y, in.z); | |
-} | |
- | |
-__inline__ float4 f3_to_f4(float3 in) | |
-{ | |
- return make_float4(in.x, in.y, in.z, 0.0f); | |
-} | |
- | |
-__inline__ float lengthf3(float3 in) | |
-{ | |
- return sqrt(in.x*in.x + in.y*in.y + in.z*in.z); | |
-} | |
- | |
-// Kernel for initializing image data | |
-void imageInit_cpu(unsigned char* _img, unsigned int pixels) | |
-{ | |
- for (unsigned int mempos=0; mempos<pixels; ++mempos) { | |
- _img[mempos*4] = 255; // Red channel | |
- _img[mempos*4 + 1] = 255; // Green channel | |
- _img[mempos*4 + 2] = 255; // Blue channel | |
- } | |
-} | |
- | |
-// Calculate ray origins and directions | |
-void rayInitPerspective_cpu(float3* _ray_origo, | |
- float3* _ray_direction, | |
- float3 eye, | |
- unsigned int width, | |
- unsigned int height) | |
-{ | |
- int i,j; | |
- unsigned int mempos; | |
- float p_u, p_v; | |
- #pragma omp parallel for private(mempos,j,p_u,p_v) | |
- for (i=0; i<(int)width; ++i) { | |
- for (j=0; j<(int)height; ++j) { | |
- | |
- mempos = i + j*width; | |
- | |
- // Calculate pixel coordinates in image plane | |
- p_u = constc_imgplane.x + (constc_imgplane.y - constc_imgplane.x) | |
- * (i + 0.5f) / width; | |
- p_v = constc_imgplane.z + (constc_imgplane.w - constc_imgplane.z) | |
- * (j + 0.5f) / height; | |
- | |
- // Write ray origo and direction to global memory | |
- _ray_origo[mempos] = constc_eye; | |
- _ray_direction[mempos] = -constc_d*constc_w + p_u*constc_u + p_v*constc_… | |
- } | |
- } | |
-} | |
- | |
-// Check wether the pixel's viewing ray intersects with the spheres, | |
-// and shade the pixel correspondingly | |
-void rayIntersectSpheres_cpu(float3* _ray_origo, | |
- float3* _ray_direction, | |
- float4* _p, | |
- float* _nuance, | |
- unsigned char* _img, | |
- unsigned int pixels, | |
- unsigned int np) | |
-{ | |
- long int mempos; | |
- float3 e, d, n, p, c; | |
- float tdist, R, Delta, t_minus, dotprod, I_d, k_d, k_a, I_a, nuance; | |
- Inttype i, ifinal; | |
- #pragma omp parallel for private(e,d,n,p,c,tdist,R,Delta,t_minus,dotprod,I_d… | |
- for (mempos=0; mempos<pixels; ++mempos) { | |
- | |
- // Read ray data from global memory | |
- e = _ray_origo[mempos]; | |
- d = _ray_direction[mempos]; | |
- | |
- // Distance, in ray steps, between object and eye initialized with a large… | |
- tdist = 1e10f; | |
- | |
- // Iterate through all particles | |
- for (i=0; i<np; ++i) { | |
- | |
- // Read sphere coordinate and radius | |
- c = f4_to_f3(_p[i]); | |
- R = _p[i].w; | |
- | |
- // Calculate the discriminant: d = B^2 - 4AC | |
- Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2 | |
- - 4.0f*dot(d,d) // -4*A | |
- * (dot((e-c),(e-c)) - R*R); // C | |
- | |
- // If the determinant is positive, there are two solutions | |
- // One where the line enters the sphere, and one where it exits | |
- if (Delta > 0.0f) { | |
- | |
- // Calculate roots, Shirley 2009 p. 77 | |
- t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d) | |
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d)); | |
- | |
- // Check wether intersection is closer than previous values | |
- if (fabs(t_minus) < tdist) { | |
- p = e + t_minus*d; | |
- tdist = fabs(t_minus); | |
- n = normalize(2.0f * (p - c)); // Surface normal | |
- ifinal = i; | |
- } | |
- | |
- } // End of solution branch | |
- | |
- } // End of particle loop | |
- | |
- // Write pixel color | |
- if (tdist < 1e10) { | |
- | |
- // Lambertian shading parameters | |
- //float dotprod = fabs(dot(n, constc_light)); | |
- dotprod = fmax(0.0f,dot(n, constc_light)); | |
- I_d = 70.0f; // Light intensity | |
- k_d = 5.0f; // Diffuse coefficient | |
- | |
- // Ambient shading | |
- k_a = 30.0f; | |
- I_a = 5.0f; | |
- | |
- // Read color nuance of grain | |
- nuance = _nuance[ifinal]; | |
- | |
- // Write shading model values to pixel color channels | |
- _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*0.48f*nuance); | |
- _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*0.41f*nuance); | |
- _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*0.27f*nuance); | |
- } | |
- } | |
-} | |
- | |
- | |
-void cameraInit_cpu(float3 eye, float3 lookat, float imgw, float hw_ratio) | |
-{ | |
- // Image dimensions in world space (l, r, b, t) | |
- float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.… | |
- | |
- // The view vector | |
- float3 view = eye - lookat; | |
- | |
- // Construct the camera view orthonormal base | |
- //float3 up = make_float3(0.0f, 1.0f, 0.0f); // Pointing upward along +y | |
- float3 up = make_float3(0.0f, 0.0f, 1.0f); // Pointing upward along +z | |
- float3 w = -view/length(view); // w: Pointing backwards | |
- float3 u = cross(up, w) / length(cross(up, w)); | |
- float3 v = cross(w, u); | |
- | |
- // Focal length 20% of eye vector length | |
- float d = lengthf3(view)*0.8f; | |
- | |
- // Light direction (points towards light source) | |
- float3 light = normalize(-1.0f*eye*make_float3(1.0f, 0.2f, 0.6f)); | |
- | |
- std::cout << " Transfering camera values to constant memory\n"; | |
- | |
- constc_u = u; | |
- constc_v = v; | |
- constc_w = w; | |
- constc_eye = eye; | |
- constc_imgplane = imgplane; | |
- constc_d = d; | |
- constc_light = light; | |
- | |
- std::cout << "Rendering image..."; | |
-} | |
- | |
- | |
-// Wrapper for the rt algorithm | |
-int rt_cpu(float4* p, unsigned int np, | |
- rgb* img, unsigned int width, unsigned int height, | |
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw) { | |
- | |
- using std::cout; | |
- | |
- cout << "Initializing CPU raytracer:\n"; | |
- | |
- // Initialize GPU timestamp recorders | |
- float t1_go, t2_go, t1_stop, t2_stop; | |
- | |
- // Start timer 1 | |
- t1_go = clock(); | |
- | |
- // Generate random nuance values for all grains | |
- static float* _nuance; // Values between 0.5 and 1.0 | |
- _nuance = new float[np]; | |
- srand(0); // Initialize seed at same value at every run | |
- for (unsigned int i=0; i<np; ++i) | |
- _nuance[i] = (float)rand()/RAND_MAX * 0.5f + 0.5f; | |
- | |
- // Allocate memory | |
- cout << " Allocating memory\n"; | |
- static unsigned char *_img; // RGBw values in image | |
- static float3* _ray_origo; // Ray origo (x,y,z) | |
- static float3* _ray_direction; // Ray direction (x,y,z) | |
- _img = new unsigned char[width*height*4]; | |
- _ray_origo = new float3[width*height]; | |
- _ray_direction = new float3[width*height]; | |
- | |
- // Arrange thread/block structure | |
- unsigned int pixels = width*height; | |
- float hw_ratio = (float)height/(float)width; | |
- | |
- // Start timer 2 | |
- t2_go = clock(); | |
- | |
- // Initialize image to background color | |
- imageInit_cpu(_img, pixels); | |
- | |
- // Initialize camera | |
- cameraInit_cpu(make_float3(eye.x, eye.y, eye.z), | |
- make_float3(lookat.x, lookat.y, lookat.z), | |
- imgw, hw_ratio); | |
- | |
- // Construct rays for perspective projection | |
- rayInitPerspective_cpu( | |
- _ray_origo, _ray_direction, | |
- make_float3(eye.x, eye.y, eye.z), | |
- width, height); | |
- | |
- // Find closest intersection between rays and spheres | |
- rayIntersectSpheres_cpu( | |
- _ray_origo, _ray_direction, | |
- p, _nuance, _img, pixels, np); | |
- | |
- // Stop timer 2 | |
- t2_stop = clock(); | |
- | |
- memcpy(img, _img, sizeof(unsigned char)*pixels*4); | |
- | |
- // Free dynamically allocated device memory | |
- delete [] _nuance; | |
- delete [] _img; | |
- delete [] _ray_origo; | |
- delete [] _ray_direction; | |
- | |
- // Stop timer 1 | |
- t1_stop = clock(); | |
- | |
- // Report time spent | |
- cout << " done.\n" | |
- << " Time spent on entire CPU raytracing routine: " | |
- << (t1_stop-t1_go)/CLOCKS_PER_SEC*1000.0 << " ms\n"; | |
- cout << " - Functions: " << (t2_stop-t2_go)/CLOCKS_PER_SEC*1000.0 << " ms\n… | |
- | |
- // Return successfully | |
- return 0; | |
-} | |
diff --git a/raytracer/rt-kernel-cpu.h b/raytracer/rt-kernel-cpu.h | |
t@@ -1,14 +0,0 @@ | |
-#ifndef RT_KERNEL_CPU_H_ | |
-#define RT_KERNEL_CPU_H_ | |
- | |
-#include <vector_functions.h> | |
- | |
-// Host prototype functions | |
- | |
-void cameraInit(float3 eye, float3 lookat, float imgw, float hw_ratio); | |
- | |
-int rt_cpu(float4* p, const unsigned int np, | |
- rgb* img, const unsigned int width, const unsigned int height, | |
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw); | |
- | |
-#endif | |
diff --git a/raytracer/rt-kernel.cu b/raytracer/rt-kernel.cu | |
t@@ -1,454 +0,0 @@ | |
-#include <iostream> | |
-#include <cutil_math.h> | |
-#include "header.h" | |
-#include "rt-kernel.h" | |
-#include "colorbar.h" | |
- | |
-unsigned int iDivUp (unsigned int a, unsigned int b) { | |
- return (a % b != 0) ? (a / b + 1) : (a / b); | |
-} | |
- | |
-__inline__ __host__ __device__ float3 f4_to_f3(float4 in) | |
-{ | |
- return make_float3(in.x, in.y, in.z); | |
-} | |
- | |
-__inline__ __host__ __device__ float4 f3_to_f4(float3 in) | |
-{ | |
- return make_float4(in.x, in.y, in.z, 0.0f); | |
-} | |
- | |
-// Kernel for initializing image data | |
-__global__ void imageInit(unsigned char* _img, unsigned int pixels) | |
-{ | |
- // Compute pixel position from threadIdx/blockIdx | |
- unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
- if (mempos > pixels) | |
- return; | |
- | |
- _img[mempos*4] = 255; // Red channel | |
- _img[mempos*4 + 1] = 255; // Green channel | |
- _img[mempos*4 + 2] = 255; // Blue channel | |
-} | |
- | |
-// Calculate ray origins and directions | |
-__global__ void rayInitPerspective(float4* _ray_origo, | |
- float4* _ray_direction, | |
- float4 eye, | |
- unsigned int width, | |
- unsigned int height) | |
-{ | |
- // Compute pixel position from threadIdx/blockIdx | |
- unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
- if (mempos > width*height) | |
- return; | |
- | |
- // Calculate 2D position from linear index | |
- unsigned int i = mempos % width; | |
- unsigned int j = (int)floor((float)mempos/width) % width; | |
- | |
- // Calculate pixel coordinates in image plane | |
- float p_u = const_imgplane.x + (const_imgplane.y - const_imgplane.x) | |
- * (i + 0.5f) / width; | |
- float p_v = const_imgplane.z + (const_imgplane.w - const_imgplane.z) | |
- * (j + 0.5f) / height; | |
- | |
- // Write ray origo and direction to global memory | |
- _ray_origo[mempos] = const_eye; | |
- _ray_direction[mempos] = -const_d*const_w + p_u*const_u + p_v*const_v; | |
-} | |
- | |
-// Check wether the pixel's viewing ray intersects with the spheres, | |
-// and shade the pixel correspondingly | |
-__global__ void rayIntersectSpheres(float4* _ray_origo, | |
- float4* _ray_direction, | |
- float4* _p, | |
- unsigned char* _img) | |
-{ | |
- // Compute pixel position from threadIdx/blockIdx | |
- unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
- if (mempos > const_pixels) | |
- return; | |
- | |
- // Read ray data from global memory | |
- float3 e = f4_to_f3(_ray_origo[mempos]); | |
- float3 d = f4_to_f3(_ray_direction[mempos]); | |
- //float step = length(d); | |
- | |
- // Distance, in ray steps, between object and eye initialized with a large v… | |
- float tdist = 1e10f; | |
- | |
- // Surface normal at closest sphere intersection | |
- float3 n; | |
- | |
- // Intersection point coordinates | |
- float3 p; | |
- | |
- // Iterate through all particles | |
- for (Inttype i=0; i<const_np; ++i) { | |
- | |
- // Read sphere coordinate and radius | |
- float3 c = f4_to_f3(_p[i]); | |
- float R = _p[i].w; | |
- | |
- // Calculate the discriminant: d = B^2 - 4AC | |
- float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2 | |
- - 4.0f*dot(d,d) // -4*A | |
- * (dot((e-c),(e-c)) - R*R); // C | |
- | |
- // If the determinant is positive, there are two solutions | |
- // One where the line enters the sphere, and one where it exits | |
- if (Delta > 0.0f) { | |
- | |
- // Calculate roots, Shirley 2009 p. 77 | |
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(… | |
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d)); | |
- | |
- // Check wether intersection is closer than previous values | |
- if (fabs(t_minus) < tdist) { | |
- p = e + t_minus*d; | |
- tdist = fabs(t_minus); | |
- n = normalize(2.0f * (p - c)); // Surface normal | |
- } | |
- | |
- } // End of solution branch | |
- | |
- } // End of particle loop | |
- | |
- // Write pixel color | |
- if (tdist < 1e10f) { | |
- | |
- // Lambertian shading parameters | |
- float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light))); | |
- float I_d = 40.0f; // Light intensity | |
- float k_d = 5.0f; // Diffuse coefficient | |
- | |
- // Ambient shading | |
- float k_a = 10.0f; | |
- float I_a = 5.0f; // 5.0 for black background | |
- | |
- // Write shading model values to pixel color channels | |
- _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*0.48f); | |
- _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*0.41f); | |
- _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*0.27f); | |
- | |
- } | |
-} | |
- | |
-// Check wether the pixel's viewing ray intersects with the spheres, | |
-// and shade the pixel correspondingly using a colormap | |
-__global__ void rayIntersectSpheresColormap(float4* _ray_origo, | |
- float4* _ray_direction, | |
- float4* _p, | |
- float* _fixvel, | |
- float* _linarr, | |
- float max_val, | |
- unsigned char* _img) | |
-{ | |
- // Compute pixel position from threadIdx/blockIdx | |
- unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x; | |
- if (mempos > const_pixels) | |
- return; | |
- | |
- // Read ray data from global memory | |
- float3 e = f4_to_f3(_ray_origo[mempos]); | |
- float3 d = f4_to_f3(_ray_direction[mempos]); | |
- | |
- // Distance, in ray steps, between object and eye initialized with a large v… | |
- float tdist = 1e10f; | |
- | |
- // Surface normal at closest sphere intersection | |
- float3 n; | |
- | |
- // Intersection point coordinates | |
- float3 p; | |
- | |
- unsigned int hitidx; | |
- | |
- // Iterate through all particles | |
- for (Inttype i=0; i<const_np; ++i) { | |
- | |
- // Read sphere coordinate and radius | |
- float3 c = f4_to_f3(_p[i]); | |
- float R = _p[i].w; | |
- | |
- // Calculate the discriminant: d = B^2 - 4AC | |
- float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2 | |
- - 4.0f*dot(d,d) // -4*A | |
- * (dot((e-c),(e-c)) - R*R); // C | |
- | |
- // If the determinant is positive, there are two solutions | |
- // One where the line enters the sphere, and one where it exits | |
- if (Delta > 0.0f) { | |
- | |
- // Calculate roots, Shirley 2009 p. 77 | |
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(… | |
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d)); | |
- | |
- // Check wether intersection is closer than previous values | |
- if (fabs(t_minus) < tdist) { | |
- p = e + t_minus*d; | |
- tdist = fabs(t_minus); | |
- n = normalize(2.0f * (p - c)); // Surface normal | |
- hitidx = i; | |
- } | |
- | |
- } // End of solution branch | |
- | |
- } // End of particle loop | |
- | |
- // Write pixel color | |
- if (tdist < 1e10) { | |
- | |
- // Fetch particle data used for color | |
- float ratio = _linarr[hitidx] / max_val; | |
- float fixvel = _fixvel[hitidx]; | |
- | |
- // Make sure the ratio doesn't exceed the 0.0-1.0 interval | |
- if (ratio < 0.01f) | |
- ratio = 0.01f; | |
- if (ratio > 0.99f) | |
- ratio = 0.99f; | |
- | |
- // Lambertian shading parameters | |
- float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light))); | |
- float I_d = 40.0f; // Light intensity | |
- float k_d = 5.0f; // Diffuse coefficient | |
- | |
- // Ambient shading | |
- float k_a = 10.0f; | |
- float I_a = 5.0f; | |
- | |
- float redv = red(ratio); | |
- float greenv = green(ratio); | |
- float bluev = blue(ratio); | |
- | |
- // Make particle dark grey if the horizontal velocity is fixed | |
- if (fixvel > 0.f) { | |
- redv = 0.5; | |
- greenv = 0.5; | |
- bluev = 0.5; | |
- } | |
- | |
- // Write shading model values to pixel color channels | |
- _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*redv); | |
- _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*greenv); | |
- _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod | |
- + k_a * I_a)*bluev); | |
- } | |
-} | |
- | |
-extern "C" | |
-__host__ void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio, | |
- unsigned int pixels, Inttype np) | |
-{ | |
- // Image dimensions in world space (l, r, b, t) | |
- float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.… | |
- | |
- // The view vector | |
- float4 view = eye - lookat; | |
- | |
- // Construct the camera view orthonormal base | |
- //float4 up = make_float4(0.0f, 1.0f, 0.0f, 0.0f); // Pointing upward along… | |
- float4 up = make_float4(0.0f, 0.0f, 1.0f, 0.0f); // Pointing upward along +z | |
- float4 w = -view/length(view); // w: Pointing backwards | |
- float4 u = make_float4(cross(make_float3(up.x, up.y, up.z), | |
- make_float3(w.x, w.y, w.z)), 0.0f) | |
- / length(cross(make_float3(up.x, up.y, up.z), make_float3(w.x, w.y… | |
- float4 v = make_float4(cross(make_float3(w.x, w.y, w.z), make_float3(u.x, u.… | |
- | |
- // Focal length 20% of eye vector length | |
- float d = length(view)*0.8f; | |
- | |
- // Light direction (points towards light source) | |
- float4 light = normalize(-1.0f*eye*make_float4(1.0f, 0.2f, 0.6f, 0.0f)); | |
- | |
- std::cout << " Transfering camera values to constant memory\n"; | |
- | |
- cudaMemcpyToSymbol("const_u", &u, sizeof(u)); | |
- cudaMemcpyToSymbol("const_v", &v, sizeof(v)); | |
- cudaMemcpyToSymbol("const_w", &w, sizeof(w)); | |
- cudaMemcpyToSymbol("const_eye", &eye, sizeof(eye)); | |
- cudaMemcpyToSymbol("const_imgplane", &imgplane, sizeof(imgplane)); | |
- cudaMemcpyToSymbol("const_d", &d, sizeof(d)); | |
- cudaMemcpyToSymbol("const_light", &light, sizeof(light)); | |
- cudaMemcpyToSymbol("const_pixels", &pixels, sizeof(pixels)); | |
- cudaMemcpyToSymbol("const_np", &np, sizeof(np)); | |
-} | |
- | |
-// Check for CUDA errors | |
-extern "C" | |
-__host__ void checkForCudaErrors(const char* checkpoint_description) | |
-{ | |
- cudaError_t err = cudaGetLastError(); | |
- if (err != cudaSuccess) { | |
- std::cout << "\nCuda error detected, checkpoint: " << checkpoint_descripti… | |
- << "\nError string: " << cudaGetErrorString(err) << "\n"; | |
- exit(EXIT_FAILURE); | |
- } | |
-} | |
- | |
- | |
-// Wrapper for the rt kernel | |
-extern "C" | |
-__host__ int rt(float4* p, Inttype np, | |
- rgb* img, unsigned int width, unsigned int height, | |
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw, | |
- int visualize, float max_val, | |
- float* fixvel, | |
- float* xsum, | |
- float* pres, | |
- float* es_dot, | |
- float* es, | |
- float* vel) | |
-{ | |
- using std::cout; | |
- | |
- cout << "Initializing CUDA:\n"; | |
- | |
- // Initialize GPU timestamp recorders | |
- float t1, t2; | |
- cudaEvent_t t1_go, t2_go, t1_stop, t2_stop; | |
- cudaEventCreate(&t1_go); | |
- cudaEventCreate(&t2_go); | |
- cudaEventCreate(&t2_stop); | |
- cudaEventCreate(&t1_stop); | |
- | |
- // Start timer 1 | |
- cudaEventRecord(t1_go, 0); | |
- | |
- // Allocate memory | |
- cout << " Allocating device memory\n"; | |
- static float4 *_p; // Particle positions (x,y,z) and … | |
- static float *_fixvel; // Indicates whether a particle has a … | |
- static float *_linarr; // Array for linear values to color t… | |
- static unsigned char *_img; // RGBw values in image | |
- static float4 *_ray_origo; // Ray origo (x,y,z) | |
- static float4 *_ray_direction; // Ray direction (x,y,z) | |
- cudaMalloc((void**)&_p, np*sizeof(float4)); | |
- cudaMalloc((void**)&_fixvel, np*sizeof(float)); | |
- cudaMalloc((void**)&_linarr, np*sizeof(float)); // 0 size if visualize = 0; | |
- cudaMalloc((void**)&_img, width*height*4*sizeof(unsigned char)); | |
- cudaMalloc((void**)&_ray_origo, width*height*sizeof(float4)); | |
- cudaMalloc((void**)&_ray_direction, width*height*sizeof(float4)); | |
- | |
- // Transfer particle data | |
- cout << " Transfering particle data: host -> device\n"; | |
- cudaMemcpy(_p, p, np*sizeof(float4), cudaMemcpyHostToDevice); | |
- cudaMemcpy(_fixvel, fixvel, np*sizeof(float), cudaMemcpyHostToDevice); | |
- if (visualize == 1 || visualize == 4) | |
- cudaMemcpy(_linarr, pres, np*sizeof(float), cudaMemcpyHostToDevice); | |
- if (visualize == 2) | |
- cudaMemcpy(_linarr, es_dot, np*sizeof(float), cudaMemcpyHostToDevice); | |
- if (visualize == 3) | |
- cudaMemcpy(_linarr, es, np*sizeof(float), cudaMemcpyHostToDevice); | |
- if (visualize == 4) | |
- cudaMemcpy(_linarr, vel, np*sizeof(float), cudaMemcpyHostToDevice); | |
- if (visualize == 5) | |
- cudaMemcpy(_linarr, xsum, np*sizeof(float), cudaMemcpyHostToDevice); | |
- | |
- // Check for errors after memory allocation | |
- checkForCudaErrors("CUDA error after memory allocation"); | |
- | |
- // Arrange thread/block structure | |
- unsigned int pixels = width*height; | |
- float hw_ratio = (float)height/(float)width; | |
- const unsigned int threadsPerBlock = 256; | |
- const unsigned int blocksPerGrid = iDivUp(pixels, threadsPerBlock); | |
- | |
- // Start timer 2 | |
- cudaEventRecord(t2_go, 0); | |
- | |
- // Initialize image to background color | |
- imageInit<<< blocksPerGrid, threadsPerBlock >>>(_img, pixels); | |
- | |
- // Initialize camera | |
- cameraInit(make_float4(eye.x, eye.y, eye.z, 0.0f), | |
- make_float4(lookat.x, lookat.y, lookat.z, 0.0f), | |
- imgw, hw_ratio, pixels, np); | |
- checkForCudaErrors("CUDA error after cameraInit"); | |
- | |
- // Construct rays for perspective projection | |
- rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>( | |
- _ray_origo, _ray_direction, | |
- make_float4(eye.x, eye.y, eye.z, 0.0f), | |
- width, height); | |
- | |
- cudaThreadSynchronize(); | |
- | |
- // Find closest intersection between rays and spheres | |
- if (visualize == 1) { // Visualize pressure | |
- cout << " Pressure color map range: [0, " << max_val << "] Pa\n"; | |
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
- _ray_origo, _ray_direction, | |
- _p, _fixvel, _linarr, max_val, _img); | |
- } else if (visualize == 2) { // es_dot visualization | |
- cout << " Shear heat production rate color map range: [0, " << max_val <<… | |
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
- _ray_origo, _ray_direction, | |
- _p, _fixvel, _linarr, max_val, _img); | |
- } else if (visualize == 3) { // es visualization | |
- cout << " Total shear heat color map range: [0, " << max_val << "] J\n"; | |
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
- _ray_origo, _ray_direction, | |
- _p, _fixvel, _linarr, max_val, _img); | |
- } else if (visualize == 4) { // velocity visualization | |
- cout << " Velocity color map range: [0, " << max_val << "] m/s\n"; | |
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
- _ray_origo, _ray_direction, | |
- _p, _fixvel, _linarr, max_val, _img); | |
- } else if (visualize == 5) { // xsum visualization | |
- cout << " XSum color map range: [0, " << max_val << "] m\n"; | |
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>( | |
- _ray_origo, _ray_direction, | |
- _p, _fixvel, _linarr, max_val, _img); | |
- } else { // Normal visualization | |
- rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>( | |
- _ray_origo, _ray_direction, | |
- _p, _img); | |
- } | |
- | |
- // Make sure all threads are done before continuing CPU control sequence | |
- cudaThreadSynchronize(); | |
- | |
- // Check for errors | |
- checkForCudaErrors("CUDA error after kernel execution"); | |
- | |
- // Stop timer 2 | |
- cudaEventRecord(t2_stop, 0); | |
- cudaEventSynchronize(t2_stop); | |
- | |
- // Transfer image data from device to host | |
- cout << " Transfering image data: device -> host\n"; | |
- cudaMemcpy(img, _img, width*height*4*sizeof(unsigned char), cudaMemcpyDevice… | |
- | |
- // Free dynamically allocated device memory | |
- cudaFree(_p); | |
- cudaFree(_fixvel); | |
- cudaFree(_linarr); | |
- cudaFree(_img); | |
- cudaFree(_ray_origo); | |
- cudaFree(_ray_direction); | |
- | |
- // Stop timer 1 | |
- cudaEventRecord(t1_stop, 0); | |
- cudaEventSynchronize(t1_stop); | |
- | |
- // Calculate time spent in t1 and t2 | |
- cudaEventElapsedTime(&t1, t1_go, t1_stop); | |
- cudaEventElapsedTime(&t2, t2_go, t2_stop); | |
- | |
- // Report time spent | |
- cout << " Time spent on entire GPU routine: " | |
- << t1 << " ms\n"; | |
- cout << " - Kernels: " << t2 << " ms\n" | |
- << " - Memory alloc. and transfer: " << t1-t2 << " ms\n"; | |
- | |
- // Return successfully | |
- return 0; | |
-} | |
diff --git a/raytracer/rt-kernel.h b/raytracer/rt-kernel.h | |
t@@ -1,35 +0,0 @@ | |
-#ifndef RT_KERNEL_H_ | |
-#define RT_KERNEL_H_ | |
- | |
-#include <vector_functions.h> | |
-#include "../src/datatypes.h" | |
-#include "header.h" | |
- | |
-// Constants | |
-__constant__ float4 const_u; | |
-__constant__ float4 const_v; | |
-__constant__ float4 const_w; | |
-__constant__ float4 const_eye; | |
-__constant__ float4 const_imgplane; | |
-__constant__ float const_d; | |
-__constant__ float4 const_light; | |
-__constant__ unsigned int const_pixels; | |
-__constant__ Inttype const_np; | |
- | |
- | |
-// Host prototype functions | |
- | |
-extern "C" | |
-void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio, | |
- unsigned int pixels, Inttype np); | |
- | |
-extern "C" | |
-void checkForCudaErrors(const char* checkpoint_description); | |
- | |
-extern "C" | |
-int rt(float4* p, Inttype np, | |
- rgb* img, unsigned int width, unsigned int height, | |
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw, | |
- int visualize, float max_val, | |
- float* fixvel, float* xsum, float* pres, float* es_dot, float* es, floa… | |
-#endif | |
diff --git a/raytracer/rt_GPU_init_pres.sh b/raytracer/rt_GPU_init_pres.sh | |
t@@ -1,36 +0,0 @@ | |
-#!/bin/bash | |
- | |
-# Pass max. pressure as command line argument | |
- | |
-XRES=400 | |
-YRES=1600 | |
- | |
-WORKHORSE=GPU | |
- | |
-echo "# Rendering PPM images" | |
-for F in ../output/*init*.bin | |
-do | |
- BASE=`basename $F` | |
- OUTFILE=$BASE.ppm.jpg | |
- if [ -e ../img_out/$OUTFILE ] | |
- then | |
- echo "$OUTFILE exists." > /dev/null | |
- else | |
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null | |
- if [ $? -ne 0 ] ; then | |
- echo "Error rendering $F, trying again..." | |
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/n… | |
- fi | |
- fi | |
-done | |
- | |
-echo "# Converting PPM files to JPEG using ImageMagick in parallel" | |
-for F in ../img_out/*.ppm | |
-do | |
- (BASE=`basename $F`; convert $F $F.jpg > /dev/null &) | |
-done | |
- | |
-sleep 5 | |
-echo "# Removing temporary PPM files" | |
-rm ../img_out/*.ppm | |
- | |
diff --git a/raytracer/rt_GPU_pres.sh b/raytracer/rt_GPU_pres.sh | |
t@@ -1,36 +0,0 @@ | |
-#!/bin/bash | |
- | |
-# Pass max. pressure as command line argument | |
- | |
-XRES=800 | |
-YRES=800 | |
- | |
-WORKHORSE=GPU | |
- | |
-echo "# Rendering PPM images" | |
-for F in ../output/*.bin | |
-do | |
- BASE=`basename $F` | |
- OUTFILE=$BASE.ppm.jpg | |
- if [ -e ../img_out/$OUTFILE ] | |
- then | |
- echo "$OUTFILE exists." > /dev/null | |
- else | |
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null | |
- if [ $? -ne 0 ] ; then | |
- echo "Error rendering $F, trying again..." | |
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/n… | |
- fi | |
- fi | |
-done | |
- | |
-echo "# Converting PPM files to JPEG using ImageMagick in parallel" | |
-for F in ../img_out/*.ppm | |
-do | |
- (BASE=`basename $F`; convert $F $F.jpg > /dev/null &) | |
-done | |
- | |
-sleep 5 | |
-echo "# Removing temporary PPM files" | |
-rm ../img_out/*.ppm | |
- | |
diff --git a/raytracer/rt_GPU_tall_pres.sh b/raytracer/rt_GPU_tall_pres.sh | |
t@@ -1,36 +0,0 @@ | |
-#!/bin/bash | |
- | |
-# Pass max. pressure as command line argument | |
- | |
-XRES=800 | |
-YRES=1200 | |
- | |
-WORKHORSE=GPU | |
- | |
-echo "# Rendering PPM images" | |
-for F in ../output/*.bin | |
-do | |
- BASE=`basename $F` | |
- OUTFILE=$BASE.ppm.jpg | |
- if [ -e ../img_out/$OUTFILE ] | |
- then | |
- echo "$OUTFILE exists." > /dev/null | |
- else | |
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null | |
- if [ $? -ne 0 ] ; then | |
- echo "Error rendering $F, trying again..." | |
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/n… | |
- fi | |
- fi | |
-done | |
- | |
-echo "# Converting PPM files to JPEG using ImageMagick in parallel" | |
-for F in ../img_out/*.ppm | |
-do | |
- (BASE=`basename $F`; convert $F $F.jpg > /dev/null &) | |
-done | |
- | |
-sleep 5 | |
-echo "# Removing temporary PPM files" | |
-rm ../img_out/*.ppm | |
- |