GopherProxy

	fix various minor issues related to memory initialization, suggest clang-3.8 t…
	git clone git://src.adamsgaard.dk/sphere
	Log
	Files
	Refs
	LICENSE
	---
	commit f892592dcad73329dce16040772caa240511678b
	parent 103a8713606c2aa6257b0d5bcf4ae4b6562fea66
	Author: Anders Damsgaard <[email protected]>
	Date: Thu, 7 Sep 2017 12:54:25 -0700

	fix various minor issues related to memory initialization, suggest clang-3.8 to…

	Diffstat:
	M src/CMakeLists.txt | 11 +++++++----
	M src/device.cu | 74 ++++++++++++++++-------------…
	M src/sphere.cpp | 4 ++--
	M tests/io_tests.py | 1 +

	4 files changed, 49 insertions(+), 41 deletions(-)
	---
	diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
	@@ -3,7 +3,7 @@
	#LINK_LIBRARIES("-L${CUDA_SDK_ROOT_DIR}/lib -lcutil_x86_64") # For 64 bit syst…

	# Ohter folders to include
	-SET(CUDA_SDK_ROOT_DIR "/usr/local/cuda-5.0/samples")
	+#SET(CUDA_SDK_ROOT_DIR "/usr/local/cuda-5.0/samples")
	INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
	INCLUDE_DIRECTORIES("${CMAKE_BINARY_DIR}/src")
	SET(EXECUTABLE_OUTPUT_PATH "../")
	@@ -16,13 +16,16 @@ INCLUDE(FindCUDA)
	IF (GPU_GENERATION EQUAL 1) # Kepler
	SET(CUDA_NVCC_FLAGS
	#"--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35…
	- "--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\…
	+ #"--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35…
	+ #"--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35…
	+ "--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\…
	ELSE() # Fermi
	SET(CUDA_NVCC_FLAGS
	#"--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20…
	- "--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\…
	+ #"--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20…
	+ "--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\…
	ENDIF (GPU_GENERATION EQUAL 1)
	-SET(CMAKE_CXX_FLAGS "-fPIC ${CMAKE_CXX_FLAGS}")
	+#SET(CMAKE_CXX_FLAGS "-fPIC ${CMAKE_CXX_FLAGS}")

	# Rule to build executable program
	CUDA_ADD_EXECUTABLE(sphere
	diff --git a/src/device.cu b/src/device.cu
	@@ -48,15 +48,14 @@ int cudaCoresPerSM(int major, int minor)
	else if (major == 6 && minor == 1)
	return 128;
	else
	- printf("Error in cudaCoresPerSM",
	- "Device compute capability value (%d.%d) not recognized.",
	- major, minor);
	+ printf("Error in cudaCoresPerSM Device compute capability value "
	+ "(%d.%d) not recognized.", major, minor);
	return -1;
	}

	// Wrapper function for initializing the CUDA components.
	// Called from main.cpp
	-__host__ void DEM::initializeGPU(void)
	+void DEM::initializeGPU(void)
	{
	using std::cout; // stdout

	@@ -149,13 +148,13 @@ __host__ void DEM::initializeGPU(void)
	}

	// Start timer for kernel profiling
	-__host__ void startTimer(cudaEvent_t* kernel_tic)
	+void startTimer(cudaEvent_t* kernel_tic)
	{
	cudaEventRecord(*kernel_tic);
	}

	// Stop timer for kernel profiling and time to function sum
	-__host__ void stopTimer(cudaEvent_t *kernel_tic,
	+void stopTimer(cudaEvent_t *kernel_tic,
	cudaEvent_t *kernel_toc,
	float *kernel_elapsed,
	double* sum)
	@@ -280,7 +279,7 @@ __global__ void checkParticlePositions(
	// Copy the constant data components to device memory,
	// and check whether the values correspond to the
	// values in constant memory.
	-__host__ void DEM::checkConstantMemory()
	+void DEM::checkConstantMemory()
	{
	// Allocate space in global device memory
	Grid* dev_grid;
	@@ -322,7 +321,7 @@ __host__ void DEM::checkConstantMemory()
	}

	// Copy selected constant components to constant device memory.
	-__host__ void DEM::transferToConstantDeviceMemory()
	+void DEM::transferToConstantDeviceMemory()
	{
	using std::cout;

	@@ -361,7 +360,7 @@ __global__ void printWorldSize(Float4* dev_walls_nx)
	dev_walls_nx[0].w);
	}

	-__host__ void DEM::updateGridSize()
	+void DEM::updateGridSize()
	{
	//printf("\nDEM::updateGridSize() start\n");
	Float* Lz = new Float;
	@@ -396,7 +395,7 @@ __host__ void DEM::updateGridSize()

	// Allocate device memory for particle variables,
	// tied to previously declared pointers in structures
	-__host__ void DEM::allocateGlobalDeviceMemory(void)
	+void DEM::allocateGlobalDeviceMemory(void)
	{
	// Particle memory size
	unsigned int memSizeF = sizeof(Float) * np;
	@@ -482,7 +481,7 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)

	// Allocate global memory on other devices required for "interact" function.
	// The values of domain_size[ndevices] must be set beforehand.
	-__host__ void DEM::allocateHelperDeviceMemory(void)
	+void DEM::allocateHelperDeviceMemory(void)
	{
	// Particle memory size
	unsigned int memSizeF4 = sizeof(Float4) * np;
	@@ -555,7 +554,7 @@ __host__ void DEM::allocateHelperDeviceMemory(void)
	cudaSetDevice(device); // select main GPU
	}

	-__host__ void DEM::freeHelperDeviceMemory()
	+void DEM::freeHelperDeviceMemory()
	{
	for (int d=0; d<ndevices; d++) {

	@@ -593,7 +592,7 @@ __host__ void DEM::freeHelperDeviceMemory()
	cudaSetDevice(device); // select primary GPU
	}

	-__host__ void DEM::freeGlobalDeviceMemory()
	+void DEM::freeGlobalDeviceMemory()
	{
	if (verbose == 1)
	printf("\nFreeing device memory: ");
	@@ -658,7 +657,7 @@ __host__ void DEM::freeGlobalDeviceMemory()
	}


	-__host__ void DEM::transferToGlobalDeviceMemory(int statusmsg)
	+void DEM::transferToGlobalDeviceMemory(int statusmsg)
	{
	if (verbose == 1 && statusmsg == 1)
	std::cout << " Transfering data to the device: ";
	@@ -745,7 +744,7 @@ __host__ void DEM::transferToGlobalDeviceMemory(int status…
	std::cout << "Done" << std::endl;
	}

	-__host__ void DEM::transferFromGlobalDeviceMemory()
	+void DEM::transferFromGlobalDeviceMemory()
	{
	//std::cout << " Transfering data from the device: ";

	@@ -824,7 +823,7 @@ __host__ void DEM::transferFromGlobalDeviceMemory()


	// Iterate through time by explicit time integration
	-__host__ void DEM::startTime()
	+void DEM::startTime()
	{
	using std::cout;
	using std::cerr;
	@@ -1002,16 +1001,18 @@ __host__ void DEM::startTime()
	unsigned int wall0_iz = 10000000;
	// weight of fluid between two cells in z direction
	Float dp_dz;
	- if (cfd_solver == 0)
	- dp_dz = fabs(ns.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
	- else if (cfd_solver == 1) {
	- dp_dz = fabs(darcy.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
	-
	- // determine pressure at top wall at t=0
	- darcy.p_top_orig = darcy.p[d_idx(0,0,darcy.nz-1)]
	- - darcy.p_mod_A
	- *sin(2.0*M_PI*darcy.p_mod_f*time.current
	- + darcy.p_mod_phi);
	+ if (fluid == 1) {
	+ if (cfd_solver == 0)
	+ dp_dz = fabs(ns.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
	+ else if (cfd_solver == 1) {
	+ dp_dz = fabs(darcy.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
	+
	+ // determine pressure at top wall at t=0
	+ darcy.p_top_orig = darcy.p[d_idx(0,0,darcy.nz-1)]
	+ - darcy.p_mod_A
	+ *sin(2.0*M_PI*darcy.p_mod_f*time.current
	+ + darcy.p_mod_phi);
	+ }
	}
	//std::cout << "dp_dz = " << dp_dz << std::endl;

	@@ -2589,13 +2590,15 @@ __host__ void DEM::startTime()
	iter);

	// Empty the dphi values after device to host transfer
	- if (fluid == 1 && cfd_solver == 1) {
	- setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
	- (dev_darcy_dphi);
	- cudaThreadSynchronize();
	- checkForCudaErrorsIter(
	- "After setDarcyZeros(dev_darcy_dphi) after transfer",
	- iter);
	+ if (fluid == 1) {
	+ if (cfd_solver == 1) {
	+ setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
	+ (dev_darcy_dphi);
	+ cudaThreadSynchronize();
	+ checkForCudaErrorsIter(
	+ "After setDarcyZeros(dev_darcy_dphi) after transfer",
	+ iter);
	+ }
	}

	// Pause the CPU thread until all CUDA calls previously issued are
	@@ -2603,8 +2606,9 @@ __host__ void DEM::startTime()
	cudaThreadSynchronize();

	// Check the numerical stability of the NS solver
	- if (fluid == 1 && cfd_solver == 0)
	- checkNSstability();
	+ if (fluid == 1)
	+ if (cfd_solver == 0)
	+ checkNSstability();

	// Write binary output file
	time.step_count += 1;
	diff --git a/src/sphere.cpp b/src/sphere.cpp
	@@ -21,7 +21,7 @@ DEM::DEM(const std::string inputbin,
	const int transferConstMem,
	const int fluidFlow,
	const int device)
	-: verbose(verbosity), fluid(fluidFlow), device(device)
	+: verbose(verbosity), device(device), fluid(fluidFlow)
	{
	using std::cout;
	using std::cerr;
	@@ -874,7 +874,7 @@ void DEM::forcechains(const std::string format, const int …
	cout << k.x[i].z;
	cout << " to " << k.x[j].x << ',';
	if (threedim == 1)
	- cout << k.x[j].y, ',';
	+ cout << k.x[j].y << ',';
	cout << k.x[j].z;
	cout << " nohead "
	<< "lw " << ratio * thickness_scaling
	diff --git a/tests/io_tests.py b/tests/io_tests.py
	@@ -27,6 +27,7 @@ compare(orig, py, "Python IO:")
	# Test C++ IO routines
	#orig.run(verbose=True, hideinputfile=True)
	orig.run(dry=True)
	+#orig.run(valgrind=True)
	orig.run()
	cpp = sphere.sim()
	cpp.readbin("../output/" + orig.sid + ".output00000.bin", verbose=False)