GopherProxy

	device.cu: estimate number of cudaCoresPerSM for CC version 8 - sphere - GPU-b…
	git clone git://src.adamsgaard.dk/sphere
	Log
	Files
	Refs
	LICENSE
	---
	commit 04f7aa6ff1ec5eb78167e8b40fb399ed3f63a093
	parent 401f418676345ca4b80e2725106c40fc37f28e31
	Author: Anders Damsgaard <[email protected]>
	Date: Thu, 9 Feb 2023 12:43:46 +0100

	device.cu: estimate number of cudaCoresPerSM for CC version 8

	Diffstat:
	M src/device.cu | 104 ++++++++++++++++-------------…

	1 file changed, 53 insertions(+), 51 deletions(-)
	---
	diff --git a/src/device.cu b/src/device.cu
	@@ -54,6 +54,8 @@ int cudaCoresPerSM(int major, int minor)
	return 128;
	else if (major == 7)
	return 32;
	+ else if (major == 8)
	+ return 64;
	else
	printf("Error in cudaCoresPerSM Device compute capability value "
	"(%d.%d) not recognized.", major, minor);
	@@ -112,11 +114,11 @@ void DEM::initializeGPU(void)

	if (verbose == 1) {
	cout << " CUDA device ID: " << d << "\n";
	- cout << " - Name: " << prop.name << ", compute capability: "
	+ cout << " - Name: " << prop.name << ", compute capability: "
	<< prop.major << "." << prop.minor << ".\n";
	- cout << " - CUDA Driver version: " << cudaDriverVersion/1000
	- << "." << cudaDriverVersion%100
	- << ", runtime version " << cudaRuntimeVersion/1000 << "."
	+ cout << " - CUDA Driver version: " << cudaDriverVersion/1000
	+ << "." << cudaDriverVersion%100
	+ << ", runtime version " << cudaRuntimeVersion/1000 << "."
	<< cudaRuntimeVersion%100 << std::endl;
	}
	}
	@@ -138,11 +140,11 @@ void DEM::initializeGPU(void)

	if (verbose == 1) {
	cout << " CUDA device ID: " << device << "\n";
	- cout << " - Name: " << prop.name << ", compute capability: "
	+ cout << " - Name: " << prop.name << ", compute capability: "
	<< prop.major << "." << prop.minor << ".\n";
	- cout << " - CUDA Driver version: " << cudaDriverVersion/1000
	- << "." << cudaDriverVersion%100
	- << ", runtime version " << cudaRuntimeVersion/1000 << "."
	+ cout << " - CUDA Driver version: " << cudaDriverVersion/1000
	+ << "." << cudaDriverVersion%100
	+ << ", runtime version " << cudaRuntimeVersion/1000 << "."
	<< cudaRuntimeVersion%100
	<< "\n - " << ncudacores << " CUDA cores" << std::endl;
	}
	@@ -284,7 +286,7 @@ __global__ void checkParticlePositions(


	// Copy the constant data components to device memory,
	-// and check whether the values correspond to the
	+// and check whether the values correspond to the
	// values in constant memory.
	void DEM::checkConstantMemory()
	{
	@@ -386,7 +388,7 @@ void DEM::updateGridSize()
	grid.L[2] = *Lz;

	// Write value to devC_grid.L[2]
	- //cudaMemcpyToSymbol(devC_grid.L[2], &Lz, sizeof(Float));
	+ //cudaMemcpyToSymbol(devC_grid.L[2], &Lz, sizeof(Float));
	cudaMemcpyToSymbol(devC_grid, &grid, sizeof(Grid));

	checkForCudaErrors("DEM::updateGridSize(): write to devC_grid.L[2]");
	@@ -857,15 +859,15 @@ void DEM::startTime()
	tic = clock();

	//// GPU workload configuration
	- unsigned int threadsPerBlock = 256;
	- //unsigned int threadsPerBlock = 512;
	+ unsigned int threadsPerBlock = 256;
	+ //unsigned int threadsPerBlock = 512;

	// Create enough blocks to accomodate the particles
	- unsigned int blocksPerGrid = iDivUp(np, threadsPerBlock);
	+ unsigned int blocksPerGrid = iDivUp(np, threadsPerBlock);
	dim3 dimGrid(blocksPerGrid, 1, 1); // Blocks arranged in 1D grid
	dim3 dimBlock(threadsPerBlock, 1, 1); // Threads arranged in 1D block

	- unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock);
	+ unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock);
	dim3 dimGridBonds(blocksPerGridBonds, 1, 1); // Blocks arranged in 1D grid

	// Use 3D block and grid layout for cell-centered fluid calculations
	@@ -930,9 +932,9 @@ void DEM::startTime()
	//sprintf(file,"output/%s.status.dat", sid);
	outfile = "output/" + sid + ".status.dat";
	fp = fopen(outfile.c_str(), "w");
	- fprintf(fp,"%2.4e %2.4e %d\n",
	- time.current,
	- 100.0*time.current/time.total,
	+ fprintf(fp,"%2.4e %2.4e %d\n",
	+ time.current,
	+ 100.0*time.current/time.total,
	time.step_count);
	fclose(fp);

	@@ -1052,19 +1054,19 @@ void DEM::startTime()
	checkForCudaErrorsIter("Post checkParticlePositions", iter);
	#endif

	- // If the grid is adaptive, readjust the grid height to equal the
	+ // If the grid is adaptive, readjust the grid height to equal the
	// positions of the dynamic walls
	if (grid.adaptive == 1 && walls.nw > 0) {
	updateGridSize();
	}

	- // For each particle:
	- // Compute hash key (cell index) from position
	+ // For each particle:
	+ // Compute hash key (cell index) from position
	// in the fine, uniform and homogenous grid.
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
	- dev_gridParticleIndex,
	+ dev_gridParticleIndex,
	dev_x);

	// Synchronization point
	@@ -1093,7 +1095,7 @@ void DEM::startTime()
	// Zero cell array values by setting cellStart to its highest
	// possible value, specified with pointer value 0xffffffff, which
	// for a 32 bit unsigned int is 4294967295.
	- cudaMemset(dev_cellStart, 0xffffffff,
	+ cudaMemset(dev_cellStart, 0xffffffff,
	grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
	cudaThreadSynchronize();
	checkForCudaErrorsIter("Post cudaMemset", iter);
	@@ -1103,14 +1105,14 @@ void DEM::startTime()
	// configurations in new arrays (*_sorted).
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
	+ reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
	dev_cellEnd,
	- dev_gridParticleCellID,
	+ dev_gridParticleCellID,
	dev_gridParticleIndex,
	- dev_x, dev_vel,
	+ dev_x, dev_vel,
	dev_angvel,
	- dev_x_sorted,
	- dev_vel_sorted,
	+ dev_x_sorted,
	+ dev_vel_sorted,
	dev_angvel_sorted);

	// Synchronization point
	@@ -1127,10 +1129,10 @@ void DEM::startTime()
	// For each particle: Search contacts in neighbor cells
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- topology<<<dimGrid, dimBlock>>>(dev_cellStart,
	+ topology<<<dimGrid, dimBlock>>>(dev_cellStart,
	dev_cellEnd,
	dev_gridParticleIndex,
	- dev_x_sorted,
	+ dev_x_sorted,
	dev_contacts,
	dev_distmod);

	@@ -1159,10 +1161,10 @@ void DEM::startTime()
	dev_angvel_sorted,
	dev_vel,
	dev_angvel,
	- dev_force,
	- dev_torque,
	+ dev_force,
	+ dev_torque,
	dev_es_dot,
	- dev_ev_dot,
	+ dev_ev_dot,
	dev_es,
	dev_ev,
	dev_p,
	@@ -1828,7 +1830,7 @@ void DEM::startTime()
	if (write_conv_log == 1)
	convlog << iter+1 << '\t' << nijac << std::end…

	- std::cerr << "\nIteration " << iter << ", time "
	+ std::cerr << "\nIteration " << iter << ", time "
	<< iter*time.dt << " s: "
	"Error, the epsilon solution in the fluid "
	"calculations did not converge. Try increasing…
	@@ -1898,7 +1900,7 @@ void DEM::startTime()
	} // end cfd_solver == 0

	// Darcy solution
	- else if (cfd_solver == 1) {
	+ else if (cfd_solver == 1) {

	#if defined(REPORT_EPSILON) || defined(REPORT_FORCING_TERMS)
	std::cout << "\n\n@@@@@@ TIME STEP " << iter << " @@@"
	@@ -2027,7 +2029,7 @@ void DEM::startTime()

	// copy porosities to the frictionless Y boundaries
	if (grid.periodic == 2) {
	- copyDarcyPorositiesToEdges<<<dimGridFluid,
	+ copyDarcyPorositiesToEdges<<<dimGridFluid,
	dimBlockFluid>>>(
	dev_darcy_phi,
	dev_darcy_dphi,
	@@ -2038,7 +2040,7 @@ void DEM::startTime()

	// copy porosities to the frictionless lower Z boundary
	if (grid.periodic == 2) {
	- copyDarcyPorositiesToBottom<<<dimGridFluid,
	+ copyDarcyPorositiesToBottom<<<dimGridFluid,
	dimBlockFluid>>>(
	dev_darcy_phi,
	dev_darcy_dphi,
	@@ -2369,7 +2371,7 @@ void DEM::startTime()
	if (write_conv_log == 1)
	convlog << iter+1 << '\t' << nijac << std::end…

	- std::cerr << "\nIteration " << iter << ", time "
	+ std::cerr << "\nIteration " << iter << ", time "
	<< iter*time.dt << " s: "
	"Error, the pressure solution in the fluid "
	"calculations did not converge. Try increasing…
	@@ -2471,14 +2473,14 @@ void DEM::startTime()
	// Update particle kinematics
	if (PROFILING == 1)
	startTimer(&kernel_tic);
	- integrate<<<dimGrid, dimBlock>>>(dev_x_sorted,
	- dev_vel_sorted,
	+ integrate<<<dimGrid, dimBlock>>>(dev_x_sorted,
	+ dev_vel_sorted,
	dev_angvel_sorted,
	- dev_x,
	- dev_vel,
	+ dev_x,
	+ dev_vel,
	dev_angvel,
	dev_force,
	- dev_torque,
	+ dev_torque,
	dev_angpos,
	dev_acc,
	dev_angacc,
	@@ -2571,7 +2573,7 @@ void DEM::startTime()
	}


	- // Produce output binary if the time interval
	+ // Produce output binary if the time interval
	// between output files has been reached
	if (filetimeclock >= time.file_dt) {

	@@ -2646,19 +2648,19 @@ void DEM::startTime()
	cout << "\n## Particle " << n << " ##\n";

	cout << "- contacts:\n";
	- for (int nc = 0; nc < NC; ++nc)
	+ for (int nc = 0; nc < NC; ++nc)
	cout << "[" << nc << "]=" << k.contacts[nc+NC*n] <<
	'\n';

	cout << "\n- delta_t:\n";
	- for (int nc = 0; nc < NC; ++nc)
	+ for (int nc = 0; nc < NC; ++nc)
	cout << k.delta_t[nc+NC*n].x << '\t'
	<< k.delta_t[nc+NC*n].y << '\t'
	<< k.delta_t[nc+NC*n].z << '\t'
	<< k.delta_t[nc+NC*n].w << '\n';

	cout << "\n- distmod:\n";
	- for (int nc = 0; nc < NC; ++nc)
	+ for (int nc = 0; nc < NC; ++nc)
	cout << k.distmod[nc+NC*n].x << '\t'
	<< k.distmod[nc+NC*n].y << '\t'
	<< k.distmod[nc+NC*n].z << '\t'
	@@ -2667,11 +2669,11 @@ void DEM::startTime()
	cout << '\n';
	}

	- // Update status.dat at the interval of filetime
	+ // Update status.dat at the interval of filetime
	outfile = "output/" + sid + ".status.dat";
	fp = fopen(outfile.c_str(), "w");
	- fprintf(fp,"%2.4e %2.4e %d\n",
	- time.current,
	+ fprintf(fp,"%2.4e %2.4e %d\n",
	+ time.current,
	100.0*time.current/time.total,
	time.step_count);
	fclose(fp);
	@@ -2697,7 +2699,7 @@ void DEM::startTime()

	if (verbose == 1) {
	cout << "\nSimulation ended. Statistics:\n"
	- << " - Last output file number: "
	+ << " - Last output file number: "
	<< time.step_count << "\n"
	<< " - GPU time spent: "
	<< dev_time_spent/1000.0f << " s\n"
	@@ -2705,7 +2707,7 @@ void DEM::startTime()
	<< time_spent << " s\n"
	<< " - Mean duration of iteration:\n"
	<< " " << dev_time_spent/((double)iter*1000.0f) << " s"
	- << std::endl;
	+ << std::endl;
	}

	cudaEventDestroy(dev_tic);