diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp index 493e049010bb725934600289bdd00029d56bb0f1..942158e7c210024b0b1c7bca447d80ac4ffb41d3 100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp @@ -9,165 +9,165 @@ //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //void exchangePreCollDataXGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) //{ -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //copy Device to Host -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) -// { -// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->sendProcessNeighborX[i].f[0], -// para->getParD(level)->sendProcessNeighborX[i].index, -// para->getParD(level)->sendProcessNeighborX[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// cudaManager->cudaCopyProcessNeighborXFsDH(level, i); -// } -// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //start non blocking MPI receive -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) -// { -// comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborX[i].f[0], -// para->getParH(level)->recvProcessNeighborX[i].numberOfFs, -// para->getParH(level)->recvProcessNeighborX[i].rankNeighbor); -// } -// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ////start non blocking MPI send -// //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) -// //{ -// // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], -// // para->getParH(level)->sendProcessNeighborX[i].numberOfFs, -// // para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); -// //} -// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ////Waitall -// //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) -// //{ -// // comm->waitallGPU(); -// //} -// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //start blocking MPI send -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) -// { -// comm->sendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], -// para->getParH(level)->sendProcessNeighborX[i].numberOfFs, -// para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //Wait -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) -// { -// comm->waitGPU(i); -// } -// 
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //reset the request array -// if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) -// { -// comm->resetRequest(); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //copy Host to Device -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) -// { -// cudaManager->cudaCopyProcessNeighborXFsHD(level, i); -// ////////////////////////////////////////////////////////////////////////// -// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->recvProcessNeighborX[i].f[0], -// para->getParD(level)->recvProcessNeighborX[i].index, -// para->getParD(level)->recvProcessNeighborX[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //copy Device to Host +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) +// { +// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->sendProcessNeighborX[i].f[0], +// para->getParD(level)->sendProcessNeighborX[i].index, +// para->getParD(level)->sendProcessNeighborX[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// cudaManager->cudaCopyProcessNeighborXFsDH(level, i); +// } +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //start non blocking MPI receive +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) +// { +// comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborX[i].f[0], +// para->getParH(level)->recvProcessNeighborX[i].numberOfFs, +// para->getParH(level)->recvProcessNeighborX[i].rankNeighbor); +// } +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// ////start non blocking MPI send +// //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) +// //{ +// // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], +// // para->getParH(level)->sendProcessNeighborX[i].numberOfFs, +// // para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); +// //} +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// ////Waitall +// //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) +// //{ +// // comm->waitallGPU(); +// //} +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// 
//start blocking MPI send +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) +// { +// comm->sendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], +// para->getParH(level)->sendProcessNeighborX[i].numberOfFs, +// para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //Wait +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) +// { +// comm->waitGPU(i); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //reset the request array +// if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) +// { +// comm->resetRequest(); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //copy Host to Device +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) +// { +// cudaManager->cudaCopyProcessNeighborXFsHD(level, i); +// ////////////////////////////////////////////////////////////////////////// +// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->recvProcessNeighborX[i].f[0], +// para->getParD(level)->recvProcessNeighborX[i].index, +// para->getParD(level)->recvProcessNeighborX[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //} //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->sendProcessNeighborX[i].f[0], - para->getParD(level)->sendProcessNeighborX[i].index, - para->getParD(level)->sendProcessNeighborX[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborXFsDH(level, i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborX[i].f[0], - para->getParH(level)->recvProcessNeighborX[i].numberOfFs, - para->getParH(level)->recvProcessNeighborX[i].rankNeighbor); - } - 
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////start non blocking MPI send - //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - //{ - // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], - // para->getParH(level)->sendProcessNeighborX[i].numberOfFs, - // para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////Waitall - //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) - //{ - // comm->waitallGPU(); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->sendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], - para->getParH(level)->sendProcessNeighborX[i].numberOfFs, - para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborXFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->recvProcessNeighborX[i].f[0], - para->getParD(level)->recvProcessNeighborX[i].index, - para->getParD(level)->recvProcessNeighborX[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->sendProcessNeighborX[i].f[0], + para->getParD(level)->sendProcessNeighborX[i].index, + para->getParD(level)->sendProcessNeighborX[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborXFsDH(level, i); + } + 
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborX[i].f[0], + para->getParH(level)->recvProcessNeighborX[i].numberOfFs, + para->getParH(level)->recvProcessNeighborX[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////start non blocking MPI send + //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + //{ + // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], + // para->getParH(level)->sendProcessNeighborX[i].numberOfFs, + // para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////Waitall + //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) + //{ + // comm->waitallGPU(); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->sendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], + para->getParH(level)->sendProcessNeighborX[i].numberOfFs, + para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborXFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->recvProcessNeighborX[i].f[0], + para->getParD(level)->recvProcessNeighborX[i].index, + para->getParD(level)->recvProcessNeighborX[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -181,93 +181,93 @@ void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu // int streamIndex) //{ // cudaStream_t stream = (streamIndex == -1) ? 
CU_STREAM_LEGACY : para->getStreamManager().getStream(streamIndex); -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //copy Device to Host -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) -// { -// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->sendProcessNeighborY[i].f[0], -// para->getParD(level)->sendProcessNeighborY[i].index, -// para->getParD(level)->sendProcessNeighborY[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads, -// stream); -// //////////////////////////////////////////////////////////////////////////); -// cudaManager->cudaCopyProcessNeighborYFsDH(level, i, streamIndex); -// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //copy Device to Host +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) +// { +// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->sendProcessNeighborY[i].f[0], +// para->getParD(level)->sendProcessNeighborY[i].index, +// para->getParD(level)->sendProcessNeighborY[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads, +// stream); +// //////////////////////////////////////////////////////////////////////////); +// cudaManager->cudaCopyProcessNeighborYFsDH(level, i, streamIndex); +// } // -// //if (para->getUseStreams() && startBulkKernel!=nullptr) -// // cudaEventRecord(*startBulkKernel, stream); +// //if (para->getUseStreams() && startBulkKernel!=nullptr) +// // cudaEventRecord(*startBulkKernel, stream); // -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //start non blocking MPI receive -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) -// { -// comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborY[i].f[0], -// para->getParH(level)->recvProcessNeighborY[i].numberOfFs, -// para->getParH(level)->recvProcessNeighborY[i].rankNeighbor); -// } -// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ////start non blocking MPI send -// //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) -// //{ -// // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], -// // para->getParH(level)->sendProcessNeighborY[i].numberOfFs, -// // para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); -// //} -// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ////Waitall -// //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) -// //{ -// // comm->waitallGPU(); -// //} +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //start non blocking MPI receive +// for (unsigned int i 
= 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) +// { +// comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborY[i].f[0], +// para->getParH(level)->recvProcessNeighborY[i].numberOfFs, +// para->getParH(level)->recvProcessNeighborY[i].rankNeighbor); +// } +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// ////start non blocking MPI send +// //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) +// //{ +// // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], +// // para->getParH(level)->sendProcessNeighborY[i].numberOfFs, +// // para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); +// //} +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// ////Waitall +// //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) +// //{ +// // comm->waitallGPU(); +// //} // ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // // wait for memcopy host to device to finish before sending data // if (para->getUseStreams()) // cudaStreamSynchronize(stream); // ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //start blocking MPI send -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) -// { -// comm->sendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], -// para->getParH(level)->sendProcessNeighborY[i].numberOfFs, -// para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //Wait -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) -// { -// comm->waitGPU(i); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //reset the request array -// if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) -// { -// comm->resetRequest(); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //copy Host to Device -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) -// { -// cudaManager->cudaCopyProcessNeighborYFsHD(level, i, streamIndex); -// ////////////////////////////////////////////////////////////////////////// -// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->recvProcessNeighborY[i].f[0], -// para->getParD(level)->recvProcessNeighborY[i].index, -// para->getParD(level)->recvProcessNeighborY[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads, -// stream); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //start blocking MPI send +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) +// { +// 
comm->sendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], +// para->getParH(level)->sendProcessNeighborY[i].numberOfFs, +// para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //Wait +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) +// { +// comm->waitGPU(i); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //reset the request array +// if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) +// { +// comm->resetRequest(); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //copy Host to Device +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) +// { +// cudaManager->cudaCopyProcessNeighborYFsHD(level, i, streamIndex); +// ////////////////////////////////////////////////////////////////////////// +// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->recvProcessNeighborY[i].f[0], +// para->getParD(level)->recvProcessNeighborY[i].index, +// para->getParD(level)->recvProcessNeighborY[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads, +// stream); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //} void prepareExchangePostCollDataYGPU27(Parameter *para, int level, int streamIndex, std::string eventToTrigger) @@ -288,77 +288,77 @@ void exchangePostCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu int streamIndex) { cudaStream_t stream = (streamIndex == -1) ? 
CU_STREAM_LEGACY : para->getStreamManager().getStream(streamIndex); - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - cudaManager->cudaCopyProcessNeighborYFsDH(level, i, streamIndex); + cudaManager->cudaCopyProcessNeighborYFsDH(level, i, streamIndex); - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborY[i].f[0], - para->getParH(level)->recvProcessNeighborY[i].numberOfFs, - para->getParH(level)->recvProcessNeighborY[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////start non blocking MPI send - //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - //{ - // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], - // para->getParH(level)->sendProcessNeighborY[i].numberOfFs, - // para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////Waitall - //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) - //{ - // comm->waitallGPU(); - //} + { + comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborY[i].f[0], + para->getParH(level)->recvProcessNeighborY[i].numberOfFs, + para->getParH(level)->recvProcessNeighborY[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////start non blocking MPI send + //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + //{ + // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], + // para->getParH(level)->sendProcessNeighborY[i].numberOfFs, + // para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////Waitall + //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) + //{ + // comm->waitallGPU(); + //} ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // wait for memcopy device to host to finish before sending data if (para->getUseStreams()) cudaStreamSynchronize(stream); - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // - //start blocking MPI send + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // + //start blocking MPI send for (unsigned int i = 0; i < (unsigned 
int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->sendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], - para->getParH(level)->sendProcessNeighborY[i].numberOfFs, - para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait + { + comm->sendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], + para->getParH(level)->sendProcessNeighborY[i].numberOfFs, + para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborYFsHD(level, i, streamIndex); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->recvProcessNeighborY[i].f[0], - para->getParD(level)->recvProcessNeighborY[i].index, - para->getParD(level)->recvProcessNeighborY[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads, - stream); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + { + cudaManager->cudaCopyProcessNeighborYFsHD(level, i, streamIndex); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->recvProcessNeighborY[i].f[0], + para->getParD(level)->recvProcessNeighborY[i].index, + para->getParD(level)->recvProcessNeighborY[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads, + stream); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -370,165 +370,165 @@ void exchangePostCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu 
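Most of this patch is whitespace re-indentation, so the substantive change is easy to miss: exchangePostCollDataYGPU27 now takes a streamIndex, resolves it to a CUDA stream (streamIndex == -1 falls back to CU_STREAM_LEGACY), issues its device-to-host copies on that stream, and calls cudaStreamSynchronize(stream) before the blocking MPI send so the host send buffer is complete when MPI reads it. The sketch below condenses that sequence into one readable unit. Every call in it is taken from the patch; the function name exchangeYPostCollSketch and the compressed layout are illustrative only, and the send-buffer packing itself presumably happens earlier in prepareExchangePostCollDataYGPU27, whose body lies outside the hunks shown here.

// Condensed, illustrative restatement of the stream-aware Y-direction exchange above.
// All calls are taken from exchangePostCollDataYGPU27 in this patch; the function name
// and the compressed structure are not part of the change.
void exchangeYPostCollSketch(Parameter *para, vf::gpu::Communicator *comm,
                             CudaMemoryManager *cudaManager, int level, int streamIndex)
{
    // streamIndex == -1 selects the legacy default stream, otherwise a managed stream.
    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY
                                              : para->getStreamManager().getStream(streamIndex);
    unsigned int numNeighbors = (unsigned int)para->getNumberOfProcessNeighborsY(level, "send");

    // copy the (already packed) send distributions device -> host, asynchronously on 'stream'
    for (unsigned int i = 0; i < numNeighbors; i++)
        cudaManager->cudaCopyProcessNeighborYFsDH(level, i, streamIndex);

    // post the non-blocking receives before any send is issued
    for (unsigned int i = 0; i < numNeighbors; i++)
        comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborY[i].f[0],
                            para->getParH(level)->recvProcessNeighborY[i].numberOfFs,
                            para->getParH(level)->recvProcessNeighborY[i].rankNeighbor);

    // the host send buffers are only valid once the asynchronous copy has finished
    if (para->getUseStreams())
        cudaStreamSynchronize(stream);

    // blocking sends, then wait for the matching receives and release the MPI requests
    for (unsigned int i = 0; i < numNeighbors; i++)
        comm->sendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0],
                          para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
                          para->getParH(level)->sendProcessNeighborY[i].rankNeighbor);
    for (unsigned int i = 0; i < numNeighbors; i++)
        comm->waitGPU(i);
    if (0 < numNeighbors)
        comm->resetRequest();

    // copy the received distributions host -> device and unpack them on the GPU
    for (unsigned int i = 0; i < numNeighbors; i++)
    {
        cudaManager->cudaCopyProcessNeighborYFsHD(level, i, streamIndex);
        SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0],
                           para->getParD(level)->recvProcessNeighborY[i].f[0],
                           para->getParD(level)->recvProcessNeighborY[i].index,
                           para->getParD(level)->recvProcessNeighborY[i].numberOfNodes,
                           para->getParD(level)->neighborX_SP,
                           para->getParD(level)->neighborY_SP,
                           para->getParD(level)->neighborZ_SP,
                           para->getParD(level)->size_Mat_SP,
                           para->getParD(level)->evenOrOdd,
                           para->getParD(level)->numberofthreads,
                           stream);
    }
}

In the hunks shown here, the X- and Z-direction variants keep the original blocking structure and, so far, only gained the streamIndex parameter without using it.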
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //void exchangePreCollDataZGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) //{ -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //copy Device to Host -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) -// { -// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->sendProcessNeighborZ[i].f[0], -// para->getParD(level)->sendProcessNeighborZ[i].index, -// para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// cudaManager->cudaCopyProcessNeighborZFsDH(level, i); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //start non blocking MPI receive -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) -// { -// comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborZ[i].f[0], -// para->getParH(level)->recvProcessNeighborZ[i].numberOfFs, -// para->getParH(level)->recvProcessNeighborZ[i].rankNeighbor); -// } -// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ////start non blocking MPI send -// //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) -// //{ -// // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], -// // para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, -// // para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); -// //} -// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ////Waitall -// //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) -// //{ -// // comm->waitallGPU(); -// //} -// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //start blocking MPI send -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) -// { -// comm->sendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], -// para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, -// para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //Wait -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) -// { -// comm->waitGPU(i); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //reset the request array -// if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) -// { -// comm->resetRequest(); -// } -// 
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// //copy Host to Device -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) -// { -// cudaManager->cudaCopyProcessNeighborZFsHD(level, i); -// ////////////////////////////////////////////////////////////////////////// -// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->recvProcessNeighborZ[i].f[0], -// para->getParD(level)->recvProcessNeighborZ[i].index, -// para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// } -// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //copy Device to Host +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) +// { +// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->sendProcessNeighborZ[i].f[0], +// para->getParD(level)->sendProcessNeighborZ[i].index, +// para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// cudaManager->cudaCopyProcessNeighborZFsDH(level, i); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //start non blocking MPI receive +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) +// { +// comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborZ[i].f[0], +// para->getParH(level)->recvProcessNeighborZ[i].numberOfFs, +// para->getParH(level)->recvProcessNeighborZ[i].rankNeighbor); +// } +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// ////start non blocking MPI send +// //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) +// //{ +// // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], +// // para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, +// // para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); +// //} +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// ////Waitall +// //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) +// //{ +// // comm->waitallGPU(); +// //} +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //start blocking MPI send +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) +// { +// comm->sendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], +// para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, +// 
para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //Wait +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) +// { +// comm->waitGPU(i); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //reset the request array +// if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) +// { +// comm->resetRequest(); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// //copy Host to Device +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) +// { +// cudaManager->cudaCopyProcessNeighborZFsHD(level, i); +// ////////////////////////////////////////////////////////////////////////// +// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->recvProcessNeighborZ[i].f[0], +// para->getParD(level)->recvProcessNeighborZ[i].index, +// para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// } +// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //} //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePostCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->sendProcessNeighborZ[i].f[0], - para->getParD(level)->sendProcessNeighborZ[i].index, - para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborZFsDH(level, i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborZ[i].f[0], - para->getParH(level)->recvProcessNeighborZ[i].numberOfFs, - para->getParH(level)->recvProcessNeighborZ[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////start non blocking MPI send - //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - //{ - // 
comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], - // para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, - // para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////Waitall - //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) - //{ - // comm->waitallGPU(); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->sendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], - para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, - para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborZFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->recvProcessNeighborZ[i].f[0], - para->getParD(level)->recvProcessNeighborZ[i].index, - para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->sendProcessNeighborZ[i].f[0], + para->getParD(level)->sendProcessNeighborZ[i].index, + para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborZFsDH(level, i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + 
comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborZ[i].f[0], + para->getParH(level)->recvProcessNeighborZ[i].numberOfFs, + para->getParH(level)->recvProcessNeighborZ[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////start non blocking MPI send + //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + //{ + // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], + // para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, + // para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////Waitall + //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) + //{ + // comm->waitallGPU(); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->sendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], + para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, + para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborZFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->recvProcessNeighborZ[i].f[0], + para->getParD(level)->recvProcessNeighborZ[i].index, + para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -555,42 +555,42 @@ void exchangePostCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePreCollDataGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++) - { - 
////////////////////////////////////////////////////////////////////////// - GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->sendProcessNeighbor[i].f[0], - para->getParD(level)->sendProcessNeighbor[i].index, - para->getParD(level)->sendProcessNeighbor[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborFsDH(level, i); - ////////////////////////////////////////////////////////////////////////// - comm->exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], - para->getParH(level)->sendProcessNeighbor[i].numberOfFs, - para->getParH(level)->recvProcessNeighbor[i].f[0], - para->getParH(level)->recvProcessNeighbor[i].numberOfFs, - para->getParH(level)->sendProcessNeighbor[i].rankNeighbor); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->recvProcessNeighbor[i].f[0], - para->getParD(level)->recvProcessNeighbor[i].index, - para->getParD(level)->recvProcessNeighbor[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - } + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++) + { + ////////////////////////////////////////////////////////////////////////// + GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->sendProcessNeighbor[i].f[0], + para->getParD(level)->sendProcessNeighbor[i].index, + para->getParD(level)->sendProcessNeighbor[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborFsDH(level, i); + ////////////////////////////////////////////////////////////////////////// + comm->exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], + para->getParH(level)->sendProcessNeighbor[i].numberOfFs, + para->getParH(level)->recvProcessNeighbor[i].f[0], + para->getParH(level)->recvProcessNeighbor[i].numberOfFs, + para->getParH(level)->sendProcessNeighbor[i].rankNeighbor); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->recvProcessNeighbor[i].f[0], + para->getParD(level)->recvProcessNeighbor[i].index, + para->getParD(level)->recvProcessNeighbor[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + 
para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -600,42 +600,42 @@ void exchangePreCollDataGPU27(Parameter* para, vf::gpu::Communicator* comm, Cuda //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++) - { - ////////////////////////////////////////////////////////////////////////// - GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->sendProcessNeighbor[i].f[0], - para->getParD(level)->sendProcessNeighbor[i].index, - para->getParD(level)->sendProcessNeighbor[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborFsDH(level, i); - ////////////////////////////////////////////////////////////////////////// - comm->exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], - para->getParH(level)->sendProcessNeighbor[i].numberOfFs, - para->getParH(level)->recvProcessNeighbor[i].f[0], - para->getParH(level)->recvProcessNeighbor[i].numberOfFs, - para->getParH(level)->sendProcessNeighbor[i].rankNeighbor); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->recvProcessNeighbor[i].f[0], - para->getParD(level)->recvProcessNeighbor[i].index, - para->getParD(level)->recvProcessNeighbor[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - } + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++) + { + ////////////////////////////////////////////////////////////////////////// + GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->sendProcessNeighbor[i].f[0], + para->getParD(level)->sendProcessNeighbor[i].index, + para->getParD(level)->sendProcessNeighbor[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborFsDH(level, i); + ////////////////////////////////////////////////////////////////////////// + comm->exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], + para->getParH(level)->sendProcessNeighbor[i].numberOfFs, + 
para->getParH(level)->recvProcessNeighbor[i].f[0], + para->getParH(level)->recvProcessNeighbor[i].numberOfFs, + para->getParH(level)->sendProcessNeighbor[i].rankNeighbor); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->recvProcessNeighbor[i].f[0], + para->getParD(level)->recvProcessNeighbor[i].index, + para->getParD(level)->recvProcessNeighbor[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -649,82 +649,82 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator* comm, Cud ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //void exchangePreCollDataXGPU27(Parameter* para, vf::gpu::Communicator* comm, int level) //{ -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) -// { -// ////////////////////////////////////////////////////////////////////////// -// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->sendProcessNeighborX[i].f[0], -// para->getParD(level)->sendProcessNeighborX[i].index, -// para->getParD(level)->sendProcessNeighborX[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborXFsDH(level, i); -// ////////////////////////////////////////////////////////////////////////// -// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], -// para->getParH(level)->sendProcessNeighborX[i].numberOfFs, -// para->getParH(level)->recvProcessNeighborX[i].f[0], -// para->getParH(level)->recvProcessNeighborX[i].numberOfFs, -// para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborXFsHD(level, i); -// ////////////////////////////////////////////////////////////////////////// -// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->recvProcessNeighborX[i].f[0], -// para->getParD(level)->recvProcessNeighborX[i].index, -// para->getParD(level)->recvProcessNeighborX[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// } +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) +// { +// 
////////////////////////////////////////////////////////////////////////// +// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->sendProcessNeighborX[i].f[0], +// para->getParD(level)->sendProcessNeighborX[i].index, +// para->getParD(level)->sendProcessNeighborX[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborXFsDH(level, i); +// ////////////////////////////////////////////////////////////////////////// +// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], +// para->getParH(level)->sendProcessNeighborX[i].numberOfFs, +// para->getParH(level)->recvProcessNeighborX[i].f[0], +// para->getParH(level)->recvProcessNeighborX[i].numberOfFs, +// para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborXFsHD(level, i); +// ////////////////////////////////////////////////////////////////////////// +// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->recvProcessNeighborX[i].f[0], +// para->getParD(level)->recvProcessNeighborX[i].index, +// para->getParD(level)->recvProcessNeighborX[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// } //} ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //void exchangePostCollDataXGPU27(Parameter* para, vf::gpu::Communicator* comm, int level) //{ -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) -// { -// ////////////////////////////////////////////////////////////////////////// -// GetSendFsPostDev27( para->getParD(level)->d0SP.f[0], -// para->getParD(level)->sendProcessNeighborX[i].f[0], -// para->getParD(level)->sendProcessNeighborX[i].index, -// para->getParD(level)->sendProcessNeighborX[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborXFsDH(level, i); -// ////////////////////////////////////////////////////////////////////////// -// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], -// para->getParH(level)->sendProcessNeighborX[i].numberOfFs, -// para->getParH(level)->recvProcessNeighborX[i].f[0], -// para->getParH(level)->recvProcessNeighborX[i].numberOfFs, -// para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborXFsHD(level, i); -// ////////////////////////////////////////////////////////////////////////// -// SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0], 
-// para->getParD(level)->recvProcessNeighborX[i].f[0], -// para->getParD(level)->recvProcessNeighborX[i].index, -// para->getParD(level)->recvProcessNeighborX[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// } +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) +// { +// ////////////////////////////////////////////////////////////////////////// +// GetSendFsPostDev27( para->getParD(level)->d0SP.f[0], +// para->getParD(level)->sendProcessNeighborX[i].f[0], +// para->getParD(level)->sendProcessNeighborX[i].index, +// para->getParD(level)->sendProcessNeighborX[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborXFsDH(level, i); +// ////////////////////////////////////////////////////////////////////////// +// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], +// para->getParH(level)->sendProcessNeighborX[i].numberOfFs, +// para->getParH(level)->recvProcessNeighborX[i].f[0], +// para->getParH(level)->recvProcessNeighborX[i].numberOfFs, +// para->getParH(level)->sendProcessNeighborX[i].rankNeighbor); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborXFsHD(level, i); +// ////////////////////////////////////////////////////////////////////////// +// SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0], +// para->getParD(level)->recvProcessNeighborX[i].f[0], +// para->getParD(level)->recvProcessNeighborX[i].index, +// para->getParD(level)->recvProcessNeighborX[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// } //} ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // @@ -736,82 +736,82 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator* comm, Cud ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator* comm, int level) //{ -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) -// { -// ////////////////////////////////////////////////////////////////////////// -// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->sendProcessNeighborY[i].f[0], -// para->getParD(level)->sendProcessNeighborY[i].index, -// para->getParD(level)->sendProcessNeighborY[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, 
-// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborYFsDH(level, i); -// ////////////////////////////////////////////////////////////////////////// -// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], -// para->getParH(level)->sendProcessNeighborY[i].numberOfFs, -// para->getParH(level)->recvProcessNeighborY[i].f[0], -// para->getParH(level)->recvProcessNeighborY[i].numberOfFs, -// para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborYFsHD(level, i); -// ////////////////////////////////////////////////////////////////////////// -// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->recvProcessNeighborY[i].f[0], -// para->getParD(level)->recvProcessNeighborY[i].index, -// para->getParD(level)->recvProcessNeighborY[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// } +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) +// { +// ////////////////////////////////////////////////////////////////////////// +// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->sendProcessNeighborY[i].f[0], +// para->getParD(level)->sendProcessNeighborY[i].index, +// para->getParD(level)->sendProcessNeighborY[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborYFsDH(level, i); +// ////////////////////////////////////////////////////////////////////////// +// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], +// para->getParH(level)->sendProcessNeighborY[i].numberOfFs, +// para->getParH(level)->recvProcessNeighborY[i].f[0], +// para->getParH(level)->recvProcessNeighborY[i].numberOfFs, +// para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborYFsHD(level, i); +// ////////////////////////////////////////////////////////////////////////// +// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->recvProcessNeighborY[i].f[0], +// para->getParD(level)->recvProcessNeighborY[i].index, +// para->getParD(level)->recvProcessNeighborY[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// } //} ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //void exchangePostCollDataYGPU27(Parameter* para, 
vf::gpu::Communicator* comm, int level) //{ -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) -// { -// ////////////////////////////////////////////////////////////////////////// -// GetSendFsPostDev27( para->getParD(level)->d0SP.f[0], -// para->getParD(level)->sendProcessNeighborY[i].f[0], -// para->getParD(level)->sendProcessNeighborY[i].index, -// para->getParD(level)->sendProcessNeighborY[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborYFsDH(level, i); -// ////////////////////////////////////////////////////////////////////////// -// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], -// para->getParH(level)->sendProcessNeighborY[i].numberOfFs, -// para->getParH(level)->recvProcessNeighborY[i].f[0], -// para->getParH(level)->recvProcessNeighborY[i].numberOfFs, -// para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborYFsHD(level, i); -// ////////////////////////////////////////////////////////////////////////// -// SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0], -// para->getParD(level)->recvProcessNeighborY[i].f[0], -// para->getParD(level)->recvProcessNeighborY[i].index, -// para->getParD(level)->recvProcessNeighborY[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// } +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) +// { +// ////////////////////////////////////////////////////////////////////////// +// GetSendFsPostDev27( para->getParD(level)->d0SP.f[0], +// para->getParD(level)->sendProcessNeighborY[i].f[0], +// para->getParD(level)->sendProcessNeighborY[i].index, +// para->getParD(level)->sendProcessNeighborY[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborYFsDH(level, i); +// ////////////////////////////////////////////////////////////////////////// +// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], +// para->getParH(level)->sendProcessNeighborY[i].numberOfFs, +// para->getParH(level)->recvProcessNeighborY[i].f[0], +// para->getParH(level)->recvProcessNeighborY[i].numberOfFs, +// para->getParH(level)->sendProcessNeighborY[i].rankNeighbor); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborYFsHD(level, i); +// ////////////////////////////////////////////////////////////////////////// +// SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0], +// para->getParD(level)->recvProcessNeighborY[i].f[0], +// 
para->getParD(level)->recvProcessNeighborY[i].index, +// para->getParD(level)->recvProcessNeighborY[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// } //} ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // @@ -823,82 +823,82 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator* comm, Cud ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //void exchangePreCollDataZGPU27(Parameter* para, vf::gpu::Communicator* comm, int level) //{ -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) -// { -// ////////////////////////////////////////////////////////////////////////// -// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->sendProcessNeighborZ[i].f[0], -// para->getParD(level)->sendProcessNeighborZ[i].index, -// para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborZFsDH(level, i); -// ////////////////////////////////////////////////////////////////////////// -// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], -// para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, -// para->getParH(level)->recvProcessNeighborZ[i].f[0], -// para->getParH(level)->recvProcessNeighborZ[i].numberOfFs, -// para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborZFsHD(level, i); -// ////////////////////////////////////////////////////////////////////////// -// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], -// para->getParD(level)->recvProcessNeighborZ[i].f[0], -// para->getParD(level)->recvProcessNeighborZ[i].index, -// para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// } +// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) +// { +// ////////////////////////////////////////////////////////////////////////// +// GetSendFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->sendProcessNeighborZ[i].f[0], +// para->getParD(level)->sendProcessNeighborZ[i].index, +// para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// 
para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborZFsDH(level, i); +// ////////////////////////////////////////////////////////////////////////// +// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], +// para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, +// para->getParH(level)->recvProcessNeighborZ[i].f[0], +// para->getParH(level)->recvProcessNeighborZ[i].numberOfFs, +// para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborZFsHD(level, i); +// ////////////////////////////////////////////////////////////////////////// +// SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0], +// para->getParD(level)->recvProcessNeighborZ[i].f[0], +// para->getParD(level)->recvProcessNeighborZ[i].index, +// para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// } //} ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //void exchangePostCollDataZGPU27(Parameter* para, vf::gpu::Communicator* comm, int level) //{ -// for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) -// { -// ////////////////////////////////////////////////////////////////////////// -// GetSendFsPostDev27( para->getParD(level)->d0SP.f[0], -// para->getParD(level)->sendProcessNeighborZ[i].f[0], -// para->getParD(level)->sendProcessNeighborZ[i].index, -// para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborZFsDH(level, i); -// ////////////////////////////////////////////////////////////////////////// -// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], -// para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, -// para->getParH(level)->recvProcessNeighborZ[i].f[0], -// para->getParH(level)->recvProcessNeighborZ[i].numberOfFs, -// para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); -// ////////////////////////////////////////////////////////////////////////// -// para->cudaCopyProcessNeighborZFsHD(level, i); -// ////////////////////////////////////////////////////////////////////////// -// SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0], -// para->getParD(level)->recvProcessNeighborZ[i].f[0], -// para->getParD(level)->recvProcessNeighborZ[i].index, -// para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes, -// para->getParD(level)->neighborX_SP, -// para->getParD(level)->neighborY_SP, -// para->getParD(level)->neighborZ_SP, -// para->getParD(level)->size_Mat_SP, -// para->getParD(level)->evenOrOdd, -// para->getParD(level)->numberofthreads); -// ////////////////////////////////////////////////////////////////////////// -// } +// for 
(unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) +// { +// ////////////////////////////////////////////////////////////////////////// +// GetSendFsPostDev27( para->getParD(level)->d0SP.f[0], +// para->getParD(level)->sendProcessNeighborZ[i].f[0], +// para->getParD(level)->sendProcessNeighborZ[i].index, +// para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborZFsDH(level, i); +// ////////////////////////////////////////////////////////////////////////// +// comm->exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], +// para->getParH(level)->sendProcessNeighborZ[i].numberOfFs, +// para->getParH(level)->recvProcessNeighborZ[i].f[0], +// para->getParH(level)->recvProcessNeighborZ[i].numberOfFs, +// para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor); +// ////////////////////////////////////////////////////////////////////////// +// para->cudaCopyProcessNeighborZFsHD(level, i); +// ////////////////////////////////////////////////////////////////////////// +// SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0], +// para->getParD(level)->recvProcessNeighborZ[i].f[0], +// para->getParD(level)->recvProcessNeighborZ[i].index, +// para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes, +// para->getParD(level)->neighborX_SP, +// para->getParD(level)->neighborY_SP, +// para->getParD(level)->neighborZ_SP, +// para->getParD(level)->size_Mat_SP, +// para->getParD(level)->evenOrOdd, +// para->getParD(level)->numberofthreads); +// ////////////////////////////////////////////////////////////////////////// +// } //} ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -958,164 +958,164 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator* comm, Cud //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePreCollDataADXGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - GetSendFsPreDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->sendProcessNeighborADX[i].f[0], - para->getParD(level)->sendProcessNeighborADX[i].index, - para->getParD(level)->sendProcessNeighborADX[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborADXFsDH(level, i); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned 
int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0], - para->getParH(level)->recvProcessNeighborADX[i].numberOfFs, - para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////start non blocking MPI send - //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - //{ - // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0], - // para->getParH(level)->sendProcessNeighborADX[i].numberOfFs, - // para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////Waitall - //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) - //{ - // comm->waitallGPU(); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0], - para->getParH(level)->sendProcessNeighborADX[i].numberOfFs, - para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborADXFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPreDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->recvProcessNeighborADX[i].f[0], - para->getParD(level)->recvProcessNeighborADX[i].index, - para->getParD(level)->recvProcessNeighborADX[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + GetSendFsPreDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->sendProcessNeighborADX[i].f[0], + para->getParD(level)->sendProcessNeighborADX[i].index, + para->getParD(level)->sendProcessNeighborADX[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + 
para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborADXFsDH(level, i); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0], + para->getParH(level)->recvProcessNeighborADX[i].numberOfFs, + para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////start non blocking MPI send + //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + //{ + // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0], + // para->getParH(level)->sendProcessNeighborADX[i].numberOfFs, + // para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////Waitall + //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) + //{ + // comm->waitallGPU(); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0], + para->getParH(level)->sendProcessNeighborADX[i].numberOfFs, + para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborADXFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPreDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->recvProcessNeighborADX[i].f[0], + para->getParD(level)->recvProcessNeighborADX[i].index, + para->getParD(level)->recvProcessNeighborADX[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } 
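// ---------------------------------------------------------------------------------------------------------------------------------------------------
// Editor's sketch (illustrative only, not part of this patch): the exchange functions in this file all follow the same host-staged MPI pattern --
// stage the distributions into a host buffer (cudaCopy...FsDH), post the non-blocking receives first, issue blocking sends, wait on each receive,
// reset the request array, and copy the received buffer back to the device (cudaCopy...FsHD). The minimal standalone example below shows only the
// assumed MPI call order behind comm->nbRecvDataGPU / comm->sendDataGPU / comm->waitGPU; the buffer and rank names are hypothetical.
// ---------------------------------------------------------------------------------------------------------------------------------------------------
#include <mpi.h>
#include <cstddef>
#include <vector>

// One send/receive buffer pair per process neighbor, exchanged with the matching remote rank.
static void exchangeWithNeighbors(const std::vector<int> &neighborRanks,
                                  std::vector<std::vector<double>> &sendBuffers,
                                  std::vector<std::vector<double>> &recvBuffers)
{
    std::vector<MPI_Request> recvRequests(neighborRanks.size());
    // post the non-blocking receives first so the blocking sends below cannot deadlock
    for (std::size_t i = 0; i < neighborRanks.size(); i++)
        MPI_Irecv(recvBuffers[i].data(), static_cast<int>(recvBuffers[i].size()), MPI_DOUBLE,
                  neighborRanks[i], 0, MPI_COMM_WORLD, &recvRequests[i]);
    // blocking sends (corresponds to comm->sendDataGPU in the functions above)
    for (std::size_t i = 0; i < neighborRanks.size(); i++)
        MPI_Send(sendBuffers[i].data(), static_cast<int>(sendBuffers[i].size()), MPI_DOUBLE,
                 neighborRanks[i], 0, MPI_COMM_WORLD);
    // wait until every receive buffer is filled (corresponds to comm->waitGPU(i) followed by comm->resetRequest())
    for (std::size_t i = 0; i < neighborRanks.size(); i++)
        MPI_Wait(&recvRequests[i], MPI_STATUS_IGNORE);
}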
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - GetSendFsPostDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->sendProcessNeighborADX[i].f[0], - para->getParD(level)->sendProcessNeighborADX[i].index, - para->getParD(level)->sendProcessNeighborADX[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborADXFsDH(level, i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0], - para->getParH(level)->recvProcessNeighborADX[i].numberOfFs, - para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////start non blocking MPI send - //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - //{ - // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0], - // para->getParH(level)->sendProcessNeighborADX[i].numberOfFs, - // para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////Waitall - //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) - //{ - // comm->waitallGPU(); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0], - para->getParH(level)->sendProcessNeighborADX[i].numberOfFs, - para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned 
int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborADXFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPostDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->recvProcessNeighborADX[i].f[0], - para->getParD(level)->recvProcessNeighborADX[i].index, - para->getParD(level)->recvProcessNeighborADX[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + GetSendFsPostDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->sendProcessNeighborADX[i].f[0], + para->getParD(level)->sendProcessNeighborADX[i].index, + para->getParD(level)->sendProcessNeighborADX[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborADXFsDH(level, i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0], + para->getParH(level)->recvProcessNeighborADX[i].numberOfFs, + para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////start non blocking MPI send + //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + //{ + // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0], + // para->getParH(level)->sendProcessNeighborADX[i].numberOfFs, + // para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////Waitall + //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) + //{ + // comm->waitallGPU(); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0], + para->getParH(level)->sendProcessNeighborADX[i].numberOfFs, + para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned 
int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborADXFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPostDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->recvProcessNeighborADX[i].f[0], + para->getParD(level)->recvProcessNeighborADX[i].index, + para->getParD(level)->recvProcessNeighborADX[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1127,164 +1127,164 @@ void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator* comm, //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePreCollDataADYGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - GetSendFsPreDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->sendProcessNeighborADY[i].f[0], - para->getParD(level)->sendProcessNeighborADY[i].index, - para->getParD(level)->sendProcessNeighborADY[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborADYFsDH(level, i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0], - para->getParH(level)->recvProcessNeighborADY[i].numberOfFs, - para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////start non blocking MPI send - //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - //{ - // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0], - // 
para->getParH(level)->sendProcessNeighborADY[i].numberOfFs, - // para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////Waitall - //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) - //{ - // comm->waitallGPU(); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0], - para->getParH(level)->sendProcessNeighborADY[i].numberOfFs, - para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborADYFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPreDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->recvProcessNeighborADY[i].f[0], - para->getParD(level)->recvProcessNeighborADY[i].index, - para->getParD(level)->recvProcessNeighborADY[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + GetSendFsPreDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->sendProcessNeighborADY[i].f[0], + para->getParD(level)->sendProcessNeighborADY[i].index, + para->getParD(level)->sendProcessNeighborADY[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborADYFsDH(level, i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + 
comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0], + para->getParH(level)->recvProcessNeighborADY[i].numberOfFs, + para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////start non blocking MPI send + //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + //{ + // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0], + // para->getParH(level)->sendProcessNeighborADY[i].numberOfFs, + // para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////Waitall + //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) + //{ + // comm->waitallGPU(); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0], + para->getParH(level)->sendProcessNeighborADY[i].numberOfFs, + para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborADYFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPreDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->recvProcessNeighborADY[i].f[0], + para->getParD(level)->recvProcessNeighborADY[i].index, + para->getParD(level)->recvProcessNeighborADY[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - GetSendFsPostDev27(para->getParD(level)->d27.f[0], - 
para->getParD(level)->sendProcessNeighborADY[i].f[0], - para->getParD(level)->sendProcessNeighborADY[i].index, - para->getParD(level)->sendProcessNeighborADY[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborADYFsDH(level, i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0], - para->getParH(level)->recvProcessNeighborADY[i].numberOfFs, - para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////start non blocking MPI send - //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - //{ - // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0], - // para->getParH(level)->sendProcessNeighborADY[i].numberOfFs, - // para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////Waitall - //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) - //{ - // comm->waitallGPU(); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0], - para->getParH(level)->sendProcessNeighborADY[i].numberOfFs, - para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborADYFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPostDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->recvProcessNeighborADY[i].f[0], - para->getParD(level)->recvProcessNeighborADY[i].index, - para->getParD(level)->recvProcessNeighborADY[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - 
para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + GetSendFsPostDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->sendProcessNeighborADY[i].f[0], + para->getParD(level)->sendProcessNeighborADY[i].index, + para->getParD(level)->sendProcessNeighborADY[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborADYFsDH(level, i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0], + para->getParH(level)->recvProcessNeighborADY[i].numberOfFs, + para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////start non blocking MPI send + //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + //{ + // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0], + // para->getParH(level)->sendProcessNeighborADY[i].numberOfFs, + // para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////Waitall + //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) + //{ + // comm->waitallGPU(); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0], + para->getParH(level)->sendProcessNeighborADY[i].numberOfFs, + para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + 
cudaManager->cudaCopyProcessNeighborADYFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPostDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->recvProcessNeighborADY[i].f[0], + para->getParD(level)->recvProcessNeighborADY[i].index, + para->getParD(level)->recvProcessNeighborADY[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1296,164 +1296,164 @@ void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator* comm, //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePreCollDataADZGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - GetSendFsPreDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->sendProcessNeighborADZ[i].f[0], - para->getParD(level)->sendProcessNeighborADZ[i].index, - para->getParD(level)->sendProcessNeighborADZ[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborADZFsDH(level, i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0], - para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs, - para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////start non blocking MPI send - //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - //{ - // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0], - // para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs, - // para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////Waitall - //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) - //{ - // comm->waitallGPU(); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned 
int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0], - para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs, - para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborADZFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPreDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->recvProcessNeighborADZ[i].f[0], - para->getParD(level)->recvProcessNeighborADZ[i].index, - para->getParD(level)->recvProcessNeighborADZ[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + GetSendFsPreDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->sendProcessNeighborADZ[i].f[0], + para->getParD(level)->sendProcessNeighborADZ[i].index, + para->getParD(level)->sendProcessNeighborADZ[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborADZFsDH(level, i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0], + para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs, + para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////start non blocking MPI send + //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + //{ + // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0], + // para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs, + // 
para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////Waitall + //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) + //{ + // comm->waitallGPU(); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0], + para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs, + para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborADZFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPreDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->recvProcessNeighborADZ[i].f[0], + para->getParD(level)->recvProcessNeighborADZ[i].index, + para->getParD(level)->recvProcessNeighborADZ[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - GetSendFsPostDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->sendProcessNeighborADZ[i].f[0], - para->getParD(level)->sendProcessNeighborADZ[i].index, - para->getParD(level)->sendProcessNeighborADZ[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborADZFsDH(level, i); - } - 
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0], - para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs, - para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////start non blocking MPI send - //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - //{ - // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0], - // para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs, - // para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ////Waitall - //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) - //{ - // comm->waitallGPU(); - //} - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0], - para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs, - para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborADZFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - SetRecvFsPostDev27(para->getParD(level)->d27.f[0], - para->getParD(level)->recvProcessNeighborADZ[i].f[0], - para->getParD(level)->recvProcessNeighborADZ[i].index, - para->getParD(level)->recvProcessNeighborADZ[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + GetSendFsPostDev27(para->getParD(level)->d27.f[0], + 
para->getParD(level)->sendProcessNeighborADZ[i].f[0], + para->getParD(level)->sendProcessNeighborADZ[i].index, + para->getParD(level)->sendProcessNeighborADZ[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborADZFsDH(level, i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0], + para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs, + para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////start non blocking MPI send + //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + //{ + // comm->nbSendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0], + // para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs, + // para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ////Waitall + //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) + //{ + // comm->waitallGPU(); + //} + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0], + para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs, + para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborADZFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + SetRecvFsPostDev27(para->getParD(level)->d27.f[0], + para->getParD(level)->recvProcessNeighborADZ[i].f[0], + para->getParD(level)->recvProcessNeighborADZ[i].index, + para->getParD(level)->recvProcessNeighborADZ[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + 
para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1512,73 +1512,73 @@ void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator* comm, //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangeCollDataF3XGPU(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - getSendGsDevF3( - para->getParD(level)->g6.g[0], - para->getParD(level)->sendProcessNeighborF3X[i].g[0], - para->getParD(level)->sendProcessNeighborF3X[i].index, - para->getParD(level)->sendProcessNeighborF3X[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborF3XFsDH(level, i); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->nbRecvDataGPU( - para->getParH(level)->recvProcessNeighborF3X[i].g[0], - para->getParH(level)->recvProcessNeighborF3X[i].numberOfGs, - para->getParH(level)->recvProcessNeighborF3X[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->sendDataGPU( - para->getParH(level)->sendProcessNeighborF3X[i].g[0], - para->getParH(level)->sendProcessNeighborF3X[i].numberOfGs, - para->getParH(level)->sendProcessNeighborF3X[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborF3XFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - setRecvGsDevF3( - para->getParD(level)->g6.g[0], - 
para->getParD(level)->recvProcessNeighborF3X[i].g[0], - para->getParD(level)->recvProcessNeighborF3X[i].index, - para->getParD(level)->recvProcessNeighborF3X[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + getSendGsDevF3( + para->getParD(level)->g6.g[0], + para->getParD(level)->sendProcessNeighborF3X[i].g[0], + para->getParD(level)->sendProcessNeighborF3X[i].index, + para->getParD(level)->sendProcessNeighborF3X[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborF3XFsDH(level, i); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->nbRecvDataGPU( + para->getParH(level)->recvProcessNeighborF3X[i].g[0], + para->getParH(level)->recvProcessNeighborF3X[i].numberOfGs, + para->getParH(level)->recvProcessNeighborF3X[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->sendDataGPU( + para->getParH(level)->sendProcessNeighborF3X[i].g[0], + para->getParH(level)->sendProcessNeighborF3X[i].numberOfGs, + para->getParH(level)->sendProcessNeighborF3X[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborF3XFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + setRecvGsDevF3( + para->getParD(level)->g6.g[0], + para->getParD(level)->recvProcessNeighborF3X[i].g[0], + para->getParD(level)->recvProcessNeighborF3X[i].index, + para->getParD(level)->recvProcessNeighborF3X[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + 
para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1590,73 +1590,73 @@ void exchangeCollDataF3XGPU(Parameter* para, vf::gpu::Communicator* comm, CudaMe //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangeCollDataF3YGPU(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - getSendGsDevF3( - para->getParD(level)->g6.g[0], - para->getParD(level)->sendProcessNeighborF3Y[i].g[0], - para->getParD(level)->sendProcessNeighborF3Y[i].index, - para->getParD(level)->sendProcessNeighborF3Y[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborF3YFsDH(level, i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->nbRecvDataGPU( - para->getParH(level)->recvProcessNeighborF3Y[i].g[0], - para->getParH(level)->recvProcessNeighborF3Y[i].numberOfGs, - para->getParH(level)->recvProcessNeighborF3Y[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->sendDataGPU( - para->getParH(level)->sendProcessNeighborF3Y[i].g[0], - para->getParH(level)->sendProcessNeighborF3Y[i].numberOfGs, - para->getParH(level)->sendProcessNeighborF3Y[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - cudaManager->cudaCopyProcessNeighborF3YFsHD(level, i); - 
////////////////////////////////////////////////////////////////////////// - setRecvGsDevF3( - para->getParD(level)->g6.g[0], - para->getParD(level)->recvProcessNeighborF3Y[i].g[0], - para->getParD(level)->recvProcessNeighborF3Y[i].index, - para->getParD(level)->recvProcessNeighborF3Y[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + getSendGsDevF3( + para->getParD(level)->g6.g[0], + para->getParD(level)->sendProcessNeighborF3Y[i].g[0], + para->getParD(level)->sendProcessNeighborF3Y[i].index, + para->getParD(level)->sendProcessNeighborF3Y[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborF3YFsDH(level, i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + comm->nbRecvDataGPU( + para->getParH(level)->recvProcessNeighborF3Y[i].g[0], + para->getParH(level)->recvProcessNeighborF3Y[i].numberOfGs, + para->getParH(level)->recvProcessNeighborF3Y[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + comm->sendDataGPU( + para->getParH(level)->sendProcessNeighborF3Y[i].g[0], + para->getParH(level)->sendProcessNeighborF3Y[i].numberOfGs, + para->getParH(level)->sendProcessNeighborF3Y[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborF3YFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + setRecvGsDevF3( + para->getParD(level)->g6.g[0], + para->getParD(level)->recvProcessNeighborF3Y[i].g[0], + para->getParD(level)->recvProcessNeighborF3Y[i].index, + 
para->getParD(level)->recvProcessNeighborF3Y[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1668,73 +1668,73 @@ void exchangeCollDataF3YGPU(Parameter* para, vf::gpu::Communicator* comm, CudaMe //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangeCollDataF3ZGPU(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - getSendGsDevF3( - para->getParD(level)->g6.g[0], - para->getParD(level)->sendProcessNeighborF3Z[i].g[0], - para->getParD(level)->sendProcessNeighborF3Z[i].index, - para->getParD(level)->sendProcessNeighborF3Z[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborF3ZFsDH(level, i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start non blocking MPI receive - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->nbRecvDataGPU( - para->getParH(level)->recvProcessNeighborF3Z[i].g[0], - para->getParH(level)->recvProcessNeighborF3Z[i].numberOfGs, - para->getParH(level)->recvProcessNeighborF3Z[i].rankNeighbor); - } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //start blocking MPI send - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->sendDataGPU( - para->getParH(level)->sendProcessNeighborF3Z[i].g[0], - para->getParH(level)->sendProcessNeighborF3Z[i].numberOfGs, - para->getParH(level)->sendProcessNeighborF3Z[i].rankNeighbor); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //Wait - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - comm->waitGPU(i); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //reset the request array - if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) - { - comm->resetRequest(); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Host to Device - for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { - 
cudaManager->cudaCopyProcessNeighborF3ZFsHD(level, i); - ////////////////////////////////////////////////////////////////////////// - setRecvGsDevF3( - para->getParD(level)->g6.g[0], - para->getParD(level)->recvProcessNeighborF3Z[i].g[0], - para->getParD(level)->recvProcessNeighborF3Z[i].index, - para->getParD(level)->recvProcessNeighborF3Z[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + getSendGsDevF3( + para->getParD(level)->g6.g[0], + para->getParD(level)->sendProcessNeighborF3Z[i].g[0], + para->getParD(level)->sendProcessNeighborF3Z[i].index, + para->getParD(level)->sendProcessNeighborF3Z[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + ////////////////////////////////////////////////////////////////////////// + cudaManager->cudaCopyProcessNeighborF3ZFsDH(level, i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start non blocking MPI receive + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->nbRecvDataGPU( + para->getParH(level)->recvProcessNeighborF3Z[i].g[0], + para->getParH(level)->recvProcessNeighborF3Z[i].numberOfGs, + para->getParH(level)->recvProcessNeighborF3Z[i].rankNeighbor); + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //start blocking MPI send + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->sendDataGPU( + para->getParH(level)->sendProcessNeighborF3Z[i].g[0], + para->getParH(level)->sendProcessNeighborF3Z[i].numberOfGs, + para->getParH(level)->sendProcessNeighborF3Z[i].rankNeighbor); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //Wait + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + comm->waitGPU(i); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //reset the request array + if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) + { + comm->resetRequest(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Host to Device + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + { + cudaManager->cudaCopyProcessNeighborF3ZFsHD(level, i); + ////////////////////////////////////////////////////////////////////////// + setRecvGsDevF3( + para->getParD(level)->g6.g[0], + para->getParD(level)->recvProcessNeighborF3Z[i].g[0], + 
para->getParD(level)->recvProcessNeighborF3Z[i].index, + para->getParD(level)->recvProcessNeighborF3Z[i].numberOfNodes, + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
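////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The exchange functions in the hunks above all follow the same five-step pattern: pack the halo data on the device and
// copy it to the host, post the non-blocking MPI receives, issue the blocking MPI sends, wait on the receives and reset
// the request array, then copy the received buffers back to the device and scatter them into the distribution arrays.
// Below is a minimal sketch of that pattern only; NeighborBuffer, ExchangeHooks and exchangeNeighborData are
// hypothetical stand-ins for the processNeighbor structs, the Get/Set pack-unpack kernels and the
// CudaMemoryManager/Communicator calls used in this file, not part of its API.
#include <cstddef>
#include <functional>
#include <vector>

struct NeighborBuffer {            // stand-in for one send/recvProcessNeighbor entry
    double*     hostData;          // host-side buffer handed to MPI
    std::size_t numberOfValues;    // e.g. numberOfFs / numberOfGs
    int         rankNeighbor;      // MPI rank of the neighboring process
};

struct ExchangeHooks {             // stand-ins for the device kernels, copy wrappers and MPI wrappers
    std::function<void(std::size_t)>           packAndCopyToHost;      // gather kernel + copy D2H for send buffer i
    std::function<void(const NeighborBuffer&)> postNonBlockingRecv;    // e.g. comm->nbRecvDataGPU(...)
    std::function<void(const NeighborBuffer&)> blockingSend;           // e.g. comm->sendDataGPU(...)
    std::function<void(std::size_t)>           waitForRecv;            // e.g. comm->waitGPU(i)
    std::function<void()>                      resetRequests;          // e.g. comm->resetRequest()
    std::function<void(std::size_t)>           copyToDeviceAndScatter; // copy H2D + scatter kernel for recv buffer i
};

inline void exchangeNeighborData(const std::vector<NeighborBuffer>& sendBuffers,
                                 const std::vector<NeighborBuffer>& recvBuffers,
                                 const ExchangeHooks& hooks)
{
    // 1) gather the halo data on the device and copy it to the host send buffers
    for (std::size_t i = 0; i < sendBuffers.size(); i++)
        hooks.packAndCopyToHost(i);
    // 2) post every receive before any send is issued
    for (const NeighborBuffer& recv : recvBuffers)
        hooks.postNonBlockingRecv(recv);
    // 3) blocking sends; since every rank runs this same sequence, the matching
    //    receives are already posted on the neighbors and the exchange cannot deadlock
    for (const NeighborBuffer& send : sendBuffers)
        hooks.blockingSend(send);
    // 4) wait for all receives to complete, then clear the request array for the next exchange
    for (std::size_t i = 0; i < recvBuffers.size(); i++)
        hooks.waitForRecv(i);
    if (!recvBuffers.empty())
        hooks.resetRequests();
    // 5) copy the received values back to the device and scatter them into the grid
    for (std::size_t i = 0; i < recvBuffers.size(); i++)
        hooks.copyToDeviceAndScatter(i);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////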