diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index dfa52639b64e6..df168da110301 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -72,13 +72,11 @@ class SiPixelRawToClusterCUDA : public edm::stream::EDProducer regions_; pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; - std::unique_ptr wordFedAppender_; PixelDataFormatter::Errors errors_; const bool isRun2_; const bool includeErrors_; const bool useQuality_; - const uint32_t maxFedWords_; uint32_t nDigis_; const SiPixelClusterThresholds clusterThresholds_; }; @@ -94,7 +92,6 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi isRun2_(iConfig.getParameter("isRun2")), includeErrors_(iConfig.getParameter("IncludeErrors")), useQuality_(iConfig.getParameter("UseQualityInfo")), - maxFedWords_(iConfig.getParameter("MaxFEDWords")), clusterThresholds_{iConfig.getParameter("clusterThreshold_layer1"), iConfig.getParameter("clusterThreshold_otherLayers")} { if (includeErrors_) { @@ -105,11 +102,6 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi if (!iConfig.getParameter("Regions").getParameterNames().empty()) { regions_ = std::make_unique(iConfig, consumesCollector()); } - - edm::Service cs; - if (cs->enabled()) { - wordFedAppender_ = std::make_unique(maxFedWords_); - } } void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -117,7 +109,10 @@ void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& d desc.add("isRun2", true); desc.add("IncludeErrors", true); desc.add("UseQualityInfo", false); - desc.add("MaxFEDWords", pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD); + // Note: this parameter is obsolete: it is ignored and will have no effect. + // It is kept to avoid breaking older configurations, and will not be printed in the generated cfi.py file. + desc.addOptionalNode(edm::ParameterDescription("MaxFEDWords", 0, true), false) + ->setComment("This parameter is obsolete and will be ignored."); desc.add("clusterThreshold_layer1", kSiPixelClusterThresholdsDefaultPhase1.layer1); desc.add("clusterThreshold_otherLayers", kSiPixelClusterThresholdsDefaultPhase1.otherLayers); desc.add("InputLabel", edm::InputTag("rawDataCollector")); @@ -181,13 +176,18 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, errors_.clear(); // GPU specific: Data extraction for RawToDigi GPU - unsigned int wordCounterGPU = 0; + unsigned int wordCounter = 0; unsigned int fedCounter = 0; bool errorsInEvent = false; + std::vector index(fedIds_.size(), 0); + std::vector start(fedIds_.size(), nullptr); + std::vector words(fedIds_.size(), 0); + // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() ErrorChecker errorcheck; - for (int fedId : fedIds_) { + for (uint32_t i = 0; i < fedIds_.size(); ++i) { + const int fedId = fedIds_[i]; if (regions_ && !regions_->mayUnpackFED(fedId)) continue; @@ -235,26 +235,33 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const cms_uint32_t* ew = (const cms_uint32_t*)(trailer); assert(0 == (ew - bw) % 2); - wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew - bw)); - wordCounterGPU += (ew - bw); + index[i] = wordCounter; + start[i] = bw; + words[i] = (ew - bw); + wordCounter += (ew - bw); } // end of for loop - nDigis_ = wordCounterGPU; + nDigis_ = wordCounter; if (nDigis_ == 0) return; + // copy the FED data to a single cpu buffer + pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender wordFedAppender(nDigis_, ctx.stream()); + for (uint32_t i = 0; i < fedIds_.size(); ++i) { + wordFedAppender.initializeWordFed(fedIds_[i], index[i], start[i], words[i]); + } + gpuAlgo_.makeClustersAsync(isRun2_, clusterThresholds_, gpuMap, gpuModulesToUnpack, gpuGains, - *wordFedAppender_, + wordFedAppender, std::move(errors_), - wordCounterGPU, + wordCounter, fedCounter, - maxFedWords_, useQuality_, includeErrors_, edm::MessageDrop::instance()->debugEnabled, @@ -266,9 +273,8 @@ void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& if (nDigis_ == 0) { // default construct collections and place them in event - auto tmp = std::make_pair(SiPixelDigisCUDA{}, SiPixelClustersCUDA{}); - ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); - ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); + ctx.emplace(iEvent, digiPutToken_, SiPixelDigisCUDA{}); + ctx.emplace(iEvent, clusterPutToken_, SiPixelClustersCUDA{}); if (includeErrors_) { ctx.emplace(iEvent, digiErrorPutToken_, SiPixelDigiErrorsCUDA{}); } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index e66e322ddff4e..4b6f58ceb38c5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -34,21 +34,6 @@ namespace pixelgpudetails { - SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(uint32_t maxFedWords) { - word_ = cms::cuda::make_host_noncached_unique(maxFedWords, cudaHostAllocWriteCombined); - fedId_ = cms::cuda::make_host_noncached_unique(maxFedWords, cudaHostAllocWriteCombined); - } - - void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, - unsigned int wordCounterGPU, - const cms_uint32_t *src, - unsigned int length) { - std::memcpy(word_.get() + wordCounterGPU, src, sizeof(cms_uint32_t) * length); - std::memset(fedId_.get() + wordCounterGPU / 2, fedId - FEDNumbering::MINSiPixeluTCAFEDID, length / 2); - } - - //////////////////// - __device__ bool isBarrel(uint32_t rawId) { return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask)); } @@ -543,7 +528,6 @@ namespace pixelgpudetails { SiPixelFormatterErrors &&errors, const uint32_t wordCounter, const uint32_t fedCounter, - const uint32_t maxFedWords, bool useQualityInfo, bool includeErrors, bool debug, @@ -553,7 +537,7 @@ namespace pixelgpudetails { nDigis = wordCounter; #ifdef GPU_DEBUG - std::cout << "decoding " << wordCounter << " digis. Max is " << maxFedWords << std::endl; + std::cout << "decoding " << wordCounter << " digis." << std::endl; #endif // since wordCounter != 0 we're not allocating 0 bytes, diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 8678fe2e763f9..ace787514486e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -75,18 +75,21 @@ namespace pixelgpudetails { public: class WordFedAppender { public: - WordFedAppender(); - WordFedAppender(uint32_t maxFedWords); - ~WordFedAppender() = default; + WordFedAppender(uint32_t words, cudaStream_t stream) + : word_{cms::cuda::make_host_unique(words, stream)}, + fedId_{cms::cuda::make_host_unique(words, stream)} {} - void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t* src, unsigned int length); + void initializeWordFed(int fedId, unsigned int index, cms_uint32_t const* src, unsigned int length) { + std::memcpy(word_.get() + index, src, sizeof(cms_uint32_t) * length); + std::memset(fedId_.get() + index / 2, fedId - FEDNumbering::MINSiPixeluTCAFEDID, length / 2); + } const unsigned int* word() const { return word_.get(); } const unsigned char* fedId() const { return fedId_.get(); } private: - cms::cuda::host::noncached::unique_ptr word_; - cms::cuda::host::noncached::unique_ptr fedId_; + cms::cuda::host::unique_ptr word_; + cms::cuda::host::unique_ptr fedId_; }; SiPixelRawToClusterGPUKernel() = default; @@ -106,7 +109,6 @@ namespace pixelgpudetails { SiPixelFormatterErrors&& errors, const uint32_t wordCounter, const uint32_t fedCounter, - const uint32_t maxFedWords, bool useQualityInfo, bool includeErrors, bool debug, diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h index b89e3604438e7..3bfba582936ea 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -30,7 +30,8 @@ namespace gpuClustering { __shared__ uint16_t newclusId[maxNumClustersPerModules]; constexpr int startBPIX2 = isPhase2 ? phase2PixelTopology::layerStart[1] : phase1PixelTopology::layerStart[1]; - constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + [[maybe_unused]] constexpr int nMaxModules = + isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; assert(nMaxModules < maxNumModules); assert(startBPIX2 < nMaxModules); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index b4fb89c3a709e..d502f8b15600c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -21,7 +21,8 @@ namespace gpuClustering { int32_t* __restrict__ clusterId, int numElements) { int first = blockDim.x * blockIdx.x + threadIdx.x; - constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + [[maybe_unused]] constexpr int nMaxModules = + isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; assert(nMaxModules < maxNumModules); for (int i = first; i < numElements; i += gridDim.x * blockDim.x) { clusterId[i] = i; @@ -52,7 +53,8 @@ namespace gpuClustering { auto firstModule = blockIdx.x; auto endModule = moduleStart[0]; - constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + [[maybe_unused]] constexpr int nMaxModules = + isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; assert(nMaxModules < maxNumModules); for (auto module = firstModule; module < endModule; module += gridDim.x) {