Skip to content

Commit

Permalink
Merge pull request #38868 from fwyzard/resize_Pixel_FED_buffer
Browse files Browse the repository at this point in the history
Automatically size the Pixel FED buffer used by SiPixelRawToClusterCUDA
  • Loading branch information
cmsbuild authored Aug 2, 2022
2 parents c15bd80 + c8fae54 commit cd4f41a
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,11 @@ class SiPixelRawToClusterCUDA : public edm::stream::EDProducer<edm::ExternalWork
std::unique_ptr<PixelUnpackingRegions> regions_;

pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
PixelDataFormatter::Errors errors_;

const bool isRun2_;
const bool includeErrors_;
const bool useQuality_;
const uint32_t maxFedWords_;
uint32_t nDigis_;
const SiPixelClusterThresholds clusterThresholds_;
};
Expand All @@ -94,7 +92,6 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
isRun2_(iConfig.getParameter<bool>("isRun2")),
includeErrors_(iConfig.getParameter<bool>("IncludeErrors")),
useQuality_(iConfig.getParameter<bool>("UseQualityInfo")),
maxFedWords_(iConfig.getParameter<uint32_t>("MaxFEDWords")),
clusterThresholds_{iConfig.getParameter<int32_t>("clusterThreshold_layer1"),
iConfig.getParameter<int32_t>("clusterThreshold_otherLayers")} {
if (includeErrors_) {
Expand All @@ -105,19 +102,17 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
if (!iConfig.getParameter<edm::ParameterSet>("Regions").getParameterNames().empty()) {
regions_ = std::make_unique<PixelUnpackingRegions>(iConfig, consumesCollector());
}

edm::Service<CUDAService> cs;
if (cs->enabled()) {
wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>(maxFedWords_);
}
}

void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
edm::ParameterSetDescription desc;
desc.add<bool>("isRun2", true);
desc.add<bool>("IncludeErrors", true);
desc.add<bool>("UseQualityInfo", false);
desc.add<uint32_t>("MaxFEDWords", pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD);
// Note: this parameter is obsolete: it is ignored and will have no effect.
// It is kept to avoid breaking older configurations, and will not be printed in the generated cfi.py file.
desc.addOptionalNode(edm::ParameterDescription<uint32_t>("MaxFEDWords", 0, true), false)
->setComment("This parameter is obsolete and will be ignored.");
desc.add<int32_t>("clusterThreshold_layer1", kSiPixelClusterThresholdsDefaultPhase1.layer1);
desc.add<int32_t>("clusterThreshold_otherLayers", kSiPixelClusterThresholdsDefaultPhase1.otherLayers);
desc.add<edm::InputTag>("InputLabel", edm::InputTag("rawDataCollector"));
Expand Down Expand Up @@ -181,13 +176,18 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
errors_.clear();

// GPU specific: Data extraction for RawToDigi GPU
unsigned int wordCounterGPU = 0;
unsigned int wordCounter = 0;
unsigned int fedCounter = 0;
bool errorsInEvent = false;

std::vector<unsigned int> index(fedIds_.size(), 0);
std::vector<cms_uint32_t const*> start(fedIds_.size(), nullptr);
std::vector<ptrdiff_t> words(fedIds_.size(), 0);

// In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData()
ErrorChecker errorcheck;
for (int fedId : fedIds_) {
for (uint32_t i = 0; i < fedIds_.size(); ++i) {
const int fedId = fedIds_[i];
if (regions_ && !regions_->mayUnpackFED(fedId))
continue;

Expand Down Expand Up @@ -235,26 +235,33 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
const cms_uint32_t* ew = (const cms_uint32_t*)(trailer);

assert(0 == (ew - bw) % 2);
wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew - bw));
wordCounterGPU += (ew - bw);
index[i] = wordCounter;
start[i] = bw;
words[i] = (ew - bw);
wordCounter += (ew - bw);

} // end of for loop

nDigis_ = wordCounterGPU;
nDigis_ = wordCounter;

if (nDigis_ == 0)
return;

// copy the FED data to a single cpu buffer
pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender wordFedAppender(nDigis_, ctx.stream());
for (uint32_t i = 0; i < fedIds_.size(); ++i) {
wordFedAppender.initializeWordFed(fedIds_[i], index[i], start[i], words[i]);
}

gpuAlgo_.makeClustersAsync(isRun2_,
clusterThresholds_,
gpuMap,
gpuModulesToUnpack,
gpuGains,
*wordFedAppender_,
wordFedAppender,
std::move(errors_),
wordCounterGPU,
wordCounter,
fedCounter,
maxFedWords_,
useQuality_,
includeErrors_,
edm::MessageDrop::instance()->debugEnabled,
Expand All @@ -266,9 +273,8 @@ void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup&

if (nDigis_ == 0) {
// default construct collections and place them in event
auto tmp = std::make_pair(SiPixelDigisCUDA{}, SiPixelClustersCUDA{});
ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first));
ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second));
ctx.emplace(iEvent, digiPutToken_, SiPixelDigisCUDA{});
ctx.emplace(iEvent, clusterPutToken_, SiPixelClustersCUDA{});
if (includeErrors_) {
ctx.emplace(iEvent, digiErrorPutToken_, SiPixelDigiErrorsCUDA{});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,6 @@

namespace pixelgpudetails {

SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(uint32_t maxFedWords) {
word_ = cms::cuda::make_host_noncached_unique<unsigned int[]>(maxFedWords, cudaHostAllocWriteCombined);
fedId_ = cms::cuda::make_host_noncached_unique<unsigned char[]>(maxFedWords, cudaHostAllocWriteCombined);
}

void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId,
unsigned int wordCounterGPU,
const cms_uint32_t *src,
unsigned int length) {
std::memcpy(word_.get() + wordCounterGPU, src, sizeof(cms_uint32_t) * length);
std::memset(fedId_.get() + wordCounterGPU / 2, fedId - FEDNumbering::MINSiPixeluTCAFEDID, length / 2);
}

////////////////////

__device__ bool isBarrel(uint32_t rawId) {
return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask));
}
Expand Down Expand Up @@ -543,7 +528,6 @@ namespace pixelgpudetails {
SiPixelFormatterErrors &&errors,
const uint32_t wordCounter,
const uint32_t fedCounter,
const uint32_t maxFedWords,
bool useQualityInfo,
bool includeErrors,
bool debug,
Expand All @@ -553,7 +537,7 @@ namespace pixelgpudetails {
nDigis = wordCounter;

#ifdef GPU_DEBUG
std::cout << "decoding " << wordCounter << " digis. Max is " << maxFedWords << std::endl;
std::cout << "decoding " << wordCounter << " digis." << std::endl;
#endif

// since wordCounter != 0 we're not allocating 0 bytes,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,18 +75,21 @@ namespace pixelgpudetails {
public:
class WordFedAppender {
public:
WordFedAppender();
WordFedAppender(uint32_t maxFedWords);
~WordFedAppender() = default;
WordFedAppender(uint32_t words, cudaStream_t stream)
: word_{cms::cuda::make_host_unique<unsigned int[]>(words, stream)},
fedId_{cms::cuda::make_host_unique<unsigned char[]>(words, stream)} {}

void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t* src, unsigned int length);
void initializeWordFed(int fedId, unsigned int index, cms_uint32_t const* src, unsigned int length) {
std::memcpy(word_.get() + index, src, sizeof(cms_uint32_t) * length);
std::memset(fedId_.get() + index / 2, fedId - FEDNumbering::MINSiPixeluTCAFEDID, length / 2);
}

const unsigned int* word() const { return word_.get(); }
const unsigned char* fedId() const { return fedId_.get(); }

private:
cms::cuda::host::noncached::unique_ptr<unsigned int[]> word_;
cms::cuda::host::noncached::unique_ptr<unsigned char[]> fedId_;
cms::cuda::host::unique_ptr<unsigned int[]> word_;
cms::cuda::host::unique_ptr<unsigned char[]> fedId_;
};

SiPixelRawToClusterGPUKernel() = default;
Expand All @@ -106,7 +109,6 @@ namespace pixelgpudetails {
SiPixelFormatterErrors&& errors,
const uint32_t wordCounter,
const uint32_t fedCounter,
const uint32_t maxFedWords,
bool useQualityInfo,
bool includeErrors,
bool debug,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ namespace gpuClustering {
__shared__ uint16_t newclusId[maxNumClustersPerModules];

constexpr int startBPIX2 = isPhase2 ? phase2PixelTopology::layerStart[1] : phase1PixelTopology::layerStart[1];
constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
[[maybe_unused]] constexpr int nMaxModules =
isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;

assert(nMaxModules < maxNumModules);
assert(startBPIX2 < nMaxModules);
Expand Down
6 changes: 4 additions & 2 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ namespace gpuClustering {
int32_t* __restrict__ clusterId,
int numElements) {
int first = blockDim.x * blockIdx.x + threadIdx.x;
constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
[[maybe_unused]] constexpr int nMaxModules =
isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
assert(nMaxModules < maxNumModules);
for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
clusterId[i] = i;
Expand Down Expand Up @@ -52,7 +53,8 @@ namespace gpuClustering {
auto firstModule = blockIdx.x;
auto endModule = moduleStart[0];

constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
[[maybe_unused]] constexpr int nMaxModules =
isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
assert(nMaxModules < maxNumModules);

for (auto module = firstModule; module < endModule; module += gridDim.x) {
Expand Down

0 comments on commit cd4f41a

Please sign in to comment.