diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
index dfa52639b64e6..df168da110301 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
@@ -72,13 +72,11 @@ class SiPixelRawToClusterCUDA : public edm::stream::EDProducer<edm::ExternalWork
   std::unique_ptr<PixelUnpackingRegions> regions_;
 
   pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
-  std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
   PixelDataFormatter::Errors errors_;
 
   const bool isRun2_;
   const bool includeErrors_;
   const bool useQuality_;
-  const uint32_t maxFedWords_;
   uint32_t nDigis_;
   const SiPixelClusterThresholds clusterThresholds_;
 };
@@ -94,7 +92,6 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
       isRun2_(iConfig.getParameter<bool>("isRun2")),
       includeErrors_(iConfig.getParameter<bool>("IncludeErrors")),
       useQuality_(iConfig.getParameter<bool>("UseQualityInfo")),
-      maxFedWords_(iConfig.getParameter<uint32_t>("MaxFEDWords")),
       clusterThresholds_{iConfig.getParameter<int32_t>("clusterThreshold_layer1"),
                          iConfig.getParameter<int32_t>("clusterThreshold_otherLayers")} {
   if (includeErrors_) {
@@ -105,11 +102,6 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
   if (!iConfig.getParameter<edm::ParameterSet>("Regions").getParameterNames().empty()) {
     regions_ = std::make_unique<PixelUnpackingRegions>(iConfig, consumesCollector());
   }
-
-  edm::Service<CUDAService> cs;
-  if (cs->enabled()) {
-    wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>(maxFedWords_);
-  }
 }
 
 void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -117,7 +109,10 @@ void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& d
   desc.add<bool>("isRun2", true);
   desc.add<bool>("IncludeErrors", true);
   desc.add<bool>("UseQualityInfo", false);
-  desc.add<uint32_t>("MaxFEDWords", pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD);
+  // Note: this parameter is obsolete: it is ignored and will have no effect.
+  // It is kept to avoid breaking older configurations, and will not be printed in the generated cfi.py file.
+  desc.addOptionalNode(edm::ParameterDescription<uint32_t>("MaxFEDWords", 0, true), false)
+      ->setComment("This parameter is obsolete and will be ignored.");
   desc.add<int32_t>("clusterThreshold_layer1", kSiPixelClusterThresholdsDefaultPhase1.layer1);
   desc.add<int32_t>("clusterThreshold_otherLayers", kSiPixelClusterThresholdsDefaultPhase1.otherLayers);
   desc.add<edm::InputTag>("InputLabel", edm::InputTag("rawDataCollector"));
@@ -181,13 +176,18 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
   errors_.clear();
 
   // GPU specific: Data extraction for RawToDigi GPU
-  unsigned int wordCounterGPU = 0;
+  unsigned int wordCounter = 0;
   unsigned int fedCounter = 0;
   bool errorsInEvent = false;
 
+  std::vector<unsigned int> index(fedIds_.size(), 0);
+  std::vector<cms_uint32_t const*> start(fedIds_.size(), nullptr);
+  std::vector<ptrdiff_t> words(fedIds_.size(), 0);
+
   // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData()
   ErrorChecker errorcheck;
-  for (int fedId : fedIds_) {
+  for (uint32_t i = 0; i < fedIds_.size(); ++i) {
+    const int fedId = fedIds_[i];
     if (regions_ && !regions_->mayUnpackFED(fedId))
       continue;
 
@@ -235,26 +235,33 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
     const cms_uint32_t* ew = (const cms_uint32_t*)(trailer);
 
     assert(0 == (ew - bw) % 2);
-    wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew - bw));
-    wordCounterGPU += (ew - bw);
+    index[i] = wordCounter;
+    start[i] = bw;
+    words[i] = (ew - bw);
+    wordCounter += (ew - bw);
 
   }  // end of for loop
 
-  nDigis_ = wordCounterGPU;
+  nDigis_ = wordCounter;
 
   if (nDigis_ == 0)
     return;
 
+  // copy the FED data to a single cpu buffer
+  pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender wordFedAppender(nDigis_, ctx.stream());
+  for (uint32_t i = 0; i < fedIds_.size(); ++i) {
+    wordFedAppender.initializeWordFed(fedIds_[i], index[i], start[i], words[i]);
+  }
+
   gpuAlgo_.makeClustersAsync(isRun2_,
                              clusterThresholds_,
                              gpuMap,
                              gpuModulesToUnpack,
                              gpuGains,
-                             *wordFedAppender_,
+                             wordFedAppender,
                              std::move(errors_),
-                             wordCounterGPU,
+                             wordCounter,
                              fedCounter,
-                             maxFedWords_,
                              useQuality_,
                              includeErrors_,
                              edm::MessageDrop::instance()->debugEnabled,
@@ -266,9 +273,8 @@ void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup&
 
   if (nDigis_ == 0) {
     // default construct collections and place them in event
-    auto tmp = std::make_pair(SiPixelDigisCUDA{}, SiPixelClustersCUDA{});
-    ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first));
-    ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second));
+    ctx.emplace(iEvent, digiPutToken_, SiPixelDigisCUDA{});
+    ctx.emplace(iEvent, clusterPutToken_, SiPixelClustersCUDA{});
     if (includeErrors_) {
       ctx.emplace(iEvent, digiErrorPutToken_, SiPixelDigiErrorsCUDA{});
     }
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index e66e322ddff4e..4b6f58ceb38c5 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -34,21 +34,6 @@
 
 namespace pixelgpudetails {
 
-  SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(uint32_t maxFedWords) {
-    word_ = cms::cuda::make_host_noncached_unique<unsigned int[]>(maxFedWords, cudaHostAllocWriteCombined);
-    fedId_ = cms::cuda::make_host_noncached_unique<unsigned char[]>(maxFedWords, cudaHostAllocWriteCombined);
-  }
-
-  void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId,
-                                                                        unsigned int wordCounterGPU,
-                                                                        const cms_uint32_t *src,
-                                                                        unsigned int length) {
-    std::memcpy(word_.get() + wordCounterGPU, src, sizeof(cms_uint32_t) * length);
-    std::memset(fedId_.get() + wordCounterGPU / 2, fedId - FEDNumbering::MINSiPixeluTCAFEDID, length / 2);
-  }
-
-  ////////////////////
-
   __device__ bool isBarrel(uint32_t rawId) {
     return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask));
   }
@@ -543,7 +528,6 @@ namespace pixelgpudetails {
                                                        SiPixelFormatterErrors &&errors,
                                                        const uint32_t wordCounter,
                                                        const uint32_t fedCounter,
-                                                       const uint32_t maxFedWords,
                                                        bool useQualityInfo,
                                                        bool includeErrors,
                                                        bool debug,
@@ -553,7 +537,7 @@ namespace pixelgpudetails {
     nDigis = wordCounter;
 
 #ifdef GPU_DEBUG
-    std::cout << "decoding " << wordCounter << " digis. Max is " << maxFedWords << std::endl;
+    std::cout << "decoding " << wordCounter << " digis." << std::endl;
 #endif
 
     // since wordCounter != 0 we're not allocating 0 bytes,
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index 8678fe2e763f9..ace787514486e 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -75,18 +75,21 @@ namespace pixelgpudetails {
   public:
     class WordFedAppender {
     public:
-      WordFedAppender();
-      WordFedAppender(uint32_t maxFedWords);
-      ~WordFedAppender() = default;
+      WordFedAppender(uint32_t words, cudaStream_t stream)
+          : word_{cms::cuda::make_host_unique<unsigned int[]>(words, stream)},
+            fedId_{cms::cuda::make_host_unique<unsigned char[]>(words, stream)} {}
 
-      void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t* src, unsigned int length);
+      void initializeWordFed(int fedId, unsigned int index, cms_uint32_t const* src, unsigned int length) {
+        std::memcpy(word_.get() + index, src, sizeof(cms_uint32_t) * length);
+        std::memset(fedId_.get() + index / 2, fedId - FEDNumbering::MINSiPixeluTCAFEDID, length / 2);
+      }
 
       const unsigned int* word() const { return word_.get(); }
       const unsigned char* fedId() const { return fedId_.get(); }
 
     private:
-      cms::cuda::host::noncached::unique_ptr<unsigned int[]> word_;
-      cms::cuda::host::noncached::unique_ptr<unsigned char[]> fedId_;
+      cms::cuda::host::unique_ptr<unsigned int[]> word_;
+      cms::cuda::host::unique_ptr<unsigned char[]> fedId_;
     };
 
     SiPixelRawToClusterGPUKernel() = default;
@@ -106,7 +109,6 @@ namespace pixelgpudetails {
                            SiPixelFormatterErrors&& errors,
                            const uint32_t wordCounter,
                            const uint32_t fedCounter,
-                           const uint32_t maxFedWords,
                            bool useQualityInfo,
                            bool includeErrors,
                            bool debug,
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
index b89e3604438e7..3bfba582936ea 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
@@ -30,7 +30,8 @@ namespace gpuClustering {
     __shared__ uint16_t newclusId[maxNumClustersPerModules];
 
     constexpr int startBPIX2 = isPhase2 ? phase2PixelTopology::layerStart[1] : phase1PixelTopology::layerStart[1];
-    constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
+    [[maybe_unused]] constexpr int nMaxModules =
+        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
 
     assert(nMaxModules < maxNumModules);
     assert(startBPIX2 < nMaxModules);
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
index b4fb89c3a709e..d502f8b15600c 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -21,7 +21,8 @@ namespace gpuClustering {
                                int32_t* __restrict__ clusterId,
                                int numElements) {
     int first = blockDim.x * blockIdx.x + threadIdx.x;
-    constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
+    [[maybe_unused]] constexpr int nMaxModules =
+        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
     assert(nMaxModules < maxNumModules);
     for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
       clusterId[i] = i;
@@ -52,7 +53,8 @@ namespace gpuClustering {
     auto firstModule = blockIdx.x;
     auto endModule = moduleStart[0];
 
-    constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
+    [[maybe_unused]] constexpr int nMaxModules =
+        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
     assert(nMaxModules < maxNumModules);
 
     for (auto module = firstModule; module < endModule; module += gridDim.x) {