Skip to content

Commit

Permalink
[ntuple] improve RPageRange::Find() complexity from O(n) to O(logn)
Browse files Browse the repository at this point in the history
  • Loading branch information
jblomer committed Nov 20, 2024
1 parent e9276c2 commit 1c410bd
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 12 deletions.
5 changes: 5 additions & 0 deletions tree/ntuple/v7/inc/ROOT/RNTupleDescriptor.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,10 @@ public:
std::size_t ExtendToFitColumnRange(const RColumnRange &columnRange, const Internal::RColumnElementBase &element,
std::size_t pageSize);

/// Has the same length as fPageInfos and stores the sum of the number of elements of all the pages
/// up to and including a given index. Used for binary search in Find().
std::vector<NTupleSize_t> fCumulativeNElements;

public:
/// We do not need to store the element size / uncompressed page size because we know to which column
/// the page belongs
Expand Down Expand Up @@ -319,6 +323,7 @@ public:
RPageRange clone;
clone.fPhysicalColumnId = fPhysicalColumnId;
clone.fPageInfos = fPageInfos;
clone.fCumulativeNElements = fCumulativeNElements;
return clone;
}

Expand Down
42 changes: 30 additions & 12 deletions tree/ntuple/v7/src/RNTupleDescriptor.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -179,21 +179,32 @@ ROOT::Experimental::RColumnDescriptor ROOT::Experimental::RColumnDescriptor::Clo
ROOT::Experimental::RClusterDescriptor::RPageRange::RPageInfoExtended
ROOT::Experimental::RClusterDescriptor::RPageRange::Find(ClusterSize_t::ValueType idxInCluster) const
{
// TODO(jblomer): binary search
RPageInfo pageInfo;
decltype(idxInCluster) firstInPage = 0;
NTupleSize_t pageNo = 0;
for (const auto &pi : fPageInfos) {
if (firstInPage + pi.fNElements > idxInCluster) {
pageInfo = pi;
break;
const auto N = fCumulativeNElements.size();
R__ASSERT(N > 0);
R__ASSERT(N == fPageInfos.size());

std::size_t left = 0;
std::size_t right = N - 1;
std::size_t midpoint = N;
while (left <= right) {
midpoint = (left + right) / 2;
if (fCumulativeNElements[midpoint] <= idxInCluster) {
left = midpoint + 1;
continue;
}
firstInPage += pi.fNElements;
++pageNo;

if ((midpoint == 0) || (fCumulativeNElements[midpoint - 1] <= idxInCluster))
break;

right = midpoint - 1;
}
R__ASSERT(midpoint < N);

auto pageInfo = fPageInfos[midpoint];
decltype(idxInCluster) firstInPage = (midpoint == 0) ? 0 : fCumulativeNElements[midpoint - 1];
R__ASSERT(firstInPage <= idxInCluster);
R__ASSERT((firstInPage + pageInfo.fNElements) > idxInCluster);
return RPageInfoExtended{pageInfo, firstInPage, pageNo};
return RPageInfoExtended{pageInfo, firstInPage, midpoint};
}

std::size_t
Expand Down Expand Up @@ -824,10 +835,17 @@ ROOT::Experimental::Internal::RClusterDescriptorBuilder::MoveDescriptor()
return R__FAIL("unset cluster ID");
if (fCluster.fNEntries == 0)
return R__FAIL("empty cluster");
for (const auto &pr : fCluster.fPageRanges) {
for (auto &pr : fCluster.fPageRanges) {
if (fCluster.fColumnRanges.count(pr.first) == 0) {
return R__FAIL("missing column range");
}
pr.second.fCumulativeNElements.clear();
pr.second.fCumulativeNElements.reserve(pr.second.fPageInfos.size());
NTupleSize_t sum = 0;
for (const auto &pi : pr.second.fPageInfos) {
sum += pi.fNElements;
pr.second.fCumulativeNElements.emplace_back(sum);
}
}
RClusterDescriptor result;
std::swap(result, fCluster);
Expand Down

0 comments on commit 1c410bd

Please sign in to comment.