Skip to content

Commit

Permalink
better performance on cpuMt
Browse files Browse the repository at this point in the history
  • Loading branch information
Serafadam committed Dec 20, 2024
1 parent a9878ba commit 893f3c1
Showing 1 changed file with 61 additions and 39 deletions.
100 changes: 61 additions & 39 deletions src/pipeline/node/host/RGBD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,53 +136,76 @@ class RGBD::Impl {
}
#endif
}
void calcPoints(const uint8_t* depthData, const uint8_t* colorData, std::vector<Point3fRGB>& points, int startRow, int endRow) {
float scale = outputMeters ? (1.0f / 1000.0f) : 1.0f;
for(int i = startRow * width; i < endRow * width; i++) {
float x = i % width;
float y = static_cast<float>(i) / width;
void calcPointsChunk(
const uint8_t* depthData,
const uint8_t* colorData,
std::vector<Point3fRGB>& outChunk,
int startRow, int endRow
) {
float scale = outputMeters ? (1.0f / 1000.0f) : 1.0f;
outChunk.reserve((endRow - startRow) * width);

for (int row = startRow; row < endRow; row++) {
int rowStart = row * width;
for (int col = 0; col < width; col++) {
size_t i = rowStart + col;

uint16_t depthValue = *(reinterpret_cast<const uint16_t*>(depthData + i * 2));
float z = static_cast<float>(depthValue) * scale;
x = (x - cx) * z / fx;
y = (y - cy) * z / fy;
uint8_t r = static_cast<uint8_t>(colorData[i * 3 + 0]);
uint8_t g = static_cast<uint8_t>(colorData[i * 3 + 1]);
uint8_t b = static_cast<uint8_t>(colorData[i * 3 + 2]);
Point3fRGB p;
p.x = x;
p.y = y;
p.z = z;
p.r = r;
p.g = g;
p.b = b;
std::lock_guard<std::mutex> lock(pointsMtx);
points.emplace_back(p);

float xCoord = (col - cx) * z / fx;
float yCoord = (row - cy) * z / fy;

// BGR order in colorData
uint8_t r = colorData[i * 3 + 0];
uint8_t g = colorData[i * 3 + 1];
uint8_t b = colorData[i * 3 + 2];

outChunk.push_back(Point3fRGB{xCoord, yCoord, z, r, g, b});
}
}
void computePointCloudCPU(const uint8_t* depthData, const uint8_t* colorData, std::vector<Point3fRGB>& points) {
calcPoints(depthData, colorData, points, 0, height);
}
void computePointCloudCPUMT(const uint8_t* depthData, const uint8_t* colorData, std::vector<Point3fRGB>& points) {
// Lambda function for processing a block of rows
auto processRows = [&](int startRow, int endRow) { calcPoints(depthData, colorData, points, startRow, endRow); };
}

// Divide rows into chunks and process in parallel
const int numThreads = threadNum;
const int rowsPerThread = height / numThreads;
std::vector<std::future<void>> futures;
void computePointCloudCPU(
const uint8_t* depthData,
const uint8_t* colorData,
std::vector<Point3fRGB>& points
) {

for(int t = 0; t < numThreads; ++t) {
int startRow = t * rowsPerThread;
int endRow = (t == numThreads - 1) ? height : startRow + rowsPerThread;
// Single-threaded directly writes into points
calcPointsChunk(depthData, colorData, points, 0, height);
}

futures.emplace_back(std::async(std::launch::async, processRows, startRow, endRow));
}
void computePointCloudCPUMT(
const uint8_t* depthData,
const uint8_t* colorData,
std::vector<Point3fRGB>& points
) {

// Wait for all threads to finish
for(auto& future : futures) {
future.get();
}
int rowsPerThread = height / threadNum;
std::vector<std::future<std::vector<Point3fRGB>>> futures;

// Each thread returns a local vector
auto processRows = [&](int startRow, int endRow) {
std::vector<Point3fRGB> localPoints;
calcPointsChunk(depthData, colorData, localPoints, startRow, endRow);
return localPoints;
};

for (int t = 0; t < threadNum; ++t) {
int startRow = t * rowsPerThread;
int endRow = (t == threadNum - 1) ? height : (startRow + rowsPerThread);
futures.emplace_back(std::async(std::launch::async, processRows, startRow, endRow));
}

// Merge all results
for (auto& f : futures) {
auto localPoints = f.get();
// Now we do one lock per merge if needed
// If this is not called concurrently from multiple places, no lock needed
points.insert(points.end(), localPoints.begin(), localPoints.end());
}
}
enum class ComputeMethod { CPU, CPU_MT, GPU };
ComputeMethod computeMethod = ComputeMethod::CPU;
#ifdef DEPTHAI_ENABLE_KOMPUTE
Expand All @@ -201,7 +224,6 @@ class RGBD::Impl {
int width, height;
int size;
bool intrinsicsSet = false;
std::mutex pointsMtx;
int threadNum = 2;
};

Expand Down

0 comments on commit 893f3c1

Please sign in to comment.