GPU Body Part Connector (#834)

04362b9d · Raaj · Gines · a03253a2 · 04362b9d · 04362b9d
5 changed files
--- a/include/openpose/pose/bodyPartConnectorBase.hpp
+++ b/include/openpose/pose/bodyPartConnectorBase.hpp
@@ -13,11 +13,11 @@ namespace op
                                    const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f);

    template <typename T>
-    OP_API void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapPtr,
+    OP_API void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapGpuPtr,
                                    const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
                                    const int maxPeaks, const T interMinAboveThreshold, const T interThreshold,
                                    const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f,
-                                    const T* const heatMapGpuPtr = nullptr, const T* const peaksGpuPtr = nullptr);
+                                    const T* const peaksGpuPtr = nullptr);

    // Private functions used by the 2 above functions
    template <typename T>
@@ -25,7 +25,8 @@ namespace op
        const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
        const int maxPeaks, const T interThreshold, const T interMinAboveThreshold,
        const std::vector<unsigned int>& bodyPartPairs, const unsigned int numberBodyParts,
-        const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex);
+        const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex,
+        const Array<T>& precomputedPAFs = Array<T>());

    template <typename T>
    OP_API void removeSubsetsBelowThresholds(std::vector<int>& validSubsetIndexes, int& numberPeople,

--- a/src/openpose/pose/bodyPartConnectorBase.cpp
+++ b/src/openpose/pose/bodyPartConnectorBase.cpp
@@ -101,7 +101,7 @@ namespace op
        const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
        const int maxPeaks, const T interThreshold, const T interMinAboveThreshold,
        const std::vector<unsigned int>& bodyPartPairs, const unsigned int numberBodyParts,
-        const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex)
+        const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex, const Array<T>& precomputedPAFs)
    {
        try
        {
@@ -160,6 +160,9 @@ namespace op
                                    auto maxScoreIndex = -1;
                                    if (poseModel == PoseModel::BODY_25E && bodyPartPairsStar[bodyPartB] > -1)
                                    {
+                                        if (heatMapPtr == nullptr)
+                                            error("HeatMapPtr is null. GPU PAF not implemented for star architecture.",
+                                                  __LINE__, __FUNCTION__, __FILE__);
                                        const auto pairIndex2 = bodyPartPairsStar[bodyPartB];
                                        const auto* mapX0 = heatMapPtr + (numberBodyPartsAndBkg + pairIndex2) * heatMapOffset;
                                        const auto* mapY0 = heatMapPtr + (numberBodyPartsAndBkg + pairIndex2+1) * heatMapOffset;
@@ -293,6 +296,7 @@ namespace op
                    std::vector<std::tuple<double, int, int>> allABConnections;
                    // Note: Problem of this function, if no right PAF between A and B, both elements are discarded.
                    // However, they should be added indepently, not discarded
+                    if (heatMapPtr != nullptr)
                    {
                        const auto* mapX = heatMapPtr + (numberBodyPartsAndBkg + mapIdx[2*pairIndex]) * heatMapOffset;
                        const auto* mapY = heatMapPtr + (numberBodyPartsAndBkg + mapIdx[2*pairIndex+1]) * heatMapOffset;
@@ -336,6 +340,25 @@ namespace op
                            }
                        }
                    }
+                    else if (!precomputedPAFs.empty())
+                    {
+                        for (auto i = 1; i <= numberA; i++)
+                        {
+                            // E.g. neck-nose connection. For each nose
+                            for (auto j = 1; j <= numberB; j++)
+                            {
+                                T scoreAB = precomputedPAFs.at({(int)pairIndex, i+(int)bodyPartA, j+(int)bodyPartB});
+
+                                // E.g. neck-nose connection. If possible PAF between neck i, nose j --> add
+                                // parts score + connection score
+                                if (scoreAB > 1e-6)
+                                    allABConnections.emplace_back(std::make_tuple(scoreAB, i, j));
+                            }
+                        }
+                        //error("Not implemented", __LINE__, __FUNCTION__, __FILE__);
+                    }
+                    else
+                        error("Error. Should not reach here.", __LINE__, __FUNCTION__, __FILE__);

                    // select the top minAB connection, assuming that each part occur only once
                    // sort rows in descending order based on parts + connection score

--- a/src/openpose/pose/bodyPartConnectorBase.cu
+++ b/src/openpose/pose/bodyPartConnectorBase.cu
@@ -2,33 +2,153 @@
 #include <openpose/pose/poseParameters.hpp>
 #include <openpose/utilities/fastMath.hpp>
 #include <openpose/pose/bodyPartConnectorBase.hpp>
+#include <iostream>

 namespace op
 {
+    // Truncating round, int(a+0.5): nearest-int rounding only for a >= 0 (a < -0.5 is not rounded correctly)
+    template<typename T> inline __device__ int intRoundGPU(const T a)
+    {
+        return static_cast<int>(a + 0.5f);
+    }
+
    template <typename T>
-    void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapPtr,
+    inline __device__  T process(const T* bodyPartA, const T* bodyPartB, const T* mapX, const T* mapY,
+                                 const int heatmapWidth, const int heatmapHeight, const T interThreshold = T(0.05),
+                                 const T interMinAboveThreshold = T(0.95), const T renderThreshold = T(0.05))
+    {
+        // Connection score A->B: mean of the (mapX, mapY) projection onto the A->B unit vector over the sampled
+        // segment points whose projection exceeds interThreshold. Parts are read as {x, y, score-like value}.
+        // Returns -1 if a part's [2] is below renderThreshold, A and B coincide, or too few samples qualify.
+        T finalOutput = -1;
+        if (bodyPartA[2] < renderThreshold || bodyPartB[2] < renderThreshold) return finalOutput;
+
+        const auto vectorAToBX = bodyPartB[0] - bodyPartA[0];
+        const auto vectorAToBY = bodyPartB[1] - bodyPartA[1];
+        const auto vectorAToBMax = max(abs(vectorAToBX), abs(vectorAToBY));
+        const auto numberPointsInLine = max(5, min(25, intRoundGPU(sqrt(5*vectorAToBMax))));
+        const auto vectorNorm = T(sqrt(vectorAToBX*vectorAToBX + vectorAToBY*vectorAToBY));
+        if (vectorNorm > 1e-6)
+        {
+            const auto sX = bodyPartA[0];
+            const auto sY = bodyPartA[1];
+            const auto vectorAToBNormX = vectorAToBX/vectorNorm;
+            const auto vectorAToBNormY = vectorAToBY/vectorNorm;
+
+            auto sum = 0.;
+            auto count = 0;
+            const auto vectorAToBXInLine = vectorAToBX/numberPointsInLine;
+            const auto vectorAToBYInLine = vectorAToBY/numberPointsInLine;
+            for (auto lm = 0; lm < numberPointsInLine; lm++)
+            {
+                // Clamp on both sides ([0, size-1]) so that an out-of-range coordinate cannot index outside the map
+                const auto mX = max(0, min(heatmapWidth-1, intRoundGPU(sX + lm*vectorAToBXInLine)));
+                const auto mY = max(0, min(heatmapHeight-1, intRoundGPU(sY + lm*vectorAToBYInLine)));
+                const auto idx = mY * heatmapWidth + mX;
+                const auto score = (vectorAToBNormX*mapX[idx] + vectorAToBNormY*mapY[idx]);
+                if (score > interThreshold)
+                {
+                    sum += score;
+                    count++;
+                }
+            }
+            // // L2 Hack
+            // int l2Dist = (int)sqrt(pow(vectorAToBX,2) + pow(vectorAToBY,2));
+            // if (l2Dist <= 2)
+            //     count = numberPointsInLine;
+            // parts score + connection score (count > 0 avoids 0/0 when interMinAboveThreshold is negative)
+            if (count > 0 && count/(float)numberPointsInLine > interMinAboveThreshold)
+                finalOutput = sum/count;
+        }
+        return finalOutput;
+    }
+
+    // One thread per (pair i, peak j of part A, peak k of part B): runs process() on the two peaks and stores the
+    // result in finalOutputPtr[(i*poseMaxPeople+j)*poseMaxPeople + k]. Thread axes: x -> i, y -> j, z -> k.
+    template <typename T>
+    __global__ void pafScoreKernel(T* finalOutputPtr, const T* const heatMapPtr, const T* const peaksPtr,
+                                   const unsigned int* const bodyPartPairsPtr, const unsigned int* const mapIdxPtr,
+                                   const unsigned int poseMaxPeople, const int numberBodyPartPairs,
+                                   const int heatmapWidth, const int heatmapHeight)
+    {
+        const auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const auto j = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto k = (blockIdx.z * blockDim.z) + threadIdx.z;
+        // j/k guard: grid threads with j or k >= poseMaxPeople would alias other entries or overrun finalOutputPtr
+        if (i < numberBodyPartPairs && j < poseMaxPeople && k < poseMaxPeople)
+        {
+            const int partA = bodyPartPairsPtr[i*2];
+            const int partB = bodyPartPairsPtr[i*2 + 1];
+            const int mapIdxX = mapIdxPtr[i*2];
+            const int mapIdxY = mapIdxPtr[i*2 + 1];
+            const T* const bodyPartA = peaksPtr + (partA*poseMaxPeople*3 + j*3);
+            const T* const bodyPartB = peaksPtr + (partB*poseMaxPeople*3 + k*3);
+            const T* const mapX = heatMapPtr + mapIdxX*heatmapWidth*heatmapHeight;
+            const T* const mapY = heatMapPtr + mapIdxY*heatmapWidth*heatmapHeight;
+            const T finalOutput = process(bodyPartA, bodyPartB, mapX, mapY, heatmapWidth, heatmapHeight);
+            finalOutputPtr[(i*poseMaxPeople+j)*poseMaxPeople + k] = finalOutput;
+        }
+    }
+
+    template <typename T>
+    void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapGpuPtr,
                             const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
                             const int maxPeaks, const T interMinAboveThreshold, const T interThreshold,
                             const int minSubsetCnt, const T minSubsetScore, const T scaleFactor,
-                             const T* const heatMapGpuPtr, const T* const peaksGpuPtr)
+                             const T* const peaksGpuPtr)
    {
        try
        {
            // Parts Connection
            const auto& bodyPartPairs = getPosePartPairs(poseModel);
+            const auto& mapIdxOffset = getPoseMapIndex(poseModel);
            const auto numberBodyParts = getPoseNumberBodyParts(poseModel);
            const auto numberBodyPartPairs = bodyPartPairs.size() / 2;
            const auto subsetCounterIndex = numberBodyParts;
+            // Update mapIdx
+            auto mapIdx = mapIdxOffset;
+            for (auto& i : mapIdx)
+                i += (numberBodyParts+1);
+
            if (numberBodyParts == 0)
                error("Invalid value of numberBodyParts, it must be positive, not " + std::to_string(numberBodyParts),
                      __LINE__, __FUNCTION__, __FILE__);

+            // Upload required data to GPU
+            unsigned int* bodyPartPairsGpuPtr;
+            cudaMalloc((void **)&bodyPartPairsGpuPtr, bodyPartPairs.size() * sizeof(unsigned int));
+            cudaMemcpy(bodyPartPairsGpuPtr, &bodyPartPairs[0], bodyPartPairs.size() * sizeof(unsigned int),
+                       cudaMemcpyHostToDevice);
+            unsigned int* mapIdxGpuPtr;
+            cudaMalloc((void **)&mapIdxGpuPtr, mapIdx.size() * sizeof(unsigned int));
+            cudaMemcpy(mapIdxGpuPtr, &mapIdx[0], mapIdx.size() * sizeof(unsigned int), cudaMemcpyHostToDevice);
+            T* finalOutputGpuPtr;
+            Array<T> finalOutputCpu;
+            finalOutputCpu.reset({(int)numberBodyPartPairs, (int)POSE_MAX_PEOPLE, (int)POSE_MAX_PEOPLE},-1);
+            int totalComputations = numberBodyPartPairs * POSE_MAX_PEOPLE * POSE_MAX_PEOPLE;
+            cudaMalloc((void **)&finalOutputGpuPtr, totalComputations * sizeof(float));
+
+            // Run Kernel
+            const dim3 threadsPerBlock{4, 8, 8}; //4 is good for BODY_25, 8 for COCO?
+            if ((POSE_MAX_PEOPLE+1) % threadsPerBlock.y || (POSE_MAX_PEOPLE+1) % threadsPerBlock.z)
+                error("Invalid value of POSE_MAX_PEOPLE, it must be multiple of 16, rather than "
+                      + std::to_string(POSE_MAX_PEOPLE), __LINE__, __FUNCTION__, __FILE__);
+            int pairBlocks = intRound((numberBodyPartPairs/threadsPerBlock.x) + 0.5);
+            const dim3 numBlocks{(unsigned int)pairBlocks, (POSE_MAX_PEOPLE+1) / threadsPerBlock.y,
+                                 (POSE_MAX_PEOPLE+1) / threadsPerBlock.z};
+            pafScoreKernel<<<numBlocks, threadsPerBlock>>>(
+                finalOutputGpuPtr, heatMapGpuPtr, peaksGpuPtr, bodyPartPairsGpuPtr, mapIdxGpuPtr,
+                POSE_MAX_PEOPLE, numberBodyPartPairs, heatMapSize.x, heatMapSize.y);
+            cudaMemcpy(finalOutputCpu.getPtr(), finalOutputGpuPtr, totalComputations * sizeof(float),
+                       cudaMemcpyDeviceToHost);
+
            // std::vector<std::pair<std::vector<int>, double>> refers to:
            //     - std::vector<int>: [body parts locations, #body parts found]
            //     - double: subset score
+            const T* const tNullptr = nullptr;
            const auto subsets = generateInitialSubsets(
-                heatMapPtr, peaksPtr, poseModel, heatMapSize, maxPeaks, interThreshold, interMinAboveThreshold,
-                bodyPartPairs, numberBodyParts, numberBodyPartPairs, subsetCounterIndex);
+                tNullptr, peaksPtr, poseModel, heatMapSize, maxPeaks, interThreshold, interMinAboveThreshold,
+                bodyPartPairs, numberBodyParts, numberBodyPartPairs, subsetCounterIndex, finalOutputCpu);

            // Delete people below the following thresholds:
                // a) minSubsetCnt: removed if less than minSubsetCnt body parts
@@ -45,8 +165,9 @@ namespace op
                                            peaksPtr, numberPeople, numberBodyParts, numberBodyPartPairs);

            // Differences w.r.t. CPU version for now
-            UNUSED(heatMapGpuPtr);
-            UNUSED(peaksGpuPtr);
+            cudaFree(bodyPartPairsGpuPtr);
+            cudaFree(mapIdxGpuPtr);
+            cudaFree(finalOutputGpuPtr);
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        }
        catch (const std::exception& e)
@@ -56,15 +177,15 @@ namespace op
    }

    template void connectBodyPartsGpu(Array<float>& poseKeypoints, Array<float>& poseScores,
-                                      const float* const heatMapPtr, const float* const peaksPtr,
+                                      const float* const heatMapGpuPtr, const float* const peaksPtr,
                                      const PoseModel poseModel, const Point<int>& heatMapSize, const int maxPeaks,
                                      const float interMinAboveThreshold, const float interThreshold,
                                      const int minSubsetCnt, const float minSubsetScore, const float scaleFactor,
-                                      const float* const heatMapGpuPtr, const float* const peaksGpuPtr);
+                                      const float* const peaksGpuPtr);
    template void connectBodyPartsGpu(Array<double>& poseKeypoints, Array<double>& poseScores,
-                                      const double* const heatMapPtr, const double* const peaksPtr,
+                                      const double* const heatMapGpuPtr, const double* const peaksPtr,
                                      const PoseModel poseModel, const Point<int>& heatMapSize, const int maxPeaks,
                                      const double interMinAboveThreshold, const double interThreshold,
                                      const int minSubsetCnt, const double minSubsetScore, const double scaleFactor,
-                                      const double* const heatMapGpuPtr, const double* const peaksGpuPtr);
+                                      const double* const peaksGpuPtr);
 }
--- a/src/openpose/pose/bodyPartConnectorCaffe.cpp
+++ b/src/openpose/pose/bodyPartConnectorCaffe.cpp
@@ -163,16 +163,14 @@ namespace op
        {
            #if defined USE_CAFFE && defined USE_CUDA
                const auto heatMapsBlob = bottom.at(0);
-                const auto* const heatMapsPtr = heatMapsBlob->cpu_data();
                const auto* const peaksPtr = bottom.at(1)->cpu_data();
                const auto* const heatMapsGpuPtr = heatMapsBlob->gpu_data();
                const auto* const peaksGpuPtr = bottom.at(1)->gpu_data();
                const auto maxPeaks = mTopSize[1];
-                connectBodyPartsGpu(poseKeypoints, poseScores, heatMapsPtr, peaksPtr, mPoseModel,
+                connectBodyPartsGpu(poseKeypoints, poseScores, heatMapsGpuPtr, peaksPtr, mPoseModel,
                                    Point<int>{heatMapsBlob->shape(3), heatMapsBlob->shape(2)},
                                    maxPeaks, mInterMinAboveThreshold, mInterThreshold,
-                                    mMinSubsetCnt, mMinSubsetScore, mScaleNetToOutput,
-                                    heatMapsGpuPtr, peaksGpuPtr);
+                                    mMinSubsetCnt, mMinSubsetScore, mScaleNetToOutput, peaksGpuPtr);
            #else
                UNUSED(bottom);
                UNUSED(poseKeypoints);

--- a/src/openpose/pose/poseExtractorCaffe.cpp
+++ b/src/openpose/pose/poseExtractorCaffe.cpp
@@ -294,7 +294,6 @@ namespace op
                upImpl->spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt));
                upImpl->spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore));

-                // CUDA version not implemented yet
                // #ifdef USE_CUDA
                //     upImpl->spBodyPartConnectorCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get(),
                //                                                    upImpl->spPeaksBlob.get()},
@@ -304,6 +303,9 @@ namespace op
                                                                   upImpl->spPeaksBlob.get()},
                                                                  mPoseKeypoints, mPoseScores);
                // #endif
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
            #else
                UNUSED(inputNetData);
                UNUSED(inputDataSize);