Add GPU PAF scoring to the body-part connector.

What this patch does:
- connectBodyPartsGpu() receives the heat maps as a device pointer (heatMapGpuPtr); the CPU heat-map
  pointer parameter is removed and peaksGpuPtr stays as the trailing optional argument.
- A new CUDA kernel, pafScoreKernel, computes one score per (body-part pair i, peak slot j, peak slot k)
  and stores it at (i*poseMaxPeople + j)*poseMaxPeople + k. The scores are copied back to the host and
  passed to generateInitialSubsets() through the new optional precomputedPAFs argument.
- generateInitialSubsets() keeps the CPU path when heatMapPtr is non-null, reads precomputedPAFs when it
  is null, and reports an error in the BODY_25E star branch when heatMapPtr is null.

Review fixes applied to the added lines:
- pafScoreKernel: the launch grid covers POSE_MAX_PEOPLE+1 threads along y and z, while the output
  buffer only holds POSE_MAX_PEOPLE x POSE_MAX_PEOPLE scores per pair. The kernel now also requires
  j < poseMaxPeople and k < poseMaxPeople, so the extra threads no longer write outside their slot.
- connectBodyPartsGpu: the score buffer stores T values, so the cudaMalloc and cudaMemcpy sizes use
  sizeof(T) instead of sizeof(float); the function is also instantiated for double.

Open review notes (not changed here):
- The three device buffers are freed only on the success path. Presumably error() throws, in which case
  they leak -- confirm, and consider an owning wrapper.
- cudaMalloc/cudaMemcpy results are not checked individually; only cudaCheck() at the end is called.

diff --git a/include/openpose/pose/bodyPartConnectorBase.hpp b/include/openpose/pose/bodyPartConnectorBase.hpp index 62033e7c0383b214c9b0b122d5d4144ef660f631..765953ba58503f6060cb499e640c928d50054fbf 100644 --- a/include/openpose/pose/bodyPartConnectorBase.hpp +++ b/include/openpose/pose/bodyPartConnectorBase.hpp @@ -13,11 +13,11 @@ namespace op const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f); template - OP_API void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, const T* const heatMapPtr, + OP_API void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, const T* const heatMapGpuPtr, const T* const peaksPtr, const PoseModel poseModel, const Point& heatMapSize, const int maxPeaks, const T interMinAboveThreshold, const T interThreshold, const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f, - const T* const heatMapGpuPtr = nullptr, const T* const peaksGpuPtr = nullptr); + const T* const peaksGpuPtr = nullptr); // Private functions used by the 2 above functions template @@ -25,7 +25,8 @@ namespace op const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point& heatMapSize, const int maxPeaks, const T interThreshold, const T interMinAboveThreshold, const std::vector& bodyPartPairs, const unsigned int numberBodyParts, - const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex); + const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex, + const Array& precomputedPAFs = Array()); template OP_API void removeSubsetsBelowThresholds(std::vector& validSubsetIndexes, int& numberPeople, diff --git a/src/openpose/pose/bodyPartConnectorBase.cpp b/src/openpose/pose/bodyPartConnectorBase.cpp index 4a054750a447ac8a9c9021ddd79068bd1d4eda93..835a5d20e51699e2e740b31bb343c78769b08121 100644 --- a/src/openpose/pose/bodyPartConnectorBase.cpp +++ b/src/openpose/pose/bodyPartConnectorBase.cpp @@ -101,7 +101,7 @@ namespace op const T* const heatMapPtr, 
const T* const peaksPtr, const PoseModel poseModel, const Point& heatMapSize, const int maxPeaks, const T interThreshold, const T interMinAboveThreshold, const std::vector& bodyPartPairs, const unsigned int numberBodyParts, - const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex) + const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex, const Array& precomputedPAFs) { try { @@ -160,6 +160,9 @@ namespace op auto maxScoreIndex = -1; if (poseModel == PoseModel::BODY_25E && bodyPartPairsStar[bodyPartB] > -1) { + if (heatMapPtr == nullptr) + error("HeatMapPtr is null. GPU PAF not implemented for star architecture.", + __LINE__, __FUNCTION__, __FILE__); const auto pairIndex2 = bodyPartPairsStar[bodyPartB]; const auto* mapX0 = heatMapPtr + (numberBodyPartsAndBkg + pairIndex2) * heatMapOffset; const auto* mapY0 = heatMapPtr + (numberBodyPartsAndBkg + pairIndex2+1) * heatMapOffset; @@ -293,6 +296,7 @@ namespace op std::vector> allABConnections; // Note: Problem of this function, if no right PAF between A and B, both elements are discarded. // However, they should be added indepently, not discarded + if (heatMapPtr != nullptr) { const auto* mapX = heatMapPtr + (numberBodyPartsAndBkg + mapIdx[2*pairIndex]) * heatMapOffset; const auto* mapY = heatMapPtr + (numberBodyPartsAndBkg + mapIdx[2*pairIndex+1]) * heatMapOffset; @@ -336,6 +340,25 @@ namespace op } } } + else if (!precomputedPAFs.empty()) + { + for (auto i = 1; i <= numberA; i++) + { + // E.g. neck-nose connection. For each nose + for (auto j = 1; j <= numberB; j++) + { + T scoreAB = precomputedPAFs.at({(int)pairIndex, i+(int)bodyPartA, j+(int)bodyPartB}); + + // E.g. neck-nose connection. If possible PAF between neck i, nose j --> add + // parts score + connection score + if (scoreAB > 1e-6) + allABConnections.emplace_back(std::make_tuple(scoreAB, i, j)); + } + } + //error("Not implemented", __LINE__, __FUNCTION__, __FILE__); + } + else + error("Error. 
Should not reach here.", __LINE__, __FUNCTION__, __FILE__); // select the top minAB connection, assuming that each part occur only once // sort rows in descending order based on parts + connection score diff --git a/src/openpose/pose/bodyPartConnectorBase.cu b/src/openpose/pose/bodyPartConnectorBase.cu index f54eaf5d4eae388bdce171d9a29531f190f23e98..2f71d7a5973c900e7f089d96be4f84aa1c9e506d 100644 --- a/src/openpose/pose/bodyPartConnectorBase.cu +++ b/src/openpose/pose/bodyPartConnectorBase.cu @@ -2,33 +2,153 @@ #include #include #include +#include namespace op { + template + inline __device__ int intRoundGPU(const T a) + { + return int(a+0.5f); + } + template - void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, const T* const heatMapPtr, + inline __device__ T process(const T* bodyPartA, const T* bodyPartB, const T* mapX, const T* mapY, + const int heatmapWidth, const int heatmapHeight, const T interThreshold = T(0.05), + const T interMinAboveThreshold = T(0.95), const T renderThreshold = T(0.05)) + { + T finalOutput = -1; + if (bodyPartA[2] < renderThreshold || bodyPartB[2] < renderThreshold) return finalOutput; + + const auto vectorAToBX = bodyPartB[0] - bodyPartA[0]; + const auto vectorAToBY = bodyPartB[1] - bodyPartA[1]; + const auto vectorAToBMax = max(abs(vectorAToBX), abs(vectorAToBY)); + const auto numberPointsInLine = max(5, min(25, intRoundGPU(sqrt(5*vectorAToBMax)))); + const auto vectorNorm = T(sqrt(vectorAToBX*vectorAToBX + vectorAToBY*vectorAToBY)); + + if (vectorNorm > 1e-6) + { + const auto sX = bodyPartA[0]; + const auto sY = bodyPartA[1]; + const auto vectorAToBNormX = vectorAToBX/vectorNorm; + const auto vectorAToBNormY = vectorAToBY/vectorNorm; + + auto sum = 0.; + auto count = 0; + const auto vectorAToBXInLine = vectorAToBX/numberPointsInLine; + const auto vectorAToBYInLine = vectorAToBY/numberPointsInLine; + for (auto lm = 0; lm < numberPointsInLine; lm++) + { + const auto mX = min(heatmapWidth-1, intRoundGPU(sX + 
lm*vectorAToBXInLine)); + const auto mY = min(heatmapHeight-1, intRoundGPU(sY + lm*vectorAToBYInLine)); + const auto idx = mY * heatmapWidth + mX; + const auto score = (vectorAToBNormX*mapX[idx] + vectorAToBNormY*mapY[idx]); + if (score > interThreshold) + { + sum += score; + count++; + } + } + + // // L2 Hack + // int l2Dist = (int)sqrt(pow(vectorAToBX,2) + pow(vectorAToBY,2)); + // if (l2Dist <= 2) + // count = numberPointsInLine; + + // parts score + connection score + if (count/(float)numberPointsInLine > interMinAboveThreshold) + finalOutput = sum/count; + } + + return finalOutput; + } + + template + __global__ void pafScoreKernel(T* finalOutputPtr, const T* const heatMapPtr, const T* const peaksPtr, + const unsigned int* const bodyPartPairsPtr, const unsigned int* const mapIdxPtr, + const unsigned int poseMaxPeople, const int numberBodyPartPairs, + const int heatmapWidth, const int heatmapHeight) + { + const auto i = (blockIdx.x * blockDim.x) + threadIdx.x; + const auto j = (blockIdx.y * blockDim.y) + threadIdx.y; + const auto k = (blockIdx.z * blockDim.z) + threadIdx.z; + + if (i < numberBodyPartPairs && j < poseMaxPeople && k < poseMaxPeople) + { + const int partA = bodyPartPairsPtr[i*2]; + const int partB = bodyPartPairsPtr[i*2 + 1]; + const int mapIdxX = mapIdxPtr[i*2]; + const int mapIdxY = mapIdxPtr[i*2 + 1]; + + const T* const bodyPartA = peaksPtr + (partA*poseMaxPeople*3 + j*3); + const T* const bodyPartB = peaksPtr + (partB*poseMaxPeople*3 + k*3); + const T* const mapX = heatMapPtr + mapIdxX*heatmapWidth*heatmapHeight; + const T* const mapY = heatMapPtr + mapIdxY*heatmapWidth*heatmapHeight; + + const T finalOutput = process(bodyPartA, bodyPartB, mapX, mapY, heatmapWidth, heatmapHeight); + finalOutputPtr[(i*poseMaxPeople+j)*poseMaxPeople + k] = finalOutput; + } + } + + template + void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, const T* const heatMapGpuPtr, const T* const peaksPtr, const PoseModel poseModel, const Point& 
heatMapSize, const int maxPeaks, const T 
interMinAboveThreshold, const T interThreshold, const int minSubsetCnt, const T minSubsetScore, const T scaleFactor, - const T* const heatMapGpuPtr, const T* const peaksGpuPtr) + const T* const peaksGpuPtr) { try { // Parts Connection const auto& bodyPartPairs = getPosePartPairs(poseModel); + const auto& mapIdxOffset = getPoseMapIndex(poseModel); const auto numberBodyParts = getPoseNumberBodyParts(poseModel); const auto numberBodyPartPairs = bodyPartPairs.size() / 2; const auto subsetCounterIndex = numberBodyParts; + // Update mapIdx + auto mapIdx = mapIdxOffset; + for (auto& i : mapIdx) + i += (numberBodyParts+1); + if (numberBodyParts == 0) error("Invalid value of numberBodyParts, it must be positive, not " + std::to_string(numberBodyParts), __LINE__, __FUNCTION__, __FILE__); + // Upload required data to GPU + unsigned int* bodyPartPairsGpuPtr; + cudaMalloc((void **)&bodyPartPairsGpuPtr, bodyPartPairs.size() * sizeof(unsigned int)); + cudaMemcpy(bodyPartPairsGpuPtr, &bodyPartPairs[0], bodyPartPairs.size() * sizeof(unsigned int), + cudaMemcpyHostToDevice); + unsigned int* mapIdxGpuPtr; + cudaMalloc((void **)&mapIdxGpuPtr, mapIdx.size() * sizeof(unsigned int)); + cudaMemcpy(mapIdxGpuPtr, &mapIdx[0], mapIdx.size() * sizeof(unsigned int), cudaMemcpyHostToDevice); + T* finalOutputGpuPtr; + Array finalOutputCpu; + finalOutputCpu.reset({(int)numberBodyPartPairs, (int)POSE_MAX_PEOPLE, (int)POSE_MAX_PEOPLE},-1); + int totalComputations = numberBodyPartPairs * POSE_MAX_PEOPLE * POSE_MAX_PEOPLE; + cudaMalloc((void **)&finalOutputGpuPtr, totalComputations * sizeof(T)); + + // Run Kernel + const dim3 threadsPerBlock{4, 8, 8}; //4 is good for BODY_25, 8 for COCO? 
+ if ((POSE_MAX_PEOPLE+1) % threadsPerBlock.y || (POSE_MAX_PEOPLE+1) % threadsPerBlock.z) + error("Invalid value of POSE_MAX_PEOPLE, it must be multiple of 16, rather than " + std::to_string(POSE_MAX_PEOPLE), __LINE__, __FUNCTION__, __FILE__); + int pairBlocks = intRound((numberBodyPartPairs/threadsPerBlock.x) + 0.5); + const dim3 numBlocks{(unsigned int)pairBlocks, (POSE_MAX_PEOPLE+1) / threadsPerBlock.y, + (POSE_MAX_PEOPLE+1) / threadsPerBlock.z}; + pafScoreKernel<<>>( + finalOutputGpuPtr, heatMapGpuPtr, peaksGpuPtr, bodyPartPairsGpuPtr, mapIdxGpuPtr, + POSE_MAX_PEOPLE, numberBodyPartPairs, heatMapSize.x, heatMapSize.y); + cudaMemcpy(finalOutputCpu.getPtr(), finalOutputGpuPtr, totalComputations * sizeof(T), + cudaMemcpyDeviceToHost); + // std::vector, double>> refers to: // - std::vector: [body parts locations, #body parts found] // - double: subset score + const T* const tNullptr = nullptr; const auto subsets = generateInitialSubsets( - heatMapPtr, peaksPtr, poseModel, heatMapSize, maxPeaks, interThreshold, interMinAboveThreshold, - bodyPartPairs, numberBodyParts, numberBodyPartPairs, subsetCounterIndex); + tNullptr, peaksPtr, poseModel, heatMapSize, maxPeaks, interThreshold, interMinAboveThreshold, + bodyPartPairs, numberBodyParts, numberBodyPartPairs, subsetCounterIndex, finalOutputCpu); // Delete people below the following thresholds: // a) minSubsetCnt: removed if less than minSubsetCnt body parts @@ -45,8 +165,9 @@ namespace op peaksPtr, numberPeople, numberBodyParts, numberBodyPartPairs); // Differences w.r.t. 
CPU version for now - UNUSED(heatMapGpuPtr); - UNUSED(peaksGpuPtr); + cudaFree(bodyPartPairsGpuPtr); + cudaFree(mapIdxGpuPtr); + cudaFree(finalOutputGpuPtr); cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) @@ -56,15 +177,15 @@ namespace op } template void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, - const float* const heatMapPtr, const float* const peaksPtr, + const float* const heatMapGpuPtr, const float* const peaksPtr, const PoseModel poseModel, const Point& heatMapSize, const int maxPeaks, const float interMinAboveThreshold, const float interThreshold, const int minSubsetCnt, const float minSubsetScore, const float scaleFactor, - const float* const heatMapGpuPtr, const float* const peaksGpuPtr); + const float* const peaksGpuPtr); template void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, - const double* const heatMapPtr, const double* const peaksPtr, + const double* const heatMapGpuPtr, const double* const peaksPtr, const PoseModel poseModel, const Point& heatMapSize, const int maxPeaks, const double interMinAboveThreshold, const double interThreshold, const int minSubsetCnt, const double minSubsetScore, const double scaleFactor, - const double* const heatMapGpuPtr, const double* const peaksGpuPtr); + const double* const peaksGpuPtr); } diff --git a/src/openpose/pose/bodyPartConnectorCaffe.cpp b/src/openpose/pose/bodyPartConnectorCaffe.cpp index d1f8e427f84e13add62559f16093b85277046939..e4c67a3c9ce55524b1915a866bc595eb8d2b9d96 100644 --- a/src/openpose/pose/bodyPartConnectorCaffe.cpp +++ b/src/openpose/pose/bodyPartConnectorCaffe.cpp @@ -163,16 +163,14 @@ namespace op { #if defined USE_CAFFE && defined USE_CUDA const auto heatMapsBlob = bottom.at(0); - const auto* const heatMapsPtr = heatMapsBlob->cpu_data(); const auto* const peaksPtr = bottom.at(1)->cpu_data(); const auto* const heatMapsGpuPtr = heatMapsBlob->gpu_data(); const auto* const peaksGpuPtr = bottom.at(1)->gpu_data(); const auto 
maxPeaks = mTopSize[1]; - connectBodyPartsGpu(poseKeypoints, poseScores, heatMapsPtr, peaksPtr, mPoseModel, + connectBodyPartsGpu(poseKeypoints, poseScores, heatMapsGpuPtr, peaksPtr, mPoseModel, Point{heatMapsBlob->shape(3), heatMapsBlob->shape(2)}, maxPeaks, mInterMinAboveThreshold, mInterThreshold, - mMinSubsetCnt, mMinSubsetScore, mScaleNetToOutput, - heatMapsGpuPtr, peaksGpuPtr); + mMinSubsetCnt, mMinSubsetScore, mScaleNetToOutput, peaksGpuPtr); #else UNUSED(bottom); UNUSED(poseKeypoints); diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp index cf03ec8f1cf21bb4da4477bb47aef76c8b9793f9..22b74d1511e39c51729295e1e98f7e0b577341cd 100644 --- a/src/openpose/pose/poseExtractorCaffe.cpp +++ b/src/openpose/pose/poseExtractorCaffe.cpp @@ -294,7 +294,6 @@ namespace op upImpl->spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); upImpl->spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); - // CUDA version not implemented yet // #ifdef USE_CUDA // upImpl->spBodyPartConnectorCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get(), // upImpl->spPeaksBlob.get()}, @@ -304,6 +303,9 @@ namespace op upImpl->spPeaksBlob.get()}, mPoseKeypoints, mPoseScores); // #endif + #ifdef USE_CUDA + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif #else UNUSED(inputNetData); UNUSED(inputDataSize);