提交 04362b9d 编写于 作者: R Raaj 提交者: Gines

GPU Body Part Connector (#834)

上级 a03253a2
......@@ -13,11 +13,11 @@ namespace op
const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f);
template <typename T>
OP_API void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapPtr,
OP_API void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapGpuPtr,
const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
const int maxPeaks, const T interMinAboveThreshold, const T interThreshold,
const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f,
const T* const heatMapGpuPtr = nullptr, const T* const peaksGpuPtr = nullptr);
const T* const peaksGpuPtr = nullptr);
// Private functions used by the 2 above functions
template <typename T>
......@@ -25,7 +25,8 @@ namespace op
const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
const int maxPeaks, const T interThreshold, const T interMinAboveThreshold,
const std::vector<unsigned int>& bodyPartPairs, const unsigned int numberBodyParts,
const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex);
const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex,
const Array<T>& precomputedPAFs = Array<T>());
template <typename T>
OP_API void removeSubsetsBelowThresholds(std::vector<int>& validSubsetIndexes, int& numberPeople,
......
......@@ -101,7 +101,7 @@ namespace op
const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
const int maxPeaks, const T interThreshold, const T interMinAboveThreshold,
const std::vector<unsigned int>& bodyPartPairs, const unsigned int numberBodyParts,
const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex)
const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex, const Array<T>& precomputedPAFs)
{
try
{
......@@ -160,6 +160,9 @@ namespace op
auto maxScoreIndex = -1;
if (poseModel == PoseModel::BODY_25E && bodyPartPairsStar[bodyPartB] > -1)
{
if (heatMapPtr == nullptr)
error("HeatMapPtr is null. GPU PAF not implemented for star architecture.",
__LINE__, __FUNCTION__, __FILE__);
const auto pairIndex2 = bodyPartPairsStar[bodyPartB];
const auto* mapX0 = heatMapPtr + (numberBodyPartsAndBkg + pairIndex2) * heatMapOffset;
const auto* mapY0 = heatMapPtr + (numberBodyPartsAndBkg + pairIndex2+1) * heatMapOffset;
......@@ -293,6 +296,7 @@ namespace op
std::vector<std::tuple<double, int, int>> allABConnections;
// Note: Problem of this function, if no right PAF between A and B, both elements are discarded.
// However, they should be added independently, not discarded
if (heatMapPtr != nullptr)
{
const auto* mapX = heatMapPtr + (numberBodyPartsAndBkg + mapIdx[2*pairIndex]) * heatMapOffset;
const auto* mapY = heatMapPtr + (numberBodyPartsAndBkg + mapIdx[2*pairIndex+1]) * heatMapOffset;
......@@ -336,6 +340,25 @@ namespace op
}
}
}
else if (!precomputedPAFs.empty())
{
for (auto i = 1; i <= numberA; i++)
{
// E.g. neck-nose connection. For each nose
for (auto j = 1; j <= numberB; j++)
{
T scoreAB = precomputedPAFs.at({(int)pairIndex, i+(int)bodyPartA, j+(int)bodyPartB});
// E.g. neck-nose connection. If possible PAF between neck i, nose j --> add
// parts score + connection score
if (scoreAB > 1e-6)
allABConnections.emplace_back(std::make_tuple(scoreAB, i, j));
}
}
//error("Not implemented", __LINE__, __FUNCTION__, __FILE__);
}
else
error("Error. Should not reach here.", __LINE__, __FUNCTION__, __FILE__);
// select the top minAB connection, assuming that each part occur only once
// sort rows in descending order based on parts + connection score
......
......@@ -2,33 +2,153 @@
#include <openpose/pose/poseParameters.hpp>
#include <openpose/utilities/fastMath.hpp>
#include <openpose/pose/bodyPartConnectorBase.hpp>
#include <iostream>
namespace op
{
// Device-only rounding helper: shifts the value by 0.5 and converts it to int.
// For floating-point T the conversion truncates toward zero, so the result is the nearest
// integer only for non-negative inputs.
template<typename T>
inline __device__ int intRoundGPU(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<int>(shifted);
}
template <typename T>
void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapPtr,
// Scores a candidate connection between two peaks by sampling a 2-channel vector field
// (mapX, mapY) along the straight segment that goes from peak A to peak B.
//   bodyPartA, bodyPartB: elements [0] and [1] are read as x and y, element [2] as the peak score.
//   mapX, mapY: field channels, addressed as y * heatmapWidth + x.
//   interThreshold: a sample only counts if its projected score is above this value.
//   interMinAboveThreshold: minimum ratio of counted samples required to accept the connection.
//   renderThreshold: peaks whose score ([2]) is below this value are rejected immediately.
// Returns the mean of the counted sample scores, or -1 when the connection is rejected
// (low-score peak, coincident peaks, or not enough samples above interThreshold).
inline __device__ T process(const T* bodyPartA, const T* bodyPartB, const T* mapX, const T* mapY,
                            const int heatmapWidth, const int heatmapHeight, const T interThreshold = T(0.05),
                            const T interMinAboveThreshold = T(0.95), const T renderThreshold = T(0.05))
{
    T finalOutput = -1;
    if (bodyPartA[2] < renderThreshold || bodyPartB[2] < renderThreshold)
        return finalOutput;

    const auto vectorAToBX = bodyPartB[0] - bodyPartA[0];
    const auto vectorAToBY = bodyPartB[1] - bodyPartA[1];
    const auto vectorAToBMax = max(abs(vectorAToBX), abs(vectorAToBY));
    // Number of samples grows with the segment length, limited to the range [5, 25]
    const auto numberPointsInLine = max(5, min(25, intRoundGPU(sqrt(5*vectorAToBMax))));
    const auto vectorNorm = T(sqrt(vectorAToBX*vectorAToBX + vectorAToBY*vectorAToBY));
    // Coincident peaks have no direction to project onto --> rejected
    if (vectorNorm > 1e-6)
    {
        const auto sX = bodyPartA[0];
        const auto sY = bodyPartA[1];
        const auto vectorAToBNormX = vectorAToBX/vectorNorm;
        const auto vectorAToBNormY = vectorAToBY/vectorNorm;

        // Kept as double accumulator so that the returned mean matches the previous implementation
        auto sum = 0.;
        auto count = 0;
        const auto vectorAToBXInLine = vectorAToBX/numberPointsInLine;
        const auto vectorAToBYInLine = vectorAToBY/numberPointsInLine;
        for (auto lm = 0; lm < numberPointsInLine; lm++)
        {
            // Clamp to [0, size-1] on both sides: a negative coordinate (e.g., from an unused or
            // corrupted peak slot) would otherwise produce a negative index and an out-of-bounds read
            const auto mX = max(0, min(heatmapWidth-1, intRoundGPU(sX + lm*vectorAToBXInLine)));
            const auto mY = max(0, min(heatmapHeight-1, intRoundGPU(sY + lm*vectorAToBYInLine)));
            const auto idx = mY * heatmapWidth + mX;
            // Projection of the field at the sample onto the unit vector A->B
            const auto score = (vectorAToBNormX*mapX[idx] + vectorAToBNormY*mapY[idx]);
            if (score > interThreshold)
            {
                sum += score;
                count++;
            }
        }

        // // L2 Hack
        // int l2Dist = (int)sqrt(pow(vectorAToBX,2) + pow(vectorAToBY,2));
        // if (l2Dist <= 2)
        // count = numberPointsInLine;

        // parts score + connection score
        // count > 0 avoids a 0/0 division when interMinAboveThreshold is negative
        if (count > 0 && count/(float)numberPointsInLine > interMinAboveThreshold)
            finalOutput = sum/count;
    }
    return finalOutput;
}
// Computes one connection score per (body part pair, slot of part A, slot of part B).
// Thread layout: x indexes the body part pair (i), y the slot of part A (j), z the slot of part B (k).
// Inputs:
//   heatMapPtr: channel mapIdxPtr[2*i] / mapIdxPtr[2*i+1] is read at offset channel*heatmapWidth*heatmapHeight.
//   peaksPtr: 3 values per slot; the slot (part, j) is read at offset part*poseMaxPeople*3 + j*3.
//   bodyPartPairsPtr: part indexes of pair i are stored at [2*i] and [2*i+1].
// Output:
//   finalOutputPtr[(i*poseMaxPeople + j)*poseMaxPeople + k] = score returned by process(), i.e., the
//   buffer must hold numberBodyPartPairs * poseMaxPeople * poseMaxPeople elements.
// The grid may be bigger than the data on any axis: threads outside
// [0, numberBodyPartPairs) x [0, poseMaxPeople) x [0, poseMaxPeople) do nothing. This is required
// because the launch visible in this file spans POSE_MAX_PEOPLE+1 threads along y and z, which
// would otherwise write to aliased slots and, for the last pair, past the end of the output buffer.
// NOTE(review): process() is called with its default thresholds; the host-side interThreshold and
// interMinAboveThreshold are not forwarded here.
template <typename T>
__global__ void pafScoreKernel(T* finalOutputPtr, const T* const heatMapPtr, const T* const peaksPtr,
                               const unsigned int* const bodyPartPairsPtr, const unsigned int* const mapIdxPtr,
                               const unsigned int poseMaxPeople, const int numberBodyPartPairs,
                               const int heatmapWidth, const int heatmapHeight)
{
    const auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
    const auto j = (blockIdx.y * blockDim.y) + threadIdx.y;
    const auto k = (blockIdx.z * blockDim.z) + threadIdx.z;

    // Bounds check on the 3 axes (not only on the pair index)
    if (i < numberBodyPartPairs && j < poseMaxPeople && k < poseMaxPeople)
    {
        // Body parts connected by this pair
        const int partA = bodyPartPairsPtr[i*2];
        const int partB = bodyPartPairsPtr[i*2 + 1];
        // Field channels (x and y components) associated to this pair
        const int mapIdxX = mapIdxPtr[i*2];
        const int mapIdxY = mapIdxPtr[i*2 + 1];

        const T* const bodyPartA = peaksPtr + (partA*poseMaxPeople*3 + j*3);
        const T* const bodyPartB = peaksPtr + (partB*poseMaxPeople*3 + k*3);
        const T* const mapX = heatMapPtr + mapIdxX*heatmapWidth*heatmapHeight;
        const T* const mapY = heatMapPtr + mapIdxY*heatmapWidth*heatmapHeight;

        const T finalOutput = process(bodyPartA, bodyPartB, mapX, mapY, heatmapWidth, heatmapHeight);
        finalOutputPtr[(i*poseMaxPeople+j)*poseMaxPeople + k] = finalOutput;
    }
}
template <typename T>
void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapGpuPtr,
const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
const int maxPeaks, const T interMinAboveThreshold, const T interThreshold,
const int minSubsetCnt, const T minSubsetScore, const T scaleFactor,
const T* const heatMapGpuPtr, const T* const peaksGpuPtr)
const T* const peaksGpuPtr)
{
try
{
// Parts Connection
const auto& bodyPartPairs = getPosePartPairs(poseModel);
const auto& mapIdxOffset = getPoseMapIndex(poseModel);
const auto numberBodyParts = getPoseNumberBodyParts(poseModel);
const auto numberBodyPartPairs = bodyPartPairs.size() / 2;
const auto subsetCounterIndex = numberBodyParts;
// Update mapIdx
auto mapIdx = mapIdxOffset;
for (auto& i : mapIdx)
i += (numberBodyParts+1);
if (numberBodyParts == 0)
error("Invalid value of numberBodyParts, it must be positive, not " + std::to_string(numberBodyParts),
__LINE__, __FUNCTION__, __FILE__);
// Upload required data to GPU
unsigned int* bodyPartPairsGpuPtr;
cudaMalloc((void **)&bodyPartPairsGpuPtr, bodyPartPairs.size() * sizeof(unsigned int));
cudaMemcpy(bodyPartPairsGpuPtr, &bodyPartPairs[0], bodyPartPairs.size() * sizeof(unsigned int),
cudaMemcpyHostToDevice);
unsigned int* mapIdxGpuPtr;
cudaMalloc((void **)&mapIdxGpuPtr, mapIdx.size() * sizeof(unsigned int));
cudaMemcpy(mapIdxGpuPtr, &mapIdx[0], mapIdx.size() * sizeof(unsigned int), cudaMemcpyHostToDevice);
T* finalOutputGpuPtr;
Array<T> finalOutputCpu;
finalOutputCpu.reset({(int)numberBodyPartPairs, (int)POSE_MAX_PEOPLE, (int)POSE_MAX_PEOPLE},-1);
int totalComputations = numberBodyPartPairs * POSE_MAX_PEOPLE * POSE_MAX_PEOPLE;
cudaMalloc((void **)&finalOutputGpuPtr, totalComputations * sizeof(float));
// Run Kernel
const dim3 threadsPerBlock{4, 8, 8}; //4 is good for BODY_25, 8 for COCO?
if ((POSE_MAX_PEOPLE+1) % threadsPerBlock.y || (POSE_MAX_PEOPLE+1) % threadsPerBlock.z)
error("Invalid value of POSE_MAX_PEOPLE, it must be multiple of 16, rather than "
+ std::to_string(POSE_MAX_PEOPLE), __LINE__, __FUNCTION__, __FILE__);
int pairBlocks = intRound((numberBodyPartPairs/threadsPerBlock.x) + 0.5);
const dim3 numBlocks{(unsigned int)pairBlocks, (POSE_MAX_PEOPLE+1) / threadsPerBlock.y,
(POSE_MAX_PEOPLE+1) / threadsPerBlock.z};
pafScoreKernel<<<numBlocks, threadsPerBlock>>>(
finalOutputGpuPtr, heatMapGpuPtr, peaksGpuPtr, bodyPartPairsGpuPtr, mapIdxGpuPtr,
POSE_MAX_PEOPLE, numberBodyPartPairs, heatMapSize.x, heatMapSize.y);
cudaMemcpy(finalOutputCpu.getPtr(), finalOutputGpuPtr, totalComputations * sizeof(float),
cudaMemcpyDeviceToHost);
// std::vector<std::pair<std::vector<int>, double>> refers to:
// - std::vector<int>: [body parts locations, #body parts found]
// - double: subset score
const T* const tNullptr = nullptr;
const auto subsets = generateInitialSubsets(
heatMapPtr, peaksPtr, poseModel, heatMapSize, maxPeaks, interThreshold, interMinAboveThreshold,
bodyPartPairs, numberBodyParts, numberBodyPartPairs, subsetCounterIndex);
tNullptr, peaksPtr, poseModel, heatMapSize, maxPeaks, interThreshold, interMinAboveThreshold,
bodyPartPairs, numberBodyParts, numberBodyPartPairs, subsetCounterIndex, finalOutputCpu);
// Delete people below the following thresholds:
// a) minSubsetCnt: removed if less than minSubsetCnt body parts
......@@ -45,8 +165,9 @@ namespace op
peaksPtr, numberPeople, numberBodyParts, numberBodyPartPairs);
// Differences w.r.t. CPU version for now
UNUSED(heatMapGpuPtr);
UNUSED(peaksGpuPtr);
cudaFree(bodyPartPairsGpuPtr);
cudaFree(mapIdxGpuPtr);
cudaFree(finalOutputGpuPtr);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
catch (const std::exception& e)
......@@ -56,15 +177,15 @@ namespace op
}
template void connectBodyPartsGpu(Array<float>& poseKeypoints, Array<float>& poseScores,
const float* const heatMapPtr, const float* const peaksPtr,
const float* const heatMapGpuPtr, const float* const peaksPtr,
const PoseModel poseModel, const Point<int>& heatMapSize, const int maxPeaks,
const float interMinAboveThreshold, const float interThreshold,
const int minSubsetCnt, const float minSubsetScore, const float scaleFactor,
const float* const heatMapGpuPtr, const float* const peaksGpuPtr);
const float* const peaksGpuPtr);
template void connectBodyPartsGpu(Array<double>& poseKeypoints, Array<double>& poseScores,
const double* const heatMapPtr, const double* const peaksPtr,
const double* const heatMapGpuPtr, const double* const peaksPtr,
const PoseModel poseModel, const Point<int>& heatMapSize, const int maxPeaks,
const double interMinAboveThreshold, const double interThreshold,
const int minSubsetCnt, const double minSubsetScore, const double scaleFactor,
const double* const heatMapGpuPtr, const double* const peaksGpuPtr);
const double* const peaksGpuPtr);
}
......@@ -163,16 +163,14 @@ namespace op
{
#if defined USE_CAFFE && defined USE_CUDA
const auto heatMapsBlob = bottom.at(0);
const auto* const heatMapsPtr = heatMapsBlob->cpu_data();
const auto* const peaksPtr = bottom.at(1)->cpu_data();
const auto* const heatMapsGpuPtr = heatMapsBlob->gpu_data();
const auto* const peaksGpuPtr = bottom.at(1)->gpu_data();
const auto maxPeaks = mTopSize[1];
connectBodyPartsGpu(poseKeypoints, poseScores, heatMapsPtr, peaksPtr, mPoseModel,
connectBodyPartsGpu(poseKeypoints, poseScores, heatMapsGpuPtr, peaksPtr, mPoseModel,
Point<int>{heatMapsBlob->shape(3), heatMapsBlob->shape(2)},
maxPeaks, mInterMinAboveThreshold, mInterThreshold,
mMinSubsetCnt, mMinSubsetScore, mScaleNetToOutput,
heatMapsGpuPtr, peaksGpuPtr);
mMinSubsetCnt, mMinSubsetScore, mScaleNetToOutput, peaksGpuPtr);
#else
UNUSED(bottom);
UNUSED(poseKeypoints);
......
......@@ -294,7 +294,6 @@ namespace op
upImpl->spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt));
upImpl->spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore));
// CUDA version not implemented yet
// #ifdef USE_CUDA
// upImpl->spBodyPartConnectorCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get(),
// upImpl->spPeaksBlob.get()},
......@@ -304,6 +303,9 @@ namespace op
upImpl->spPeaksBlob.get()},
mPoseKeypoints, mPoseScores);
// #endif
#ifdef USE_CUDA
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#endif
#else
UNUSED(inputNetData);
UNUSED(inputDataSize);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册