diff --git a/include/openpose/pose/bodyPartConnectorBase.hpp b/include/openpose/pose/bodyPartConnectorBase.hpp
index 62033e7c0383b214c9b0b122d5d4144ef660f631..765953ba58503f6060cb499e640c928d50054fbf 100644
--- a/include/openpose/pose/bodyPartConnectorBase.hpp
+++ b/include/openpose/pose/bodyPartConnectorBase.hpp
@@ -13,11 +13,11 @@ namespace op
                                     const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f);
 
     template <typename T>
-    OP_API void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapPtr,
+    OP_API void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapGpuPtr,
                                     const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
                                     const int maxPeaks, const T interMinAboveThreshold, const T interThreshold,
                                     const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f,
-                                    const T* const heatMapGpuPtr = nullptr, const T* const peaksGpuPtr = nullptr);
+                                    const T* const peaksGpuPtr = nullptr);
 
     // Private functions used by the 2 above functions
     template <typename T>
@@ -25,7 +25,8 @@ namespace op
         const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
         const int maxPeaks, const T interThreshold, const T interMinAboveThreshold,
         const std::vector<unsigned int>& bodyPartPairs, const unsigned int numberBodyParts,
-        const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex);
+        const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex,
+        const Array<T>& precomputedPAFs = Array<T>());
 
     template <typename T>
     OP_API void removeSubsetsBelowThresholds(std::vector<int>& validSubsetIndexes, int& numberPeople,
diff --git a/src/openpose/pose/bodyPartConnectorBase.cpp b/src/openpose/pose/bodyPartConnectorBase.cpp
index 4a054750a447ac8a9c9021ddd79068bd1d4eda93..835a5d20e51699e2e740b31bb343c78769b08121 100644
--- a/src/openpose/pose/bodyPartConnectorBase.cpp
+++ b/src/openpose/pose/bodyPartConnectorBase.cpp
@@ -101,7 +101,7 @@ namespace op
         const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
         const int maxPeaks, const T interThreshold, const T interMinAboveThreshold,
         const std::vector<unsigned int>& bodyPartPairs, const unsigned int numberBodyParts,
-        const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex)
+        const unsigned int numberBodyPartPairs, const unsigned int subsetCounterIndex, const Array<T>& precomputedPAFs)
     {
         try
         {
@@ -160,6 +160,9 @@ namespace op
                                     auto maxScoreIndex = -1;
                                     if (poseModel == PoseModel::BODY_25E && bodyPartPairsStar[bodyPartB] > -1)
                                     {
+                                        if (heatMapPtr == nullptr)
+                                            error("HeatMapPtr is null. GPU PAF not implemented for star architecture.",
+                                                  __LINE__, __FUNCTION__, __FILE__);
                                         const auto pairIndex2 = bodyPartPairsStar[bodyPartB];
                                         const auto* mapX0 = heatMapPtr + (numberBodyPartsAndBkg + pairIndex2) * heatMapOffset;
                                         const auto* mapY0 = heatMapPtr + (numberBodyPartsAndBkg + pairIndex2+1) * heatMapOffset;
@@ -293,6 +296,7 @@ namespace op
                     std::vector<std::tuple<double, int, int>> allABConnections;
                     // Note: Problem of this function, if no right PAF between A and B, both elements are discarded.
                     // However, they should be added indepently, not discarded
+                    if (heatMapPtr != nullptr)
                     {
                         const auto* mapX = heatMapPtr + (numberBodyPartsAndBkg + mapIdx[2*pairIndex]) * heatMapOffset;
                         const auto* mapY = heatMapPtr + (numberBodyPartsAndBkg + mapIdx[2*pairIndex+1]) * heatMapOffset;
@@ -336,6 +340,25 @@ namespace op
                             }
                         }
                     }
+                    else if (!precomputedPAFs.empty())
+                    {
+                        for (auto i = 1; i <= numberA; i++)
+                        {
+                            // E.g. neck-nose connection. For each nose
+                            for (auto j = 1; j <= numberB; j++)
+                            {
+                                T scoreAB = precomputedPAFs.at({(int)pairIndex, i+(int)bodyPartA, j+(int)bodyPartB});
+
+                                // E.g. neck-nose connection. If possible PAF between neck i, nose j --> add
+                                // parts score + connection score
+                                if (scoreAB > 1e-6)
+                                    allABConnections.emplace_back(std::make_tuple(scoreAB, i, j));
+                            }
+                        }
+                        //error("Not implemented", __LINE__, __FUNCTION__, __FILE__);
+                    }
+                    else
+                        error("Error. Should not reach here.", __LINE__, __FUNCTION__, __FILE__);
 
                     // select the top minAB connection, assuming that each part occur only once
                     // sort rows in descending order based on parts + connection score
diff --git a/src/openpose/pose/bodyPartConnectorBase.cu b/src/openpose/pose/bodyPartConnectorBase.cu
index f54eaf5d4eae388bdce171d9a29531f190f23e98..2f71d7a5973c900e7f089d96be4f84aa1c9e506d 100644
--- a/src/openpose/pose/bodyPartConnectorBase.cu
+++ b/src/openpose/pose/bodyPartConnectorBase.cu
@@ -2,33 +2,153 @@
 #include <openpose/pose/poseParameters.hpp>
 #include <openpose/utilities/fastMath.hpp>
 #include <openpose/pose/bodyPartConnectorBase.hpp>
+#include <iostream>
 
 namespace op
 {
+    template<typename T>
+    inline __device__ int intRoundGPU(const T a)
+    {
+        return static_cast<int>(a + 0.5f); // add one half, then truncate toward zero (nearest integer for a >= -0.5)
+    }
+    // Device helper: score of the A->B connection sampled on the (mapX, mapY) field; -1 when the connection is rejected.
     template <typename T>
-    void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapPtr,
+    inline __device__  T process(const T* bodyPartA, const T* bodyPartB, const T* mapX, const T* mapY,
+                                 const int heatmapWidth, const int heatmapHeight, const T interThreshold = T(0.05),
+                                 const T interMinAboveThreshold = T(0.95), const T renderThreshold = T(0.05))
+    {
+        T finalOutput = -1;
+        if (bodyPartA[2] < renderThreshold || bodyPartB[2] < renderThreshold) return finalOutput;
+        // Peaks are read as 3 values: [2] is thresholded above, [0]/[1] are the x/y used for the A->B displacement
+        const auto vectorAToBX = bodyPartB[0] - bodyPartA[0];
+        const auto vectorAToBY = bodyPartB[1] - bodyPartA[1];
+        const auto vectorAToBMax = max(abs(vectorAToBX), abs(vectorAToBY));
+        const auto numberPointsInLine = max(5, min(25, intRoundGPU(sqrt(5*vectorAToBMax))));
+        const auto vectorNorm = T(sqrt(vectorAToBX*vectorAToBX + vectorAToBY*vectorAToBY));
+        // (Nearly) coincident points have no usable direction, so they keep the -1 result
+        if (vectorNorm > 1e-6)
+        {
+            const auto sX = bodyPartA[0];
+            const auto sY = bodyPartA[1];
+            const auto vectorAToBNormX = vectorAToBX/vectorNorm;
+            const auto vectorAToBNormY = vectorAToBY/vectorNorm;
+            // Accumulate the projection of the field onto the unit A->B direction at each sampled point
+            auto sum = 0.;
+            auto count = 0;
+            const auto vectorAToBXInLine = vectorAToBX/numberPointsInLine;
+            const auto vectorAToBYInLine = vectorAToBY/numberPointsInLine;
+            for (auto lm = 0; lm < numberPointsInLine; lm++)
+            {
+                const auto mX = max(0, min(heatmapWidth-1, intRoundGPU(sX + lm*vectorAToBXInLine))); // in [0, W-1]
+                const auto mY = max(0, min(heatmapHeight-1, intRoundGPU(sY + lm*vectorAToBYInLine))); // in [0, H-1]
+                const auto idx = mY * heatmapWidth + mX;
+                const auto score = (vectorAToBNormX*mapX[idx] + vectorAToBNormY*mapY[idx]);
+                if (score > interThreshold)
+                {
+                    sum += score;
+                    count++;
+                }
+            }
+
+            // // L2 Hack
+            // int l2Dist = (int)sqrt(pow(vectorAToBX,2) + pow(vectorAToBY,2));
+            // if (l2Dist <= 2)
+            //     count = numberPointsInLine;
+
+            // Accept only if the fraction of samples above interThreshold is large enough; result is their mean score
+            if (count/(float)numberPointsInLine > interMinAboveThreshold)
+                finalOutput = sum/count;
+        }
+
+        return finalOutput;
+    }
+    // Kernel: one thread per (pair i, peak slot j of part A, peak slot k of part B); stores process() at [i][j][k].
+    template <typename T>
+    __global__ void pafScoreKernel(T* finalOutputPtr, const T* const heatMapPtr, const T* const peaksPtr,
+                                   const unsigned int* const bodyPartPairsPtr, const unsigned int* const mapIdxPtr,
+                                   const unsigned int poseMaxPeople, const int numberBodyPartPairs,
+                                   const int heatmapWidth, const int heatmapHeight)
+    {
+        const auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const auto j = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto k = (blockIdx.z * blockDim.z) + threadIdx.z;
+        // The grid may exceed the output volume: j or k == poseMaxPeople would hit a neighbour slot or go past the end
+        if (i < numberBodyPartPairs && j < poseMaxPeople && k < poseMaxPeople)
+        {
+            const int partA = bodyPartPairsPtr[i*2];
+            const int partB = bodyPartPairsPtr[i*2 + 1];
+            const int mapIdxX = mapIdxPtr[i*2];
+            const int mapIdxY = mapIdxPtr[i*2 + 1];
+            // Each peak takes 3 values; a part is addressed here with a stride of poseMaxPeople peaks
+            const T* const bodyPartA = peaksPtr + (partA*poseMaxPeople*3 + j*3);
+            const T* const bodyPartB = peaksPtr + (partB*poseMaxPeople*3 + k*3);
+            const T* const mapX = heatMapPtr + mapIdxX*heatmapWidth*heatmapHeight;
+            const T* const mapY = heatMapPtr + mapIdxY*heatmapWidth*heatmapHeight;
+            // process() runs with its default thresholds and returns -1 when there is no valid connection
+            const T finalOutput = process(bodyPartA, bodyPartB, mapX, mapY, heatmapWidth, heatmapHeight);
+            finalOutputPtr[(i*poseMaxPeople+j)*poseMaxPeople + k] = finalOutput;
+        }
+    }
+
+    template <typename T>
+    void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapGpuPtr,
                              const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
                              const int maxPeaks, const T interMinAboveThreshold, const T interThreshold,
                              const int minSubsetCnt, const T minSubsetScore, const T scaleFactor,
-                             const T* const heatMapGpuPtr, const T* const peaksGpuPtr)
+                             const T* const peaksGpuPtr)
     {
         try
         {
             // Parts Connection
             const auto& bodyPartPairs = getPosePartPairs(poseModel);
+            const auto& mapIdxOffset = getPoseMapIndex(poseModel);
             const auto numberBodyParts = getPoseNumberBodyParts(poseModel);
             const auto numberBodyPartPairs = bodyPartPairs.size() / 2;
             const auto subsetCounterIndex = numberBodyParts;
+            // Update mapIdx
+            auto mapIdx = mapIdxOffset;
+            for (auto& i : mapIdx)
+                i += (numberBodyParts+1);
+
             if (numberBodyParts == 0)
                 error("Invalid value of numberBodyParts, it must be positive, not " + std::to_string(numberBodyParts),
                       __LINE__, __FUNCTION__, __FILE__);
 
+            // Upload the body-part pair table and the shifted map index table to the GPU
+            unsigned int* bodyPartPairsGpuPtr;
+            cudaMalloc((void **)&bodyPartPairsGpuPtr, bodyPartPairs.size() * sizeof(unsigned int));
+            cudaMemcpy(bodyPartPairsGpuPtr, &bodyPartPairs[0], bodyPartPairs.size() * sizeof(unsigned int),
+                       cudaMemcpyHostToDevice);
+            unsigned int* mapIdxGpuPtr;
+            cudaMalloc((void **)&mapIdxGpuPtr, mapIdx.size() * sizeof(unsigned int));
+            cudaMemcpy(mapIdxGpuPtr, &mapIdx[0], mapIdx.size() * sizeof(unsigned int), cudaMemcpyHostToDevice);
+            T* finalOutputGpuPtr;
+            Array<T> finalOutputCpu;
+            finalOutputCpu.reset({(int)numberBodyPartPairs, (int)POSE_MAX_PEOPLE, (int)POSE_MAX_PEOPLE},-1);
+            int totalComputations = numberBodyPartPairs * POSE_MAX_PEOPLE * POSE_MAX_PEOPLE;
+            cudaMalloc((void **)&finalOutputGpuPtr, totalComputations * sizeof(T)); // elements are T (may be double)
+
+            // Run the kernel: x covers the part pairs, y and z cover the peak slots of each part in the pair
+            const dim3 threadsPerBlock{4, 8, 8}; //4 is good for BODY_25, 8 for COCO?
+            if ((POSE_MAX_PEOPLE+1) % threadsPerBlock.y || (POSE_MAX_PEOPLE+1) % threadsPerBlock.z)
+                error("Invalid value of POSE_MAX_PEOPLE, POSE_MAX_PEOPLE + 1 must be multiple of 8, rather than "
+                      + std::to_string(POSE_MAX_PEOPLE + 1), __LINE__, __FUNCTION__, __FILE__);
+            int pairBlocks = intRound((numberBodyPartPairs/threadsPerBlock.x) + 0.5);
+            const dim3 numBlocks{(unsigned int)pairBlocks, (POSE_MAX_PEOPLE+1) / threadsPerBlock.y,
+                                 (POSE_MAX_PEOPLE+1) / threadsPerBlock.z};
+            pafScoreKernel<<<numBlocks, threadsPerBlock>>>(
+                finalOutputGpuPtr, heatMapGpuPtr, peaksGpuPtr, bodyPartPairsGpuPtr, mapIdxGpuPtr,
+                POSE_MAX_PEOPLE, numberBodyPartPairs, heatMapSize.x, heatMapSize.y);
+            cudaMemcpy(finalOutputCpu.getPtr(), finalOutputGpuPtr, totalComputations * sizeof(T),
+                       cudaMemcpyDeviceToHost);
+
             // std::vector<std::pair<std::vector<int>, double>> refers to:
             //     - std::vector<int>: [body parts locations, #body parts found]
             //     - double: subset score
+            const T* const tNullptr = nullptr;
             const auto subsets = generateInitialSubsets(
-                heatMapPtr, peaksPtr, poseModel, heatMapSize, maxPeaks, interThreshold, interMinAboveThreshold,
-                bodyPartPairs, numberBodyParts, numberBodyPartPairs, subsetCounterIndex);
+                tNullptr, peaksPtr, poseModel, heatMapSize, maxPeaks, interThreshold, interMinAboveThreshold,
+                bodyPartPairs, numberBodyParts, numberBodyPartPairs, subsetCounterIndex, finalOutputCpu);
 
             // Delete people below the following thresholds:
                 // a) minSubsetCnt: removed if less than minSubsetCnt body parts
@@ -45,8 +165,9 @@ namespace op
                                             peaksPtr, numberPeople, numberBodyParts, numberBodyPartPairs);
 
             // Differences w.r.t. CPU version for now
-            UNUSED(heatMapGpuPtr);
-            UNUSED(peaksGpuPtr);
+            cudaFree(bodyPartPairsGpuPtr);
+            cudaFree(mapIdxGpuPtr);
+            cudaFree(finalOutputGpuPtr);
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
         }
         catch (const std::exception& e)
@@ -56,15 +177,15 @@ namespace op
     }
 
     template void connectBodyPartsGpu(Array<float>& poseKeypoints, Array<float>& poseScores,
-                                      const float* const heatMapPtr, const float* const peaksPtr,
+                                      const float* const heatMapGpuPtr, const float* const peaksPtr,
                                       const PoseModel poseModel, const Point<int>& heatMapSize, const int maxPeaks,
                                       const float interMinAboveThreshold, const float interThreshold,
                                       const int minSubsetCnt, const float minSubsetScore, const float scaleFactor,
-                                      const float* const heatMapGpuPtr, const float* const peaksGpuPtr);
+                                      const float* const peaksGpuPtr);
     template void connectBodyPartsGpu(Array<double>& poseKeypoints, Array<double>& poseScores,
-                                      const double* const heatMapPtr, const double* const peaksPtr,
+                                      const double* const heatMapGpuPtr, const double* const peaksPtr,
                                       const PoseModel poseModel, const Point<int>& heatMapSize, const int maxPeaks,
                                       const double interMinAboveThreshold, const double interThreshold,
                                       const int minSubsetCnt, const double minSubsetScore, const double scaleFactor,
-                                      const double* const heatMapGpuPtr, const double* const peaksGpuPtr);
+                                      const double* const peaksGpuPtr);
 }
diff --git a/src/openpose/pose/bodyPartConnectorCaffe.cpp b/src/openpose/pose/bodyPartConnectorCaffe.cpp
index d1f8e427f84e13add62559f16093b85277046939..e4c67a3c9ce55524b1915a866bc595eb8d2b9d96 100644
--- a/src/openpose/pose/bodyPartConnectorCaffe.cpp
+++ b/src/openpose/pose/bodyPartConnectorCaffe.cpp
@@ -163,16 +163,14 @@ namespace op
         {
             #if defined USE_CAFFE && defined USE_CUDA
                 const auto heatMapsBlob = bottom.at(0);
-                const auto* const heatMapsPtr = heatMapsBlob->cpu_data();
                 const auto* const peaksPtr = bottom.at(1)->cpu_data();
                 const auto* const heatMapsGpuPtr = heatMapsBlob->gpu_data();
                 const auto* const peaksGpuPtr = bottom.at(1)->gpu_data();
                 const auto maxPeaks = mTopSize[1];
-                connectBodyPartsGpu(poseKeypoints, poseScores, heatMapsPtr, peaksPtr, mPoseModel,
+                connectBodyPartsGpu(poseKeypoints, poseScores, heatMapsGpuPtr, peaksPtr, mPoseModel,
                                     Point<int>{heatMapsBlob->shape(3), heatMapsBlob->shape(2)},
                                     maxPeaks, mInterMinAboveThreshold, mInterThreshold,
-                                    mMinSubsetCnt, mMinSubsetScore, mScaleNetToOutput,
-                                    heatMapsGpuPtr, peaksGpuPtr);
+                                    mMinSubsetCnt, mMinSubsetScore, mScaleNetToOutput, peaksGpuPtr);
             #else
                 UNUSED(bottom);
                 UNUSED(poseKeypoints);
diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp
index cf03ec8f1cf21bb4da4477bb47aef76c8b9793f9..22b74d1511e39c51729295e1e98f7e0b577341cd 100644
--- a/src/openpose/pose/poseExtractorCaffe.cpp
+++ b/src/openpose/pose/poseExtractorCaffe.cpp
@@ -294,7 +294,6 @@ namespace op
                 upImpl->spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt));
                 upImpl->spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore));
 
-                // CUDA version not implemented yet
                 // #ifdef USE_CUDA
                 //     upImpl->spBodyPartConnectorCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get(),
                 //                                                    upImpl->spPeaksBlob.get()},
@@ -304,6 +303,9 @@ namespace op
                                                                    upImpl->spPeaksBlob.get()},
                                                                   mPoseKeypoints, mPoseScores);
                 // #endif
+                #ifdef USE_CUDA
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                #endif
             #else
                 UNUSED(inputNetData);
                 UNUSED(inputDataSize);