diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b58bcc283aedc9afc995c39bc8791d5c528e96c..8d2565ae787dc85bc170f2c74eb0046af37825bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -235,9 +235,12 @@ option(DOWNLOAD_HAND_MODEL "Download hand model." ON) option(BUILD_EXAMPLES "Build OpenPose examples." ON) option(BUILD_DOCS "Build OpenPose documentation." OFF) option(BUILD_PYTHON "Build OpenPose python." OFF) +if (WIN32) + option(BUILD_DLL "Copy all required DLL files into the same folder." ON) +endif () # Build as shared library -option(BUILD_SHARED_LIBS "Build as shared lib" ON) +option(BUILD_SHARED_LIBS "Build as shared lib." ON) # Speed profiler option(PROFILER_ENABLED "If enabled, OpenPose will be able to print out speed information at runtime." OFF) @@ -467,14 +470,18 @@ if (WIN32) find_library(Caffe_Proto_LIB caffeproto HINTS ${FIND_LIB_PREFIX}/caffe/lib) endif (${GPU_MODE} MATCHES "CPU_ONLY") endif (${GPU_MODE} MATCHES "OPENCL") - - if (${GPU_MODE} MATCHES "OPENCL") - unset(BOOST_SYSTEM_LIB_RELEASE CACHE) - unset(BOOST_SYSTEM_LIB_DEBUG CACHE) - find_library(BOOST_SYSTEM_LIB_RELEASE boost_system-vc140-mt-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib) - find_library(BOOST_SYSTEM_LIB_DEBUG boost_system-vc140-mt-gd-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib) - endif (${GPU_MODE} MATCHES "OPENCL") - + # Boost DepCopy over required DLL F + if (${GPU_MODE} MATCHES "CPU_ONLY" OR ${GPU_MODE} MATCHES "OPENCL" OR BUILD_PYTHON) + find_library(BOOST_SYSTEM_LIB_RELEASE libboost_system-vc140-mt-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib) + find_library(BOOST_SYSTEM_LIB_DEBUG libboost_system-vc140-mt-gd-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib) + find_library(BOOST_FILESYSTEM_LIB_RELEASE libboost_filesystem-vc140-mt-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib) + find_library(BOOST_FILESYSTEM_LIB_DEBUG libboost_filesystem-vc140-mt-gd-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib) + else () + set(BOOST_SYSTEM_LIB_RELEASE "") + 
set(BOOST_SYSTEM_LIB_DEBUG "") + set(BOOST_FILESYSTEM_LIB_RELEASE "") + set(BOOST_FILESYSTEM_LIB_DEBUG "") + endif () if (WITH_3D_RENDERER) find_library(GLUT_LIBRARY freeglut HINTS ${FIND_LIB_PREFIX}/freeglut/lib) message(STATUS "\${GLUT_LIBRARY} = ${GLUT_LIBRARY}") @@ -507,6 +514,32 @@ if (WIN32) set(SPINNAKER_INCLUDE_DIRS "3rdparty/windows/spinnaker/include") endif (WITH_FLIR_CAMERA) set(Caffe_FOUND 1) + + # Build DLL Must be on if Build Python is on + if (BUILD_PYTHON) + if (NOT BUILD_DLL) + message(FATAL_ERROR "BUILD_DLL must be turned on to as well to build python library") + endif () + endif () + + # Auto copy DLLs + if (BUILD_DLL) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + # Auto copy DLLs + if (${GPU_MODE} MATCHES "CUDA") + file(GLOB CAFFE_DLL "${CMAKE_SOURCE_DIR}/3rdparty/windows/caffe/bin/*.dll") + elseif (${GPU_MODE} MATCHES "OPENCL") + file(GLOB CAFFE_DLL "${CMAKE_SOURCE_DIR}/3rdparty/windows/caffe_opencl/bin/*.dll") + elseif (${GPU_MODE} MATCHES "CPU_ONLY") + file(GLOB CAFFE_DLL "${CMAKE_SOURCE_DIR}/3rdparty/windows/caffe_cpu/bin/*.dll") + endif () + file(GLOB OPENCV_DLL "${CMAKE_SOURCE_DIR}/3rdparty/windows/opencv/x64/vc14/bin/*.dll") + file(GLOB OPENCV3PTY_DLL "${CMAKE_SOURCE_DIR}/3rdparty/windows/caffe3rdparty/lib/*.dll") + file(COPY ${CAFFE_DLL} DESTINATION ${CMAKE_BINARY_DIR}/lib) + file(COPY ${OPENCV_DLL} DESTINATION ${CMAKE_BINARY_DIR}/lib) + file(COPY ${OPENCV3PTY_DLL} DESTINATION ${CMAKE_BINARY_DIR}/lib) + endif () + endif (WIN32) @@ -739,11 +772,16 @@ if (USE_MKL) endif (USE_MKL) if (${GPU_MODE} MATCHES "OPENCL") set(OpenPose_3rdparty_libraries ${OpenPose_3rdparty_libraries} ${CMAKE_THREAD_LIBS_INIT} ${OpenCL_LIBRARIES}) - if (WIN32) - set(OpenPose_3rdparty_libraries ${OpenPose_3rdparty_libraries} - debug ${BOOST_SYSTEM_LIB_DEBUG} optimized ${BOOST_SYSTEM_LIB_RELEASE}) - endif (WIN32) endif (${GPU_MODE} MATCHES "OPENCL") +# Boost +if (WIN32) + if (${GPU_MODE} MATCHES "CPU_ONLY" OR ${GPU_MODE} MATCHES "OPENCL" OR BUILD_PYTHON) + 
set(OpenPose_3rdparty_libraries ${OpenPose_3rdparty_libraries} + debug ${BOOST_SYSTEM_LIB_RELEASE} optimized ${BOOST_SYSTEM_LIB_RELEASE}) + set(OpenPose_3rdparty_libraries ${OpenPose_3rdparty_libraries} + debug ${BOOST_FILESYSTEM_LIB_RELEASE} optimized ${BOOST_FILESYSTEM_LIB_RELEASE}) + endif () +endif (WIN32) # 3-D if (WITH_3D_ADAM_MODEL) set(OpenPose_3rdparty_libraries ${OpenPose_3rdparty_libraries} diff --git a/doc/modules/python_module.md b/doc/modules/python_module.md index 770a77ec4949f7a0c6eda81638ec1a275595ccf4..1dc9b00dd511f9f392e753725367d58df30d979c 100644 --- a/doc/modules/python_module.md +++ b/doc/modules/python_module.md @@ -9,14 +9,23 @@ OpenPose Python Module ## Introduction -This experimental module exposes a Python API for OpenPose. This allows you to construct an OpenPose object, pass in a numpy array for an image, and get a numpy array of the pose positions. This API also exposes an API that allows you to directly pass in heatmaps from a network and extract poses out of it. - +This experimental module exposes a Python API for OpenPose. This allows you to construct an OpenPose object, pass in a numpy array for an image, and get a numpy array of the pose positions. This API also exposes an API that allows you to directly pass in heatmaps from a network and extract poses out of it (Requires Python Caffe to be installed separately) +At present the Python API only supports body pose. Hands and Face will be added in the future. ## Installation Check [doc/installation.md#python-module](./installation.md#python-module) for installation steps. +To simply test the OpenPose API in your project without installation, ensure that the line `sys.path.append('{OpenPose_path}/python')` is set in your *.py files, where `{OpenPose_path}` points to your build folder of OpenPose. Take a look at `build/examples/tutorial_pose/1_extract_pose.py` for an example. + +On an Ubuntu or OSX based system, you may use it globally. 
Running `sudo make install` will install OpenPose by default into `/usr/local/python`. You can set this into your python path and start using it at any location. +The Python API requires Numpy for array management, and OpenCV for image loading. They can be installed via: + +``` +pip install numpy +pip install opencv-python +``` ## Compatibility The OpenPose Python module is compatible with both Python 2 and Python 3. In addition, it will also run in all OpenPose compatible operating systems. @@ -26,56 +35,16 @@ The OpenPose Python module is compatible with both Python 2 and Python 3. In add ## Testing Two examples can be found in `build/examples/tutorial_python` in your build folder. Navigate directly to this path to run examples. - - `1_extract_pose` demonstrates a simple use of the API. - - `2_pose_from_heatmaps` demonstrates constructing pose from heatmaps from the caffe network. +- `1_extract_pose` demonstrates a simple use of the API. +- `2_pose_from_heatmaps` demonstrates constructing pose from heatmaps from the caffe network. (Requires Python Caffe to be installed separately) ``` # From command line cd build/examples/tutorial_python -python +python 1_extract_pose.py ``` -```python -# From Python -# It requires OpenCV installed for Python -import cv2 -import os -import sys - -# Remember to add your installation path here -# Option a -sys.path.append('{OpenPose_path}/python') -# Option b -# If you run `make install` (default path is `/usr/local/python` for Ubuntu), you can also access the OpenPose/python module from there. This will install OpenPose and the python library at your desired installation path. Ensure that this is in your python path in order to use it. -# sys.path.append('/usr/local/python') - -from openpose import * - -# Parameters for OpenPose. Take a look at C++ OpenPose example for meaning of components. 
Ensure all below are filled -params = dict() -params["logging_level"] = 3 -params["output_resolution"] = "-1x-1" -params["net_resolution"] = "-1x368" -params["model_pose"] = "BODY_25" -params["alpha_pose"] = 0.6 -params["scale_gap"] = 0.3 -params["scale_number"] = 1 -params["render_threshold"] = 0.05 -params["num_gpu_start"] = 0 -# If GPU version is built, and multiple GPUs are available, set the ID here -params["disable_blending"] = False -params["default_model_folder"] = "/home/user/openpose/models" -# Construct OpenPose object allocates GPU memory -openpose = OpenPose(params) - -while 1: - # Read new image - img = cv2.imread("image.png") - # Output keypoints and the image with the human skeleton blended on it - keypoints, output_image = openpose.forward(img, True) - # Print the human pose keypoints, i.e., a [#people x #keypoints x 3]-dimensional numpy object with the keypoints of all the people on that image - print keypoints - # Display the image - cv2.imshow("output", output_image) - cv2.waitKey(15) -``` + + +## Code Sample +See `examples/tutorial_python/1_extract_pose.py`. diff --git a/examples/tutorial_python/1_extract_pose.py b/examples/tutorial_python/1_extract_pose.py index ccf9a7fee215cd56c5d2c27aaa915471648ac3e0..dfc48fead660e5c04676c836b49b5499b9d69167 100644 --- a/examples/tutorial_python/1_extract_pose.py +++ b/examples/tutorial_python/1_extract_pose.py @@ -1,27 +1,45 @@ +# From Python +# It requires OpenCV installed for Python import sys import cv2 import os +from sys import platform + +# Remember to add your installation path here +# Option a dir_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.append('../../python') -from openpose import * +if platform == "win32": sys.path.append(dir_path + '/../../python/openpose/'); +else: sys.path.append('../../python'); +# Option b +# If you run `make install` (default path is `/usr/local/python` for Ubuntu), you can also access the OpenPose/python module from there. 
This will install OpenPose and the python library at your desired installation path. Ensure that this is in your python path in order to use it. +# sys.path.append('/usr/local/python') +# Parameters for OpenPose. Take a look at C++ OpenPose example for meaning of components. Ensure all below are filled +from openpose import * params = dict() params["logging_level"] = 3 params["output_resolution"] = "-1x-1" params["net_resolution"] = "-1x368" -params["model_pose"] = "COCO" +params["model_pose"] = "BODY_25" params["alpha_pose"] = 0.6 params["scale_gap"] = 0.3 params["scale_number"] = 1 params["render_threshold"] = 0.05 +# If GPU version is built, and multiple GPUs are available, set the ID here params["num_gpu_start"] = 0 params["disable_blending"] = False +# Ensure you point to the correct path where models are located params["default_model_folder"] = dir_path + "/../../../models/" +# Construct OpenPose object allocates GPU memory openpose = OpenPose(params) -img = cv2.imread(dir_path + "/../../../examples/media/COCO_val2014_000000000192.jpg") -arr, output_image = openpose.forward(img, True) -print arr while 1: + # Read new image + img = cv2.imread("image.png") + # Output keypoints and the image with the human skeleton blended on it + keypoints, output_image = openpose.forward(img, True) + # Print the human pose keypoints, i.e., a [#people x #keypoints x 3]-dimensional numpy object with the keypoints of all the people on that image + print(keypoints) + # Display the image cv2.imshow("output", output_image) cv2.waitKey(15) diff --git a/examples/tutorial_python/2_pose_from_heatmaps.py b/examples/tutorial_python/2_pose_from_heatmaps.py index a07837078f881d4680f031d81b4d54c3855e8d3e..b615d4bac56703cd88beb8ef5bf9d856217032d6 100644 --- a/examples/tutorial_python/2_pose_from_heatmaps.py +++ b/examples/tutorial_python/2_pose_from_heatmaps.py @@ -1,3 +1,12 @@ +from sys import platform +import sys +try: + import caffe +except ImportError: + print("This sample can only be run 
if Python Caffe if available on your system") + print("Currently OpenPose does not compile Python Caffe. This may be supported in the future") + sys.exit(-1) + import os os.environ["GLOG_minloglevel"] = "1" import caffe @@ -36,7 +45,7 @@ caffe.set_device(0) nets = [] for scale in scales: nets.append(caffe.Net(Param.prototxt, Param.caffemodel, caffe.TEST)) -print "Net loaded" +print("Net loaded") # Test Function first_run = True @@ -57,7 +66,7 @@ def func(frame): net.reshape() first_run = False - print "Reshaped" + print("Reshaped") # Forward pass to get heatmaps heatmaps = [] diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c7f3cc7155b5e01078c4bcaf712ada68d3a475f2..da8ab7a7a9fe007e850fe9fdf3b1b026743fb7f7 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,2 +1,2 @@ -add_subdirectory(openpose) - +add_subdirectory(openpose) + diff --git a/python/openpose/CMakeLists.txt b/python/openpose/CMakeLists.txt index c266faa32a75c546bb6759c8c027fae75b0ace47..280348fda3fb86f55e4a71a7bb251392bb8107d5 100644 --- a/python/openpose/CMakeLists.txt +++ b/python/openpose/CMakeLists.txt @@ -1,14 +1,14 @@ -set(PYTHON_FILES - openpose.py - __init__.py - _openpose.cpp) - -add_library(_openpose SHARED ${PYTHON_FILES}) -target_link_libraries(_openpose openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${OpenCL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) -SET_TARGET_PROPERTIES(_openpose PROPERTIES PREFIX "") -configure_file(openpose.py openpose.py) -configure_file(__init__.py __init__.py) - -#install(TARGETS _openpose DESTINATION python) -install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ DESTINATION python/openpose FILES_MATCHING PATTERN "*.so") -install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ DESTINATION python/openpose FILES_MATCHING PATTERN "*.py") +set(PYTHON_FILES + openpose.py + __init__.py + _openpose.cpp) + +add_library(_openpose SHARED ${PYTHON_FILES}) +target_link_libraries(_openpose openpose 
${OpenPose_3rdparty_libraries}) +SET_TARGET_PROPERTIES(_openpose PROPERTIES PREFIX "") +configure_file(openpose.py openpose.py) +configure_file(__init__.py __init__.py) + +#install(TARGETS _openpose DESTINATION python) +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ DESTINATION python/openpose FILES_MATCHING PATTERN "*.so") +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ DESTINATION python/openpose FILES_MATCHING PATTERN "*.py") \ No newline at end of file diff --git a/python/openpose/__init__.py b/python/openpose/__init__.py index 53bfb853e0a608b78385bcce3e502c845e6ce0a9..60c2d27823fbe8f1e228607dd12af8359693ee71 100644 --- a/python/openpose/__init__.py +++ b/python/openpose/__init__.py @@ -1 +1 @@ -from openpose import * +from openpose import * diff --git a/python/openpose/_openpose.cpp b/python/openpose/_openpose.cpp index 0b95fcb105022ae22ba7d61829ba58df8a8de854..9e9a433a67434452c6e8f74913e8bc4d797251fa 100644 --- a/python/openpose/_openpose.cpp +++ b/python/openpose/_openpose.cpp @@ -1,343 +1,350 @@ -#ifndef OPENPOSE_PYTHON_HPP -#define OPENPOSE_PYTHON_HPP - -// OpenPose dependencies -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define default_logging_level 3 -#define default_output_resolution "-1x-1" -#define default_net_resolution "-1x368" -#define default_model_pose "COCO" -#define default_alpha_pose 0.6 -#define default_scale_gap 0.3 -#define default_scale_number 1 -#define default_render_threshold 0.05 -#define default_num_gpu_start 0 -#define default_disable_blending false -#define default_model_folder "models/" - -// Todo, have GPU Number, handle, OpenCL/CPU Cases - -class OpenPose{ -public: - std::unique_ptr poseExtractorCaffe; - std::unique_ptr poseRenderer; - std::unique_ptr frameDisplayer; - std::unique_ptr scaleAndSizeExtractor; - - std::unique_ptr> resizeAndMergeCaffe; - std::unique_ptr> nmsCaffe; - std::unique_ptr> 
bodyPartConnectorCaffe; - std::shared_ptr> heatMapsBlob; - std::shared_ptr> peaksBlob; - op::Array mPoseKeypoints; - op::Array mPoseScores; - op::PoseModel poseModel; - int mGpuID; - - OpenPose(int FLAGS_logging_level = default_logging_level, - std::string FLAGS_output_resolution = default_output_resolution, - std::string FLAGS_net_resolution = default_net_resolution, - std::string FLAGS_model_pose = default_model_pose, - float FLAGS_alpha_pose = default_alpha_pose, - float FLAGS_scale_gap = default_scale_gap, - int FLAGS_scale_number = default_scale_number, - float FLAGS_render_threshold = default_render_threshold, - int FLAGS_num_gpu_start = default_num_gpu_start, - int FLAGS_disable_blending = default_disable_blending, - std::string FLAGS_model_folder = default_model_folder - ){ - mGpuID = FLAGS_num_gpu_start; - #ifdef USE_CUDA - caffe::Caffe::set_mode(caffe::Caffe::GPU); - caffe::Caffe::SetDevice(mGpuID); - #elif USE_OPENCL - caffe::Caffe::set_mode(caffe::Caffe::GPU); - std::vector devices; - const int maxNumberGpu = op::OpenCL::getTotalGPU(); - for (auto i = 0; i < maxNumberGpu; i++) - devices.emplace_back(i); - caffe::Caffe::SetDevices(devices); - caffe::Caffe::SelectDevice(mGpuID, true); - op::OpenCL::getInstance(mGpuID, CL_DEVICE_TYPE_GPU, true); - #else - caffe::Caffe::set_mode(caffe::Caffe::CPU); - #endif - op::log("OpenPose Library Python Wrapper", op::Priority::High); - // ------------------------- INITIALIZATION ------------------------- - // Step 1 - Set logging level - // - 0 will output all the logging messages - // - 255 will output nothing - op::ConfigureLog::setPriorityThreshold((op::Priority)FLAGS_logging_level); - op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__); - // Step 2 - Read Google flags (user defined configuration) - // outputSize - const auto outputSize = op::flagsToPoint(FLAGS_output_resolution, "-1x-1"); - // netInputSize - const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "-1x368"); - // poseModel - 
poseModel = op::flagsToPoseModel(FLAGS_model_pose); - // Check no contradictory flags enabled - if (FLAGS_alpha_pose < 0. || FLAGS_alpha_pose > 1.) - op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__); - if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1) - op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", - __LINE__, __FUNCTION__, __FILE__); - // Logging - op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__); - // Step 3 - Initialize all required classes - scaleAndSizeExtractor = std::unique_ptr(new op::ScaleAndSizeExtractor(netInputSize, outputSize, FLAGS_scale_number, FLAGS_scale_gap)); - - poseExtractorCaffe = std::unique_ptr(new op::PoseExtractorCaffe{poseModel, FLAGS_model_folder, FLAGS_num_gpu_start}); - - poseRenderer = std::unique_ptr(new op::PoseCpuRenderer{poseModel, (float)FLAGS_render_threshold, !FLAGS_disable_blending, - (float)FLAGS_alpha_pose}); - frameDisplayer = std::unique_ptr(new op::FrameDisplayer{"OpenPose Tutorial - Example 1", outputSize}); - - // Custom - resizeAndMergeCaffe = std::unique_ptr>(new op::ResizeAndMergeCaffe{}); - nmsCaffe = std::unique_ptr>(new op::NmsCaffe{}); - bodyPartConnectorCaffe = std::unique_ptr>(new op::BodyPartConnectorCaffe{}); - heatMapsBlob = {std::make_shared>(1,1,1,1)}; - peaksBlob = {std::make_shared>(1,1,1,1)}; - bodyPartConnectorCaffe->setPoseModel(poseModel); - - // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. 
we init resources here) - poseExtractorCaffe->initializationOnThread(); - poseRenderer->initializationOnThread(); - } - - std::vector*> caffeNetSharedToPtr( - std::vector>>& caffeNetOutputBlob) - { - try - { - // Prepare spCaffeNetOutputBlobss - std::vector*> caffeNetOutputBlobs(caffeNetOutputBlob.size()); - for (auto i = 0u ; i < caffeNetOutputBlobs.size() ; i++) - caffeNetOutputBlobs[i] = caffeNetOutputBlob[i].get(); - return caffeNetOutputBlobs; - } - catch (const std::exception& e) - { - op::error(e.what(), __LINE__, __FUNCTION__, __FILE__); - return {}; - } - } - - void forward(const cv::Mat& inputImage, op::Array& poseKeypoints, cv::Mat& displayImage, bool display = false){ - op::OpOutputToCvMat opOutputToCvMat; - op::CvMatToOpInput cvMatToOpInput; - op::CvMatToOpOutput cvMatToOpOutput; - if(inputImage.empty()) - op::error("Could not open or find the image: ", __LINE__, __FUNCTION__, __FILE__); - const op::Point imageSize{inputImage.cols, inputImage.rows}; - // Step 2 - Get desired scale sizes - std::vector scaleInputToNetInputs; - std::vector> netInputSizes; - double scaleInputToOutput; - op::Point outputResolution; - std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution) - = scaleAndSizeExtractor->extract(imageSize); - // Step 3 - Format input image to OpenPose input and output formats - const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes); - - // Step 4 - Estimate poseKeypoints - poseExtractorCaffe->forwardPass(netInputArray, imageSize, scaleInputToNetInputs); - poseKeypoints = poseExtractorCaffe->getPoseKeypoints(); - - if(display){ - auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution); - // Step 5 - Render poseKeypoints - poseRenderer->renderPose(outputArray, poseKeypoints, scaleInputToOutput); - // Step 6 - OpenPose output format to cv::Mat - displayImage = opOutputToCvMat.formatToCvMat(outputArray); - } - } - - void 
poseFromHeatmap(const cv::Mat& inputImage, std::vector>>& caffeNetOutputBlob, op::Array& poseKeypoints, cv::Mat& displayImage, std::vector>& imageSizes){ - // Get Scale - const op::Point inputDataSize{inputImage.cols, inputImage.rows}; - - // Convert to Ptr - //std::vector>> a; - //caffeNetOutputBlob.emplace_back(caffeHmPtr); - const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob); - - // To be called once only - resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, {heatMapsBlob.get()}, - op::getPoseNetDecreaseFactor(poseModel), 1.f/1.f, true, - 0); - nmsCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()}, op::getPoseMaxPeaks(poseModel), - op::getPoseNumberBodyParts(poseModel), 0); - bodyPartConnectorCaffe->Reshape({heatMapsBlob.get(), peaksBlob.get()}); - - // Normal - op::OpOutputToCvMat opOutputToCvMat; - op::CvMatToOpInput cvMatToOpInput; - op::CvMatToOpOutput cvMatToOpOutput; - if(inputImage.empty()) - op::error("Could not open or find the image: ", __LINE__, __FUNCTION__, __FILE__); - const op::Point imageSize{inputImage.cols, inputImage.rows}; - // Step 2 - Get desired scale sizes - std::vector scaleInputToNetInputs; - std::vector> netInputSizes; - double scaleInputToOutput; - op::Point outputResolution; - - std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution) - = scaleAndSizeExtractor->extract(imageSize); - - const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes); - - // Run the modes - const std::vector floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end()); - resizeAndMergeCaffe->setScaleRatios(floatScaleRatios); - std::vector*> heatMapsBlobs{heatMapsBlob.get()}; - std::vector*> peaksBlobs{peaksBlob.get()}; - #ifdef USE_CUDA - resizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs, heatMapsBlobs); // ~5ms - #elif USE_OPENCL - resizeAndMergeCaffe->Forward_ocl(caffeNetOutputBlobs, heatMapsBlobs); // ~5ms - #else - 
resizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, heatMapsBlobs); // ~5ms - #endif - - nmsCaffe->setThreshold((float)poseExtractorCaffe->get(op::PoseProperty::NMSThreshold)); - #ifdef USE_CUDA - nmsCaffe->Forward_gpu(heatMapsBlobs, peaksBlobs);// ~2ms - #elif USE_OPENCL - nmsCaffe->Forward_ocl(heatMapsBlobs, peaksBlobs);// ~2ms - #else - nmsCaffe->Forward_cpu(heatMapsBlobs, peaksBlobs);// ~2ms - #endif - op::cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - float mScaleNetToOutput = 1./scaleInputToNetInputs[0]; - bodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); - bodyPartConnectorCaffe->setInterMinAboveThreshold( - (float)poseExtractorCaffe->get(op::PoseProperty::ConnectInterMinAboveThreshold) - ); - bodyPartConnectorCaffe->setInterThreshold((float)poseExtractorCaffe->get(op::PoseProperty::ConnectInterThreshold)); - bodyPartConnectorCaffe->setMinSubsetCnt((int)poseExtractorCaffe->get(op::PoseProperty::ConnectMinSubsetCnt)); - bodyPartConnectorCaffe->setMinSubsetScore((float)poseExtractorCaffe->get(op::PoseProperty::ConnectMinSubsetScore)); - - bodyPartConnectorCaffe->Forward_cpu({heatMapsBlob.get(), - peaksBlob.get()}, - mPoseKeypoints, mPoseScores); - poseKeypoints = mPoseKeypoints; - - auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution); - // Step 5 - Render poseKeypoints - poseRenderer->renderPose(outputArray, mPoseKeypoints, scaleInputToOutput); - // Step 6 - OpenPose output format to cv::Mat - displayImage = opOutputToCvMat.formatToCvMat(outputArray); - } -}; - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void* c_OP; -op::Array output; - -c_OP newOP(int logging_level, - char* output_resolution, - char* net_resolution, - char* model_pose, - float alpha_pose, - float scale_gap, - int scale_number, - float render_threshold, - int num_gpu_start, - bool disable_blending, - char* model_folder - ){ - return new OpenPose(logging_level, output_resolution, net_resolution, model_pose, alpha_pose, - 
scale_gap, scale_number, render_threshold, num_gpu_start, disable_blending, model_folder); -} -void delOP(c_OP op){ - delete (OpenPose *)op; -} -void forward(c_OP op, unsigned char* img, size_t rows, size_t cols, int* size, unsigned char* displayImg, bool display){ - OpenPose* openPose = (OpenPose*)op; - cv::Mat image(rows, cols, CV_8UC3, img); - cv::Mat displayImage(rows, cols, CV_8UC3, displayImg); - openPose->forward(image, output, displayImage, display); - if(output.getSize().size()){ - size[0] = output.getSize()[0]; - size[1] = output.getSize()[1]; - size[2] = output.getSize()[2]; - }else{ - size[0] = 0; size[1] = 0; size[2] = 0; - } - if(display) memcpy(displayImg, displayImage.ptr(), sizeof(unsigned char)*rows*cols*3); -} -void getOutputs(c_OP op, float* array){ - if(output.getSize().size()) - memcpy(array, output.getPtr(), output.getSize()[0]*output.getSize()[1]*output.getSize()[2]*sizeof(float)); -} - -void poseFromHeatmap(c_OP op, unsigned char* img, size_t rows, size_t cols, unsigned char* displayImg, float* hm, int* size, float* ratios){ - OpenPose* openPose = (OpenPose*)op; - cv::Mat image(rows, cols, CV_8UC3, img); - cv::Mat displayImage(rows, cols, CV_8UC3, displayImg); - - std::vector>> caffeNetOutputBlob; - - for(int i=0; i> caffeHmPtr(new caffe::Blob()); - caffeHmPtr->Reshape(1,size[1],size[2]*((float)ratios[i]/(float)ratios[0]),size[3]*((float)ratios[i]/(float)ratios[0])); - float* startIndex = &hm[i*size[1]*size[2]*size[3]]; - for(int d=0; dshape()[1]; d++){ - for(int r=0; rshape()[2]; r++){ - for(int c=0; cshape()[3]; c++){ - int toI = d*caffeHmPtr->shape()[2]*caffeHmPtr->shape()[3] + r*caffeHmPtr->shape()[3] + c; - int fromI = d*size[2]*size[3] + r*size[3] + c; - caffeHmPtr->mutable_cpu_data()[toI] = startIndex[fromI]; - } - } - } - caffeNetOutputBlob.emplace_back(caffeHmPtr); - } - - std::vector> imageSizes; - for(int i=0; i point(cols*ratios[i], rows*ratios[i]); - imageSizes.emplace_back(point); - } - - openPose->poseFromHeatmap(image, 
caffeNetOutputBlob, output, displayImage, imageSizes); - memcpy(displayImg, displayImage.ptr(), sizeof(unsigned char)*rows*cols*3); - // Copy back kp size - if(output.getSize().size()){ - size[0] = output.getSize()[0]; - size[1] = output.getSize()[1]; - size[2] = output.getSize()[2]; - }else{ - size[0] = 0; size[1] = 0; size[2] = 0; - } -} - -#ifdef __cplusplus -} -#endif - -#endif +#ifndef OPENPOSE_PYTHON_HPP +#define OPENPOSE_PYTHON_HPP +#define BOOST_DATE_TIME_NO_LIB + +// OpenPose dependencies +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 + #define OP_EXPORT __declspec(dllexport) +#else + #define OP_EXPORT +#endif + +#define default_logging_level 3 +#define default_output_resolution "-1x-1" +#define default_net_resolution "-1x368" +#define default_model_pose "COCO" +#define default_alpha_pose 0.6 +#define default_scale_gap 0.3 +#define default_scale_number 1 +#define default_render_threshold 0.05 +#define default_num_gpu_start 0 +#define default_disable_blending false +#define default_model_folder "models/" + +// Todo, have GPU Number, handle, OpenCL/CPU Cases +OP_API class OpenPose { +public: + std::unique_ptr poseExtractorCaffe; + std::unique_ptr poseRenderer; + std::unique_ptr frameDisplayer; + std::unique_ptr scaleAndSizeExtractor; + + std::unique_ptr> resizeAndMergeCaffe; + std::unique_ptr> nmsCaffe; + std::unique_ptr> bodyPartConnectorCaffe; + std::shared_ptr> heatMapsBlob; + std::shared_ptr> peaksBlob; + op::Array mPoseKeypoints; + op::Array mPoseScores; + op::PoseModel poseModel; + int mGpuID; + + OpenPose(int FLAGS_logging_level = default_logging_level, + std::string FLAGS_output_resolution = default_output_resolution, + std::string FLAGS_net_resolution = default_net_resolution, + std::string FLAGS_model_pose = default_model_pose, + float FLAGS_alpha_pose = default_alpha_pose, + float FLAGS_scale_gap = default_scale_gap, 
+ int FLAGS_scale_number = default_scale_number, + float FLAGS_render_threshold = default_render_threshold, + int FLAGS_num_gpu_start = default_num_gpu_start, + int FLAGS_disable_blending = default_disable_blending, + std::string FLAGS_model_folder = default_model_folder + ) { + mGpuID = FLAGS_num_gpu_start; +#ifdef USE_CUDA + caffe::Caffe::set_mode(caffe::Caffe::GPU); + caffe::Caffe::SetDevice(mGpuID); +#elif USE_OPENCL + caffe::Caffe::set_mode(caffe::Caffe::GPU); + std::vector devices; + const int maxNumberGpu = op::OpenCL::getTotalGPU(); + for (auto i = 0; i < maxNumberGpu; i++) + devices.emplace_back(i); + caffe::Caffe::SetDevices(devices); + caffe::Caffe::SelectDevice(mGpuID, true); + op::OpenCL::getInstance(mGpuID, CL_DEVICE_TYPE_GPU, true); +#else + caffe::Caffe::set_mode(caffe::Caffe::CPU); +#endif + op::log("OpenPose Library Python Wrapper", op::Priority::High); + // ------------------------- INITIALIZATION ------------------------- + // Step 1 - Set logging level + // - 0 will output all the logging messages + // - 255 will output nothing + op::ConfigureLog::setPriorityThreshold((op::Priority)FLAGS_logging_level); + op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Step 2 - Read Google flags (user defined configuration) + // outputSize + const auto outputSize = op::flagsToPoint(FLAGS_output_resolution, "-1x-1"); + // netInputSize + const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "-1x368"); + // poseModel + poseModel = op::flagsToPoseModel(FLAGS_model_pose); + // Check no contradictory flags enabled + if (FLAGS_alpha_pose < 0. || FLAGS_alpha_pose > 1.) + op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__); + if (FLAGS_scale_gap <= 0. 
&& FLAGS_scale_number > 1) + op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", + __LINE__, __FUNCTION__, __FILE__); + // Logging + op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Step 3 - Initialize all required classes + scaleAndSizeExtractor = std::unique_ptr(new op::ScaleAndSizeExtractor(netInputSize, outputSize, FLAGS_scale_number, FLAGS_scale_gap)); + + poseExtractorCaffe = std::unique_ptr(new op::PoseExtractorCaffe{ poseModel, FLAGS_model_folder, FLAGS_num_gpu_start }); + + poseRenderer = std::unique_ptr(new op::PoseCpuRenderer{ poseModel, (float)FLAGS_render_threshold, !FLAGS_disable_blending, + (float)FLAGS_alpha_pose }); + frameDisplayer = std::unique_ptr(new op::FrameDisplayer{ "OpenPose Tutorial - Example 1", outputSize }); + + // Custom + resizeAndMergeCaffe = std::unique_ptr>(new op::ResizeAndMergeCaffe{}); + nmsCaffe = std::unique_ptr>(new op::NmsCaffe{}); + bodyPartConnectorCaffe = std::unique_ptr>(new op::BodyPartConnectorCaffe{}); + heatMapsBlob = { std::make_shared>(1,1,1,1) }; + peaksBlob = { std::make_shared>(1,1,1,1) }; + bodyPartConnectorCaffe->setPoseModel(poseModel); + + // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. 
we init resources here) + poseExtractorCaffe->initializationOnThread(); + poseRenderer->initializationOnThread(); + } + + std::vector*> caffeNetSharedToPtr( + std::vector>>& caffeNetOutputBlob) + { + try + { + // Prepare spCaffeNetOutputBlobss + std::vector*> caffeNetOutputBlobs(caffeNetOutputBlob.size()); + for (auto i = 0u; i < caffeNetOutputBlobs.size(); i++) + caffeNetOutputBlobs[i] = caffeNetOutputBlob[i].get(); + return caffeNetOutputBlobs; + } + catch (const std::exception& e) + { + op::error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return{}; + } + } + + void forward(const cv::Mat& inputImage, op::Array& poseKeypoints, cv::Mat& displayImage, bool display = false) { + op::OpOutputToCvMat opOutputToCvMat; + op::CvMatToOpInput cvMatToOpInput; + op::CvMatToOpOutput cvMatToOpOutput; + if (inputImage.empty()) + op::error("Could not open or find the image: ", __LINE__, __FUNCTION__, __FILE__); + const op::Point imageSize{ inputImage.cols, inputImage.rows }; + // Step 2 - Get desired scale sizes + std::vector scaleInputToNetInputs; + std::vector> netInputSizes; + double scaleInputToOutput; + op::Point outputResolution; + std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution) + = scaleAndSizeExtractor->extract(imageSize); + // Step 3 - Format input image to OpenPose input and output formats + const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes); + + // Step 4 - Estimate poseKeypoints + poseExtractorCaffe->forwardPass(netInputArray, imageSize, scaleInputToNetInputs); + poseKeypoints = poseExtractorCaffe->getPoseKeypoints(); + + if (display) { + auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution); + // Step 5 - Render poseKeypoints + poseRenderer->renderPose(outputArray, poseKeypoints, scaleInputToOutput); + // Step 6 - OpenPose output format to cv::Mat + displayImage = opOutputToCvMat.formatToCvMat(outputArray); + } + } + + void 
poseFromHeatmap(const cv::Mat& inputImage, std::vector>>& caffeNetOutputBlob, op::Array& poseKeypoints, cv::Mat& displayImage, std::vector>& imageSizes) { + // Get Scale + const op::Point inputDataSize{ inputImage.cols, inputImage.rows }; + + // Convert to Ptr + //std::vector>> a; + //caffeNetOutputBlob.emplace_back(caffeHmPtr); + const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob); + + // To be called once only + resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, { heatMapsBlob.get() }, + op::getPoseNetDecreaseFactor(poseModel), 1.f / 1.f, true, + 0); + nmsCaffe->Reshape({ heatMapsBlob.get() }, { peaksBlob.get() }, op::getPoseMaxPeaks(poseModel), + op::getPoseNumberBodyParts(poseModel), 0); + bodyPartConnectorCaffe->Reshape({ heatMapsBlob.get(), peaksBlob.get() }); + + // Normal + op::OpOutputToCvMat opOutputToCvMat; + op::CvMatToOpInput cvMatToOpInput; + op::CvMatToOpOutput cvMatToOpOutput; + if (inputImage.empty()) + op::error("Could not open or find the image: ", __LINE__, __FUNCTION__, __FILE__); + const op::Point imageSize{ inputImage.cols, inputImage.rows }; + // Step 2 - Get desired scale sizes + std::vector scaleInputToNetInputs; + std::vector> netInputSizes; + double scaleInputToOutput; + op::Point outputResolution; + + std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution) + = scaleAndSizeExtractor->extract(imageSize); + + const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes); + + // Run the modes + const std::vector floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end()); + resizeAndMergeCaffe->setScaleRatios(floatScaleRatios); + std::vector*> heatMapsBlobs{ heatMapsBlob.get() }; + std::vector*> peaksBlobs{ peaksBlob.get() }; +#ifdef USE_CUDA + resizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs, heatMapsBlobs); // ~5ms +#elif USE_OPENCL + resizeAndMergeCaffe->Forward_ocl(caffeNetOutputBlobs, heatMapsBlobs); // ~5ms +#else + 
resizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, heatMapsBlobs); // ~5ms +#endif + + nmsCaffe->setThreshold((float)poseExtractorCaffe->get(op::PoseProperty::NMSThreshold)); +#ifdef USE_CUDA + nmsCaffe->Forward_gpu(heatMapsBlobs, peaksBlobs);// ~2ms +#elif USE_OPENCL + nmsCaffe->Forward_ocl(heatMapsBlobs, peaksBlobs);// ~2ms +#else + nmsCaffe->Forward_cpu(heatMapsBlobs, peaksBlobs);// ~2ms +#endif + op::cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + float mScaleNetToOutput = 1. / scaleInputToNetInputs[0]; + bodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); + bodyPartConnectorCaffe->setInterMinAboveThreshold( + (float)poseExtractorCaffe->get(op::PoseProperty::ConnectInterMinAboveThreshold) + ); + bodyPartConnectorCaffe->setInterThreshold((float)poseExtractorCaffe->get(op::PoseProperty::ConnectInterThreshold)); + bodyPartConnectorCaffe->setMinSubsetCnt((int)poseExtractorCaffe->get(op::PoseProperty::ConnectMinSubsetCnt)); + bodyPartConnectorCaffe->setMinSubsetScore((float)poseExtractorCaffe->get(op::PoseProperty::ConnectMinSubsetScore)); + + bodyPartConnectorCaffe->Forward_cpu({ heatMapsBlob.get(), + peaksBlob.get() }, + mPoseKeypoints, mPoseScores); + poseKeypoints = mPoseKeypoints; + + auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution); + // Step 5 - Render poseKeypoints + poseRenderer->renderPose(outputArray, mPoseKeypoints, scaleInputToOutput); + // Step 6 - OpenPose output format to cv::Mat + displayImage = opOutputToCvMat.formatToCvMat(outputArray); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + + typedef void* c_OP; + op::Array output; + + OP_EXPORT c_OP newOP(int logging_level, + char* output_resolution, + char* net_resolution, + char* model_pose, + float alpha_pose, + float scale_gap, + int scale_number, + float render_threshold, + int num_gpu_start, + bool disable_blending, + char* model_folder + ) { + return new OpenPose(logging_level, output_resolution, net_resolution, model_pose, 
alpha_pose, + scale_gap, scale_number, render_threshold, num_gpu_start, disable_blending, model_folder); + } + OP_EXPORT void delOP(c_OP op) { + delete (OpenPose *)op; + } + OP_EXPORT void forward(c_OP op, unsigned char* img, size_t rows, size_t cols, int* size, unsigned char* displayImg, bool display) { + OpenPose* openPose = (OpenPose*)op; + cv::Mat image(rows, cols, CV_8UC3, img); + cv::Mat displayImage(rows, cols, CV_8UC3, displayImg); + openPose->forward(image, output, displayImage, display); + if (output.getSize().size()) { + size[0] = output.getSize()[0]; + size[1] = output.getSize()[1]; + size[2] = output.getSize()[2]; + } + else { + size[0] = 0; size[1] = 0; size[2] = 0; + } + if (display) memcpy(displayImg, displayImage.ptr(), sizeof(unsigned char)*rows*cols * 3); + } + OP_EXPORT void getOutputs(c_OP op, float* array) { + if (output.getSize().size()) + memcpy(array, output.getPtr(), output.getSize()[0] * output.getSize()[1] * output.getSize()[2] * sizeof(float)); + } + OP_EXPORT void poseFromHeatmap(c_OP op, unsigned char* img, size_t rows, size_t cols, unsigned char* displayImg, float* hm, int* size, float* ratios) { + OpenPose* openPose = (OpenPose*)op; + cv::Mat image(rows, cols, CV_8UC3, img); + cv::Mat displayImage(rows, cols, CV_8UC3, displayImg); + + std::vector>> caffeNetOutputBlob; + + for (int i = 0; i> caffeHmPtr(new caffe::Blob()); + caffeHmPtr->Reshape(1, size[1], size[2] * ((float)ratios[i] / (float)ratios[0]), size[3] * ((float)ratios[i] / (float)ratios[0])); + float* startIndex = &hm[i*size[1] * size[2] * size[3]]; + for (int d = 0; dshape()[1]; d++) { + for (int r = 0; rshape()[2]; r++) { + for (int c = 0; cshape()[3]; c++) { + int toI = d*caffeHmPtr->shape()[2] * caffeHmPtr->shape()[3] + r*caffeHmPtr->shape()[3] + c; + int fromI = d*size[2] * size[3] + r*size[3] + c; + caffeHmPtr->mutable_cpu_data()[toI] = startIndex[fromI]; + } + } + } + caffeNetOutputBlob.emplace_back(caffeHmPtr); + } + + std::vector> imageSizes; + for (int i = 0; i 
point(cols*ratios[i], rows*ratios[i]); + imageSizes.emplace_back(point); + } + + openPose->poseFromHeatmap(image, caffeNetOutputBlob, output, displayImage, imageSizes); + memcpy(displayImg, displayImage.ptr(), sizeof(unsigned char)*rows*cols * 3); + // Copy back kp size + if (output.getSize().size()) { + size[0] = output.getSize()[0]; + size[1] = output.getSize()[1]; + size[2] = output.getSize()[2]; + } + else { + size[0] = 0; size[1] = 0; size[2] = 0; + } + } + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/python/openpose/openpose.py b/python/openpose/openpose.py index 5913993380117daf07eb7131bfeb326fd7bcbeeb..707f2f9154876e24d849a475fe177112ffe7c8c2 100644 --- a/python/openpose/openpose.py +++ b/python/openpose/openpose.py @@ -1,227 +1,239 @@ -""" -Wrap the OpenPose library with Python. -To install run `make install` and library will be stored in /usr/local/python -""" - -import numpy as np -import ctypes as ct -import cv2 -import os -dir_path = os.path.dirname(os.path.realpath(__file__)) - -class OpenPose(object): - """ - Ctypes linkage - """ - _libop= np.ctypeslib.load_library('_openpose', dir_path+'/_openpose.so') - _libop.newOP.argtypes = [ - ct.c_int, ct.c_char_p, ct.c_char_p, ct.c_char_p, ct.c_float, ct.c_float, ct.c_int, ct.c_float, ct.c_int, ct.c_bool, ct.c_char_p] - _libop.newOP.restype = ct.c_void_p - _libop.delOP.argtypes = [ct.c_void_p] - _libop.delOP.restype = None - - _libop.forward.argtypes = [ - ct.c_void_p, np.ctypeslib.ndpointer(dtype=np.uint8), - ct.c_size_t, ct.c_size_t, - np.ctypeslib.ndpointer(dtype=np.int32), np.ctypeslib.ndpointer(dtype=np.uint8), ct.c_bool] - _libop.forward.restype = None - - _libop.getOutputs.argtypes = [ - ct.c_void_p, np.ctypeslib.ndpointer(dtype=np.float32)] - _libop.getOutputs.restype = None - - _libop.poseFromHeatmap.argtypes = [ - ct.c_void_p, np.ctypeslib.ndpointer(dtype=np.uint8), - ct.c_size_t, ct.c_size_t, - np.ctypeslib.ndpointer(dtype=np.uint8), - np.ctypeslib.ndpointer(dtype=np.float32), 
np.ctypeslib.ndpointer(dtype=np.int32), np.ctypeslib.ndpointer(dtype=np.float32)] - _libop.poseFromHeatmap.restype = None - - def __init__(self, params): - """ - OpenPose Constructor: Prepares OpenPose object - - Parameters - ---------- - params : dict of required parameters. refer to openpose example for more details - - Returns - ------- - outs: OpenPose object - """ - self.op = self._libop.newOP(params["logging_level"], - params["output_resolution"], - params["net_resolution"], - params["model_pose"], - params["alpha_pose"], - params["scale_gap"], - params["scale_number"], - params["render_threshold"], - params["num_gpu_start"], - params["disable_blending"], - params["default_model_folder"]) - - def __del__(self): - """ - OpenPose Destructor: Destroys OpenPose object - """ - self._libop.delOP(self.op) - - def forward(self, image, display = False): - """ - Forward: Takes in an image and returns the human 2D poses, along with drawn image if required - - Parameters - ---------- - image : color image of type ndarray - display : If set to true, we return both the pose and an annotated image for visualization - - Returns - ------- - array: ndarray of human 2D poses [People * BodyPart * XYConfidence] - displayImage : image for visualization - """ - shape = image.shape - displayImage = np.zeros(shape=(image.shape),dtype=np.uint8) - size = np.zeros(shape=(3),dtype=np.int32) - self._libop.forward(self.op, image, shape[0], shape[1], size, displayImage, display) - array = np.zeros(shape=(size),dtype=np.float32) - self._libop.getOutputs(self.op, array) - if display: - return array, displayImage - return array - - def poseFromHM(self, image, hm, ratios=[1]): - """ - Pose From Heatmap: Takes in an image, computed heatmaps, and require scales and computes pose - - Parameters - ---------- - image : color image of type ndarray - hm : heatmap of type ndarray with heatmaps and part affinity fields - ratios : scaling ration if needed to fuse multiple scales - - Returns - ------- - 
array: ndarray of human 2D poses [People * BodyPart * XYConfidence] - displayImage : image for visualization - """ - if len(ratios) != len(hm): - raise Exception("Ratio shape mismatch") - - # Find largest - hm_combine = np.zeros(shape=(len(hm), hm[0].shape[1], hm[0].shape[2], hm[0].shape[3]),dtype=np.float32) - i=0 - for h in hm: - hm_combine[i,:,0:h.shape[2],0:h.shape[3]] = h - i+=1 - hm = hm_combine - - ratios = np.array(ratios,dtype=np.float32) - - shape = image.shape - displayImage = np.zeros(shape=(image.shape),dtype=np.uint8) - size = np.zeros(shape=(4),dtype=np.int32) - size[0] = hm.shape[0] - size[1] = hm.shape[1] - size[2] = hm.shape[2] - size[3] = hm.shape[3] - - self._libop.poseFromHeatmap(self.op, image, shape[0], shape[1], displayImage, hm, size, ratios) - array = np.zeros(shape=(size[0],size[1],size[2]),dtype=np.float32) - self._libop.getOutputs(self.op, array) - return array, displayImage - - @staticmethod - def process_frames(frame, boxsize = 368, scales = [1]): - base_net_res = None - imagesForNet = [] - imagesOrig = [] - for idx, scale in enumerate(scales): - # Calculate net resolution (width, height) - if idx == 0: - net_res = (16 * int((boxsize * frame.shape[1] / float(frame.shape[0]) / 16) + 0.5), boxsize) - base_net_res = net_res - else: - net_res = ((min(base_net_res[0], max(1, int((base_net_res[0] * scale)+0.5)/16*16))), - (min(base_net_res[1], max(1, int((base_net_res[1] * scale)+0.5)/16*16)))) - input_res = [frame.shape[1], frame.shape[0]] - scale_factor = min((net_res[0] - 1) / float(input_res[0] - 1), (net_res[1] - 1) / float(input_res[1] - 1)) - warp_matrix = np.array([[scale_factor,0,0], - [0,scale_factor,0]]) - if scale_factor != 1: - imageForNet = cv2.warpAffine(frame, warp_matrix, net_res, flags=(cv2.INTER_AREA if scale_factor < 1. 
else cv2.INTER_CUBIC), borderMode=cv2.BORDER_CONSTANT, borderValue=(0,0,0)) - else: - imageForNet = frame.copy() - - imageOrig = imageForNet.copy() - imageForNet = imageForNet.astype(float) - imageForNet = imageForNet/256. - 0.5 - imageForNet = np.transpose(imageForNet, (2,0,1)) - - imagesForNet.append(imageForNet) - imagesOrig.append(imageOrig) - - return imagesForNet, imagesOrig - - @staticmethod - def draw_all(imageForNet, heatmaps, currIndex, div=4., norm=False): - netDecreaseFactor = float(imageForNet.shape[0]) / float(heatmaps.shape[2]) # 8 - resized_heatmaps = np.zeros(shape=(heatmaps.shape[0], heatmaps.shape[1], imageForNet.shape[0], imageForNet.shape[1])) - num_maps = heatmaps.shape[1] - combined = None - for i in range(0, num_maps): - heatmap = heatmaps[0,i,:,:] - resizedHeatmap = cv2.resize(heatmap, (0,0), fx=netDecreaseFactor, fy=netDecreaseFactor) - - minVal, maxVal, minLoc, maxLoc = cv2.minMaxLoc(resizedHeatmap) - - if i==currIndex and currIndex >=0: - resizedHeatmap = np.abs(resizedHeatmap) - resizedHeatmap = (resizedHeatmap*255.).astype(dtype='uint8') - im_color = cv2.applyColorMap(resizedHeatmap, cv2.COLORMAP_JET) - resizedHeatmap = cv2.addWeighted(imageForNet, 1, im_color, 0.3, 0) - cv2.circle(resizedHeatmap, (int(maxLoc[0]),int(maxLoc[1])), 5, (255,0,0), -1) - return resizedHeatmap - else: - resizedHeatmap = np.abs(resizedHeatmap) - if combined is None: - combined = np.copy(resizedHeatmap); - else: - if i <= num_maps-2: - combined += resizedHeatmap; - if norm: - combined = np.maximum(0, np.minimum(1, combined)); - - if currIndex < 0: - combined /= div - combined = (combined*255.).astype(dtype='uint8') - im_color = cv2.applyColorMap(combined, cv2.COLORMAP_JET) - combined = cv2.addWeighted(imageForNet, 0.5, im_color, 0.5, 0) - cv2.circle(combined, (int(maxLoc[0]),int(maxLoc[1])), 5, (255,0,0), -1) - return combined - - -if __name__ == "__main__": - params = dict() - params["logging_level"] = 3 - params["output_resolution"] = "-1x-1" - 
params["net_resolution"] = "-1x736" - params["model_pose"] = "COCO" - params["alpha_pose"] = 0.6 - params["scale_gap"] = 0.3 - params["scale_number"] = 2 - params["render_threshold"] = 0.05 - params["num_gpu_start"] = 0 - params["disable_blending"] = False - params["default_model_folder"] = "models/" - openpose = OpenPose(params) - - img = cv2.imread("examples/media/COCO_val2014_000000000192.jpg") - arr, output_image = openpose.forward(img, True) - print arr - - while 1: - cv2.imshow("output", output_image) - cv2.waitKey(15) - +""" +Wrap the OpenPose library with Python. +To install run `make install` and library will be stored in /usr/local/python +""" +import numpy as np +import ctypes as ct +import cv2 +import os +from sys import platform +dir_path = os.path.dirname(os.path.realpath(__file__)) + +if platform == "win32": + os.environ['PATH'] = dir_path + "/../../lib;" + os.environ['PATH'] + os.environ['PATH'] = dir_path + "/../../x64/Release;" + os.environ['PATH'] + +class OpenPose(object): + """ + Ctypes linkage + """ + if platform == "linux" or platform == "linux2": + _libop= np.ctypeslib.load_library('_openpose', dir_path+'/_openpose.so') + elif platform == "darwin": + _libop= np.ctypeslib.load_library('_openpose', dir_path+'/_openpose.dylib') + elif platform == "win32": + _libop= np.ctypeslib.load_library('_openpose', dir_path+'/Release/_openpose.dll') + _libop.newOP.argtypes = [ + ct.c_int, ct.c_char_p, ct.c_char_p, ct.c_char_p, ct.c_float, ct.c_float, ct.c_int, ct.c_float, ct.c_int, ct.c_bool, ct.c_char_p] + _libop.newOP.restype = ct.c_void_p + _libop.delOP.argtypes = [ct.c_void_p] + _libop.delOP.restype = None + + _libop.forward.argtypes = [ + ct.c_void_p, np.ctypeslib.ndpointer(dtype=np.uint8), + ct.c_size_t, ct.c_size_t, + np.ctypeslib.ndpointer(dtype=np.int32), np.ctypeslib.ndpointer(dtype=np.uint8), ct.c_bool] + _libop.forward.restype = None + + _libop.getOutputs.argtypes = [ + ct.c_void_p, np.ctypeslib.ndpointer(dtype=np.float32)] + 
_libop.getOutputs.restype = None + + _libop.poseFromHeatmap.argtypes = [ + ct.c_void_p, np.ctypeslib.ndpointer(dtype=np.uint8), + ct.c_size_t, ct.c_size_t, + np.ctypeslib.ndpointer(dtype=np.uint8), + np.ctypeslib.ndpointer(dtype=np.float32), np.ctypeslib.ndpointer(dtype=np.int32), np.ctypeslib.ndpointer(dtype=np.float32)] + _libop.poseFromHeatmap.restype = None + + def encode(self, string): + return ct.c_char_p(string.encode('utf-8')) + + def __init__(self, params): + """ + OpenPose Constructor: Prepares OpenPose object + + Parameters + ---------- + params : dict of required parameters. refer to openpose example for more details + + Returns + ------- + outs: OpenPose object + """ + self.op = self._libop.newOP(params["logging_level"], + self.encode(params["output_resolution"]), + self.encode(params["net_resolution"]), + self.encode(params["model_pose"]), + params["alpha_pose"], + params["scale_gap"], + params["scale_number"], + params["render_threshold"], + params["num_gpu_start"], + params["disable_blending"], + self.encode(params["default_model_folder"])) + + def __del__(self): + """ + OpenPose Destructor: Destroys OpenPose object + """ + self._libop.delOP(self.op) + + def forward(self, image, display = False): + """ + Forward: Takes in an image and returns the human 2D poses, along with drawn image if required + + Parameters + ---------- + image : color image of type ndarray + display : If set to true, we return both the pose and an annotated image for visualization + + Returns + ------- + array: ndarray of human 2D poses [People * BodyPart * XYConfidence] + displayImage : image for visualization + """ + shape = image.shape + displayImage = np.zeros(shape=(image.shape),dtype=np.uint8) + size = np.zeros(shape=(3),dtype=np.int32) + self._libop.forward(self.op, image, shape[0], shape[1], size, displayImage, display) + array = np.zeros(shape=(size),dtype=np.float32) + self._libop.getOutputs(self.op, array) + if display: + return array, displayImage + return array + + 
def poseFromHM(self, image, hm, ratios=[1]): + """ + Pose From Heatmap: Takes in an image, computed heatmaps, and require scales and computes pose + + Parameters + ---------- + image : color image of type ndarray + hm : heatmap of type ndarray with heatmaps and part affinity fields + ratios : scaling ration if needed to fuse multiple scales + + Returns + ------- + array: ndarray of human 2D poses [People * BodyPart * XYConfidence] + displayImage : image for visualization + """ + if len(ratios) != len(hm): + raise Exception("Ratio shape mismatch") + + # Find largest + hm_combine = np.zeros(shape=(len(hm), hm[0].shape[1], hm[0].shape[2], hm[0].shape[3]),dtype=np.float32) + i=0 + for h in hm: + hm_combine[i,:,0:h.shape[2],0:h.shape[3]] = h + i+=1 + hm = hm_combine + + ratios = np.array(ratios,dtype=np.float32) + + shape = image.shape + displayImage = np.zeros(shape=(image.shape),dtype=np.uint8) + size = np.zeros(shape=(4),dtype=np.int32) + size[0] = hm.shape[0] + size[1] = hm.shape[1] + size[2] = hm.shape[2] + size[3] = hm.shape[3] + + self._libop.poseFromHeatmap(self.op, image, shape[0], shape[1], displayImage, hm, size, ratios) + array = np.zeros(shape=(size[0],size[1],size[2]),dtype=np.float32) + self._libop.getOutputs(self.op, array) + return array, displayImage + + @staticmethod + def process_frames(frame, boxsize = 368, scales = [1]): + base_net_res = None + imagesForNet = [] + imagesOrig = [] + for idx, scale in enumerate(scales): + # Calculate net resolution (width, height) + if idx == 0: + net_res = (16 * int((boxsize * frame.shape[1] / float(frame.shape[0]) / 16) + 0.5), boxsize) + base_net_res = net_res + else: + net_res = ((min(base_net_res[0], max(1, int((base_net_res[0] * scale)+0.5)/16*16))), + (min(base_net_res[1], max(1, int((base_net_res[1] * scale)+0.5)/16*16)))) + input_res = [frame.shape[1], frame.shape[0]] + scale_factor = min((net_res[0] - 1) / float(input_res[0] - 1), (net_res[1] - 1) / float(input_res[1] - 1)) + warp_matrix = 
np.array([[scale_factor,0,0], + [0,scale_factor,0]]) + if scale_factor != 1: + imageForNet = cv2.warpAffine(frame, warp_matrix, net_res, flags=(cv2.INTER_AREA if scale_factor < 1. else cv2.INTER_CUBIC), borderMode=cv2.BORDER_CONSTANT, borderValue=(0,0,0)) + else: + imageForNet = frame.copy() + + imageOrig = imageForNet.copy() + imageForNet = imageForNet.astype(float) + imageForNet = imageForNet/256. - 0.5 + imageForNet = np.transpose(imageForNet, (2,0,1)) + + imagesForNet.append(imageForNet) + imagesOrig.append(imageOrig) + + return imagesForNet, imagesOrig + + @staticmethod + def draw_all(imageForNet, heatmaps, currIndex, div=4., norm=False): + netDecreaseFactor = float(imageForNet.shape[0]) / float(heatmaps.shape[2]) # 8 + resized_heatmaps = np.zeros(shape=(heatmaps.shape[0], heatmaps.shape[1], imageForNet.shape[0], imageForNet.shape[1])) + num_maps = heatmaps.shape[1] + combined = None + for i in range(0, num_maps): + heatmap = heatmaps[0,i,:,:] + resizedHeatmap = cv2.resize(heatmap, (0,0), fx=netDecreaseFactor, fy=netDecreaseFactor) + + minVal, maxVal, minLoc, maxLoc = cv2.minMaxLoc(resizedHeatmap) + + if i==currIndex and currIndex >=0: + resizedHeatmap = np.abs(resizedHeatmap) + resizedHeatmap = (resizedHeatmap*255.).astype(dtype='uint8') + im_color = cv2.applyColorMap(resizedHeatmap, cv2.COLORMAP_JET) + resizedHeatmap = cv2.addWeighted(imageForNet, 1, im_color, 0.3, 0) + cv2.circle(resizedHeatmap, (int(maxLoc[0]),int(maxLoc[1])), 5, (255,0,0), -1) + return resizedHeatmap + else: + resizedHeatmap = np.abs(resizedHeatmap) + if combined is None: + combined = np.copy(resizedHeatmap); + else: + if i <= num_maps-2: + combined += resizedHeatmap; + if norm: + combined = np.maximum(0, np.minimum(1, combined)); + + if currIndex < 0: + combined /= div + combined = (combined*255.).astype(dtype='uint8') + im_color = cv2.applyColorMap(combined, cv2.COLORMAP_JET) + combined = cv2.addWeighted(imageForNet, 0.5, im_color, 0.5, 0) + cv2.circle(combined, 
(int(maxLoc[0]),int(maxLoc[1])), 5, (255,0,0), -1) + return combined + + +if __name__ == "__main__": + params = dict() + params["logging_level"] = 3 + params["output_resolution"] = "-1x-1" + params["net_resolution"] = "-1x368" + params["model_pose"] = "BODY_25" + params["alpha_pose"] = 0.6 + params["scale_gap"] = 0.3 + params["scale_number"] = 1 + params["render_threshold"] = 0.05 + params["num_gpu_start"] = 0 + params["disable_blending"] = False + params["default_model_folder"] = "../../../models/" + openpose = OpenPose(params) + + img = cv2.imread("../../../examples/media/COCO_val2014_000000000192.jpg") + arr, output_image = openpose.forward(img, True) + print(arr) + + while 1: + cv2.imshow("output", output_image) + cv2.waitKey(15) + diff --git a/src/openpose/net/nmsBaseCL.cpp b/src/openpose/net/nmsBaseCL.cpp index 1e060bcedc78b94244896e9dd33c9c2b0d7e5ae4..39883179c0d2351d1c93233e4c998da532897d0c 100644 --- a/src/openpose/net/nmsBaseCL.cpp +++ b/src/openpose/net/nmsBaseCL.cpp @@ -176,10 +176,10 @@ namespace op cl::Buffer targetPtrBuffer = cl::Buffer((cl_mem)(targetPtr), true); auto nmsRegisterKernel = OpenCL::getInstance(gpuID)->getKernelFunctorFromManager ( - "nmsRegisterKernel",nmsOclCommonFunctions + nmsRegisterKernel); + "nmsRegisterKernel", op::nmsOclCommonFunctions + op::nmsRegisterKernel); auto nmsWriteKernel = OpenCL::getInstance(gpuID)->getKernelFunctorFromManager ( - "nmsWriteKernel", nmsOclCommonFunctions + nmsWriteKernel); + "nmsWriteKernel", op::nmsOclCommonFunctions + op::nmsWriteKernel); // log("num_b: " + std::to_string(bottom->shape(0))); // = 1 // log("channel_b: " + std::to_string(bottom->shape(1))); // = 57 = 18 body parts + bkg + 19x2 PAFs