Commit e14f78e4 authored by Eric.Lee2021 🚴🏻

Added an ImageNet recognition module, along with partial voice recognition functionality

Parent a4cd2d1e
......@@ -43,12 +43,17 @@
```
detect_model_path=./latest_416.pt # hand detection model path
detect_model_arch=yolo_v3 # detection model type: yolo or yolo-tiny
yolo_anchor_scale=1.0 # yolo anchor scale, default 1
detect_conf_thres=0.5 # detection confidence threshold
detect_nms_thres=0.45 # detection NMS threshold
handpose_x_model_path=./ReXNetV1-size-256-wingloss102-0.1063.pth # 21-keypoint hand regression model path
handpose_x_model_arch=rexnetv1 # regression model architecture
classify_model_path=./imagenet_size-256_20210409.pth # classification model path
classify_model_arch=resnet_50 # classification model architecture
classify_model_classify_num=1000 # number of classes
camera_id = 0 # camera ID, usually 0; verify on your machine if not
vis_gesture_lines = True # True: visualize the click trajectory; False: do not
charge_cycle_step = 32 # click-stability counter steps, the click "charge ring"
```
......
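For reference, a minimal sketch of how a `key=value` config file in this format could be parsed into the `config` dict used below; the `parse_config` helper is hypothetical, not the project's actual reader:

```python
# Hypothetical minimal parser for the key=value config format shown above.
def parse_config(path):
    config = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.split("#", 1)[0].strip()  # drop inline comments
            if not line or "=" not in line:
                continue
            key, value = line.split("=", 1)
            config[key.strip()] = value.strip()   # values stay strings
    return config
```

Values are left as strings, which matches the `float(config[...])` casts seen in the code below.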
......@@ -30,6 +30,7 @@ import time
# load model component libraries
from hand_detect.yolo_v3_hand import yolo_v3_hand_model
from hand_keypoints.handpose_x import handpose_x_model
from classify_imagenet.imagenet_c import classify_imagenet_model
# load utility libraries
import sys
......@@ -190,13 +191,20 @@ def audio_process_recognize_up_edge(info_dict):
if (info_dict[g_]^gesture_dict[g_]) and info_dict[g_]==True:# rising edge of the Click gesture signal: the Click action starts
playsound("./materials/audio/sentences/IdentifyingObjectsWait.mp3")
playsound("./materials/audio/sentences/ObjectMayBeIdentified.mp3")
if info_dict["reco_msg"] is not None:
print("process - (audio_process_recognize_up_edge) reco_msg : {} ".format(info_dict["reco_msg"]))
doc_name = info_dict["reco_msg"]["label_msg"]["doc_name"]
reco_audio_file = "./materials/audio/imagenet_2012/{}.mp3".format(doc_name)
if os.access(reco_audio_file,os.F_OK):# check whether the audio file exists
playsound(reco_audio_file)
info_dict["reco_msg"] = None
gesture_dict[g_] = info_dict[g_]
except Exception as inst:
print(type(inst),inst) # exception instance
if info_dict["break"] == True:
break
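The `^` (XOR) test above is a compact rising-edge detector: the gesture flag changed since the last sample and is now True, so the action fires exactly once per click. A standalone sketch of the same idea:

```python
# Rising-edge detection on a boolean signal: fire once per False -> True step.
prev = False
for curr in [False, True, True, False, True]:
    if (prev ^ curr) and curr:   # changed AND now True => rising edge
        print("rising edge")     # fires on the 2nd and 5th samples only
    prev = curr
```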
'''
......@@ -209,13 +217,15 @@ def handpose_x_process(info_dict,config):
print("load model component ...")
# yolo v3 hand detection model initialization
hand_detect_model = yolo_v3_hand_model(conf_thres=float(config["detect_conf_thres"]),nms_thres=float(config["detect_nms_thres"]),
model_arch = config["detect_model_arch"],model_path = config["detect_model_path"])
model_arch = config["detect_model_arch"],model_path = config["detect_model_path"],yolo_anchor_scale = float(config["yolo_anchor_scale"]),
img_size = float(config["detect_input_size"]),
)
# handpose_x 21-keypoint regression model initialization
handpose_model = handpose_x_model(model_arch = config["handpose_x_model_arch"],model_path = config["handpose_x_model_path"])
#
gesture_model = None # placeholder, not yet implemented
#
object_recognize_model = None # recognition/classification model, placeholder
object_recognize_model = classify_imagenet_model(model_arch = config["classify_model_arch"],model_path = config["classify_model_path"]) # recognition/classification model
#
img_reco_crop = None
......@@ -309,8 +319,8 @@ def handpose_x_process(info_dict,config):
# check whether each hand's click state is stable and meets the configured threshold
flag_click_stable = judge_click_stabel(img,handpose_list,int(config["charge_cycle_step"]))
# decide whether to trigger the recognition audio and identify the selected target
img_reco_crop = audio_recognize(img,algo_img,img_reco_crop,object_recognize_model,info_dict,double_en_pts,flag_click_stable)
img_reco_crop,reco_msg = audio_recognize(img,algo_img,img_reco_crop,object_recognize_model,info_dict,double_en_pts,flag_click_stable)
# print(reco_msg)
cv2.putText(img, 'HandNum:[{}]'.format(len(hand_bbox)), (5,25),cv2.FONT_HERSHEY_COMPLEX, 0.7, (255, 0, 0),5)
cv2.putText(img, 'HandNum:[{}]'.format(len(hand_bbox)), (5,25),cv2.FONT_HERSHEY_COMPLEX, 0.7, (0, 0, 255))
......@@ -342,6 +352,8 @@ def main_handpose_x(cfg_file):
g_info_dict["click_up_cnt"] = 0
g_info_dict["click_dw_cnt"] = 0
g_info_dict["reco_msg"] = None
print(" multiprocessing dict key:\n")
for key_ in g_info_dict.keys():
print( " -> ",key_)
......
#-*-coding:utf-8-*-
# date:2020-04-11
# author: Eric.Lee
# function : classify
import os
import json
import math
import time
from datetime import datetime
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from classify_imagenet.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152
#
class classify_imagenet_model(object):
def __init__(self,
model_path = './components/classify_imagenet/weights/imagenet_size-256_20210409.pth',
model_arch = "resnet_50",
img_size= 256,
num_classes = 1000,
):
f = open("./components/classify_imagenet/imagenet_msg.json", encoding='utf-8')# read the label-message json file
dict_ = json.load(f)
f.close()
self.classify_dict = dict_
# print("-------------->>\n dict_ : \n",dict_)
#
print("classify model loading : ",model_path)
# print('use model : %s'%(model_arch))
if model_arch == 'resnet_18':
model_=resnet18(num_classes=num_classes, img_size=img_size)
elif model_arch == 'resnet_34':
model_=resnet34(num_classes=num_classes, img_size=img_size)
elif model_arch == 'resnet_50':
model_=resnet50(num_classes=num_classes, img_size=img_size)
elif model_arch == 'resnet_101':
model_=resnet101(num_classes=num_classes, img_size=img_size)
elif model_arch == 'resnet_152':
model_=resnet152(num_classes=num_classes, img_size=img_size)
else:
print('error: unknown model architecture : {}'.format(model_arch))
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
model_ = model_.to(device)
model_.eval() # set to inference mode
# print(model_) # print the model structure
# load the trained checkpoint
if os.access(model_path,os.F_OK):# checkpoint
chkpt = torch.load(model_path, map_location=device)
model_.load_state_dict(chkpt)
# print('load classify model : {}'.format(model_path))
self.model_ = model_
self.use_cuda = use_cuda
self.img_size = img_size
def predict(self, img, vis = False):# img is an aligned image crop
with torch.no_grad():
img_ = cv2.resize(img, (self.img_size,self.img_size), interpolation = cv2.INTER_CUBIC)
img_ = img_.astype(np.float32)
img_ = (img_-128.)/256.
img_ = img_.transpose(2, 0, 1)
img_ = torch.from_numpy(img_)
img_ = img_.unsqueeze_(0)
if self.use_cuda:
img_ = img_.cuda() # (bs, 3, h, w)
pre_ = self.model_(img_.float())
outputs = F.softmax(pre_,dim = 1)
outputs = outputs[0]
output = outputs.cpu().detach().numpy()
output = np.array(output)
max_index = np.argmax(output)
score_ = output[max_index]
# print("max_index:",max_index)
# print("name:",self.label_dict[max_index])
return max_index,self.classify_dict[str(max_index)],score_
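A minimal usage sketch for the class above; the weight and image paths are illustrative and must exist on disk, along with the `imagenet_msg.json` label file:

```python
# Hypothetical usage of classify_imagenet_model; paths are illustrative.
import cv2
from classify_imagenet.imagenet_c import classify_imagenet_model

model = classify_imagenet_model(model_arch="resnet_50",
                                model_path="./imagenet_size-256_20210409.pth")
img = cv2.imread("./example.jpg")                  # BGR image, any resolution
max_index, label_msg, score_ = model.predict(img)  # top-1 index, label dict, confidence
print(max_index, label_msg["doc_name"], score_)
```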
This diff is collapsed.
import torch
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000, img_size=224,dropout_factor = 1.):
self.inplanes = 64
self.dropout_factor = dropout_factor
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
# see this issue: https://github.com/xxradon/PytorchToCaffe/issues/16
# self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
assert img_size % 32 == 0
pool_kernel = int(img_size / 32)
self.avgpool = nn.AvgPool2d(pool_kernel, stride=1, ceil_mode=True)
self.dropout = nn.Dropout(self.dropout_factor)  # note: nn.Dropout takes the drop probability; the 1.0 default zeroes activations in train mode but is a no-op in eval()
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.dropout(x)
x = self.fc(x)
return x
def load_model(model, pretrained_state_dict):
model_dict = model.state_dict()
pretrained_dict = {k: v for k, v in pretrained_state_dict.items() if
k in model_dict and model_dict[k].size() == pretrained_state_dict[k].size()}
model.load_state_dict(pretrained_dict, strict=False)
if len(pretrained_dict) == 0:
print("[INFO] No params were loaded ...")
else:
for k, v in pretrained_state_dict.items():
if k in pretrained_dict:
print("==>> Load {} {}".format(k, v.size()))
else:
print("[INFO] Skip {} {}".format(k, v.size()))
return model
def resnet18(pretrained=False, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
# model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
print("Load pretrained model from {}".format(model_urls['resnet18']))
pretrained_state_dict = model_zoo.load_url(model_urls['resnet18'])
model = load_model(model, pretrained_state_dict)
return model
def resnet34(pretrained=False, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
# model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
print("Load pretrained model from {}".format(model_urls['resnet34']))
pretrained_state_dict = model_zoo.load_url(model_urls['resnet34'])
model = load_model(model, pretrained_state_dict)
return model
def resnet50(pretrained=False, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
# model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
print("Load pretrained model from {}".format(model_urls['resnet50']))
pretrained_state_dict = model_zoo.load_url(model_urls['resnet50'])
model = load_model(model, pretrained_state_dict)
return model
def resnet101(pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
# model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
print("Load pretrained model from {}".format(model_urls['resnet101']))
pretrained_state_dict = model_zoo.load_url(model_urls['resnet101'])
model = load_model(model, pretrained_state_dict)
return model
def resnet152(pretrained=False, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
# model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
print("Load pretrained model from {}".format(model_urls['resnet152']))
pretrained_state_dict = model_zoo.load_url(model_urls['resnet152'])
model = load_model(model, pretrained_state_dict)
return model
if __name__ == "__main__":
input = torch.randn([32, 3, 256,256])
model = resnet34(False, num_classes=2, img_size=256)
output = model(input)
print(output.size())
#-*-coding:utf-8-*-
# date:2020-04-11
# author: Eric.Lee
# function: utils
import os
import shutil
import random
import cv2
import numpy as np
import json
def mkdir_(path, flag_rm=False):
if os.path.exists(path):
if flag_rm == True:
shutil.rmtree(path)
os.mkdir(path)
print('remove {} done ~ '.format(path))
else:
os.mkdir(path)
def plot_box(bbox, img, color=None, label=None, line_thickness=None):
tl = line_thickness or round(0.002 * max(img.shape[0:2])) + 1
color = color or [random.randint(0, 255) for _ in range(3)]
c1, c2 = (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3]))
cv2.rectangle(img, c1, c2, color, thickness=tl)# target bbox
if label:
tf = max(tl - 2, 1)
t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] # label size
c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 # label text bbox
cv2.rectangle(img, c1, c2, color, -1) # filled label background rectangle
# draw the label text
cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 4, [225, 255, 255],thickness=tf, lineType=cv2.LINE_AA)
class JSON_Encoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.integer):
return int(obj)
elif isinstance(obj, np.floating):
return float(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return super(JSON_Encoder, self).default(obj)
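`JSON_Encoder` plugs into the standard-library `json` API through the `cls` argument, which is how numpy scalars and arrays become serializable:

```python
# Route numpy types through JSON_Encoder via json's `cls` hook.
data = {"index": np.int64(3), "score": np.float32(0.97), "box": np.arange(4)}
print(json.dumps(data, cls=JSON_Encoder))  # {"index": 3, "score": 0.97..., "box": [0, 1, 2, 3]}
```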
#-*-coding:utf-8-*-
# date:2020-04-11
# author: Eric.Lee
# function : utils
import os
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import random
def get_acc(output, label):
total = output.shape[0]
_, pred_label = output.max(1)
num_correct = (pred_label == label).sum().item()
return num_correct / float(total)
def set_learning_rate(optimizer, lr):
for param_group in optimizer.param_groups:
param_group['lr'] = lr
def set_seed(seed = 666):
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
cudnn.deterministic = True
def split_trainval_datasets(ops):
print(' --------------->>> split_trainval_datasets ')
train_split_datasets = []
train_split_datasets_label = []
val_split_datasets = []
val_split_datasets_label = []
for idx,doc in enumerate(sorted(os.listdir(ops.train_path), key=lambda x:int(x.split('-')[0]), reverse=False)):
# print(' %s label is %s \n'%(doc,idx))
data_list = os.listdir(ops.train_path+doc)
random.shuffle(data_list)
cal_split_num = int(len(data_list)*ops.val_factor)
for i,file in enumerate(data_list):
if '.jpg' in file:
if i < cal_split_num:
val_split_datasets.append(ops.train_path+doc + '/' + file)
val_split_datasets_label.append(idx)
else:
train_split_datasets.append(ops.train_path+doc + '/' + file)
train_split_datasets_label.append(idx)
print(ops.train_path+doc + '/' + file,idx)
print('\n')
print('train_split_datasets len {}'.format(len(train_split_datasets)))
print('val_split_datasets len {}'.format(len(val_split_datasets)))
return train_split_datasets,train_split_datasets_label,val_split_datasets,val_split_datasets_label
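A sketch of how `split_trainval_datasets` might be invoked; note it expects class folders whose names start with an integer index, since it sorts on `int(x.split('-')[0])`:

```python
# Hypothetical invocation; the dataset layout is assumed, e.g.
# ./train_datasets/0-cat/*.jpg, ./train_datasets/1-dog/*.jpg, ...
from argparse import Namespace
ops = Namespace(train_path="./train_datasets/", val_factor=0.1)  # 10% to val
train_x, train_y, val_x, val_y = split_trainval_datasets(ops)
```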
......@@ -236,6 +236,7 @@ class yolo_v3_hand_model(object):
def __init__(self,
model_path = './components/hand_detect/weights/latest_416-2021-02-19.pt',
model_arch = 'yolov3',
yolo_anchor_scale = 1.,
img_size=416,
conf_thres=0.16,
nms_thres=0.4,):
......@@ -250,7 +251,7 @@ class yolo_v3_hand_model(object):
#-----------------------------------------------------------------------
weights = model_path
if "tiny" in model_arch:
a_scalse = 416./img_size
a_scalse = 416./img_size*yolo_anchor_scale
anchors=[(10, 14), (23, 27), (37, 58), (81, 82), (135, 169), (344, 319)]
anchors_new = [ (int(anchors[j][0]/a_scalse),int(anchors[j][1]/a_scalse)) for j in range(len(anchors)) ]
......
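The rescaling above keeps the tiny-YOLO anchors, defined for a 416x416 input, proportional to the configured input size, with `yolo_anchor_scale` as an extra multiplier. A worked example:

```python
# With a 208x208 input the 416-based anchors are simply halved
# (yolo_anchor_scale left at its 1.0 default).
img_size, yolo_anchor_scale = 208, 1.0
a_scalse = 416. / img_size * yolo_anchor_scale   # 2.0
anchors = [(10, 14), (23, 27), (37, 58), (81, 82), (135, 169), (344, 319)]
anchors_new = [(int(w / a_scalse), int(h / a_scalse)) for w, h in anchors]
print(anchors_new)  # [(5, 7), (11, 13), (18, 29), (40, 41), (67, 84), (172, 159)]
```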
detect_model_path=./components/hand_detect/weights/latest_416-2021-02-19.pt
detect_model_arch=yolo_v3
detect_conf_thres=0.5
detect_model_path=./hand_416-2021-01-29.pt
detect_model_arch=yolo
detect_input_size = 416
yolo_anchor_scale=1.
detect_conf_thres=0.4
detect_nms_thres=0.45
handpose_x_model_path=./components/hand_keypoints/weights/ReXNetV1-size-256-wingloss102-0.1063.pth
handpose_x_model_path=./ReXNetV1-size-256-wingloss102-0.1041.pth
handpose_x_model_arch=rexnetv1
classify_model_path=./imagenet_size-256_20210409.pth
classify_model_arch=resnet_50
camera_id = 0
vis_gesture_lines = True
charge_cycle_step = 32
charge_cycle_step = 18
......@@ -241,6 +241,7 @@ def handpose_track_keypoints21_pipeline(img,hands_dict,hands_click_dict,track_in
'''
def audio_recognize(img,algo_img,img_reco_crop,object_recognize_model,info_dict,double_en_pts,flag_click_stable):
# start recognition
reco_msg = None
if (len(double_en_pts) == 2) and (flag_click_stable == True):
x1,y1 = int(double_en_pts[0][0]),int(double_en_pts[0][1])
......@@ -258,12 +259,16 @@ def audio_recognize(img,algo_img,img_reco_crop,object_recognize_model,info_dict,
if ((x2_-x1_)>0) and ((y2_-y1_)>0):
img_reco_crop = cv2.resize(algo_img[y1_:y2_,x1_:x2_,:], (130,130)) # region crop to recognize
print("------------------------>>> start object_recognize_model ")
max_index,label_msg,score_ = object_recognize_model.predict(img_reco_crop)
reco_msg = {"index":max_index,"label_msg":label_msg,"score":score_}
# print(" audio_recognize function ->> reco_msg : ",reco_msg)
info_dict["reco_msg"] = reco_msg
if img_reco_crop is not None: # draw the recognized region crop in the bottom-right corner
h,w,_ = img.shape
img[(h-131):(h-1),(w-131):(w-1),:] = img_reco_crop
cv2.rectangle(img, (w-131,h-131), (w-1,h-1), (225,66,66), 5)
#-----------------------------------------
info_dict["double_en_pts"] = True
cv2.rectangle(img, (x1_,y1_), (x2_,y2_), (225,255,62), 5)
......@@ -272,8 +277,9 @@ def audio_recognize(img,algo_img,img_reco_crop,object_recognize_model,info_dict,
cv2.putText(img, ' recognize{}'.format(""), (x1_,y1_),cv2.FONT_HERSHEY_COMPLEX, 0.65, (0,33,255),1)
else:
info_dict["double_en_pts"] = False
return img_reco_crop
return img_reco_crop,reco_msg
'''
Check whether each hand's click state is stable (the click-stability "charge ring"), i.e. whether the click persists past the configured threshold.
......
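The "charge ring" described above is essentially a debounce counter: each consecutive frame in the click state charges it, and recognition triggers only once the count reaches `charge_cycle_step`. A minimal sketch of the idea, not the project's exact implementation:

```python
# Debounce-counter sketch of the click-stability "charge ring".
class ChargeRing:
    def __init__(self, charge_cycle_step=18):  # cfg value from above
        self.step = charge_cycle_step
        self.charge = 0
    def update(self, clicked):
        self.charge = self.charge + 1 if clicked else 0  # charge or reset
        return self.charge >= self.step                  # click held long enough
```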