Human-Computer Interaction with PaddlePaddle + OpenVINO

Reposted from AI Studio
Project link: https://aistudio.baidu.com/aistudio/projectdetail/3525813

Project Demo

Project Overview

  This project is based on HandPose_x. It combines PaddlePaddle with OpenVINO to cover the whole pipeline from training to fast deployment. There is still plenty of room to extend it, and you are welcome to join in and build on it.
  The project detects and tracks hand keypoints to work out the hand's relative position in the real world. That makes it possible to capture part of the camera view, pass that information to the computer for analysis, and so interact with it. Features implemented so far: image classification, text extraction from a selected region, and drawing.

Training

  From the project requirements, the basic modules are: ① hand detection and ② hand keypoint detection; the extended features additionally need ③ classification and ④ OCR.

Object Detection

Dataset

  The dataset is TV-Hand merged with COCO-Hand (the COCO-Hand-Big part). TV-Hand and COCO-Hand official dataset page: link. Because the labels come in YOLO (txt) format, they need to be converted to VOC format.
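
  For reference, each line in a YOLO-format label file stores one box as class, normalized center x, center y, width and height; that is exactly how the conversion script below consumes its five tokens per object. An illustrative line (the class token shown here is an assumption, not taken from the dataset):

hand 0.512 0.430 0.120 0.180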

!mkdir -p train/label
!mv datasets_TVCOCO_hand_train/anno/images train/
from xml.dom.minidom import Document
import os
import cv2
from tqdm import tqdm

def writeXml(tmp, imgname, w, h, objbud):
    doc = Document()
    # owner
    annotation = doc.createElement('annotation')
    doc.appendChild(annotation)
    # owner
    folder = doc.createElement('folder')
    annotation.appendChild(folder)
    folder_txt = doc.createTextNode("train")
    folder.appendChild(folder_txt)

    filename = doc.createElement('filename')
    annotation.appendChild(filename)
    filename_txt = doc.createTextNode(imgname)
    filename.appendChild(filename_txt)
    # ones#
    source = doc.createElement('source')
    annotation.appendChild(source)

    database = doc.createElement('database')
    source.appendChild(database)
    database_txt = doc.createTextNode("Unknown")
    database.appendChild(database_txt)

    # onee#
    # twos#
    size = doc.createElement('size')
    annotation.appendChild(size)

    width = doc.createElement('width')
    size.appendChild(width)
    width_txt = doc.createTextNode(str(w))
    width.appendChild(width_txt)

    height = doc.createElement('height')
    size.appendChild(height)
    height_txt = doc.createTextNode(str(h))
    height.appendChild(height_txt)

    depth = doc.createElement('depth')
    size.appendChild(depth)
    depth_txt = doc.createTextNode("3")
    depth.appendChild(depth_txt)
    # twoe#
    segmented = doc.createElement('segmented')
    annotation.appendChild(segmented)
    segmented_txt = doc.createTextNode("0")
    segmented.appendChild(segmented_txt)

    for i in range(0, int(len(objbud) / 5)):
        # threes#
        object_new = doc.createElement("object")
        annotation.appendChild(object_new)

        name = doc.createElement('name')
        object_new.appendChild(name)
        name_txt = doc.createTextNode(objbud[i * 5])
        name.appendChild(name_txt)

        pose = doc.createElement('pose')
        object_new.appendChild(pose)
        pose_txt = doc.createTextNode("Unspecified")
        pose.appendChild(pose_txt)

        truncated = doc.createElement('truncated')
        object_new.appendChild(truncated)
        truncated_txt = doc.createTextNode("0")
        truncated.appendChild(truncated_txt)

        difficult = doc.createElement('difficult')
        object_new.appendChild(difficult)
        difficult_txt = doc.createTextNode("0")
        difficult.appendChild(difficult_txt)
        # threes-1#
        bndbox = doc.createElement('bndbox')
        object_new.appendChild(bndbox)

        xmin = doc.createElement('xmin')
        bndbox.appendChild(xmin)
        xmin_txt = doc.createTextNode(str(objbud[i * 5 + 1]))
        xmin.appendChild(xmin_txt)

        ymin = doc.createElement('ymin')
        bndbox.appendChild(ymin)
        ymin_txt = doc.createTextNode(str(objbud[i * 5 + 2]))
        ymin.appendChild(ymin_txt)

        xmax = doc.createElement('xmax')
        bndbox.appendChild(xmax)
        xmax_txt = doc.createTextNode(str(objbud[i * 5 + 3]))
        xmax.appendChild(xmax_txt)

        ymax = doc.createElement('ymax')
        bndbox.appendChild(ymax)
        ymax_txt = doc.createTextNode(str(objbud[i * 5 + 4]))
        ymax.appendChild(ymax_txt)
        # threee-1#
        # threee#

    tempfile = tmp + imgname.split(".")[0] + ".xml"
    with open(tempfile, "w") as f:
        doc.writexml(f, indent='', addindent='\t', newl='\n', encoding='utf-8')

    return

image_path = "./train/images/"
txt_label_path = "./datasets_TVCOCO_hand_train/anno/labels/"
xml_label_path = "./train/label/"
image_name = os.listdir(image_path)

for name in tqdm(image_name):
    if ".jpg" in name:
        image = cv2.imread(os.path.join(image_path, name))
        height, width, _ = image.shape
        txt_path = os.path.join(txt_label_path, name.split(".")[0]+".txt")
        obj1 = []
        obj2 = []
        with open(txt_path, "r") as f:
            data = f.readlines()
            for line in data:
                for line_data in line.split("\n")[0].split(" "):
                    obj1.append(line_data)
        for i in range(int(len(obj1)/5)):
            # YOLO gives (class, cx, cy, w, h) normalized; convert to corner coordinates
            x_min = float(obj1[i * 5 + 1]) - 0.5 * float(obj1[i * 5 + 3])
            y_min = float(obj1[i * 5 + 2]) - 0.5 * float(obj1[i * 5 + 4])
            x_max = float(obj1[i * 5 + 1]) + 0.5 * float(obj1[i * 5 + 3])
            y_max = float(obj1[i * 5 + 2]) + 0.5 * float(obj1[i * 5 + 4])
            # skip boxes with any negative coordinate
            if x_min < 0 or y_min < 0 or x_max < 0 or y_max < 0:
                continue
            obj2.append(obj1[i * 5])
            obj2.append(int(x_min * width))
            obj2.append(int(y_min * height))
            obj2.append(int(x_max * width))
            obj2.append(int(y_max * height))
        if len(obj2) != 0:
            writeXml(xml_label_path, name, width, height, obj2)
    else:
        continue

  Build the train/test file lists for the dataset.

import os

image_dir = "train/images/"
xml_dir = "train/label/"
xml_path = os.listdir(xml_dir)
f_train = open("train/train.txt", "w")
f_test = open("train/test.txt", "w")

for i in range(len(xml_path)):
    if (i % 100) != 0:
        f_train.write(image_dir + xml_path[i].split(".")[0] + ".jpg "+xml_dir+xml_path[i]+"\n")
    else:
        f_test.write(image_dir + xml_path[i].split(".")[0] + ".jpg "+xml_dir+xml_path[i]+"\n")
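
  The PaddleX training further down also expects a train/labels.txt listing the class names, which the split script above does not create. A minimal sketch, assuming the only class in the converted labels is hand:

# write the class-name list expected by pdx.datasets.VOCDetection
# (assumption: the single object class is named "hand")
with open("train/labels.txt", "w") as f:
    f.write("hand\n")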

Training the detection model with PaddleDetection

  PaddleDetection is an end-to-end object detection toolkit built on PaddlePaddle. It provides mainstream detection, instance segmentation, tracking, and keypoint detection algorithms, configurable network modules, data augmentation strategies, and loss functions, offers industrial-grade SOTA models for both server and mobile, and integrates model compression and high-performance cross-platform deployment, helping developers finish the full end-to-end workflow faster and better.
  At first I trained PicoDet in PaddleDetection (the PaddleDetection docs do say PicoDet supports OpenVINO), but when it came to OpenVINO deployment I ran into missing operators and had to drop it. Checking the OpenVINO documentation, I found sample code supporting PaddlePaddle YOLOv3/PP-YOLO models, so for this first attempt I picked YOLOv3, and in the end it deployed successfully through OpenVINO. If you want to train and deploy PicoDet, you can fall back on Paddle's native Inference engine; the Inference code is given later. OpenVINO support would still be very welcome, though: the acceleration from plugging in an Intel inference stick would let a lightweight model like PicoDet play to its strengths even better on edge devices.

!git clone https://gitee.com/paddlepaddle/PaddleDetection.git
!pip install -r PaddleDetection/requirements.txt
%env CUDA_VISIBLE_DEVICES=0
!python PaddleDetection/tools/train.py -c config/yolo.yml --eval

Training the detection model with PaddleX

  Alternatively, you can train with PaddleX. PaddleX bundles part of PaddleDetection's detection models and makes training simpler and more convenient, which makes it a good choice for beginners.

!pip install paddlex==2.1.0
import paddlex as pdx
from paddlex import transforms as T
train_transforms = T.Compose([
    T.MixupImage(mixup_epoch=250), T.RandomDistort(),
    T.RandomExpand(im_padding_value=[123.675, 116.28, 103.53]), T.RandomCrop(),
    T.RandomHorizontalFlip(), T.BatchRandomResize(
        target_sizes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608],
        interp='RANDOM'), T.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

eval_transforms = T.Compose([
    T.Resize(
        608, interp='CUBIC'), T.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
train_dataset = pdx.datasets.VOCDetection(
    data_dir='./',
    file_list='train/train.txt',
    label_list='train/labels.txt',
    transforms=train_transforms,
    shuffle=True)

eval_dataset = pdx.datasets.VOCDetection(
    data_dir='./',
    file_list='train/test.txt',
    label_list='train/labels.txt',
    transforms=eval_transforms,
    shuffle=False)
num_classes = len(train_dataset.labels)
model = pdx.det.YOLOv3(num_classes=num_classes, backbone='MobileNetV3', nms_topk=500, nms_keep_topk=50)
model.train(
    num_epochs=270,
    train_dataset=train_dataset,
    train_batch_size=32,
    eval_dataset=eval_dataset,
    pretrain_weights='COCO',
    learning_rate=.001,
    warmup_steps=1000,
    warmup_start_lr=0.0,
    save_interval_epochs=5,
    lr_decay_epochs=[70, 140, 210],
    use_ema=True,
    save_dir='output/yolo')

Hand Keypoints

HandPose_x

  You can refer to the PaddlePaddle implementation of HandPose_x. Download the trained model from that project, then export it in the inference-model export step below.


PaddleHub

  PaddleHub offers a large collection of pretrained models from the PaddlePaddle ecosystem, with model management and one-line prediction. Together with the Fine-tune API, you can quickly do transfer learning on top of large-scale pretrained models and adapt them to your own application scenarios. It already contains a ready-made hand keypoint detection model, hand_pose_localization, which can be downloaded directly (and used directly for deployment).

  The model can be downloaded with hub install; once the download finishes it prints where the files were placed, and you can copy the model out from there.

  The model was contributed by Xiao, so you can also grab it from his project directly. Project link: project.

!hub install hand_pose_localization==1.0.1
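
  If you just want to sanity-check the model before copying its files out, it can also be called through PaddleHub's Python API. A minimal sketch, assuming the module exposes the usual keypoint_detection interface:

import cv2
import paddlehub as hub

# load the installed module and run it on a test image
# (assumption: hand_pose_localization provides keypoint_detection(images=..., visualization=...))
model = hub.Module(name="hand_pose_localization")
result = model.keypoint_detection(images=[cv2.imread("test/left.jpg")], visualization=False)
print(result)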

Exporting Models for OpenVINO and Paddle Inference

Object Detection

PaddleDetection

  The export steps follow the PaddleDetection deployment model export tutorial.

  Here we export both the Paddle inference model and the ONNX model (the latter is used for OpenVINO deployment).

!pip install paddle2onnx
!pip install onnx
!python PaddleDetection/tools/export_model.py -c config/yolo.yml \
                                                -o TestReader.inputs_def.image_shape=[1,3,608,608] \
                                                --output_dir inference_model
!paddle2onnx \
    --model_dir inference_model/inference_model \
    --model_filename model.pdmodel \
    --params_filename model.pdiparams \
    --save_file yolo.onnx \
    --opset_version 11 \
    --enable_onnx_checker True

PaddleX

  The export steps follow the PaddleX deployment model export tutorial.

!paddlex --export_inference --model_dir=./model/ --save_dir=./inference_model --fixed_input_shape=[1,3,608,608]
!paddle2onnx \
    --model_dir inference_model/inference_model \
    --model_filename model.pdmodel \
    --params_filename model.pdiparams \
    --save_file yolo.onnx \
    --opset_version 11 \
    --enable_onnx_checker True
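
  Before moving on, it helps to confirm that OpenVINO can actually read and compile the exported ONNX file; unsupported operators (like the PicoDet issue mentioned earlier) surface at this stage. A minimal sketch using the same IECore API as the deployment code below:

from openvino.inference_engine import IECore

# try to read and compile the exported model on CPU;
# missing operators will raise an exception here
ie = IECore()
net = ie.read_network("yolo.onnx")
exec_net = ie.load_network(net, "CPU")
print("inputs:", list(net.input_info.keys()))
print("outputs:", list(net.outputs.keys()))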

Hand Keypoint Model

HandPose_x

  The ONNX model can be exported with the paddle.onnx.export API.

import paddle
from resnet50 import resnet50

model_ = resnet50(num_classes=42, img_size=256)
model_.eval()  # switch to inference mode
model_path = '848resnet_50-model_epoch-9.pdparams'  # trained model weights
ckpd = paddle.load(model_path)
model_.set_state_dict(ckpd)
x_spec = paddle.static.InputSpec(shape=[1, 3, 256, 256], dtype='float32')
paddle.onnx.export(model_, 'posehand', input_spec=[x_spec])

Inference Deployment

Object Detection (OpenVINO)

from openvino.inference_engine import IENetwork, IECore
import cv2
import numpy as np

class OpenvinoHandDetectModel(object):
    def __init__(self, crop_size=[608, 608], k_top=2):
        self.model = OpenvinoHandDetectInference(crop_size=crop_size, k_top=k_top)
        self.crop_size = crop_size
    def predict(self, img_cv2, threshold):
        h, w, _ = img_cv2.shape
        output = self.model.forward(img_cv2)
        hands_list = []
        if len(output) > 0:
            if output[0][1] > threshold:
                for i in range(len(output)):
                    if output[i][1] > threshold:
                        x1 = int(output[i][2] / self.crop_size[0] * w)
                        y1 = int(output[i][3] / self.crop_size[0] * h)
                        x2 = int(output[i][4] / self.crop_size[0] * w)
                        y2 = int(output[i][5] / self.crop_size[0] * h)
                        hands_list.append([img_cv2[y1:y2, x1:x2], x1, y1, x2, y2])
                        cv2.rectangle(img_cv2, (x1, y1), (x2, y2), [0, 0, 255], thickness=2)
        return hands_list

class OpenvinoHandDetectInference(object):
    def __init__(self, model_path="./onnx/model.onnx", crop_size=[608, 608], k_top=2, device="CPU"):
        ie = IECore()
        net = ie.read_network(model_path)
        net.reshape({'image': [1, 3, crop_size[0], crop_size[1]], 'im_shape': [1, 2], 'scale_factor': [1, 2]})
        self.exec_net = ie.load_network(net, device)
        self.crop_size = crop_size
        self.k_top = k_top

    def forward(self, src_img):
        test_image = handle(src_img, self.crop_size)
        test_im_shape = np.array([[608, 608]]).astype('float32')
        test_scale_factor = np.array([[1, 1]]).astype('float32')
        inputs_dict = {'image': test_image, "im_shape": test_im_shape,
                       "scale_factor": test_scale_factor}
        output = self.exec_net.infer(inputs_dict)
        output_data = list(output.values())

        return output_data[:self.k_top]

def normalize(src_img, mean, std):
    src_img = src_img.astype(np.float32, copy=False)
    mean = np.array(mean)[np.newaxis, np.newaxis, :]
    std = np.array(std)[np.newaxis, np.newaxis, :]
    src_img = src_img / 255.0
    src_img -= mean
    src_img /= std

    return src_img

def handle(src_img, crop_size):
    src_img = cv2.resize(src_img, (crop_size[0], crop_size[1]))
    src_img = normalize(src_img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    src_img = src_img.transpose([2, 0, 1])
    tensor_img = src_img[None, :].astype("float32")

    return tensor_img
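
  A quick single-image check of the OpenVINO detector defined above. It assumes the exported yolo.onnx has been copied to ./onnx/model.onnx (the default path in OpenvinoHandDetectInference); the test image and threshold are only examples:

# run the OpenVINO hand detector on one image and show the drawn boxes
detector = OpenvinoHandDetectModel(crop_size=[608, 608])
img = cv2.imread("test/left.jpg")
hands = detector.predict(img, 0.5)
print("hands found:", len(hands))
cv2.imshow("detections", img)
cv2.waitKey(0)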

Object Detection (Paddle Inference)

import cv2
import numpy as np
import paddle.inference as inference

class HandDetectModel(object):
    def __init__(self, crop_size=[608, 608], k_top=2):
        self.model = HandDetectInference(crop_size=crop_size, k_top=k_top)
        self.crop_size = crop_size
    def predict(self, img_cv2, threshold):
        h, w, _ = img_cv2.shape
        output = self.model.forward(img_cv2)
        hands_list = []
        if len(output) > 0:
            if output[0][1] > threshold:
                for i in range(len(output)):
                    if output[i][1] > threshold:
                        x1 = int(output[i][2] / self.crop_size[0] * w)
                        y1 = int(output[i][3] / self.crop_size[0] * h)
                        x2 = int(output[i][4] / self.crop_size[0] * w)
                        y2 = int(output[i][5] / self.crop_size[0] * h)
                        hands_list.append([img_cv2[y1:y2, x1:x2], x1, y1, x2, y2])
                        cv2.rectangle(img_cv2, (x1, y1), (x2, y2), [0, 0, 255], thickness=2)
        return hands_list

class HandDetectInference(object):
    def __init__(self, model_path="./inference_model/model.pdmodel", param_path="./inference_model/model.pdiparams", crop_size=[512, 512], k_top=2):
        self.config = inference.Config(model_path, param_path)
        self.predictor = inference.create_predictor(self.config)
        self.crop_size = crop_size
        self.k_top = k_top

    def forward(self, src_img):
        # the exported YOLOv3 model takes im_shape, image and scale_factor (float32)
        input_names = self.predictor.get_input_names()
        input_handle = self.predictor.get_input_handle(input_names[0])
        input_handle.copy_from_cpu(np.array([self.crop_size, ]).astype("float32"))
        input_handle = self.predictor.get_input_handle(input_names[1])
        input_handle.copy_from_cpu(handle(src_img, self.crop_size))
        input_handle = self.predictor.get_input_handle(input_names[2])
        input_handle.copy_from_cpu(np.array([[1, 1], ]).astype("float32"))
        output_names = self.predictor.get_output_names()
        output_handle = self.predictor.get_output_handle(output_names[0])

        self.predictor.run()
        output_data = output_handle.copy_to_cpu()

        return output_data[:self.k_top]

def normalize(src_img, mean, std):
    src_img = src_img.astype(np.float32, copy=False)
    mean = np.array(mean)[np.newaxis, np.newaxis, :]
    std = np.array(std)[np.newaxis, np.newaxis, :]
    src_img = src_img / 255.0
    src_img -= mean
    src_img /= std

    return src_img

def handle(src_img, crop_size):
    src_img = cv2.resize(src_img, (crop_size[0], crop_size[1]))
    src_img = normalize(src_img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    src_img = src_img.transpose([2, 0, 1])
    tensor_img = src_img[None, :].astype("float32")

    return tensor_img

Hand Keypoint Model

import cv2
import numpy as np
from paddle.inference import Config
from paddle.inference import create_predictor

class KeypointInferenceModel(object):
    def __init__(self):
        self.config = Config("model/__model__", "model/__params__")
        self.predictor = create_predictor(self.config)

    def forward(self, inpBlob):
        input_names = self.predictor.get_input_names()
        input_handle = self.predictor.get_input_handle(input_names[0])
        output_names = self.predictor.get_output_names()
        output_handle = self.predictor.get_output_handle(output_names[0])
        input_handle.copy_from_cpu(inpBlob)
        self.predictor.run()
        output_data = output_handle.copy_to_cpu()

        return output_data

class KeypointModel(object):
    # initialization
    def __init__(self):
        self.num_points = 21
        self.inHeight = 368
        self.threshold = 0.1
        self.point_pairs = [[0, 1], [1, 2], [2, 3], [3, 4],
                            [0, 5], [5, 6], [6, 7], [7, 8],
                            [0, 9], [9, 10], [10, 11], [11, 12],
                            [0, 13], [13, 14], [14, 15], [15, 16],
                            [0, 17], [17, 18], [18, 19], [19, 20]]
        self.model = KeypointInferenceModel()

    # model inference and keypoint prediction
    def predict(self, img_cv2):
        # image preprocessing
        img_height, img_width, _ = img_cv2.shape
        aspect_ratio = img_width / img_height
        inWidth = int(((aspect_ratio * self.inHeight) * 8) // 8)
        inpBlob = cv2.dnn.blobFromImage(img_cv2, 1.0 / 255, (inWidth, self.inHeight), (0, 0, 0), swapRB=False,
                                        crop=False)

        # model inference
        output = self.model.forward(inpBlob)

        # keypoint extraction
        points = []
        for idx in range(self.num_points):
            # confidence map
            probMap = output[0, idx, :, :]
            probMap = cv2.resize(probMap, (img_width, img_height))

            # Find global maxima of the probMap.
            minVal, prob, minLoc, point = cv2.minMaxLoc(probMap)

            if prob > self.threshold:
                points.append((int(point[0]), int(point[1])))
            else:
                points.append(None)

        return points

    # hand pose visualization
    def vis_pose(self, img_cv2, points, clas_hand):
        img_cv2_copy = np.copy(img_cv2)
        for idx in range(len(points)):
            if points[idx]:
                cv2.circle(img_cv2_copy, points[idx], 3, (0, 255, 255), thickness=-1,
                           lineType=cv2.FILLED)
                cv2.putText(img_cv2_copy, "{}".format(idx), points[idx], cv2.FONT_HERSHEY_SIMPLEX,
                            1, (0, 0, 255), 2, lineType=cv2.LINE_AA)

        # Draw skeleton
        for pair in self.point_pairs:
            partA = pair[0]
            partB = pair[1]

            if points[partA] and points[partB]:
                cv2.line(img_cv2, points[partA], points[partB], (0, 255, 255), 2)
                cv2.circle(img_cv2, points[partA], 3, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
                cv2.circle(img_cv2, points[partB], 3, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)

        # return the fingertip keypoint used for interaction (20 for the left hand, 8 for the right hand)
        if clas_hand == "left" and points[20]:
            return points[20]
        elif clas_hand == "right" and points[8]:
            return points[8]
        else:
            return None

if __name__ == '__main__':
    pose_model = KeypointModel()
    frame = cv2.imread('test/left.jpg')
    res_points = pose_model.predict(frame)
    pose_model.vis_pose(frame, res_points, "left")
    cv2.imshow("video", frame)
    cv2.waitKey(0)

Interaction Strategy

  ① Detect the hands with the object detector and crop out the regions containing them;

  ② Feed the crops from ① into the keypoint detection model;

  ③ Because we already have detections, each hand can be tracked with simple IoU matching; if a hand stays in roughly the same region long enough, we assume the user wants to interact;

  ④ Set the interaction flag to True, then use the keypoints: the region enclosed by the two index fingertips is cropped for classification or OCR (or, with a single hand, drawing starts at the index fingertip; if the fingertip moves too fast the stroke is considered finished and drawing stops).

  The extended features mentioned in ④ can be plugged in at the clas_flag branch (the commented spot in the code). The compute_iou helper and the config values imported by the loops are sketched right below.
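
  The loops below do from util import compute_iou and from config import *, neither of which is shown in this post. A minimal sketch of both, with purely illustrative values (these thresholds are assumptions to tune, not the author's settings):

# util.py -- IoU of two boxes given as [x1, y1, x2, y2]
def compute_iou(box_a, box_b):
    ix1 = max(box_a[0], box_b[0])
    iy1 = max(box_a[1], box_b[1])
    ix2 = min(box_a[2], box_b[2])
    iy2 = min(box_a[3], box_b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0

# config.py -- thresholds and tracking state used by the interaction loops (illustrative values)
crop_size = [608, 608]        # detector input size
detect_threshold = 0.5        # detection confidence threshold
iou_threshold = 0.6           # IoU above which a hand counts as "staying still"
iou_times_threshold = 30      # still frames before the interaction ring is armed
clas_times_threshold = 10     # still frames before clas_flag can be set
old_box_l = [0, 0, 1, 1]      # previous left-hand box
old_box_r = [0, 0, 1, 1]      # previous right-hand box
iou_times = 0                 # consecutive still frames
iou_flag = False              # interaction armed
iou_flag_times = 0            # frames spent filling the progress ring
clas_flag = False             # trigger the extension (classification / OCR / drawing)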

OpenVINO version

from keypoint import KeypointModel
from detect import OpenvinoHandDetectModel
from util import compute_iou
from config import *
import cv2

handdetectmodel = OpenvinoHandDetectModel(crop_size=crop_size)
keypointmodel = KeypointModel()

video = cv2.VideoCapture(0)
ret, frame = video.read()
h, w, _ = frame.shape
while ret:
    hands_list = handdetectmodel.predict(frame, detect_threshold)
    if len(hands_list) == 1:
        # hand center in the right half of the frame is treated as the right hand
        if (hands_list[0][1] + hands_list[0][3]) > w:
            clas_hand = "right"
        else:
            clas_hand = "left"
        single_hand = hands_list[0][0]
        res_points = keypointmodel.predict(single_hand)
        point = keypointmodel.vis_pose(single_hand, res_points, clas_hand)
        frame[hands_list[0][2]:hands_list[0][4], hands_list[0][1]:hands_list[0][3]] = single_hand
    elif len(hands_list) == 2:
        if (hands_list[0][1] + hands_list[0][3]) > (hands_list[1][1] + hands_list[1][3]):
            clas_hand_1 = "right"
            clas_hand_2 = "left"
            new_box_l = [hands_list[0][1], hands_list[0][2], hands_list[0][3], hands_list[0][4]]
            new_box_r = [hands_list[1][1], hands_list[1][2], hands_list[1][3], hands_list[1][4]]
        else:
            clas_hand_1 = "left"
            clas_hand_2 = "right"
            new_box_r = [hands_list[0][1], hands_list[0][2], hands_list[0][3], hands_list[0][4]]
            new_box_l = [hands_list[1][1], hands_list[1][2], hands_list[1][3], hands_list[1][4]]
        if compute_iou(new_box_l, old_box_l) > iou_threshold and \
                compute_iou(new_box_r, old_box_r) > iou_threshold:
            iou_times += 1
            if iou_times > iou_times_threshold:
                iou_flag = True
            elif iou_times > clas_times_threshold:
                clas_flag = True
        else:
            iou_times = 0
            iou_flag = False
            iou_flag_times = 0
            clas_flag = False
        old_box_l = new_box_l
        old_box_r = new_box_r
        single_hand_1 = hands_list[0][0]
        res_points_1 = keypointmodel.predict(single_hand_1)
        point1 = keypointmodel.vis_pose(single_hand_1, res_points_1, clas_hand_1)
        frame[hands_list[0][2]:hands_list[0][4], hands_list[0][1]:hands_list[0][3]] = single_hand_1
        single_hand_2 = hands_list[1][0]
        res_points_2 = keypointmodel.predict(single_hand_2)
        point2 = keypointmodel.vis_pose(single_hand_2, res_points_2, clas_hand_2)
        frame[hands_list[1][2]:hands_list[1][4], hands_list[1][1]:hands_list[1][3]] = single_hand_2
        if iou_flag and not clas_flag:
            if point1 != None and point2 != None:
                if clas_hand_1 == "left":
                    cv2.ellipse(frame, (point1[0]+hands_list[0][1], point1[1]+hands_list[0][2]), (12, 12), 0, 0, int(min(8 * iou_flag_times, 360)), (255, 255, 0), thickness=2)
                    cv2.ellipse(frame, (point2[0]+hands_list[1][1], point2[1]+hands_list[1][2]), (12, 12), 0, 0, int(min(8 * iou_flag_times, 360)), (255, 255, 0), thickness=2)
                iou_flag_times += 1
        elif clas_flag:
            # extension point: plug in classification / OCR / drawing here
            pass
    else:
        pass
    # show the annotated frame and grab the next one
    cv2.imshow("frame", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
    ret, frame = video.read()

Paddle Inference version

from keypoint import KeypointModel
from detect import HandDetectModel
from util import compute_iou
from config import *
import cv2

handdetectmodel = HandDetectModel(crop_size=crop_size)
keypointmodel = KeypointModel()

video = cv2.VideoCapture(0)
ret, frame = video.read()
h, w, _ = frame.shape
while ret:
    hands_list = handdetectmodel.predict(frame, detect_threshold)
    if len(hands_list) == 1:
        # hand center in the right half of the frame is treated as the right hand
        if (hands_list[0][1] + hands_list[0][3]) > w:
            clas_hand = "right"
        else:
            clas_hand = "left"
        single_hand = hands_list[0][0]
        res_points = keypointmodel.predict(single_hand)
        point = keypointmodel.vis_pose(single_hand, res_points, clas_hand)
        frame[hands_list[0][2]:hands_list[0][4], hands_list[0][1]:hands_list[0][3]] = single_hand
    elif len(hands_list) == 2:
        if (hands_list[0][1] + hands_list[0][3]) > (hands_list[1][1] + hands_list[1][3]):
            clas_hand_1 = "right"
            clas_hand_2 = "left"
            new_box_l = [hands_list[0][1], hands_list[0][2], hands_list[0][3], hands_list[0][4]]
            new_box_r = [hands_list[1][1], hands_list[1][2], hands_list[1][3], hands_list[1][4]]
        else:
            clas_hand_1 = "left"
            clas_hand_2 = "right"
            new_box_r = [hands_list[0][1], hands_list[0][2], hands_list[0][3], hands_list[0][4]]
            new_box_l = [hands_list[1][1], hands_list[1][2], hands_list[1][3], hands_list[1][4]]
        if compute_iou(new_box_l, old_box_l) > iou_threshold and \
                compute_iou(new_box_r, old_box_r) > iou_threshold:
            iou_times += 1
            if iou_times > iou_times_threshold:
                iou_flag = True
            elif iou_times > clas_times_threshold:
                clas_flag = True
        else:
            iou_times = 0
            iou_flag = False
            iou_flag_times = 0
            clas_flag = False
        old_box_l = new_box_l
        old_box_r = new_box_r
        single_hand_1 = hands_list[0][0]
        res_points_1 = keypointmodel.predict(single_hand_1)
        point1 = keypointmodel.vis_pose(single_hand_1, res_points_1, clas_hand_1)
        frame[hands_list[0][2]:hands_list[0][4], hands_list[0][1]:hands_list[0][3]] = single_hand_1
        single_hand_2 = hands_list[1][0]
        res_points_2 = keypointmodel.predict(single_hand_2)
        point2 = keypointmodel.vis_pose(single_hand_2, res_points_2, clas_hand_2)
        frame[hands_list[1][2]:hands_list[1][4], hands_list[1][1]:hands_list[1][3]] = single_hand_2
        if iou_flag and not clas_flag:
            if point1 != None and point2 != None:
                if clas_hand_1 == "left":
                    cv2.ellipse(frame, (point1[0]+hands_list[0][1], point1[1]+hands_list[0][2]), (12, 12), 0, 0, int(min(8 * iou_flag_times, 360)), (255, 255, 0), thickness=2)
                    cv2.ellipse(frame, (point2[0]+hands_list[1][1], point2[1]+hands_list[1][2]), (12, 12), 0, 0, int(min(8 * iou_flag_times, 360)), (255, 255, 0), thickness=2)
                iou_flag_times += 1
        elif clas_flag:
            # extension point: plug in classification / OCR / drawing here
            pass
    else:
        pass
    # show the annotated frame and grab the next one
    cv2.imshow("frame", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
    ret, frame = video.read()

About Me

My WeChat Official Account

    On the official account I share my AI Studio project notes, and it also posts deep learning content from time to time: fun applications, paper readings and reproductions, close readings of deep learning books, and the stories and thinking behind the projects I publish on AI Studio. You are welcome to follow!

About the Author

School: Harbin Institute of Technology (Shenzhen), third-year undergraduate
Interests (main account): image and video, reinforcement learning, point clouds
Interests (alt account): text and speech processing
Personal interests: I enjoy fun things and open-source simple, beginner-friendly projects; feel free to come by and fork them
Homepage: main account homepage / alt account homepage
Email: firewhitefox@qq.com
WeChat official account: Hello Neural Networks