Commit e137e406 (unverified), authored by kinghuin, committed via GitHub

add DuDepParser (#741)

Parent 59bd510f
# Command-Line Prediction
```shell
$ hub run ddparser --input_text="百度是一家高科技公司"
```
# API
## parse(texts=[], return_visual=False)
Dependency parsing interface: takes text as input and returns the dependency relations.
**Parameters**
* texts(list[str] or list[list[str]]): the data to be parsed. Each element is either an untokenized string or a list of already segmented tokens.
* return_visual(bool): whether to also return a dependency visualization for each input.
**Returns**
* results(list[dict]): the dependency parsing results. Each element is a dict with the following keys:
```python
{
    'word': list[str], the segmented words
    'head': list[int], the id of each word's head
    'deprel': list[str], the dependency relation between each word and its head
    'prob': list[float], the probability of each dependency relation
    'postag': list[str], the POS tags; this key is only present when the elements of texts are untokenized strings
}
```
* visuals(list[numpy.ndarray]): the dependency visualizations, one image array per input; the list is empty unless return_visual=True. Each array can be displayed with cv2.imshow or saved with cv2.imwrite.
## visualize(word, head, deprel)
Visualization interface: takes the fields returned by the parsing interface and outputs a dependency graph as an image array.
**Parameters**
* word(list[str]): the segmented words.
* head(list[int]): the id of each word's head.
* deprel(list[str]): the dependency relation between each word and its head.
**Returns**
* data(numpy.ndarray): the image array. It can be displayed with cv2.imshow or saved with cv2.imwrite.
**Code example**
```python
import cv2
import paddlehub as hub

module = hub.Module(name="ddparser")

# Untokenized input
test_text = ["百度是一家高科技公司"]
results, visuals = module.parse(texts=test_text)
print(results)

# Pre-tokenized input
test_tokens = [['百度', '是', '一家', '高科技', '公司']]
results, visuals = module.parse(texts=test_tokens)
print(results)

# Visualize the first result and save it as an image
result = results[0]
data = module.visualize(result['word'], result['head'], result['deprel'])
cv2.imwrite('test.jpg', data)
```
# DependencyParser Serving Deployment
PaddleHub Serving can deploy an online dependency parsing service, and its API can be used by online web applications.
## Step 1: Start PaddleHub Serving
Run the start command:
```shell
$ hub serving start -m ddparser
```
The model loading process is shown during startup; once the service has started successfully, it prints
```shell
Loading ddparser successful.
```
The serving API is now deployed, listening on port 8866 by default.
**NOTE:** To run inference on GPU, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise no extra configuration is needed.
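For example, a minimal sketch of a GPU launch (assuming GPU card 0 is the device you want to use; adjust to your environment):
```shell
# Illustrative only: expose GPU 0 to the serving process, then start it
export CUDA_VISIBLE_DEVICES=0
hub serving start -m ddparser
```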
## Step 2: Send a prediction request
With the server configured, the few lines of code below send a prediction request and retrieve the prediction results:
```python
import requests
import json
import numpy as np
import cv2
# Data to be predicted
text = ["百度是一家高科技公司"]
# Set the runtime configuration
return_visual = True
data = {"texts": text, "return_visual": return_visual}
# Specify the prediction method as ddparser and send a POST request; the Content-Type should be set to JSON
url = "http://0.0.0.0:8866/predict/ddparser"
headers = {"Content-Type": "application/json"}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
results, visuals = r.json()['results']
for i in range(len(results)):
    print(results[i])
    # Unlike the local parse interface, serving returns the images as plain (nested) lists, so load them back with numpy before displaying or saving.
    cv2.imwrite('%s.jpg' % i, np.array(visuals[i]))
```
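Alternatively, the same request can be sent from the command line with curl. This is a minimal sketch whose JSON payload mirrors the `data` dict above; `return_visual` is left at its default (False), so no visualization arrays are returned:
```shell
curl -X POST \
     -H "Content-Type: application/json" \
     -d '{"texts": ["百度是一家高科技公司"]}' \
     http://0.0.0.0:8866/predict/ddparser
```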
For more information about PaddleHub Serving, see [Serving Deployment](https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.6/docs/tutorial/serving.md).
### Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.7.0
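A minimal installation sketch that satisfies the requirements above; the ddparser package itself must also be installed, since the module imports it:
```shell
pip install "paddlepaddle>=1.8.2" "paddlehub>=1.7.0"
pip install ddparser
```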
## Update History
* 1.0.0
  Initial release
# -*- coding:utf-8 -*-
import os
import argparse
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from paddle import fluid
import paddlehub as hub
from paddlehub.module.module import serving, moduleinfo, runnable
try:
    from ddparser import DDParser as DDParserModel
except ImportError:
    raise ImportError(
        "The module requires additional dependencies: ddparser. Please run 'pip install ddparser' to install it."
    )

@moduleinfo(
    name="ddparser",
    version="1.0.0",
    summary="Baidu's open-source DDParser model.",
    author="baidu-nlp",
    author_email="",
    type="nlp/syntactic_analysis")
class ddparser(hub.NLPPredictionModule):
    def _initialize(self):
        """
        initialize with the necessary elements
        """
        self.ddp = DDParserModel(prob=True, use_pos=True)
        self.font = font_manager.FontProperties(
            fname=os.path.join(self.directory, "SimHei.ttf"))

    @serving
    def serving_parse(self, texts=[], return_visual=False):
        results, visuals = self.parse(texts, return_visual)
        # Convert the numpy arrays to plain lists so the response can be JSON-serialized.
        for i, visual in enumerate(visuals):
            visuals[i] = visual.tolist()
        return results, visuals

    def parse(self, texts=[], return_visual=False):
        """
        Parse the dependency.

        Args:
            texts(list[str] or list[list[str]]): the input texts to be parsed. Each element is either an untokenized string or a list of tokens.
            return_visual(bool): if set to True, the result will also contain the dependency visualizations.

        Returns:
            results(list[dict]): a list with one element per element of texts. Each element is a dictionary of shape:
                {
                    'word': list[str], the tokenized words.
                    'head': list[int], the head ids.
                    'deprel': list[str], the dependency relations.
                    'prob': list[float], the prediction probability of each dependency relation.
                    'postag': list[str], the POS tags. If the elements of texts are token lists, the key 'postag' is not returned.
                }
            visuals(list[numpy.array]): the dependency visualizations. Use cv2.imshow to show them or cv2.imwrite to save them. If return_visual=False, the list is empty.
        """
        if not texts:
            return [], []
        if all([isinstance(i, str) and i for i in texts]):
            # Untokenized strings: let DDParser segment them first.
            do_parse = self.ddp.parse
        elif all([isinstance(i, list) and i for i in texts]):
            # Pre-tokenized inputs.
            do_parse = self.ddp.parse_seg
        else:
            raise ValueError(
                "All elements of texts must be strings, or all must be token lists.")
        results = do_parse(texts)
        visuals = []
        if return_visual:
            for result in results:
                visuals.append(
                    self.visualize(result['word'], result['head'],
                                   result['deprel']))
        return results, visuals

    @runnable
    def run_cmd(self, argvs):
        """
        Run as a command.
        """
        self.parser = argparse.ArgumentParser(
            description='Run the %s module.' % self.name,
            prog='hub run %s' % self.name,
            usage='%(prog)s',
            add_help=True)
        self.arg_input_group = self.parser.add_argument_group(
            title="Input options", description="Input data. Required")
        self.add_module_input_arg()

        args = self.parser.parse_args(argvs)
        input_data = self.check_input_data(args)
        results, _ = self.parse(texts=input_data)
        return results

    def visualize(self, word, head, deprel):
        """
        Visualize the dependency.

        Args:
            word: list[str], the tokenized words.
            head: list[int], the head ids.
            deprel: list[str], the dependency relations.

        Returns:
            data: a numpy array; use cv2.imshow to show it or cv2.imwrite to save it.
        """
        nodes = ['ROOT'] + word
        x = list(range(len(nodes)))
        y = [0] * (len(nodes))
        fig, ax = plt.subplots()
        # control the picture size
        max_span = max([abs(i + 1 - j) for i, j in enumerate(head)])
        fig.set_size_inches((len(nodes), max_span / 2))
        # set the points
        plt.scatter(x, y, c='w')

        for i in range(len(nodes)):
            txt = nodes[i]
            xytext = (i, 0)
            if i == 0:
                # set 'ROOT'
                ax.annotate(
                    txt,
                    xy=xytext,
                    xycoords='data',
                    xytext=xytext,
                    textcoords='data',
                )
            else:
                xy = (head[i - 1], 0)
                rad = 0.5 if head[i - 1] < i else -0.5
                # set the word
                ax.annotate(
                    txt,
                    xy=xy,
                    xycoords='data',
                    xytext=(xytext[0] - 0.1, xytext[1]),
                    textcoords='data',
                    fontproperties=self.font)
                # draw the curve
                ax.annotate(
                    "",
                    xy=xy,
                    xycoords='data',
                    xytext=xytext,
                    textcoords='data',
                    arrowprops=dict(
                        arrowstyle="<-",
                        shrinkA=12,
                        shrinkB=12,
                        color='blue',
                        connectionstyle="arc3,rad=%s" % rad,
                    ),
                )
                # set the deprel label; its position is derived from the arc radius
                text_x = min(i, head[i - 1]) + abs((i - head[i - 1])) / 2 - 0.2
                text_y = abs((i - head[i - 1])) / 4
                ax.annotate(
                    deprel[i - 1],
                    xy=xy,
                    xycoords='data',
                    xytext=[text_x, text_y],
                    textcoords='data')

        # control the axis
        plt.axis('equal')
        plt.axis('off')

        # save to a numpy array: render the canvas, read back the RGB buffer,
        # reshape it to (height, width, 3) and flip RGB to BGR so that
        # cv2.imwrite saves the colors correctly
        fig.canvas.draw()
        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        data = data.reshape(fig.canvas.get_width_height()[::-1] +
                            (3, ))[:, :, ::-1]
        return data

if __name__ == "__main__":
    module = ddparser()
    # Data to be predicted
    test_text = ["百度是一家高科技公司"]
    results, visuals = module.parse(texts=test_text)
    print(results)

    test_tokens = [['百度', '是', '一家', '高科技', '公司']]
    results, visuals = module.parse(texts=test_tokens)
    print(results)

    # Visualize the first result and save it as an image
    result = results[0]
    data = module.visualize(result['word'], result['head'], result['deprel'])

    # cv2 is only needed here to save the demo visualization
    import cv2
    cv2.imwrite('test.jpg', data)

```diff
@@ -86,7 +86,7 @@ class LocalModuleManager(object):
                 "%s does not exist, the module will be reinstalled" %
                 desc_pb_path)
         except:
-            pass
+            raise
         return False, None

     def all_modules(self, update=False):
```