Commit e137e406 (unverified), authored by kinghuin, committed via GitHub

add DuDepParser (#741)

Parent 59bd510f
# Command-Line Prediction
```shell
$ hub run ddparser --input_text="百度是一家高科技公司"
```
# API
## parse(texts=[], return_visual=False)
Dependency parsing interface: takes text as input and returns the dependency relations.
**Parameters**
* texts(list[str] or list[list[str]]): the data to be parsed. Each element is either an untokenized string or a list of already segmented tokens.
* return_visual(bool): whether to also return a dependency visualization for each input.
**Returns**
* results(list[dict]): the dependency parsing results. Each element is a dict with the following keys:
```python
{
    'word': list[str], the segmented words
    'head': list[int], the id of each word's head
    'deprel': list[str], the dependency relation between each word and its head
    'prob': list[float], the probability of each dependency relation
    'postag': list[str], the POS tags; this key is only present when the elements of texts are untokenized strings
}
```
* visuals(list[numpy.ndarray]): the dependency visualizations, one image array per input; the list is empty unless return_visual=True. Each array can be displayed with cv2.imshow or saved with cv2.imwrite.
## visualize(word, head, deprel)
Visualization interface: takes the fields returned by the parsing interface and outputs a dependency graph as an image array.
**Parameters**
* word(list[str]): the segmented words.
* head(list[int]): the id of each word's head.
* deprel(list[str]): the dependency relation between each word and its head.
**Returns**
* data(numpy.ndarray): the image array. It can be displayed with cv2.imshow or saved with cv2.imwrite.
**Code example**
```python
import cv2
import paddlehub as hub

module = hub.Module(name="ddparser")

# Untokenized input
test_text = ["百度是一家高科技公司"]
results, visuals = module.parse(texts=test_text)
print(results)

# Pre-tokenized input
test_tokens = [['百度', '是', '一家', '高科技', '公司']]
results, visuals = module.parse(texts=test_tokens)
print(results)

# Visualize the first result and save it as an image
result = results[0]
data = module.visualize(result['word'], result['head'], result['deprel'])
cv2.imwrite('test.jpg', data)
```
# DependencyParser Serving Deployment
PaddleHub Serving can deploy an online dependency parsing service, and its API can be used by online web applications.
## Step 1: Start PaddleHub Serving
Run the start command:
```shell
$ hub serving start -m ddparser
```
The model loading process is shown during startup; once the service has started successfully, it prints
```shell
Loading ddparser successful.
```
The serving API is now deployed, listening on port 8866 by default.
**NOTE:** To run inference on GPU, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise no extra configuration is needed.
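For example, a minimal sketch of a GPU launch (assuming GPU card 0 is the device you want to use; adjust to your environment):
```shell
# Illustrative only: expose GPU 0 to the serving process, then start it
export CUDA_VISIBLE_DEVICES=0
hub serving start -m ddparser
```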
## Step 2: Send a prediction request
With the server configured, the few lines of code below send a prediction request and retrieve the prediction results:
```python
import requests
import json
import numpy as np
import cv2
# Data to be predicted
text = ["百度是一家高科技公司"]
# Set the runtime configuration
return_visual = True
data = {"texts": text, "return_visual": return_visual}
# Specify the prediction method as ddparser and send a POST request; the Content-Type should be set to JSON
url = "http://0.0.0.0:8866/predict/ddparser"
headers = {"Content-Type": "application/json"}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
results, visuals = r.json()['results']
for i in range(len(results)):
    print(results[i])
    # Unlike the local parse interface, serving returns the images as plain (nested) lists, so load them back with numpy before displaying or saving.
    cv2.imwrite('%s.jpg' % i, np.array(visuals[i]))
```
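Alternatively, the same request can be sent from the command line with curl. This is a minimal sketch whose JSON payload mirrors the `data` dict above; `return_visual` is left at its default (False), so no visualization arrays are returned:
```shell
curl -X POST \
     -H "Content-Type: application/json" \
     -d '{"texts": ["百度是一家高科技公司"]}' \
     http://0.0.0.0:8866/predict/ddparser
```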
For more information about PaddleHub Serving, see [Serving Deployment](https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.6/docs/tutorial/serving.md).
### Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.7.0
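A minimal installation sketch that satisfies the requirements above; the ddparser package itself must also be installed, since the module imports it:
```shell
pip install "paddlepaddle>=1.8.2" "paddlehub>=1.7.0"
pip install ddparser
```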
## Update History
* 1.0.0
  Initial release
# -*- coding:utf-8 -*-
import os
import argparse
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from paddle import fluid
import paddlehub as hub
from paddlehub.module.module import serving, moduleinfo, runnable
try:
    from ddparser import DDParser as DDParserModel
except ImportError:
    raise ImportError(
        "The module requires additional dependencies: ddparser. Please run 'pip install ddparser' to install it."
    )

@moduleinfo(
    name="ddparser",
    version="1.0.0",
    summary="Baidu's open-source DDParser model.",
    author="baidu-nlp",
    author_email="",
    type="nlp/syntactic_analysis")
class ddparser(hub.NLPPredictionModule):
    def _initialize(self):
        """
        initialize with the necessary elements
        """
        self.ddp = DDParserModel(prob=True, use_pos=True)
        self.font = font_manager.FontProperties(
            fname=os.path.join(self.directory, "SimHei.ttf"))

    @serving
    def serving_parse(self, texts=[], return_visual=False):
        results, visuals = self.parse(texts, return_visual)
        # Convert the numpy arrays to plain lists so the response can be JSON-serialized.
        for i, visual in enumerate(visuals):
            visuals[i] = visual.tolist()
        return results, visuals

    def parse(self, texts=[], return_visual=False):
        """
        Parse the dependency.

        Args:
            texts(list[str] or list[list[str]]): the input texts to be parsed. Each element is either an untokenized string or a list of tokens.
            return_visual(bool): if set to True, the result will also contain the dependency visualizations.

        Returns:
            results(list[dict]): a list with one element per element of texts. Each element is a dictionary of shape:
                {
                    'word': list[str], the tokenized words.
                    'head': list[int], the head ids.
                    'deprel': list[str], the dependency relations.
                    'prob': list[float], the prediction probability of each dependency relation.
                    'postag': list[str], the POS tags. If the elements of texts are token lists, the key 'postag' is not returned.
                }
            visuals(list[numpy.array]): the dependency visualizations. Use cv2.imshow to show them or cv2.imwrite to save them. If return_visual=False, the list is empty.
        """
        if not texts:
            return [], []
        if all([isinstance(i, str) and i for i in texts]):
            # Untokenized strings: let DDParser segment them first.
            do_parse = self.ddp.parse
        elif all([isinstance(i, list) and i for i in texts]):
            # Pre-tokenized inputs.
            do_parse = self.ddp.parse_seg
        else:
            raise ValueError(
                "All elements of texts must be strings, or all must be token lists.")
        results = do_parse(texts)
        visuals = []
        if return_visual:
            for result in results:
                visuals.append(
                    self.visualize(result['word'], result['head'],
                                   result['deprel']))
        return results, visuals

    @runnable
    def run_cmd(self, argvs):
        """
        Run as a command.
        """
        self.parser = argparse.ArgumentParser(
            description='Run the %s module.' % self.name,
            prog='hub run %s' % self.name,
            usage='%(prog)s',
            add_help=True)
        self.arg_input_group = self.parser.add_argument_group(
            title="Input options", description="Input data. Required")
        self.add_module_input_arg()

        args = self.parser.parse_args(argvs)
        input_data = self.check_input_data(args)
        results, _ = self.parse(texts=input_data)
        return results

    def visualize(self, word, head, deprel):
        """
        Visualize the dependency.

        Args:
            word: list[str], the tokenized words.
            head: list[int], the head ids.
            deprel: list[str], the dependency relations.

        Returns:
            data: a numpy array; use cv2.imshow to show it or cv2.imwrite to save it.
        """
        nodes = ['ROOT'] + word
        x = list(range(len(nodes)))
        y = [0] * (len(nodes))
        fig, ax = plt.subplots()
        # control the picture size
        max_span = max([abs(i + 1 - j) for i, j in enumerate(head)])
        fig.set_size_inches((len(nodes), max_span / 2))
        # set the points
        plt.scatter(x, y, c='w')

        for i in range(len(nodes)):
            txt = nodes[i]
            xytext = (i, 0)
            if i == 0:
                # set 'ROOT'
                ax.annotate(
                    txt,
                    xy=xytext,
                    xycoords='data',
                    xytext=xytext,
                    textcoords='data',
                )
            else:
                xy = (head[i - 1], 0)
                rad = 0.5 if head[i - 1] < i else -0.5
                # set the word
                ax.annotate(
                    txt,
                    xy=xy,
                    xycoords='data',
                    xytext=(xytext[0] - 0.1, xytext[1]),
                    textcoords='data',
                    fontproperties=self.font)
                # draw the curve
                ax.annotate(
                    "",
                    xy=xy,
                    xycoords='data',
                    xytext=xytext,
                    textcoords='data',
                    arrowprops=dict(
                        arrowstyle="<-",
                        shrinkA=12,
                        shrinkB=12,
                        color='blue',
                        connectionstyle="arc3,rad=%s" % rad,
                    ),
                )
                # set the deprel label; its position is derived from the arc radius
                text_x = min(i, head[i - 1]) + abs((i - head[i - 1])) / 2 - 0.2
                text_y = abs((i - head[i - 1])) / 4
                ax.annotate(
                    deprel[i - 1],
                    xy=xy,
                    xycoords='data',
                    xytext=[text_x, text_y],
                    textcoords='data')

        # control the axis
        plt.axis('equal')
        plt.axis('off')

        # save to a numpy array: render the canvas, read back the RGB buffer,
        # reshape it to (height, width, 3) and flip RGB to BGR so that
        # cv2.imwrite saves the colors correctly
        fig.canvas.draw()
        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        data = data.reshape(fig.canvas.get_width_height()[::-1] +
                            (3, ))[:, :, ::-1]
        return data

if __name__ == "__main__":
    module = ddparser()
    # Data to be predicted
    test_text = ["百度是一家高科技公司"]
    results, visuals = module.parse(texts=test_text)
    print(results)

    test_tokens = [['百度', '是', '一家', '高科技', '公司']]
    results, visuals = module.parse(texts=test_tokens)
    print(results)

    # Visualize the first result and save it as an image
    result = results[0]
    data = module.visualize(result['word'], result['head'], result['deprel'])

    # cv2 is only needed here to save the demo visualization
    import cv2
    cv2.imwrite('test.jpg', data)

```diff
@@ -86,7 +86,7 @@ class LocalModuleManager(object):
                 "%s does not exist, the module will be reinstalled" %
                 desc_pb_path)
         except:
-            pass
+            raise
         return False, None

     def all_modules(self, update=False):
```