Commit 4575f3e7 authored by Hai Liang Wang

use jieba as tokenizer

Parent 6d754120
# v1.6
* use ```jieba``` instead of ```thulac``` as the tokenizer.
* refine console logging for Jupyter notebooks.
\ No newline at end of file
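For reference, a minimal sketch of the tokenization this release switches to, assuming only that ```jieba>=0.39``` is installed; the word/flag pairs yielded by ```jieba.posseg``` are what ```_segment_words``` below collects into its ```words``` and ```tags``` lists.

```python
# Minimal sketch of part-of-speech tokenization with jieba.posseg,
# the tokenizer this release adopts in place of thulac.
import jieba.posseg as posseg

words, tags = [], []
for pair in posseg.cut("旗帜引领方向", HMM=True):  # HMM helps segment unseen words
    words.append(pair.word)   # the token itself
    tags.append(pair.flag)    # its part-of-speech tag
print(words, tags)
```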
@@ -11,7 +11,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
Compatible with py2 and py3; current stable version v1.5
Compatible with py2 and py3; current stable version v1.6
![](./assets/3.gif)
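A hedged usage sketch to go with the install command above; ```synonyms.compare``` and ```synonyms.display``` (which wraps ```synonyms.nearby```) are the calls exercised by the test changes in this commit, and the sentences are taken from those tests.

```python
# Quick usage sketch after `pip install -U synonyms`.
import synonyms

# Sentence similarity: returns a float score, expected to be > 0
# for related sentences and 0.0 for unrelated ones (see the tests below).
score = synonyms.compare("发生历史性变革", "取得历史性成就")
print("similarity:", score)

# Print words near "人脸"; display() wraps synonyms.nearby().
synonyms.display("人脸")
```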
synonyms>=1.3
\ No newline at end of file
jieba==0.39
\ No newline at end of file
@@ -53,11 +53,15 @@ class Test(unittest.TestCase):
'''
sen1 = "旗帜引领方向"
sen2 = "道路决定命运"
assert synonyms.compare(sen1, sen2) == 0.0, "the similarity should be zero"
r = synonyms.compare(sen1, sen2)
print("旗帜引领方向 vs 道路决定命运:", r)
assert r == 0.0, "the similarity should be zero"
sen1 = "发生历史性变革"
sen2 = "取得历史性成就"
assert synonyms.compare(sen1, sen2) > 0, "the similarity should be greater than zero"
r = synonyms.compare(sen1, sen2)
print("发生历史性变革 vs 取得历史性成就:", r)
assert r > 0, "the similarity should be greater than zero"
def testNearbyWords(self):
synonyms.display("人脸") # synonyms.display calls synonyms.nearby
@@ -12,7 +12,7 @@ Welcome
"""
setup(name='synonyms',
version='1.5',
version='1.6',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
@@ -39,7 +39,7 @@ setup(name='synonyms',
keywords='corpus,machine-learning,NLU,NLP,Synonyms,Similarity',
packages= find_packages(),
install_requires=[
'thulac==0.1.2',
'jieba>=0.39',
],
package_data={'synonyms':['**/*.pklz', 'LICENSE']}
)
@@ -41,12 +41,11 @@ else:
PLT = 3
import gzip
import thulac # http://thulac.thunlp.org/
import shutil
import jieba.posseg as _tokenizer
_vocab = dict()
_size = 0
_thulac = thulac.thulac() # default mode
_fin_path = os.path.join(curdir, os.path.pardir, 'tmp', 'words.nearby.gz')
_fin_cached_vocab_path = os.path.join(curdir, 'data', 'words.nearby.%d.pklz' % PLT)
@@ -147,13 +146,11 @@ def _segment_words(sen):
'''
segment words
'''
text = _thulac.cut(sen, text=True) # segment the whole sentence in one call
words, tags = [], []
data = [x.rsplit('_', 1) for x in text.split()]
for _ in data:
assert len(_) == 2, "seg len should be 2"
words.append(_[0])
tags.append(_[1])
m = _tokenizer.cut(sen, HMM=True) # HMM improves recognition of new words
for x in m:
words.append(x.word)
tags.append(x.flag)
return words, tags
def _similarity(w1, t1, w2, t2, explain = False):