Commit 4575f3e7 authored by Hai Liang Wang

use jieba as tokenizer

Parent 6d754120
# v1.6
* use ```jieba``` instead of ```thulac``` as the tokenizer.
* refine console logging for Jupyter notebooks.
\ No newline at end of file
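For reference, a minimal sketch of the tokenization this release switches to, assuming only that ```jieba>=0.39``` is installed; the word/flag pairs yielded by ```jieba.posseg``` are what ```_segment_words``` below collects into its ```words``` and ```tags``` lists.

```python
# Minimal sketch of part-of-speech tokenization with jieba.posseg,
# the tokenizer this release adopts in place of thulac.
import jieba.posseg as posseg

words, tags = [], []
for pair in posseg.cut("旗帜引领方向", HMM=True):  # HMM helps segment unseen words
    words.append(pair.word)   # the token itself
    tags.append(pair.flag)    # its part-of-speech tag
print(words, tags)
```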
@@ -11,7 +11,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
Compatible with py2 and py3; current stable version v1.5
Compatible with py2 and py3; current stable version v1.6
![](./assets/3.gif)
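A hedged usage sketch to go with the install command above; ```synonyms.compare``` and ```synonyms.display``` (which wraps ```synonyms.nearby```) are the calls exercised by the test changes in this commit, and the sentences are taken from those tests.

```python
# Quick usage sketch after `pip install -U synonyms`.
import synonyms

# Sentence similarity: returns a float score, expected to be > 0
# for related sentences and 0.0 for unrelated ones (see the tests below).
score = synonyms.compare("发生历史性变革", "取得历史性成就")
print("similarity:", score)

# Print words near "人脸"; display() wraps synonyms.nearby().
synonyms.display("人脸")
```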
synonyms>=1.3
\ No newline at end of file
jieba==0.39
\ No newline at end of file
@@ -53,11 +53,15 @@ class Test(unittest.TestCase):
'''
sen1 = "旗帜引领方向"
sen2 = "道路决定命运"
assert synonyms.compare(sen1, sen2) == 0.0, "the similarity should be zero"
r = synonyms.compare(sen1, sen2)
print("旗帜引领方向 vs 道路决定命运:", r)
assert r == 0.0, "the similarity should be zero"
sen1 = "发生历史性变革"
sen2 = "取得历史性成就"
assert synonyms.compare(sen1, sen2) > 0, "the similarity should be greater than zero"
r = synonyms.compare(sen1, sen2)
print("发生历史性变革 vs 取得历史性成就:", r)
assert r > 0, "the similarity should be greater than zero"
def testNearbyWords(self):
synonyms.display("人脸") # synonyms.display calls synonyms.nearby
@@ -12,7 +12,7 @@ Welcome
"""
setup(name='synonyms',
version='1.5',
version='1.6',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
@@ -39,7 +39,7 @@ setup(name='synonyms',
keywords='corpus,machine-learning,NLU,NLP,Synonyms,Similarity',
packages= find_packages(),
install_requires=[
'thulac==0.1.2',
'jieba>=0.39',
],
package_data={'synonyms':['**/*.pklz', 'LICENSE']}
)
@@ -41,12 +41,11 @@ else:
PLT = 3
import gzip
import thulac # http://thulac.thunlp.org/
import shutil
import jieba.posseg as _tokenizer
_vocab = dict()
_size = 0
_thulac = thulac.thulac() # default mode
_fin_path = os.path.join(curdir, os.path.pardir, 'tmp', 'words.nearby.gz')
_fin_cached_vocab_path = os.path.join(curdir, 'data', 'words.nearby.%d.pklz' % PLT)
@@ -147,13 +146,11 @@ def _segment_words(sen):
'''
segment words
'''
text = _thulac.cut(sen, text=True) # segment the whole sentence in one call
words, tags = [], []
data = [x.rsplit('_', 1) for x in text.split()]
for _ in data:
assert len(_) == 2, "seg len should be 2"
words.append(_[0])
tags.append(_[1])
m = _tokenizer.cut(sen, HMM=True) # HMM improves recognition of new words
for x in m:
words.append(x.word)
tags.append(x.flag)
return words, tags
def _similarity(w1, t1, w2, t2, explain = False):