Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
sfewfsaf
Synonyms
提交
4575f3e7
S
Synonyms
项目概览
sfewfsaf
/
Synonyms
与 Fork 源项目一致
从无法访问的项目Fork
通知
6
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
S
Synonyms
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
4575f3e7
编写于
10月 28, 2017
作者:
H
Hai Liang Wang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
use jieba as tokenizer
上级
6d754120
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
18 additions
and
14 deletions
+18
-14
CHANGELOG.md
CHANGELOG.md
+3
-0
README.md
README.md
+1
-1
Requirements.txt
Requirements.txt
+1
-1
demo.py
demo.py
+6
-2
setup.py
setup.py
+2
-2
synonyms/__init__.py
synonyms/__init__.py
+5
-8
未找到文件。
CHANGELOG.md
0 → 100644
浏览文件 @
4575f3e7
# v1.6
*
use
```jieba```
instead of
```thulac```
as tokenizer.
*
refine console log for Jupyter notebook.
\ No newline at end of file
README.md
浏览文件 @
4575f3e7
...
...
@@ -11,7 +11,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
兼容py2和py3,当前稳定版本 v1.
5
。
兼容py2和py3,当前稳定版本 v1.
6
。
![](./assets/3.gif)
...
...
Requirements.txt
浏览文件 @
4575f3e7
synonyms>=1.3
\ No newline at end of file
jieba==0.39
\ No newline at end of file
demo.py
浏览文件 @
4575f3e7
...
...
@@ -53,11 +53,15 @@ class Test(unittest.TestCase):
'''
sen1
=
"旗帜引领方向"
sen2
=
"道路决定命运"
assert
synonyms
.
compare
(
sen1
,
sen2
)
==
0.0
,
"the similarity should be zero"
r
=
synonyms
.
compare
(
sen1
,
sen2
)
print
(
"旗帜引领方向 vs 道路决定命运:"
,
r
)
assert
r
==
0.0
,
"the similarity should be zero"
sen1
=
"发生历史性变革"
sen2
=
"取得历史性成就"
assert
synonyms
.
compare
(
sen1
,
sen2
)
>
0
,
"the similarity should be bigger then zero"
r
=
synonyms
.
compare
(
sen1
,
sen2
)
print
(
"发生历史性变革 vs 取得历史性成就:"
,
r
)
assert
r
>
0
,
"the similarity should be bigger then zero"
def
testNearbyWords
(
self
):
synonyms
.
display
(
"人脸"
)
# synonyms.display calls synonyms.nearby
...
...
setup.py
浏览文件 @
4575f3e7
...
...
@@ -12,7 +12,7 @@ Welcome
"""
setup
(
name
=
'synonyms'
,
version
=
'1.
5
'
,
version
=
'1.
6
'
,
description
=
'Chinese Synonyms for Natural Language Processing and Understanding'
,
long_description
=
LONGDOC
,
author
=
'Hai Liang Wang, Hu Ying Xi'
,
...
...
@@ -39,7 +39,7 @@ setup(name='synonyms',
keywords
=
'corpus,machine-learning,NLU,NLP,Synonyms,Similarity'
,
packages
=
find_packages
(),
install_requires
=
[
'
thulac==0.1.2
'
,
'
jieba>=0.39
'
,
],
package_data
=
{
'synonyms'
:[
'**/*.pklz'
,
'LICENSE'
]}
)
synonyms/__init__.py
浏览文件 @
4575f3e7
...
...
@@ -41,12 +41,11 @@ else:
PLT
=
3
import
gzip
import
thulac
# http://thulac.thunlp.org/
import
shutil
import
jieba.posseg
as
_tokenizer
_vocab
=
dict
()
_size
=
0
_thulac
=
thulac
.
thulac
()
#默认模式
_fin_path
=
os
.
path
.
join
(
curdir
,
os
.
path
.
pardir
,
'tmp'
,
'words.nearby.gz'
)
_fin_cached_vocab_path
=
os
.
path
.
join
(
curdir
,
'data'
,
'words.nearby.%d.pklz'
%
PLT
)
...
...
@@ -147,13 +146,11 @@ def _segment_words(sen):
'''
segment words
'''
text
=
_thulac
.
cut
(
sen
,
text
=
True
)
#进行一句话分词
words
,
tags
=
[],
[]
data
=
[
x
.
rsplit
(
'_'
,
1
)
for
x
in
text
.
split
()]
for
_
in
data
:
assert
len
(
_
)
==
2
,
"seg len should be 2"
words
.
append
(
_
[
0
])
tags
.
append
(
_
[
1
])
m
=
_tokenizer
.
cut
(
sen
,
HMM
=
True
)
# HMM更好的识别新词
for
x
in
m
:
words
.
append
(
x
.
word
)
tags
.
append
(
x
.
flag
)
return
words
,
tags
def
_similarity
(
w1
,
t1
,
w2
,
t2
,
explain
=
False
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录