Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
sfewfsaf
Synonyms
提交
0e5794cf
S
Synonyms
项目概览
sfewfsaf
/
Synonyms
与 Fork 源项目一致
从无法访问的项目Fork
通知
6
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
S
Synonyms
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
0e5794cf
编写于
3月 02, 2018
作者:
H
Hai Liang Wang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Leverage distance computing algorithm in compare API
上级
dac98aa8
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
47 additions
and
40 deletions
+47
-40
CHANGELOG.md
CHANGELOG.md
+3
-0
README.md
README.md
+1
-1
Requirements.txt
Requirements.txt
+1
-1
demo.py
demo.py
+1
-17
setup.py
setup.py
+1
-1
synonyms/__init__.py
synonyms/__init__.py
+40
-20
未找到文件。
CHANGELOG.md
浏览文件 @
0e5794cf
# 2.5
*
使用空间距离近的词汇优化编辑距离计算
# 2.3
*
计算相似度时增加平滑策略
...
...
README.md
浏览文件 @
0e5794cf
...
...
@@ -12,7 +12,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
兼容py2和py3,当前稳定版本
v2.3
。**同时,Node.js 用户可以使用 [node-synonyms](https://www.npmjs.com/package/node-synonyms)了。**
兼容py2和py3,当前稳定版本
[v2.x](https://github.com/huyingxi/Synonyms/releases)
。**同时,Node.js 用户可以使用 [node-synonyms](https://www.npmjs.com/package/node-synonyms)了。**
```
npm install node-synonyms
...
...
Requirements.txt
浏览文件 @
0e5794cf
synonyms>=2.3
\ No newline at end of file
synonyms>=2.5
\ No newline at end of file
demo.py
浏览文件 @
0e5794cf
...
...
@@ -36,7 +36,7 @@ import synonyms # https://github.com/huyingxi/Synonyms
import
numpy
import
unittest
compare_
=
lambda
x
,
y
,
z
:
"%s vs %s: %f"
%
(
x
,
y
,
synonyms
.
compare
(
x
,
y
,
seg
=
z
))
compare_
=
lambda
x
,
y
,
z
:
"
*"
*
30
+
"
\n
%s vs %s: %f"
%
(
x
,
y
,
synonyms
.
compare
(
x
,
y
,
seg
=
z
))
# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
class
Test
(
unittest
.
TestCase
):
...
...
@@ -52,35 +52,20 @@ class Test(unittest.TestCase):
def
test_pairs
(
self
):
print
(
"test_pairs"
)
print
(
"*"
*
30
)
print
(
compare_
(
"轿车"
,
"汽车"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"宝石"
,
"宝物"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"旅游"
,
"游历"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"男孩子"
,
"小伙子"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"海岸"
,
"海滨"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"庇护所"
,
"精神病院"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"魔术师"
,
"巫师"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"中午"
,
"正午"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"火炉"
,
"炉灶"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"食物"
,
"水果"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"鸡"
,
"公鸡"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"鸟"
,
"鹤"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"工具"
,
"器械"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"兄弟"
,
"和尚"
,
True
))
print
(
"*"
*
30
)
print
(
compare_
(
"起重机"
,
"器械"
,
True
))
def
test_similarity
(
self
):
...
...
@@ -110,7 +95,6 @@ class Test(unittest.TestCase):
sen2
=
"巴赫"
r
=
synonyms
.
compare
(
sen1
,
sen2
,
seg
=
True
)
print
(
"%s vs %s"
%
(
sen1
,
sen2
),
r
)
def
test_nearby
(
self
):
synonyms
.
display
(
"人脸"
)
# synonyms.display calls synonyms.nearby
...
...
setup.py
浏览文件 @
0e5794cf
...
...
@@ -13,7 +13,7 @@ Welcome
setup
(
name
=
'synonyms'
,
version
=
'2.
3
'
,
version
=
'2.
5
'
,
description
=
'Chinese Synonyms for Natural Language Processing and Understanding'
,
long_description
=
LONGDOC
,
author
=
'Hai Liang Wang, Hu Ying Xi'
,
...
...
synonyms/__init__.py
浏览文件 @
0e5794cf
...
...
@@ -149,7 +149,7 @@ def _get_wv(sentence):
'''
global
_vectors
vectors
=
[]
for
y
in
sentence
.
split
()
:
for
y
in
sentence
:
y_
=
any2unicode
(
y
).
strip
()
if
y_
not
in
_stopwords
:
syns
=
nearby
(
y_
)[
0
]
...
...
@@ -214,13 +214,35 @@ def _levenshtein_distance(sentence1, sentence2):
new_distances
[
-
1
])))
distances
=
new_distances
levenshtein
=
distances
[
-
1
]
d
is
=
float
((
maxlen
-
levenshtein
)
/
maxlen
)
d
=
float
((
maxlen
-
levenshtein
)
/
maxlen
)
# smoothing
s
=
(
sigmoid
(
d
is
*
6
)
-
0.5
)
*
2
# print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, d
is
, s))
s
=
(
sigmoid
(
d
*
6
)
-
0.5
)
*
2
# print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, d, s))
return
s
_smooth
=
lambda
x
,
y
,
z
:
(
x
*
y
)
+
z
def
_nearby_levenshtein_distance
(
s1
,
s2
):
'''
使用
'''
s1_len
=
len
(
s1
)
s2_len
=
len
(
s2
)
maxlen
=
max
(
s1_len
,
s2_len
)
first
,
second
=
(
s2
,
s1
)
if
s1_len
==
maxlen
else
(
s1
,
s2
)
ft
=
set
()
# all related words with first sentence
for
x
in
first
:
ft
.
add
(
x
)
n
,
_
=
nearby
(
x
)
for
o
in
n
:
ft
.
add
(
o
)
scores
=
[]
if
len
(
ft
)
==
0
:
return
0.0
# invalid length for first string
for
x
in
second
:
scores
.
append
(
max
([
_levenshtein_distance
(
x
,
y
)
for
y
in
ft
]))
s
=
np
.
sum
(
scores
)
/
maxlen
return
s
# combine similarity scores
_similarity_smooth
=
lambda
x
,
y
,
z
:
(
x
*
y
)
+
z
def
_similarity_distance
(
s1
,
s2
):
'''
...
...
@@ -230,25 +252,21 @@ def _similarity_distance(s1, s2):
b
=
_sim_molecule
(
_get_wv
(
s2
))
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
g
=
1
/
(
np
.
linalg
.
norm
(
a
-
b
)
+
1
)
u
=
_levenshtein_distance
(
s1
,
s2
)
u
=
_nearby_levenshtein_distance
(
s1
,
s2
)
# print("g: %s, u: %s" % (g, u))
if
u
>
0.8
:
r
=
_s
mooth
(
g
,
0.05
,
u
)
r
=
_s
imilarity_smooth
(
g
,
1
,
u
)
elif
u
>
0.7
:
r
=
_s
mooth
(
g
,
0.1
,
u
)
r
=
_s
imilarity_smooth
(
g
,
1.5
,
u
)
elif
u
>
0.6
:
r
=
_smooth
(
g
,
0.2
,
u
)
elif
u
>
0.5
:
r
=
_smooth
(
g
,
1
,
u
)
elif
u
>
0.4
:
r
=
_smooth
(
g
,
4
,
u
)
r
=
_similarity_smooth
(
g
,
2
,
u
)
else
:
r
=
_s
mooth
(
g
,
10
,
u
)
r
=
_s
imilarity_smooth
(
g
,
4
,
u
)
r
=
min
(
r
,
1.0
)
return
float
(
"%.3f"
%
r
)
def
compare
(
s1
,
s2
,
seg
=
True
):
'''
compare similarity
...
...
@@ -257,12 +275,15 @@ def compare(s1, s2, seg=True):
seg : True : The original sentences need jieba.cut
False : The original sentences have been cut.
'''
assert
len
(
s1
)
>
0
and
len
(
s2
)
>
0
,
"The length of s1 and s2 should > 0."
if
seg
:
s1
=
' '
.
join
(
jieba
.
cut
(
s1
))
s2
=
' '
.
join
(
jieba
.
cut
(
s2
))
return
_similarity_distance
(
s1
,
s2
)
s1
=
[
x
for
x
in
jieba
.
cut
(
s1
)]
s2
=
[
x
for
x
in
jieba
.
cut
(
s2
)]
else
:
s1
=
s1
.
split
()
s2
=
s2
.
split
()
assert
len
(
s1
)
>
0
and
len
(
s2
)
>
0
,
"The length of s1 and s2 should > 0."
return
_similarity_distance
(
s1
,
s2
)
def
display
(
word
):
print
(
"'%s'近义词:"
%
word
)
...
...
@@ -273,7 +294,6 @@ def display(word):
for
k
,
v
in
enumerate
(
o
[
0
]):
print
(
" %d. %s:%s"
%
(
k
+
1
,
v
,
o
[
1
][
k
]))
def
main
():
display
(
"人脸"
)
display
(
"NOT_EXIST"
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录