Commit 0e5794cf, authored by Hai Liang Wang

Leverage distance computing algorithm in compare API

Parent dac98aa8
# 2.5
* Optimize edit-distance computation using words that are close in the embedding space
# 2.3
* Add a smoothing strategy to similarity computation
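Both changes surface through `synonyms.compare`; a quick smoke test (scores depend on the installed model, so no output values are shown here):

```python
import synonyms

# 2.5: each word is also matched against its embedding-space neighbors,
# so near-synonym pairs get credit even when surface forms differ.
print(synonyms.compare("轿车", "汽车", seg=True))
# 2.3: the raw score is passed through a sigmoid-based smoothing step.
print(synonyms.compare("旅游", "游历", seg=True))
```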
......
......@@ -12,7 +12,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
Compatible with py2 and py3; current stable version: v2.3. **Also, Node.js users can now use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
Compatible with py2 and py3; current stable version: [v2.x](https://github.com/huyingxi/Synonyms/releases). **Also, Node.js users can now use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
```
npm install node-synonyms
......
synonyms>=2.3
\ No newline at end of file
synonyms>=2.5
\ No newline at end of file
......@@ -36,7 +36,7 @@ import synonyms # https://github.com/huyingxi/Synonyms
import numpy
import unittest
compare_ = lambda x,y,z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
compare_ = lambda x,y,z: "*"* 30 + "\n%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
class Test(unittest.TestCase):
......@@ -52,35 +52,20 @@ class Test(unittest.TestCase):
def test_pairs(self):
print("test_pairs")
print("*"* 30)
print(compare_("轿车", "汽车", True))
print("*"* 30)
print(compare_("宝石", "宝物", True))
print("*"* 30)
print(compare_("旅游", "游历", True))
print("*"* 30)
print(compare_("男孩子", "小伙子", True))
print("*"* 30)
print(compare_("海岸", "海滨", True))
print("*"* 30)
print(compare_("庇护所", "精神病院", True))
print("*"* 30)
print(compare_("魔术师", "巫师", True))
print("*"* 30)
print(compare_("中午", "正午", True))
print("*"* 30)
print(compare_("火炉", "炉灶", True))
print("*"* 30)
print(compare_("食物", "水果", True))
print("*"* 30)
print(compare_("鸡", "公鸡", True))
print("*"* 30)
print(compare_("鸟", "鹤", True))
print("*"* 30)
print(compare_("工具", "器械", True))
print("*"* 30)
print(compare_("兄弟", "和尚", True))
print("*"* 30)
print(compare_("起重机", "器械", True))
def test_similarity(self):
......@@ -110,7 +95,6 @@ class Test(unittest.TestCase):
sen2 = "巴赫"
r = synonyms.compare(sen1, sen2, seg=True)
print("%s vs %s" % (sen1, sen2), r)
def test_nearby(self):
synonyms.display("人脸") # synonyms.display calls synonyms.nearby
......
......@@ -13,7 +13,7 @@ Welcome
setup(
name='synonyms',
version='2.3',
version='2.5',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
......
......@@ -149,7 +149,7 @@ def _get_wv(sentence):
'''
global _vectors
vectors = []
for y in sentence.split():
for y in sentence:
y_ = any2unicode(y).strip()
if y_ not in _stopwords:
syns = nearby(y_)[0]
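The loop now iterates over a token list rather than a space-joined string, because `compare` passes tokenized input (see below). The visible hunk stops at the `nearby` lookup; here is a minimal standalone sketch of the fallback idea it feeds, with a plain dict standing in for the real word-vector model (`word_vec_or_synonym` and `toy_model` are hypothetical names, not the package's API):

```python
# Hedged sketch: if a word is missing from the vector table, fall back to
# the first nearby word that has a vector. Not the source implementation.
def word_vec_or_synonym(model, word, syns):
    for candidate in [word] + list(syns):
        if candidate in model:
            return model[candidate]
    return None  # no vector found for the word or any of its neighbors

toy_model = {"汽车": [0.1, 0.9]}
print(word_vec_or_synonym(toy_model, "轿车", ["汽车"]))  # falls back to 汽车
```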
......@@ -214,13 +214,35 @@ def _levenshtein_distance(sentence1, sentence2):
new_distances[-1])))
distances = new_distances
levenshtein = distances[-1]
dis = float((maxlen - levenshtein)/maxlen)
d = float(maxlen - levenshtein) / maxlen
# smoothing
s = (sigmoid(dis * 6) - 0.5) * 2
# print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, dis, s))
s = (sigmoid(d * 6) - 0.5) * 2
# print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, d, s))
return s
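A worked example of the smoothing step, with assumed lengths: for two sequences of maximum length 4 at raw Levenshtein distance 1, the normalized score 0.75 is pushed toward 1:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

maxlen, levenshtein = 4, 1                 # assumed values for illustration
d = float(maxlen - levenshtein) / maxlen   # 0.75
s = (sigmoid(d * 6) - 0.5) * 2             # ~0.978
print(d, s)
```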
_smooth = lambda x, y, z: (x * y) + z
def _nearby_levenshtein_distance(s1, s2):
'''
Levenshtein similarity boosted by nearby words: expand the shorter sentence's words with their embedding-space neighbors before matching.
'''
s1_len = len(s1)
s2_len = len(s2)
maxlen = max(s1_len, s2_len)
first, second = (s2, s1) if s1_len == maxlen else (s1, s2)
ft = set() # all words of the first sentence plus their nearby words
for x in first:
ft.add(x)
n, _ = nearby(x)
for o in n:
ft.add(o)
scores = []
if len(ft) == 0: return 0.0 # no usable words in the first sentence
for x in second:
scores.append(max([_levenshtein_distance(x, y) for y in ft]))
s = np.sum(scores) / maxlen
return s
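In short: every word of the shorter sentence is expanded with its `nearby` neighbors, and each word of the longer sentence is scored against the best match in that expanded set. A standalone sketch of the expansion step, with a stubbed neighbor lookup (`fake_nearby` is a stand-in for `synonyms.nearby`, which queries the word-vector model):

```python
def fake_nearby(word):
    # stub: the real nearby() returns (words, scores) from the model
    table = {"轿车": ["汽车", "卡车"], "旅游": ["游历", "观光"]}
    return table.get(word, []), []

def expand(words):
    ft = set(words)
    for x in words:
        n, _ = fake_nearby(x)
        ft.update(n)
    return ft

print(expand(["轿车", "旅游"]))
# {'轿车', '汽车', '卡车', '旅游', '游历', '观光'} (set order may vary)
```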
# combine similarity scores
_similarity_smooth = lambda x, y, z: (x * y) + z
def _similarity_distance(s1, s2):
'''
......@@ -230,25 +252,21 @@ def _similarity_distance(s1, s2):
b = _sim_molecule(_get_wv(s2))
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
g = 1 / (np.linalg.norm(a - b) + 1)
u = _levenshtein_distance(s1, s2)
u = _nearby_levenshtein_distance(s1, s2)
# print("g: %s, u: %s" % (g, u))
if u > 0.8:
r = _smooth(g, 0.05, u)
r = _similarity_smooth(g, 1, u)
elif u > 0.7:
r = _smooth(g, 0.1, u)
r = _similarity_smooth(g, 1.5, u)
elif u > 0.6:
r = _smooth(g, 0.2, u)
elif u > 0.5:
r = _smooth(g, 1, u)
elif u > 0.4:
r = _smooth(g, 4, u)
r = _similarity_smooth(g, 2, u)
else:
r = _smooth(g, 10, u)
r = _similarity_smooth(g, 4, u)
r = min(r, 1.0)
return float("%.3f" % r)
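The final score adds the vector-space term `g` to the edit-distance term `u`, weighting `g` more heavily the weaker `u` is, then clips at 1.0. With assumed values `g = 0.12`, `u = 0.75`:

```python
g, u = 0.12, 0.75         # assumed values for illustration
if u > 0.8:
    r = g * 1 + u
elif u > 0.7:
    r = g * 1.5 + u       # this branch: 0.12 * 1.5 + 0.75 = 0.93
elif u > 0.6:
    r = g * 2 + u
else:
    r = g * 4 + u
print(min(r, 1.0))        # 0.93
```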
def compare(s1, s2, seg=True):
'''
compare similarity
......@@ -257,12 +275,15 @@ def compare(s1, s2, seg=True):
seg : True : The original sentences need jieba.cut
False : The original sentences have been cut.
'''
assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should be > 0."
if seg:
s1 = ' '.join(jieba.cut(s1))
s2 = ' '.join(jieba.cut(s2))
return _similarity_distance(s1, s2)
s1 = [x for x in jieba.cut(s1)]
s2 = [x for x in jieba.cut(s2)]
else:
s1 = s1.split()
s2 = s2.split()
assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should be > 0."
return _similarity_distance(s1, s2)
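Callers can hand `compare` either raw strings or pre-tokenized text:

```python
import synonyms

# seg=True: compare() tokenizes the raw strings with jieba
print(synonyms.compare("男孩子", "小伙子", seg=True))
# seg=False: the caller passes already-segmented, space-separated text
print(synonyms.compare("男孩子", "小伙子", seg=False))
```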
def display(word):
print("'%s'近义词:" % word)
......@@ -273,7 +294,6 @@ def display(word):
for k, v in enumerate(o[0]):
print(" %d. %s:%s" % (k + 1, v, o[1][k]))
def main():
display("人脸")
display("NOT_EXIST")
......