提交 4a44eff8 编写于 作者: H Hai Liang Wang

#60 compare 支持交换句子

上级 c580b3d8
# 3.6
* Fix Bug: compare 保证交换两个句子后分数一致 [#60](https://github.com/huyingxi/Synonyms/issues/60)
# 3.5
* 根据实际情况,降低向量距离对近似度分数的影响
......
synonyms>=3.5
\ No newline at end of file
synonyms>=3.6
\ No newline at end of file
......@@ -114,6 +114,15 @@ class Test(unittest.TestCase):
r = synonyms.compare(sen1, sen2, seg=False)
print("%s vs %s" % (sen1, sen2), r)
def test_swap_sent(self):
    """Check that synonyms.compare is symmetric in its two arguments.

    Regression test for issue #60: comparing the same pair of sentences
    in either order must produce the same score.
    """
    print("test_swap_sent")
    s1 = synonyms.compare("教学", "老师")
    s2 = synonyms.compare("老师", "教学")
    print('"教学", "老师": %s ' % s1)
    print('"老师", "教学": %s ' % s2)
    # Use the TestCase assertion rather than a bare `assert`: it is not
    # stripped when Python runs with -O, and on failure it reports both
    # scores alongside the message.
    self.assertEqual(s1, s2, "Scores should be the same after swap sents")
def test_nearby(self):
synonyms.display("奥运") # synonyms.display calls synonyms.nearby
synonyms.display("北新桥") # synonyms.display calls synonyms.nearby
......
......@@ -13,7 +13,7 @@ Welcome
setup(
name='synonyms',
version='3.5.0',
version='3.6.0',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
......
......@@ -211,28 +211,28 @@ def _nearby_levenshtein_distance(s1, s2):
使用空间距离近的词汇优化编辑距离计算
'''
s1_len, s2_len = len(s1), len(s2)
maxlen = max(s1_len, s2_len)
first, second = (s2, s1) if s1_len == maxlen else (s1, s2)
ft_1 = set() # all related words with first sentence
maxlen = s1_len
if s1_len == s2_len:
first, second = sorted([s1, s2])
elif s1_len < s2_len:
first = s1
second = s2
maxlen = s2_len
else:
first = s2
second = s1
ft = set() # all related words with first sentence
for x in first:
ft_1.add(x)
ft.add(x)
n, _ = nearby(x)
for o in n[:5]:
ft_1.add(o)
ft_2 = set() # all related words with second sentence
for x in second:
ft_2.add(x)
n, _ = nearby(x)
for o in n[:5]:
ft_2.add(0)
for o in n[:10]:
ft.add(o)
scores = []
if len(ft_1) == 0 or len(ft_2) == 0: return 0.0 # invalid length
for x in ft_1:
for y in ft_2:
scores.append([_levenshtein_distance(x, y)])
s = np.sum(scores) / (s1_len * s2_len)
for x in second:
scores.append(max([_levenshtein_distance(x, y) for y in ft]))
s = np.sum(scores) / maxlen
return s
def _similarity_distance(s1, s2, ignore):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册