diff --git a/CHANGELOG.md b/CHANGELOG.md index 0338f7aa4dbcc6625d05fd337a3e28461d4e2bc2..c8d24dfaa88e2e47232da0cf8b9640ec81cf18a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,5 @@ +# 3.6 +* Fix Bug: compare 保证交换两个句子后分数一致 [#60](https://github.com/huyingxi/Synonyms/issues/60) # 3.5 * 根据实际情况,降低向量距离对近似度分数的影响 diff --git a/Requirements.txt b/Requirements.txt index d7b7214f690259850c3ed08b85c914e537630d64..578c8c3eda8450e7e1aab2632111e4895600e719 100644 --- a/Requirements.txt +++ b/Requirements.txt @@ -1 +1 @@ -synonyms>=3.5 \ No newline at end of file +synonyms>=3.6 \ No newline at end of file diff --git a/demo.py b/demo.py index 70666356fa4f73a9e5cc902a755a135efea2355b..161fd644d0b8652fef74097b50249133094ea8b5 100755 --- a/demo.py +++ b/demo.py @@ -114,6 +114,15 @@ class Test(unittest.TestCase): r = synonyms.compare(sen1, sen2, seg=False) print("%s vs %s" % (sen1, sen2), r) + + def test_swap_sent(self): + print("test_swap_sent") + s1 = synonyms.compare("教学", "老师") + s2 = synonyms.compare("老师", "教学") + print('"教学", "老师": %s ' % s1) + print('"老师", "教学": %s ' % s2) + assert s1 == s2, "Scores should be the same after swap sents" + def test_nearby(self): synonyms.display("奥运") # synonyms.display calls synonyms.nearby synonyms.display("北新桥") # synonyms.display calls synonyms.nearby diff --git a/setup.py b/setup.py index 40c978d19c8bf9b0d969dd9b2689c05c6f49317a..703d69132a6cde93dd6e274934142a24ddb26874 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ Welcome setup( name='synonyms', - version='3.5.0', + version='3.6.0', description='Chinese Synonyms for Natural Language Processing and Understanding', long_description=LONGDOC, author='Hai Liang Wang, Hu Ying Xi', diff --git a/synonyms/synonyms.py b/synonyms/synonyms.py index ebc6874c14af2d072941fe293fa125e02395ef6b..908d5ecf9966d0a14cc02243530e3136a56e34e1 100755 --- a/synonyms/synonyms.py +++ b/synonyms/synonyms.py @@ -211,28 +211,28 @@ def _nearby_levenshtein_distance(s1, s2): 使用空间距离近的词汇优化编辑距离计算 ''' s1_len, s2_len = len(s1), len(s2) - maxlen = max(s1_len, s2_len) - first, second = (s2, s1) if s1_len == maxlen else (s1, s2) - ft_1 = set() # all related words with first sentence + maxlen = s1_len + if s1_len == s2_len: + first, second = sorted([s1, s2]) + elif s1_len < s2_len: + first = s1 + second = s2 + maxlen = s2_len + else: + first = s2 + second = s1 + + ft = set() # all related words with first sentence for x in first: - ft_1.add(x) + ft.add(x) n, _ = nearby(x) - for o in n[:5]: - ft_1.add(o) - - ft_2 = set() # all related words with second sentence - for x in second: - ft_2.add(x) - n, _ = nearby(x) - for o in n[:5]: - ft_2.add(0) - + for o in n[:10]: + ft.add(o) + scores = [] - if len(ft_1) == 0 or len(ft_2) == 0: return 0.0 # invalid length - for x in ft_1: - for y in ft_2: - scores.append([_levenshtein_distance(x, y)]) - s = np.sum(scores) / (s1_len * s2_len) + for x in second: + scores.append(max([_levenshtein_distance(x, y) for y in ft])) + s = np.sum(scores) / maxlen return s def _similarity_distance(s1, s2, ignore):