Commit 0e5794cf, authored by Hai Liang Wang

Leverage distance computing algorithm in compare API

Parent dac98aa8
# 2.5
* Optimize edit-distance computation using words that are close in the embedding space
# 2.3
* Add a smoothing strategy to similarity computation
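Both changes surface through `synonyms.compare`; a quick smoke test (scores depend on the installed model, so no output values are shown here):

```python
import synonyms

# 2.5: each word is also matched against its embedding-space neighbors,
# so near-synonym pairs get credit even when surface forms differ.
print(synonyms.compare("轿车", "汽车", seg=True))
# 2.3: the raw score is passed through a sigmoid-based smoothing step.
print(synonyms.compare("旅游", "游历", seg=True))
```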
......
......@@ -12,7 +12,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
Compatible with py2 and py3; current stable version: v2.3. **Also, Node.js users can now use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
Compatible with py2 and py3; current stable version: [v2.x](https://github.com/huyingxi/Synonyms/releases). **Also, Node.js users can now use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
```
npm install node-synonyms
......
synonyms>=2.3
\ No newline at end of file
synonyms>=2.5
\ No newline at end of file
......@@ -36,7 +36,7 @@ import synonyms # https://github.com/huyingxi/Synonyms
import numpy
import unittest
compare_ = lambda x,y,z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
compare_ = lambda x,y,z: "*"* 30 + "\n%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
class Test(unittest.TestCase):
......@@ -52,35 +52,20 @@ class Test(unittest.TestCase):
def test_pairs(self):
print("test_pairs")
print("*"* 30)
print(compare_("轿车", "汽车", True))
print("*"* 30)
print(compare_("宝石", "宝物", True))
print("*"* 30)
print(compare_("旅游", "游历", True))
print("*"* 30)
print(compare_("男孩子", "小伙子", True))
print("*"* 30)
print(compare_("海岸", "海滨", True))
print("*"* 30)
print(compare_("庇护所", "精神病院", True))
print("*"* 30)
print(compare_("魔术师", "巫师", True))
print("*"* 30)
print(compare_("中午", "正午", True))
print("*"* 30)
print(compare_("火炉", "炉灶", True))
print("*"* 30)
print(compare_("食物", "水果", True))
print("*"* 30)
print(compare_("鸡", "公鸡", True))
print("*"* 30)
print(compare_("鸟", "鹤", True))
print("*"* 30)
print(compare_("工具", "器械", True))
print("*"* 30)
print(compare_("兄弟", "和尚", True))
print("*"* 30)
print(compare_("起重机", "器械", True))
def test_similarity(self):
......@@ -110,7 +95,6 @@ class Test(unittest.TestCase):
sen2 = "巴赫"
r = synonyms.compare(sen1, sen2, seg=True)
print("%s vs %s" % (sen1, sen2), r)
def test_nearby(self):
synonyms.display("人脸") # synonyms.display calls synonyms.nearby
......
......@@ -13,7 +13,7 @@ Welcome
setup(
name='synonyms',
version='2.3',
version='2.5',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
......
......@@ -149,7 +149,7 @@ def _get_wv(sentence):
'''
global _vectors
vectors = []
for y in sentence.split():
for y in sentence:
y_ = any2unicode(y).strip()
if y_ not in _stopwords:
syns = nearby(y_)[0]
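The loop now iterates over a token list rather than a space-joined string, because `compare` passes tokenized input (see below). The visible hunk stops at the `nearby` lookup; here is a minimal standalone sketch of the fallback idea it feeds, with a plain dict standing in for the real word-vector model (`word_vec_or_synonym` and `toy_model` are hypothetical names, not the package's API):

```python
# Hedged sketch: if a word is missing from the vector table, fall back to
# the first nearby word that has a vector. Not the source implementation.
def word_vec_or_synonym(model, word, syns):
    for candidate in [word] + list(syns):
        if candidate in model:
            return model[candidate]
    return None  # no vector found for the word or any of its neighbors

toy_model = {"汽车": [0.1, 0.9]}
print(word_vec_or_synonym(toy_model, "轿车", ["汽车"]))  # falls back to 汽车
```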
......@@ -214,13 +214,35 @@ def _levenshtein_distance(sentence1, sentence2):
new_distances[-1])))
distances = new_distances
levenshtein = distances[-1]
dis = float((maxlen - levenshtein)/maxlen)
d = float(maxlen - levenshtein) / maxlen
# smoothing
s = (sigmoid(dis * 6) - 0.5) * 2
# print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, dis, s))
s = (sigmoid(d * 6) - 0.5) * 2
# print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, d, s))
return s
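A worked example of the smoothing step, with assumed lengths: for two sequences of maximum length 4 at raw Levenshtein distance 1, the normalized score 0.75 is pushed toward 1:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

maxlen, levenshtein = 4, 1                 # assumed values for illustration
d = float(maxlen - levenshtein) / maxlen   # 0.75
s = (sigmoid(d * 6) - 0.5) * 2             # ~0.978
print(d, s)
```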
_smooth = lambda x, y, z: (x * y) + z
def _nearby_levenshtein_distance(s1, s2):
'''
Levenshtein similarity boosted by nearby words: expand the shorter sentence's words with their embedding-space neighbors before matching.
'''
s1_len = len(s1)
s2_len = len(s2)
maxlen = max(s1_len, s2_len)
first, second = (s2, s1) if s1_len == maxlen else (s1, s2)
ft = set() # all words of the first sentence plus their nearby words
for x in first:
ft.add(x)
n, _ = nearby(x)
for o in n:
ft.add(o)
scores = []
if len(ft) == 0: return 0.0 # no usable words in the first sentence
for x in second:
scores.append(max([_levenshtein_distance(x, y) for y in ft]))
s = np.sum(scores) / maxlen
return s
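In short: every word of the shorter sentence is expanded with its `nearby` neighbors, and each word of the longer sentence is scored against the best match in that expanded set. A standalone sketch of the expansion step, with a stubbed neighbor lookup (`fake_nearby` is a stand-in for `synonyms.nearby`, which queries the word-vector model):

```python
def fake_nearby(word):
    # stub: the real nearby() returns (words, scores) from the model
    table = {"轿车": ["汽车", "卡车"], "旅游": ["游历", "观光"]}
    return table.get(word, []), []

def expand(words):
    ft = set(words)
    for x in words:
        n, _ = fake_nearby(x)
        ft.update(n)
    return ft

print(expand(["轿车", "旅游"]))
# {'轿车', '汽车', '卡车', '旅游', '游历', '观光'} (set order may vary)
```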
# combine similarity scores
_similarity_smooth = lambda x, y, z: (x * y) + z
def _similarity_distance(s1, s2):
'''
......@@ -230,25 +252,21 @@ def _similarity_distance(s1, s2):
b = _sim_molecule(_get_wv(s2))
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
g = 1 / (np.linalg.norm(a - b) + 1)
u = _levenshtein_distance(s1, s2)
u = _nearby_levenshtein_distance(s1, s2)
# print("g: %s, u: %s" % (g, u))
if u > 0.8:
r = _smooth(g, 0.05, u)
r = _similarity_smooth(g, 1, u)
elif u > 0.7:
r = _smooth(g, 0.1, u)
r = _similarity_smooth(g, 1.5, u)
elif u > 0.6:
r = _smooth(g, 0.2, u)
elif u > 0.5:
r = _smooth(g, 1, u)
elif u > 0.4:
r = _smooth(g, 4, u)
r = _similarity_smooth(g, 2, u)
else:
r = _smooth(g, 10, u)
r = _similarity_smooth(g, 4, u)
r = min(r, 1.0)
return float("%.3f" % r)
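The final score adds the vector-space term `g` to the edit-distance term `u`, weighting `g` more heavily the weaker `u` is, then clips at 1.0. With assumed values `g = 0.12`, `u = 0.75`:

```python
g, u = 0.12, 0.75         # assumed values for illustration
if u > 0.8:
    r = g * 1 + u
elif u > 0.7:
    r = g * 1.5 + u       # this branch: 0.12 * 1.5 + 0.75 = 0.93
elif u > 0.6:
    r = g * 2 + u
else:
    r = g * 4 + u
print(min(r, 1.0))        # 0.93
```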
def compare(s1, s2, seg=True):
'''
compare similarity
......@@ -257,12 +275,15 @@ def compare(s1, s2, seg=True):
seg : True : The original sentences need jieba.cut
False : The original sentences have been cut.
'''
assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should be > 0."
if seg:
s1 = ' '.join(jieba.cut(s1))
s2 = ' '.join(jieba.cut(s2))
return _similarity_distance(s1, s2)
s1 = [x for x in jieba.cut(s1)]
s2 = [x for x in jieba.cut(s2)]
else:
s1 = s1.split()
s2 = s2.split()
assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should be > 0."
return _similarity_distance(s1, s2)
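Callers can hand `compare` either raw strings or pre-tokenized text:

```python
import synonyms

# seg=True: compare() tokenizes the raw strings with jieba
print(synonyms.compare("男孩子", "小伙子", seg=True))
# seg=False: the caller passes already-segmented, space-separated text
print(synonyms.compare("男孩子", "小伙子", seg=False))
```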
def display(word):
print("'%s'近义词:" % word)
......@@ -273,7 +294,6 @@ def display(word):
for k, v in enumerate(o[0]):
print(" %d. %s:%s" % (k + 1, v, o[1][k]))
def main():
display("人脸")
display("NOT_EXIST")
......