Unverified commit 7f83b995, authored by SiMing Dai, committed via GitHub

fix gbk encode (#821)

Parent commit: f8d70245
@@ -141,3 +141,7 @@ paddlehub >= 1.8.0
 * 1.0.1
   Fixed the NoneType error caused by a return bug
+* 1.0.2
+  Fixed the issue caused by Windows `gbk` encoding
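The root cause of the fix: on Windows, Python's built-in `open()` defaults to the locale encoding returned by `locale.getpreferredencoding()`, typically `cp936` (GBK) on Chinese systems, so reading the modules' UTF-8 vocabulary, config, and model files could raise `UnicodeDecodeError`. A minimal sketch of the failure mode and the fix; the file name and contents are hypothetical stand-ins for the module's data files:

```python
# Sketch of the Windows gbk problem this commit fixes.
# "vocab.txt" is a hypothetical UTF-8 file containing Chinese text.
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write("词汇\t1\n")

# Without an explicit encoding, open() falls back to the locale default
# (locale.getpreferredencoding(False)), e.g. 'cp936'/gbk on Chinese
# Windows, so this read can raise UnicodeDecodeError there:
#     open("vocab.txt", "r").read()

# Passing encoding='utf-8' makes the read behave the same on every OS:
with open("vocab.txt", "r", encoding="utf-8") as f:
    for line in f:
        print(line.strip().split("\t"))
```

The hunks below apply exactly this one-argument change to every `open(...)` call that reads a vocabulary, config, or word-topic file.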
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -14,7 +14,7 @@ from lda_news.vocab import Vocab, WordCount
 @moduleinfo(
     name="lda_news",
-    version="1.0.1",
+    version="1.0.2",
     summary=
     "This is a PaddleHub Module for LDA topic model in news dataset, where we can calculate doc distance, calculate the similarity between query and document, etc",
     author="DesmonDay",
......
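Each module's `@moduleinfo` version is bumped from 1.0.1 to 1.0.2 so PaddleHub treats the encoding fix as a new release. A hedged usage sketch of picking up the fixed release; it assumes this PaddleHub version accepts a `version` argument to `hub.Module`:

```python
import paddlehub as hub

# Load lda_news pinned at the release that includes the utf-8 fix.
# The version argument is assumed supported by this PaddleHub release.
lda_news = hub.Module(name="lda_news", version="1.0.2")
```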
@@ -64,7 +64,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -97,7 +97,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading LDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -141,3 +141,7 @@ paddlehub >= 1.8.0
 * 1.0.1
   Fixed the NoneType error caused by a return bug
+* 1.0.2
+  Fixed the issue caused by Windows `gbk` encoding
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -14,7 +14,7 @@ from lda_novel.vocab import Vocab, WordCount
 @moduleinfo(
     name="lda_novel",
-    version="1.0.1",
+    version="1.0.2",
     summary=
     "This is a PaddleHub Module for LDA topic model in novel dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
     author="DesmonDay",
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading LDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -137,3 +137,7 @@ paddlehub >= 1.8.0
 * 1.0.1
   Fixed the NoneType error caused by a return bug
+* 1.0.2
+  Fixed the issue caused by Windows `gbk` encoding
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -14,7 +14,7 @@ from lda_webpage.vocab import Vocab, WordCount
 @moduleinfo(
     name="lda_webpage",
-    version="1.0.1",
+    version="1.0.2",
     summary=
     "This is a PaddleHub Module for LDA topic model in webpage dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
     author="DesmonDay",
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading LDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading SLDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
        """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading SLDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading SLDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading SLDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......