Unverified commit 7f83b995, authored by SiMing Dai, committed via GitHub

fix gbk encode (#821)

Parent commit: f8d70245
@@ -141,3 +141,7 @@ paddlehub >= 1.8.0
 * 1.0.1
   Fixed the NoneType error caused by a return bug
+* 1.0.2
+  Fixed the issue caused by Windows `gbk` encoding
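The root cause of the fix: on Windows, Python's built-in `open()` defaults to the locale encoding returned by `locale.getpreferredencoding()`, typically `cp936` (GBK) on Chinese systems, so reading the modules' UTF-8 vocabulary, config, and model files could raise `UnicodeDecodeError`. A minimal sketch of the failure mode and the fix; the file name and contents are hypothetical stand-ins for the module's data files:

```python
# Sketch of the Windows gbk problem this commit fixes.
# "vocab.txt" is a hypothetical UTF-8 file containing Chinese text.
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write("词汇\t1\n")

# Without an explicit encoding, open() falls back to the locale default
# (locale.getpreferredencoding(False)), e.g. 'cp936'/gbk on Chinese
# Windows, so this read can raise UnicodeDecodeError there:
#     open("vocab.txt", "r").read()

# Passing encoding='utf-8' makes the read behave the same on every OS:
with open("vocab.txt", "r", encoding="utf-8") as f:
    for line in f:
        print(line.strip().split("\t"))
```

The hunks below apply exactly this one-argument change to every `open(...)` call that reads a vocabulary, config, or word-topic file.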
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -14,7 +14,7 @@ from lda_news.vocab import Vocab, WordCount
 @moduleinfo(
     name="lda_news",
-    version="1.0.1",
+    version="1.0.2",
     summary=
     "This is a PaddleHub Module for LDA topic model in news dataset, where we can calculate doc distance, calculate the similarity between query and document, etc",
     author="DesmonDay",
......
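Each module's `@moduleinfo` version is bumped from 1.0.1 to 1.0.2 so PaddleHub treats the encoding fix as a new release. A hedged usage sketch of picking up the fixed release; it assumes this PaddleHub version accepts a `version` argument to `hub.Module`:

```python
import paddlehub as hub

# Load lda_news pinned at the release that includes the utf-8 fix.
# The version argument is assumed supported by this PaddleHub release.
lda_news = hub.Module(name="lda_news", version="1.0.2")
```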
@@ -64,7 +64,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -97,7 +97,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading LDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -141,3 +141,7 @@ paddlehub >= 1.8.0
 * 1.0.1
   Fixed the NoneType error caused by a return bug
+* 1.0.2
+  Fixed the issue caused by Windows `gbk` encoding
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -14,7 +14,7 @@ from lda_novel.vocab import Vocab, WordCount
 @moduleinfo(
     name="lda_novel",
-    version="1.0.1",
+    version="1.0.2",
     summary=
     "This is a PaddleHub Module for LDA topic model in novel dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
     author="DesmonDay",
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading LDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -137,3 +137,7 @@ paddlehub >= 1.8.0
 * 1.0.1
   Fixed the NoneType error caused by a return bug
+* 1.0.2
+  Fixed the issue caused by Windows `gbk` encoding
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -14,7 +14,7 @@ from lda_webpage.vocab import Vocab, WordCount
 @moduleinfo(
     name="lda_webpage",
-    version="1.0.1",
+    version="1.0.2",
     summary=
     "This is a PaddleHub Module for LDA topic model in webpage dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
     author="DesmonDay",
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading LDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading SLDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
        """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading SLDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading SLDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......
@@ -93,7 +93,7 @@ class TopicModel(object):
         """Load the word topic parameters.
         """
         logger.info("Loading word topic.")
-        with open(word_dict_path, 'r') as f:
+        with open(word_dict_path, 'r', encoding='utf-8') as f:
             for line in tqdm(f.readlines()):
                 fields = line.strip().split(" ")
                 assert len(fields) > 0, "Model file format error!"
......
@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
     def __load_vocab(self, vocab_path):
         """Load the word dictionary.
         """
-        with open(vocab_path, 'r') as fin:
+        with open(vocab_path, 'r', encoding='utf-8') as fin:
             vocab_size = 0
             for line in fin.readlines():
                 fields = line.strip().split('\t')
......
@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
         config: ModelConfig class
     """
     logger.info("Loading SLDA config.")
-    with open(config_file, 'r') as f:
+    with open(config_file, 'r', encoding='utf-8') as f:
         yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
     # Assignment.
......
@@ -22,7 +22,7 @@ class Vocab(object):
     def load(self, vocab_file):
         self.__term2id = {}
         self.__id2term = {}
-        with open(vocab_file, 'r') as fin:
+        with open(vocab_file, 'r', encoding='utf-8') as fin:
             for line in fin.readlines():
                 fields = line.strip().split('\t')
                 assert len(
......