未验证 提交 319ec2d8 编写于 作者: 1 123malin 提交者: GitHub

Merge branch 'master' into modify_yaml

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -12,28 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
train:
trainer:
# for cluster training
strategy: "async"
workspace: "paddlerec.models.contentunderstanding.classification"
epochs: 10
workspace: "paddlerec.models.contentunderstanding.classification"
dataset:
- name: data1
batch_size: 5
type: DataLoader
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
hyper_parameters:
optimizer:
class: Adagrad
learning_rate: 0.001
is_sparse: False
reader:
batch_size: 5
class: "{workspace}/reader.py"
train_data_path: "{workspace}/train_data"
mode: runner1
model:
models: "{workspace}/model.py"
runner:
- name: runner1
class: single_train
epochs: 10
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 100
save_last: True
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: data1
thread_num: 1
......@@ -27,19 +27,27 @@ class Model(ModelBase):
self.emb_dim = 8
self.hid_dim = 128
self.class_dim = 2
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
False)
def train_net(self):
""" network definition """
def input_data(self, is_infer=False, **kwargs):
data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
return [data, label, seq_len]
self._data_var = [data, label, seq_len]
def net(self, input, is_infer=False):
""" network definition """
data = input[0]
label = input[1]
seq_len = input[2]
# embedding layer
emb = fluid.embedding(input=data, size=[self.dict_dim, self.emb_dim])
emb = fluid.embedding(
input=data,
size=[self.dict_dim, self.emb_dim],
is_sparse=self.is_sparse)
emb = fluid.layers.sequence_unpad(emb, length=seq_len)
# convolution layer
conv = fluid.nets.sequence_conv_pool(
......@@ -59,19 +67,8 @@ class Model(ModelBase):
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
self.cost = avg_cost
self._metrics["acc"] = acc
def get_avg_cost(self):
return self.cost
def get_metrics(self):
return self._metrics
def optimizer(self):
learning_rate = 0.01
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
def infer_net(self):
self.train_net()
self._cost = avg_cost
if is_infer:
self._infer_results["acc"] = acc
else:
self._metrics["acc"] = acc
......@@ -22,7 +22,7 @@ class TrainReader(Reader):
pass
def _process_line(self, l):
l = l.strip().split(" ")
l = l.strip().split()
data = l[0:10]
seq_len = l[10:11]
label = l[11:]
......@@ -37,8 +37,6 @@ class TrainReader(Reader):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
print >> sys.stderr, str(
[('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter
......@@ -37,7 +37,18 @@
<img align="center" src="../../doc/imgs/cnn-ckim2014.png">
<p>
## 使用教程
##使用教程(快速开始)
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.tagspace
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
```
## 使用教程(复现论文)
###注意
为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果请使用以下提供的脚本下载对应数据集以及数据预处理。
### 数据处理
**(1)TagSpace**
......@@ -64,20 +75,42 @@ mv test.csv raw_big_test_data
python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
```
**(2)Classification**
### 训练
```
cd modles/contentunderstanding/tagspace
python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配置文件,使用自定义配置
```
### 预测
```
# 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径
# 修改对应模型的config.yaml,mode配置infer_runner
# 示例: mode: train_runner -> mode: infer_runner
# infer_runner中 class配置为 class: single_infer
# 修改phase阶段为infer的配置,参照config注释
# 修改完config.yaml后 执行:
python -m paddlerec.run -m ./config.yaml
```
### 训练
**(2)Classification**
### 训练
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
cd modles/contentunderstanding/classification
python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配置文件,使用自定义配置
```
### 预测
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
# 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径
# 修改对应模型的config.yaml,mode配置infer_runner
# 示例: mode: train_runner -> mode: infer_runner
# infer_runner中 class配置为 class: single_infer
# 修改phase阶段为infer的配置,参照config注释
# 修改完config.yaml后 执行:
python -m paddlerec.run -m ./config.yaml
```
## 效果对比
......
......@@ -12,38 +12,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.
train:
trainer:
# for cluster training
strategy: "async"
workspace: "paddlerec.models.contentunderstanding.tagspace"
epochs: 10
workspace: "paddlerec.models.contentunderstanding.tagspace"
dataset:
- name: sample_1
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
reader:
batch_size: 5
class: "{workspace}/reader.py"
train_data_path: "{workspace}/train_data"
hyper_parameters:
optimizer:
class: Adagrad
learning_rate: 0.001
vocab_text_size: 11447
vocab_tag_size: 4
emb_dim: 10
hid_dim: 1000
win_size: 5
margin: 0.1
neg_size: 3
num_devices: 1
model:
models: "{workspace}/model.py"
hyper_parameters:
vocab_text_size: 11447
vocab_tag_size: 4
emb_dim: 10
hid_dim: 1000
win_size: 5
margin: 0.1
neg_size: 3
num_devices: 1
mode: runner1
runner:
- name: runner1
class: single_train
epochs: 10
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 100
save_last: True
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: sample_1
thread_num: 1
......@@ -26,26 +26,30 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
self.cost = None
self.metrics = {}
self.vocab_text_size = envs.get_global_env("vocab_text_size", None,
self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None,
self._namespace)
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = envs.get_global_env("win_size", None, self._namespace)
self.margin = envs.get_global_env("margin", None, self._namespace)
self.neg_size = envs.get_global_env("neg_size", None, self._namespace)
self.vocab_text_size = envs.get_global_env(
"hyper_parameters.vocab_text_size")
self.vocab_tag_size = envs.get_global_env(
"hyper_parameters.vocab_tag_size")
self.emb_dim = envs.get_global_env("hyper_parameters.emb_dim")
self.hid_dim = envs.get_global_env("hyper_parameters.hid_dim")
self.win_size = envs.get_global_env("hyper_parameters.win_size")
self.margin = envs.get_global_env("hyper_parameters.margin")
self.neg_size = envs.get_global_env("hyper_parameters.neg_size")
def train_net(self):
""" network"""
def input_data(self, is_infer=False, **kwargs):
text = fluid.data(
name="text", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag = fluid.data(
name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
neg_tag = fluid.data(
name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64')
return [text, pos_tag, neg_tag]
self._data_var = [text, pos_tag, neg_tag]
def net(self, input, is_infer=False):
""" network"""
text = input[0]
pos_tag = input[1]
neg_tag = input[2]
text_emb = fluid.embedding(
input=text,
......@@ -97,22 +101,11 @@ class Model(ModelBase):
avg_cost = nn.mean(loss_part3)
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
correct = nn.reduce_sum(less)
self.cost = avg_cost
self.metrics["correct"] = correct
self.metrics["cos_pos"] = cos_pos
def get_avg_cost(self):
return self.cost
def get_metrics(self):
return self.metrics
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None,
self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
self._cost = avg_cost
def infer_net(self, parameter_list):
self.train_net()
if is_infer:
self._infer_results["correct"] = correct
self._infer_results["cos_pos"] = cos_pos
else:
self._metrics["correct"] = correct
self._metrics["cos_pos"] = cos_pos
......@@ -63,7 +63,7 @@ def build(dirname):
models_copy = [
'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
'tree/*.txt', 'data/sample_data/*', 'data/sample_data/train/*',
'data/sample_data/infer/*'
'data/sample_data/infer/*', 'data/*/*.csv'
]
engine_copy = ['*/*.sh']
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册