Unverified commit 737f3ca6, authored by 123malin, committed by GitHub

Merge branch 'master' into gru4rec

......@@ -119,7 +119,8 @@ class LocalClusterEngine(Engine):
"PADDLE_TRAINERS_NUM": str(worker_num),
"TRAINING_ROLE": "TRAINER",
"PADDLE_TRAINER_ID": str(i),
"FLAGS_selected_gpus": str(selected_gpus[i])
"FLAGS_selected_gpus": str(selected_gpus[i]),
"PADDLEREC_GPU_NUMS": str(selected_gpus_num)
})
os.system("mkdir -p {}".format(logs_dir))
......
......@@ -177,6 +177,13 @@ class ModelBase(object):
opt_name = envs.get_global_env("hyper_parameters.optimizer.class")
opt_lr = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
if not isinstance(opt_lr, (float, Variable)):
try:
opt_lr = float(opt_lr)
except ValueError:
raise ValueError(
"In your config yaml, 'learning_rate': %s must be written as a floating piont number,such as 0.001 or 1e-3"
% opt_lr)
opt_strategy = envs.get_global_env(
"hyper_parameters.optimizer.strategy")
......
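For reference, the check above only accepts a value that parses as a float; a minimal sketch of the corresponding YAML (following the `hyper_parameters` layout used elsewhere in this PR, with an illustrative value):

```
hyper_parameters:
  optimizer:
    class: adam
    # must be a float such as 0.001 or 1e-3; a string that cannot be
    # converted to float triggers the ValueError raised above
    learning_rate: 0.001
    strategy: async
```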
......@@ -143,6 +143,8 @@ class QueueDataset(DatasetBase):
if need_split_files:
file_list = split_files(file_list, context["fleet"].worker_index(),
context["fleet"].worker_num())
context["file_list"] = file_list
print("File_list: {}".format(file_list))
dataset.set_filelist(file_list)
......
......@@ -18,11 +18,18 @@ import os
import time
import warnings
import numpy as np
import random
import json
import logging
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.utils.util import shuffle_files
from paddlerec.core.metric import Metric
logging.basicConfig(
format='%(asctime)s - %(levelname)s: %(message)s', level=logging.INFO)
__all__ = [
"RunnerBase", "SingleRunner", "PSRunner", "CollectiveRunner", "PslibRunner"
]
......@@ -88,12 +95,12 @@ class RunnerBase(object):
reader_name = model_dict["dataset_name"]
model_name = model_dict["name"]
model_class = context["model"][model_dict["name"]]["model"]
fetch_vars = []
fetch_alias = []
fetch_period = int(
envs.get_global_env("runner." + context["runner_name"] +
".print_interval", 20))
scope = context["model"][model_name]["scope"]
program = context["model"][model_name]["main_program"]
reader = context["dataset"][reader_name]
......@@ -133,6 +140,9 @@ class RunnerBase(object):
fetch_period = int(
envs.get_global_env("runner." + context["runner_name"] +
".print_interval", 20))
save_step_interval = int(
envs.get_global_env("runner." + context["runner_name"] +
".save_step_interval", -1))
if context["is_infer"]:
metrics = model_class.get_infer_results()
else:
......@@ -140,18 +150,33 @@ class RunnerBase(object):
metrics_varnames = []
metrics_format = []
if context["is_infer"]:
metrics_format.append("\t[Infer] {}: {{}}".format("batch"))
else:
metrics_format.append("\t[Train]")
if "current_epoch" in context:
metrics_format.append(" epoch: {}".format(context[
"current_epoch"]))
metrics_format.append(" {}: {{}}".format("batch"))
metrics_format.append("{}: {{:.2f}}s".format("time_each_interval"))
metrics_names = ["total_batch"]
metrics_format.append("{}: {{}}".format("batch"))
metrics_indexes = dict()
for name, var in metrics.items():
metrics_names.append(name)
metrics_varnames.append(var.name)
metrics_indexes[var.name] = len(metrics_varnames) - 1
metrics_format.append("{}: {{}}".format(name))
metrics_format = ", ".join(metrics_format)
reader = context["model"][model_dict["name"]]["model"]._data_loader
reader.start()
batch_id = 0
begin_time = time.time()
scope = context["model"][model_name]["scope"]
runner_results = []
result = None
with fluid.scope_guard(scope):
try:
......@@ -160,20 +185,61 @@ class RunnerBase(object):
program=program,
fetch_list=metrics_varnames,
return_numpy=False)
metrics = [batch_id]
metrics = [batch_id]
metrics_rets = [
as_numpy(metrics_tensor)
for metrics_tensor in metrics_tensors
]
metrics.extend(metrics_rets)
batch_runner_result = {}
for k, v in metrics_indexes.items():
batch_runner_result[k] = np.array(metrics_rets[
v]).tolist()
runner_results.append(batch_runner_result)
if batch_id % fetch_period == 0 and batch_id != 0:
print(metrics_format.format(*metrics))
end_time = time.time()
seconds = end_time - begin_time
metrics_logging = metrics[:]
metrics_logging.insert(1, seconds)
begin_time = end_time
logging.info(metrics_format.format(*metrics_logging))
if save_step_interval >= 1 and batch_id % save_step_interval == 0 and context[
"is_infer"] == False:
if context["fleet_mode"].upper() == "PS":
train_prog = context["model"][model_dict["name"]][
"main_program"]
else:
train_prog = context["model"][model_dict["name"]][
"default_main_program"]
startup_prog = context["model"][model_dict["name"]][
"startup_program"]
with fluid.program_guard(train_prog, startup_prog):
self.save(
context,
is_fleet=context["is_fleet"],
epoch_id=None,
batch_id=batch_id)
batch_id += 1
except fluid.core.EOFException:
reader.reset()
runner_result_save_path = envs.get_global_env(
"runner." + context["runner_name"] + ".runner_result_dump_path",
None)
if runner_result_save_path:
if "current_epoch" in context:
runner_result_save_path = runner_result_save_path + "_epoch_{}".format(
context["current_epoch"])
logging.info("Dump runner result in {}".format(
runner_result_save_path))
with open(runner_result_save_path, 'w+') as fout:
json.dump(runner_results, fout)
if batch_id > 0:
result = dict(zip(metrics_names, metrics))
return result
......@@ -270,7 +336,7 @@ class RunnerBase(object):
exec_strategy=_exe_strategy)
return program
def save(self, epoch_id, context, is_fleet=False):
def save(self, context, is_fleet=False, epoch_id=None, batch_id=None):
def need_save(epoch_id, epoch_interval, is_last=False):
name = "runner." + context["runner_name"] + "."
total_epoch = int(envs.get_global_env(name + "epochs", 1))
......@@ -327,7 +393,8 @@ class RunnerBase(object):
assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id))
logging.info("\tsave epoch_id:%d model into: \"%s\"" %
(epoch_id, dirname))
if is_fleet:
warnings.warn(
"Save inference model in cluster training is not recommended! Using save checkpoint instead.",
......@@ -350,14 +417,35 @@ class RunnerBase(object):
if dirname is None or dirname == "":
return
dirname = os.path.join(dirname, str(epoch_id))
logging.info("\tsave epoch_id:%d model into: \"%s\"" %
(epoch_id, dirname))
if is_fleet:
if context["fleet"].worker_index() == 0:
context["fleet"].save_persistables(context["exe"], dirname)
else:
fluid.io.save_persistables(context["exe"], dirname)
def save_checkpoint_step():
name = "runner." + context["runner_name"] + "."
save_interval = int(
envs.get_global_env(name + "save_step_interval", -1))
dirname = envs.get_global_env(name + "save_step_path", None)
if dirname is None or dirname == "":
return
dirname = os.path.join(dirname, str(batch_id))
logging.info("\tsave batch_id:%d model into: \"%s\"" %
(batch_id, dirname))
if is_fleet:
if context["fleet"].worker_index() == 0:
context["fleet"].save_persistables(context["exe"], dirname)
else:
fluid.io.save_persistables(context["exe"], dirname)
save_persistables()
save_inference_model()
if isinstance(epoch_id, int):
save_persistables()
save_inference_model()
if isinstance(batch_id, int):
save_checkpoint_step()
class SingleRunner(RunnerBase):
......@@ -376,7 +464,13 @@ class SingleRunner(RunnerBase):
for model_dict in context["phases"]:
model_class = context["model"][model_dict["name"]]["model"]
metrics = model_class._metrics
if "shuffle_filelist" in model_dict:
need_shuffle_files = model_dict.get("shuffle_filelist",
None)
filelist = context["file_list"]
context["file_list"] = shuffle_files(need_shuffle_files,
filelist)
context["current_epoch"] = epoch
begin_time = time.time()
result = self._run(context, model_dict)
end_time = time.time()
......@@ -403,7 +497,7 @@ class SingleRunner(RunnerBase):
startup_prog = context["model"][model_dict["name"]][
"startup_program"]
with fluid.program_guard(train_prog, startup_prog):
self.save(epoch, context)
self.save(context=context, epoch_id=epoch)
context["status"] = "terminal_pass"
......@@ -420,6 +514,12 @@ class PSRunner(RunnerBase):
model_class = context["model"][model_dict["name"]]["model"]
metrics = model_class._metrics
for epoch in range(epochs):
if "shuffle_filelist" in model_dict:
need_shuffle_files = model_dict.get("shuffle_filelist", None)
filelist = context["file_list"]
context["file_list"] = shuffle_files(need_shuffle_files,
filelist)
context["current_epoch"] = epoch
begin_time = time.time()
result = self._run(context, model_dict)
end_time = time.time()
......@@ -450,7 +550,7 @@ class PSRunner(RunnerBase):
startup_prog = context["model"][model_dict["name"]][
"startup_program"]
with fluid.program_guard(train_prog, startup_prog):
self.save(epoch, context, True)
self.save(context=context, is_fleet=True, epoch_id=epoch)
context["status"] = "terminal_pass"
......@@ -465,6 +565,12 @@ class CollectiveRunner(RunnerBase):
".epochs"))
model_dict = context["env"]["phase"][0]
for epoch in range(epochs):
if "shuffle_filelist" in model_dict:
need_shuffle_files = model_dict.get("shuffle_filelist", None)
filelist = context["file_list"]
context["file_list"] = shuffle_files(need_shuffle_files,
filelist)
context["current_epoch"] = epoch
begin_time = time.time()
self._run(context, model_dict)
end_time = time.time()
......@@ -477,7 +583,7 @@ class CollectiveRunner(RunnerBase):
startup_prog = context["model"][model_dict["name"]][
"startup_program"]
with fluid.program_guard(train_prog, startup_prog):
self.save(epoch, context, True)
self.save(context=context, is_fleet=True, epoch_id=epoch)
context["status"] = "terminal_pass"
......@@ -493,6 +599,12 @@ class PslibRunner(RunnerBase):
envs.get_global_env("runner." + context["runner_name"] +
".epochs"))
for epoch in range(epochs):
if "shuffle_filelist" in model_dict:
need_shuffle_files = model_dict.get("shuffle_filelist", None)
filelist = context["file_list"]
context["file_list"] = shuffle_files(need_shuffle_files,
filelist)
context["current_epoch"] = epoch
begin_time = time.time()
self._run(context, model_dict)
end_time = time.time()
......@@ -555,6 +667,12 @@ class SingleInferRunner(RunnerBase):
metrics = model_class._infer_results
self._load(context, model_dict,
self.epoch_model_path_list[index])
if "shuffle_filelist" in model_dict:
need_shuffle_files = model_dict.get("shuffle_filelist",
None)
filelist = context["file_list"]
context["file_list"] = shuffle_files(need_shuffle_files,
filelist)
begin_time = time.time()
result = self._run(context, model_dict)
end_time = time.time()
......
......@@ -14,6 +14,7 @@
from __future__ import print_function
import os
import warnings
from paddlerec.core.utils.envs import lazy_instance_by_fliename
from paddlerec.core.utils.envs import get_global_env
from paddlerec.core.utils.envs import get_runtime_environ
......@@ -47,6 +48,16 @@ def dataloader_by_name(readerclass,
files.sort()
# for local cluster: discard some files if files cannot be divided equally between GPUs
if (context["device"] == "GPU") and "PADDLEREC_GPU_NUMS" in os.environ:
selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
discard_file_nums = len(files) % selected_gpu_nums
if (discard_file_nums != 0):
warnings.warn(
"Because files cannot be divided equally between GPUs,discard these files:{}".
format(files[-discard_file_nums:]))
files = files[:len(files) - discard_file_nums]
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
......@@ -59,7 +70,7 @@ def dataloader_by_name(readerclass,
if need_split_files:
files = split_files(files, context["fleet"].worker_index(),
context["fleet"].worker_num())
context["file_list"] = files
reader = reader_class(yaml_file)
reader.init()
......@@ -109,6 +120,16 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
files.sort()
# for local cluster: discard some files if files cannot be divided equally between GPUs
if (context["device"] == "GPU") and "PADDLEREC_GPU_NUMS" in os.environ:
selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
discard_file_nums = len(files) % selected_gpu_nums
if (discard_file_nums != 0):
warnings.warn(
"Because files cannot be divided equally between GPUs,discard these files:{}".
format(files[-discard_file_nums:]))
files = files[:len(files) - discard_file_nums]
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
......@@ -121,7 +142,7 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
if need_split_files:
files = split_files(files, context["fleet"].worker_index(),
context["fleet"].worker_num())
context["file_list"] = files
sparse = get_global_env(name + "sparse_slots", "#")
if sparse == "":
sparse = "#"
......@@ -153,73 +174,3 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
if hasattr(reader, 'generate_batch_from_trainfiles'):
return gen_batch_reader()
return gen_reader
def slotdataloader(readerclass, train, yaml_file, context):
if train == "TRAIN":
reader_name = "SlotReader"
namespace = "train.reader"
data_path = get_global_env("train_data_path", None, namespace)
else:
reader_name = "SlotReader"
namespace = "evaluate.reader"
data_path = get_global_env("test_data_path", None, namespace)
if data_path.startswith("paddlerec::"):
package_base = get_runtime_environ("PACKAGE_BASE")
assert package_base is not None
data_path = os.path.join(package_base, data_path.split("::")[1])
hidden_file_list, files = check_filelist(
hidden_file_list=[], data_file_list=[], train_data_path=data_path)
if (hidden_file_list is not None):
print(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}".
format(hidden_file_list))
files.sort()
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
need_split_files = True
elif context["engine"] == EngineMode.CLUSTER and context[
"cluster_type"] == "K8S":
# for k8s mount mode, split files for every node
need_split_files = True
if need_split_files:
files = split_files(files, context["fleet"].worker_index(),
context["fleet"].worker_num())
sparse = get_global_env("sparse_slots", "#", namespace)
if sparse == "":
sparse = "#"
dense = get_global_env("dense_slots", "#", namespace)
if dense == "":
dense = "#"
padding = get_global_env("padding", 0, namespace)
reader = SlotReader(yaml_file)
reader.init(sparse, dense, int(padding))
def gen_reader():
for file in files:
with open(file, 'r') as f:
for line in f:
line = line.rstrip('\n')
iter = reader.generate_sample(line)
for parsed_line in iter():
if parsed_line is None:
continue
else:
values = []
for pased in parsed_line:
values.append(pased[1])
yield values
def gen_batch_reader():
return reader.generate_batch_from_trainfiles(files)
if hasattr(reader, 'generate_batch_from_trainfiles'):
return gen_batch_reader()
return gen_reader
......@@ -20,7 +20,7 @@ import socket
import sys
import six
import traceback
import six
import warnings
global_envs = {}
global_envs_flatten = {}
......@@ -98,6 +98,25 @@ def set_global_envs(envs):
value = os_path_adapter(workspace_adapter(value))
global_envs[name] = value
for runner in envs["runner"]:
if "save_step_interval" in runner or "save_step_path" in runner:
phase_name = runner["phases"]
phase = [
phase for phase in envs["phase"]
if phase["name"] == phase_name[0]
]
dataset_name = phase[0].get("dataset_name")
dataset = [
dataset for dataset in envs["dataset"]
if dataset["name"] == dataset_name
]
if dataset[0].get("type") == "QueueDataset":
runner["save_step_interval"] = None
runner["save_step_path"] = None
warnings.warn(
"QueueDataset can not support save by step, please not config save_step_interval and save_step_path in your yaml"
)
if get_platform() != "LINUX":
for dataset in envs["dataset"]:
name = ".".join(["dataset", dataset["name"], "type"])
......
......@@ -16,6 +16,8 @@ import datetime
import os
import sys
import time
import warnings
import random
import numpy as np
from paddle import fluid
......@@ -223,6 +225,16 @@ def check_filelist(hidden_file_list, data_file_list, train_data_path):
return hidden_file_list, data_file_list
def shuffle_files(need_shuffle_files, filelist):
if not isinstance(need_shuffle_files, bool):
raise ValueError(
"In your config yaml, 'shuffle_filelist': %s must be written as a boolean type,such as True or False"
% need_shuffle_files)
elif need_shuffle_files:
random.shuffle(filelist)
return filelist
class CostPrinter(object):
"""
For count cost time && print cost log
......
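The new `shuffle_files` helper is driven by the optional `shuffle_filelist` key that the runners above read from each phase dict; a minimal phase sketch (the fields other than `shuffle_filelist` follow the phase examples elsewhere in this document, and the values are illustrative):

```
phase:
- name: phase_train
  model: "{workspace}/model.py"
  dataset_name: dataset_train
  thread_num: 1
  # must be a boolean; anything else raises the ValueError in shuffle_files
  shuffle_filelist: True
```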
......@@ -7,9 +7,27 @@ PaddleRec基于业务实践,使用真实数据,产出了推荐领域算法
### Download
```bash
wget xxx.tar.gz
wget https://paddlerec.bj.bcebos.com/textcnn_pretrain%2Fpretrain_model.tar.gz
```
### Usage
After decompressing it, you get a Paddle model folder; load it with the `PaddleRec/models/contentunderstanding/classification_finetue` model.
After decompressing it, you get a Paddle model folder; load it with the `PaddleRec/models/contentunderstanding/textcnn` model.
You can find the finetune_startup.py file in PaddleRec/models/contentunderstanding/textcnn_pretrain, and configure the two parameters startup_class_path and init_pretraining_model_path in config.yaml.
Set startup_class_path to the location of the finetune_startup.py file, and set init_pretraining_model_path to the parameter files you want to load.
Taking textcnn_pretrain as an example, the configured runner looks like this:
```
runner:
- name: train_runner
class: train
epochs: 6
device: cpu
save_checkpoint_interval: 1
save_checkpoint_path: "increment"
init_model_path: ""
print_interval: 10
startup_class_path: "{workspace}/finetune_startup.py"
init_pretraining_model_path: "{workspace}/pretrain_model/pretrain_model_params"
phases: phase_train
```
For detailed usage, refer to the textcnn guide [Finetuning with a pretrained model](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/contentunderstanding/textcnn_pretrain).
......@@ -20,7 +20,7 @@ python -m paddlerec.run -m paddlerec.models.xxx.yyy
For example, to launch the default configuration of the `word2vec` model under `recall`:
```shell
python -m paddlerec.run -m models/recall/word2vec
python -m paddlerec.run -m models/recall/word2vec/config.yaml
```
### 2. Launch training of a built-in model with a customized configuration
......
......@@ -27,6 +27,8 @@
| init_model_path | string | path | No | Path from which to initialize the model |
| save_checkpoint_interval | int | >= 1 | No | Epoch interval for saving checkpoint parameters |
| save_checkpoint_path | string | path | No | Path for saving checkpoint parameters |
| save_step_interval | int | >= 1 | No | Batch interval for step-wise checkpoint saving |
| save_step_path | string | path | No | Path for step-wise checkpoint saving |
| save_inference_interval | int | >= 1 | No | Epoch interval for saving the inference model |
| save_inference_path | string | path | No | Path for saving the inference model |
| save_inference_feed_varnames | list[string] | names of Variables in the network | No | Names of the inference model's feed (input) variables |
......@@ -37,6 +39,9 @@
| startup_class_path | string | path | No | Path of a custom startup-stage implementation |
| runner_class_path | string | path | No | Path of a custom runner-stage implementation |
| terminal_class_path | string | path | No | Path of a custom terminal-stage implementation |
| init_pretraining_model_path | string | path | No | Passed to the custom startup stage; path of the parameters to load for finetuning |
| runner_result_dump_path | string | path | No | File path where runtime metrics are dumped with json.dump; when used in a training runner, an epoch suffix is appended automatically |
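A minimal runner sketch combining the new step-wise save and result-dump options described above (the values are illustrative, the remaining fields follow the runner examples elsewhere in this document, and step-wise saving is not available with QueueDataset):

```
runner:
- name: train_runner
  class: train
  epochs: 6
  device: cpu
  print_interval: 10
  # save a checkpoint every 1000 batches during training
  save_step_interval: 1000
  save_step_path: "step_save"
  # per-batch metrics are written here with json.dump; an "_epoch_N" suffix
  # is appended automatically when used in a training runner
  runner_result_dump_path: "runner_result"
```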
......
# Content Understanding Model Library
## Introduction
We provide PaddleRec implementations of models commonly used in content-understanding tasks, together with single-machine training & inference quality metrics and distributed training & inference performance metrics. The implemented content-understanding models include [Tagspace](tagspace) and [text classification](classification).
We provide PaddleRec implementations of models commonly used in content-understanding tasks, together with single-machine training & inference quality metrics and distributed training & inference performance metrics. The implemented content-understanding models include [Tagspace](tagspace), [text classification](textcnn), and [a textcnn-based pretrained model](textcnn_pretrain).
The model library is continuously growing; stay tuned.
......@@ -23,7 +23,7 @@
| Model | Description | Paper |
| :------------------: | :--------------------: | :---------: |
| TagSpace | Tag recommendation | [EMNLP 2014][TagSpace: Semantic Embeddings from Hashtags](https://www.aclweb.org/anthology/D14-1194.pdf) |
| Classification | Text classification | [EMNLP 2014][Convolutional neural networks for sentence classification](https://www.aclweb.org/anthology/D14-1181.pdf) |
| textcnn | Text classification | [EMNLP 2014][Convolutional neural networks for sentence classification](https://www.aclweb.org/anthology/D14-1181.pdf) |
Below is a brief description of each model (note: the figures are taken from the linked papers).
......@@ -32,7 +32,7 @@
<img align="center" src="../../doc/imgs/tagspace.png">
<p>
[Text classification CNN model](https://www.aclweb.org/anthology/D14-1181.pdf)
[textCNN model](https://www.aclweb.org/anthology/D14-1181.pdf)
<p align="center">
<img align="center" src="../../doc/imgs/cnn-ckim2014.png">
<p>
......@@ -42,7 +42,7 @@
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd PaddleRec
python -m paddlerec.run -m models/contentunderstanding/tagspace/config.yaml
python -m paddlerec.run -m models/contentunderstanding/classification/config.yaml
python -m paddlerec.run -m models/contentunderstanding/textcnn/config.yaml
```
## Tutorial (reproducing the papers)
......@@ -134,7 +134,7 @@ batch: 13, acc: [0.928], loss: [0.01736144]
batch: 14, acc: [0.93], loss: [0.01911209]
```
**(2)Classification**
**(2)textcnn**
### Data processing
Sentiment classification (Senta) automatically determines the sentiment polarity of Chinese text containing subjective descriptions and gives a corresponding confidence score. Sentiment is classified as positive or negative. Sentiment analysis helps businesses understand user consumption habits, analyze trending topics, and monitor public-opinion crises, providing useful decision support.
......@@ -206,4 +206,4 @@ batch: 3, acc: [0.90234375], loss: [0.27907994]
| Dataset | Model | loss | acc |
| :------------------: | :--------------------: | :---------: |:---------: |
| ag news dataset | TagSpace | 0.0198 | 0.9177 |
| ChnSentiCorp | Classification | 0.2282 | 0.9127 |
| ChnSentiCorp | textcnn | 0.2282 | 0.9127 |
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "models/contentunderstanding/classification"
workspace: "models/contentunderstanding/textcnn"
dataset:
- name: data1
......
# classification: a text classification model
# textcnn: a text classification model
Below is a brief directory structure and description for this example:
```
├── data #sample data
├── train
├── train.txt #training data sample
├── test
├── test.txt #test data sample
├── preprocess.py #data preprocessing script
├── train
├── train.txt #training data sample
├── test
├── test.txt #test data sample
├── preprocess.py #data preprocessing script
├── __init__.py
├── README.md #documentation
├── model.py #model file
├── config.yaml #configuration file
├── reader.py #reader script
```
Note: before reading this example, we recommend that you first get familiar with the following:
[PaddleRec getting-started tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
......@@ -44,7 +43,7 @@ Yoon Kim在论文[EMNLP 2014][Convolutional neural networks for sentence classic
| Model | dev | test |
| :------| :------ | :------
| TextCNN | 90.75% | 92.19% |
| TextCNN | 90.75% | 91.27% |
You can run the following command to download our pre-tokenized dataset. After extraction, the senta_data directory contains the training data (train.tsv), development data (dev.tsv), test data (test.tsv), and the corresponding vocabulary (word_dict.txt):
......@@ -73,13 +72,13 @@ os : windows/linux/macos
Sample data is provided for a quick try; run the following command directly in the paddlerec directory to start training:
```
python -m paddlerec.run -m models/contentunderstanding/classification/config.yaml
python -m paddlerec.run -m models/contentunderstanding/textcnn/config.yaml
```
## Reproducing the results
To help users quickly run every model end to end, sample data is provided under each model. To reproduce the results in this README, follow the steps below in order.
1. Make sure your current directory is PaddleRec/models/contentunderstanding/classification
1. Make sure your current directory is PaddleRec/models/contentunderstanding/textcnn
2. Download and extract the dataset with the following commands:
```
wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import ModelBase
from paddlerec.core.metrics import RecallK
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
self.dict_size = 2000000 + 1
self.max_seq_len = 1024
self.emb_dim = 128
self.cnn_hid_dim = 128
self.cnn_win_size = 3
self.cnn_win_size2 = 5
self.hid_dim1 = 96
self.class_dim = 30
self.is_sparse = True
def input_data(self, is_infer=False, **kwargs):
text = fluid.data(
name="text", shape=[None, self.max_seq_len, 1], dtype='int64')
label = fluid.data(name="category", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
return [text, label, seq_len]
def net(self, inputs, is_infer=False):
""" network definition """
#text label
self.data = inputs[0]
self.label = inputs[1]
self.seq_len = inputs[2]
emb = embedding(self.data, self.dict_size, self.emb_dim,
self.is_sparse)
concat = multi_convs(emb, self.seq_len, self.cnn_hid_dim,
self.cnn_win_size, self.cnn_win_size2)
self.fc_1 = full_connect(concat, self.hid_dim1)
self.metrics(is_infer)
def metrics(self, is_infer=False):
""" classification and metrics """
# softmax layer
prediction = fluid.layers.fc(input=[self.fc_1],
size=self.class_dim,
act="softmax",
name="pretrain_fc_1")
cost = fluid.layers.cross_entropy(input=prediction, label=self.label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=self.label)
#acc = RecallK(input=prediction, label=label, k=1)
self._cost = avg_cost
if is_infer:
self._infer_results["acc"] = acc
else:
self._metrics["acc"] = acc
def embedding(inputs, dict_size, emb_dim, is_sparse):
""" embeding definition """
emb = fluid.layers.embedding(
input=inputs,
size=[dict_size, emb_dim],
is_sparse=is_sparse,
param_attr=fluid.ParamAttr(
name='pretrain_word_embedding',
initializer=fluid.initializer.Xavier()))
return emb
def multi_convs(input_layer, seq_len, cnn_hid_dim, cnn_win_size,
cnn_win_size2):
"""conv and concat"""
emb = fluid.layers.sequence_unpad(
input_layer, length=seq_len, name="pretrain_unpad")
conv = fluid.nets.sequence_conv_pool(
param_attr=fluid.ParamAttr(name="pretrain_conv0_w"),
bias_attr=fluid.ParamAttr(name="pretrain_conv0_b"),
input=emb,
num_filters=cnn_hid_dim,
filter_size=cnn_win_size,
act="tanh",
pool_type="max")
conv2 = fluid.nets.sequence_conv_pool(
param_attr=fluid.ParamAttr(name="pretrain_conv1_w"),
bias_attr=fluid.ParamAttr(name="pretrain_conv1_b"),
input=emb,
num_filters=cnn_hid_dim,
filter_size=cnn_win_size2,
act="tanh",
pool_type="max")
concat = fluid.layers.concat(
input=[conv, conv2], axis=1, name="pretrain_concat")
return concat
def full_connect(input_layer, hid_dim1):
"""full connect layer"""
fc_1 = fluid.layers.fc(name="pretrain_fc_0",
input=input_layer,
size=hid_dim1,
act="tanh")
return fc_1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "models/contentunderstanding/textcnn_pretrain"
dataset:
- name: dataset_train
batch_size: 128
type: DataLoader
data_path: "{workspace}/senta_data/train"
data_converter: "{workspace}/reader.py"
- name: dataset_infer
batch_size: 256
type: DataLoader
data_path: "{workspace}/senta_data/test"
data_converter: "{workspace}/reader.py"
hyper_parameters:
optimizer:
class: adam
learning_rate: 0.001
strategy: async
mode: [train_runner,infer_runner]
runner:
- name: train_runner
class: train
epochs: 6
device: cpu
save_checkpoint_interval: 1
save_checkpoint_path: "increment"
init_model_path: ""
print_interval: 10
# startup class for finetuning
startup_class_path: "{workspace}/finetune_startup.py"
# path of pretrained model. Please set empty if you don't use finetune function.
init_pretraining_model_path: "{workspace}/pretrain_model/pretrain_model_params"
phases: phase_train
- name: infer_runner
class: infer
# device to run training or infer
device: cpu
print_interval: 1
init_model_path: "increment/3" # load model path
phases: phase_infer
phase:
- name: phase_train
model: "{workspace}/model.py"
dataset_name: dataset_train
thread_num: 1
- name: phase_infer
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_infer # select dataset by name
thread_num: 1
# encoding=utf-8
import os
import sys
def build_word_dict():
word_file = "word_dict.txt"
f = open(word_file, "r")
word_dict = {}
lines = f.readlines()
for line in lines:
word = line.strip().split("\t")
word_dict[word[0]] = word[1]
f.close()
return word_dict
def build_token_data(word_dict, txt_file, token_file):
max_text_size = 100
f = open(txt_file, "r")
fout = open(token_file, "w")
lines = f.readlines()
i = 0
for line in lines:
line = line.strip("\n").split("\t")
text = line[0].strip("\n").split(" ")
tokens = []
label = line[1]
for word in text:
if word in word_dict:
tokens.append(str(word_dict[word]))
else:
tokens.append("0")
seg_len = len(tokens)
if seg_len < 5:
continue
if seg_len >= max_text_size:
tokens = tokens[:max_text_size]
seg_len = max_text_size
else:
tokens = tokens + ["0"] * (max_text_size - seg_len)
text_tokens = " ".join(tokens)
fout.write(text_tokens + " " + str(seg_len) + " " + label + "\n")
if (i + 1) % 100 == 0:
print(str(i + 1) + " lines OK")
i += 1
fout.close()
f.close()
word_dict = build_word_dict()
txt_file = "test.tsv"
token_file = "test.txt"
build_token_data(word_dict, txt_file, token_file)
txt_file = "dev.tsv"
token_file = "dev.txt"
build_token_data(word_dict, txt_file, token_file)
txt_file = "train.tsv"
token_file = "train.txt"
build_token_data(word_dict, txt_file, token_file)
5681 17044 4352 7574 16576 3574 32952 12211 18835 28961 15320 2019 21675 30604 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 1
9054 31881 4449 12211 12488 5975 3574 28592 2547 2547 14132 3574 24908 5975 24285 10010 3574 31872 20925 9886 12211 26530 3567 30818 19640 22506 28312 19887 12211 28212 8576 3574 28592 12306 14132 539 33049 9039 14160 113 3567 19675 5511 2111 623 12068 12211 3574 18416 12068 19680 12211 30781 21946 1525 9886 3574 28109 31201 3567 25710 30503 30781 12068 19887 12211 22052 3574 2050 5402 10217 31201 1525 9698 14160 19887 3574 26209 24908 539 33049 9039 32949 8890 29693 3566 3566 11053 30781 26853 3567 3567 0 0 0 0 0 0 0 0 92 0
19640 32771 31526 16576 13354 3574 5087 30781 7902 19037 12211 0 3574 4756 15048 11063 0 15019 16576 2019 29812 2276 22804 13275 2019 24599 12211 30294 6983 26606 1467 3574 18448 8052 16576 23091 32440 11034 16576 3574 1470 6983 1346 31382 13354 3574 11711 10074 28587 5030 19058 16576 2019 16497 6890 12223 30035 6983 1112 18448 30837 11280 24599 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 64 0
7513 19838 3562 32737 15474 3562 1887 15474 0 0 18835 19813 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 1
30325 3574 30788 12211 25843 11533 30150 8937 11309 8690 12211 14166 2200 3574 15802 0 20424 14166 25336 113 16576 11533 24294 12211 26301 16576 3574 28592 16191 12211 8690 13743 0 517 12211 0 0 23958 3574 31019 19680 13841 15337 12211 23958 30781 28630 3574 8690 12700 11280 12211 23958 24908 20409 7481 8052 6094 4002 30245 3574 1526 9904 27032 31347 24006 12211 14166 0 9910 24908 12211 0 2019 25469 17293 27438 29774 13757 24908 22301 28505 25450 12211 14039 3574 28801 4621 4879 3574 623 9904 23958 14166 18417 4895 113 11114 2018 113 100 1
113 16576 17947 28955 12211 24253 3574 22068 30167 12211 14039 30818 28640 7801 2019 7985 30167 5402 6805 0 12211 27645 33067 30151 3574 11110 12211 10710 4549 22708 4308 24908 25975 12211 26957 0 2019 17942 25575 227 19641 1525 13129 113 15492 23224 3574 21163 15565 23273 29004 12452 13233 27573 12211 12046 2019 302 19367 16576 27914 0 0 113 12211 28035 0 13743 13330 24390 12466 1525 12537 3574 18131 2019 9315 25720 27416 2276 15038 18162 10024 28955 3574 10097 18162 26594 12211 21949 3574 30788 12133 26362 1779 27386 21017 14295 1525 454 100 1
33022 4169 19038 25096 3574 19185 113 25010 0 0 10511 17460 28972 6574 3574 1409 0 10010 3574 33022 129 16186 10511 17460 15182 3574 20235 10511 17460 11226 27150 13166 3562 18835 19038 5391 3574 22195 8052 28892 31948 10960 3574 13367 29338 15048 11030 22185 18621 28776 5205 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 52 0
23439 330 0 0 29655 12211 3574 4211 3574 19650 19640 13757 3562 0 0 8990 330 0 0 18920 12211 31924 6688 31857 15364 3574 19641 30781 18416 28952 9209 12211 118 10710 16912 3562 0 0 27771 330 0 0 10126 30325 3574 15374 4348 0 6356 28420 24193 29526 12211 10523 21872 3571 24383 1580 3574 17536 1525 14745 21674 10710 4952 14871 3574 14590 20306 7695 0 32718 3562 0 0 13260 330 0 0 5847 30325 3574 25951 26995 21163 22787 15535 20889 3574 27914 5391 130 2276 15243 6356 0 16576 3562 0 0 100 1
24908 32568 24044 28952 16576 27914 28955 3574 14160 13543 16582 5536 2019 11711 3527 19675 12211 15474 3574 0 14160 31857 30927 2019 18416 9231 12486 12211 20374 3574 1111 30173 19058 3574 31857 31825 3574 30170 15501 21070 2019 31383 19640 5004 3574 31858 12211 6408 2733 8034 24870 12730 12211 16401 2019 18416 19640 9072 18416 12211 2313 12211 20374 3574 18416 2313 25575 19315 31383 20374 20161 24160 3574 11711 3527 3574 31383 20374 31857 28378 2019 1296 5402 23273 16576 2019 16497 28952 2019 9512 15038 5536 3574 11711 10486 15168 19641 21994 0 2019 100 1
0 7902 5402 29107 16576 15535 15535 15535 0 19634 21017 12211 26505 14160 15129 0 15535 15535 15535 26211 4002 9749 23360 16576 15535 15535 15535 26040 15535 15535 15535 15535 11698 32986 19641 0 22421 15535 15535 15535 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 0
28955 17755 3574 1735 18232 19262 12992 12230 3574 18416 30781 7388 19680 19643 16576 12211 3574 28952 9209 3574 16572 22360 2019 19680 19643 6414 12211 2011 27666 2012 3574 13757 32205 3574 14754 11280 12211 22186 7628 1827 17413 3574 19641 30781 31383 12211 4853 2019 33140 113 6047 6414 3310 31383 3574 4654 22360 6580 26147 12211 18696 2019 12306 6414 20539 3574 12680 22360 18624 8051 29384 1146 2019 18046 33188 16582 29384 12211 17311 13222 3574 18416 7453 28961 8014 3574 11711 18416 28961 17658 3574 29384 30781 19893 19643 15073 12211 32171 12211 2019 100 0
28955 12211 30964 14590 28961 4412 29183 29493 6393 17111 29183 11670 12211 19636 23233 28961 4412 29183 25469 1112 16603 14590 16720 28961 9749 32365 23958 12211 33245 1525 11271 29183 29607 4694 8052 12068 32247 26813 29183 12229 6856 3674 330 30326 972 32948 29183 18416 28961 20161 1120 19641 30054 28955 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 55 0
28587 26594 16393 14439 20100 8452 12211 11738 3574 20288 2276 2770 9051 29266 3574 27097 12211 0 14648 7902 5827 4308 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 1
19083 3561 20034 30173 8356 3574 18416 18016 6154 13757 30827 23410 4879 5213 3566 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 1
28587 14745 2018 1580 3574 19636 9052 14160 19683 16576 0 0 6007 5361 26370 5391 785 3574 0 17010 28587 27857 19048 20558 9051 3574 6007 0 0 22897 18323 1447 2019 0 0 32391 17536 24961 19048 9749 18448 3574 24283 6356 7648 26789 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 47 0
24908 18920 1400 665 16167 12211 17293 3574 13518 28952 8393 23504 3574 31266 12211 30781 4477 2019 4654 18896 4289 13841 4822 3574 24908 27376 15243 18416 8052 20077 17493 17317 3574 14842 16949 3574 12081 28961 2276 0 14399 20158 14398 16335 12211 3699 7697 6318 69 2019 11924 8053 27376 12211 14039 3574 21210 23273 3574 1732 30818 17942 22561 3083 2019 17268 12700 28892 9108 16576 26203 19037 23872 3574 14988 31773 3574 33140 1725 24908 0 8053 8052 13841 3574 25944 0 2019 4032 5025 13841 19185 12211 14039 3574 665 0 12211 4822 6988 100 1
29728 31619 6149 5402 113 7317 11738 3574 31482 11924 16576 17657 6541 9761 3574 31224 5402 21141 3574 6356 16191 19640 14451 26154 7192 16076 3567 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 27 0
29302 11364 19059 13652 12211 3574 7898 30781 6356 7961 14954 21752 7340 2019 29302 11401 8328 3574 20384 20034 1460 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 21 0
4592 12211 31382 11030 3574 7961 6356 136 11714 31881 31478 3574 7957 11533 17413 3574 18835 14451 14550 11533 389 3574 14444 20444 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24 1
18416 24908 0 5233 22185 12211 29183 18956 30781 9668 8904 15168 18416 16108 29183 18416 29123 4351 28845 11709 11731 30486 21200 3574 4351 32986 8052 13757 11711 16497 25138 18448 3006 30326 20837 6356 16060 11231 13757 18448 11731 29173 3576 18835 27924 11711 11533 11225 3574 17386 15934 7288 0 26216 12211 1542 3574 24908 12511 18416 16060 11231 32842 18448 11731 29173 3574 18956 9668 31387 755 32986 18416 28972 18855 30781 18448 3006 30326 20837 30781 8052 13757 15048 18448 11731 29173 12211 3574 19640 18584 18416 32986 25710 18416 2276 29173 12211 22052 24908 100 0
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import warnings
import os
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddlerec.core.utils import envs
from paddlerec.core.trainers.framework.startup import StartupBase
from paddlerec.core.trainer import EngineMode
__all__ = ["Startup"]
class Startup(StartupBase):
"""R
"""
def __init__(self, context):
self.op_name_scope = "op_namescope"
self.clip_op_name_scope = "@CLIP"
self.op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName(
)
print("Running FineTuningStartup.")
def _is_opt_role_op(self, op):
# NOTE: depend on oprole to find out whether this op is for
# optimize
op_maker = core.op_proto_and_checker_maker
optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
if op_maker.kOpRoleAttrName() in op.attr_names and \
int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
return True
return False
def _get_params_grads(self, program):
"""
Get optimizer operators, parameters and gradients from origin_program
Returns:
opt_ops (list): optimize operators.
params_grads (dict): parameter->gradient.
"""
block = program.global_block()
params_grads = []
# tmp set to dedup
optimize_params = set()
origin_var_dict = program.global_block().vars
for op in block.ops:
if self._is_opt_role_op(op):
# Todo(chengmo): Whether clip related op belongs to Optimize guard should be discussed
# delete clip op from opt_ops when run in Parameter Server mode
if self.op_name_scope in op.all_attrs(
) and self.clip_op_name_scope in op.attr(self.op_name_scope):
op._set_attr(
"op_role",
int(core.op_proto_and_checker_maker.OpRole.Backward))
continue
if op.attr(self.op_role_var_attr_name):
param_name = op.attr(self.op_role_var_attr_name)[0]
grad_name = op.attr(self.op_role_var_attr_name)[1]
if not param_name in optimize_params:
optimize_params.add(param_name)
params_grads.append([
origin_var_dict[param_name],
origin_var_dict[grad_name]
])
return params_grads
@staticmethod
def is_persistable(var):
"""
Check whether the given variable is persistable.
Args:
var(Variable): The variable to be checked.
Returns:
bool: True if the given `var` is persistable
False if not.
Examples:
.. code-block:: python
import paddle.fluid as fluid
param = fluid.default_main_program().global_block().var('fc.b')
res = fluid.io.is_persistable(param)
"""
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.READER:
return False
return var.persistable
def load(self, context, is_fleet=False, main_program=None):
dirname = envs.get_global_env("runner." + context["runner_name"] +
".init_pretraining_model_path", "")
hotstart_dirname = envs.get_global_env(
"runner." + context["runner_name"] + ".init_model_path", "")
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
if os.path.exists(os.path.join(dirname, var.name)):
print("INIT %s" % var.name)
return True
else:
#print("SKIP %s" % var.name)
return False
if hotstart_dirname != "":
#If init_model_path exists, hot start is first choice
print("going to load ", hotstart_dirname)
fluid.io.load_persistables(
context["exe"], hotstart_dirname, main_program=main_program)
print("load from {} success".format(hotstart_dirname))
elif dirname != "":
#If init_pretraining_model_path exists ,pretrained model load parameters
print("going to load ", dirname)
fluid.io.load_vars(
context["exe"],
dirname,
main_program=main_program,
predicate=existed_params)
print("load from {} success".format(dirname))
else:
#If both of the above are empty, cold start model
return
def startup(self, context):
for model_dict in context["phases"]:
with fluid.scope_guard(context["model"][model_dict["name"]][
"scope"]):
train_prog = context["model"][model_dict["name"]][
"main_program"]
startup_prog = context["model"][model_dict["name"]][
"startup_program"]
with fluid.program_guard(train_prog, startup_prog):
context["exe"].run(startup_prog)
self.load(context, main_program=train_prog)
context["status"] = "train_pass"
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import ModelBase
from basemodel import embedding
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
self.dict_size = 2000001
self.max_len = 100
self.cnn_dim = 128
self.cnn_filter_size1 = 1
self.cnn_filter_size2 = 2
self.cnn_filter_size3 = 3
self.emb_dim = 128
self.hid_dim = 96
self.class_dim = 2
self.is_sparse = True
def input_data(self, is_infer=False, **kwargs):
data = fluid.data(
name="input", shape=[None, self.max_len, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
return [data, seq_len, label]
def net(self, input, is_infer=False):
""" network definition """
self.data = input[0]
self.seq_len = input[1]
self.label = input[2]
# embedding layer
emb = embedding(self.data, self.dict_size, self.emb_dim,
self.is_sparse)
emb = fluid.layers.sequence_unpad(emb, length=self.seq_len)
# convolution layer
conv1 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=self.cnn_dim,
filter_size=self.cnn_filter_size1,
act="tanh",
pool_type="max")
conv2 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=self.cnn_dim,
filter_size=self.cnn_filter_size2,
act="tanh",
pool_type="max")
conv3 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=self.cnn_dim,
filter_size=self.cnn_filter_size3,
act="tanh",
pool_type="max")
convs_out = fluid.layers.concat(input=[conv1, conv2, conv3], axis=1)
# full connect layer
fc_1 = fluid.layers.fc(input=convs_out, size=self.hid_dim, act="tanh")
# softmax layer
prediction = fluid.layers.fc(input=[fc_1],
size=self.class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=self.label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=self.label)
self._cost = avg_cost
if is_infer:
self._infer_results["acc"] = acc
self._infer_results["loss"] = avg_cost
else:
self._metrics["acc"] = acc
self._metrics["loss"] = avg_cost
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddlerec.core.reader import ReaderBase
class Reader(ReaderBase):
def init(self):
pass
def _process_line(self, l):
l = l.strip().split()
data = l[0:100]
seq_len = l[100:101]
label = l[101:]
return data, label, seq_len
def generate_sample(self, line):
def data_iter():
data, label, seq_len = self._process_line(line)
if data is None:
yield None
return
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
yield [('data', data), ('seq_len', seq_len), ('label', label)]
return data_iter
# Fine-tuning the textcnn model with a text classification model as the pretrained model
Below is a brief directory structure and description for this example:
```
├── data #sample data
├── train
├── train.txt #training data sample
├── test
├── test.txt #test data sample
├── preprocess.py #data preprocessing script
├── __init__.py
├── README.md #documentation
├── model.py #model file
├── basemodel.py #pretrained model
├── config.yaml #configuration file
├── reader.py #reader script
├── finetune_startup.py #parameter loading
```
Note: before reading this example, we recommend that you first get familiar with the following:
[PaddleRec getting-started tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## Contents
- [Model introduction](#model-introduction)
- [Data preparation](#data-preparation)
- [Runtime environment](#runtime-environment)
- [Quick start](#quick-start)
- [Reproducing the results](#reproducing-the-results)
- [Advanced usage](#advanced-usage)
- [FAQ](#faq)
## Model introduction
Sentiment classification (Senta) automatically determines the sentiment polarity of Chinese text containing subjective descriptions and gives a corresponding confidence score; sentiment is classified as positive or negative. In this example we provide, as the pretrained model, a textCNN model (a CNN with 2 convolution kernels) trained for multi-class classification on a large-scale article dataset. We use this pretrained model to fine-tune the textcnn model (a CNN with 3 convolution kernels) under the contentunderstanding directory: the embedding layer of the pretrained model is transferred into that textcnn model, which still performs the binary sentiment classification task. The result is essentially the same accuracy with noticeably faster convergence.
Yoon Kim proposed TextCNN and its basic structure in [EMNLP 2014][Convolutional neural networks for sentence classification](https://www.aclweb.org/anthology/D14-1181.pdf). It applies convolutional neural networks to text classification, using kernels of several different sizes to extract key information from a sentence (similar to n-grams with multiple window sizes) and thus better capture local correlations. The main structure of the model is shown below:
<p align="center">
<img align="center" src="../../../doc/imgs/cnn-ckim2014.png">
<p>
## Data preparation
Sentiment classification (Senta) automatically determines the sentiment polarity of Chinese text containing subjective descriptions and gives a corresponding confidence score. Sentiment is classified as positive or negative. Sentiment analysis helps businesses understand user consumption habits, analyze trending topics, and monitor public-opinion crises, providing useful decision support.
Sentiment is a high-level human cognitive behavior, and identifying the sentiment of text requires deep semantic modeling. In addition, sentiment is expressed differently across domains (e.g., dining, sports), so model training requires large-scale data covering many domains. We address both problems with deep-learning-based semantic models and large-scale data mining. For evaluation, like the textcnn model under the contentunderstanding directory, we use the open-source sentiment classification dataset ChnSentiCorp.
You can run the following commands to obtain our pretrained model (basemodel.py, pretrain_model_params) and the corresponding vocabulary (word_dict.txt):
```
wget https://paddlerec.bj.bcebos.com/textcnn_pretrain%2Fpretrain_model.tar.gz
tar -zxvf textcnn_pretrain%2Fpretrain_model.tar.gz
```
You can run the following command to download our pre-tokenized dataset. After extraction, the senta_data directory contains the training data (train.tsv), development data (dev.tsv), test data (test.tsv), and the corresponding vocabulary (word_dict.txt):
```
wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz
tar -zxvf sentiment_classification-dataset-1.0.0.tar.gz
```
Each record is a Chinese review sentence plus a label carrying the sentiment, separated by a tab (\t). The Chinese sentence is already tokenized, with words separated by spaces.
```
15.4寸 笔记本 的 键盘 确实 爽 , 基本 跟 台式机 差不多 了 , 蛮 喜欢 数字 小 键盘 , 输 数字 特 方便 , 样子 也 很 美观 , 做工 也 相当 不错 1
跟 心灵 鸡汤 没 什么 本质 区别 嘛 , 至少 我 不 喜欢 这样 读 经典 , 把 经典 都 解读 成 这样 有点 去 中国 化 的 味道 了 0
```
## Runtime environment
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## Quick start
This example needs the pretrained parameter files and the finetuning dataset before the effect of finetuning can be seen, so a one-click quick run is not provided. To try finetuning, follow the steps in the "Reproducing the results" section below.
## Reproducing the results
In this section we hope users can understand how to use a pretrained model to fine-tune their own model.
1. Make sure your current directory is PaddleRec/models/contentunderstanding/textcnn_pretrain
2. Download and extract the dataset with the commands below. After extraction you will see the senta_data directory.
```
wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz
tar -zxvf sentiment_classification-dataset-1.0.0.tar.gz
```
3. Download and extract the pretrained model with the commands below.
```
wget https://paddlerec.bj.bcebos.com/textcnn_pretrain%2Fpretrain_model.tar.gz
tar -zxvf textcnn_pretrain%2Fpretrain_model.tar.gz
```
4. A script is provided to quickly convert the Chinese text in the dataset into a trainable format. After downloading the pretrained model, copy word_dict.txt into the senta_data directory; after extracting the dataset, copy preprocess.py into the senta_data directory.
Running preprocess.py converts the dev.tsv, test.tsv, and train.tsv files provided in the dataset into directly trainable txt files according to the mappings in the vocabulary. The commands are as follows:
```
rm -f senta_data/word_dict.txt
cp pretrain_model/word_dict.txt senta_data
cp data/preprocess.py senta_data/
cd senta_data
python3 preprocess.py
mkdir train
mv train.txt train
mkdir test
mv test.txt test
cd ..
```
5. Open config.yaml and modify its parameters.
Change workspace to your current absolute path (you can get it with the pwd command).
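For example (an illustrative absolute path; substitute the output of `pwd` on your machine):

```
workspace: "/home/your_name/PaddleRec/models/contentunderstanding/textcnn_pretrain"
```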
6. Run the command to start training:
```
python -m paddlerec.run -m ./config.yaml
```
7. Expected output:
```
PaddleRec: Runner infer_runner Begin
Executor Mode: infer
processor_register begin
Running SingleInstance.
Running SingleNetwork.
Running SingleInferStartup.
Running SingleInferRunner.
load persistables from increment/3
batch: 1, acc: [0.8828125], loss: [0.35940486]
batch: 2, acc: [0.91796875], loss: [0.24300358]
batch: 3, acc: [0.91015625], loss: [0.2490797]
Infer phase_infer of epoch increment/3 done, use time: 0.78388094902, global metrics: acc=[0.91015625], loss=[0.2490797]
PaddleRec Finish
```
## Advanced usage
After looking at model.py and config.yaml, you will notice some changes compared with the earlier models. This section explains these changes in detail so that you can understand them and apply them flexibly in your own programs.
1. In model.py, the embedding layer is built by directly calling the embedding layer defined in basemodel.py.
This is because we reuse the embedding layer of the pretrained model (basemodel.py): after training on a large corpus, the embedding layer already carries a large amount of prior knowledge, which is very helpful for downstream tasks, especially on small datasets.
2. In config.yaml, the train_runner has two additional parameters: startup_class_path and init_pretraining_model_path.
startup_class_path customizes the training startup flow; in the custom finetune_startup.py we load the pretrained parameters into the model.
init_pretraining_model_path specifies the path from which parameters are loaded. If a parameter file under that path has the same name as a variable in the model, it is loaded into the model.
If you set init_model_path, the program first tries to warm-start from that path. When init_model_path is not set and a warm start is impossible, the program tries to load the parameters under init_pretraining_model_path and runs finetune training.
Only when both are empty does the model cold-start and train from scratch.
If you want to learn more about customizing the flow, see: [How to add a custom stage](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/trainer_develop.md#%E5%A6%82%E4%BD%95%E6%B7%BB%E5%8A%A0%E8%87%AA%E5%AE%9A%E4%B9%89%E6%B5%81%E7%A8%8B)
3. basemodel.py provides three building blocks, embedding, multi_convs, and full_connect, which you can import and use directly when needed.
The corresponding parameters can be found under pretrain_model/pretrain_model_params in the pretrained-model download provided in this document.
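A minimal sketch of reusing these building blocks in your own network definition (the input shapes and tensor names here are illustrative assumptions; the three functions are the ones defined in basemodel.py above):

```
import paddle.fluid as fluid

# reusable blocks shipped with the pretrained model
from basemodel import embedding, multi_convs, full_connect

# illustrative inputs: token ids padded to a fixed length, plus the true lengths
text = fluid.data(name="text", shape=[None, 100, 1], dtype="int64")
seq_len = fluid.data(name="seq_len", shape=[None], dtype="int64")

# pretrained embedding -> two-kernel conv pooling -> fully connected feature
emb = embedding(text, dict_size=2000001, emb_dim=128, is_sparse=True)
feat = multi_convs(emb, seq_len, cnn_hid_dim=128, cnn_win_size=3, cnn_win_size2=5)
hidden = full_connect(feat, hid_dim1=96)
```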
## FAQ
# PaddleRec end-to-end example on the Movielens dataset
## For a detailed tutorial of this model, see: [Ten minutes! End to end! Build a recommender system from scratch](https://aistudio.baidu.com/aistudio/projectdetail/559336)
## Running locally
You need to install `PaddleRec` and `PaddlePaddle` locally; we recommend running this demo on `Linux` with `python2.7`.
The local workflow is basically the same as the AiStudio workflow, with minor differences in the details.
### Offline training
```shell
sh train.sh
```
### Offline testing
```shell
sh offline_test.sh
```
### Simulated online recall
```shell
sh online_recall.sh
```
### Simulated online ranking
```shell
sh online_rank.sh
```
cd data
echo "---> Download movielens 1M data ..."
wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
echo "---> Unzip ml-1m.zip ..."
unzip ml-1m.zip
rm ml-1m.zip
echo "---> Split movielens data ..."
python split.py
mkdir train/
mkdir test/
mkdir -p train/
mkdir -p test/
echo "---> Process train & test data ..."
python process_ml_1m.py process_raw ./ml-1m/train.dat | sort -t $'\t' -k 9 -n > log.data.train
python process_ml_1m.py process_raw ./ml-1m/test.dat | sort -t $'\t' -k 9 -n > log.data.test
python process_ml_1m.py hash log.data.train > ./train/data.txt
......@@ -15,4 +20,6 @@ python process_ml_1m.py hash log.data.test > ./test/data.txt
rm log.data.train
rm log.data.test
cd ../
cd ..
echo "---> Finish data process"
## modify config.yaml to infer mode at first
cd recall
python -m paddlerec.run -m ./config.yaml
cd ../rank
python -m paddlerec.run -m ./config.yaml
cd ..
echo "Recall offline test ..."
echo "Model config at models/demo/movie_recommand/recall/config_offline_test.yaml"
python -m paddlerec.run -m ./recall/config_test_offline.yaml
echo "Rank offline test ..."
echo "Model config at models/demo/movie_recommand/rank/config_offline_test.yaml"
python -m paddlerec.run -m ./rank/config_test_offline.yaml
echo "recall offline test result:"
python parse.py recall_offline recall/infer_result
echo "rank offline test result:"
python parse.py rank_offline rank/infer_result
cd data
echo "Create online test data ..."
python process_ml_1m.py data_rank > online_user/test/data.txt
## modify recall/config.yaml to online_infer mode
cd ../rank
python -m paddlerec.run -m ./config.yaml
cd ../
python parse.py rank_online rank/infer_result
cd ..
echo "Rank online test ..."
echo "Model config at models/demo/movie_recommand/rank/config_online_test.yaml"
python -m paddlerec.run -m ./rank/config_test_online.yaml
python parse.py rank_online ./rank/infer_result
cd data
echo "Create online test data ..."
mkdir online_user/test
python process_ml_1m.py data_recall > online_user/test/data.txt
## modify recall/config.yaml to online_infer mode
cd ../recall
python -m paddlerec.run -m ./config.yaml
cd ../
cd ..
echo "Recall online test ..."
echo "Model config at models/demo/movie_recommand/recall/config_online_test.yaml"
python -m paddlerec.run -m ./recall/config_test_online.yaml
python parse.py recall_online recall/infer_result
......@@ -12,28 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "models/demo/movie_recommand"
workspace: "./"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 128
type: QueueDataset
type: DataLoader
data_path: "{workspace}/data/train"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
- name: dataset_infer # name
batch_size: 128
type: DataLoader
data_path: "{workspace}/data/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
- name: dataset_online_infer # name
batch_size: 10
type: DataLoader
data_path: "{workspace}/data/online_user/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
# hyper parameters of user-defined network
hyper_parameters:
......@@ -51,42 +39,17 @@ hyper_parameters:
# train
mode: runner_train
## online or offline infer
#mode: runner_infer
runner:
- name: runner_train
class: train
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_checkpoint_path: "increment_rank" # save checkpoint path
epochs: 10
device: cpu
- name: runner_infer
class: infer
print_interval: 10000
init_model_path: "increment/9" # load model path
#train
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
model: "{workspace}/rank/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 12
##offline infer
#phase:
#- name: phase1
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# save_path: "./infer_result"
# thread_num: 1
##offline infer
#phase:
#- name: phase1
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_online_infer # select dataset by name
# save_path: "./infer_result"
# thread_num: 1
thread_num: 4
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#workspace: "paddlerec.models.demo.movie_recommand"
workspace: "./"
# list of dataset
dataset:
- name: dataset_infer # name
batch_size: 128
type: DataLoader
data_path: "{workspace}/data/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adam
learning_rate: 0.001
strategy: async
# user-defined <key, value> pairs
sparse_feature_number: 60000000
sparse_feature_dim: 9
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
# train
mode: runner_infer
## online or offline infer
#mode: runner_infer
runner:
- name: runner_infer
epochs: 1
device: cpu
class: infer
print_interval: 10000
runner_result_dump_path: "{workspace}/rank/infer_result"
init_model_path: "increment_rank/9" # load model path
#offline infer
phase:
- name: phase1
model: "{workspace}/rank/model.py" # user-defined model
dataset_name: dataset_infer # select dataset by name
thread_num: 1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "./"
# list of dataset
dataset:
- name: dataset_online_infer # name
batch_size: 10
type: DataLoader
data_path: "{workspace}/data/online_user/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adam
learning_rate: 0.001
strategy: async
# user-defined <key, value> pairs
sparse_feature_number: 60000000
sparse_feature_dim: 9
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
# train
mode: runner_infer
runner:
- name: runner_infer
epochs: 1
device: cpu
class: infer
print_interval: 10000
runner_result_dump_path: "{workspace}/rank/infer_result"
init_model_path: "increment_rank/9" # load model path
#offline infer
phase:
- name: phase1
model: "{workspace}/rank/model.py" # user-defined model
dataset_name: dataset_online_infer # select dataset by name
thread_num: 1
......@@ -12,28 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "models/demo/movie_recommand"
workspace: "./"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 128
type: QueueDataset
type: DataLoader
data_path: "{workspace}/data/train"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
- name: dataset_infer # name
batch_size: 128
type: DataLoader
data_path: "{workspace}/data/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
- name: dataset_online_infer # name
batch_size: 128
type: DataLoader
data_path: "{workspace}/data/online_user/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
# hyper parameters of user-defined network
hyper_parameters:
......@@ -50,43 +38,17 @@ hyper_parameters:
# train
mode: runner_train
## online or offline infer
#mode: runner_infer
runner:
- name: runner_train
class: train
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_checkpoint_path: "increment_recall" # save checkpoint path
epochs: 10
device: cpu
- name: runner_infer
class: infer
print_interval: 10000
init_model_path: "increment/9" # load model path
#train
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
model: "{workspace}/recall/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 12
##offline infer
#phase:
#- name: phase1
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# save_path: "./infer_result"
# thread_num: 1
##offline infer
#phase:
#- name: phase1
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_online_infer # select dataset by name
# save_path: "./infer_result"
# thread_num: 1
thread_num: 4
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#workspace: "paddlerec.models.demo.movie_recommand"
workspace: "./"
# list of dataset
dataset:
- name: dataset_infer # name
batch_size: 128
type: DataLoader
data_path: "{workspace}/data/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adam
learning_rate: 0.001
strategy: async
# user-defined <key, value> pairs
sparse_feature_number: 60000000
sparse_feature_dim: 9
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
# train
mode: runner_infer
runner:
- name: runner_infer
epochs: 1
device: cpu
class: infer
print_interval: 100000
runner_result_dump_path: "{workspace}/recall/infer_result"
init_model_path: "increment_recall/9" # load model path
#offline infer
phase:
- name: phase1
model: "{workspace}/recall/model.py" # user-defined model
dataset_name: dataset_infer
thread_num: 1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#workspace: "paddlerec.models.demo.movie_recommand"
workspace: ./
# list of dataset
dataset:
- name: dataset_online_infer # name
batch_size: 128
type: DataLoader
data_path: "{workspace}/data/online_user/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adam
learning_rate: 0.001
strategy: async
# user-defined <key, value> pairs
sparse_feature_number: 60000000
sparse_feature_dim: 9
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
# train
mode: runner_infer
## online or offline infer
#mode: runner_infer
runner:
- name: runner_infer
epochs: 1
device: cpu
class: infer
print_interval: 10000
runner_result_dump_path: "{workspace}/recall/infer_result"
init_model_path: "increment_recall/9" # load model path
#offline infer
phase:
- name: phase1
model: "{workspace}/recall/model.py" # user-defined model
dataset_name: dataset_online_infer # select dataset by name
thread_num: 1
cd recall
python -m paddlerec.run -m ./config.yaml &> log &
cd ../rank
python -m paddlerec.run -m ./config.yaml &> log &
cd ..
echo "Recall offline training ..."
echo "Model config at models/demo/movie_recommand/recall/config.yaml"
python -m paddlerec.run -m ./recall/config.yaml
echo "----------------------------------------"
echo "Rank offline training ..."
echo "Model config at models/demo/movie_recommand/rank/config.yaml"
python -m paddlerec.run -m ./rank/config.yaml
......@@ -50,11 +50,6 @@ ESMM是发表在 SIGIR’2018 的论文[《Entire Space Multi-Task Model: An E
Dataset: [Ali-CCP: Alibaba Click and Conversion Prediction](https://tianchi.aliyun.com/datalab/dataSet.html?dataId=408)
```
cd data
sh run.sh
```
For the data format, see the demo data: data/train
......@@ -108,11 +103,25 @@ CPU环境
## Reproducing the paper
To reproduce the paper's results on the full original dataset, set batch_size=1000, thread_num=8, epoch_num=4 in config.yaml
Because the original dataset is very large, we selected a subset of it as training and test data; GPU training is recommended.
In our tests the CTR AUC reaches 0.79+ and the CTCVR AUC reaches 0.82+.
```
wget https://paddlerec.bj.bcebos.com/esmm/traindata_10w.csv
wget https://paddlerec.bj.bcebos.com/esmm/testdata_10w.csv
mkdir data/train_data data/test_data
mv traindata_10w.csv data/train_data
mv testdata_10w.csv data/test_data
```
How to run after the changes: set 'workspace' in config.yaml to the directory containing config.yaml, then run
To reproduce the paper's results on the full original dataset, set batch_size=1024, epoch=10, device=gpu, selected_gpus: "0" in config.yaml
The exact configuration can be downloaded as the config_10w.yaml file
```
wget https://paddlerec.bj.bcebos.com/esmm/config_10w.yaml
```
Run after the changes
```
python -m paddlerec.run -m /home/your/dir/config.yaml # debug mode: point directly to the absolute path of your local config
```
......
......@@ -17,19 +17,19 @@ workspace: "models/multitask/esmm"
dataset:
- name: dataset_train
batch_size: 1
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/esmm_reader.py"
- name: dataset_infer
batch_size: 1
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/test"
data_converter: "{workspace}/esmm_reader.py"
hyper_parameters:
vocab_size: 10000
embed_size: 128
vocab_size: 737946
embed_size: 12
optimizer:
class: adam
learning_rate: 0.001
......@@ -43,15 +43,15 @@ runner:
class: train
device: cpu
epochs: 3
save_checkpoint_interval: 2
save_checkpoint_interval: 1
save_inference_interval: 4
save_checkpoint_path: "increment"
save_checkpoint_path: "increment_esmm"
save_inference_path: "inference"
print_interval: 10
phases: [train]
- name: infer_runner
class: infer
init_model_path: "increment/1"
init_model_path: "increment_esmm/1"
device: cpu
print_interval: 1
phases: [infer]
......
......@@ -17,12 +17,12 @@ workspace: "models/multitask/mmoe"
dataset:
- name: dataset_train
batch_size: 5
type: QueueDataset
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
- name: dataset_infer
batch_size: 5
type: QueueDataset
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
......@@ -37,7 +37,6 @@ hyper_parameters:
learning_rate: 0.001
strategy: async
#use infer_runner mode and modify 'phase' below if infer
mode: [train_runner, infer_runner]
runner:
......@@ -49,10 +48,10 @@ runner:
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
print_interval: 10
print_interval: 1
- name: infer_runner
class: infer
init_model_path: "increment/0"
init_model_path: "increment/1"
device: cpu
phase:
......
......@@ -259,3 +259,133 @@ auc_var, batch_auc_var, auc_states = fluid.layers.auc(
```
After completing the network described above, training ultimately gives us two key metrics: `avg_cost` and `auc`.
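As a rough illustration only (the names `feature`, `label`, and `predict` below are assumptions standing in for the model's real data slots, not the actual network), these two metrics are typically produced along the following lines with the Fluid API:

```python
import paddle.fluid as fluid

# Hypothetical inputs standing in for the model's real data slots.
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
feature = fluid.data(name="feature", shape=[None, 13], dtype="float32")

# Two-way softmax output so that fluid.layers.auc can consume the prediction.
predict = fluid.layers.fc(input=feature, size=2, act="softmax")

# avg_cost is the loss the optimizer minimizes.
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.reduce_mean(cost)

# auc is the evaluation metric reported every print_interval batches.
auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, label=label)
```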
## Launching and configuring a streaming training (OnlineLearning) task
### What streaming training is
Streaming training receives and processes data in sequence: each incoming sample is first predicted on, then used to update the current model, before the next sample is handled. In scenarios such as news feeds, short videos, and e-commerce, large amounts of new data arrive every day, and each day's (each moment's) new data should be predicted on and trained against the model from the previous day (the previous moment).
Large-scale streaming training requires the deep learning framework to provide the corresponding capabilities, namely:
* Large-scale distributed training: data volumes are huge, so solid distributed training and scaling abilities are needed to meet the timeliness requirements of training
* Very large embeddings: support for embedding tables at the scale of billions or even hundreds of billions of entries, with a practical way to export parameters quickly and hand them off to other online systems
* Hash-mapped embedding feature IDs: no pre-encoding of IDs is required; the table can grow automatically, control feature admission (features not seen before are created under appropriate conditions), and periodically evict stale features according to configurable admission and eviction policies
* Finally, a complete streaming-training trainer.py built on the framework, implementing the full streaming training workflow
### Training a model with ctr-dnn online learning
PaddleRec now implements this streaming training workflow on top of PaddlePaddle's distributed training framework, for reference and use. We adapted `models/rank/ctr-dnn` into an online_training version to make it easier to understand and follow.
**Note**
1. Online learning requires the latest Paddle development build. You can obtain it from https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-dev ; uninstall the currently installed PaddlePaddle first, then download the package that matches your Python environment.
2. Online learning also requires the latest development version of PaddleRec; run git clone https://github.com/PaddlePaddle/PaddleRec.git to get the latest PaddleRec and install it yourself.
### How to launch
1. Set hyper_parameters.distributed_embedding=1 in config.yaml to enable the large-scale sparse mode
2. In config.yaml, change `single_cpu_train` in mode: [single_cpu_train, single_cpu_infer] to online_learning_cluster, which selects the run mode used by online learning
3. Prepare the training data. The online learning mode used by ctr-dnn trains at day granularity, with each day further split into 24 hours, so the training data must be organized into a day/hour directory structure (a small helper sketch for laying out these directories appears after step 4 below).
Taking two days of training data, 2020-08-10 and 2020-08-11, as an example, the directory structure to prepare is as follows:
```
train_data/
|-- 20200810
| |-- 00
| | `-- train.txt
| |-- 01
| | `-- train.txt
| |-- 02
| | `-- train.txt
| |-- 03
| | `-- train.txt
| |-- 04
| | `-- train.txt
| |-- 05
| | `-- train.txt
| |-- 06
| | `-- train.txt
| |-- 07
| | `-- train.txt
| |-- 08
| | `-- train.txt
| |-- 09
| | `-- train.txt
| |-- 10
| | `-- train.txt
| |-- 11
| | `-- train.txt
| |-- 12
| | `-- train.txt
| |-- 13
| | `-- train.txt
| |-- 14
| | `-- train.txt
| |-- 15
| | `-- train.txt
| |-- 16
| | `-- train.txt
| |-- 17
| | `-- train.txt
| |-- 18
| | `-- train.txt
| |-- 19
| | `-- train.txt
| |-- 20
| | `-- train.txt
| |-- 21
| | `-- train.txt
| |-- 22
| | `-- train.txt
| `-- 23
| `-- train.txt
`-- 20200811
|-- 00
| `-- train.txt
|-- 01
| `-- train.txt
|-- 02
| `-- train.txt
|-- 03
| `-- train.txt
|-- 04
| `-- train.txt
|-- 05
| `-- train.txt
|-- 06
| `-- train.txt
|-- 07
| `-- train.txt
|-- 08
| `-- train.txt
|-- 09
| `-- train.txt
|-- 10
| `-- train.txt
|-- 11
| `-- train.txt
|-- 12
| `-- train.txt
|-- 13
| `-- train.txt
|-- 14
| `-- train.txt
|-- 15
| `-- train.txt
|-- 16
| `-- train.txt
|-- 17
| `-- train.txt
|-- 18
| `-- train.txt
|-- 19
| `-- train.txt
|-- 20
| `-- train.txt
|-- 21
| `-- train.txt
|-- 22
| `-- train.txt
`-- 23
`-- train.txt
```
4. Once the data is prepared, streaming training can be started with the standard training workflow
```shell
python -m paddlerec.run -m models/rank/ctr-dnn/config.yaml
```
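As a convenience only, the following sketch (not part of PaddleRec; the `raw_logs` source layout is an assumption) shows one way to create the day/hour directory structure described in step 3:

```python
import os
import shutil

# Two example days, each with 24 hourly partitions, matching the tree above.
days = ["20200810", "20200811"]
hours = ["%02d" % h for h in range(24)]

for day in days:
    for hour in hours:
        target_dir = os.path.join("train_data", day, hour)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        # Assumed location of the raw hourly log; adapt to your own pipeline.
        src = os.path.join("raw_logs", day, hour + ".txt")
        shutil.copy(src, os.path.join(target_dir, "train.txt"))
```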
......@@ -49,6 +49,7 @@ hyper_parameters:
sparse_feature_dim: 9
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
distributed_embedding: 0
# select runner by name
mode: [single_cpu_train, single_cpu_infer]
......@@ -90,6 +91,18 @@ runner:
print_interval: 1
phases: [phase1]
- name: online_learning_cluster
class: cluster_train
runner_class_path: "{workspace}/online_learning_runner.py"
epochs: 2
device: cpu
fleet_mode: ps
save_checkpoint_interval: 1 # save model interval of epochs
save_checkpoint_path: "increment_dnn" # save checkpoint path
init_model_path: "" # load model path
print_interval: 1
phases: [phase1]
- name: collective_cluster
class: cluster_train
epochs: 2
......@@ -101,6 +114,23 @@ runner:
print_interval: 1
phases: [phase1]
- name: single_multi_gpu_train
class: train
# num of epochs
epochs: 1
# device to run training or infer
device: gpu
selected_gpus: "0,1" # 选择多卡执行训练
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 4 # save inference
save_step_interval: 1
save_checkpoint_path: "increment_dnn" # save checkpoint path
save_inference_path: "inference" # save inference path
save_step_path: "step_save"
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
print_interval: 1
phases: [phase1]
# runner will run all the phase in each epoch
phase:
- name: phase1
......
......@@ -25,8 +25,16 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.is_distributed = True if envs.get_fleet_mode().upper(
) == "PSLIB" else False
self.is_distributed = False
self.distributed_embedding = False
if envs.get_fleet_mode().upper() == "PSLIB":
self.is_distributed = True
if envs.get_global_env("hyper_parameters.distributed_embedding",
0) == 1:
self.distributed_embedding = True
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env(
......@@ -40,14 +48,26 @@ class Model(ModelBase):
self.label_input = self._sparse_data_var[0]
def embedding_layer(input):
emb = fluid.layers.embedding(
input=input,
is_sparse=True,
is_distributed=self.is_distributed,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()), )
if self.distributed_embedding:
emb = fluid.contrib.layers.sparse_embedding(
input=input,
size=[
self.sparse_feature_number, self.sparse_feature_dim
],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
else:
emb = fluid.layers.embedding(
input=input,
is_sparse=True,
is_distributed=self.is_distributed,
size=[
self.sparse_feature_number, self.sparse_feature_dim
],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
return emb_sum
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import time
import warnings
import numpy as np
import logging
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.metric import Metric
from paddlerec.core.trainers.framework.runner import RunnerBase
logging.basicConfig(
format='%(asctime)s - %(levelname)s: %(message)s', level=logging.INFO)
class OnlineLearningRunner(RunnerBase):
def __init__(self, context):
print("Running OnlineLearningRunner.")
def run(self, context):
epochs = int(
envs.get_global_env("runner." + context["runner_name"] +
".epochs"))
model_dict = context["env"]["phase"][0]
model_class = context["model"][model_dict["name"]]["model"]
metrics = model_class._metrics
dataset_list = []
dataset_index = 0
for day_index in range(len(days)):
day = days[day_index]
cur_path = "%s/%s" % (path, str(day))
filelist = fleet.split_files(hdfs_ls([cur_path]))
dataset = create_dataset(use_var, filelist)
dataset_list.append(dataset)
dataset_index += 1
dataset_index = 0
for epoch in range(len(days)):
            day = days[epoch]  # process the days in order, one pass per day
begin_time = time.time()
result = self._run(context, model_dict)
end_time = time.time()
seconds = end_time - begin_time
message = "epoch {} done, use time: {}".format(epoch, seconds)
# TODO, wait for PaddleCloudRoleMaker supports gloo
from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
if context["fleet"] is not None and isinstance(context["fleet"],
GeneralRoleMaker):
metrics_result = []
for key in metrics:
if isinstance(metrics[key], Metric):
_str = metrics[key].calc_global_metrics(
context["fleet"],
context["model"][model_dict["name"]]["scope"])
metrics_result.append(_str)
elif result is not None:
_str = "{}={}".format(key, result[key])
metrics_result.append(_str)
if len(metrics_result) > 0:
message += ", global metrics: " + ", ".join(metrics_result)
print(message)
with fluid.scope_guard(context["model"][model_dict["name"]][
"scope"]):
train_prog = context["model"][model_dict["name"]][
"main_program"]
startup_prog = context["model"][model_dict["name"]][
"startup_program"]
with fluid.program_guard(train_prog, startup_prog):
self.save(epoch, context, True)
context["status"] = "terminal_pass"
......@@ -102,9 +102,9 @@ phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataloader_train # select dataset by name
thread_num: 8
thread_num: 1
- name: phase2
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_infer # select dataset by name
thread_num: 8
thread_num: 1
wget https://paddlerec.bj.bcebos.com/utils/tree_build_utils.tar.gz --no-check-certificate
# input_path: path to the embedding file
# emb_shape: dimension of the value in each embedding key-value pair
# required emb format: embedding_id(int64),embedding(float),embedding(float),......,embedding(float)
# cluster_threads: number of threads used for the clustering step of tree building
python_172_anytree/bin/python -u main.py --input_path=./gen_emb/item_emb.txt --output_path=./ --emb_shape=24 --cluster_threads=4
The tree-building flow is: 1. read the embeddings -> 2. k-means clustering -> 3. organize the clustering result into a tree -> 4. derive from the tree structure the four files the model needs
1 Layer_list: records which nodes sit on each level. Used for training
2 Travel_list: records the travel path of each leaf node. Used for training
3 Tree_Info: records the information of each node, mainly: whether it is an item / its item_id, its level, its parent node, and its children. Used for retrieval
4 Tree_Embedding: records the embedding of every node. Used for both training and retrieval
Note whether the items fed in as training data are the original item ids used before tree building, tree-based node ids, or leaf-based leaf ids; in tdm_reader.py you can load a dictionary to perform the mapping.
The output folder produced by the in-house tree-building tool contains a mapping file named id2nodeid.txt, whose format is [hash value] + [tree node ID] + [leaf node ID (the index of the leaf node, which is the input required by the tdm_sampler op)]
Another file, id2bidword.txt, holds a further mapping in the format [hash value] + [original item ID]; it stores entries only for leaf nodes.
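For illustration, a minimal sketch of how such a mapping could be loaded and applied inside a reader; the column order of both files follows the description above and is an assumption:

```python
def load_item_to_leaf_map(id2nodeid_path, id2bidword_path):
    """Build a dict from original item id to leaf id via the shared hash key.

    Assumes each line of id2nodeid.txt is "<hash> <tree_node_id> <leaf_id>"
    and each line of id2bidword.txt is "<hash> <original_item_id>".
    """
    hash_to_leaf = {}
    with open(id2nodeid_path) as f:
        for line in f:
            hash_key, _node_id, leaf_id = line.strip().split()
            hash_to_leaf[hash_key] = int(leaf_id)

    item_to_leaf = {}
    with open(id2bidword_path) as f:
        for line in f:
            hash_key, item_id = line.strip().split()
            if hash_key in hash_to_leaf:
                item_to_leaf[int(item_id)] = hash_to_leaf[hash_key]
    return item_to_leaf


# Example usage inside a reader: map a raw item id to the leaf id
# expected by the tdm_sampler op.
# item_to_leaf = load_item_to_leaf_map("id2nodeid.txt", "id2bidword.txt")
# leaf_id = item_to_leaf[raw_item_id]
```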
......@@ -49,7 +49,7 @@ function model_test() {
root_dir=`pwd`
all_model=$(find ${root_dir} -name config.yaml)
special_models=("demo" "pnn" "fgcnn" "gru4rec" "tagspace")
special_models=("demo" "pnn" "fgcnn" "gru4rec" "tagspace" "textcnn_pretrain")
for model in ${all_model}
do
......