未验证 提交 752f075e 编写于 作者: 1 123malin 提交者: GitHub

Merge branch 'master' into readme

......@@ -16,15 +16,20 @@ before_install:
# For pylint dockstring checker
- sudo apt-get update
- sudo apt-get install -y python-pip libpython-dev
- sudo apt-get remove python-urllib3
- sudo apt-get purge python-urllib3
- sudo rm /usr/lib/python2.7/dist-packages/chardet-*
- sudo pip install -U pip
- sudo pip install --upgrade setuptools
- sudo pip install six --upgrade --ignore-installed six
- sudo pip install pillow
- sudo pip install PyYAML
- sudo pip install pylint pytest astroid isort pre-commit
- sudo pip install kiwisolver
- sudo pip install paddlepaddle==1.7.2 --ignore-installed urllib3
- sudo pip uninstall -y rarfile
- sudo pip install scikit-build
- sudo pip install Pillow==5.3.0
- sudo pip install opencv-python==3.4.3.18
- sudo pip install rarfile==3.0
- sudo pip install paddlepaddle==1.7.2
- sudo python setup.py install
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
......
......@@ -124,7 +124,10 @@
```bash
# 使用CPU进行单机训练
python -m paddlerec.run -m paddlerec.models.rank.dnn
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
python -m paddlerec.run -m models/rank/dnn/config.yaml
```
......@@ -144,6 +147,7 @@ python -m paddlerec.run -m paddlerec.models.rank.dnn
* [启动分布式训练](doc/distributed_train.md)
* [启动预测](doc/predict.md)
* [快速部署](doc/serving.md)
* [预训练模型](doc/pre_train_model.md)
### 进阶教程
......
......@@ -119,7 +119,10 @@ We take the `dnn` algorithm as an example to get start of `PaddleRec`, and we ta
```bash
# Training with cpu
python -m paddlerec.run -m paddlerec.models.rank.dnn
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
python -m paddlerec.run -m models/rank/dnn/config.yaml
```
......
......@@ -19,6 +19,7 @@ afs_local_mount_point="/root/paddlejob/workspace/env_run/afs/"
# 新k8s afs挂载帮助文档: http://wiki.baidu.com/pages/viewpage.action?pageId=906443193
PADDLE_PADDLEREC_ROLE=WORKER
PADDLEREC_CLUSTER_TYPE=K8S
use_python3=<$ USE_PYTHON3 $>
CPU_NUM=<$ CPU_NUM $>
GLOG_v=0
......
......@@ -17,6 +17,7 @@ output_path=<$ OUTPUT_PATH $>
thirdparty_path=<$ THIRDPARTY_PATH $>
PADDLE_PADDLEREC_ROLE=WORKER
PADDLEREC_CLUSTER_TYPE=MPI
use_python3=<$ USE_PYTHON3 $>
CPU_NUM=<$ CPU_NUM $>
GLOG_v=0
......
......@@ -22,6 +22,19 @@ trainers = {}
def trainer_registry():
trainers["SingleTrainer"] = os.path.join(trainer_abs, "single_trainer.py")
trainers["ClusterTrainer"] = os.path.join(trainer_abs,
"cluster_trainer.py")
trainers["CtrCodingTrainer"] = os.path.join(trainer_abs,
"ctr_coding_trainer.py")
trainers["CtrModulTrainer"] = os.path.join(trainer_abs,
"ctr_modul_trainer.py")
trainers["TDMSingleTrainer"] = os.path.join(trainer_abs,
"tdm_single_trainer.py")
trainers["TDMClusterTrainer"] = os.path.join(trainer_abs,
"tdm_cluster_trainer.py")
trainers["OnlineLearningTrainer"] = os.path.join(
trainer_abs, "online_learning_trainer.py")
# Definition of procedure execution process
trainers["CtrCodingTrainer"] = os.path.join(trainer_abs,
"ctr_coding_trainer.py")
......
......@@ -107,6 +107,7 @@ class Trainer(object):
self.device = Device.GPU
gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
self._place = fluid.CUDAPlace(gpu_id)
print("PaddleRec run on device GPU: {}".format(gpu_id))
self._exe = fluid.Executor(self._place)
elif device == "CPU":
self.device = Device.CPU
......@@ -146,6 +147,7 @@ class Trainer(object):
elif engine.upper() == "CLUSTER":
self.engine = EngineMode.CLUSTER
self.is_fleet = True
self.which_cluster_type()
else:
raise ValueError("Not Support Engine {}".format(engine))
self._context["is_fleet"] = self.is_fleet
......@@ -165,6 +167,14 @@ class Trainer(object):
self._context["is_pslib"] = (fleet_mode.upper() == "PSLIB")
self._context["fleet_mode"] = fleet_mode
def which_cluster_type(self):
cluster_type = os.getenv("PADDLEREC_CLUSTER_TYPE", "MPI")
print("PADDLEREC_CLUSTER_TYPE: {}".format(cluster_type))
if cluster_type and cluster_type.upper() == "K8S":
self._context["cluster_type"] = "K8S"
else:
self._context["cluster_type"] = "MPI"
def which_executor_mode(self):
executor_mode = envs.get_runtime_environ("train.trainer.executor_mode")
if executor_mode.upper() not in ["TRAIN", "INFER"]:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
General Trainer, applicable to many situations: Single/Cluster/Local_Cluster + PS/COLLECTIVE
"""
from __future__ import print_function
import os
from paddlerec.core.utils import envs
from paddlerec.core.trainer import Trainer, EngineMode, FleetMode
class FineTuningTrainer(Trainer):
"""
Trainer for various situations
"""
def __init__(self, config=None):
Trainer.__init__(self, config)
self.processor_register()
self.abs_dir = os.path.dirname(os.path.abspath(__file__))
self.runner_env_name = "runner." + self._context["runner_name"]
def processor_register(self):
print("processor_register begin")
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('network_pass', self.network)
self.regist_context_processor('startup_pass', self.startup)
self.regist_context_processor('train_pass', self.runner)
self.regist_context_processor('terminal_pass', self.terminal)
def instance(self, context):
instance_class_path = envs.get_global_env(
self.runner_env_name + ".instance_class_path", default_value=None)
if instance_class_path:
instance_class = envs.lazy_instance_by_fliename(
instance_class_path, "Instance")(context)
else:
if self.engine == EngineMode.SINGLE:
instance_class_name = "SingleInstance"
else:
raise ValueError(
"FineTuningTrainer can only support SingleTraining.")
instance_path = os.path.join(self.abs_dir, "framework",
"instance.py")
instance_class = envs.lazy_instance_by_fliename(
instance_path, instance_class_name)(context)
instance_class.instance(context)
def network(self, context):
network_class_path = envs.get_global_env(
self.runner_env_name + ".network_class_path", default_value=None)
if network_class_path:
network_class = envs.lazy_instance_by_fliename(network_class_path,
"Network")(context)
else:
if self.engine == EngineMode.SINGLE:
network_class_name = "FineTuningNetwork"
else:
raise ValueError(
"FineTuningTrainer can only support SingleTraining.")
network_path = os.path.join(self.abs_dir, "framework",
"network.py")
network_class = envs.lazy_instance_by_fliename(
network_path, network_class_name)(context)
network_class.build_network(context)
def startup(self, context):
startup_class_path = envs.get_global_env(
self.runner_env_name + ".startup_class_path", default_value=None)
if startup_class_path:
startup_class = envs.lazy_instance_by_fliename(startup_class_path,
"Startup")(context)
else:
if self.engine == EngineMode.SINGLE and not context["is_infer"]:
startup_class_name = "FineTuningStartup"
else:
raise ValueError(
"FineTuningTrainer can only support SingleTraining.")
startup_path = os.path.join(self.abs_dir, "framework",
"startup.py")
startup_class = envs.lazy_instance_by_fliename(
startup_path, startup_class_name)(context)
startup_class.startup(context)
def runner(self, context):
runner_class_path = envs.get_global_env(
self.runner_env_name + ".runner_class_path", default_value=None)
if runner_class_path:
runner_class = envs.lazy_instance_by_fliename(runner_class_path,
"Runner")(context)
else:
if self.engine == EngineMode.SINGLE and not context["is_infer"]:
runner_class_name = "SingleRunner"
else:
raise ValueError(
"FineTuningTrainer can only support SingleTraining.")
runner_path = os.path.join(self.abs_dir, "framework", "runner.py")
runner_class = envs.lazy_instance_by_fliename(
runner_path, runner_class_name)(context)
runner_class.run(context)
def terminal(self, context):
terminal_class_path = envs.get_global_env(
self.runner_env_name + ".terminal_class_path", default_value=None)
if terminal_class_path:
terminal_class = envs.lazy_instance_by_fliename(
terminal_class_path, "Terminal")(context)
terminal_class.terminal(context)
else:
terminal_class_name = "TerminalBase"
if self.engine != EngineMode.SINGLE and self.fleet_mode != FleetMode.COLLECTIVE:
terminal_class_name = "PSTerminal"
terminal_path = os.path.join(self.abs_dir, "framework",
"terminal.py")
terminal_class = envs.lazy_instance_by_fliename(
terminal_path, terminal_class_name)(context)
terminal_class.terminal(context)
context['is_exit'] = True
......@@ -21,7 +21,7 @@ from paddlerec.core.utils import envs
from paddlerec.core.utils import dataloader_instance
from paddlerec.core.reader import SlotReader
from paddlerec.core.trainer import EngineMode
from paddlerec.core.utils.util import split_files
from paddlerec.core.utils.util import split_files, check_filelist
__all__ = ["DatasetBase", "DataLoader", "QueueDataset"]
......@@ -121,14 +121,30 @@ class QueueDataset(DatasetBase):
dataset.set_pipe_command(pipe_cmd)
train_data_path = envs.get_global_env(name + "data_path")
file_list = [
os.path.join(train_data_path, x)
for x in os.listdir(train_data_path)
]
hidden_file_list, file_list = check_filelist(
hidden_file_list=[],
data_file_list=[],
train_data_path=train_data_path)
if (hidden_file_list is not None):
print(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}".
format(hidden_file_list))
file_list.sort()
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
need_split_files = True
elif context["engine"] == EngineMode.CLUSTER and context[
"cluster_type"] == "K8S":
# for k8s mount afs, split files for every node
need_split_files = True
if need_split_files:
file_list = split_files(file_list, context["fleet"].worker_index(),
context["fleet"].worker_num())
print("File_list: {}".format(file_list))
dataset.set_filelist(file_list)
for model_dict in context["phases"]:
if model_dict["dataset_name"] == dataset_name:
......
......@@ -23,7 +23,7 @@ from paddlerec.core.trainers.framework.dataset import DataLoader, QueueDataset
__all__ = [
"NetworkBase", "SingleNetwork", "PSNetwork", "PslibNetwork",
"CollectiveNetwork"
"CollectiveNetwork", "FineTuningNetwork"
]
......@@ -109,6 +109,88 @@ class SingleNetwork(NetworkBase):
context["status"] = "startup_pass"
class FineTuningNetwork(NetworkBase):
"""R
"""
def __init__(self, context):
print("Running FineTuningNetwork.")
def build_network(self, context):
context["model"] = {}
for model_dict in context["phases"]:
context["model"][model_dict["name"]] = {}
train_program = fluid.Program()
startup_program = fluid.Program()
scope = fluid.Scope()
dataset_name = model_dict["dataset_name"]
with fluid.program_guard(train_program, startup_program):
with fluid.unique_name.guard():
with fluid.scope_guard(scope):
model_path = envs.os_path_adapter(
envs.workspace_adapter(model_dict["model"]))
model = envs.lazy_instance_by_fliename(
model_path, "Model")(context["env"])
model._data_var = model.input_data(
dataset_name=model_dict["dataset_name"])
if envs.get_global_env("dataset." + dataset_name +
".type") == "DataLoader":
model._init_dataloader(
is_infer=context["is_infer"])
data_loader = DataLoader(context)
data_loader.get_dataloader(context, dataset_name,
model._data_loader)
model.net(model._data_var, context["is_infer"])
finetuning_varnames = envs.get_global_env(
"runner." + context["runner_name"] +
".finetuning_aspect_varnames",
default_value=[])
if len(finetuning_varnames) == 0:
raise ValueError(
"nothing need to be fine tuning, you may use other traning mode"
)
if len(finetuning_varnames) != 1:
raise ValueError(
"fine tuning mode can only accept one varname now"
)
varname = finetuning_varnames[0]
finetuning_vars = train_program.global_block().vars[
varname]
finetuning_vars.stop_gradient = True
optimizer = model.optimizer()
optimizer.minimize(model._cost)
context["model"][model_dict["name"]][
"main_program"] = train_program
context["model"][model_dict["name"]][
"startup_program"] = startup_program
context["model"][model_dict["name"]]["scope"] = scope
context["model"][model_dict["name"]]["model"] = model
context["model"][model_dict["name"]][
"default_main_program"] = train_program.clone()
context["model"][model_dict["name"]]["compiled_program"] = None
context["dataset"] = {}
for dataset in context["env"]["dataset"]:
type = envs.get_global_env("dataset." + dataset["name"] + ".type")
if type == "QueueDataset":
dataset_class = QueueDataset(context)
context["dataset"][dataset[
"name"]] = dataset_class.create_dataset(dataset["name"],
context)
context["status"] = "startup_pass"
class PSNetwork(NetworkBase):
def __init__(self, context):
print("Running PSNetwork.")
......
......@@ -16,6 +16,7 @@ from __future__ import print_function
import os
import time
import warnings
import numpy as np
import paddle.fluid as fluid
......@@ -284,6 +285,7 @@ class RunnerBase(object):
return (epoch_id + 1) % epoch_interval == 0
def save_inference_model():
# get global env
name = "runner." + context["runner_name"] + "."
save_interval = int(
envs.get_global_env(name + "save_inference_interval", -1))
......@@ -296,18 +298,44 @@ class RunnerBase(object):
if feed_varnames is None or fetch_varnames is None or feed_varnames == "" or fetch_varnames == "" or \
len(feed_varnames) == 0 or len(fetch_varnames) == 0:
return
fetch_vars = [
fluid.default_main_program().global_block().vars[varname]
for varname in fetch_varnames
]
# check feed var exist
for var_name in feed_varnames:
if var_name not in fluid.default_main_program().global_block(
).vars:
raise ValueError(
"Feed variable: {} not in default_main_program, global block has follow vars: {}".
format(var_name,
fluid.default_main_program().global_block()
.vars.keys()))
# check fetch var exist
fetch_vars = []
for var_name in fetch_varnames:
if var_name not in fluid.default_main_program().global_block(
).vars:
raise ValueError(
"Fetch variable: {} not in default_main_program, global block has follow vars: {}".
format(var_name,
fluid.default_main_program().global_block()
.vars.keys()))
else:
fetch_vars.append(fluid.default_main_program()
.global_block().vars[var_name])
dirname = envs.get_global_env(name + "save_inference_path", None)
assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id))
if is_fleet:
context["fleet"].save_inference_model(
context["exe"], dirname, feed_varnames, fetch_vars)
warnings.warn(
"Save inference model in cluster training is not recommended! Using save checkpoint instead.",
category=UserWarning,
stacklevel=2)
if context["fleet"].worker_index() == 0:
context["fleet"].save_inference_model(
context["exe"], dirname, feed_varnames, fetch_vars)
else:
fluid.io.save_inference_model(dirname, feed_varnames,
fetch_vars, context["exe"])
......@@ -323,7 +351,8 @@ class RunnerBase(object):
return
dirname = os.path.join(dirname, str(epoch_id))
if is_fleet:
context["fleet"].save_persistables(context["exe"], dirname)
if context["fleet"].worker_index() == 0:
context["fleet"].save_persistables(context["exe"], dirname)
else:
fluid.io.save_persistables(context["exe"], dirname)
......
......@@ -17,9 +17,13 @@ from __future__ import print_function
import warnings
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddlerec.core.utils import envs
__all__ = ["StartupBase", "SingleStartup", "PSStartup", "CollectiveStartup"]
__all__ = [
"StartupBase", "SingleStartup", "PSStartup", "CollectiveStartup",
"FineTuningStartup"
]
class StartupBase(object):
......@@ -65,6 +69,122 @@ class SingleStartup(StartupBase):
context["status"] = "train_pass"
class FineTuningStartup(StartupBase):
"""R
"""
def __init__(self, context):
self.op_name_scope = "op_namescope"
self.clip_op_name_scope = "@CLIP"
self.self.op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName(
)
print("Running SingleStartup.")
def _is_opt_role_op(self, op):
# NOTE: depend on oprole to find out whether this op is for
# optimize
op_maker = core.op_proto_and_checker_maker
optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
if op_maker.kOpRoleAttrName() in op.attr_names and \
int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
return True
return False
def _get_params_grads(self, program):
"""
Get optimizer operators, parameters and gradients from origin_program
Returns:
opt_ops (list): optimize operators.
params_grads (dict): parameter->gradient.
"""
block = program.global_block()
params_grads = []
# tmp set to dedup
optimize_params = set()
origin_var_dict = program.global_block().vars
for op in block.ops:
if self._is_opt_role_op(op):
# Todo(chengmo): Whether clip related op belongs to Optimize guard should be discussed
# delete clip op from opt_ops when run in Parameter Server mode
if self.op_name_scope in op.all_attrs(
) and self.clip_op_name_scope in op.attr(self.op_name_scope):
op._set_attr(
"op_role",
int(core.op_proto_and_checker_maker.OpRole.Backward))
continue
if op.attr(self.op_role_var_attr_name):
param_name = op.attr(self.op_role_var_attr_name)[0]
grad_name = op.attr(self.op_role_var_attr_name)[1]
if not param_name in optimize_params:
optimize_params.add(param_name)
params_grads.append([
origin_var_dict[param_name],
origin_var_dict[grad_name]
])
return params_grads
@staticmethod
def is_persistable(var):
"""
Check whether the given variable is persistable.
Args:
var(Variable): The variable to be checked.
Returns:
bool: True if the given `var` is persistable
False if not.
Examples:
.. code-block:: python
import paddle.fluid as fluid
param = fluid.default_main_program().global_block().var('fc.b')
res = fluid.io.is_persistable(param)
"""
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.READER:
return False
return var.persistable
def load(self, context, is_fleet=False, main_program=None):
dirname = envs.get_global_env(
"runner." + context["runner_name"] + ".init_model_path", None)
if dirname is None or dirname == "":
return
print("going to load ", dirname)
params_grads = self._get_params_grads(main_program)
update_params = [p for p, _ in params_grads]
need_load_vars = []
parameters = list(
filter(FineTuningStartup.is_persistable, main_program.list_vars()))
for param in parameters:
if param not in update_params:
need_load_vars.append(param)
fluid.io.load_vars(context["exe"], dirname, main_program,
need_load_vars)
print("load from {} success".format(dirname))
def startup(self, context):
for model_dict in context["phases"]:
with fluid.scope_guard(context["model"][model_dict["name"]][
"scope"]):
train_prog = context["model"][model_dict["name"]][
"main_program"]
startup_prog = context["model"][model_dict["name"]][
"startup_program"]
with fluid.program_guard(train_prog, startup_prog):
context["exe"].run(startup_prog)
self.load(context, main_program=train_prog)
context["status"] = "train_pass"
class PSStartup(StartupBase):
def __init__(self, context):
print("Running PSStartup.")
......
......@@ -19,7 +19,7 @@ from paddlerec.core.utils.envs import get_global_env
from paddlerec.core.utils.envs import get_runtime_environ
from paddlerec.core.reader import SlotReader
from paddlerec.core.trainer import EngineMode
from paddlerec.core.utils.util import split_files
from paddlerec.core.utils.util import split_files, check_filelist
def dataloader_by_name(readerclass,
......@@ -38,11 +38,27 @@ def dataloader_by_name(readerclass,
assert package_base is not None
data_path = os.path.join(package_base, data_path.split("::")[1])
files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
hidden_file_list, files = check_filelist(
hidden_file_list=[], data_file_list=[], train_data_path=data_path)
if (hidden_file_list is not None):
print(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}".
format(hidden_file_list))
files.sort()
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
need_split_files = True
elif context["engine"] == EngineMode.CLUSTER and context[
"cluster_type"] == "K8S":
# for k8s mount mode, split files for every node
need_split_files = True
print("need_split_files: {}".format(need_split_files))
if need_split_files:
files = split_files(files, context["fleet"].worker_index(),
context["fleet"].worker_num())
print("file_list : {}".format(files))
reader = reader_class(yaml_file)
reader.init()
......@@ -84,11 +100,27 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
assert package_base is not None
data_path = os.path.join(package_base, data_path.split("::")[1])
files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
hidden_file_list, files = check_filelist(
hidden_file_list=[], data_file_list=[], train_data_path=data_path)
if (hidden_file_list is not None):
print(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}".
format(hidden_file_list))
files.sort()
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
need_split_files = True
elif context["engine"] == EngineMode.CLUSTER and context[
"cluster_type"] == "K8S":
# for k8s mount mode, split files for every node
need_split_files = True
if need_split_files:
files = split_files(files, context["fleet"].worker_index(),
context["fleet"].worker_num())
print("file_list: {}".format(files))
sparse = get_global_env(name + "sparse_slots", "#")
if sparse == "":
......@@ -138,11 +170,27 @@ def slotdataloader(readerclass, train, yaml_file, context):
assert package_base is not None
data_path = os.path.join(package_base, data_path.split("::")[1])
files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
hidden_file_list, files = check_filelist(
hidden_file_list=[], data_file_list=[], train_data_path=data_path)
if (hidden_file_list is not None):
print(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}".
format(hidden_file_list))
files.sort()
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
need_split_files = True
elif context["engine"] == EngineMode.CLUSTER and context[
"cluster_type"] == "K8S":
# for k8s mount mode, split files for every node
need_split_files = True
if need_split_files:
files = split_files(files, context["fleet"].worker_index(),
context["fleet"].worker_num())
print("file_list: {}".format(files))
sparse = get_global_env("sparse_slots", "#", namespace)
if sparse == "":
......
......@@ -201,6 +201,28 @@ def split_files(files, trainer_id, trainers):
return trainer_files[trainer_id]
def check_filelist(hidden_file_list, data_file_list, train_data_path):
for root, dirs, files in os.walk(train_data_path):
if (files == None and dirs == None):
return None, None
else:
# use files and dirs
for file_name in files:
file_path = os.path.join(train_data_path, file_name)
if file_name[0] == '.':
hidden_file_list.append(file_path)
else:
data_file_list.append(file_path)
for dirs_name in dirs:
dirs_path = os.path.join(train_data_path, dirs_name)
if dirs_name[0] == '.':
hidden_file_list.append(dirs_path)
else:
#train_data_path = os.path.join(train_data_path, dirs_name)
check_filelist(hidden_file_list, data_file_list, dirs_path)
return hidden_file_list, data_file_list
class CostPrinter(object):
"""
For count cost time && print cost log
......
# PaddleRec 自定义数据集及Reader
用户自定义数据集及配置异步Reader,需要关注以下几个步骤:
* [数据集整理](#数据集整理)
* [在模型组网中加入输入占位符](#在模型组网中加入输入占位符)
* [Reader实现](#Reader的实现)
* [在yaml文件中配置Reader](#在yaml文件中配置reader)
我们以CTR-DNN模型为例,给出了从数据整理,变量定义,Reader写法,调试的完整历程。
* [数据及Reader示例-DNN](#数据及Reader示例-DNN)
## 数据集整理
PaddleRec支持模型自定义数据集。
关于数据的tips:
1. 数据量:
PaddleRec面向大规模数据设计,可以轻松支持亿级的数据读取,工业级的数据读写api:`dataset`在搜索、推荐、信息流等业务得到了充分打磨。
2. 文件类型:
支持任意直接可读的文本数据,`dataset`同时支持`.gz`格式的文本压缩数据,无需额外代码,可直接读取。数据样本应以`\n`为标志,按行组织。
3. 文件存放位置:
文件通常存放在训练节点本地,但同时,`dataset`支持使用`hadoop`远程读取数据,数据无需下载到本地,为dataset配置hadoop相关账户及地址即可。
4. 数据类型
Reader处理的是以行为单位的`string`数据,喂入网络的数据需要转为`int`,`float`的数值数据,不支持`string`喂入网络,不建议明文保存及处理训练数据。
5. Tips
Dataset模式下,训练线程与数据读取线程的关系强相关,为了多线程充分利用,`强烈建议将文件合理的拆为多个小文件`,尤其是在分布式训练场景下,可以均衡各个节点的数据量,同时加快数据的下载速度。
## 在模型组网中加入输入占位符
Reader读取文件后,产出的数据喂入网络,需要有占位符进行接收。占位符在Paddle中使用`fluid.data``fluid.layers.data`进行定义。`data`的定义可以参考[fluid.data](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/data_cn.html#data)以及[fluid.layers.data](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/layers_cn/data_cn.html#data)
假如您希望输入三个数据,分别是维度32的数据A,维度变长的稀疏数据B,以及一个一维的标签数据C,并希望梯度可以经过该变量向前传递,则示例如下:
数据A的定义:
```python
var_a = fluid.data(name='A', shape= [-1, 32], dtype='float32')
```
数据B的定义,变长数据的使用可以参考[LoDTensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#cn-user-guide-lod-tensor)
```python
var_b = fluid.data(name='B', shape=[-1, 1], lod_level=1, dtype='int64')
```
数据C的定义:
```python
var_c = fluid.data(name='C', shape=[-1, 1], dtype='int32')
var_c.stop_gradient = False
```
当我们完成以上三个数据的定义后,在PaddleRec的模型定义中,还需将其加入model基类成员变量`self._data_var`
```python
self._data_var.append(var_a)
self._data_var.append(var_b)
self._data_var.append(var_c)
```
至此,我们完成了在组网中定义输入数据的工作。
## Reader的实现
### Reader的实现范式
Reader的逻辑需要一个单独的python文件进行描述。我们试写一个`test_reader.py`,实现的具体流程如下:
1. 首先我们需要引入Reader基类
```python
from paddlerec.core.reader import ReaderBase
```
2. 创建一个子类,继承Reader的基类,训练所需Reader命名为`TrainerReader`
```python
class TrainerReader(ReaderBase):
def init(self):
pass
def generator_sample(self, line):
pass
```
3.`init(self)`函数中声明一些在数据读取中会用到的变量,必要时可以在`config.yaml`文件中配置变量,利用`env.get_global_env()`拿到。
比如,我们希望从yaml文件中读取一个数据预处理变量`avg=10`,目的是将数据A的数据缩小10倍,可以这样实现:
首先更改yaml文件,在某个space下加入该变量
```yaml
...
train:
reader:
avg: 10
...
```
再更改Reader的init函数
```python
from paddlerec.core.utils import envs
class TrainerReader(Reader):
def init(self):
self.avg = envs.get_global_env("avg", None, "train.reader")
def generator_sample(self, line):
pass
```
4. 继承并实现基类中的`generate_sample(self, line)`函数,逐行读取数据。
- 该函数应返回一个可以迭代的reader方法(带有yield的函数不再是一个普通的函数,而是一个生成器generator,成为了可以迭代的对象,等价于一个数组、链表、文件、字符串etc.)
- 在这个可以迭代的函数中,如示例代码中的`def reader()`,我们定义数据读取的逻辑。以行为单位的数据进行截取,转换及预处理。
- 最后,我们需要将数据整理为特定的格式,才能够被PaddleRec的Reader正确读取,并灌入的训练的网络中。简单来说,数据的输出顺序与我们在网络中创建的`inputs`必须是严格一一对应的,并转换为类似字典的形式。
示例: 假设数据ABC在文本数据中,每行以这样的形式存储:
```shell
0.1,0.2,0.3...3.0,3.1,3.2 \t 99999,99998,99997 \t 1 \n
```
则示例代码如下:
```python
from paddlerec.core.utils import envs
class TrainerReader(Reader):
def init(self):
self.avg = envs.get_global_env("avg", None, "train.reader")
def generator_sample(self, line):
def reader(self, line):
# 先分割 '\n', 再以 '\t'为标志分割为list
variables = (line.strip('\n')).split('\t')
# A是第一个元素,并且每个数据之间使用','分割
var_a = variables[0].split(',') # list
var_a = [float(i) / self.avg for i in var_a] # 将str数据转换为float
# B是第二个元素,同样以 ',' 分割
var_b = variables[1].split(',') # list
var_b = [int(i) for i in var_b] # 将str数据转换为int
# C是第三个元素, 只有一个元素,没有分割符
var_c = variables[2]
var_c = int(var_c) # 将str数据转换为int
var_c = [var_c] # 将单独的数据元素置入list中
# 将数据与数据名结合,组织为dict的形式
# 如下,output形式为{ A: var_a, B: var_b, C: var_c}
variable_name = ['A', 'B', 'C']
output = zip(variable_name, [var_a] + [var_b] + [var_c])
# 将数据输出,使用yield方法,将该函数变为了一个可迭代的对象
yield output
```
至此,我们完成了Reader的实现。
### 在yaml文件中配置Reader
在模型的yaml配置文件中,主要的修改是三个,如下
```yaml
reader:
batch_size: 2
class: "{workspace}/reader.py"
train_data_path: "{workspace}/data/train_data"
reader_debug_mode: False
```
batch_size: 顾名思义,是小批量训练时的样本大小
class: 运行改模型所需reader的路径
train_data_path: 训练数据所在文件夹
reader_debug_mode: 测试reader语法,及输出是否符合预期的debug模式的开关
## 数据及Reader示例-DNN
Reader代码来源于[criteo_reader.py](../models/rank/criteo_reader.py), 组网代码来源于[model.py](../models/rank/dnn/model.py)
### Criteo数据集格式
CTR-DNN训练及测试数据集选用[Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge/)所用的Criteo数据集。该数据集包括两部分:训练集和测试集。训练集包含一段时间内Criteo的部分流量,测试集则对应训练数据后一天的广告点击流量。
每一行数据格式如下所示:
```bash
<label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
```
其中```<label>```表示广告是否被点击,点击用1表示,未点击用0表示。```<integer feature>```代表数值特征(连续特征),共有13个连续特征。```<categorical feature>```代表分类特征(离散特征),共有26个离散特征。相邻两个特征用```\t```分隔,缺失特征用空格表示。测试集中```<label>```特征已被移除。
### Criteo数据集的预处理
数据预处理共包括两步:
- 将原始训练集按9:1划分为训练集和验证集
- 数值特征(连续特征)需进行归一化处理,但需要注意的是,对每一个特征```<integer feature i>```,归一化时用到的最大值并不是用全局最大值,而是取排序后95%位置处的特征值作为最大值,同时保留极值。
### CTR网络输入的定义
正如前所述,Criteo数据集中,分为连续数据与离散(稀疏)数据,所以整体而言,CTR-DNN模型的数据输入层包括三个,分别是:`dense_input`用于输入连续数据,维度由超参数`dense_feature_dim`指定,数据类型是归一化后的浮点型数据。`sparse_input_ids`用于记录离散数据,在Criteo数据集中,共有26个slot,所以我们创建了名为`C1~C26`的26个稀疏参数输入,并设置`lod_level=1`,代表其为变长数据,数据类型为整数;最后是每条样本的`label`,代表了是否被点击,数据类型是整数,0代表负样例,1代表正样例。
在Paddle中数据输入的声明使用`paddle.fluid.layers.data()`,会创建指定类型的占位符,数据IO会依据此定义进行数据的输入。
稀疏参数输入的定义:
```python
def sparse_inputs():
ids = envs.get_global_env("hyper_parameters.sparse_inputs_slots", None)
sparse_input_ids = [
fluid.layers.data(name="S" + str(i),
shape=[1],
lod_level=1,
dtype="int64") for i in range(1, ids)
]
return sparse_input_ids
```
稠密参数输入的定义:
```python
def dense_input():
dim = envs.get_global_env("hyper_parameters.dense_input_dim", None)
dense_input_var = fluid.layers.data(name="D",
shape=[dim],
dtype="float32")
return dense_input_var
```
标签的定义:
```python
def label_input():
label = fluid.layers.data(name="click", shape=[1], dtype="int64")
return label
```
组合起来,正确的声明他们:
```python
self.sparse_inputs = sparse_inputs()
self.dense_input = dense_input()
self.label_input = label_input()
self._data_var.append(self.dense_input)
for input in self.sparse_inputs:
self._data_var.append(input)
self._data_var.append(self.label_input)
```
### Criteo Reader写法
```python
# 引入PaddleRec的Reader基类
from paddlerec.core.reader import ReaderBase
# 引入PaddleRec的读取yaml配置文件的方法
from paddlerec.core.utils import envs
# 定义TrainReader,需要继承 paddlerec.core.reader.Reader
class Reader(ReaderBase)::
# 数据预处理逻辑,继承自基类
# 如果无需处理, 使用pass跳过该函数的执行
def init(self):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
self.cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
self.hash_dim_ = envs.get_global_env("hyper_parameters.sparse_feature_number", None, "train.model")
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
# 读取数据方法,继承自基类
# 实现可以迭代的reader函数,逐行处理数据
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
"""
This function needs to be implemented by the user, based on data format
"""
features = line.rstrip('\n').split('\t')
dense_feature = []
sparse_feature = []
for idx in self.continuous_range_:
if features[idx] == "":
dense_feature.append(0.0)
else:
dense_feature.append(
(float(features[idx]) - self.cont_min_[idx - 1]) /
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
sparse_feature.append(
[hash(str(idx) + features[idx]) % self.hash_dim_])
label = [int(features[0])]
feature_name = ["D"]
for idx in self.categorical_range_:
feature_name.append("S" + str(idx - 13))
feature_name.append("label")
yield zip(feature_name, [dense_feature] + sparse_feature + [label])
return reader
```
### 调试Reader
在Linux下运行时,默认启动`Dataset`模式,在Win/Mac下运行时,默认启动`Dataloader`模式。
通过在`config.yaml`中添加或修改`reader_debug_mode=True`打开debug模式,只会结合组网运行reader的部分,读取10条样本,并print,方便您观察格式是否符合预期或隐藏bug。
```yaml
reader:
batch_size: 2
class: "{workspace}/../criteo_reader.py"
train_data_path: "{workspace}/data/train"
reader_debug_mode: True
```
修改后,使用paddlerec.run执行该修改后的yaml文件,可以观察输出。
```bash
python -m paddlerec.run -m ./models/rank/dnn/config.yaml -e single
```
### Dataset调试
dataset输出的数据格式如下:
` dense_input:size ; dense_input:value ; sparse_input:size ; sparse_input:value ; ... ; sparse_input:size ; sparse_input:value ; label:size ; label:value `
基本规律是对于每个变量,会先输出其维度大小,再输出其具体值。
直接debug `criteo_reader`理想的输出为(截取了一个片段):
```bash
...
13 0.0 0.00497512437811 0.05 0.08 0.207421875 0.028 0.35 0.08 0.082 0.0 0.4 0.0 0.08 1 737395 1 210498 1 903564 1 286224 1 286835 1 906818 1 90
6116 1 67180 1 27346 1 51086 1 142177 1 95024 1 157883 1 873363 1 600281 1 812592 1 228085 1 35900 1 880474 1 984402 1 100885 1 26235 1 410878 1 798162 1 499868 1 306163 1 0
...
```
可以看到首先输出的是13维的dense参数,随后是分立的sparse参数,最后一个是1维的label,数值为0,输出符合预期。
>使用Dataset的一些注意事项
> - Dataset的基本原理:将数据print到缓存,再由C++端的代码实现读取,因此,我们不能在dataset的读取代码中,加入与数据读取无关的print信息,会导致C++端拿到错误的数据信息。
> - dataset目前只支持在`unbuntu`及`CentOS`等标准Linux环境下使用,在`Windows`及`Mac`下使用时,会产生预料之外的错误,请知悉。
### DataLoader调试
dataloader的输出格式为`list: [ list[var_1], list[var_2], ... , list[var_3]]`,每条样本的数据会被放在一个 **list[list]** 中,list[0]为第一个variable。
直接debug `criteo_reader`理想的输出为(截取了一个片段):
```bash
...
[[0.0, 0.004975124378109453, 0.05, 0.08, 0.207421875, 0.028, 0.35, 0.08, 0.082, 0.0, 0.4, 0.0, 0.08], [560746], [902436], [262029], [182633], [368411], [735166], [321120], [39572], [185732], [140298], [926671], [81559], [461249], [728372], [915018], [907965], [818961], [850958], [311492], [980340], [254960], [175041], [524857], [764893], [526288], [220126], [0]]
...
```
可以看到首先输出的是13维的dense参数的list,随后是分立的sparse参数,各自在一个list中,最后一个是1维的label的list,数值为0,输出符合预期。
......@@ -48,7 +48,7 @@
```yaml
# workspace
workspace: "paddlerec.models.rank.dnn"
workspace: "models/rank/dnn"
mode: [single_cpu_train]
runner:
......
......@@ -92,7 +92,7 @@ def input_data(self, is_infer=False, **kwargs):
return train_inputs
```
更多数据读取教程,请参考[自定义数据集及Reader](custom_dataset_reader.md)
更多数据读取教程,请参考[自定义数据集及Reader](custom_reader.md)
### 组网的定义
......
# PaddleRec 预训练模型
PaddleRec基于业务实践,使用真实数据,产出了推荐领域算法的若干预训练模型,方便开发者进行算法调研。
## 文本分类预训练模型
### 获取地址
```bash
wget xxx.tar.gz
```
### 使用方法
解压后,得到的是一个paddle的模型文件夹,使用`PaddleRec/models/contentunderstanding/classification_finetue`模型进行加载
......@@ -20,7 +20,7 @@ python -m paddlerec.run -m paddlerec.models.xxx.yyy
例如启动`recall`下的`word2vec`模型的默认配置;
```shell
python -m paddlerec.run -m paddlerec.models.recall.word2vec
python -m paddlerec.run -m models/recall/word2vec
```
### 2. 启动内置模型的个性化配置训练
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.contentunderstanding.classification"
workspace: "models/contentunderstanding/classification"
dataset:
- name: data1
......
......@@ -39,8 +39,11 @@
##使用教程(快速开始)
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.tagspace
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
python -m paddlerec.run -m models/contentunderstanding/tagspace/config.yaml
python -m paddlerec.run -m models/contentunderstanding/classification/config.yaml
```
## 使用教程(复现论文)
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.contentunderstanding.tagspace"
workspace: "models/contentunderstanding/tagspace"
dataset:
- name: sample_1
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.demo.movie_recommand"
workspace: "models/demo/movie_recommand"
# list of dataset
dataset:
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.demo.movie_recommand"
workspace: "models/demo/movie_recommand"
# list of dataset
dataset:
......
......@@ -13,7 +13,7 @@
# limitations under the License.
workspace: "paddlerec.models.match.dssm"
workspace: "models/match/dssm"
dataset:
- name: dataset_train
......
......@@ -13,7 +13,7 @@
# limitations under the License.
workspace: "paddlerec.models.match.match-pyramid"
workspace: "models/match/match-pyramid"
dataset:
- name: dataset_train
......
......@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace: "paddlerec.models.match.multiview-simnet"
workspace: "models/match/multiview-simnet"
# list of dataset
dataset:
......
......@@ -34,8 +34,11 @@
## 使用教程(快速开始)
### 训练
```shell
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
python -m paddlerec.run -m models/match/dssm/config.yaml # dssm
python -m paddlerec.run -m models/match/multiview-simnet/config.yaml # multiview-simnet
```
### 预测
......
......@@ -13,7 +13,7 @@
# limitations under the License.
workspace: "paddlerec.models.multitask.esmm"
workspace: "models/multitask/esmm"
dataset:
- name: dataset_train
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.multitask.mmoe"
workspace: "models/multitask/mmoe"
dataset:
- name: dataset_train
......
......@@ -44,9 +44,12 @@
## 使用教程(快速开始)
```shell
python -m paddlerec.run -m paddlerec.models.multitask.mmoe # mmoe
python -m paddlerec.run -m paddlerec.models.multitask.share-bottom # share-bottom
python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
python -m paddlerec.run -m models/multitask/mmoe/config.yaml # mmoe
python -m paddlerec.run -m models/multitask/share-bottom/config.yaml # share-bottom
python -m paddlerec.run -m models/multitask/esmm/config.yaml # esmm
```
## 使用教程(复现论文)
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.multitask.share-bottom"
workspace: "models/multitask/share-bottom"
dataset:
- name: dataset_train
......
......@@ -14,7 +14,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.AutoInt"
workspace: "models/rank/AutoInt"
dataset:
......
......@@ -14,7 +14,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.BST"
workspace: "models/rank/BST"
dataset:
- name: sample_1
......
......@@ -15,7 +15,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.afm"
workspace: "models/rank/afm"
dataset:
- name: train_sample
......
......@@ -15,7 +15,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.dcn"
workspace: "models/rank/dcn"
dataset:
- name: train_sample
......
......@@ -15,7 +15,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.deep_crossing"
workspace: "models/rank/deep_crossing"
dataset:
- name: train_sample
......
......@@ -14,7 +14,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.deepfm"
workspace: "models/rank/deepfm"
dataset:
......
......@@ -14,7 +14,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.dien"
workspace: "models/rank/dien"
dataset:
- name: sample_1
......
......@@ -14,7 +14,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.din"
workspace: "models/rank/din"
dataset:
- name: sample_1
......
......@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace: "paddlerec.models.rank.dnn"
workspace: "models/rank/dnn"
# list of dataset
dataset:
......@@ -67,7 +67,6 @@ runner:
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
print_interval: 10
phases: [phase1]
......
......@@ -15,7 +15,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.ffm"
workspace: "models/rank/ffm"
dataset:
- name: train_sample
......
......@@ -15,7 +15,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.fgcnn"
workspace: "models/rank/fgcnn"
dataset:
- name: train_sample
......
......@@ -132,7 +132,7 @@ CPU环境
### 运行
```
python -m paddlerec.run -m paddlerec.models.rank.fibinet
python -m paddlerec.run -m models/rank/fibinet
```
......
......@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace: "paddlerec.models.rank.fibinet"
workspace: "models/rank/fibinet"
# list of dataset
dataset:
......
......@@ -110,7 +110,7 @@ CPU环境
### 运行
```
python -m paddlerec.run -m paddlerec.models.rank.flen
python -m paddlerec.run -m models/rank/flen
```
## 论文复现
......
......@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace: "paddlerec.models.rank.flen"
workspace: "models/rank/flen"
# list of dataset
dataset:
......
......@@ -15,7 +15,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.fm"
workspace: "models/rank/fm"
dataset:
- name: train_sample
......
......@@ -15,7 +15,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.fnn"
workspace: "models/rank/fnn"
dataset:
- name: train_sample
......
......@@ -14,7 +14,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.logistic_regression"
workspace: "models/rank/logistic_regression"
dataset:
......
......@@ -15,7 +15,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.nfm"
workspace: "models/rank/nfm"
dataset:
- name: train_sample
......
......@@ -15,7 +15,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.pnn"
workspace: "models/rank/pnn"
dataset:
- name: train_sample
......
......@@ -107,7 +107,7 @@ sh run.sh
### 训练
```
cd modles/rank/dnn # 进入选定好的排序模型的目录 以DNN为例
python -m paddlerec.run -m paddlerec.models.rank.dnn # 使用内置配置
python -m paddlerec.run -m models/rank/dnn/config.yaml # 使用内置配置
# 如果需要使用自定义配置,config.yaml中workspace需要使用改模型目录的绝对路径
# 自定义修改超参后,指定配置文件,使用自定义配置
python -m paddlerec.run -m ./config.yaml
......
......@@ -14,7 +14,7 @@
# global settings
debug: false
workspace: "paddlerec.models.rank.wide_deep"
workspace: "models/rank/wide_deep"
dataset:
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
debug: false
workspace: "paddlerec.models.rank.xdeepfm"
workspace: "models/rank/xdeepfm"
dataset:
- name: sample_1
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.recall.fasttext"
workspace: "models/recall/fasttext"
# list of dataset
dataset:
......
......@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace: "paddlerec.models.recall.gnn"
workspace: "models/recall/gnn"
# list of dataset
dataset:
......
......@@ -165,7 +165,7 @@ CPU环境
### 运行
```
python -m paddlerec.run -m paddlerec.models.recall.gnn
python -m paddlerec.run -m models/recall/gnn/config.yaml
```
### 结果展示
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.recall.gru4rec"
workspace: "models/recall/gru4rec"
dataset:
- name: dataset_train
......
......@@ -129,7 +129,7 @@ CPU环境
### 运行
```
python -m paddlerec.run -m paddlerec.models.recall.look-alike_recall
python -m paddlerec.run -m models/recall/look-alike_recall/config.yaml
```
......
......@@ -14,7 +14,7 @@
# global settings
debug: false
workspace: "paddlerec.models.recall.look-alike_recall"
workspace: "models/recall/look-alike_recall"
dataset:
- name: sample_1
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.recall.ncf"
workspace: "models/recall/ncf"
dataset:
- name: dataset_train
......
......@@ -62,12 +62,15 @@
## 使用教程(快速开始)
###
```shell
python -m paddlerec.run -m paddlerec.models.recall.word2vec # word2vec
python -m paddlerec.run -m paddlerec.models.recall.ssr # ssr
python -m paddlerec.run -m paddlerec.models.recall.gru4rec # gru4rec
python -m paddlerec.run -m paddlerec.models.recall.gnn # gnn
python -m paddlerec.run -m paddlerec.models.recall.ncf # ncf
python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
python -m paddlerec.run -m models/recall/word2vec/config.yaml # word2vec
python -m paddlerec.run -m models/recall/ssr/config.yaml # ssr
python -m paddlerec.run -m models/recall/gru4rec/config.yaml # gru4rec
python -m paddlerec.run -m models/recall/gnn/config.yaml # gnn
python -m paddlerec.run -m models/recall/ncf/config.yaml # ncf
python -m paddlerec.run -m models/recall/youtube_dnn/config.yaml # youtube_dnn
```
## 使用教程(复现论文)
......@@ -87,6 +90,9 @@ sh data_prepare.sh
### 训练
```bash
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
cd modles/recall/gnn # 进入选定好的召回模型的目录 以gnn为例
python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配置文件,使用自定义配置
```
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.recall.ssr"
workspace: "models/recall/ssr"
dataset:
- name: dataset_train
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.recall.word2vec"
workspace: "models/recall/word2vec"
# list of dataset
dataset:
......
......@@ -13,7 +13,7 @@
# limitations under the License.
workspace: "paddlerec.models.recall.youtube_dnn"
workspace: "models/recall/youtube_dnn"
dataset:
- name: dataset_train
......
......@@ -13,7 +13,7 @@
# limitations under the License.
workspace: "paddlerec.models.rerank.listwise"
workspace: "models/rerank/listwise"
dataset:
- name: dataset_train
......
......@@ -28,7 +28,10 @@
## 使用教程(快速开始)
```shell
python -m paddlerec.run -m paddlerec.models.rerank.listwise # listwise
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
python -m paddlerec.run -m models/rerank/listwise/config.yaml # listwise
```
## 使用教程(复现论文)
......
......@@ -8,7 +8,10 @@
2. 基于单机模型,可以进行分布式的参数服务器训练
```shell
python -m paddlerec.run -m paddlerec.models.treebased.tdm
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
python -m paddlerec.run -m models/treebased/tdm/config.yaml
```
## 树结构的准备
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.treebased.tdm"
workspace: "models/treebased/tdm"
# list of dataset
dataset:
......
......@@ -16,7 +16,6 @@ import os
import subprocess
import sys
import argparse
import tempfile
import warnings
import copy
......@@ -39,6 +38,7 @@ def engine_registry():
engines["TRANSPILER"]["INFER"] = single_infer_engine
engines["TRANSPILER"]["LOCAL_CLUSTER_TRAIN"] = local_cluster_engine
engines["TRANSPILER"]["CLUSTER_TRAIN"] = cluster_engine
engines["TRANSPILER"]["ONLINE_LEARNING"] = online_learning
engines["PSLIB"]["TRAIN"] = local_mpi_engine
engines["PSLIB"]["LOCAL_CLUSTER_TRAIN"] = local_mpi_engine
engines["PSLIB"]["CLUSTER_TRAIN"] = cluster_mpi_engine
......@@ -259,6 +259,20 @@ def single_infer_engine(args):
return trainer
def online_learning(args):
trainer = "OnlineLearningTrainer"
single_envs = {}
single_envs["train.trainer.trainer"] = trainer
single_envs["train.trainer.threads"] = "2"
single_envs["train.trainer.engine"] = "online_learning"
single_envs["train.trainer.platform"] = envs.get_platform()
print("use {} engine to run model: {}".format(trainer, args.model))
set_runtime_envs(single_envs, args.model)
trainer = TrainerFactory.create(args.model)
return trainer
def cluster_engine(args):
def master():
from paddlerec.core.engine.cluster.cluster import ClusterEngine
......
[easy_install]
index_url=http://pip.baidu.com/pypi/simple
\ No newline at end of file
......@@ -38,15 +38,18 @@ readme = ""
def build(dirname):
package_dir = os.path.dirname(os.path.abspath(__file__))
shutil.copytree(
package_dir, dirname, ignore=shutil.ignore_patterns(".git"))
package_dir,
dirname,
ignore=shutil.ignore_patterns(".git", "models", "build", "dist",
"*.md"))
os.mkdir(os.path.join(dirname, "paddlerec"))
shutil.move(
os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec"))
shutil.move(
os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec"))
shutil.move(
os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec"))
shutil.move(
os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec"))
shutil.move(
......@@ -63,17 +66,8 @@ def build(dirname):
package_dir = {'': dirname}
package_data = {}
models_copy = [
'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
'tree/*.txt', 'data/sample_data/*', 'data/sample_data/train/*',
'data/sample_data/infer/*', 'data/*/*.csv', 'Criteo_data/*',
'Criteo_data/sample_data/train/*'
]
engine_copy = ['*/*.sh', '*/*.template']
for package in packages:
if package.startswith("paddlerec.models."):
package_data[package] = models_copy
if package.startswith("paddlerec.core.engine"):
package_data[package] = engine_copy
......@@ -98,16 +92,6 @@ build(dirname)
shutil.rmtree(dirname)
print(u'''
\033[32m
██████╗ █████╗ ██████╗ ██████╗ ██╗ ███████╗██████╗ ███████╗ ██████╗
██╔══██╗██╔══██╗██╔══██╗██╔══██╗██║ ██╔════╝██╔══██╗██╔════╝██╔════╝
██████╔╝███████║██║ ██║██║ ██║██║ █████╗ ██████╔╝█████╗ ██║
██╔═══╝ ██╔══██║██║ ██║██║ ██║██║ ██╔══╝ ██╔══██╗██╔══╝ ██║
██║ ██║ ██║██████╔╝██████╔╝███████╗███████╗██║ ██║███████╗╚██████╗
╚═╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚══════╝╚══════╝╚═╝ ╚═╝╚══════╝ ╚═════╝
\033[0m
\033[34m
Installation Complete. Congratulations!
How to use it ? Please visit our webside: https://github.com/PaddlePaddle/PaddleRec
\033[0m
''')
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册