diff --git a/core/engine/cluster/cloud/cluster.sh b/core/engine/cluster/cloud/cluster.sh
index 8f8c5479df508dfc5e74ee936b665ba08d4647b1..399a21e78aa2eba2489c8aa0b4f2214328bd0a50 100644
--- a/core/engine/cluster/cloud/cluster.sh
+++ b/core/engine/cluster/cloud/cluster.sh
@@ -59,6 +59,7 @@ function _gen_mpi_config() {
         -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \
         -e "s#<$ THIRDPARTY_PATH $>#$THIRDPARTY_PATH#g" \
         -e "s#<$ CPU_NUM $>#$max_thread_num#g" \
+        -e "s#<$ USE_PYTHON3 $>#$USE_PYTHON3#g" \
         -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \
         -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \
         -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \
@@ -76,6 +77,7 @@ function _gen_k8s_config() {
         -e "s#<$ AFS_REMOTE_MOUNT_POINT $>#$AFS_REMOTE_MOUNT_POINT#g" \
         -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \
         -e "s#<$ CPU_NUM $>#$max_thread_num#g" \
+        -e "s#<$ USE_PYTHON3 $>#$USE_PYTHON3#g" \
         -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \
         -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \
         -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \
diff --git a/core/engine/cluster/cloud/k8s_config.ini.template b/core/engine/cluster/cloud/k8s_config.ini.template
index 904bfbc5e1453f90ec1163d1681d554b52dae45f..8979cc6f0d996c132fbc2259a01134ba4a8a1ee5 100644
--- a/core/engine/cluster/cloud/k8s_config.ini.template
+++ b/core/engine/cluster/cloud/k8s_config.ini.template
@@ -19,6 +19,7 @@ afs_local_mount_point="/root/paddlejob/workspace/env_run/afs/"
 # Help doc for afs mounting on the new k8s: http://wiki.baidu.com/pages/viewpage.action?pageId=906443193
 
 PADDLE_PADDLEREC_ROLE=WORKER
+use_python3=<$ USE_PYTHON3 $>
 CPU_NUM=<$ CPU_NUM $>
 GLOG_v=0
 
diff --git a/core/engine/cluster/cloud/mpi_config.ini.template b/core/engine/cluster/cloud/mpi_config.ini.template
index 8312d46a01449b3d6eac322b098d5b029bb67f86..7d9f7fbb97a53c23e566e925a87eae990cef9f2a 100644
--- a/core/engine/cluster/cloud/mpi_config.ini.template
+++ b/core/engine/cluster/cloud/mpi_config.ini.template
@@ -17,6 +17,7 @@ output_path=<$ OUTPUT_PATH $>
 thirdparty_path=<$ THIRDPARTY_PATH $>
 
 PADDLE_PADDLEREC_ROLE=WORKER
+use_python3=<$ USE_PYTHON3 $>
 CPU_NUM=<$ CPU_NUM $>
 GLOG_v=0
 
diff --git a/core/engine/cluster/cluster.py b/core/engine/cluster/cluster.py
index 7dbb5708e572340c37265972e541bb00ef2ee195..c94124581fcf3a591a924211ae36ce87bae97586 100644
--- a/core/engine/cluster/cluster.py
+++ b/core/engine/cluster/cluster.py
@@ -159,23 +159,28 @@ class ClusterEnvBase(object):
         self.cluster_env["PADDLE_VERSION"] = self.backend_env.get(
             "config.paddle_version", "1.7.2")
 
+        # python version
+        self.cluster_env["USE_PYTHON3"] = self.backend_env.get(
+            "config.use_python3", "0")
+
         # communicator
+        max_thread_num = int(envs.get_runtime_environ("max_thread_num"))
         self.cluster_env[
             "FLAGS_communicator_is_sgd_optimizer"] = self.backend_env.get(
                 "config.communicator.FLAGS_communicator_is_sgd_optimizer", 0)
         self.cluster_env[
             "FLAGS_communicator_send_queue_size"] = self.backend_env.get(
-                "config.communicator.FLAGS_communicator_send_queue_size", 5)
+                "config.communicator.FLAGS_communicator_send_queue_size", max_thread_num)
         self.cluster_env[
             "FLAGS_communicator_thread_pool_size"] = self.backend_env.get(
                 "config.communicator.FLAGS_communicator_thread_pool_size", 32)
         self.cluster_env[
             "FLAGS_communicator_max_merge_var_num"] = self.backend_env.get(
-                "config.communicator.FLAGS_communicator_max_merge_var_num", 5)
+                "config.communicator.FLAGS_communicator_max_merge_var_num", max_thread_num)
         self.cluster_env[
             "FLAGS_communicator_max_send_grad_num_before_recv"] = self.backend_env.get(
                 "config.communicator.FLAGS_communicator_max_send_grad_num_before_recv",
-                5)
+                max_thread_num)
         self.cluster_env["FLAGS_communicator_fake_rpc"] = self.backend_env.get(
             "config.communicator.FLAGS_communicator_fake_rpc", 0)
         self.cluster_env["FLAGS_rpc_retry_times"] = self.backend_env.get(
diff --git a/doc/distributed_train.md b/doc/distributed_train.md
index 9e7dbf1bd903e459d78f18f66e5893cb3d3ced1b..9c07babaa52898c2dd5ea0cace4e0aec6f68a130 100644
--- a/doc/distributed_train.md
+++ b/doc/distributed_train.md
@@ -69,6 +69,12 @@ dataset:
     data_path: "{workspace}/data/sample_data/train"
     sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
     dense_slots: "dense_var:13"
+
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: dataloader_train
+  thread_num: 1
 ```
 
 The distributed training configuration can be changed to:
@@ -101,6 +107,13 @@ dataset:
     data_path: "{workspace}/train_data"
     sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
     dense_slots: "dense_var:13"
+
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: dataloader_train
+  # The CPU_NUM environment variable on a distributed training node equals thread_num; with multiple phases, the largest thread_num is used
+  thread_num: 1
 ```
 
 Beyond this, also pay attention to the paths for loading data and models. In general:
@@ -120,6 +133,8 @@ cluster_type: mpi # or k8s
 config:
   # Official Paddle version for the job, >= 1.7.2; defaults to 1.7.2
   paddle_version: "1.7.2"
+  # Whether to use Python3 in the PaddleCloud runtime; python2 is used by default
+  use_python3: 1
 
   # hdfs/afs configuration
   fs_name: "afs://xxx.com"
@@ -140,11 +155,13 @@ config:
 
   # Low-level parameter-server hyperparameters; leave unchanged unless you have special needs
   communicator:
+    # Recommended to set to 1 when using the SGD optimizer
     FLAGS_communicator_is_sgd_optimizer: 0
+    # The following three variables all default to the training thread count: CPU_NUM
     FLAGS_communicator_send_queue_size: 5
-    FLAGS_communicator_thread_pool_size: 32
     FLAGS_communicator_max_merge_var_num: 5
     FLAGS_communicator_max_send_grad_num_before_recv: 5
+    FLAGS_communicator_thread_pool_size: 32
     FLAGS_communicator_fake_rpc: 0
     FLAGS_rpc_retry_times: 3
 
@@ -232,7 +249,7 @@ phase:
 Then add `backend.yaml`:
 ```yaml
 backend: "PaddleCloud"
-cluster_type: mpi
+cluster_type: mpi # or k8s
 
 config:
   paddle_version: "1.7.2"
@@ -317,7 +334,7 @@ phase:
 
 ```yaml
 backend: "PaddleCloud"
-cluster_type: k8s # or k8s
+cluster_type: k8s # or mpi
 
 config:
   # Official Paddle version for the job, >= 1.7.2; defaults to 1.7.2
@@ -399,7 +416,7 @@ phase:
 Then add `backend.yaml`:
 ```yaml
 backend: "PaddleCloud"
-cluster_type: k8s # or k8s
+cluster_type: k8s # or mpi
 
 config:
   # Official Paddle version for the job, >= 1.7.2; defaults to 1.7.2
diff --git a/doc/yaml.md b/doc/yaml.md
index 4d08ef72253b5fa1371faa5890c53ba32c0ca2e6..0450e50b6866f84a5295600bdcd2da94535ce4ba 100644
--- a/doc/yaml.md
+++ b/doc/yaml.md
@@ -1,4 +1,4 @@
-# PaddleRec yaml configuration guide
+# PaddleRec config.yaml configuration guide
 
 ## Global variables
 
@@ -70,3 +70,20 @@
 | optimizer.learning_rate | float | > 0 | No | Learning rate |
 | reg | float | > 0 | No | L2 regularization coefficient; only takes effect under SGD |
 | others | / | / | / | Defined independently by each model's network |
+
+
+# PaddleRec backend.yaml configuration guide
+
+## Global variables
+
+| Name | Type | Values | Required | Description |
+| :--: | :--: | :----: | :------: | :---------: |
+| backend | string | paddlecloud/k8s | Yes | Submit via the PaddleCloud platform, or to a public-cloud K8S cluster |
+| cluster_type | string | mpi/k8s | Yes | Compute cluster to run on: mpi or k8s |
+
+## config
+| Name | Type | Values | Required | Description |
+| :--: | :--: | :----: | :------: | :---------: |
+| paddle_version | string | Official Paddle release, e.g. 1.7.2/1.8.0/1.8.3 | No | Paddle version used for training; defaults to 1.7.2 |
+| use_python3 | int | 0 (default) / 1 | No | Whether to train with python3 |
+| fs_name | string | "afs://xxx.com" | Yes | Name of the hadoop cluster to use |
\ No newline at end of file
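
For quick reference, a minimal `backend.yaml` exercising the settings this patch touches might look like the sketch below. It is illustrative, not normative: `fs_name` is the placeholder value from the docs above, and the communicator flags are listed only to document the new defaults, which now track the training thread count (CPU_NUM), so the whole `communicator` block can normally be omitted.

```yaml
# Minimal illustrative backend.yaml; values are placeholders, not recommendations.
backend: "PaddleCloud"
cluster_type: mpi        # or k8s

config:
  paddle_version: "1.7.2"
  # Use Python3 in the PaddleCloud runtime (python2 by default)
  use_python3: 1

  # hdfs/afs cluster name; placeholder from the docs above
  fs_name: "afs://xxx.com"

  # Optional: these three flags now default to the training thread count (CPU_NUM);
  # set them explicitly only to override that default.
  communicator:
    FLAGS_communicator_send_queue_size: 5
    FLAGS_communicator_max_merge_var_num: 5
    FLAGS_communicator_max_send_grad_num_before_recv: 5
```

With `use_python3: 1`, `cluster.py` exports `USE_PYTHON3` into the cluster environment (defaulting to "0" when the key is absent), `cluster.sh` substitutes it into the `<$ USE_PYTHON3 $>` placeholder, and the generated mpi/k8s config carries `use_python3=1` into the job.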