diff --git a/core/engine/cluster/cloud/before_hook_cpu.sh.template b/core/engine/cluster/cloud/before_hook_cpu.sh.template
index 07e5d7337d9171518187ff96c9de9bcb5e734df4..d0bd67b2fbe60221ad51e99073d097675286eac7 100644
--- a/core/engine/cluster/cloud/before_hook_cpu.sh.template
+++ b/core/engine/cluster/cloud/before_hook_cpu.sh.template
@@ -1,6 +1,6 @@
 echo "Run before_hook.sh ..."
 
-wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
+wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz --no-check-certificate
 
 tar -xf PaddleRec.tar.gz
 
@@ -10,6 +10,6 @@ python setup.py install
 
 pip uninstall -y paddlepaddle
 
-pip install paddlepaddle-gpu==<$ PADDLEPADDLE_VERSION $> --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
+pip install paddlepaddle==<$ PADDLEPADDLE_VERSION $> --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
 
 echo "End before_hook.sh ..."
diff --git a/core/engine/cluster/cloud/before_hook_gpu.sh.template b/core/engine/cluster/cloud/before_hook_gpu.sh.template
index e1bbde468b900262f28f53e8895f5da219aa140d..1a9d5e189870e84670e60571dfbeadd48e1245b0 100644
--- a/core/engine/cluster/cloud/before_hook_gpu.sh.template
+++ b/core/engine/cluster/cloud/before_hook_gpu.sh.template
@@ -1,6 +1,6 @@
 echo "Run before_hook.sh ..."
 
-wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
+wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz --no-check-certificate
 
 tar -xf PaddleRec.tar.gz
diff --git a/core/engine/cluster/cloud/cluster.sh b/core/engine/cluster/cloud/cluster.sh
index 35ba5657f36cff46b41c06639e43676af44f264a..8f8c5479df508dfc5e74ee936b665ba08d4647b1 100644
--- a/core/engine/cluster/cloud/cluster.sh
+++ b/core/engine/cluster/cloud/cluster.sh
@@ -39,7 +39,12 @@ function _before_submit() {
     elif [ ${DISTRIBUTE_MODE} == "COLLECTIVE_GPU_K8S" ]; then
         _gen_gpu_before_hook
         _gen_k8s_config
-        _gen_k8s_job
+        _gen_k8s_gpu_job
+        _gen_end_hook
+    elif [ ${DISTRIBUTE_MODE} == "PS_CPU_K8S" ]; then
+        _gen_cpu_before_hook
+        _gen_k8s_config
+        _gen_k8s_cpu_job
         _gen_end_hook
     fi
 
@@ -101,6 +106,7 @@ function _gen_end_hook() {
 function _gen_mpi_job() {
     echo "gen mpi_job.sh"
     sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
+        -e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \
         -e "s#<$ AK $>#$AK#g" \
         -e "s#<$ SK $>#$SK#g" \
         -e "s#<$ MPI_PRIORITY $>#$PRIORITY#g" \
@@ -109,18 +115,34 @@
         ${abs_dir}/cloud/mpi_job.sh.template >${PWD}/job.sh
 }
 
-function _gen_k8s_job() {
+function _gen_k8s_gpu_job() {
     echo "gen k8s_job.sh"
     sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
+        -e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \
         -e "s#<$ AK $>#$AK#g" \
         -e "s#<$ SK $>#$SK#g" \
        -e "s#<$ K8S_PRIORITY $>#$PRIORITY#g" \
         -e "s#<$ K8S_TRAINERS $>#$K8S_TRAINERS#g" \
+        -e "s#<$ K8S_CPU_CORES $>#$K8S_CPU_CORES#g" \
         -e "s#<$ K8S_GPU_CARD $>#$K8S_GPU_CARD#g" \
         -e "s#<$ START_CMD $>#$START_CMD#g" \
         ${abs_dir}/cloud/k8s_job.sh.template >${PWD}/job.sh
 }
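+
+# Generate job.sh for the PS-CPU mode by filling the <$ ... $> placeholders
+# in k8s_cpu_job.sh.template from the corresponding environment variables.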
+function _gen_k8s_cpu_job() {
+    echo "gen k8s_cpu_job.sh"
+    sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
+        -e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \
+        -e "s#<$ AK $>#$AK#g" \
+        -e "s#<$ SK $>#$SK#g" \
+        -e "s#<$ K8S_PRIORITY $>#$PRIORITY#g" \
+        -e "s#<$ K8S_TRAINERS $>#$K8S_TRAINERS#g" \
+        -e "s#<$ K8S_PS_NUM $>#$K8S_PS_NUM#g" \
+        -e "s#<$ K8S_PS_CORES $>#$K8S_PS_CORES#g" \
+        -e "s#<$ K8S_CPU_CORES $>#$K8S_CPU_CORES#g" \
+        -e "s#<$ START_CMD $>#$START_CMD#g" \
+        ${abs_dir}/cloud/k8s_cpu_job.sh.template >${PWD}/job.sh
+}
 
 #-----------------------------------------------------------------------------------------------------------------
 
@@ -145,6 +167,7 @@ function _submit() {
 function package_hook() {
     cur_time=`date +"%Y%m%d%H%M"`
     new_job_name="${JOB_NAME}_${cur_time}"
+    export OLD_JOB_NAME=${JOB_NAME}
     export JOB_NAME=${new_job_name}
     export job_file_path="${PWD}/${new_job_name}"
     mkdir ${job_file_path}
diff --git a/core/engine/cluster/cloud/k8s_cpu_job.sh.template b/core/engine/cluster/cloud/k8s_cpu_job.sh.template
new file mode 100644
index 0000000000000000000000000000000000000000..c5203fcad76b28b5a48de62067b46f4ed5bf1696
--- /dev/null
+++ b/core/engine/cluster/cloud/k8s_cpu_job.sh.template
@@ -0,0 +1,40 @@
+#!/bin/bash
+###############################################################
+##            Attention -- Attention -- Attention            ##
+##          Example of a K8S PS-CPU multi-node job           ##
+###############################################################
+job_name=<$ JOB_NAME $>
+
+# Job parameters
+group_name="<$ GROUP_NAME $>"
+job_version="paddle-fluid-v1.7.1"
+start_cmd="<$ START_CMD $>"
+wall_time="10:00:00"
+
+k8s_priority=<$ K8S_PRIORITY $>
+k8s_trainers=<$ K8S_TRAINERS $>
+k8s_cpu_cores=<$ K8S_CPU_CORES $>
+k8s_ps_num=<$ K8S_PS_NUM $>
+k8s_ps_cores=<$ K8S_PS_CORES $>
+
+# Your ak/sk (available on the PaddleCloud web page under "Personal Center")
+ak=<$ AK $>
+sk=<$ SK $>
+
+paddlecloud job --ak ${ak} --sk ${sk} \
+    train --job-name ${job_name} \
+    --group-name ${group_name} \
+    --job-conf config.ini \
+    --start-cmd "${start_cmd}" \
+    --files ./* \
+    --job-version ${job_version} \
+    --k8s-priority ${k8s_priority} \
+    --wall-time ${wall_time} \
+    --k8s-trainers ${k8s_trainers} \
+    --k8s-cpu-cores ${k8s_cpu_cores} \
+    --k8s-ps-num ${k8s_ps_num} \
+    --k8s-ps-cores ${k8s_ps_cores} \
+    --is-standalone 0 \
+    --distribute-job-type "PSERVER" \
+    --json
\ No newline at end of file
diff --git a/core/engine/cluster/cloud/k8s_job.sh.template b/core/engine/cluster/cloud/k8s_job.sh.template
index 5c2ebdcd62ef4ca46dafc57db95ede9fcfd13ab3..9886f11aebbbe547ed1fb433a35c653e2a77f6f3 100644
--- a/core/engine/cluster/cloud/k8s_job.sh.template
+++ b/core/engine/cluster/cloud/k8s_job.sh.template
@@ -3,7 +3,7 @@
 ##            Attention -- Attention -- Attention            ##
 ##          Example of a K8S NCCL2 multi-node job            ##
 ###############################################################
-job_name=${JOB_NAME}
+job_name=<$ JOB_NAME $>
 
 # Job parameters
 group_name="<$ GROUP_NAME $>"
@@ -13,8 +13,20 @@ wall_time="10:00:00"
 
 k8s_priority=<$ K8S_PRIORITY $>
 k8s_trainers=<$ K8S_TRAINERS $>
+k8s_cpu_cores=<$ K8S_CPU_CORES $>
 k8s_gpu_cards=<$ K8S_GPU_CARD $>
+
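+# When only one trainer is requested, the job is submitted as standalone and
+# ${nccl} carries a job remark instead of the --distribute-job-type flag.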
+is_stand_alone=0
+nccl="--distribute-job-type NCCL2"
+if [ ${k8s_trainers} == 1 ];then
+    is_stand_alone=1
+    nccl="--job-remark single-trainer"
+    if [ ${k8s_gpu_cards} == 1 ];then
+        nccl="--job-remark single-gpu"
+        echo "Attention: Use single GPU card for PaddleRec distributed training, please set runner class from 'cluster_train' to 'train' in config.yaml."
+    fi
+fi
 
 # Your ak/sk (available on the PaddleCloud web page under "Personal Center")
 ak=<$ AK $>
 sk=<$ SK $>
@@ -27,9 +39,11 @@ paddlecloud job --ak ${ak} --sk ${sk} \
     --files ./* \
     --job-version ${job_version} \
     --k8s-trainers ${k8s_trainers} \
+    --k8s-cpu-cores ${k8s_cpu_cores} \
     --k8s-gpu-cards ${k8s_gpu_cards} \
     --k8s-priority ${k8s_priority} \
     --wall-time ${wall_time} \
-    --is-standalone 0 \
-    --distribute-job-type "NCCL2" \
-    --json
\ No newline at end of file
+    --is-standalone ${is_stand_alone} \
+    --json \
+    ${nccl}
\ No newline at end of file
diff --git a/core/engine/cluster/cloud/mpi_job.sh.template b/core/engine/cluster/cloud/mpi_job.sh.template
index 84fafaffaa9f6ccc06578d673144c0d63069e13b..46d68d2130d591c86f4a587000498c139c1e74aa 100644
--- a/core/engine/cluster/cloud/mpi_job.sh.template
+++ b/core/engine/cluster/cloud/mpi_job.sh.template
@@ -3,7 +3,7 @@
 ##            Attention -- Attention -- Attention            ##
 ##                  Example of an MPI job                    ##
 ###############################################################
-job_name=${JOB_NAME}
+job_name=<$ JOB_NAME $>
 
 # Job parameters
 group_name=<$ GROUP_NAME $>
diff --git a/core/engine/cluster/cluster.py b/core/engine/cluster/cluster.py
index 4fe7529f9664a4e9a78c63dbe6c5c18dfe59f141..7dbb5708e572340c37265972e541bb00ef2ee195 100644
--- a/core/engine/cluster/cluster.py
+++ b/core/engine/cluster/cluster.py
@@ -67,10 +67,10 @@ class ClusterEngine(Engine):
 
     @staticmethod
     def workspace_replace():
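+        # Replace the {workspace} placeholder in every environment value with
+        # the remote workspace path, so configs resolve on the cluster side.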
-        workspace = envs.get_runtime_environ("workspace")
+        remote_workspace = envs.get_runtime_environ("remote_workspace")
 
         for k, v in os.environ.items():
-            v = v.replace("{workspace}", workspace)
+            v = v.replace("{workspace}", remote_workspace)
             os.environ[k] = str(v)
 
     def run(self):
@@ -98,14 +98,12 @@ class ClusterEngine(Engine):
                 cluster_env_check_tool = PaddleCloudMpiEnv()
             else:
                 raise ValueError(
-                    "Paddlecloud with Mpi don't support GPU training, check your config"
+                    "PaddleCloud with MPI doesn't support GPU training, check your config.yaml & backend.yaml"
                 )
         elif cluster_type.upper() == "K8S":
             if fleet_mode == "PS":
                 if device == "CPU":
-                    raise ValueError(
-                        "PS-CPU on paddlecloud is not supported at this time, comming soon"
-                    )
+                    cluster_env_check_tool = CloudPsCpuEnv()
                 elif device == "GPU":
                     raise ValueError(
                         "PS-GPU on paddlecloud is not supported at this time, comming soon"
                     )
@@ -115,7 +113,7 @@
                 cluster_env_check_tool = CloudCollectiveEnv()
             elif device == "CPU":
                 raise ValueError(
-                    "Unexpected config -> device: CPU with fleet_mode: Collective, check your config"
+                    "Unexpected config -> device: CPU with fleet_mode: Collective, check your config.yaml"
                 )
         else:
             raise ValueError("cluster_type {} error, must in MPI/K8S".format(
@@ -234,7 +232,7 @@ class PaddleCloudMpiEnv(ClusterEnvBase):
             "config.train_data_path", "")
         if self.cluster_env["TRAIN_DATA_PATH"] == "":
             raise ValueError(
-                "No -- TRAIN_DATA_PATH -- found in your backend.yaml, please check."
+                "No -- TRAIN_DATA_PATH -- found in your backend.yaml, please add train_data_path to it."
             )
         # test_data_path
         self.cluster_env["TEST_DATA_PATH"] = self.backend_env.get(
@@ -274,7 +272,7 @@ class PaddleCloudK8sEnv(ClusterEnvBase):
             category=UserWarning,
             stacklevel=2)
         warnings.warn(
-            "The remote mount point will be mounted to the ./afs/",
+            "The remote afs path will be mounted to ./afs/",
             category=UserWarning,
             stacklevel=2)
 
@@ -293,3 +291,21 @@ class CloudCollectiveEnv(PaddleCloudK8sEnv):
             "submit.k8s_gpu_card", 1)
         self.cluster_env["K8S_CPU_CORES"] = self.backend_env.get(
             "submit.k8s_cpu_cores", 1)
+
+
+class CloudPsCpuEnv(PaddleCloudK8sEnv):
+    def __init__(self):
+        super(CloudPsCpuEnv, self).__init__()
+
+    def env_check(self):
+        super(CloudPsCpuEnv, self).env_check()
+
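+        # PS-CPU resource settings read from backend.yaml; defaults are one
+        # trainer and one pserver with 2 CPU cores each.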
+        self.cluster_env["DISTRIBUTE_MODE"] = "PS_CPU_K8S"
+        self.cluster_env["K8S_TRAINERS"] = self.backend_env.get(
+            "submit.k8s_trainers", 1)
+        self.cluster_env["K8S_CPU_CORES"] = self.backend_env.get(
+            "submit.k8s_cpu_cores", 2)
+        self.cluster_env["K8S_PS_NUM"] = self.backend_env.get(
+            "submit.k8s_ps_num", 1)
+        self.cluster_env["K8S_PS_CORES"] = self.backend_env.get(
+            "submit.k8s_ps_cores", 2)
diff --git a/doc/distributed_train.md b/doc/distributed_train.md
index 59a22bf258e91eefdab315bfcaca67416e5eef89..9e7dbf1bd903e459d78f18f66e5893cb3d3ced1b 100644
--- a/doc/distributed_train.md
+++ b/doc/distributed_train.md
@@ -9,6 +9,7 @@
   - [Step 3: add the cluster `backend.yaml` configuration](#step-3-add-the-cluster-backendyaml-configuration)
     - [Parameter Server mode on an MPI cluster](#parameter-server-mode-on-an-mpi-cluster)
     - [Collective mode on a K8S cluster](#collective-mode-on-a-k8s-cluster)
+    - [PS-CPU mode on a K8S cluster](#ps-cpu-mode-on-a-k8s-cluster)
   - [Step 4: submit the job](#step-4-submit-the-job)
 - [Submitting with the PaddleCloud Client](#submitting-with-the-paddlecloud-client)
   - [Step 1: install PaddleRec manually in `before_hook.sh`](#step-1-install-paddlerec-manually-in-before_hooksh)
@@ -34,10 +35,10 @@
 Distributed runs first require changes to `config.yaml`; the main adjustments are:
 
-- workspace: set to the working directory used when running on the nodes
-- runner_class: change the single-machine "train" to "cluster_train"
-- fleet_mode: choose Parameter Server mode or GPU Collective mode
-- distribute_strategy: optional, selects the distributed training strategy
+- workspace: set to the working directory on the remote nodes; `"./"` is usually sufficient
+- runner_class: change the single-machine "train" to "cluster_train", i.e. single-machine -> distributed (exception: single-node single-GPU training on k8s still uses "train")
+- fleet_mode: choose Parameter Server mode (ps) or GPU all-reduce mode (collective)
+- distribute_strategy: optional, selects the distributed training strategy; it currently only takes effect in Parameter Server mode, one of `sync`, `async`, `half_async`, `geo`
 
 For the full set of options, see the [yaml configuration guide](./yaml.md)
 
@@ -50,47 +51,56 @@
 workspace: "paddlerec.models.rank.dnn"
 
 mode: [single_cpu_train]
 
-# config of each runner.
-# runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
 - name: single_cpu_train
   class: train
-  # num of epochs
   epochs: 4
-  # device to run training or infer
   device: cpu
-  save_checkpoint_interval: 2 # save model interval of epochs
-  save_checkpoint_path: "increment_dnn" # save checkpoint path
-  init_model_path: "" # load model path
+  save_checkpoint_interval: 2
+  save_checkpoint_path: "increment_dnn"
+  init_model_path: ""
   print_interval: 10
   phases: [phase1]
+
+dataset:
+- name: dataloader_train
+  batch_size: 2
+  type: DataLoader
+  data_path: "{workspace}/data/sample_data/train"
+  sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
+  dense_slots: "dense_var:13"
 ```
 
 The distributed training configuration becomes:
 
 ```yaml
-# workspace
-# Change 1: once the code is uploaded to the node, it shares a default directory with the launcher shell
+# Change 1: once the code is uploaded to the node, it lives in the default directory
 workspace: "./"
 
 mode: [ps_cluster]
 
-# config of each runner.
-# runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
 - name: ps_cluster
   # Change 2: adjust the runner class
   class: cluster_train
-  # num of epochs
   epochs: 4
-  # device to run training or infer
   device: cpu
   # Changes 3 & 4: set fleet_mode and distribute_strategy
   fleet_mode: ps
   distribute_strategy: async
-  save_checkpoint_interval: 2 # save model interval of epochs
-  save_checkpoint_path: "increment_dnn" # save checkpoint path
-  init_model_path: "" # load model path
+  save_checkpoint_interval: 2
+  save_checkpoint_path: "increment_dnn"
+  init_model_path: ""
   print_interval: 10
   phases: [phase1]
+
+dataset:
+- name: dataloader_train
+  batch_size: 2
+  type: DataLoader
+  # Change 5: adjust the data path
+  # In mpi mode the data is usually downloaded to './train_data' under the remote node's working directory; on k8s it depends on the mount point
+  data_path: "{workspace}/train_data"
+  sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
+  dense_slots: "dense_var:13"
 ```
 
 Beyond this, also pay attention to the data and model loading paths. In general:
@@ -165,7 +175,14 @@ submit:
 
   # for k8s gpu
   # In k8s gpu mode: the number of trainer nodes and the GPU cards per node
   k8s_trainers: 2
+  k8s_cpu_cores: 4
   k8s_gpu_card: 1
+
+  # for k8s ps-cpu (use either the gpu group above or this group, not both)
+  k8s_trainers: 2
+  k8s_cpu_cores: 4
+  k8s_ps_num: 2
+  k8s_ps_cores: 4
 ```
 
@@ -173,18 +190,51 @@
 Beyond this, also mind the paths of the files uploaded to the working directory (the `files` option). In the example it is `./*.py`, which means the job is submitted from the directory containing those py files. If that is not the case, adjust the `files` paths accordingly, or use absolute paths.
 
-Uploading data files via `files` is not recommended; instead, have the data downloaded automatically by setting `train_data_path`, or transfer it to the nodes by mounting via `afs_remote_mount_point`.
+Uploading overly large data files via `files` is not recommended; instead, have the data downloaded automatically by setting `train_data_path`, or, in k8s mode, transfer it to the nodes by mounting via `afs_remote_mount_point`.
 
 #### Parameter Server mode on an MPI cluster
 
 Below is a `backend.yaml` example for submitting an MPI Parameter Server job via PaddleCloud.
 
+First, adjust `config.yaml`:
+```yaml
+workspace: "./"
+mode: [ps_cluster]
+
+dataset:
+- name: dataloader_train
+  batch_size: 2
+  type: DataLoader
+  data_path: "{workspace}/train_data"
+  sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
+  dense_slots: "dense_var:13"
+
+runner:
+- name: ps_cluster
+  class: cluster_train
+  epochs: 2
+  device: cpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1
+  save_checkpoint_path: "increment_dnn"
+  init_model_path: ""
+  print_interval: 1
+  phases: [phase1]
+
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: dataloader_train
+  thread_num: 1
+```
+
+
+Then add `backend.yaml`:
 ```yaml
 backend: "PaddleCloud"
-cluster_type: mpi # or k8s
+cluster_type: mpi
 
 config:
-  # official paddle version for the job, >= 1.7.2 (default: 1.7.2)
   paddle_version: "1.7.2"
 
   # hdfs/afs connection settings
@@ -229,9 +279,45 @@ #### Collective mode on a K8S cluster
 
 Below is a `backend.yaml` example for GPU training on a K8S cluster via PaddleCloud.
 
+First, adjust `config.yaml`:
+
+```yaml
+workspace: "./"
+mode: [collective_cluster]
+
+dataset:
+- name: dataloader_train
+  batch_size: 2
+  type: DataLoader
+  data_path: "{workspace}/train_data"
+  sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
+  dense_slots: "dense_var:13"
+
+runner:
+- name: collective_cluster
+  class: cluster_train
+  epochs: 2
+  device: gpu
+  fleet_mode: collective
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_checkpoint_path: "increment_dnn" # save checkpoint path
+  init_model_path: "" # load model path
+  print_interval: 1
+  phases: [phase1]
+
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: dataloader_train
+  thread_num: 1
+```
+
+
+Then add `backend.yaml`:
+
 ```yaml
 backend: "PaddleCloud"
-cluster_type: mpi # or k8s
+cluster_type: k8s
 
 config:
   # official paddle version for the job, >= 1.7.2 (default: 1.7.2)
@@ -271,9 +357,93 @@ submit:
 
   # for k8s gpu
   # In k8s gpu mode: the number of trainer nodes and the GPU cards per node
   k8s_trainers: 2
+  k8s_cpu_cores: 4
   k8s_gpu_card: 1
 ```
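+
+> Note: each resource key under `submit` is substituted into the generated `job.sh` and passed through to the `paddlecloud` CLI (for example, `k8s_trainers` becomes `--k8s-trainers` and `k8s_ps_num` becomes `--k8s-ps-num`).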
+
+#### PS-CPU mode on a K8S cluster
+
+Below is a `backend.yaml` example for Parameter Server CPU training on a K8S cluster via PaddleCloud.
+
+First, adjust `config.yaml`:
+```yaml
+workspace: "./"
+mode: [ps_cluster]
+
+dataset:
+- name: dataloader_train
+  batch_size: 2
+  type: DataLoader
+  data_path: "{workspace}/train_data"
+  sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
+  dense_slots: "dense_var:13"
+
+runner:
+- name: ps_cluster
+  class: cluster_train
+  epochs: 2
+  device: cpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1
+  save_checkpoint_path: "increment_dnn"
+  init_model_path: ""
+  print_interval: 1
+  phases: [phase1]
+
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: dataloader_train
+  thread_num: 1
+```
+
+Then add `backend.yaml`:
+```yaml
+backend: "PaddleCloud"
+cluster_type: k8s
+
+config:
+  # official paddle version for the job, >= 1.7.2 (default: 1.7.2)
+  paddle_version: "1.7.2"
+
+  # hdfs/afs connection settings
+  fs_name: "afs://xxx.com"
+  fs_ugi: "usr,pwd"
+
+  # remote address of the job output directory; for afs:/user/your/path/ write /user/your/path
+  output_path: ""
+
+  # for k8s
+  # remote mount point; for afs:/user/your/path/ write /user/your/path
+  afs_remote_mount_point: ""
+
+submit:
+  # your PaddleCloud AK and SK
+  ak: ""
+  sk: ""
+
+  # job priority, default high
+  priority: "high"
+
+  # job name
+  job_name: "PaddleRec_CTR"
+
+  # resource group for training
+  group: ""
+
+  # start command executed on the nodes
+  start_cmd: "python -m paddlerec.run -m ./config.yaml"
+
+  # local files to upload to the working directory on the nodes
+  files: ./*.py ./*.yaml
+
+  # for k8s ps-cpu
+  # In k8s ps-cpu mode: the number of trainer nodes, the number of pserver nodes, and the CPU cores per node
+  k8s_trainers: 2
+  k8s_cpu_cores: 4
+  k8s_ps_num: 2
+  k8s_ps_cores: 4
+```
+
 ### Step 4: submit the job
 
 Once `config.yaml` and `backend.yaml` are ready, the job can be submitted with a single command:
diff --git a/models/rank/dnn/backend.yaml b/models/rank/dnn/backend.yaml
index 18647b37eef8e4eb33f788c0fb5a0292230bb4e2..03b5efe7847ddb4a6cabf0f817a58f686e12fad1 100644
--- a/models/rank/dnn/backend.yaml
+++ b/models/rank/dnn/backend.yaml
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
-workspace: "./"
-
 backend: "PaddleCloud"
-cluster_type: k8s # or k8s
+cluster_type: k8s # or mpi
 
 config:
   fs_name: "afs://xxx.com"
@@ -56,5 +52,12 @@ submit:
 
   # for k8s gpu
   k8s_trainers: 2
+  k8s_cpu_cores: 2
   k8s_gpu_card: 1
+
+  # for k8s ps-cpu
+  k8s_trainers: 2
+  k8s_cpu_cores: 4
+  k8s_ps_num: 2
+  k8s_ps_cores: 4
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..0f7efd39336b4bf0443da4a8c89b7860ad23efd3
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[easy_install]
+index_url=http://pip.baidu.com/pypi/simple
\ No newline at end of file