提交 39238142 编写于 作者: W wangjiawei04

Update the YAML manifests to the latest images and replica counts

上级 ca3a6719
......@@ -26,7 +26,7 @@ spec:
type: ""
name: seqdata
containers:
- image: wangjiawei1993/edldemo:v19
- image: hub.baidubce.com/ctr/edldemo:latest
command:
- paddle_k8s
- start_fluid
......@@ -116,7 +116,7 @@ spec:
type: ""
name: seqdata
containers:
- image: wangjiawei1993/edldemo:v19
- image: hub.baidubce.com/ctr/edldemo:latest
command:
- paddle_k8s
- start_fluid
......@@ -206,7 +206,7 @@ spec:
type: ""
name: seqdata
containers:
- image: wangjiawei1993/edldemo:v19
- image: hub.baidubce.com/ctr/edldemo:latest
command:
- paddle_k8s
- start_fluid
......
......@@ -7,7 +7,7 @@ metadata:
spec:
containers:
- name: cube-0
image: wangjiawei1993/cube:v11
image: hub.baidubce.com/ctr/cube:latest
workingDir: /cube
command: ['/bin/bash']
args: ['start.sh']
......@@ -28,7 +28,7 @@ metadata:
spec:
containers:
- name: cube-1
image: wangjiawei1993/cube:v11
image: hub.baidubce.com/ctr/cube:latest
workingDir: /cube
command: ['/bin/bash']
args: ['start.sh']
......
......@@ -12,7 +12,7 @@ spec:
name: file-home
containers:
- name: file-server
image: halverneus/static-file-server
image: hub.baidubce.com/ctr/file-server:latest
ports:
- containerPort: 8080
volumeMounts:
......
......@@ -12,7 +12,7 @@ spec:
name: file-home
containers:
- name: file-server
image: halverneus/static-file-server
image: hub.baidubce.com/ctr/file-server:latest
ports:
- containerPort: 8080
volumeMounts:
......@@ -37,7 +37,6 @@ spec:
---
apiVersion: v1
kind: Pod
metadata:
......@@ -47,7 +46,7 @@ metadata:
spec:
containers:
- name: cube-0
image: wangjiawei1993/cube:v11
image: hub.baidubce.com/ctr/cube:latest
workingDir: /cube
command: ['/bin/bash']
args: ['start.sh']
......@@ -68,7 +67,7 @@ metadata:
spec:
containers:
- name: cube-1
image: wangjiawei1993/cube:v11
image: hub.baidubce.com/ctr/cube:latest
workingDir: /cube
command: ['/bin/bash']
args: ['start.sh']
......@@ -123,7 +122,7 @@ metadata:
spec:
containers:
- name: cube-transfer
image: wangjiawei1993/cube-transfer:v18
image: hub.baidubce.com/ctr/cube-transfer:latest
workingDir: /
env:
- name: POD_IP
......@@ -150,7 +149,7 @@ metadata:
spec:
containers:
- name: paddleserving
image: wangjiawei1993/paddleserving:v7-debug
image: hub.baidubce.com/ctr/paddleserving:latest
workingDir: /serving
command: ['/bin/bash']
args: ['run.sh']
......@@ -183,7 +182,7 @@ metadata:
spec:
containers:
- name: pdservingclient
image: wangjiawei1993/pdservingclient:v4
image: hub.baidubce.com/ctr/pdservingclient:latest
workingDir: /
command: ['bash']
args: ['nonstop.sh']
......@@ -196,7 +195,7 @@ kind: Job
metadata:
name: edl-demo
spec:
minAvailable: 6
minAvailable: 4
schedulerName: volcano
policies:
- event: PodEvicted
......@@ -204,7 +203,7 @@ spec:
- event: PodFailed
action: RestartJob
tasks:
- replicas: 3
- replicas: 2
name: pserver
template:
metadata:
......@@ -219,7 +218,7 @@ spec:
type: ""
name: seqdata
containers:
- image: wangjiawei1993/edldemo:v19
- image: hub.baidubce.com/ctr/edldemo:latest
command:
- paddle_k8s
- start_fluid
......@@ -273,9 +272,9 @@ spec:
- name: PADDLE_IS_LOCAL
value: "0"
- name: PADDLE_TRAINERS_NUM
value: "3"
value: "2"
- name: PADDLE_PSERVERS_NUM
value: "3"
value: "2"
- name: FLAGS_rpc_deadline
value: "36000000"
- name: ENTRY
......@@ -309,7 +308,7 @@ spec:
type: ""
name: seqdata
containers:
- image: wangjiawei1993/edldemo:v19
- image: hub.baidubce.com/ctr/edldemo:latest
command:
- paddle_k8s
- start_fluid
......@@ -368,9 +367,9 @@ spec:
- name: PADDLE_PORT
value: "30240"
- name: PADDLE_PSERVERS_NUM
value: "3"
value: "2"
- name: PADDLE_TRAINERS_NUM
value: "3"
value: "2"
- name: PADDLE_TRAINING_ROLE
value: TRAINER
- name: TRAINING_ROLE
......@@ -381,7 +380,7 @@ spec:
value: (/postprocess &) && cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
restartPolicy: OnFailure
- replicas: 2
- replicas: 1
policies:
- event: TaskCompleted
action: CompleteJob
......@@ -399,7 +398,7 @@ spec:
type: ""
name: seqdata
containers:
- image: wangjiawei1993/edldemo:v19
- image: hub.baidubce.com/ctr/edldemo:latest
command:
- paddle_k8s
- start_fluid
......@@ -458,9 +457,9 @@ spec:
- name: PADDLE_PORT
value: "30240"
- name: PADDLE_PSERVERS_NUM
value: "3"
value: "2"
- name: PADDLE_TRAINERS_NUM
value: "3"
value: "2"
- name: PADDLE_TRAINING_ROLE
value: TRAINER
- name: TRAINING_ROLE
......@@ -470,3 +469,4 @@ spec:
- name: ENTRY
value: (/postprocess &) && cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
restartPolicy: OnFailure
......@@ -7,7 +7,7 @@ metadata:
spec:
containers:
- name: pdservingclient
image: wangjiawei1993/pdservingclient:v4
image: hub.baidubce.com/ctr/pdservingclient:latest
workingDir: /
command: ['bash']
args: ['nonstop.sh']
......
......@@ -7,7 +7,7 @@ metadata:
spec:
containers:
- name: paddleserving
image: wangjiawei1993/paddleserving:v7-debug
image: hub.baidubce.com/ctr/paddleserving:latest
workingDir: /serving
command: ['/bin/bash']
args: ['run.sh']
......
......@@ -7,7 +7,7 @@ metadata:
spec:
containers:
- name: cube-transfer
image: wangjiawei1993/cube-transfer:v18
image: hub.baidubce.com/ctr/cube-transfer:latest
workingDir: /
env:
- name: POD_IP
......
---
# Volcano Job "edl-demo": a PaddlePaddle fluid-ctr training job made of
# two tasks, "pserver" and "trainer", with 2 replicas each.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: edl-demo
spec:
  # Equals the total replica count of both tasks (2 pserver + 2 trainer).
  minAvailable: 4
  schedulerName: volcano
  # Job-level policies: restart the whole job on pod eviction or failure.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  tasks:
    # ---- pserver task ----
    - replicas: 2
      name: pserver
      template:
        metadata:
          labels:
            paddle-job-pserver: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            # NOTE(review): the trainer task mounts /home/work/data here,
            # the pserver mounts /home/work/ -- confirm this is intended.
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            # NOTE(review): other manifests in this change reference
            # hub.baidubce.com/ctr/edldemo:latest; confirm whether this
            # pinned image should be kept.
            - image: wangjiawei1993/edldemo:v19
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: pserver
              volumeMounts:
                - mountPath: /data
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  memory: 100M
                  ephemeral-storage: 1Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                - name: PADDLE_INIT_NICS
                  value: eth2
                # Downward API: expose pod metadata to the container.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                # Presumably must match the replica counts of the trainer
                # and pserver tasks (both 2 here) -- verify when scaling.
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: ENTRY
                  value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
                - name: PADDLE_PORT
                  value: "30240"
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: PADDLE_TRAINING_ROLE
                  value: PSERVER
                - name: TRAINING_ROLE
                  value: PSERVER
          restartPolicy: OnFailure
    # ---- trainer task ----
    - replicas: 2
      # Task-level policy: the job is completed once this task completes.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      name: trainer
      template:
        metadata:
          labels:
            paddle-job: fluid-ctr
        spec:
          # Trainers are only scheduled on nodes labelled nodeType=model.
          nodeSelector:
            nodeType: model
          imagePullSecrets:
            - name: default-secret
          volumes:
            - hostPath:
                path: /home/work/data
                type: ""
              name: seqdata
          containers:
            - image: wangjiawei1993/edldemo:v19
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: trainer
              volumeMounts:
                - mountPath: /data
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  memory: 100M
                  ephemeral-storage: 10Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                # Explicit empty string, matching the pserver task; an
                # omitted env value is expected to default to "" as well.
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                - name: PADDLE_INIT_NICS
                  value: eth2
                - name: CPU_NUM
                  value: "2"
                # Downward API: expose pod metadata to the container.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: PADDLE_PORT
                  value: "30240"
                # Presumably must match the replica counts of the pserver
                # and trainer tasks (both 2 here) -- verify when scaling.
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_TRAINING_ROLE
                  value: TRAINER
                - name: TRAINING_ROLE
                  value: TRAINER
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: ENTRY
                  value: (/postprocess &) && cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
          restartPolicy: OnFailure
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册