提交 294519a3 编写于 作者: L liqingping

Merge branch 'docs/usage' into 'master'

feat: many updates, see details in long description

See merge request platform/CloudNative4AI/cluster-lifecycle/di-orchestrator!60
......@@ -6,7 +6,7 @@ name: Release
on: [push]
env:
version: v0.2.1
version: v1.0.1
jobs:
docker:
......@@ -14,7 +14,7 @@ jobs:
strategy:
matrix:
platform: [ linux/amd64 ]
target: [ di-operator, di-server, di-webhook ]
target: [ di-orchestrator ]
steps:
- name: Checkout
uses: actions/checkout@v2
......@@ -26,12 +26,12 @@ jobs:
TARGET: ${{ matrix.target }}
run: |
DOCKER_IMAGE=$DOCKERIO_ORG/$TARGET
VERSION=${version}-edge
VERSION=${version}-nightly
if [[ $GITHUB_REF == refs/tags/* ]]; then
VERSION=${GITHUB_REF#refs/tags/}
fi
if [ "${{ github.event_name }}" = "schedule" ]; then
VERSION=nightly
VERSION=edge
fi
TAGS="${DOCKER_IMAGE}:${VERSION}"
if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
......
variables:
REGISTRY: registry.sensetime.com/cloudnative4ai
VERSION: v1.0.0
VERSION: v1.0.1
PROJECT: di-orchestrator
# dind config
DOCKER_HOST: tcp://localhost:2376
......
# di-operator version
VERSION ?= v1.0.0
VERSION ?= v1.0.1
MASTER_VERSION := $(VERSION)
COMMIT_SHORT_SHA=$(shell git log -n 1 | head -n 1 | sed -e 's/^commit //' | head -c 8)
......
......@@ -29,14 +29,34 @@ di-server-7b86ff8df4-jfgmp 1/1 Running 0 59s
```bash
# submit DIJob
$ kubectl create -f config/samples/dijob-cartpole.yaml
$ kubectl create -f config/samples/dijob-gobigger.yaml
# get pod and you will see coordinator is created by di-operator
# a few seconds later, you will see collectors and learners created by di-server
$ kubectl get pod
NAME READY STATUS RESTARTS AGE
gobigger-test-0-0 1/1 Running 0 4m17s
gobigger-test-0-1 1/1 Running 0 4m17s
# get logs of coordinator
$ kubectl logs cartpole-dqn-coordinator
$ kubectl logs -n xlab gobigger-test-0-0
Bind subprocesses on these addresses: ['tcp://10.148.3.4:22270',
'tcp://10.148.3.4:22271']
[Warning] no enough data: 128/0
...
[Warning] no enough data: 128/120
Current Training: Train Iter(0) Loss(102.256)
Current Training: Train Iter(0) Loss(103.133)
Current Training: Train Iter(20) Loss(28.795)
Current Training: Train Iter(20) Loss(32.837)
...
Current Training: Train Iter(360) Loss(12.850)
Current Training: Train Iter(340) Loss(11.812)
Current Training: Train Iter(380) Loss(12.892)
Current Training: Train Iter(360) Loss(13.621)
Current Training: Train Iter(400) Loss(15.183)
Current Training: Train Iter(380) Loss(14.187)
Current Evaluation: Train Iter(404) Eval Reward(-1788.326)
```
## User Guide
......
......@@ -46,7 +46,7 @@ func NewCreateOptions(genFlags cmdcommon.GenericFlags) *CreateOptions {
GenericFlags: genFlags,
ServerBindAddress: ":8081",
ProbeAddress: ":8080",
MetricAddress: ":8089",
MetricAddress: ":8443",
}
}
......
......@@ -4231,8 +4231,8 @@ subjects:
apiVersion: v1
data:
DI_JOB_DEFAULT_RESOURCES: '{"resources": {"requests": {"cpu": 1, "memory": "2Gi"}}}'
DI_ORCHESTRATOR_VERSION: v1.0.0
DI_SERVER_URL: http://di-server.di-system:8080
DI_ORCHESTRATOR_VERSION: v1.0.1
DI_SERVER_URL: http://di-server.di-system:8081
kind: ConfigMap
metadata:
name: di-config
......@@ -4260,9 +4260,9 @@ metadata:
namespace: di-system
spec:
ports:
- port: 8080
- port: 8081
protocol: TCP
targetPort: 8080
targetPort: 8081
selector:
control-plane: di-server
---
......@@ -4295,7 +4295,7 @@ spec:
envFrom:
- configMapRef:
name: di-config
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.0
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.1
imagePullPolicy: Always
livenessProbe:
httpGet:
......@@ -4348,7 +4348,7 @@ spec:
envFrom:
- configMapRef:
name: di-config
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.0
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.1
imagePullPolicy: Always
livenessProbe:
httpGet:
......
......@@ -3,6 +3,6 @@ kind: ConfigMap
metadata:
name: di-config
data:
DI_SERVER_URL: "http://di-server.di-system:8080"
DI_ORCHESTRATOR_VERSION: v1.0.0
DI_SERVER_URL: "http://di-server.di-system:8081"
DI_ORCHESTRATOR_VERSION: v1.0.1
DI_JOB_DEFAULT_RESOURCES: '{"resources": {"requests": {"cpu": 1, "memory": "2Gi"}}}'
......@@ -30,7 +30,7 @@ spec:
- "--probe-addr=:8080"
- "--metric-addr=:8443"
- "--leader-elect"
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.0
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.1
imagePullPolicy: Always
name: manager
envFrom:
......
......@@ -21,7 +21,7 @@ spec:
args:
- --zap-devel=true
- --server-bind-address=:8081
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.0
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.1
imagePullPolicy: Always
name: server
envFrom:
......
......@@ -22,7 +22,7 @@ spec:
- "--probe-addr=:8080"
- "--metric-addr=:8443"
- --port=9443
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.0
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.1
imagePullPolicy: Always
name: webhook
securityContext:
......
......@@ -8,4 +8,4 @@ kind: Kustomization
images:
- name: registry.sensetime.com/cloudnative4ai/di-orchestrator
newName: registry.sensetime.com/cloudnative4ai/di-orchestrator
newTag: v1.0.0
newTag: v1.0.1
......@@ -8,5 +8,5 @@ spec:
control-plane: di-server
ports:
- protocol: TCP
port: 8080
targetPort: 8080
port: 8081
targetPort: 8081
......@@ -326,7 +326,7 @@ spec:
command:
- /di-orchestrator
- operator
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.0
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.1
imagePullPolicy: Always
livenessProbe:
httpGet:
......@@ -376,7 +376,7 @@ spec:
command:
- /di-orchestrator
- server
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.0
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.1
imagePullPolicy: Always
livenessProbe:
httpGet:
......@@ -421,7 +421,7 @@ spec:
command:
- /di-orchestrator
- webhook
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.0
image: registry.sensetime.com/cloudnative4ai/di-orchestrator:v1.0.1
imagePullPolicy: Always
livenessProbe:
httpGet:
......
此差异已折叠。
此差异已折叠。
......@@ -28,11 +28,11 @@ New CRD files will be generated in [./config/crd/bases](./config/crd/bases)
## Controller Logic
Referenced to [controllers](./controllers)
Refer to [controllers](./pkg/controllers).
## DI Server Logic
Referenced to [server](./server)
Refer to [server](./pkg/server).
## Installation
......
此差异已折叠。
......@@ -3,8 +3,7 @@ module opendilab.org/di-orchestrator
go 1.16
require (
github.com/deckarep/golang-set v1.7.1
github.com/gin-gonic/gin v1.7.7 // indirect
github.com/gin-gonic/gin v1.7.7
github.com/go-logr/logr v0.4.0
github.com/onsi/ginkgo v1.16.4
github.com/onsi/gomega v1.15.0
......
......@@ -111,8 +111,6 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/daviddengcn/go-colortext v0.0.0-20160507010035-511bcaf42ccd/go.mod h1:dv4zxwHi5C/8AeI+4gX4dCWOIvNi7I6JCSX0HvlKPgE=
github.com/deckarep/golang-set v1.7.1 h1:SCQV0S6gTtp6itiFrTqI+pfmJ4LN85S1YzhDf9rTHJQ=
github.com/deckarep/golang-set v1.7.1/go.mod h1:93vsz/8Wt4joVM7c2AVqh+YRMiUSc14yDtF28KmMOgQ=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
github.com/docker/distribution v2.7.1+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
......@@ -208,6 +206,7 @@ github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh
github.com/go-openapi/validate v0.18.0/go.mod h1:Uh4HdOzKt19xGIGm1qHf/ofbX1YQ4Y+MYsct2VUrAJ4=
github.com/go-openapi/validate v0.19.2/go.mod h1:1tRCw7m3jtI8eNWEEliiAqUIcBztB2KDnRCRMUi7GTA=
github.com/go-openapi/validate v0.19.8/go.mod h1:8DJv2CVJQ6kGNpFW6eV9N3JviE1C85nY1c2z52x1Gk4=
github.com/go-playground/assert/v2 v2.0.1 h1:MsBgLAaY856+nPRTKrp3/OZK38U/wa0CcBYNjji3q3A=
github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q=
github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8=
......
......@@ -20,6 +20,17 @@ fi
# Rewrite every "VERSION:" entry in the GitLab CI config so that it carries
# ${version}. The first capture group keeps the line's leading whitespace.
# sed writes the result to .tmp, which then replaces the original file.
for f in ".gitlab-ci.yml"; do
echo "update ci version to ${version}"
sed -r "s|^(\s*)VERSION:(\s*)(.*)|\1VERSION: ${version}|" "$f" >.tmp
# sed -r "s|^(\s*)VERSION:(\s*)v[0-9+][\.0-9+]*|\1VERSION: ${version}|" "$f" > .tmp
mv .tmp "$f"
done
# Same rewrite for the DI_ORCHESTRATOR_VERSION entry of the di-config ConfigMap
# manifest.
for f in "config/manager/di_config.yaml"; do
echo "update config map orchestrator version to ${version}"
sed -r "s|^(\s*)DI_ORCHESTRATOR_VERSION:(\s*)(.*)|\1DI_ORCHESTRATOR_VERSION: ${version}|" "$f" >.tmp
mv .tmp "$f"
done
# Same rewrite for the lowercase "version:" entry of the GitHub Actions release
# workflow. The pattern matches any line that starts with optional whitespace
# followed by "version:".
for f in ".github/workflows/release.yaml"; do
echo "update github action version to ${version}"
sed -r "s|^(\s*)version:(\s*)(.*)|\1version: ${version}|" "$f" >.tmp
mv .tmp "$f"
done
package main
import (
"context"
"flag"
"log"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/dynamic"
ctrl "sigs.k8s.io/controller-runtime"
"opendilab.org/di-orchestrator/pkg/api/v2alpha1"
)
// Command-line options. main binds each variable to a flag and flag.Parse
// fills them in.
var (
// namespace is the namespace of the job to scale (flag -ns, default "default").
namespace string
// jobname is the name of the job to scale (flag -n, default "gobigger-test").
jobname string
// replicas is the replica count written to the job status (flag -r, default 1).
replicas int
)
// main scales a DIJob by writing the requested replica count into the
// job's status.replicas field and pushing it with UpdateStatus.
//
// Flags:
//
//	-ns  namespace of the job (default "default")
//	-n   name of the job (default "gobigger-test")
//	-r   number of replicas to record in the job status (default 1)
//
// Every failure is fatal: the error is logged, including its cause, and the
// process exits through log.Fatalf.
func main() {
	flag.StringVar(&namespace, "ns", "default", "The namespace of the scaling job.")
	flag.StringVar(&jobname, "n", "gobigger-test", "The name of the scaling job.")
	flag.IntVar(&replicas, "r", 1, "The number of replicas for the job.")
	flag.Parse()

	cfg, err := ctrl.GetConfig()
	if err != nil {
		log.Fatalf("Failed to get kubeconfig: %v", err)
	}

	// Create a dynamic client scoped to the dijobs resource. The group and
	// version come from the v2alpha1 API package so they track the CRD.
	dclient := dynamic.NewForConfigOrDie(cfg)
	gvr := schema.GroupVersionResource{
		Group:    v2alpha1.GroupVersion.Group,
		Version:  v2alpha1.GroupVersion.Version,
		Resource: "dijobs",
	}
	diclient := dclient.Resource(gvr)

	// Both API calls below share one background context.
	ctx := context.Background()

	unjob, err := diclient.Namespace(namespace).Get(ctx, jobname, metav1.GetOptions{})
	if err != nil {
		log.Fatalf("Failed to get job with dynamic client: %v", err)
	}

	// Set job.status.replicas to the requested value. The value is converted
	// to int64 because unstructured content only accepts JSON-compatible
	// number types, and a plain int is not one of them.
	if err := unstructured.SetNestedField(unjob.Object, int64(replicas), "status", "replicas"); err != nil {
		// Include the cause; the message alone does not say why it failed.
		log.Fatalf("Failed to set nested fields: %v", err)
	}

	// Push the modified object through the status update call.
	if _, err := diclient.Namespace(namespace).UpdateStatus(ctx, unjob, metav1.UpdateOptions{}); err != nil {
		log.Fatalf("Failed to update status: %v", err)
	}

	log.Printf("Successfully update dijob %s/%s replicas to %d", namespace, jobname, replicas)
}
......@@ -149,11 +149,10 @@ type Phase string
const (
// JobPending means the job has been submitted to the cluster,
// but not all the pods and services have been created,
// or not pods are running
// but not all the pods and services have been created
JobPending Phase = "Pending"
// JobStarted means the job has been scheduled and waits for running.
// JobStarting means the job has been created and is waiting to run.
JobStarting Phase = "Starting"
// JobRestarting means the job has been rescheduled and waits for restarting.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册