From 3fe400e50bc93a95e92fbdbc28c69c7e4199f38b Mon Sep 17 00:00:00 2001 From: tigerK Date: Tue, 14 Nov 2023 20:58:14 +0800 Subject: [PATCH 1/2] Feat: add build image job --- go.mod | 4 +- go.sum | 5 +- .../finetune/finetuneexperiment_controller.go | 25 +++++++++ .../finetune/finetunejob_controller.go | 21 ++++++- pkg/config/config.go | 30 ++++++++++ pkg/util/generate/generate.go | 55 ++++++++++++++++++- 6 files changed, 132 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index 5b32d4d..4e58ce3 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/DataTunerX/finetune-experiment-controller go 1.19 require ( - github.com/DataTunerX/meta-server v0.0.0-20231109015709-57812268ad17 + github.com/DataTunerX/meta-server v0.0.0-20231113032938-bf87d14956b1 github.com/DataTunerX/utility-server v0.0.0-20231107081331-e4ac0bbd2db2 github.com/go-logr/zapr v1.2.3 github.com/operator-framework/operator-lib v0.11.0 @@ -32,7 +32,7 @@ require ( github.com/google/gnostic v0.5.7-v3refs // indirect github.com/google/go-cmp v0.5.9 // indirect github.com/google/gofuzz v1.1.0 // indirect - github.com/google/uuid v1.1.2 // indirect + github.com/google/uuid v1.3.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/imdario/mergo v0.3.12 // indirect github.com/josharian/intern v1.0.0 // indirect diff --git a/go.sum b/go.sum index 494519f..2e035c9 100644 --- a/go.sum +++ b/go.sum @@ -40,6 +40,8 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/DataTunerX/meta-server v0.0.0-20231109015709-57812268ad17 h1:szsQx64N0bTO6qpCmD4V7Ne1AuF+y/KcRyyh7UE2SRQ= github.com/DataTunerX/meta-server v0.0.0-20231109015709-57812268ad17/go.mod h1:MrA+U+PYANBfU8B43hrkJQ3WOIFPzUqowUO7s+KafvU= +github.com/DataTunerX/meta-server v0.0.0-20231113032938-bf87d14956b1 h1:WxEyoS9Dlkm2Yfcpn0sL0Gz/xfXdN0fdxb/dGYAQIqQ= +github.com/DataTunerX/meta-server v0.0.0-20231113032938-bf87d14956b1/go.mod h1:MrA+U+PYANBfU8B43hrkJQ3WOIFPzUqowUO7s+KafvU= github.com/DataTunerX/utility-server v0.0.0-20231107081331-e4ac0bbd2db2 h1:3mBAWDqYrWtDk9xvIHDG/dN5zGcliwJnyvpWHFHcC+A= github.com/DataTunerX/utility-server v0.0.0-20231107081331-e4ac0bbd2db2/go.mod h1:qL3DYjQa7av0QkZoFrycHbpXHGQfBNEDke8uv+FdDn4= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= @@ -177,8 +179,9 @@ github.com/google/pprof v0.0.0-20201023163331-3e6fc7fc9c4c/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20201203190320-1bf35d6f28c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20201218002935-b9804c9f04c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= -github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= diff --git a/internal/controller/finetune/finetuneexperiment_controller.go b/internal/controller/finetune/finetuneexperiment_controller.go index 4856758..299ff6a 100644 --- a/internal/controller/finetune/finetuneexperiment_controller.go +++ b/internal/controller/finetune/finetuneexperiment_controller.go @@ -18,6 +18,7 @@ package finetune import ( "context" + "fmt" "time" "github.com/DataTunerX/utility-server/logging" @@ -78,6 +79,30 @@ func (r *FinetuneExperimentReconciler) Reconcile(ctx context.Context, req ctrl.R return handlererr.HandlerErr(err) } } + + for i := range finetuneExperiment.Spec.FinetuneJobs { + finetuneJob := finetuneExperiment.Spec.FinetuneJobs[i] + if finetuneJob.Name == nil { + name := fmt.Sprintf("%s-%s", finetuneExperiment.Name, "finetunejob") + finetuneJob.Name = &name + } + finetuneJobInstance := &finetunev1beta1.FinetuneJob{} + finetuneJobInstance.Spec = finetuneJob.Spec + finetuneJobInstance.Name = *finetuneJob.Name + finetuneJobInstance.Namespace = finetuneExperiment.Namespace + if err := ctrl.SetControllerReference(finetuneExperiment, finetuneJobInstance, r.Scheme); err != nil { + r.Log.Errorf("SetControllerReference failed finetuneJob: %s/%s, owner finetuneExperiment: %s/%s, err: %v", + finetuneJobInstance.Name, finetuneJobInstance.Namespace, finetuneExperiment.Name, finetuneExperiment.Namespace, err) + return handlererr.HandlerErr(err) + } + if err := r.Client.Create(ctx, finetuneJobInstance); err != nil { + if errors.IsAlreadyExists(err) { + return handlererr.HandlerErr(nil) + } + r.Log.Errorf("Create finetuneJob %s/%s failed: %v", finetuneJobInstance.Name, finetuneJobInstance.Namespace, err) + return handlererr.HandlerErr(err) + } + } return ctrl.Result{}, nil } diff --git a/internal/controller/finetune/finetunejob_controller.go b/internal/controller/finetune/finetunejob_controller.go index 907fbc5..1b8b195 100644 --- a/internal/controller/finetune/finetunejob_controller.go +++ b/internal/controller/finetune/finetunejob_controller.go @@ -18,9 +18,11 @@ package finetune import ( "context" + "fmt" "reflect" "time" + "github.com/DataTunerX/finetune-experiment-controller/pkg/config" "github.com/DataTunerX/finetune-experiment-controller/pkg/util/generate" "github.com/DataTunerX/finetune-experiment-controller/pkg/util/handlererr" corev1beta1 "github.com/DataTunerX/meta-server/api/core/v1beta1" @@ -144,10 +146,25 @@ func (r *FinetuneJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) return handlererr.HandlerErr(err) } // build llmCheckpoint image server. job - + endpoint := config.GetS3Endpoint() + accesskeyId := config.GetS3AccesskeyId() + accessSecretkey := config.GetS3ESecretAccessKey() + bucket := config.GetS3Bucket() + filePath := config.GetS3FilePath() + secure := config.GetSecure() + image := "release.daocloud.io/datatunerx/buildimage:v0.0.1" + buildImageName := fmt.Sprintf("%s-buildimage", finetuneJob.Name) + buildImageJob := generate.GenerateBuildImageJob(buildImageName, finetuneJob.Namespace, + endpoint, accesskeyId, accessSecretkey, bucket, filePath, secure, image) + if err := r.Client.Create(ctx, buildImageJob); err != nil { + if !errors.IsAlreadyExists(err) { + r.Log.Errorf("Create job %s/%s failed, err: %v", buildImageJob.Name, buildImageJob.Namespace, err) + return handlererr.HandlerErr(err) + } + } } - r.Log.Infof("update finetuneJob %s/%s status %s.", req.Name, req.Namespace, finetunev1beta1.FinetuneJobFinetune) + r.Log.Infof("Update finetuneJob %s/%s status %s.", req.Name, req.Namespace, finetunev1beta1.FinetuneJobFinetune) finetuneJob.Status.State = finetunev1beta1.FinetuneJobFinetune finetuneJob.Status.FinetuneState = existFinetune.Status.State if err := r.Client.Status().Update(ctx, finetuneJob); err != nil { diff --git a/pkg/config/config.go b/pkg/config/config.go index 208ab8a..bad400f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -9,6 +9,36 @@ func init() { config.AutomaticEnv() config.BindEnv("level", "LOG_LEVEL") config.SetDefault("level", "debug") + config.BindEnv("endpoint", "S3_ENDPOINT") + config.BindEnv("accessKey", "S3_ACCESSKEYID") + config.BindEnv("secretkey", "S3_SECRETACCESSKEY") + config.BindEnv("bucket", "S3_BUCKET") + config.BindEnv("filePath", "S3_FILEPATH") + config.BindEnv("secure", "S3_SECURE") +} + +func GetS3Endpoint() string { + return config.GetString("endpoint") +} + +func GetS3AccesskeyId() string { + return config.GetString("accessKey") +} + +func GetS3ESecretAccessKey() string { + return config.GetString("secretkey") +} + +func GetS3Bucket() string { + return config.GetString("bucket") +} + +func GetS3FilePath() string { + return config.GetString("filePath") +} + +func GetSecure() string { + return config.GetString("secure") } func GetLevel() string { diff --git a/pkg/util/generate/generate.go b/pkg/util/generate/generate.go index ead61cf..97eb1cf 100644 --- a/pkg/util/generate/generate.go +++ b/pkg/util/generate/generate.go @@ -5,6 +5,7 @@ import ( finetunev1beta1 "github.com/DataTunerX/meta-server/api/finetune/v1beta1" batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -33,7 +34,55 @@ func GenerateFinetune(finetuneJob *finetunev1beta1.FinetuneJob) *finetunev1beta1 } // todo(tigerK) add build image job -func GenerateBuildImageJob() *batchv1.Job { - return &batchv1.Job{} - +func GenerateBuildImageJob(name, namespace, endpoint, accessKeyId, secretAccessKey, bucket, filePath, image, secure string) *batchv1.Job { + return &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: batchv1.JobSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "imagebuild", + Image: image, + Env: []corev1.EnvVar{ + { + Name: "S3_ENDPOINT", + Value: endpoint, + }, + {Name: "S3_ACCESSKEYID", + Value: accessKeyId, + }, + { + Name: "S3_SECRETACCESSKEY", + Value: secretAccessKey, + }, + { + Name: "S3_BUCKET", + Value: bucket, + }, + { + Name: "S3_FILEPATH", + Value: filePath, + }, + { + Name: "S3_SECURE", + Value: secure, + }, + }, + Command: []string{"bin/bash"}, + Args: []string{ + "-c", + `buildah from docker.io/library/ubuntu + buildah copy containerID /local/path /path/in/container + buildah commit containerID your-image-name`, + }, + }, + }, + }, + }, + }, + } } From edbe186d5a653d516024f1f0b4b08fb5b49691bb Mon Sep 17 00:00:00 2001 From: tigerK Date: Fri, 17 Nov 2023 10:24:52 +0800 Subject: [PATCH 2/2] Feat: Overall process completion with some test data included --- .../app/controller_manager.go | 2 + go.mod | 9 +- go.sum | 22 +- .../finetune/finetuneexperiment_controller.go | 58 ++++- .../finetune/finetunejob_controller.go | 159 ++++++++++- pkg/config/config.go | 40 ++- pkg/util/generate/generate.go | 246 +++++++++++++++++- 7 files changed, 486 insertions(+), 50 deletions(-) diff --git a/cmd/controller-manager/app/controller_manager.go b/cmd/controller-manager/app/controller_manager.go index 021b50b..80d2126 100644 --- a/cmd/controller-manager/app/controller_manager.go +++ b/cmd/controller-manager/app/controller_manager.go @@ -13,6 +13,7 @@ import ( finetunev1beta1 "github.com/DataTunerX/meta-server/api/finetune/v1beta1" "github.com/go-logr/zapr" "github.com/operator-framework/operator-lib/leader" + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/spf13/pflag" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -35,6 +36,7 @@ func init() { utilruntime.Must(finetunev1beta1.AddToScheme(scheme)) utilruntime.Must(corev1beta1.AddToScheme(scheme)) utilruntime.Must(extensionv1beta1.AddToScheme(scheme)) + utilruntime.Must(rayv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } diff --git a/go.mod b/go.mod index 4e58ce3..33780dd 100644 --- a/go.mod +++ b/go.mod @@ -3,10 +3,11 @@ module github.com/DataTunerX/finetune-experiment-controller go 1.19 require ( - github.com/DataTunerX/meta-server v0.0.0-20231113032938-bf87d14956b1 + github.com/DataTunerX/meta-server v0.0.0-20231116102108-24bd83a6be89 github.com/DataTunerX/utility-server v0.0.0-20231107081331-e4ac0bbd2db2 github.com/go-logr/zapr v1.2.3 github.com/operator-framework/operator-lib v0.11.0 + github.com/ray-project/kuberay/ray-operator v1.0.0 github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.17.0 k8s.io/api v0.26.0 @@ -59,10 +60,10 @@ require ( go.uber.org/multierr v1.10.0 // indirect go.uber.org/zap v1.26.0 // indirect golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect - golang.org/x/net v0.15.0 // indirect + golang.org/x/net v0.17.0 // indirect golang.org/x/oauth2 v0.12.0 // indirect - golang.org/x/sys v0.12.0 // indirect - golang.org/x/term v0.12.0 // indirect + golang.org/x/sys v0.13.0 // indirect + golang.org/x/term v0.13.0 // indirect golang.org/x/text v0.13.0 // indirect golang.org/x/time v0.3.0 // indirect gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect diff --git a/go.sum b/go.sum index 2e035c9..6a12c75 100644 --- a/go.sum +++ b/go.sum @@ -38,10 +38,14 @@ cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3f dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= -github.com/DataTunerX/meta-server v0.0.0-20231109015709-57812268ad17 h1:szsQx64N0bTO6qpCmD4V7Ne1AuF+y/KcRyyh7UE2SRQ= -github.com/DataTunerX/meta-server v0.0.0-20231109015709-57812268ad17/go.mod h1:MrA+U+PYANBfU8B43hrkJQ3WOIFPzUqowUO7s+KafvU= github.com/DataTunerX/meta-server v0.0.0-20231113032938-bf87d14956b1 h1:WxEyoS9Dlkm2Yfcpn0sL0Gz/xfXdN0fdxb/dGYAQIqQ= github.com/DataTunerX/meta-server v0.0.0-20231113032938-bf87d14956b1/go.mod h1:MrA+U+PYANBfU8B43hrkJQ3WOIFPzUqowUO7s+KafvU= +github.com/DataTunerX/meta-server v0.0.0-20231116063244-4b1d018072c0 h1:BJ6OqFz1ROHizgQ9eNWpWSCzMEe4PFLhCloBUsLrYa0= +github.com/DataTunerX/meta-server v0.0.0-20231116063244-4b1d018072c0/go.mod h1:MrA+U+PYANBfU8B43hrkJQ3WOIFPzUqowUO7s+KafvU= +github.com/DataTunerX/meta-server v0.0.0-20231116064242-ea7bb845394f h1:ivD0gAMQ0gWtJ1/xWeUqkOce0PEO2LXWfjAAGiPwTvw= +github.com/DataTunerX/meta-server v0.0.0-20231116064242-ea7bb845394f/go.mod h1:MrA+U+PYANBfU8B43hrkJQ3WOIFPzUqowUO7s+KafvU= +github.com/DataTunerX/meta-server v0.0.0-20231116102108-24bd83a6be89 h1:czoBDPd42BBGiCREjfnaxG5BNcHk+9MnkemXAnG/bEw= +github.com/DataTunerX/meta-server v0.0.0-20231116102108-24bd83a6be89/go.mod h1:MrA+U+PYANBfU8B43hrkJQ3WOIFPzUqowUO7s+KafvU= github.com/DataTunerX/utility-server v0.0.0-20231107081331-e4ac0bbd2db2 h1:3mBAWDqYrWtDk9xvIHDG/dN5zGcliwJnyvpWHFHcC+A= github.com/DataTunerX/utility-server v0.0.0-20231107081331-e4ac0bbd2db2/go.mod h1:qL3DYjQa7av0QkZoFrycHbpXHGQfBNEDke8uv+FdDn4= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= @@ -284,6 +288,8 @@ github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1 github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= +github.com/ray-project/kuberay/ray-operator v1.0.0 h1:i69nvbV7az2FG41VHQgxrmhD+SUl8ca+ek4RPbSE2Q0= +github.com/ray-project/kuberay/ray-operator v1.0.0/go.mod h1:7C7ebIkxtkmOX8w1iiLrKM1j4hkZs/Guzm3WdePk/yg= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/sagikazarmark/locafero v0.3.0 h1:zT7VEGWC2DTflmccN/5T1etyKvxSxpHsjb9cJvm4SvQ= @@ -419,8 +425,8 @@ golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qx golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= -golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= -golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -490,12 +496,12 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= -golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.12.0 h1:/ZfYdc3zq+q02Rv9vGqTeSItdzZTSNDmfTi0mBAuidU= -golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= +golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/internal/controller/finetune/finetuneexperiment_controller.go b/internal/controller/finetune/finetuneexperiment_controller.go index 299ff6a..c9cd467 100644 --- a/internal/controller/finetune/finetuneexperiment_controller.go +++ b/internal/controller/finetune/finetuneexperiment_controller.go @@ -22,6 +22,7 @@ import ( "time" "github.com/DataTunerX/utility-server/logging" + "k8s.io/apimachinery/pkg/types" "github.com/DataTunerX/finetune-experiment-controller/pkg/util/handlererr" finetunev1beta1 "github.com/DataTunerX/meta-server/api/finetune/v1beta1" @@ -40,10 +41,6 @@ type FinetuneExperimentReconciler struct { Log logging.Logger } -const ( - finetuneFinalizer = "finetune.datatunerx.io/finalizer" -) - //+kubebuilder:rbac:groups=finetune.datatunerx.io,resources=finetuneexperiments,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=finetune.datatunerx.io,resources=finetuneexperiments/status,verbs=get;update;patch //+kubebuilder:rbac:groups=finetune.datatunerx.io,resources=finetuneexperiments/finalizers,verbs=update @@ -61,9 +58,9 @@ func (r *FinetuneExperimentReconciler) Reconcile(ctx context.Context, req ctrl.R } if finetuneExperiment.GetDeletionTimestamp() != nil { - if controllerutil.ContainsFinalizer(finetuneExperiment, finetuneFinalizer) { + if controllerutil.ContainsFinalizer(finetuneExperiment, finetunev1beta1.FinetuneGroupFinalizer) { // todo cleaner - controllerutil.RemoveFinalizer(finetuneExperiment, finetuneFinalizer) + controllerutil.RemoveFinalizer(finetuneExperiment, finetunev1beta1.FinetuneGroupFinalizer) if err := r.Update(ctx, finetuneExperiment); err != nil { r.Log.Errorf("Remove finalizer failed: %s/%s, Err: %v", req.Name, req.Namespace, err) return handlererr.HandlerErr(err) @@ -71,8 +68,8 @@ func (r *FinetuneExperimentReconciler) Reconcile(ctx context.Context, req ctrl.R } return handlererr.HandlerErr(nil) } - if !controllerutil.ContainsFinalizer(finetuneExperiment, finetuneFinalizer) { - controllerutil.AddFinalizer(finetuneExperiment, finetuneFinalizer) + if !controllerutil.ContainsFinalizer(finetuneExperiment, finetunev1beta1.FinetuneGroupFinalizer) { + controllerutil.AddFinalizer(finetuneExperiment, finetunev1beta1.FinetuneGroupFinalizer) err := r.Update(ctx, finetuneExperiment) if err != nil { r.Log.Errorf("Add finalizer failed: %s/%s, %v", req.Name, req.Namespace, err) @@ -80,6 +77,15 @@ func (r *FinetuneExperimentReconciler) Reconcile(ctx context.Context, req ctrl.R } } + if finetuneExperiment.Spec.Pending { + finetuneExperiment.Status.State = finetunev1beta1.FinetuneExperimentPending + if err := r.Client.Status().Update(ctx, finetuneExperiment); err != nil { + r.Log.Errorf("Update fineExperiment %s/%s status failed", finetuneExperiment.Name, finetuneExperiment.Namespace) + return handlererr.HandlerErr(err) + } + return handlererr.HandlerErr(nil) + } + for i := range finetuneExperiment.Spec.FinetuneJobs { finetuneJob := finetuneExperiment.Spec.FinetuneJobs[i] if finetuneJob.Name == nil { @@ -96,14 +102,42 @@ func (r *FinetuneExperimentReconciler) Reconcile(ctx context.Context, req ctrl.R return handlererr.HandlerErr(err) } if err := r.Client.Create(ctx, finetuneJobInstance); err != nil { - if errors.IsAlreadyExists(err) { - return handlererr.HandlerErr(nil) + if !errors.IsAlreadyExists(err) { + r.Log.Errorf("Create finetuneJob %s/%s failed: %v", finetuneJobInstance.Name, finetuneJobInstance.Namespace, err) + return handlererr.HandlerErr(err) } - r.Log.Errorf("Create finetuneJob %s/%s failed: %v", finetuneJobInstance.Name, finetuneJobInstance.Namespace, err) + } + existFinetuneJob := &finetunev1beta1.FinetuneJob{} + if err := r.Client.Get(ctx, types.NamespacedName{ + Name: *finetuneJob.Name, + Namespace: finetuneExperiment.Namespace, + }, existFinetuneJob); err != nil { + r.Log.Errorf("Get finetuneJob failed: %v", err) return handlererr.HandlerErr(err) } + alreadyExists := false + + // Iterate over the JobsStatus to check if existFinetuneJob.Name exists + for _, jobStatus := range finetuneExperiment.Status.JobsStatus { + if jobStatus.Name == existFinetuneJob.Name { + alreadyExists = true + break + } + } + if !alreadyExists { + finetuneExperiment.Status.JobsStatus = append(finetuneExperiment.Status.JobsStatus, finetunev1beta1.FinetuneJobStatusSetting{ + Name: existFinetuneJob.Name, + FinetuneJobStatus: existFinetuneJob.Status, + }) + } + + } + finetuneExperiment.Status.State = finetunev1beta1.FinetuneExperimentProcessing + if err := r.Client.Status().Update(ctx, finetuneExperiment); err != nil { + r.Log.Errorf("Update fineExperiment %s/%s status failed", finetuneExperiment.Name, finetuneExperiment.Namespace) + return handlererr.HandlerErr(err) } - return ctrl.Result{}, nil + return handlererr.HandlerErr(nil) } // SetupWithManager sets up the controller with the Manager. diff --git a/internal/controller/finetune/finetunejob_controller.go b/internal/controller/finetune/finetunejob_controller.go index 1b8b195..a4c7198 100644 --- a/internal/controller/finetune/finetunejob_controller.go +++ b/internal/controller/finetune/finetunejob_controller.go @@ -29,6 +29,8 @@ import ( extensionv1beta1 "github.com/DataTunerX/meta-server/api/extension/v1beta1" finetunev1beta1 "github.com/DataTunerX/meta-server/api/finetune/v1beta1" "github.com/DataTunerX/utility-server/logging" + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + batchv1 "k8s.io/api/batch/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -64,6 +66,7 @@ type FinetuneJobReconciler struct { // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.14.1/pkg/reconcile func (r *FinetuneJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // todo(tigerK) This reconcile contains a lot of tested strings that need to be optimised after running through the process r.Log.Infof("Start reconcile finetuneJob: %s/%s,", req.Name, req.Namespace) finetuneJob := &finetunev1beta1.FinetuneJob{} if err := r.Get(ctx, req.NamespacedName, finetuneJob); err != nil { @@ -77,9 +80,9 @@ func (r *FinetuneJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) if finetuneJob.GetDeletionTimestamp() != nil { r.Log.Infof("Delete finetuneJob: %s/%s", req.Name, req.Namespace) - if controllerutil.ContainsFinalizer(finetuneJob, finetuneFinalizer) { + if controllerutil.ContainsFinalizer(finetuneJob, finetunev1beta1.FinetuneGroupFinalizer) { // todo cleaner - controllerutil.RemoveFinalizer(finetuneJob, finetuneFinalizer) + controllerutil.RemoveFinalizer(finetuneJob, finetunev1beta1.FinetuneGroupFinalizer) if err := r.Update(ctx, finetuneJob); err != nil { r.Log.Errorf("Remove finalizer failed: %s/%s, Err: %v", req.Name, req.Namespace, err) return handlererr.HandlerErr(err) @@ -87,8 +90,8 @@ func (r *FinetuneJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) } return handlererr.HandlerErr(nil) } - if !controllerutil.ContainsFinalizer(finetuneJob, finetuneFinalizer) { - controllerutil.AddFinalizer(finetuneJob, finetuneFinalizer) + if !controllerutil.ContainsFinalizer(finetuneJob, finetunev1beta1.FinetuneGroupFinalizer) { + controllerutil.AddFinalizer(finetuneJob, finetunev1beta1.FinetuneGroupFinalizer) err := r.Update(ctx, finetuneJob) if err != nil { r.Log.Errorf("Add finalizer failed: %s/%s, %v", req.Name, req.Namespace, err) @@ -133,12 +136,19 @@ func (r *FinetuneJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) return handlererr.HandlerErr(err) } } + r.Log.Infof("Update finetuneJob %s/%s status %s.", req.Name, req.Namespace, finetunev1beta1.FinetuneJobFinetune) + finetuneJob.Status.State = finetunev1beta1.FinetuneJobFinetune + finetuneJob.Status.FinetuneState = existFinetune.Status.State + if err := r.Client.Status().Update(ctx, finetuneJob); err != nil { + r.Log.Errorf("Update finetuneJob status failed: %v", err) + return handlererr.HandlerErr(err) + } return ctrl.Result{RequeueAfter: 3 * time.Second}, nil } } // Phase III of the fine-tuning exercise. // Update finetunejob status. - if existFinetune.Status.State == finetunev1beta1.FinetuneSuccessful { + if existFinetune.Status.State == finetunev1beta1.FinetuneSuccessful && finetuneJob.Status.State != finetunev1beta1.FinetuneJobBuildImage { // Get llmCheckpoint Cr llmCheckpoint := &corev1beta1.LLMCheckpoint{} if err := r.Get(ctx, types.NamespacedName{Name: existFinetune.Status.LLMCheckpoint, Namespace: req.Namespace}, llmCheckpoint); err != nil { @@ -150,27 +160,147 @@ func (r *FinetuneJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) accesskeyId := config.GetS3AccesskeyId() accessSecretkey := config.GetS3ESecretAccessKey() bucket := config.GetS3Bucket() - filePath := config.GetS3FilePath() + filePath := llmCheckpoint.Spec.Checkpoint secure := config.GetSecure() image := "release.daocloud.io/datatunerx/buildimage:v0.0.1" + userName := config.GetUserName() + password := config.GetPassword() + repositoryName := config.GetRepositoryName() + registryUrl := config.GetRegistryUrl() + mountPath := config.GetMountPath() + imageTag := config.GetImageTag() + imageName := config.GetImageName() buildImageName := fmt.Sprintf("%s-buildimage", finetuneJob.Name) buildImageJob := generate.GenerateBuildImageJob(buildImageName, finetuneJob.Namespace, - endpoint, accesskeyId, accessSecretkey, bucket, filePath, secure, image) + endpoint, accesskeyId, accessSecretkey, bucket, filePath, image, secure, mountPath, registryUrl, repositoryName, userName, password, imageName, imageTag) if err := r.Client.Create(ctx, buildImageJob); err != nil { if !errors.IsAlreadyExists(err) { r.Log.Errorf("Create job %s/%s failed, err: %v", buildImageJob.Name, buildImageJob.Namespace, err) return handlererr.HandlerErr(err) } } + finetuneJob.Status.State = finetunev1beta1.FinetuneJobBuildImage + finetuneJob.Status.FinetuneState = existFinetune.Status.State + if err := r.Client.Status().Update(ctx, finetuneJob); err != nil { + r.Log.Errorf("Update finetune status failed: %v", err) + return handlererr.HandlerErr(err) + } } - r.Log.Infof("Update finetuneJob %s/%s status %s.", req.Name, req.Namespace, finetunev1beta1.FinetuneJobFinetune) - finetuneJob.Status.State = finetunev1beta1.FinetuneJobFinetune - finetuneJob.Status.FinetuneState = existFinetune.Status.State - if err := r.Client.Status().Update(ctx, finetuneJob); err != nil { - r.Log.Errorf("Update finetune status failed: %v", err) - return handlererr.HandlerErr(err) + if finetuneJob.Status.State == finetunev1beta1.FinetuneJobBuildImage { + jobName := fmt.Sprintf("%s-buildimage", finetuneJob.Name) + buildImageJob := &batchv1.Job{} + if err := r.Get(ctx, types.NamespacedName{Namespace: finetuneJob.Namespace, Name: jobName}, buildImageJob); err != nil { + if errors.IsNotFound(err) { + r.Log.Errorf("Job %s/%s not found, err: %v", jobName, finetuneJob.Namespace, err) + return handlererr.HandlerErr(err) + } + return handlererr.HandlerErr(err) + } + llmCheckpoint := &corev1beta1.LLMCheckpoint{} + if err := r.Get(ctx, types.NamespacedName{Name: existFinetune.Status.LLMCheckpoint, Namespace: req.Namespace}, llmCheckpoint); err != nil { + r.Log.Errorf("Get llmCheckpoint %s/%s failed, err: %v", existFinetune.Status.LLMCheckpoint, req.Namespace, err) + return handlererr.HandlerErr(err) + } + if buildImageJob.Status.CompletionTime != nil { + r.Log.Infof("Build image success, start update llmCheckpoint %s/%s", llmCheckpoint.Name, llmCheckpoint.Namespace) + // todo(tigerK) update llmCheckpoint spec.checkpointimage + r.Log.Infof("Update llmCheckpoint status successful, start send serve") + rayServiceName := fmt.Sprintf("%s", finetuneJob.Name) + importPath := fmt.Sprintf("%s.deployment", "test") + runtimeEnv := "" + deploymentName := "testDeploymentName" + rayService := generate.GenerateRayService(rayServiceName, + finetuneJob.Namespace, importPath, runtimeEnv, deploymentName, + int32(1), float64(1), finetuneJob, llmCheckpoint) + if err := ctrl.SetControllerReference(finetuneJob, rayService, r.Scheme); err != nil { + r.Log.Errorf("Set owner failed: %v", err) + return handlererr.HandlerErr(err) + } + if err := r.Create(ctx, rayService); err != nil { + if !errors.IsAlreadyExists(err) { + r.Log.Errorf("Create rayService %s/%s failed: %v", rayServiceName, finetuneJob.Namespace, err) + return handlererr.HandlerErr(err) + } + } + r.Log.Infof("Send serve successful") + } + finetuneJob.Status.State = finetunev1beta1.FinetuneJobServe + finetuneJob.Status.FinetuneState = existFinetune.Status.State + finetuneJob.Status.Result = &finetunev1beta1.FinetuneJobResult{ + ModelExportResult: true, + Image: *llmCheckpoint.Spec.CheckpointImage.Name, + } + if err := r.Client.Status().Update(ctx, finetuneJob); err != nil { + r.Log.Errorf("Update finetuneJob status failed: %v", err) + return handlererr.HandlerErr(err) + } } + + if finetuneJob.Status.State == finetunev1beta1.FinetuneJobServe { + rayServiceName := fmt.Sprintf("%s", finetuneJob.Name) + rayService := &rayv1.RayService{} + if err := r.Get(ctx, types.NamespacedName{ + Name: rayServiceName, + Namespace: finetuneJob.Namespace, + }, rayService); err != nil { + r.Log.Errorf("Get finetuneJob failed: %v", err) + return handlererr.HandlerErr(err) + } + if rayService.Status.ServiceStatus == rayv1.Running { + //serveNodePort := rayService.Status.ActiveServiceStatus.RayClusterStatus.Endpoints["serve"] + //dashboardNodePort := rayService.Status.ActiveServiceStatus.RayClusterStatus.Endpoints["dashboard"] + finetuneJob.Status.Result.Serve = fmt.Sprintf("%s.%s.svc:%s", finetuneJob.Name, finetuneJob.Namespace, "8000") + finetuneJob.Status.Result.Dashboard = fmt.Sprintf("%s.%s.svc:%s", finetuneJob.Name, finetuneJob.Namespace, "8265") + if err := r.Client.Status().Update(ctx, finetuneJob); err != nil { + r.Log.Errorf("Update finetuneJob status failed: %v", err) + return handlererr.HandlerErr(err) + } + scoringName := fmt.Sprintf("%s-scoring", finetuneJob.Name) + if finetuneJob.Spec.ScoringConfig == nil { + scoring := generate.GenerateBuiltInScoring(scoringName, finetuneJob.Namespace) + if err := ctrl.SetControllerReference(finetuneJob, scoring, r.Scheme); err != nil { + r.Log.Errorf("Set owner failed: %v", err) + return handlererr.HandlerErr(err) + } + if err := r.Create(ctx, scoring); err != nil { + if !errors.IsAlreadyExists(err) { + r.Log.Errorf("Create scoring %s/%s failed: %v", scoringName, finetuneJob.Namespace, err) + return handlererr.HandlerErr(err) + } + } + return handlererr.HandlerErr(nil) + } + scoring := generate.GeneratePluginScoring(scoringName, finetuneJob.Namespace, finetuneJob.Spec.ScoringConfig.Name, finetuneJob.Spec.ScoringConfig.Parameters) + if err := ctrl.SetControllerReference(finetuneJob, scoring, r.Scheme); err != nil { + r.Log.Errorf("Set owner failed: %v", err) + return handlererr.HandlerErr(err) + } + if err := r.Create(ctx, scoring); err != nil { + if !errors.IsAlreadyExists(err) { + r.Log.Errorf("Create scoring %s/%s failed: %v", scoringName, finetuneJob.Namespace, err) + return handlererr.HandlerErr(err) + } + } + } + } + scoringName := fmt.Sprintf("%s-scoring", finetuneJob.Name) + scoring := &extensionv1beta1.Scoring{} + if err := r.Get(ctx, types.NamespacedName{ + Name: scoringName, + Namespace: finetuneJob.Namespace, + }, scoring); err != nil { + if !errors.IsNotFound(err) { + r.Log.Errorf("Get scoring %s/%s failed: %v", scoringName, finetuneJob.Namespace, err) + return handlererr.HandlerErr(err) + } + } + + // todo(tigerK) get scoring result, update finetuneJob status + if scoring != nil { + + } + // Phase IIII of the fine-tuning exercise. // Check finetune cr status, if finetune cr status is SUCCESSFUL, start next return handlererr.HandlerErr(nil) @@ -183,7 +313,8 @@ func (r *FinetuneJobReconciler) SetupWithManager(mgr ctrl.Manager) error { UpdateFunc: func(updateEvent event.UpdateEvent) bool { oldFinetuneJob := updateEvent.ObjectOld.(*finetunev1beta1.FinetuneJob) newFinetuneJob := updateEvent.ObjectNew.(*finetunev1beta1.FinetuneJob) - if !reflect.DeepEqual(oldFinetuneJob.Spec, newFinetuneJob.Spec) || !newFinetuneJob.GetDeletionTimestamp().IsZero() { + if !reflect.DeepEqual(oldFinetuneJob.Spec, newFinetuneJob.Spec) || + !newFinetuneJob.GetDeletionTimestamp().IsZero() { return true } return false diff --git a/pkg/config/config.go b/pkg/config/config.go index bad400f..deaf915 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -13,8 +13,14 @@ func init() { config.BindEnv("accessKey", "S3_ACCESSKEYID") config.BindEnv("secretkey", "S3_SECRETACCESSKEY") config.BindEnv("bucket", "S3_BUCKET") - config.BindEnv("filePath", "S3_FILEPATH") config.BindEnv("secure", "S3_SECURE") + config.BindEnv("registryUrl", "REGISTRY_URL") + config.BindEnv("repositoryName", "REPOSITORY_NAME") + config.BindEnv("userName", "USERNAME") + config.BindEnv("password", "PASSWORD") + config.BindEnv("imageName", "IMAGE_NAME") + config.BindEnv("imageTag", "IMAGE_TAG") + config.BindEnv("mountPath", "MOUNT_PATH") } func GetS3Endpoint() string { @@ -33,10 +39,6 @@ func GetS3Bucket() string { return config.GetString("bucket") } -func GetS3FilePath() string { - return config.GetString("filePath") -} - func GetSecure() string { return config.GetString("secure") } @@ -44,3 +46,31 @@ func GetSecure() string { func GetLevel() string { return config.GetString("level") } + +func GetUserName() string { + return config.GetString("userName") +} + +func GetPassword() string { + return config.GetString("password") +} + +func GetImageName() string { + return config.GetString("imageName") +} + +func GetImageTag() string { + return config.GetString("imageTag") +} + +func GetRegistryUrl() string { + return config.GetString("registryUrl") +} + +func GetRepositoryName() string { + return config.GetString("repositoryName") +} + +func GetMountPath() string { + return config.GetString("mountPath") +} diff --git a/pkg/util/generate/generate.go b/pkg/util/generate/generate.go index 97eb1cf..fe0d8bb 100644 --- a/pkg/util/generate/generate.go +++ b/pkg/util/generate/generate.go @@ -3,10 +3,15 @@ package generate import ( "fmt" + corev1beta1 "github.com/DataTunerX/meta-server/api/core/v1beta1" + extensionv1beta1 "github.com/DataTunerX/meta-server/api/extension/v1beta1" finetunev1beta1 "github.com/DataTunerX/meta-server/api/finetune/v1beta1" + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" ) func GenerateFinetune(finetuneJob *finetunev1beta1.FinetuneJob) *finetunev1beta1.Finetune { @@ -34,7 +39,10 @@ func GenerateFinetune(finetuneJob *finetunev1beta1.FinetuneJob) *finetunev1beta1 } // todo(tigerK) add build image job -func GenerateBuildImageJob(name, namespace, endpoint, accessKeyId, secretAccessKey, bucket, filePath, image, secure string) *batchv1.Job { +func GenerateBuildImageJob(name, namespace, endpoint, accessKeyId, secretAccessKey, + bucket, filePath, image, secure, mountPath, registryUrl, repositoryName, username, password, imageName, imageTag string) *batchv1.Job { + privileged := true + directory := corev1.HostPathDirectory return &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: name, @@ -71,18 +79,242 @@ func GenerateBuildImageJob(name, namespace, endpoint, accessKeyId, secretAccessK Name: "S3_SECURE", Value: secure, }, + { + Name: "MOUNT_PATH", + Value: mountPath, + }, + { + Name: "REGISTRY_URL", + Value: registryUrl, + }, + { + Name: "REPOSITORY_NAME", + Value: repositoryName, + }, + { + Name: "USERNAME", + Value: username, + }, + { + Name: "PASSWORD", + Value: password, + }, + { + Name: "IMAGE_NAME", + Value: imageName, + }, + { + Name: "IMAGE_TAG", + Value: imageTag, + }, }, - Command: []string{"bin/bash"}, - Args: []string{ - "-c", - `buildah from docker.io/library/ubuntu - buildah copy containerID /local/path /path/in/container - buildah commit containerID your-image-name`, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "data", + MountPath: "/var/lib/containers", + }, + }, + SecurityContext: &corev1.SecurityContext{ + Privileged: &privileged, }, }, }, + Volumes: []corev1.Volume{ + { + Name: "data", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/root/jobdata/", + Type: &directory, + }, + }, + }, + }, + }, + }, + }, + } +} + +func GenerateRayService(name, namespace, importPath, runtimeEnv, deploymentName string, numReplicas int32, numGpus float64, finetuneJob *finetunev1beta1.FinetuneJob, llmCheckpoint *corev1beta1.LLMCheckpoint) *rayv1.RayService { + numReplica := &numReplicas + numGpu := &numGpus + enableInTreeAutoscaling := false + workReplicas := int32(1) + minWorkReplicas := int32(1) + maxWorkReplicas := int32(1) + return &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: rayv1.RayServiceSpec{ + ServeService: &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: finetuneJob.Name, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + Name: "serve", + Port: 8000, + Protocol: corev1.ProtocolTCP, + TargetPort: intstr.FromInt(8000), + }, + }, + Selector: map[string]string{ + "ray.io/node-type": "head", + }, + Type: corev1.ServiceTypeNodePort, }, }, + ServeDeploymentGraphSpec: rayv1.ServeDeploymentGraphSpec{ + ImportPath: importPath, + RuntimeEnv: runtimeEnv, + ServeConfigSpecs: []rayv1.ServeConfigSpec{ + { + Name: deploymentName, + NumReplicas: numReplica, + RayActorOptions: rayv1.RayActorOptionSpec{ + NumGpus: numGpu, + }, + }, + }, + }, + RayClusterSpec: rayv1.RayClusterSpec{ + RayVersion: "2.7.1", + EnableInTreeAutoscaling: &enableInTreeAutoscaling, + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: fmt.Sprintf("%s-head", finetuneJob.Name), + Image: *llmCheckpoint.Spec.CheckpointImage.Name, + ImagePullPolicy: *llmCheckpoint.Spec.CheckpointImage.ImagePullPolicy, + Env: []corev1.EnvVar{ + { + Name: "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING", + Value: "1", + }, + }, + Ports: []corev1.ContainerPort{ + { + Name: "gcs-server", + ContainerPort: 6379, + }, + { + Name: "dashboard", + ContainerPort: 8265, + }, + { + Name: "client", + ContainerPort: 10001, + }, + { + Name: "serve", + ContainerPort: 8000, + }, + }, + Resources: corev1.ResourceRequirements{ + Limits: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + }, + Requests: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("2Gi"), + }, + }, + }, + }, + Tolerations: finetuneJob.Spec.ServeConfig.Tolerations, + NodeSelector: finetuneJob.Spec.ServeConfig.NodeSelector, + }, + }, + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + Replicas: &workReplicas, + MinReplicas: &minWorkReplicas, + MaxReplicas: &maxWorkReplicas, + GroupName: finetuneJob.Name, + RayStartParams: map[string]string{}, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: fmt.Sprintf("%s-work", finetuneJob.Name), + Image: *llmCheckpoint.Spec.CheckpointImage.Name, + ImagePullPolicy: *llmCheckpoint.Spec.CheckpointImage.ImagePullPolicy, + Env: []corev1.EnvVar{ + { + Name: "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING", + Value: "1", + }, + }, + Lifecycle: &corev1.Lifecycle{ + PreStop: &corev1.LifecycleHandler{ + Exec: &corev1.ExecAction{ + Command: []string{ + "/bin/sh", "-c", "ray stop", + }, + }, + }, + }, + Resources: corev1.ResourceRequirements{ + Limits: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("8"), + corev1.ResourceMemory: resource.MustParse("16Gi"), + }, + Requests: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + }, + }, + Tolerations: finetuneJob.Spec.ServeConfig.Tolerations, + NodeSelector: finetuneJob.Spec.ServeConfig.NodeSelector, + }, + }, + }, + }, + }, + }, + } + +} + +func GenerateBuiltInScoring(name, namespace string) *extensionv1beta1.Scoring { + return &extensionv1beta1.Scoring{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: extensionv1beta1.ScoringSpec{ + Questions: []extensionv1beta1.Question{ + { + Question: "天王盖地虎", + Reference: "小鸡炖蘑菇", + }, + }, + }, + } +} + +func GeneratePluginScoring(name, namespace, pluginName, parameters string) *extensionv1beta1.Scoring { + return &extensionv1beta1.Scoring{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: extensionv1beta1.ScoringSpec{ + Plugin: &extensionv1beta1.Plugin{ + LoadPlugin: true, + Name: pluginName, + Parameters: parameters, + }, }, } }